diff --git a/misc-scripts/assembly_patches/add_GRC_align_features.pl b/misc-scripts/assembly_patches/add_GRC_align_features.pl deleted file mode 100755 index d53edb6531fca5ff0f0e3544b2921cae94e89078..0000000000000000000000000000000000000000 --- a/misc-scripts/assembly_patches/add_GRC_align_features.pl +++ /dev/null @@ -1,333 +0,0 @@ -#!/usr/bin/env perl -# -# Adds GRC genomic mapping to the dna align feature table -# in the HAP pipeline with their cigar strings. Note that -# in some cases there is more than one alignment per patch. -# -# Example: -# -# perl add_GRC_align_features.pl -dbhost genebuildn \ -# -dbname homo_sapiens_core_nn_nn -dbuser user -dbpass pass \ -# -patch_release GRCh37.p8 -verbose - -use strict; -use warnings; - -use Getopt::Long; -use Net::FTP; -use Bio::EnsEMBL::Utils::Exception qw(throw warning); -use Bio::EnsEMBL::Slice; -use Bio::EnsEMBL::DBSQL::DBAdaptor; -use Bio::EnsEMBL::DBSQL::SliceAdaptor; -use Bio::EnsEMBL::DBSQL::AssemblyExceptionFeatureAdaptor; -use Bio::EnsEMBL::AssemblyExceptionFeature; -use Bio::EnsEMBL::DnaDnaAlignFeature; -use Bio::EnsEMBL::Analysis; - -$| = 1; - -my $dbname = ''; -my $dbhost = ''; -my $dbuser = 'ensro'; -my $dbpass = ''; -my $dbport = '3306'; -my $patch_release = ''; -my $store = 0; -my $external_db_id = '50692'; # GRC_alignment_import -my $syn_external_db_id = '50710'; # seq_region_synonym slice type - i.e. INSDC - -my $verbose = 0; - -my @patch_types = ('PATCH_FIX','PATCH_NOVEL'); -my @dna_align_features = (); - -&GetOptions( - 'dbhost:s' => \$dbhost, - 'dbuser:s' => \$dbuser, - 'dbpass:s' => \$dbpass, - 'dbname:s' => \$dbname, - 'dbport:n' => \$dbport, - 'patch_release:s' => \$patch_release, - 'external_db_id:n' => \$external_db_id, - 'write!' => \$store, - 'verbose!' => \$verbose, -); - -if(!$patch_release){ - throw ("Need to specify assembly version with -patch_release.\n"); -} - -# get alt_scaffold_placement.txt to generate filename that we will need -# to retrieve files from the NCBI ftp site. Populates... - -my %ftp_filename=(); - -my ($content, $remote_file_handle) = ""; -open($remote_file_handle, '>', \$content); - -my $ftp = Net::FTP->new('ftp.ncbi.nlm.nih.gov', Debug => 0) - or die "Can't connect to NCBI FTP: $@"; - -$ftp->login('anonymous', '-anonymous@') - or die 'Cannot login ', $ftp->message; - -chomp $patch_release; - -my $ncbi_wd = - "genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/". - $patch_release. - "/PATCHES/alt_scaffolds/"; - -$ftp->cwd($ncbi_wd) - or die 'Cannot change working directory ', $ftp->message; - -$ftp->get('alt_scaffold_placement.txt', $remote_file_handle) - or die "get failed ", $ftp->message; - -my @asp_lines = split /\n/, $content; - -foreach my $asp_line (@asp_lines) { - next if $asp_line =~ /^#/; - my @elem = split /\t/, $asp_line; - my $patch_name = $elem[2]; - my $file_name = $elem[3]."_".$elem[6].".gff"; - print "Filename: $file_name\t\tPatchname: $patch_name\n" if $verbose; - $ftp_filename{$patch_name} = $file_name; -} - -# change directory to where the GRC alignments are kept: - -$ncbi_wd = "alignments"; -$ftp->cwd($ncbi_wd) or die 'Cannot change working directory ', $ftp->message; - -# hash of arrays - there way be more than one alignment per file if they -# have been manually annotated, However the GRC may change all to one -# line in the near future, in the meantime, we need to deal with them. - -my %align_str =(); - -foreach my $patch (keys %ftp_filename) { - close $remote_file_handle; - open($remote_file_handle, '>', \$content); - $ftp->get($ftp_filename{$patch}, $remote_file_handle) - or die "get failed ", $ftp->message; - - my @lines = split "\n", $content; - foreach my $line (@lines) { - next if $line =~ /^\#/; - # We'll parse the data later because we need most of it. - push @{$align_str{$patch}},$line; - } - #sleep 1; -} - - - -my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor( -host => $dbhost, - -user => $dbuser, - -pass => $dbpass, - -port => $dbport, - -dbname => $dbname ); - -my $sa = $db->get_SliceAdaptor(); - -my $analysis = new Bio::EnsEMBL::Analysis( -logic_name => "grc_alignment_import", - -db_version => $patch_release); - - - -# TODO - leave $asm_exc_adaptor in - that way we can compare -# information from file with what we already know as a sanity check. - - -# Now get the patches, they come in pairs, the assembly exception and the reference -print "Getting patches...\n" if $verbose; -my $asm_exc_adaptor = $db->get_AssemblyExceptionFeatureAdaptor(); -my @exceptions = @{$asm_exc_adaptor->fetch_all()}; -my @patches; -EXC: foreach my $exc (@exceptions){ - foreach my $type (@patch_types){ - if($exc->type() =~ m/$type/){ - push(@patches, $exc); - next EXC; - } - } -} -# Assuming that AssemblyExceptionFeatureAdaptor's fetch_all will always -# return 2 entries for each patch and that those two entries are adjacent -my $num_patches = scalar(@patches)/2; - -print "Have ".$num_patches." patches.\n"; - -# for each patch -for (my $i = 0; $i < $num_patches; $i++) { - - # get the two slices - my $ref_slice; - my $patch_slice; - - for(my $j = 0; $j < 2; $j++) { - my $exc = pop(@patches); - # if this is the ref version - if($exc->type =~ m/REF/){ - # alt is only the patch slice - $patch_slice = $exc->alternate_slice(); - } - else{ - # alt is replaced region of ref - $ref_slice = $exc->alternate_slice(); - } - } - if(!($patch_slice and $ref_slice)){ - throw("Something is wrong, the patch and ref slices were not set correctly.\n"); - } - - my @patch_vals = split /:/, $patch_slice->display_id; - my $patch_name = $patch_vals[2]; - - foreach my $string ( @{ $align_str{$patch_name}}) { - my @el = split /\t/, $string; - - my $num = $#el; - throw ("Incorrect number of elements in gtf file: $num") unless $num == 8; - - my ($seq_id, $source, $type, $start, $end, $score, $strand, $phase, $attr) = split /\t/, $string; - - $strand = fix_strand($strand); - - my %attribute = (); - foreach my $kvp (split ';', $attr) { - my ($key, $value) = split '=', $kvp; - $attribute{$key} = $value; - } - - my $target = $attribute{"Target"}; - my ($hseqname, $hstart, $hend, $hstrand ) = split " ", $target; - - $hstrand = fix_strand($hstrand); - - my $length = ($hend - $hstart) + 1; - - my $cigar_line; - $cigar_line = $attribute{"Gap"}; - if (defined $cigar_line) { - sanity_check_cigar_line($cigar_line, $length); - $cigar_line = reformat_cigar_line($cigar_line); - } else { - $cigar_line = $length."M"; - } - - - # need the seq_region_id from seq_region_synonym - my @synonyms = @{$ref_slice->get_all_synonyms()}; - my $seq_region_id = ''; - foreach my $syn (@synonyms) { - if ($syn->external_db_id() == $syn_external_db_id) { - $seq_region_id = $syn->seq_region_id(); - last(); - } - } - # ...to obtain the slice: - - my $slice = $sa->fetch_by_seq_region_id($seq_region_id); - - - my $daf = new Bio::EnsEMBL::DnaDnaAlignFeature( - -slice => $slice, - -start => $start, - -end => $end, - -strand => $strand, - -analysis => $analysis, - -score => $score, - -hstart => $hstart, - -hend => $hend, - -hstrand => $hstrand, - -hseqname => $hseqname, - -hcoverage => $attribute{"pct_coverage"}, - -percent_id => $attribute{"pct_identity_ungap"}, - -external_db_id => $external_db_id, - -cigar_string => $cigar_line, - ); - - push @dna_align_features, $daf; - } -} - - -# now store all the dna_align features -if (scalar(@dna_align_features) > 0) { - write_dna_align_features_to_db($db,\@dna_align_features,$store) -} -print "There are ".scalar (@dna_align_features)." dna_align_features.\n"; - - -sub write_dna_align_features_to_db { - my ($db,$dna_align_features,$store) = @_; - - DAF: foreach my $dna_align_feat (@$dna_align_features) { - if ($store) { - $db->get_DnaAlignFeatureAdaptor->store($dna_align_feat); - if ($@) { - throw("ERROR: Can't write dna_align_feat ".$dna_align_feat->hseqname." [$@]"); - } else { - print "Written ".$dna_align_feat->hseqname." on chr ".$dna_align_feat->slice->name - ." strand ".$dna_align_feat->hstrand." with start ".$dna_align_feat->start - ." end ".$dna_align_feat->end."\n" if $verbose; - } - } else { - print "Not storing ".$dna_align_feat->hseqname."\n" if $verbose; - } - } # DAF - return 1; -} - -sub fix_strand { - my $strand = shift; - if ($strand eq '+') { - $strand = 1; - } elsif ($strand eq '-') { - $strand = -1; - } else { - throw("Strand problem :".$strand); - } - return $strand; -} - -sub sanity_check_cigar_line { - # ok, it sanity checks the GRCs idea of a cigar line which is close to GFF3 format - my ($line, $len) = @_; - my $cl_length = 0; - throw("Can only sanity check cigar lines with whitespace") unless $line =~ /\s/; - my @elements = split /\s/, $line; - foreach my $el (@elements) { - my ($operator, $num) = ($1, $2) if $el =~ /^(\w{1})(\d+)$/; - if ($operator =~ /[MI]/) { - $cl_length += $num; - } elsif ($operator eq 'D') { - # nothing to do - } else { - throw("Unknown alignment operator: $operator acting on $num"); - } - } - if ($cl_length != $len) { - warn("Cigar_line length: $cl_length does not match length: $len for this line:\n$line\n\n"); - } -} - -sub reformat_cigar_line { - my $line = shift; - # hack - # the GRC cigar line format turns out to be back to front - fix it - # this is a retrospective hack, with hindsight the logic of the script - # would be different and probably incorporated into the sub above. - my @elements = split /\s/, $line; - $line = ''; - foreach my $el (@elements) { - my ($operator, $num) = ($1, $2) if $el =~ /^(\w{1})(\d+)$/; - $line .= $num.$operator; - } - return $line; -} - - -exit; diff --git a/misc-scripts/assembly_patches/remove_patch_karyotype.sql b/misc-scripts/assembly_patches/remove_patch_karyotype.sql deleted file mode 100644 index 2173c5d8018c188bbd45e66256969d770d689d78..0000000000000000000000000000000000000000 --- a/misc-scripts/assembly_patches/remove_patch_karyotype.sql +++ /dev/null @@ -1 +0,0 @@ -delete karyotype from attrib_type, seq_region_attrib, karyotype where attrib_type.code in ('patch_novel','patch_fix') and attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id and seq_region_attrib.seq_region_id = karyotype.seq_region_id; diff --git a/misc-scripts/assembly_patches/remove_patch_raw_compute.sql b/misc-scripts/assembly_patches/remove_patch_raw_compute.sql deleted file mode 100644 index d913d9ca78b8c37271b2f7a0966a95b12714ea5f..0000000000000000000000000000000000000000 --- a/misc-scripts/assembly_patches/remove_patch_raw_compute.sql +++ /dev/null @@ -1,12 +0,0 @@ -delete repeat_feature from attrib_type, seq_region_attrib, repeat_feature where attrib_type.code in ('patch_novel','patch_fix') and attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id and seq_region_attrib.seq_region_id = repeat_feature.seq_region_id; - -delete prediction_exon from attrib_type, seq_region_attrib, prediction_exon where attrib_type.code in ('patch_novel','patch_fix') and attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id and seq_region_attrib.seq_region_id = prediction_exon.seq_region_id; - -delete prediction_transcript from attrib_type, seq_region_attrib, prediction_transcript where attrib_type.code in ('patch_novel','patch_fix') and attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id and seq_region_attrib.seq_region_id = prediction_transcript.seq_region_id; - -delete simple_feature from attrib_type, seq_region_attrib, simple_feature where attrib_type.code in ('patch_novel','patch_fix') and attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id and seq_region_attrib.seq_region_id = simple_feature.seq_region_id; - -delete dna_align_feature from attrib_type, seq_region_attrib, dna_align_feature where attrib_type.code in ('patch_novel','patch_fix') and attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id and seq_region_attrib.seq_region_id = dna_align_feature.seq_region_id and dna_align_feature_id not in (select feature_id from transcript_supporting_feature where feature_type = 'dna_align_feature') and dna_align_feature_id not in (select feature_id from supporting_feature where feature_type = 'dna_align_feature'); - -delete protein_align_feature from attrib_type, seq_region_attrib, protein_align_feature where attrib_type.code in ('patch_novel','patch_fix') and attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id and seq_region_attrib.seq_region_id = protein_align_feature.seq_region_id and protein_align_feature_id not in (select feature_id from transcript_supporting_feature where feature_type = 'protein_align_feature') and protein_align_feature_id not in (select feature_id from supporting_feature where feature_type = 'protein_align_feature'); - diff --git a/misc-scripts/assembly_patches/remove_updated_and_deprecated_patches.pl b/misc-scripts/assembly_patches/remove_updated_and_deprecated_patches.pl deleted file mode 100644 index eb50cb721cca4d0b1f5fbbcc5487aa5d1d711297..0000000000000000000000000000000000000000 --- a/misc-scripts/assembly_patches/remove_updated_and_deprecated_patches.pl +++ /dev/null @@ -1,179 +0,0 @@ -#!/usr/local/ensembl/bin/perl -w - -use Bio::EnsEMBL::Registry; -use Bio::EnsEMBL::Utils::Exception qw(throw); -use Getopt::Long; - -use strict; - -my $pass; -my $patchtype_file = "./data/patch_type"; -my $dbname; -my $host; -my $user; -my $port = 3306; -my $central_coord_system = 'supercontig'; -my $toplevel_coord_system = 'chromosome'; - -&GetOptions( - 'pass=s' => \$pass, - 'patchtype_file=s' => \$patchtype_file, - 'host=s' => \$host, - 'dbname=s' => \$dbname, - 'user=s' => \$user, - 'port=n' => \$port, - ); - -open(TYPE,"<".$patchtype_file) || die "Could not open file $patchtype_file"; -open(SQL,">delete_patch.sql") || die "Could not open delete_patch.sql for writing\n"; -#connect to the database - -my $dba = new Bio::EnsEMBL::DBSQL::DBAdaptor( - '-host' => $host, - '-user' => $user, - '-pass' => $pass, - '-dbname' => $dbname, - '-species' => "load" - ); - -my $get_all_synonyms_sth = $dba->dbc->prepare("select synonym from seq_region, seq_region_synonym, coord_system where seq_region.name in (select seq_region.name from seq_region, seq_region_attrib, attrib_type where -attrib_type.code in ('patch_fix','patch_novel') and attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id and seq_region_attrib.seq_region_id = seq_region.seq_region_id) and seq_region.coord_system_id=coord_system.coord_system_id and coord_system.name = '".$central_coord_system."' and seq_region.seq_region_id=seq_region_synonym.seq_region_id") -|| die "Could not prepare to get synonyms"; - -if ($get_all_synonyms_sth == 0) { - throw("Could not prepare to get synonyms"); -} -my $get_name_sth = $dba->dbc->prepare("select name from seq_region,seq_region_synonym where synonym = ? and seq_region_synonym.seq_region_id = seq_region.seq_region_id") -|| die "Could not prepare to get name"; - -if ($get_name_sth == 0) { - throw("Could not prepare to get name"); -} -my $get_synonym_sth = $dba->dbc->prepare("select synonym from seq_region_synonym, seq_region, coord_system where seq_region. name = ? and seq_region.coord_system_id=coord_system.coord_system_id and coord_system.name = ? and seq_region.seq_region_id=seq_region_synonym.seq_region_id") - || die "Could not prepare to get synonym"; - -if ($get_synonym_sth == 0) { - throw("Could not prepare to get synonym"); -} -my $get_seq_region_id_sth = $dba->dbc->prepare("select seq_region_id from seq_region, coord_system where seq_region. name = ? and seq_region.coord_system_id=coord_system.coord_system_id and -coord_system.name = ?") - || die "Could not prepare to get seq_region_id"; - -if ($get_seq_region_id_sth == 0) { - throw("Could not prepare to get seq_region_id"); -} -#get the full list of existing synonyms/accessions -my $existing_synonym; -my %existing_synonyms; - -$get_all_synonyms_sth->execute() || die "problem executing seq_region_id"; -$get_all_synonyms_sth->bind_columns(\$existing_synonym) || die "problem binding"; -while($get_all_synonyms_sth->fetch()){ - print "Synonym ".$existing_synonym."\n"; - $existing_synonyms{$existing_synonym} = 1; -} - - -#for the new files -while (<TYPE>) { - chomp; - next if (/^#/); - my ($alt_scaf_name,$alt_scaf_acc,$type) = split(/\t/,$_); - if (!$alt_scaf_name || !$alt_scaf_acc || !$type) { - throw("Unable to find name, accession or type"); - } - - #check to see if this is an updated patch - my $synonym; - $get_synonym_sth->execute($alt_scaf_name, $central_coord_system) || die "problem executing get synonym"; - $get_synonym_sth->bind_columns(\$synonym) || die "problem binding"; - $get_synonym_sth->fetch(); - - if(defined($synonym)){ - #if patch still exists (even if modified) record by setting to 0 - #modified patches are dealt with here already - $existing_synonyms{$synonym} = 0; - #unchanged - if($synonym eq $alt_scaf_acc){ - print $alt_scaf_name." exists and is unchanged\n"; - } - #modified - else{ - print $alt_scaf_name." has been modified acc ".$synonym." to ".$alt_scaf_acc."\n"; - remove_patch($alt_scaf_name); - } - } - #new - else{ - print $alt_scaf_name." is a new patch\n"; - } - -} -#removed -my @exist_accs = keys %existing_synonyms; -foreach my $acc (@exist_accs){ - if($existing_synonyms{$acc}){ - my $name; - $get_name_sth->execute($acc) || die "problem executing get name"; - $get_name_sth->bind_columns(\$name) || die "problem binding"; - $get_name_sth->fetch(); - print $name." with acc ".$acc." has been removed from the new set\n"; - remove_patch($name); - } -} - - -sub remove_patch{ - - my $alt_scaf_name = shift; - #get the patch seq_region_ids (chrom and scaffold) - my($scaf_id, $chrom_id); - $get_seq_region_id_sth->execute($alt_scaf_name, $central_coord_system) || die "problem executing seq_region_id"; - $get_seq_region_id_sth->bind_columns(\$scaf_id) || die "problem binding"; - $get_seq_region_id_sth->fetch(); - - $get_seq_region_id_sth->execute($alt_scaf_name, $toplevel_coord_system) || die "problem executing seq_region_id"; - $get_seq_region_id_sth->bind_columns(\$chrom_id) || die "problem binding"; - $get_seq_region_id_sth->fetch(); - - #check for components only used in patch - my $scaf_comp; - my $get_region_components_sth = $dba->dbc->prepare("select distinct cmp_seq_region_id from assembly where asm_seq_region_id = ?") || die "Couldn't get components"; - $get_region_components_sth->execute($scaf_id) || die "problem executing get components"; - $get_region_components_sth->bind_columns(\$scaf_comp) || die "problem binding"; - COMP: while($get_region_components_sth->fetch()){ - #check each component - my $asm_id; - my $get_asm_ids_sth = $dba->dbc->prepare("select distinct asm_seq_region_id from assembly where cmp_seq_region_id = ?") || die "Couldn't get asm ids"; - $get_asm_ids_sth->execute($scaf_comp) || die "problem executing get components"; - $get_asm_ids_sth->bind_columns(\$asm_id) || die "problem binding"; - while($get_asm_ids_sth->fetch()){ - if($asm_id != $chrom_id and $asm_id != $scaf_id){ - #print "Used outside of the patch\n"; - next COMP; - } - } - #component only used in patch - #print $scaf_comp." only used in patch\n"; - print SQL "delete from dna where seq_region_id = ".$scaf_comp.";\n"; - print SQL "delete from seq_region_attrib where seq_region_id = ".$scaf_comp.";\n"; - print SQL "delete from seq_region where seq_region_id = ".$scaf_comp.";\n"; - } - - print SQL "delete from seq_region_synonym where seq_region_id = ".$scaf_id.";\n"; - - print SQL "delete from seq_region_attrib where seq_region_id = ".$chrom_id.";\n"; - - print SQL "delete from assembly_exception where seq_region_id = ".$chrom_id.";\n"; - - print SQL "delete from assembly where asm_seq_region_id = ".$chrom_id.";\n"; - print SQL "delete from assembly where asm_seq_region_id = ".$scaf_id.";\n"; - - print SQL "delete from seq_region where seq_region_id = ".$chrom_id.";\n"; - print SQL "delete from seq_region where seq_region_id = ".$scaf_id.";\n"; - - print SQL "delete from splicing_event where seq_region_id = ".$chrom_id.";\n"; - - print SQL "delete from marker_feature where seq_region_id = ".$chrom_id.";\n"; -} - diff --git a/misc-scripts/assembly_patches/run_patch_mapping.pl b/misc-scripts/assembly_patches/run_patch_mapping.pl deleted file mode 100755 index 9236bdaa766717ee01114e6dbfeb0d2a30f55e1e..0000000000000000000000000000000000000000 --- a/misc-scripts/assembly_patches/run_patch_mapping.pl +++ /dev/null @@ -1,219 +0,0 @@ -#!/usr/local/ensembl/bin/perl - -use strict; -use warnings; -use Data::Dumper; -use Getopt::Long; -use Bio::EnsEMBL::Utils::Exception qw(throw warning); -use Bio::EnsEMBL::Slice; -use Bio::EnsEMBL::DBSQL::DBAdaptor; -use Bio::EnsEMBL::DBSQL::SliceAdaptor; -use Bio::EnsEMBL::DBSQL::AssemblyExceptionFeatureAdaptor; -use Bio::EnsEMBL::AssemblyExceptionFeature; - -$| = 1; - -my $host = ''; -my $user = ''; -my $pass = ''; -my $port = ''; -my $dbname = ''; -my @patch_types = ('PATCH_FIX','PATCH_NOVEL','HAP'); -my $align_by_component; -my $align_non_ident; -my $fix_overlap; -my $new_align_session; -my $check_repeat_masked; -my $out_dir; -my $perl; -my $pipe_path; -my $script; -my $assembly; - -&GetOptions( 'dbhost:s' => \$host, - 'dbuser:s' => \$user, - 'dbname:s' => \$dbname, - 'dbpass:s' => \$pass, - 'dbport:n' => \$port, - 'align_by_component' => \$align_by_component, - 'align_non_ident' => \$align_non_ident, - 'new_align_session' => \$new_align_session, - 'check_repeat_masked' => \$check_repeat_masked, - 'fix_overlap' => \$fix_overlap, - 'out_dir:s' => \$out_dir, - 'perl:s' => \$perl, - 'pipeline_path:s' => \$pipe_path, - 'assembly:s' => \$assembly); - -my $db = new Bio::EnsEMBL::DBSQL::DBAdaptor( -host => $host, - -user => $user, - -pass => $pass, - -port => $port, - -dbname => $dbname ); - -if((!($align_by_component or $align_non_ident or $fix_overlap or $new_align_session or $check_repeat_masked))){ - throw("Must specify one of -align_by_component, -align_non_ident, -new_align_session, -check_repeat_masked or -fix_overlap.\n"); -} -if((!$out_dir) or (!(-d $out_dir))){ - throw("Must specify a valid output directory.\n"); -} -if(!$perl){ - throw("Must specify perl path.\n"); -} -if(!$pipe_path){ - throw("Must specify path to ensembl-pipeline.\n"); -} -if(!$assembly){ - throw("Must specify assembly i.e. GRCh37.\n"); -} -if($align_by_component){ - $script = $pipe_path."/scripts/Finished/assembly/align_by_component_identity.pl"; - if(! -e $script){ - throw("Can't find script: ".$script."\n"); - } -} -elsif($align_non_ident){ - $script = $pipe_path."/scripts/Finished/assembly/align_nonident.pl"; - if(! -e $script){ - throw("Can't find script: ".$script."\n"); - } -} -elsif($new_align_session){ - $script = $pipe_path."/scripts/Finished/assembly/new_align_session.pl"; - if(! -e $script){ - throw("Can't find script: ".$script."\n"); - } -} -elsif($check_repeat_masked){ - $script = $pipe_path."/scripts/Finished/assembly/check_repeat_masked.pl"; - if(! -e $script){ - throw("Can't find script: ".$script."\n"); - } -} -elsif($fix_overlap){ - $script = $pipe_path."/scripts/Finished/assembly/fix_overlaps.pl"; - if(! -e $script){ - throw("Can't find script: ".$script."\n"); - } -} - -#get the patches -print "Getting patches...\n"; -my $asm_exc_adaptor = $db->get_AssemblyExceptionFeatureAdaptor(); -my @exceptions = @{$asm_exc_adaptor->fetch_all()}; -my @patches; -EXC: foreach my $exc (@exceptions){ - foreach my $type (@patch_types){ - if($exc->type() =~ m/$type/){ - push(@patches, $exc); - next EXC; - } - } -} -#Assuming that AssemblyExceptionFeatureAdaptor's fetch_all will always return two -#entries for each patch and that those two entries are adjacent -my $num_patches = scalar(@patches)/2; -print "Have ".$num_patches." patches.\n"; - -my $command = ""; -my @ref_chroms; -my @patch_chroms; - -#for each patch -for (my $i = 0;$i < $num_patches;$i++){ - - #get the two slices - my $ref_slice; - my $patch_slice; - for(my $j = 0;$j < 2; $j++){ - my $exc = pop(@patches); - #if this is the ref version - if($exc->type =~ m/REF/){ - #alt is only the patch slice - $patch_slice = $exc->alternate_slice(); - } - else{ - #alt is replaced region of ref - $ref_slice = $exc->alternate_slice(); - } - } - if(!($patch_slice and $ref_slice)){ - throw("Something is wrong, the patch and ref slices were not set correctly.\n"); - } - - print "Reached ".$patch_slice->name()." and ".$ref_slice->name()."\n"; - - #running for all patches together below - if($align_by_component or $fix_overlap or $new_align_session or $check_repeat_masked){ - push @ref_chroms, $ref_slice->seq_region_name; - push @patch_chroms, $patch_slice->seq_region_name; - } - elsif($align_non_ident){ - #run for each patch - $command = "bsub -R'select[mem>4000] rusage[mem=4000]' -M4000000 -o ".$out_dir.$patch_slice->seq_region_name."_nonident.out -e ". - $out_dir.$patch_slice->seq_region_name."_nonident.err ". - $perl." ".$script." --dbhost ".$host." --dbport ".$port." --dbuser ".$user." --dbpass ".$pass." --dbname ".$dbname. - " --assembly ".$assembly." --altdbname ".$dbname." --altassembly ".$assembly." --chromosomes ".$ref_slice->seq_region_name. - " --altchromosomes ".$patch_slice->seq_region_name." --logpath ".$out_dir." --logfile ".$patch_slice->seq_region_name."_nonident.log --force-stage"; - - if ( ($patch_slice->seq_region_name eq 'HG1472_PATCH') or ($patch_slice->seq_region_name eq 'HG858_PATCH') ) { - $command = "bsub -R'select[mem>10000] rusage[mem=10000]' -M10000000 -o ".$out_dir.$patch_slice->seq_region_name."_nonident.out -e ". - $out_dir.$patch_slice->seq_region_name."_nonident.err ". - $perl." ".$script." --dbhost ".$host." --dbport ".$port." --dbuser ".$user." --dbpass ".$pass." --dbname ".$dbname. - " --assembly ".$assembly." --altdbname ".$dbname." --altassembly ".$assembly." --chromosomes ".$ref_slice->seq_region_name. - " --altchromosomes ".$patch_slice->seq_region_name." --logpath ".$out_dir." --logfile ".$patch_slice->seq_region_name."_nonident.log --force-stage"; - } - - print $command."\n"; - my $cmd_rtn = `$command`; - print $cmd_rtn."\n"; - } - -} - -if($align_by_component){ - - my $ref_chromosomes = join ",", @ref_chroms; - my $patch_chromosomes = join ",", @patch_chroms; - - $command = "bsub -R\"select[mem>3800] rusage[mem=3800]\" -M3800000 -o ".$out_dir."component_ident.out -e ".$out_dir."component_ident.err ". - $perl." ".$script." --host ".$host." --port ".$port." --user ".$user." --pass ".$pass." --dbname ".$dbname." --altdbname ".$dbname. - " --assembly ".$assembly." --altassembly ".$assembly." --chromosomes ".$ref_chromosomes. - " --altchromosomes ".$patch_chromosomes." --logpath ".$out_dir." --logfile component_ident.log"; - - print $command."\n"; - - my $cmd_rtn = `$command`; - print $cmd_rtn."\n"; -} -elsif($new_align_session){ - my $ref_chromosomes = join ",", @ref_chroms; - my $patch_chromosomes = join ",", @patch_chroms; - - $command = "bsub -o ".$out_dir."new_align_session.out -e ".$out_dir."new_align_session.err ". - $perl." ".$script." --host ".$host." --port ".$port." --user ".$user." --pass ".$pass." --dbname ".$dbname." --altdbname ".$dbname. - " --assembly ".$assembly." --altassembly ".$assembly." --chromosomes ".$ref_chromosomes. - " --altchromosomes ".$patch_chromosomes." --logpath ".$out_dir." --logfile new_align_session.log -author patch -nolog"; - - print $command."\n"; - - my $cmd_rtn = `$command`; - print $cmd_rtn."\n"; - -} -elsif($fix_overlap){ - - my $ref_chromosomes = join ",", @ref_chroms; - my $patch_chromosomes = join ",", @patch_chroms; - - $command = "bsub -o ".$out_dir."fix_overlap.out -e ".$out_dir."fix_overlap.err ". - $perl." ".$script." --dbhost ".$host." --dbport ".$port." --dbuser ".$user." --dbname ".$dbname." --altdbname ".$dbname. - " --dbpass ".$pass." --assembly ".$assembly." --altassembly ".$assembly." --chromosomes ".$ref_chromosomes." --altchromosomes ". - $patch_chromosomes." --logpath ".$out_dir." --logfile fix_overlap.log --force-stage"; - - print $command."\n"; - - my $cmd_rtn = `$command`; - print $cmd_rtn."\n"; -} -print "Completed\n";