diff --git a/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm b/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm index 50a044169f2a4a19e7476b4e9d9feebb0f0b861a..4a9fac64b2c157294fa020f8e9cd7abb0bdb11a0 100644 --- a/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm +++ b/misc-scripts/xref_mapping/XrefMapper/BasicMapper.pm @@ -32,6 +32,10 @@ Post questions to the EnsEMBL development list ensembl-dev@ebi.ac.uk =cut +# Hashes to hold method-specific thresholds +my %method_query_threshold; +my %method_target_threshold; + =head2 dump_seqs Arg[1]: xref object which holds info needed for the dump of xref @@ -87,11 +91,7 @@ sub build_list_and_map { $i++; } - if (!defined($self->use_existing_mappings)) { - $self->run_mapping(\@list); - } else { - print "Using existing mappings"; - } + $self->run_mapping(\@list); } @@ -583,11 +583,17 @@ sub run_mapping { my ($self, $lists) = @_; - # delete old output files in target directory - my $dir = $self->dir(); - unlink (<$dir/*.map $dir/*.out $dir/*.err>); + # delete old output files in target directory if we're going to produce new ones + if (!defined($self->use_existing_mappings)) { + my $dir = $self->dir(); + unlink (<$dir/*.map $dir/*.out $dir/*.err>); + } # foreach method, submit the appropriate job & keep track of the job name + # note we check if use_existing_mappings is set here, not earlier, as we + # still need to instantiate the method object in order to fill + # method_query_threshold and method_target_threshold + my @job_names; foreach my $list (@$lists){ @@ -604,17 +610,22 @@ sub run_mapping { } else { my $obj = $obj_name->new(); - my $job_name = $obj->run($queryfile, $targetfile, $self->dir()); - push @job_names, $job_name; - sleep 1; # make sure unique names really are unique + $method_query_threshold{$method} = $obj->query_identity_threshold(); + $method_target_threshold{$method} = $obj->target_identity_threshold(); + if (!defined($self->use_existing_mappings)) { + my $job_name = $obj->run($queryfile, $targetfile, $self->dir()); + push @job_names, $job_name; + sleep 1; # make sure unique names really are unique + } } } # foreach method - # submit depend job to wait for all mapping jobs - submit_depend_job($self->dir, @job_names); - + if (!defined($self->use_existing_mappings)) { + # submit depend job to wait for all mapping jobs + submit_depend_job($self->dir, @job_names); + } } # run_mapping @@ -669,7 +680,7 @@ sub submit_depend_job { } -=head2 store +=head2 generate_core_xrefs Arg[1] : The target file used in the exonerate run. Used to work out the Ensembl object type. Arg[2] : @@ -681,7 +692,7 @@ sub submit_depend_job { =cut -sub store { +sub generate_core_xrefs { my ($self, $xref) = @_; @@ -745,6 +756,8 @@ sub store { # files are named Method_(dna|peptide)_N.map my $type = get_ensembl_object_type($file); + my $method = get_method($file); + # get or create the appropriate analysis ID # XXX restore when using writeable database #my $analysis_id = $self->get_analysis_id($type); @@ -763,8 +776,9 @@ sub store { # TODO make sure query & target are the right way around - # only take mappings where there is a good match on or both sequences - #next if ($query_identity < 98 and $target_identity < 98); + # only take mappings where there is a good match on one or both sequences + next if ($query_identity < $method_query_threshold{$method} && + $target_identity < $method_target_threshold{$method}); # note we add on $xref_id_offset to avoid clashes print OBJECT_XREF "$object_xref_id\t$target_id\t$type\t" . ($query_id+$xref_id_offset) . "\n"; @@ -838,6 +852,17 @@ sub get_ensembl_object_type { } +sub get_method { + + my $filename = shift; + + $filename = basename($filename); + + my ($method) = $filename =~ /^(.*)_(dna|peptide)_\d+\.map/; + + return $method; + +} sub get_analysis_id { @@ -947,7 +972,7 @@ sub dump_core_xrefs { # write to file and add to object_xref_mappings if (defined $xref_to_objects{$master_xref_id}) { # XXX check my @ensembl_object_ids = keys( %{$xref_to_objects{$master_xref_id}} ); # XXX check - print "xref $accession has " . scalar(@ensembl_object_ids) . " associated ensembl objects\n"; + #print "xref $accession has " . scalar(@ensembl_object_ids) . " associated ensembl objects\n"; foreach my $object_id (@ensembl_object_ids) { my $type = $ensembl_object_types{$object_id}; print OBJECT_XREF "$object_xref_id\t$object_id\t$type\t" . ($xref_id+$xref_id_offset) . "\tDEPENDENT\n"; diff --git a/misc-scripts/xref_mapping/XrefMapper/Methods/ExonerateBasic.pm b/misc-scripts/xref_mapping/XrefMapper/Methods/ExonerateBasic.pm index f7b68f4f88814b124f8cb9c6a4b9d5fdc6ec7323..b5a02b8e89bf56a09249643d02ff0041fc99fd5b 100644 --- a/misc-scripts/xref_mapping/XrefMapper/Methods/ExonerateBasic.pm +++ b/misc-scripts/xref_mapping/XrefMapper/Methods/ExonerateBasic.pm @@ -174,11 +174,11 @@ sub get_class_name() { my $self = shift; - $self =~ s/=.*$//; + my $module_name = ref($self); - my @bits = split(/::/, $self); + my @bits = split(/::/, $module_name); - return @bits[$#bits]; + return $bits[-1]; } @@ -196,5 +196,22 @@ sub check_err { } } +# Percentage identity that query (xref) must match to be considered. + +sub query_identity_threshold { + + return 98; + +} + + +# Percentage identity that target (ensembl) must match to be considered. + +sub target_identity_threshold { + + return 98; + +} + 1; diff --git a/misc-scripts/xref_mapping/XrefMapper/Methods/ExonerateUngappedBest1.pm b/misc-scripts/xref_mapping/XrefMapper/Methods/ExonerateUngappedBest1.pm index 833535b1ed54db7453e942a7a022d02e06f91715..80ab7ec1deae3c4fb3212dcc6870ed1315ee8b80 100644 --- a/misc-scripts/xref_mapping/XrefMapper/Methods/ExonerateUngappedBest1.pm +++ b/misc-scripts/xref_mapping/XrefMapper/Methods/ExonerateUngappedBest1.pm @@ -15,5 +15,16 @@ sub options { } +sub query_identity_threshold { + + return 100; + +} + +sub target_identity_threshold { + + return 100; + +} 1;