Commit e23e4e1e authored by cvs2git's avatar cvs2git
Browse files

This commit was manufactured by cvs2svn to create tag 'mergepoint-

vega-46-dev-e48'.

Sprout from master 2008-02-14 08:57:57 UTC Glenn Proctor <gp1@sanger.ac.uk> 'Fix job naming.'
Delete:
    misc-scripts/protein_match/process_pmach.pl
    modules/Bio/EnsEMBL/Collection.pm
    modules/Bio/EnsEMBL/Collection/DnaAlignFeature.pm
    modules/Bio/EnsEMBL/Collection/Exon.pm
    modules/Bio/EnsEMBL/Collection/Gene.pm
    modules/Bio/EnsEMBL/Collection/ProteinAlignFeature.pm
    modules/Bio/EnsEMBL/Collection/RepeatFeature.pm
    modules/Bio/EnsEMBL/Collection/Transcript.pm
    modules/Bio/EnsEMBL/DBSQL/Clone.pm
parent 33afb9d1
use strict;
=head1 Process pmatch
=head1 Description
=head2 Aims
This script aims to run pmatch and postprocess pmatch to map Ensembl peptides to external databases (currently Swissprot and Refseq but may be extented). The first part of the script runs pmatch, the second part gets the percentage of a match of a unique Ensembl peptide which match to an unique external protein. The third part classify each ensembl match as PRIMARY match (the longest one and the one which will be used for the mapping, PSEUDO, DUPLICATE and REPEAT (pretty arbitrary criterias but may be useful for quality control).
NB: All of the intermediary files are written.
=head2 Options
-ens : Ensembl peptide fasta file
-sp : SP, SPTREMBL fasta file
-refseq: Refseq peptide fasta file
=head2 Contacts
mongin@ebi.ac.uk
birney@ebi.ac.uk
=cut
use Getopt::Long;
my ($ens,$sp,$refseq,$pdb);
&GetOptions(
'ens:s'=>\$ens,
'sp:s'=>\$sp,
'refseq:s'=>\$refseq,
'pdb:s'=>\$pdb
);
&runpmatch();
&postprocesspmatch($sp);
&postprocesspmatch($refseq);
<<<<<<< process_pmach.pl
=======
&postprocesspmatch($pdb);
>>>>>>> 1.8
&finalprocess($sp);
&finalprocess($refseq);
&finalprocess($pdb);
#perl ../../../src/ensembl-live/misc-scripts/protein_match/process_pmach.pl -ens ../primary/SPAN_pepfile -sp ../primary/SPTr.human.expanded -refseq ../primary/hs2.fsa -pdb ../primary/scop_human.fas
sub runpmatch {
print STDERR "Running pmatch\n";
#Run pmatch and store the data in files which will be kept for debugging
my $pmatch1 = "/nfs/griffin2/rd/bin.ALPHA/pmatch -T 14 $sp $ens > ens_sp_rawpmatch";
my $pmatch2 = "/nfs/griffin2/rd/bin.ALPHA/pmatch -T 14 $refseq $ens > ens_refseq_rawpmatch";
#my $pmatch3 = "/nfs/griffin2/rd/bin.ALPHA/pmatch -T 14 $pdb $ens > ens_pdb_rawpmatch";
system($pmatch1); # == 0 or die "$0\Error running '$pmatch1' : $!";
system($pmatch2); #== 0 or die "$0\Error running '$pmatch2' : $!";
#system($pmatch3); #== 0 or die "$0\Error running '$pmatch2' : $!";
}
sub postprocesspmatch {
my ($db) = @_;
my %hash1;
my %hashlength;
#Post process the raw data from pmatch
if ($db eq $sp) {
print STDERR "Postprocessing pmatch for SP mapping\n";
open (OUT, ">ens_sp.processed") || die "Can't open File\n";
open (PROC, "ens_sp_rawpmatch") || die "Can't open File\n";
}
elsif ($db eq $refseq) {
print STDERR "Postprocessing pmatch for REFSEQ mapping\n";
open (OUT, ">ens_refseq.processed") || die "Can't open File\n";;
open (PROC, "ens_refseq_rawpmatch") || die "Can't open file ens_refseq_rawpmatch\n";
}
elsif ($db eq $pdb) {
print STDERR "Postprocessing pmatch for PDB mapping\n";
open (OUT, ">ens_pdb.processed") || die "Can't open File\n";;
open (PROC, "ens_pdb_rawpmatch") || die "Can't open file ens_pdb_rawpmatch\n";
}
while (<PROC>) {
#538 COBP00000033978 1 538 35.3 Q14146 1 538 35.3
my ($len,$id,$start,$end,$tperc,$query,$qst,$qend,$perc) = split;
if ($db eq $refseq) {
#Get only the refseq ac (NP_\d+)
($query) = $query =~ /\w+\|\d+\|\w+\|(\w+)/;
}
my $uniq = "$id:$query";
#Add the percentage of similarity for the Ensembl peptide for a single match
#There is a bug at this step, some similarities can be over 100% !!! This problem may be solved by changing pmatch source code
$hash1{$uniq} += $perc;
$hashlength{$uniq} += $len;
}
#Write out the processed data
foreach my $key ( keys %hash1 ) {
#if (($hashlength{$key} >= 20)) {
if (($hash1{$key} >= 25)) {
($a,$b) = split(/:/,$key);
print OUT "$a\t$b\t$hash1{$key}\n";
}
#else {
# print "$a\t$b\t$hash1{$key}\t$hashlength{$key}\n";
#}
}
close (PROC);
close (OUT);
}
sub finalprocess {
#This final subroutine will use the postprocessed pmatch file and get back the best Ensembl match (labelled as PRIMARY) for a given external known protein.
my ($db) = @_;
if ($db eq $sp) {
print STDERR "Getting final mapping for SP mapping\n";
open (PROC, "ens_sp.processed");
open (OUT, ">ens_sp.final");
}
elsif ($db eq $refseq) {
print STDERR "Getting final mapping for REFSEQ mapping\n";
open (PROC, "ens_refseq.processed") || die "Can' open file ens_refseq.processed\n";
open (OUT, ">ens_refseq.final");
}
elsif ($db eq $pdb) {
print STDERR "Getting final mapping for PDB mapping\n";
open (PROC, "ens_pdb.processed") || die "Can' open file ens_refseq.processed\n";
open (OUT, ">ens_pdb.final");
}
my %hash2;
while (<PROC>) {
my ($ens,$known,$perc) = split;
#if ($perc > 100) {
# print "$ens\t$known\t$perc\n";
#}
if( !defined $hash2{$known} ) {
$hash2{$known} = [];
}
#Each single external protein correspond to an array of objects dealing with the name and the percentage of similarity of the Ensembl peptide matching with the the known external protein.
my $p= NamePerc->new;
$p->name($ens);
$p->perc($perc);
push(@{$hash2{$known}},$p);
}
foreach my $know ( keys %hash2 ) {
my @array = @{$hash2{$know}};
@array = sort { $b->perc <=> $a->perc } @array;
#The Ensembl match to the known protein is labelled as PRIMARY and will be used later for the mapping
my $top = shift @array;
#if ($top->perc >= 20) {
print OUT "$know\t",$top->name,"\t",$top->perc,"\tPRIMARY\n";
foreach $ens ( @array ) {
if( $ens->perc > $top->perc ) {
die "Not good....";
}
}
#If there is more than 20 Ensembl peptides matching a single known protein, these Ensembl peptides are labelled as REPEAT
if (scalar(@array) >= 20) {
foreach my $repeat (@array) {
if( $repeat->perc+1 >= $top->perc ) {
print OUT "$know\t",$repeat->name,"\t",$repeat->perc,"\tDUPLICATE\n";
}
else {
print OUT "$know\t",$repeat->name,"\t",$repeat->perc,"\tREPEAT\n";
}
}
}
#If less than 20, either duplicate if percentage of identity close to the PRIMARY labelled as DUPLICATE or labelled as PSEUDO. DUPLICATEs can also be used for the mapping
if (scalar(@array) < 20) {
foreach my $duplicate (@array) {
if( $duplicate->perc+1 >= $top->perc ) {
print OUT "$know\t",$duplicate->name,"\t",$duplicate->perc,"\tDUPLICATE\n";
}
else {
print OUT "$know\t",$duplicate->name,"\t",$duplicate->perc,"\tPSEUDO\n";
}
}
}
}
#}
close (PROC);
close (OUT);
}
#Set of objects to deal with the script
package NamePerc;
sub new {
my $class= shift;
my $self = {};
bless $self,$class;
return $self;
}
=head2 name
Title : name
Usage : $obj->name($newval)
Function:
Returns : value of name
Args : newvalue (optional)
=cut
sub name{
my $obj = shift;
if( @_ ) {
my $value = shift;
$obj->{'name'} = $value;
}
return $obj->{'name'};
}
=head2 perc
Title : perc
Usage : $obj->perc($newval)
Function:
Returns : value of perc
Args : newvalue (optional)
=cut
sub perc{
my $obj = shift;
if( @_ ) {
my $value = shift;
$obj->{'perc'} = $value;
}
return $obj->{'perc'};
}
# $Id$
# Ensembl module for Bio::EnsEMBL::Collection
#
# You may distribute this module under the same terms as perl itself.
=head1 NAME
Bio::EnsEMBL::Collection - Abstract base class for feature collection
classes.
=head1 SYNOPSIS
use Bio::EnsEMBL::Collection::RepeatFeature;
# Pick a slice.
my $slice =
$slice_adaptor->fetch_by_region( 'Chromosome', '2', 1, 1e9 );
# Create a feature collection on the slice, restricting it to
# 'Dust' features.
my $collection =
Bio::EnsEMBL::Collection::RepeatFeature->new(-slice => $slice,
-light => 0,
-analysis => 'Dust'
);
# Populate the feature collection from the slice.
$collection->populate();
# Populate the feature collection from the slice. Sort the
# entries on feature start position.
$collection->populate( -sorted => 1 );
# Populate the feature collection from the slice. Sort the
# entries on feature length.
my $entry_start_idx = Bio::EnsEMBL::Collection::ENTRY_SEQREGIONSTART;
my $entry_end_idx = Bio::EnsEMBL::Collection::ENTRY_SEQREGIONEND;
$collection->populate(
-sorted => 1,
-sortsub => sub {
$a->[$entry_end_idx] - $a->[$entry_start_idx]
<=>
$b->[$entry_end_idx] - $b->[$entry_start_idx];
} );
# Retrieve the entries from the collection as an array of arrays.
my @entries = @{ $collection->entries() };
# Retrieve a binned representation of the entries using 100 bins.
my @binned_entries = @{
$collection->get_bins( -nbins => 100,
-method => 'entries'
) };
# Retrieve a binned representation of the entries using 100 bins,
# where each entry is represented by its index in the feature
# collection array (@entries above).
my @binned_entry_indicies = @{
$collection->get_bins( -nbins => 100,
-method => 'indices'
) };
# Retrieve only the bin counts/densities.
my @bin_counts = @{ $collection->get_bins( -nbins => 100 ) };
=head1 DESCRIPTION
This is the abstract base class for feature collections.
A feature collection provides a compact representation of features of a
particular type on a slice. Each entry in a collection is a short array
of data representing a feature. This data is divided into two halfs:
=over 4
=item 1.
Basic feature representation.
=item 2.
Extended feature representation.
=back
=head2 Basic feature representation
The basic feature representation is common to all entries in any type
of feature collection and consists of a minimal set of values. Each
collection entry is an array that contains at least the following data
(in this order)
=over 4
=item 1.
Ensembl internal database ID.
=item 2.
Ensembl internal sequence region ID.
=item 3.
Feature start position.
=item 4.
Feature end position.
=item 5.
Feature strand.
=back
The module defines a number of constants that may be used
as symbolic constants in place of the index numbers 0 to
4: ENTRY_DBID, ENTRY_SEQREGIONID, ENTRY_SEQREGIONSTART,
ENTRY_SEQREGIONEND, ENTRY_SEQREGIONSTRAND. For an entry $entry,
$entry->[Bio::EnsEMBL::Collection::ENTRY_SEQREGIONEND] will thus be the
end position for the feature that the entry represents.
The position of the feature is in the same coordinate system as the
slice associated with the collection object.
=head2 Extended feature representation
A sub-class of this abstract base class will specify further
data to be added to the entries in order to account for the
particular feature type. An entry from a gene feature collection
(Bio::EnsEMBL::Collection::Gene) might, for example, contain the Ensembl
Stable ID of the gene.
The extended feature representation is defined by the method
_extra_columns() which is implemented by the sub-class.
=head2 Light-weight collections/entries
A light-weight collection is a feature collection whose collection
entries are light-weight, whose entries does not contain the extended
feature representation.
A collection which is light-weight may be created by using the argument
'-light=>1' in the constructor, new().
=head2 Binning methods
This module allows for various ways of binning the result of the
populate() method by using the get_bins() method.
Each binning method bins the collection entries and gives an array
with the specified length (number of bins). An entry, which basically
consists of a start and a end position, is allocated to one or several
bins depending on its span and the size of the individual bins.
Features: |----| |----------------| |--| |------|
|-------------| |-----| |--|
|-------------------| |----| |--|
Finer bins: 3 3 3 2 2 2 3 3 2 2 2 1 1 1 1 0 2 2 1 1 1 1 1 1 1 1 2 2
Coarser bins: 3 3 2 2 3 2 2 1 1 1 0 2 1 1 1 1 1 3 2
The example above shows the arrays that might be returned from
get_bins() when using the 'count' binning method with 28 and 19 bins
respectively.
Here follows a brief description of each implemented binning method.
=over 4
=item 'count' and 'density'
Returns an array of bins, each bin containing the number of entries
allocated to (i.e. overlapping) that bin. The 'density' binning method
is equivalent to 'count' and this is also the default binning method.
=item 'indices' and 'index'
Returns an array of bins, each bin containing an array of indices into
the collection entry array (available from the entries() method) for the
entries allocated to that bin. The 'index' binning method is equivalent
to 'indicies'.
=item 'entries' and 'entry'
Returns an array of bins, each bin containing an array of entries
(references into the array of entries retrieved from the entries()
method) allocated to that bin. The 'entry' binning method is equivalent
to 'entries'.
=item 'fractional_count' and 'weight'
Returns an array of bins, each bin containing the sum of the fractions
of features overlapping that bin. A feature fully inside a bin will
contribute 1 to the sum while a feature spanning exactly three bins
(from the very start of the first to the very end of the third) will
contribute 1/3 to the sum of each bin.
=item 'coverage'
Returns an array of bins, each bin containing the fraction of the bin
that is coverage by any feature.
=back
=head1 CONTACT
This modules is part of the Ensembl project. See
http://www.ensembl.org/ for further information.
Questions may be posted to the ensembl-dev mailing list:
ensembl-dev@ebi.ac.uk
=cut
package Bio::EnsEMBL::Collection;
use strict;
use warnings;
use Bio::EnsEMBL::DBSQL::BaseAdaptor;
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
use Bio::EnsEMBL::Utils::Exception qw( throw );
use base qw( Bio::EnsEMBL::DBSQL::BaseAdaptor );
# Here are some class constants and class variables.
# Symbolic constants that acts as indices into the individual feature
# collection entries. These must be in sync with the columns returned
# by the _columns() method.
use constant { ENTRY_DBID => 0,
ENTRY_SEQREGIONID => 1,
ENTRY_SEQREGIONSTART => 2,
ENTRY_SEQREGIONEND => 3,
ENTRY_SEQREGIONSTRAND => 4 };
# We'll keep the current slice and the projection segments of all slices
# on the various coordinate systems as class variables rather than
# as object attributes. This way, the Ensembl drawing code can have
# several collection objects attached to one web view (e.g. one per
# track in ContigView), all sharing the same projection segments. These
# structures may be emptied using the flush() method.
our $SLICE;
our %SEGMENTS;
our %SEQ_REG_MAP;
our %VALID_BINNING_METHODS = (
'count' => 0,
'density' => 0, # Same as 'count'.
'indices' => 1,
'index' => 1, # Same as 'indices'.
'entries' => 2,
'entry' => 2, # Same as 'entries'.
'fractional_count' => 3,
'weight' => 3, # Same as 'fractional_count'.
'coverage' => 4 );
=head1 METHODS (constructor)
=head2 new
Arg [SLICE] : Bio::EnsEMBL::Slice
The slice to be associated with this feature
collection.
Arg [LIGHT] : Boolean (optional, default false)
If true, the collection will be 'light-weight',
i.e. no type-specific data will be stored in its
entries when populate() is called.
Arg [ANALYSIS]: String (optional, default undef)
Restrict the feature collection to a specific
analysis logic name, e.g. 'Dust' in a feature
collection of repeat features or 'ncRNA' in a gene
feature collection.
Example : my $collection =
Bio::EnsEMBL::Collection::<feature_type>->new(
-slice => $slice,
-light => 1 );
Description : When called for a sub-class, creates a feature
collection object and associates a slice with it.
Return type : Bio::EnsEMBL::Collection::<feature_type>
Exceptions : Throws if no slice is specified.
Warns if trying to restrict by analysis for a
feature type that does not have an analysis
associated with it, e.g. exon.
Caller : General (through a sub-class).
Status : At Risk (under development)
=cut
sub new {
my $proto = shift;
my ( $slice, $light, $analysis_logic_name ) =
rearrange( [ 'SLICE', 'LIGHT', 'ANALYSIS' ], @_ );
if ( !defined($slice) ) {
throw( 'Unable to create a feature collection '
. 'without a slice object, I am.' );
}
my $this = $proto->SUPER::new( $slice->adaptor()->db() );
if ( defined($analysis_logic_name) ) {
if ( !$this->_has_analysis() ) {
warning( 'This feature type does not have '
. 'an analysis to restrict by.' );
} else {
my $table_alias = $this->_feature_table()->[1];
$this->__attrib( 'restrict_by',
sprintf( 'AND %s.analysis_id = a.analysis_id '
. 'AND a.logic_name = %s',
$table_alias,
$this->dbc()->db_handle()
->quote($analysis_logic_name) ) );
}
}
my $sql = qq(
SELECT cs.name, mc.max_length, m.meta_value
FROM coord_system cs,
meta_coord mc
LEFT JOIN meta m ON m.meta_key = ?
WHERE mc.table_name = ?
AND mc.coord_system_id = cs.coord_system_id
);
my $sth = $this->prepare($sql);
my $feature_table = $this->_feature_table()->[0];
$sth->execute( $feature_table . 'build.level', $feature_table );
my ( $cs_name, $max_length, $build_level );
$sth->bind_columns( \( $cs_name, $max_length, $build_level ) );
my %coordinate_systems;
while ( $sth->fetch() ) {
$coordinate_systems{$cs_name} = {