Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
ensembl
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Iterations
Wiki
Requirements
Jira
Code
Merge requests
1
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package Registry
Container Registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
ensembl-gh-mirror
ensembl
Commits
876d7b58
Commit
876d7b58
authored
17 years ago
by
Patrick Meidl
Browse files
Options
Downloads
Patches
Plain Diff
static module to analyse stable ID mapping results
parent
bb149798
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
modules/Bio/EnsEMBL/IdMapping/ResultAnalyser.pm
+519
-0
519 additions, 0 deletions
modules/Bio/EnsEMBL/IdMapping/ResultAnalyser.pm
with
519 additions
and
0 deletions
modules/Bio/EnsEMBL/IdMapping/ResultAnalyser.pm
0 → 100644
+
519
−
0
View file @
876d7b58
package
Bio::EnsEMBL::IdMapping::
ResultAnalyser
;
=head1 NAME
=head1 SYNOPSIS
=head1 DESCRIPTION
=head1 METHODS
=head1 LICENCE
This code is distributed under an Apache style licence. Please see
http:#www.ensembl.org/info/about/code_licence.html for details.
=head1 AUTHOR
Patrick Meidl <meidl@ebi.ac.uk>, Ensembl core API team
=head1 CONTACT
Please post comments/questions to the Ensembl development list
<ensembl-dev@ebi.ac.uk>
=cut
use
strict
;
use
warnings
;
no
warnings
'
uninitialized
';
use
Bio::EnsEMBL::IdMapping::
BaseObject
;
our
@ISA
=
qw(Bio::EnsEMBL::IdMapping::BaseObject)
;
use
Bio::EnsEMBL::Utils::
Exception
qw(throw warning)
;
use
Bio::EnsEMBL::Utils::
ScriptUtils
qw(path_append)
;
sub
analyse
{
my
$self
=
shift
;
my
$gene_mappings
=
shift
;
my
$similarity_events
=
shift
;
# argument check
unless
(
$gene_mappings
and
$gene_mappings
->
isa
('
Bio::EnsEMBL::IdMapping::MappingList
'))
{
throw
("
Need a Bio::EnsEMBL::IdMapping::MappingList of genes.
");
}
unless
(
$similarity_events
and
ref
(
$similarity_events
)
eq
'
ARRAY
')
{
throw
("
Need a list of similarity events.
");
}
# classify source genes by type (status-logic_name-biotype)
$self
->
classify_source_genes_by_type
;
# classify source genes by mapping status
$self
->
classify_genes_by_mapping
(
$gene_mappings
,
$similarity_events
);
}
#
# Analyse stable ID data from two existing dbs.
# This is for potential stand-alone use of this module.
#
# [todo]
sub
analyse_db
{
}
sub
classify_source_genes_by_type
{
my
$self
=
shift
;
foreach
my
$s_gene
(
values
%
{
$self
->
cache
->
get_by_name
('
genes_by_id
',
'
source
')
})
{
$self
->
add
('
source
',
$self
->
class_key
(
$s_gene
),
'
all
',
$s_gene
->
stable_id
);
}
}
sub
classify_target_genes_by_type
{
my
$self
=
shift
;
my
$gene_mappings
=
shift
;
# argument check
unless
(
$gene_mappings
and
$gene_mappings
->
isa
('
Bio::EnsEMBL::IdMapping::MappingList
'))
{
throw
("
Need a Bio::EnsEMBL::IdMapping::MappingList of genes.
");
}
my
%result
=
();
# firrst, create a lookup hash of source genes by target internal ID
my
%source_genes_by_target
=
();
foreach
my
$e
(
@
{
$gene_mappings
->
get_all_Entries
})
{
my
$s_gene
=
$self
->
cache
->
get_by_key
('
genes_by_id
',
'
source
',
$e
->
source
);
my
$t_gene
=
$self
->
cache
->
get_by_key
('
genes_by_id
',
'
target
',
$e
->
target
);
$source_genes_by_target
{
$t_gene
->
id
}
=
$s_gene
;
}
# now loop over target genes
foreach
my
$t_gene
(
values
%
{
$self
->
cache
->
get_by_name
('
genes_by_id
',
'
target
')
})
{
# check if target gene has all required properties set
unless
(
$t_gene
->
status
and
$t_gene
->
logic_name
and
$t_gene
->
biotype
)
{
$self
->
logger
->
warning
("
Missing data for target gene:
"
.
$t_gene
->
to_string
.
"
\n
",
1
);
}
my
$class
=
$self
->
class_key
(
$t_gene
);
# classify as '1' if mapped (using source gene's stable ID), otherwise '0'
if
(
my
$s_gene
=
$source_genes_by_target
{
$t_gene
->
id
})
{
$self
->
add
('
target
',
$class
,
'
mapped
',
$s_gene
->
stable_id
);
}
else
{
$self
->
add
('
target
',
$class
,
'
unmapped
',
$t_gene
->
stable_id
);
}
}
}
#
# genes will be classified as:
# - mapped
# - deleted
# - lost_similar
# - lost_definite
#
sub
classify_genes_by_mapping
{
my
$self
=
shift
;
my
$gene_mappings
=
shift
;
my
$similarity_events
=
shift
;
# argument check
unless
(
$gene_mappings
and
$gene_mappings
->
isa
('
Bio::EnsEMBL::IdMapping::MappingList
'))
{
throw
("
Need a Bio::EnsEMBL::IdMapping::MappingList of genes.
");
}
unless
(
$similarity_events
and
ref
(
$similarity_events
)
eq
'
ARRAY
')
{
throw
("
Need a list of similarity events.
");
}
# mapped genes
foreach
my
$e
(
@
{
$gene_mappings
->
get_all_Entries
})
{
my
$s_gene
=
$self
->
cache
->
get_by_key
('
genes_by_id
',
'
source
',
$e
->
source
);
$self
->
add
('
source
',
$self
->
class_key
(
$s_gene
),
'
mapped
',
$s_gene
->
stable_id
);
}
# lookup hash for similarities
my
%similar
=
();
foreach
my
$event
(
@
{
$similarity_events
})
{
my
(
$stable_id
)
=
split
("
\t
",
$event
);
$similar
{
$stable_id
}
=
1
;
}
# deleted genes
foreach
my
$s_gene
(
values
%
{
$self
->
cache
->
get_by_name
('
genes_by_id
',
'
source
')
})
{
my
$stable_id
=
$s_gene
->
stable_id
;
my
$class
=
$self
->
class_key
(
$s_gene
);
unless
(
$self
->
get
('
source
',
$class
,
'
mapped
',
$stable_id
))
{
# sub-classify as 'lost_similar' or 'lost_definite'
if
(
$similar
{
$stable_id
})
{
$self
->
add
('
source
',
$class
,
'
lost_similar
',
$stable_id
);
}
else
{
$self
->
add
('
source
',
$class
,
'
lost_definite
',
$stable_id
);
}
}
}
}
#
# Add a stable ID / property pair to a name/dbtype lookup hash.
#
# This datastructure is a bloat for some applications, but is general enough to
# be used as a lookup hash and to generate statistics (counts by type) and
# debug lists (dump by type).
#
sub
add
{
my
(
$self
,
$dbtype
,
$class
,
$subclass
,
$stable_id
,
$val
)
=
@_
;
# private method, so no argument check done for performance reasons
# default to a value of '1'
$val
=
1
unless
(
defined
(
$val
));
$self
->
{
$dbtype
}
->
{
$class
}
->
{
$subclass
}
->
{
$stable_id
}
=
$val
;
}
sub
get
{
my
(
$self
,
$dbtype
,
$class
,
$subclass
,
$stable_id
)
=
@_
;
# private method, so no argument check done for performance reasons
return
$self
->
{
$dbtype
}
->
{
$class
}
->
{
$subclass
}
->
{
$stable_id
};
}
sub
get_all_by_subclass
{
my
(
$self
,
$dbtype
,
$class
,
$subclass
)
=
@_
;
# argument check
throw
("
Need a dbtype (source|target).
")
unless
(
$dbtype
);
throw
("
Need a class.
")
unless
(
$class
);
throw
("
Need a subclass.
")
unless
(
$subclass
);
return
[
keys
%
{
$self
->
{
$dbtype
}
->
{
$class
}
->
{
$subclass
}
||
{}
}
];
}
sub
get_all_by_class
{
my
(
$self
,
$dbtype
,
$class
)
=
@_
;
# argument check
throw
("
Need a dbtype (source|target).
")
unless
(
$dbtype
);
throw
("
Need a class.
")
unless
(
$class
);
my
%merged
=
();
foreach
my
$subclass
(
keys
%
{
$self
->
{
$dbtype
}
->
{
$class
}
||
{}
})
{
while
(
my
(
$key
,
$val
)
=
each
(
%
{
$self
->
{
$dbtype
}
->
{
$class
}
->
{
$subclass
}
}))
{
$merged
{
$key
}
=
$val
;
}
}
return
[
keys
%merged
];
}
sub
get_count_by_subclass
{
my
(
$self
,
$dbtype
,
$class
,
$subclass
)
=
@_
;
# argument check
throw
("
Need a dbtype (source|target).
")
unless
(
$dbtype
);
throw
("
Need a class.
")
unless
(
$class
);
throw
("
Need a subclass.
")
unless
(
$subclass
);
return
scalar
(
keys
%
{
$self
->
{
$dbtype
}
->
{
$class
}
->
{
$subclass
}
||
{}
});
}
sub
get_count_by_class
{
my
(
$self
,
$dbtype
,
$class
)
=
@_
;
# argument check
throw
("
Need a dbtype (source|target).
")
unless
(
$dbtype
);
throw
("
Need a class.
")
unless
(
$class
);
return
scalar
(
@
{
$self
->
get_all_by_class
(
$dbtype
,
$class
)
});
}
sub
get_all_classes
{
my
(
$self
,
$dbtype
)
=
@_
;
# argument check
throw
("
Need a dbtype (source|target).
")
unless
(
$dbtype
);
return
[
sort
keys
%
{
$self
->
{
$dbtype
}
||
{}
}
];
}
sub
class_key
{
my
(
$self
,
$gene
)
=
@_
;
return
join
('
-
',
map
{
$gene
->
$_
}
qw(status logic_name biotype)
);
}
sub
write_results_to_file
{
my
$self
=
shift
;
my
$fh
=
$self
->
get_filehandle
('
gene_detailed_mapping_stats.txt
',
'
stats
');
my
$fmt1
=
"
%-60s%-16s%-16s%-16s
\n
";
my
$fmt2
=
"
%-60s%5.0f (%7s) %5.0f (%7s) %5.0f (%7s)
\n
";
my
$fmt3
=
"
%3.2f%%
";
print
$fh
"
Gene detailed mapping results:
\n\n
";
print
$fh
sprintf
(
$fmt1
,
"
Gene type
",
"
mapped
",
"
lost (similar)
",
"
lost (definite)
");
print
$fh
('
-
'
x108
),
"
\n
";
foreach
my
$class
(
@
{
$self
->
get_all_classes
('
source
')
})
{
next
if
(
$class
eq
'
all
');
my
$total
=
$self
->
get_count_by_class
('
source
',
$class
);
# avoid division by zero error
unless
(
$total
)
{
$self
->
logger
->
warning
("
No count found for
$class
.
\n
",
1
);
next
;
}
my
$mapped
=
$self
->
get_count_by_subclass
('
source
',
$class
,
'
mapped
');
my
$similar
=
$self
->
get_count_by_subclass
('
source
',
$class
,
'
lost_similar
');
my
$lost
=
$self
->
get_count_by_subclass
('
source
',
$class
,
'
lost_definite
');
print
$fh
sprintf
(
$fmt2
,
$class
,
$mapped
,
sprintf
(
$fmt3
,
$mapped
/
$total
*
100
),
$similar
,
sprintf
(
$fmt3
,
$similar
/
$total
*
100
),
$lost
,
sprintf
(
$fmt3
,
$lost
/
$total
*
100
));
}
close
(
$fh
);
}
sub
create_clicklist
{
my
$self
=
shift
;
my
$fh
=
$self
->
get_filehandle
('
genes_lost.html
',
'
stats
');
# start html output
print
$fh
qq(<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n)
;
print
$fh
qq(<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-gb" lang="en-gb">)
;
print
$fh
"
<head>
\n
";
print
$fh
"
<title>Lost genes
";
print
$fh
$self
->
conf
->
param
('
sourcedbname
'),
'
->
',
$self
->
conf
->
param
('
targetdbname
');
print
$fh
"
</title>
\n
";
print
$fh
"
</head>
\n
<body>
\n
";
my
$prefix
=
$self
->
conf
->
param
('
urlprefix
');
unless
(
$prefix
)
{
$self
->
logger
->
warning
("
No urlprefix set, clicklists might not be useable.
\n
",
1
);
}
my
$navigation
;
my
$clicklist
;
foreach
my
$class
(
@
{
$self
->
get_all_classes
('
source
')
})
{
next
if
(
$class
eq
'
all
');
$navigation
.=
"
$class
";
$clicklist
.=
"
<h1>
$class
</h1>
\n
";
foreach
my
$subclass
(
qw(lost_similar lost_definite)
)
{
# navigation
$navigation
.=
qq(<a href="#${class}-$subclass">$subclass</a> )
;
# clicklist
$clicklist
.=
"
<h2>
$subclass
</h2>
\n
";
foreach
my
$stable_id
(
@
{
$self
->
get_all_by_subclass
('
source
',
$class
,
$subclass
)
})
{
$clicklist
.=
qq(<a href="${prefix}$stable_id">$stable_id</a><br />\n)
;
}
}
$navigation
.=
"
<br />
\n
";
}
# print navigation and clicklist
print
$fh
"
$navigation
\n\n
";
print
$fh
"
$clicklist
\n\n
";
# html footer
print
$fh
"
</body></html>
\n
";
close
(
$fh
);
}
sub
create_summary_email
{
my
$self
=
shift
;
my
$fh
=
$self
->
get_filehandle
('
summary_email.txt
');
#
# title
#
print
$fh
qq(Stable ID mapping results\n)
;
print
$fh
qq(=========================\n\n)
;
#
# parameters used for this run
#
print
$fh
$self
->
conf
->
list_param_values
;
print
$fh
"
\n
";
#
# mapping stats
#
foreach
my
$type
(
qw(exon transcript translation gene_detailed)
)
{
my
$filename
=
"
${type}
_mapping_stats.txt
";
if
(
$self
->
file_exists
(
$filename
,
'
stats
'))
{
print
$fh
$self
->
read_from_file
(
$filename
,
'
stats
');
print
$fh
"
\n\n
";
}
else
{
print
$fh
"
No mapping stats found for
$type
.
\n\n
";
}
}
#
# db uploads
#
my
@uploads
=
(
['
stable_ids
'
=>
'
Stable IDs
'],
['
events
'
=>
'
Stable ID events and mapping session
'],
['
archive
'
=>
'
Gene and peptide archive
'],
);
my
$fmt1
=
"
%-40s%-20s
\n
";
print
$fh
qq(Data uploaded to db:\n)
;
print
$fh
qq(====================\n\n)
;
if
(
$self
->
conf
->
param
('
dry_run
'))
{
print
$fh
"
None (dry run).
\n
";
}
else
{
foreach
my
$u
(
@uploads
)
{
my
$uploaded
=
'
no
';
$uploaded
=
'
yes
'
if
(
$self
->
conf
->
is_true
("
upload_
"
.
$u
->
[
0
]));
print
$fh
sprintf
(
$fmt1
,
$u
->
[
1
],
$uploaded
);
}
}
print
$fh
"
\n
";
#
# stats and clicklist
#
my
@output
=
(
['
stats
'
=>
'
statistics (including clicklists of deleted IDs)
'],
['
debug
'
=>
'
detailed mapping output for debugging
'],
['
tables
'
=>
'
data files for db upload
'],
);
my
$fmt2
=
"
%-20s%-50s
\n
";
print
$fh
qq(\nOutput directories:\n)
;
print
$fh
qq(===================\n\n)
;
print
$fh
sprintf
(
$fmt2
,
qw(DIRECTORY DESCRIPTION)
);
print
$fh
('
-
'
x72
),
"
\n
";
print
$fh
sprintf
(
$fmt2
,
'
basedir
',
$self
->
conf
->
param
('
dumppath
'));
foreach
my
$o
(
@output
)
{
print
$fh
sprintf
(
$fmt2
,
'
$basedir/
'
.
$o
->
[
0
],
$o
->
[
1
]);
}
print
$fh
"
\n
";
#
# clicklist of first 10 deleted genes
#
print
$fh
qq(\nFirst 10 deleted known genes:\n)
;
print
$fh
qq(=============================\n\n)
;
my
$in_fh
=
$self
->
get_filehandle
('
genes_lost.txt
',
'
debug
',
'
<
');
my
$prefix
=
$self
->
conf
->
param
('
urlprefix
');
my
$i
;
while
(
<
$in_fh
>
)
{
last
if
(
++
$i
>
10
);
chomp
;
my
(
$stable_id
,
$type
)
=
split
(
/\s+/
);
next
unless
(
$type
eq
'
known
');
print
$fh
sprintf
(
$fmt2
,
$stable_id
,
"
${prefix}$stable_id
");
}
close
(
$in_fh
);
print
$fh
"
\n
";
#
# timing
#
print
$fh
"
\n
Runtime:
\n
";
print
$fh
"
========
\n\n
";
print
$fh
$self
->
logger
->
runtime
,
"
\n\n
";
close
(
$fh
);
}
sub
read_from_file
{
my
$self
=
shift
;
my
$filename
=
shift
;
my
$append
=
shift
;
my
$in_fh
=
$self
->
get_filehandle
(
$filename
,
$append
,
'
<
');
my
$txt
;
while
(
<
$in_fh
>
)
{
$txt
.=
$_
;
}
return
$txt
;
}
1
;
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment