Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
ensembl
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Requirements
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package Registry
Container Registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Tamara El Naboulsi
ensembl
Commits
cf7bf886
Commit
cf7bf886
authored
11 years ago
by
Alessandro Vullo
Browse files
Options
Downloads
Patches
Plain Diff
Finished, documented and run tests
parent
12892190
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
modules/t/gtfSerialiser.t
+115
-22
115 additions, 22 deletions
modules/t/gtfSerialiser.t
with
115 additions
and
22 deletions
modules/t/gtfSerialiser.t
+
115
−
22
View file @
cf7bf886
#!/usr/bin/env perl
#
# gtfSerializer.t
#
# Test Bio::EnsEMBL::Utils::IO::GTFSerializer.
#
# Strategy:
#
# In theory, we should know the GTF output for some
# transcripts and compare it against the output of the
# serializer. In practice, this is difficult and very
# time consuming.
# One way to circumnvent this is to start from the observation
# that the serializer was built from the ashes of the web
# script ensembl-webcode/utils/ensembl2gtf_forweb used by
# production to get GTFs for last release (71).
# The strategy is then to download the GTF files for all last release
# species from the FTP site, and for each file compare its output
# relative to transcripts with the corresponding output of the
# serializer.
#
# Notes:
#
# - This strategy only works for testing the GTFSerializer during
# the transition from 71 to 72, as after the current release the
# GTF files will be produced using the GTFSerializer itself via
# the Hive pipeline.
#
# - The first run of the test reports 9 out 610 failuere for certain
# species (canis_familiaris cavia_porcellus danio_rerio equus_caballus
# loxodonta_africana ornithorhynchus_anatinus oryctolagus_cuniculus
# xenopus_tropicalis). This is because the GTFSerializer has corrected
# a bug of the original script, which was using an incorrect codon
# table assumption in method check_start_and_stop which is not working
# for all species.
# Hence, this failures are ok.
#
use
strict
;
use
warnings
;
use
Test::
More
;
use
Data::
Dumper
;
use
Test::
Differences
;
use
IO::
String
;
use
Net::
FTP
;
use
Bio::EnsEMBL::
Registry
;
use
Bio::EnsEMBL::
ApiVersion
qw/software_version/
;
use
Bio::EnsEMBL::Utils::
IO
qw/gz_work_with_file iterate_lines/
;
use
Bio::EnsEMBL::Test::
MultiTestDB
;
use
Bio::EnsEMBL::Utils::
IO
qw/gz_work_with_file/
;
# iterate_lines
# use Bio::EnsEMBL::Test::MultiTestDB;
use
Bio::EnsEMBL::Utils::
Exception
;
use
Bio::EnsEMBL::Utils::IO::
GTFSerializer
;
use
IO::
String
;
use
Net::
FTP
;
my
$current_release
=
software_version
();
my
$last_release
=
$current_release
-
1
;
my
$release
=
$last_release
;
# my $current_release = software_version();
# my $last_release = $current_release - 1;
my
$release
=
71
;
my
$registry
=
'
Bio::EnsEMBL::Registry
';
eval
{
$registry
->
load_registry_from_db
(
-
host
=>
'
mysql-ensembl-mirror.ebi.ac.uk
',
-
port
=>
4240
,
-
db_version
=>
$release
,
-
user
=>
'
anonymous
',
-
verbose
=>
0
)
};
throw
"
Can't connect to db host
"
if
$@
;
SKIP:
{
note
"
Connecting to Ensembl FTP site
";
my
$ftp
=
eval
{
ensembl_ftp_connect
()
};
skip
"
Cannot login to Ensembl FTP site. We cannot continue with the tests
"
skip
"
Cannot login to Ensembl FTP site. We cannot continue with the tests
"
,
1
if
$@
;
note
"
Retrieving species list
";
my
$species_list
=
eval
{
get_species_list
(
$ftp
,
$release
)
};
skip
"
Cannot retrieve GTF species list from FTP site (release
$release
). We cannot continue with the tests.
"
skip
"
Cannot retrieve GTF species list from FTP site (release
$release
). We cannot continue with the tests.
"
,
1
if
$@
;
skip
"
Empty GTF species list from FTP site (release
$release
). We cannot continue with the tests
"
skip
"
Empty GTF species list from FTP site (release
$release
). We cannot continue with the tests
"
,
1
unless
scalar
$species_list
;
foreach
my
$species
(
@
{
$species_list
})
{
note
"
Testing GTF dumps for species
$species
";
my
$species_gtf
=
retrieve_gtf_for_species
(
$ftp
,
$release
,
$species
);
skip
"
Cannot retrieve GTF for
$species
. We cannot continue with the tests
"
skip
"
Cannot retrieve GTF for
$species
. We cannot continue with the tests
"
,
1
unless
$species_gtf
;
note
"
Fetching GTF for 10 transcripts randomly chosen
";
my
$transcripts_gtf
=
get_random_transcripts_gtf
(
$species_gtf
,
10
);
note
"
Fetching GTF for 10 randomly chosen transcripts
";
my
$transcripts_gtf
=
eval
{
get_random_transcripts_gtf
(
$species_gtf
,
10
)
};
skip
"
Couldn't fetch 10 transcript. We cannot continue the test.
",
1
unless
scalar
keys
%
{
$transcripts_gtf
}
==
10
;
note
"
Comparing GTF serializer output with release
$release
data for
$species
";
compare_transcript_output
(
$species
,
$transcripts_gtf
);
unlink
$species_gtf
;
}
...
...
@@ -45,19 +106,49 @@ SKIP: {
done_testing
();
sub
compare_transcript_output
{
my
(
$species
,
$transcripts_gtf
)
=
@_
;
my
$transcript_adaptor
=
$registry
->
get_adaptor
(
$species
,
'
Core
',
'
Transcript
'
);
foreach
my
$transcript_id
(
keys
%
{
$transcripts_gtf
})
{
my
$transcript
=
$transcript_adaptor
->
fetch_by_stable_id
(
$transcript_id
);
defined
$transcript
or
$transcript
=
$transcript_adaptor
->
fetch_by_dbID
(
$transcript_id
);
skip
"
Cannot retrieve transcript
$transcript_id
. Skipping test
",
1
unless
defined
$transcript
;
my
$fh
=
IO::
String
->
new
();
my
$gtf_serializer
=
Bio::EnsEMBL::Utils::IO::
GTFSerializer
->
new
(
$fh
);
$gtf_serializer
->
print_feature
(
$transcript
);
is
(
$
{
$fh
->
string_ref
()},
$transcripts_gtf
->
{
$transcript_id
},
"
Transcript
$transcript_id
serialises to GTF as expected
");
# if ($eq_out == 0) {
# diag ${$fh->string_ref()};
# diag $transcripts_gtf->{$transcript_id};
# }
}
}
sub
get_random_transcripts_gtf
{
my
(
$gtf_file
,
$how_many_transcripts
)
=
@_
;
my
$transcripts_gtf
;
gz_work_with_file
(
$gtf_file
,
'
r
',
sub
{
my
(
$fh
)
=
@_
;
while
(
my
$line
=
<
$fh
>
and
scalar
keys
%
{
$transcripts_gtf
}
<
$how_many_transcripts
)
{
if
(
random
>
.5
)
{
chomp
(
$line
);
$line
=~
/transcript_id \"(.+?)\"/
;
# double quotes?
$transcripts_gtf
->
{
$
1
}
=
$line
;
}
my
$last_insert
;
while
(
defined
$fh
and
my
$line
=
<
$fh
>
and
scalar
keys
%
{
$transcripts_gtf
}
<=
$how_many_transcripts
)
{
$line
=~
/transcript_id \"(.+?)\"/
;
$transcripts_gtf
->
{
$
1
}
.
=
$line
;
$last_insert
=
$
1
;
}
delete
$transcripts_gtf
->
{
$last_insert
};
return
;
});
return
$transcripts_gtf
;
...
...
@@ -70,6 +161,8 @@ sub retrieve_gtf_for_species {
$ftp
->
cwd
("
/pub/release-
$release
/gtf/
$species
");
my
@gtf_files
=
grep
{
$_
=~
/gtf/
}
@
{
$ftp
->
ls
()};
throw
("
No GTF file detected for
$species
")
unless
scalar
@gtf_files
;
throw
("
More than one GTF file detected for
$species
")
if
scalar
@gtf_files
>
1
;
...
...
@@ -125,4 +218,4 @@ sub ensembl_ftp_connect {
# is(${$fh->string_ref()}, $expected, 'Gene serialises to GFF3 as expected');
# }
# done_testing();
\ No newline at end of file
# done_testing();
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment