Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Open sidebar
ensembl-gh-mirror
ensembl-io
Commits
47560a27
Commit
47560a27
authored
Feb 14, 2019
by
Tiago Grego
Browse files
added harmony tests for GFF and GTF, prototype for FASTA and BED
parent
7335ac40
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
272 additions
and
0 deletions
+272
-0
modules/t/utils_io/harmony.t
modules/t/utils_io/harmony.t
+272
-0
No files found.
modules/t/utils_io/harmony.t
0 → 100644
View file @
47560a27
# Copyright [2018-2019] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
use
strict
;
use
warnings
;
use
FindBin
qw/$Bin/
;
use
Test::
More
;
use
Test::
Deep
;
use
Test::
Differences
;
use
Test::
Exception
;
use
IO::
String
;
use
Bio::EnsEMBL::Test::
MultiTestDB
;
use
Bio::EnsEMBL::
CoordSystem
;
use
Bio::EnsEMBL::
Slice
;
use
Bio::EnsEMBL::Utils::IO::
FASTASerializer
;
use
Bio::EnsEMBL::Utils::IO::
GFFSerializer
;
use
Bio::EnsEMBL::IO::
Writer
;
use
Bio::EnsEMBL::IO::Translator::
Gene
;
use
Bio::EnsEMBL::IO::Writer::
GFF3
;
use
Bio::EnsEMBL::IO::Object::
GXFMetadata
;
use
Bio::EnsEMBL::Utils::IO::
GTFSerializer
;
use
Bio::EnsEMBL::IO::Writer::
GTF
;
use
Bio::EnsEMBL::Utils::IO::
BEDSerializer
;
use
Bio::EnsEMBL::IO::Writer::
BED
;
use
Smart::
Comments
;
# Get a DBAdaptor to from the test system
my
$multi
=
Bio::EnsEMBL::Test::
MultiTestDB
->
new
(
undef
,
"
$Bin
/..
");
ok
(
$multi
,
"
Test DB loaded successfully
");
my
$db
=
$multi
->
get_DBAdaptor
("
core
");
ok
(
$db
,
"
Core DB adaptor loaded successfully
");
# fetch a protein_coding gene object
my
$ga
=
$db
->
get_GeneAdaptor
();
my
$genes
=
$ga
->
fetch_all
;
is
(
ref
@
{
$genes
}[
0
],
'
Bio::EnsEMBL::Gene
',
"
Gene objects loaded successfully
");
# to test with a single gene
# my $gene = $ga->fetch_by_stable_id("ENSG00000171456");
# ok($gene, "Gene object loaded successfully");
# # FASTA
# {
# my ($utils_output, $ensio_output);
# open my $utils_fh, '>', \$utils_output;
# my $utils_fasta_writer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($utils_fh); # No need to fiddle with chunk sizes at this point
# # foreach my $feature (@$features) {
# $utils_fasta_writer->print_Seq($gene->feature_Slice);
# # }
# close $utils_fh;
# ## $utils_output
# open my $ensio_fh, '>', \$ensio_output;
# my $ensio_fasta_writer = Bio::EnsEMBL::IO::Writer->new('Fasta', $ensio_fh);
# my $translator = Bio::EnsEMBL::IO::Translator::Gene->new();
# # foreach my $feature (@$features) {
# $ensio_fasta_writer->write($translator);
# # }
# ### $utils_output
# close $ensio_output;
# is($utils_output, $ensio_output, "FASTA harmonization test pass");
# }
# GFF
{
my
(
$utils_output
,
$ensio_output
);
open
my
$utils_fh
,
'
>
',
\
$utils_output
;
my
$utils_gff_writer
=
Bio::EnsEMBL::Utils::IO::
GFFSerializer
->
new
(
$utils_fh
);
# No need to fiddle with chunk sizes at this point
# $utils_gff_writer->print_main_header([$gene->feature_Slice()]);
$utils_gff_writer
->
print_main_header
;
foreach
my
$gene
(
@$genes
)
{
$utils_gff_writer
->
print_feature
(
$gene
);
}
close
$utils_fh
;
open
my
$ensio_fh
,
'
>
',
\
$ensio_output
;
my
$translator
=
Bio::EnsEMBL::IO::Translator::
Gene
->
new
();
my
$serializer
=
Bio::EnsEMBL::IO::Writer::
GFF3
->
new
(
$translator
);
$serializer
->
open
(
$ensio_fh
);
# Write headers
$serializer
->
write
(
Bio::EnsEMBL::IO::Object::
GXFMetadata
->
directive
('
gff-version
',
3
));
# $serializer->write(Bio::EnsEMBL::IO::Object::GXFMetadata->ens_directive('genome-build', 'ensembl', 'GRCh38'));
foreach
my
$gene
(
@$genes
)
{
$serializer
->
write
(
$gene
);
}
close
$ensio_fh
;
is
(
$utils_output
,
$ensio_output
,
"
GFF3 harmonization test pass
");
}
# GTF
{
my
(
$utils_output
,
$ensio_output
);
open
my
$utils_fh
,
'
>
',
\
$utils_output
;
my
$utils_gtf_writer
=
Bio::EnsEMBL::Utils::IO::
GTFSerializer
->
new
(
$utils_fh
);
# No need to fiddle with chunk sizes at this point
# $utils_gtf_writer->print_main_header([$gene->feature_Slice()]);
# $utils_gtf_writer->print_main_header($db);
foreach
my
$gene
(
@$genes
)
{
$utils_gtf_writer
->
print_Gene
(
$gene
);
}
my
@utils_output
=
split
(
/\n/
,
$utils_output
);
my
$utils_output_clean
=
shift
@utils_output
;
foreach
my
$line
(
@utils_output
)
{
if
(
$line
=~
/^\d+\t\w+\tgene\t/
)
{
$utils_output_clean
.=
"
$line
\n
";
}
}
my
$utils_data
;
foreach
my
$line
(
split
(
/\n/
,
$utils_output_clean
))
{
my
@line_data
=
split
(
/\t/
,
$line
);
my
$last_field
=
pop
@line_data
;
my
@fields
=
split
(
/;/
,
$last_field
);
my
$hash_data
=
{};
foreach
my
$data
(
@fields
)
{
my
(
$key
,
$value
)
=
$data
=~
/\s?(\w*)\s+\"?(\w*)\"?/
;
$hash_data
->
{
$key
}
=
$value
;
}
push
(
@line_data
,
$hash_data
);
push
@
{
$utils_data
},
[
@line_data
];
}
close
$utils_fh
;
open
my
$ensio_fh
,
'
>
',
\
$ensio_output
;
my
$translator
=
Bio::EnsEMBL::IO::Translator::
Gene
->
new
();
my
$serializer
=
Bio::EnsEMBL::IO::Writer::
GTF
->
new
(
$translator
);
$serializer
->
open
(
$ensio_fh
);
# Write headers
# $serializer->write(Bio::EnsEMBL::IO::Object::GXFMetadata->directive('gff-version', 3));
# $serializer->write(Bio::EnsEMBL::IO::Object::GXFMetadata->ens_directive('genome-build', 'ensembl', 'GRCh38'));
foreach
my
$gene
(
@$genes
)
{
$serializer
->
write
(
$gene
);
}
my
$ensio_data
;
foreach
my
$line
(
split
(
/\n/
,
$utils_output_clean
))
{
my
@line_data
=
split
(
/\t/
,
$line
);
my
$last_field
=
pop
@line_data
;
my
@fields
=
split
(
/;/
,
$last_field
);
my
$hash_data
=
{};
foreach
my
$data
(
@fields
)
{
my
(
$key
,
$value
)
=
$data
=~
/\s?(\w*)\s+\"?(\w*)\"?/
;
$hash_data
->
{
$key
}
=
$value
;
}
push
(
@line_data
,
$hash_data
);
push
@
{
$ensio_data
},
[
@line_data
];
}
close
$ensio_fh
;
is_deeply
(
$utils_data
,
$ensio_data
,
"
GTF harmonization test pass
");
}
# # BED
# {
# my ($utils_output, $ensio_output);
# open my $utils_fh, '>', \$utils_output;
# my $utils_bed_writer = Bio::EnsEMBL::Utils::IO::BEDSerializer->new($utils_fh); # No need to fiddle with chunk sizes at this point
# # $utils_gtf_writer->print_main_header([$gene->feature_Slice()]);
# # $utils_bed_writer->print_main_header($db);
# # foreach my $transcript ( @{$gene->get_all_Transcripts} ) {
# $utils_bed_writer->print_feature($gene);
# # }
# ### $utils_output
# open my $ensio_fh, '>', \$ensio_output;
# my $translator = Bio::EnsEMBL::IO::Translator::Gene->new();
# my $serializer = Bio::EnsEMBL::IO::Writer::BED->new($translator);
# # my $serializer = Bio::EnsEMBL::IO::Writer->new('Bed', $translator);
# # $translator->add_callbacks( { type => sub { return 'fake_type' } } );
# # $translator->add_callbacks( { type => sub { return 'region' } } );
# $serializer->open($ensio_fh);
# # Write headers
# # $serializer->write(Bio::EnsEMBL::IO::Object::GXFMetadata->directive('gff-version', 3));
# # $serializer->write(Bio::EnsEMBL::IO::Object::GXFMetadata->ens_directive('genome-build', 'ensembl', 'GRCh38'));
# # foreach my $transcript ( @{$gene->get_all_Transcripts} ) {
# $serializer->write($gene);
# # }
# close $ensio_fh;
# ### $ensio_output
# is($utils_output, $ensio_output, "BED harmonization test pass");
# }
done_testing
();
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment