added harmony tests for GFF and GTF, prototype for FASTA and BED

47560a27 · Tiago Grego · 7335ac40 · 47560a27
Commit 47560a27 authored 6 years ago by Tiago Grego
--- a/modules/t/utils_io/harmony.t
+++ b/modules/t/utils_io/harmony.t
+# Copyright [2018-2019] EMBL-European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+use strict;
+use warnings;
+
+use FindBin qw/$Bin/;
+use Test::More;
+use Test::Deep;
+use Test::Differences;
+use Test::Exception;
+
+use IO::String;
+
+use Bio::EnsEMBL::Test::MultiTestDB;
+use Bio::EnsEMBL::CoordSystem;
+use Bio::EnsEMBL::Slice;
+
+
+use Bio::EnsEMBL::Utils::IO::FASTASerializer;
+use Bio::EnsEMBL::Utils::IO::GFFSerializer;
+
+use Bio::EnsEMBL::IO::Writer;
+use Bio::EnsEMBL::IO::Translator::Gene;
+use Bio::EnsEMBL::IO::Writer::GFF3;
+use Bio::EnsEMBL::IO::Object::GXFMetadata;
+
+use Bio::EnsEMBL::Utils::IO::GTFSerializer;
+use Bio::EnsEMBL::IO::Writer::GTF;
+
+use Bio::EnsEMBL::Utils::IO::BEDSerializer;
+use Bio::EnsEMBL::IO::Writer::BED;
+
+use Smart::Comments;
+
+
+# Get a DBAdaptor to from the test system
+my $multi = Bio::EnsEMBL::Test::MultiTestDB->new(undef, "$Bin/..");
+ok($multi, "Test DB loaded successfully");
+my $db = $multi->get_DBAdaptor("core");
+ok($db, "Core DB adaptor loaded successfully");
+
+
+# fetch a protein_coding gene object
+my $ga = $db->get_GeneAdaptor();
+my $genes = $ga->fetch_all;
+is(ref @{$genes}[0], 'Bio::EnsEMBL::Gene', "Gene objects loaded successfully");
+
+
+
+# to test with a single gene
+
+# my $gene = $ga->fetch_by_stable_id("ENSG00000171456");
+# ok($gene, "Gene object loaded successfully");
+
+
+
+
+# # FASTA
+# {
+#   my ($utils_output, $ensio_output);
+
+#   open my $utils_fh, '>', \$utils_output;
+#   my $utils_fasta_writer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($utils_fh); # No need to fiddle with chunk sizes at this point
+
+#     # foreach my $feature (@$features) {
+#       $utils_fasta_writer->print_Seq($gene->feature_Slice);
+#     # }
+
+#   close $utils_fh;
+
+#   ## $utils_output
+
+#   open my $ensio_fh, '>', \$ensio_output;
+
+#   my $ensio_fasta_writer = Bio::EnsEMBL::IO::Writer->new('Fasta', $ensio_fh);
+
+# my $translator = Bio::EnsEMBL::IO::Translator::Gene->new();
+#   # foreach my $feature (@$features) {
+#     $ensio_fasta_writer->write($translator);
+#   # }
+
+#   ### $utils_output
+
+#   close $ensio_output;
+
+
+#   is($utils_output, $ensio_output, "FASTA harmonization test pass");
+# }
+
+
+
+# GFF
+{
+  my ($utils_output, $ensio_output);
+
+  open my $utils_fh, '>', \$utils_output;
+  my $utils_gff_writer = Bio::EnsEMBL::Utils::IO::GFFSerializer->new($utils_fh); # No need to fiddle with chunk sizes at this point
+
+  # $utils_gff_writer->print_main_header([$gene->feature_Slice()]);
+  $utils_gff_writer->print_main_header;
+  foreach my $gene (@$genes) {
+    $utils_gff_writer->print_feature($gene);
+  }
+  close $utils_fh;
+
+
+
+  open my $ensio_fh, '>', \$ensio_output;
+
+  my $translator = Bio::EnsEMBL::IO::Translator::Gene->new();
+  my $serializer = Bio::EnsEMBL::IO::Writer::GFF3->new($translator);
+
+  $serializer->open($ensio_fh);
+
+  # Write headers
+  $serializer->write(Bio::EnsEMBL::IO::Object::GXFMetadata->directive('gff-version', 3));
+  # $serializer->write(Bio::EnsEMBL::IO::Object::GXFMetadata->ens_directive('genome-build', 'ensembl', 'GRCh38'));
+
+  foreach my $gene (@$genes) {
+    $serializer->write($gene);
+  }
+
+  close $ensio_fh;
+
+
+  is($utils_output, $ensio_output, "GFF3 harmonization test pass");
+}
+
+
+
+
+
+# GTF
+{
+  my ($utils_output, $ensio_output);
+
+  open my $utils_fh, '>', \$utils_output;
+  my $utils_gtf_writer = Bio::EnsEMBL::Utils::IO::GTFSerializer->new($utils_fh); # No need to fiddle with chunk sizes at this point
+
+  # $utils_gtf_writer->print_main_header([$gene->feature_Slice()]);
+  # $utils_gtf_writer->print_main_header($db);
+  foreach my $gene (@$genes) {
+    $utils_gtf_writer->print_Gene($gene);
+  }
+
+  my @utils_output = split (/\n/, $utils_output);
+  my $utils_output_clean = shift @utils_output;
+
+  foreach my $line (@utils_output) {
+   if ($line =~ /^\d+\t\w+\tgene\t/ ) {
+      $utils_output_clean .= "$line\n";
+    }
+  }
+
+  my $utils_data;
+  foreach my $line (split (/\n/, $utils_output_clean)) {
+
+    my @line_data = split (/\t/, $line);
+
+    my $last_field = pop @line_data;
+    my @fields = split(/;/, $last_field);
+
+    my $hash_data = {};
+    foreach my $data (@fields) {
+      my ($key, $value) = $data =~ /\s?(\w*)\s+\"?(\w*)\"?/;
+      $hash_data->{$key} = $value;
+    }
+    push (@line_data, $hash_data);
+    push @{$utils_data}, [@line_data];
+  }
+
+  close $utils_fh;
+
+
+
+  open my $ensio_fh, '>', \$ensio_output;
+
+  my $translator = Bio::EnsEMBL::IO::Translator::Gene->new();
+  my $serializer = Bio::EnsEMBL::IO::Writer::GTF->new($translator);
+
+  $serializer->open($ensio_fh);
+
+  # Write headers
+  # $serializer->write(Bio::EnsEMBL::IO::Object::GXFMetadata->directive('gff-version', 3));
+  # $serializer->write(Bio::EnsEMBL::IO::Object::GXFMetadata->ens_directive('genome-build', 'ensembl', 'GRCh38'));
+
+  foreach my $gene (@$genes) {
+    $serializer->write($gene);
+  }
+
+  my $ensio_data;
+  foreach my $line (split (/\n/, $utils_output_clean)) {
+
+    my @line_data = split (/\t/, $line);
+
+    my $last_field = pop @line_data;
+    my @fields = split(/;/, $last_field);
+
+    my $hash_data = {};
+    foreach my $data (@fields) {
+      my ($key, $value) = $data =~ /\s?(\w*)\s+\"?(\w*)\"?/;
+      $hash_data->{$key} = $value;
+    }
+    push (@line_data, $hash_data);
+    push @{$ensio_data}, [@line_data];
+  }
+
+  close $ensio_fh;
+
+
+  is_deeply($utils_data, $ensio_data, "GTF harmonization test pass");
+}
+
+
+
+
+
+
+# # BED
+# {
+#   my ($utils_output, $ensio_output);
+
+#   open my $utils_fh, '>', \$utils_output;
+#   my $utils_bed_writer = Bio::EnsEMBL::Utils::IO::BEDSerializer->new($utils_fh); # No need to fiddle with chunk sizes at this point
+
+#   # $utils_gtf_writer->print_main_header([$gene->feature_Slice()]);
+#   # $utils_bed_writer->print_main_header($db);
+#   # foreach my $transcript ( @{$gene->get_all_Transcripts} ) {
+#     $utils_bed_writer->print_feature($gene);
+#   # }
+# ### $utils_output
+
+#   open my $ensio_fh, '>', \$ensio_output;
+
+#   my $translator = Bio::EnsEMBL::IO::Translator::Gene->new();
+#   my $serializer = Bio::EnsEMBL::IO::Writer::BED->new($translator);
+
+#   # my $serializer = Bio::EnsEMBL::IO::Writer->new('Bed', $translator);
+
+#   # $translator->add_callbacks( { type => sub { return 'fake_type' } } );
+#   # $translator->add_callbacks( { type => sub { return 'region' } } );
+#   $serializer->open($ensio_fh);
+
+#   # Write headers
+#   # $serializer->write(Bio::EnsEMBL::IO::Object::GXFMetadata->directive('gff-version', 3));
+#   # $serializer->write(Bio::EnsEMBL::IO::Object::GXFMetadata->ens_directive('genome-build', 'ensembl', 'GRCh38'));
+
+#   # foreach my $transcript ( @{$gene->get_all_Transcripts} ) {
+#     $serializer->write($gene);
+#   # }
+
+#   close $ensio_fh;
+# ### $ensio_output
+
+#   is($utils_output, $ensio_output, "BED harmonization test pass");
+# }
+
+
+
+done_testing();