Commit 788f52ae authored by Tiago Grego's avatar Tiago Grego
Browse files

import of ensembl Utils-IO tests

parent d2992acd
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2019] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
use strict;
use warnings;
use Test::More;
use Test::Warnings;
use Test::Exception;
use IO::String;
use Bio::EnsEMBL::Utils::IO::FASTASerializer;
use Bio::EnsEMBL::Test::TestUtils qw/warns_like/;
use Bio::EnsEMBL::Slice;
use Bio::EnsEMBL::CoordSystem;
use Bio::Seq;
#
# TEST - Slice creation from adaptor
#
my $coord_system = Bio::EnsEMBL::CoordSystem->new(
-NAME => 'chromosome', -RANK => 1
);
# instantiate slice
#SEQ COORD_SYSTEM SEQ_REGION_NAME SEQ_REGION_LENGTH
# START END STRAND ADAPTOR EMPTY
my $slice = Bio::EnsEMBL::Slice->new(
-SEQ_REGION_NAME => "top_banana",
-COORD_SYSTEM => $coord_system,
-STRAND => 1,
-START => 110,
-END => 199,
-SEQ_REGION_LENGTH => 90,
-SEQ => "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACGCGCGCGCGGGA",
);
# ensure Serializer produces output identical to the well-used SeqDumper.
my $fh_SeqDumper = IO::String->new();
Bio::EnsEMBL::Utils::SeqDumper->dump_fasta( $slice, $fh_SeqDumper);
my $fh_Serializer = IO::String->new();
my $serializer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($fh_Serializer);
$serializer->print_Seq($slice);
my $SeqDumper_output = ${$fh_SeqDumper->string_ref()};
my $Serializer_output = ${$fh_Serializer->string_ref()};
$fh_SeqDumper->close;
$fh_Serializer->close;
#print STDERR $Serializer_output."\n";
is ($SeqDumper_output,$Serializer_output,"Outputs should match from both serializers");
# Test custom header capabilities
my $custom_header = sub {
my $slice = shift;
return "It's a FASTA header";
};
$fh_Serializer = IO::String->new();
$serializer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($fh_Serializer, $custom_header);
$serializer->print_Seq($slice);
$Serializer_output = ${$fh_Serializer->string_ref()};
note $Serializer_output;
is ($Serializer_output,">It's a FASTA header\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATGAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAACGCGCGCGCGGGA\n", "Serializer custom header should override correctly.");
$fh_Serializer->close;
{
my $seq = 'A'x120;
my $s = Bio::EnsEMBL::Slice->new(-SEQ_REGION_NAME => 'a', -COORD_SYSTEM => $coord_system, -SEQ => $seq, -SEQ_REGION_LENGTH => 120, -START => 1, -END => 120);
my $header = sub { return 'a'; };
my $io = IO::String->new();
my $ser = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($io, $header, 600, 20); # chunk is smaller than line width
my $expected = <<'FASTA';
>a
AAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAA
FASTA
$ser->print_Seq($s);
is(${$io->string_ref()}, $expected, 'Testing round number serialisation');
}
{
my $seq = 'A'x21;
my $s = Bio::EnsEMBL::Slice->new(-SEQ_REGION_NAME => 'a', -COORD_SYSTEM => $coord_system, -SEQ => $seq, -SEQ_REGION_LENGTH => 21, -START => 1, -END => 21);
my $header = sub { return 'a'; };
my $io = IO::String->new();
my $ser = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($io, $header, 1, 20);
my $expected = <<'FASTA';
>a
AAAAAAAAAAAAAAAAAAAA
A
FASTA
$ser->print_Seq($s);
is(${$io->string_ref()}, $expected, 'Testing odd (as in strange) number line length serialisation');
}
{
my $seq = Bio::Seq->new(-SEQ => 'M', -DISPLAY_ID => 'A');
my $io = IO::String->new();
my $ser = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($io);
my $expected = <<'FASTA';
>A
M
FASTA
$ser->print_Seq($seq);
is(${$io->string_ref()}, $expected, 'Testing single base serialisation');
my $header = sub { return 'A'; };
$io = IO::String->new();
$ser = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($io, $header, 1, '100000');
$ser->print_Seq($seq);
is(${$io->string_ref()}, $expected, 'Test with crazy line length');
$io->close;
}
throws_ok( sub {
my $io = IO::String->new;
my $ser = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($io, 'Hello', 1, '1000000000000');
}, qr/Maximum line width/, 'Test really crazy line length');
throws_ok( sub { Bio::EnsEMBL::Utils::IO::FASTASerializer->new(undef, 'Hello', 1, '+INF'); }, qr/Maximum line width/,'Stupid line length input');
done_testing();
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2019] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
use strict;
use warnings;
use Test::More;
use Test::Warnings;
use IO::String;
use Bio::EnsEMBL::Utils::IO::GFFParser;
{
my $io = IO::String->new(<<'GFF');
##gff-version 3
##sequence-region ctg123 1 1497228
##taken-from http://www.sequenceontology.org/gff3.shtml
ctg123 . gene 1000 9000 . + . ID=gene00001;Name=EDEN
ctg123 . TF_binding_site 1000 1012 . + . ID=tfbs00001;Parent=gene00001
ctg123 . mRNA 1050 9000 . + . ID=mRNA00001;Parent=gene00001;Name=EDEN.1
ctg123 . mRNA 1050 9000 . + . ID=mRNA00002;Parent=gene00001;Name=EDEN.2
ctg123 . mRNA 1300 9000 . + . ID=mRNA00003;Parent=gene00001;Name=EDEN.3
ctg123 . exon 1300 1500 . + . ID=exon00001;Parent=mRNA00003
ctg123 . exon 1050 1500 . + . ID=exon00002;Parent=mRNA00001,mRNA00002
ctg123 . exon 3000 3902 . + . ID=exon00003;Parent=mRNA00001,mRNA00003
ctg123 . exon 5000 5500 . + . ID=exon00004;Parent=mRNA00001,mRNA00002,mRNA00003
ctg123 . exon 7000 9000 . + . ID=exon00005;Parent=mRNA00001,mRNA00002,mRNA00003
ctg123 . CDS 1201 1500 . + 0 ID=cds00001;Parent=mRNA00001;Name=edenprotein.1
ctg123 . CDS 3000 3902 . + 0 ID=cds00001;Parent=mRNA00001;Name=edenprotein.1
ctg123 . CDS 5000 5500 . + 0 ID=cds00001;Parent=mRNA00001;Name=edenprotein.1
ctg123 . CDS 7000 7600 . + 0 ID=cds00001;Parent=mRNA00001;Name=edenprotein.1
GFF
my $gff = Bio::EnsEMBL::Utils::IO::GFFParser->new($io);
my $header = $gff->parse_header();
is_deeply(
$header,
[ '##gff-version 3', '##sequence-region ctg123 1 1497228', '##taken-from http://www.sequenceontology.org/gff3.shtml'],
'Checking headers all parse'
);
my $actual_gene = $gff->parse_next_feature();
my $expected_gene = {
seqid => 'ctg123', start => 1000, end => 9000, strand => 1,
source => '.', type => 'gene', score => '.', phase => '.',
attribute => { ID => 'gene00001', Name => 'EDEN' }
};
is_deeply($actual_gene, $expected_gene, 'Checking gene record parses');
my $actual_tf = $gff->parse_next_feature();
my $expected_tf = {
seqid => 'ctg123', start => 1000, end => 1012, strand => 1,
source => '.', type => 'TF_binding_site', score => '.', phase => '.',
attribute => { ID => 'tfbs00001', Parent => 'gene00001' }
};
is_deeply($actual_tf, $expected_tf, 'Checking TF record parses');
#SKIP TO EXONS
$gff->parse_next_feature(); #mrna
$gff->parse_next_feature(); #mrna
$gff->parse_next_feature(); #mrna
#EXONS
{
my $actual = $gff->parse_next_feature();
my $expected = {
seqid => 'ctg123', start => 1300, end => 1500, strand => 1,
source => '.', type => 'exon', score => '.', phase => '.',
attribute => { ID => 'exon00001', Parent => 'mRNA00003' }
};
is_deeply($actual, $expected, 'Checking Exon 1 record parses');
}
{
my $actual = $gff->parse_next_feature();
my $expected = {
seqid => 'ctg123', start => 1050, end => 1500, strand => 1,
source => '.', type => 'exon', score => '.', phase => '.',
attribute => { ID => 'exon00002', Parent => ['mRNA00001', 'mRNA00002'] }
};
is_deeply($actual, $expected, 'Checking Exon 2 record parses');
}
}
{
my $string = <<GFF;
##gff-version 3
##sequence-region ctg123 1 1497228
##taken-from example parsing file with FASTA directive
ctg123\t.\tgene\t1000\t9000\t.\t+\t.\tID=gene00001;Name=EDEN
###
##FASTA
>ctg123
AACCTTTGGGCCGGGCCTTAAAA
AACC
GFF
my $io = IO::String->new(\$string);
my $gff = Bio::EnsEMBL::Utils::IO::GFFParser->new($io);
my $header = $gff->parse_header();
is_deeply(
$header,
[ '##gff-version 3', '##sequence-region ctg123 1 1497228', '##taken-from example parsing file with FASTA directive'],
'Checking headers all parse'
);
my $actual_gene = $gff->parse_next_feature();
my $expected_gene = {
seqid => 'ctg123', start => 1000, end => 9000, strand => 1,
source => '.', type => 'gene', score => '.', phase => '.',
attribute => { ID => 'gene00001', Name => 'EDEN' }
};
is_deeply($actual_gene, $expected_gene, 'Checking TF record parses');
my $id = $gff->parse_next_feature();
ok(! defined $id, 'No more features');
my $seq = $gff->parse_next_sequence();
is_deeply($seq, {header => '>ctg123', sequence => "AACCTTTGGGCCGGGCCTTAAAAAACC"}, "Checking Sequence parses correctly")
}
done_testing();
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2019] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
## no critic (RequireFilenameMatchesPackage)
package Test::SO::Term;
use strict;
use warnings;
sub new {
my ($class) = @_;
return bless({}, ref($class) || $class);
}
# default lookup and always returns region
sub name {
my ($self) = @_;
return 'region';
}
package Test::SO;
use base qw/Bio::EnsEMBL::DBSQL::OntologyTermAdaptor/;
sub new {
my ($class) = @_;
return bless({}, ref($class) || $class);
}
sub fetch_by_accession {
my ($self) = @_;
return Test::SO::Term->new();
}
package main;
use strict;
use warnings;
use Test::More;
use Test::Warnings;
use Bio::EnsEMBL::Test::MultiTestDB;
use Bio::EnsEMBL::Utils::IO::GFFSerializer;
use Bio::EnsEMBL::Feature;
use Bio::EnsEMBL::Slice;
use IO::String;
use Test::Differences;
my $db = Bio::EnsEMBL::Test::MultiTestDB->new();
my $dba = $db->get_DBAdaptor('core');
my $omulti = Bio::EnsEMBL::Test::MultiTestDB->new('ontology');
my $odb = $omulti->get_DBAdaptor('ontology');
my $id = 'ENSG00000131044';
my $ga = $dba->get_GeneAdaptor();
{
my $gene = $ga->fetch_by_stable_id($id);
delete $gene->{source};
$gene->{description} = undef; #empty value means don't emit the key
my $expected = <<'OUT';
##gff-version 3
##sequence-region 20 30274334 30300924
OUT
#Have to do this outside of the HERETO thanks to tabs
$expected .= join("\t",
qw/20 ensembl region 30274334 30300924 . + ./,
'ID=gene:ENSG00000131044;Name=C20orf125;biotype=protein_coding;gene_id=ENSG00000131044;logic_name=ensembl;projection_parent_gene=ENSG_PARENT_GENE;version=1'
);
$expected .= "\n";
assert_gff3($gene, $expected, 'Gene with no source serialises to GFF3 as expected. Source is ensembl');
}
{
my $cs = $dba->get_CoordSystemAdaptor()->fetch_by_name('chromosome');
my $feature = Bio::EnsEMBL::Feature->new(
-SLICE => Bio::EnsEMBL::Slice->new(
-COORD_SYSTEM => $cs,
-SEQ => ('A'x10),
-SEQ_REGION_NAME => 'wibble',
-START => 1,
-END => 10
),
-START => 1,
-END => 10,
-STRAND => 1,
);
my $expected = <<'OUT';
##gff-version 3
##sequence-region wibble 1 10
OUT
#Have to do this outside of the HERETO thanks to tabs
$expected .= join("\t",
qw/wibble . region 1 10 . + ./,
''
);
$expected .= "\n";
assert_gff3($feature, $expected, 'Default feature should seralise without attributes but leave a trailing \t');
}
{
my $gene = $ga->fetch_by_stable_id($id);
$gene->source('wibble');
my $expected = <<'OUT';
##gff-version 3
##sequence-region 20 30274334 30300924
OUT
#Have to do this outside of the HERETO thanks to tabs
$expected .= join("\t",
qw/20 wibble region 30274334 30300924 . + ./,
'ID=gene:ENSG00000131044;Name=C20orf125;biotype=protein_coding;description=DJ310O13.1.2 (NOVEL PROTEIN SIMILAR DROSOPHILA PROTEIN CG7474%2C ISOFORM 2 ) (FRAGMENT). [Source:SPTREMBL%3BAcc:Q9BR18];gene_id=ENSG00000131044;logic_name=ensembl;projection_parent_gene=ENSG_PARENT_GENE;version=1'
);
$expected .= "\n";
assert_gff3($gene, $expected, 'Gene with custom source serialises to GFF3 as expected. Source is wibble');
$expected = <<'OUT';
##gff-version 3
##sequence-region 20 30274334 30298904
OUT
$expected .= join("\t",
qw/20 ensembl region 30274334 30298904 . + ./,
'ID=transcript:ENST00000310998;Name=C20orf125;Parent=gene:ENSG00000131044;biotype=protein_coding;logic_name=ensembl;projection_parent_transcript=ENSG_PARENT_TRANSCRIPT;transcript_id=ENST00000310998;version=1'
);
$expected .= "\n";
assert_gff3($gene->canonical_transcript(), $expected, 'Transcript with custom source serialises to GFF3 as expected. Source is wibble');
$expected = <<'OUT';
##gff-version 3
##sequence-region 20 30274334 30274425
OUT
$expected .= join("\t",
qw/20 ensembl region 30274334 30274425 . + 0/,
'ID=region:ENSP00000308980;Parent=transcript:ENST00000310998;protein_id=ENSP00000308980'
);
$expected .= "\n";
my $cds = $gene->canonical_transcript->get_all_CDS();
assert_gff3($cds->[0], $expected, 'CDS with custom source serialises to GFF3 as expected. Source is wibble');
my $cds_expected = $expected;
$expected = <<'OUT';
##gff-version 3
##sequence-region 20 30274334 30274425
OUT
$expected .= join("\t",
qw/20 ensembl region 30274334 30274425 . + ./,
'Name=ENSE00001155821;Parent=transcript:ENST00000310998;constitutive=0;ensembl_end_phase=2;ensembl_phase=0;exon_id=ENSE00001155821;rank=1;version=1'
);
$expected .= "\n";
my $exon = $gene->canonical_transcript->get_all_ExonTranscripts();
assert_gff3($exon->[0], $expected, 'Exon with custom source serialises to GFF3 as expected. Source is wibble');
my $new_id = 'ENSG00000126003';
my $utr_gene = $ga->fetch_by_stable_id($new_id);
my $utrs = $utr_gene->canonical_transcript->get_all_five_prime_UTRs();
$expected = <<'OUT';
##gff-version 3
##sequence-region 20 30583501 30583588
OUT
$expected .= join("\t",
qw/20 ensembl region 30583501 30583588 . - ./,
'Parent=transcript:ENST00000246229'
);
$expected .= "\n";
assert_gff3($utrs->[0], $expected, 'UTR feature serialises to GFF3 as expected');
$cds_expected .= join("\t",
qw/20 ensembl region 30274334 30274425 . + ./,
'Name=ENSE00001155821;Parent=transcript:ENST00000310998;constitutive=0;ensembl_end_phase=2;ensembl_phase=0;exon_id=ENSE00001155821;rank=1;version=1'
);
$cds_expected .= "\n";
my @list;
push @list, $cds->[0];
push @list, $exon->[0];
assert_gff3_list(\@list, $cds_expected, "List of features serialises to GFF3 as expected");
}
{
my $gene = $ga->fetch_by_stable_id($id);
my $summary = $gene->summary_as_hash;
$$summary{'Dbxref'} = ['bibble', 'fibble'];
$$summary{'Ontology_term'} = 'GO:0001612';
local undef &{Bio::EnsEMBL::Gene::summary_as_hash};
local *{Bio::EnsEMBL::Gene::summary_as_hash} = sub {return $summary};
my $expected = <<'OUT';
##gff-version 3
##sequence-region 20 30274334 30300924
OUT
#Have to do this outside of the HERETO thanks to tabs
$expected .= join("\t",
qw/20 ensembl region 30274334 30300924 . + ./,
'ID=gene:ENSG00000131044;Name=C20orf125;Dbxref=bibble,fibble;Ontology_term=GO:0001612;biotype=protein_coding;description=DJ310O13.1.2 (NOVEL PROTEIN SIMILAR DROSOPHILA PROTEIN CG7474%2C ISOFORM 2 ) (FRAGMENT). [Source:SPTREMBL%3BAcc:Q9BR18];gene_id=ENSG00000131044;logic_name=ensembl;projection_parent_gene=ENSG_PARENT_GENE;version=1'
);
$expected .= "\n";
assert_gff3($gene, $expected, 'Gene with array- and string-valued attributes as expected.');
}
sub assert_gff3 {
my ($feature, $expected, $msg) = @_;
my $ota = Test::SO->new();
my $fh = IO::String->new();
my $ser = Bio::EnsEMBL::Utils::IO::GFFSerializer->new($ota, $fh);
$ser->print_main_header([$feature->feature_Slice()]);
$ser->print_feature($feature);
eq_or_diff(${$fh->string_ref()}, $expected, $msg);
}
sub assert_gff3_list {
my ($features, $expected, $msg) = @_;
my $ota = Test::SO->new();
my $fh = IO::String->new();
my $ser = Bio::EnsEMBL::Utils::IO::GFFSerializer->new($ota, $fh);
$ser->print_main_header([$features->[0]->feature_Slice()]);
$ser->print_feature_list($features);
eq_or_diff(${$fh->string_ref()}, $expected, $msg);
}
done_testing();
This diff is collapsed.
# Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
# Copyright [2016-2019] EMBL-European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
use strict;
use warnings;
use Test::More;
use Test::Warnings;
use Test::Exception;
use File::Temp;
use Bio::EnsEMBL::Utils::IO qw/:all/;
use IO::String;
my ($tmp_fh, $file) = File::Temp::tempfile();
close($tmp_fh);
unlink $file;
my $contents = <<'EOF';
>X
AAAAGGGTTCCC
TTGGCCAAAAAA
ATTC
EOF
my $expected_array = [qw/>X AAAAGGGTTCCC TTGGCCAAAAAA ATTC/];
{
throws_ok { slurp($file) } qr/No such file/, 'File does not currently exist so die';
work_with_file($file, 'w', sub {
my ($fh) = @_;
print $fh $contents;
return;
});
my $written_contents = slurp($file);
is($contents, $written_contents, 'Contents should be the same');
my $written_contents_ref = slurp($file, 1);
is('SCALAR', ref($written_contents_ref), 'Asked for a ref so expect one back');
is($contents, $$written_contents_ref, 'Contents should be the same');
work_with_file($file, 'r', sub {
my ($fh) = @_;
my $line = <$fh>;
chomp($line);
is($line, '>X', 'First line expected to be FASTA header');
});
my $chomp = 1;
is_deeply(slurp_to_array($file, $chomp), $expected_array, 'Checking slurp to array with chomp');
$chomp = 0;
is_deeply(slurp_to_array($file, $chomp), [ map { $_."\n" } @{$expected_array}], 'Checking slurp to array with chomp');
my $iterator_counter = 0;
iterate_file($file, sub {
my ($line) = @_;
chomp($line);
is($line, $expected_array->[$iterator_counter++], sprintf('Checking line %d is ok', $iterator_counter+1));
return;
});
unlink $file;
dies_ok { slurp($file) } 'File no longer exists so die';
spurt($file, $contents);
my $rewritten_contents = slurp($file);
is($contents, $rewritten_contents, 'Contents should still be the same');
spurt($file, $contents, 'append');