From 96f7b6c6d9259772da37286f31d90e8fa6a5cd7d Mon Sep 17 00:00:00 2001 From: Kieron Taylor <ktaylor@ebi.ac.uk> Date: Fri, 28 Oct 2011 15:28:32 +0000 Subject: [PATCH] Introduced Utils::IO namespace along with serializer code for GFF3 and human-readable formats. Pre-release, not intended for public use. Also added Utils::BiotypeMapper that performs ontology mappings. --- modules/Bio/EnsEMBL/Utils/BiotypeMapper.pm | 221 ++++++++++++++++++ modules/Bio/EnsEMBL/Utils/IO/GFFSerializer.pm | 214 +++++++++++++++++ .../Bio/EnsEMBL/Utils/IO/ReportSerializer.pm | 209 +++++++++++++++++ modules/Bio/EnsEMBL/Utils/IO/Serializer.pm | 160 +++++++++++++ 4 files changed, 804 insertions(+) create mode 100644 modules/Bio/EnsEMBL/Utils/BiotypeMapper.pm create mode 100644 modules/Bio/EnsEMBL/Utils/IO/GFFSerializer.pm create mode 100644 modules/Bio/EnsEMBL/Utils/IO/ReportSerializer.pm create mode 100644 modules/Bio/EnsEMBL/Utils/IO/Serializer.pm diff --git a/modules/Bio/EnsEMBL/Utils/BiotypeMapper.pm b/modules/Bio/EnsEMBL/Utils/BiotypeMapper.pm new file mode 100644 index 0000000000..d5148f0053 --- /dev/null +++ b/modules/Bio/EnsEMBL/Utils/BiotypeMapper.pm @@ -0,0 +1,221 @@ +=pod + +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. 
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 NAME
+
+BiotypeMapper - Translates EnsEMBL biotypes into Sequence Ontology terms and back
+
+=head1 AUTHOR
+
+Kieron Taylor, 2011 - ktaylor@ebi.ac.uk
+
+=head1 SYNOPSIS
+
+use Bio::EnsEMBL::Utils::BiotypeMapper;
+
+my $ontology_adaptor = $registry->get_adaptor( 'Multi', 'Ontology', 'OntologyTerm' );
+my $biotype_mapper = Bio::EnsEMBL::Utils::BiotypeMapper->new($ontology_adaptor);
+
+print $biotype_mapper->translate_feature_to_SO_term($feature);
+
+=head1 DESCRIPTION
+
+BiotypeMapper provides a series of nearest matches between EnsEMBL biotypes and
+the Sequence Ontology (http://www.sequenceontology.org)
+
+Mappings are imperfect due to the inexact correspondence of biotypes to
+several SO terms. A best guess has been chosen in each case.
+
+Reverse mappings from SO to biotype are vague, due to many-to-one relationships.
+In this case a list of possible terms is given.
+
+=cut
+
+package Bio::EnsEMBL::Utils::BiotypeMapper;
+
+use strict;
+use warnings;
+use Carp;
+
+# Gene biotype => SO accession. The trailing comment names the SO term.
+my %gene_so_mapping = (
+    'protein_coding'        => 'SO:0001217', # protein_coding_gene
+    'pseudogene'            => 'SO:0000336', # pseudogene
+    'processed_transcript'  => 'SO:0001503', # processed_transcript
+    'lincRNA'               => 'SO:0001641', # lincRNA_gene
+    'polymorphic_pseudogene'=> 'SO:0000336', # pseudogene
+    'Mt_tRNA'               => 'SO:0000088', # mt_gene
+    'IG_D_gene'             => 'SO:0000510', # D_gene
+    'snoRNA'                => 'SO:0001267', #snoRNA_gene
+    'misc_RNA'              => 'SO:0000356', #RNA
+    'miRNA'                 => 'SO:0001265', #miRNA_gene
+    'rRNA'                  => 'SO:0001637', #rRNA_gene
+    'snRNA'                 => 'SO:0001268', #snRNA_gene
+    'snRNA_pseudogene'      => 'SO:0000336', # pseudogene
+    'tRNA_pseudogene'       => 'SO:0000778', # pseudogenic_tRNA
+    'rRNA_pseudogene'       => 'SO:0000777', # pseudogenic_rRNA
+    'TR_J_gene'             => 'SO:0000470', # J_gene
+    'TR_V_gene'             => 'SO:0000466', # V_gene
+    'TR_C_gene'             => 'SO:0000478', # C_gene
+    'ncRNA'                 => 'SO:0001263', # ncRNA_gene
+    'tRNA'                  => 'SO:0001272', # tRNA_gene
+    'retrotransposed'       => 'SO:0000569', # retrotransposed
+## heavily abbreviated
+);
+
+# Transcript biotype => SO accession.
+# (A second, identical 'ncrna_host' entry was removed; hash keys are unique.)
+my %transcript_so_mapping = (
+    'processed_transcript'              => 'SO:0001503', # processed_transcript
+    'nonsense_mediated_decay'           => 'SO:0001621', # NMD_transcript_variant
+    'retained_intron'                   => 'SO:0000681', # aberrant_processed_transcript
+    'transcribed_unprocessed_pseudogene'=> 'SO:0000516', # pseudogenic_transcript
+    'processed_pseudogene'              => 'SO:0000043', # processed_pseudogene
+    'unprocessed_pseudogene'            => 'SO:0000336', # pseudogene
+    'unitary_pseudogene'                => 'SO:0000336',
+    'pseudogene'                        => 'SO:0000336', # pseudogene
+    'transcribed_processed_pseudogene'  => 'SO:0000043',
+    'retrotransposed'                   => 'SO:0000569', #retrotransposed
+    'ncrna_host'                        => 'SO:0000483',
+    'polymorphic_pseudogene'            => 'SO:0000336',
+    'lincRNA'                           => 'SO:0001463',
+    '3prime_overlapping_ncrna'          => 'SO:0000483',
+    'TR_V_gene'                         => 'SO:0000466',
+    'TR_V_pseudogene'                   => 'SO:0000336',
+
+    'TR_J_gene'                         => 'SO:0000470',
+    'IG_C_gene'                         => 'SO:0000478',
+    'IG_C_pseudogene'                   => 'SO:0000336',
+    'TR_C_gene'                         => 'SO:0000478',
+    'IG_J_pseudogene'                   => 'SO:0000336',
+    'miRNA'                             => 'SO:0000276', #miRNA
+    'miRNA_pseudogene'                  => 'SO:0000336',
+    'disrupted_domain'                  => 'SO:0000681', # aberrant_processed_transcript
+    'rRNA'                              => 'SO:0000252', #rRNA
+    'rRNA_pseudogene'                   => 'SO:0000777',
+    'scRNA_pseudogene'                  => 'SO:0000336',
+    'snoRNA'                            => 'SO:0000275', # snoRNA
+    'snoRNA_pseudogene'                 => 'SO:0000336',
+    'snRNA'                             => 'SO:0000274', # snRNA
+    'snRNA_pseudogene'                  => 'SO:0000336',
+
+);
+
+# Feature class => SO accession, used when no biotype-specific mapping applies.
+my %feature_so_mapping = (
+    'Bio::EnsEMBL::Gene' => 'SO:0000704', # gene
+    'Bio::EnsEMBL::Transcript' => 'SO:0000673', # transcript
+    'Bio::EnsEMBL::Slice' => 'SO:0000001', # region
+    'Bio::EnsEMBL::Variation::VariationFeature' => 'SO:0001060', # sequence variant
+    'Bio::EnsEMBL::Variation::StructuralVariationFeature' => 'SO:0001537', # structural variant
+    'Bio::EnsEMBL::Compara::ConstrainedElement' => 'SO:0001009', #DNA_constraint_sequence ????
+    'Bio::EnsEMBL::Funcgen::RegulatoryFeature' => 'SO:0001679', # transcription_regulatory_region
+);
+
+=head2 new
+
+  Constructor
+  Arg [1]    : OntologyAdaptor from the EnsEMBL registry
+  Returntype : Bio::EnsEMBL::Utils::BiotypeMapper
+
+=cut
+
+sub new {
+    my $class = shift;
+    my $self = {
+        ontology_adaptor => shift,
+    };
+
+    bless $self, $class;
+    return $self;
+}
+
+=head2 translate_feature_to_SO_term
+
+  Arg [1]    : Bio::EnsEMBL::Feature, subclass or related Storable
+  Description: Translates a Feature type into an SO term. If the Feature is a
+               Gene or Transcript, then a further refinement of the type is made
+               via Biotype
+  Returntype : String. "????????" is returned (with a warning) when no
+               mapping exists or the ontology adaptor returns no term.
+
+=cut
+
+sub translate_feature_to_SO_term {
+    my $self = shift;
+    my $feature = shift;
+    my $feature_class = ref($feature);
+    my $so_accession;
+
+    if ($feature_class eq "Bio::EnsEMBL::Gene" and exists $gene_so_mapping{$feature->biotype}) {
+        $so_accession = $gene_so_mapping{$feature->biotype};
+    }
+    # The class is Bio::EnsEMBL::Transcript, as keyed in %feature_so_mapping.
+    # The previous test against "Bio::EnsEMBL::Transcription" could never match.
+    elsif ($feature_class eq "Bio::EnsEMBL::Transcript" and exists $transcript_so_mapping{$feature->biotype}) {
+        $so_accession = $transcript_so_mapping{$feature->biotype};
+    }
+    else {
+        # No biotype refinement available: fall back to the class-level term
+        $so_accession = $feature_so_mapping{$feature_class};
+    }
+
+    if (!defined($so_accession)) {
+        carp "Ontology mapping not found for ".$feature_class."\n";
+        return "????????";
+    }
+
+    my $so_term = $self->{'ontology_adaptor'}->fetch_by_accession($so_accession);
+    if (!defined($so_term)) {
+        # Do not call ->name on undef when the accession is not in the database
+        carp "Ontology term not found for accession ".$so_accession."\n";
+        return "????????";
+    }
+
+    return $so_term->name;
+}
+
+
+=head2 translate_SO_to_biotype
+
+  Arg [1]    : Sequence Ontology term, either a term name or an accession
+               beginning with "SO:"
+  Description: Returns the closest corresponding Ensembl biotypes to a given SO term.
+               Feature class names are included when the accession matches a
+               class-level mapping.
+  Returntype : String containing a comma-separated list of biotypes
+
+=cut
+
+sub translate_SO_to_biotype {
+    my $self = shift;
+    my $translate_me = shift;
+
+    my @so_accessions;
+    if ($translate_me !~ /^SO:/) {
+        # Not an accession: look up the text in the ontology database
+        my $so_terms = $self->{'ontology_adaptor'}->fetch_all_by_name($translate_me);
+        foreach my $term (@{$so_terms}) {
+            push @so_accessions,$term->accession();
+        }
+    }
+    else {
+        push @so_accessions,$translate_me;
+    }
+
+    # Convert the list of accessions into biotypes, searching the gene,
+    # transcript and feature tables in that order. Keys are sorted so the
+    # result is reproducible between runs.
+    my @biotypes;
+    foreach my $accession (@so_accessions) {
+        foreach my $mapping (\%gene_so_mapping, \%transcript_so_mapping, \%feature_so_mapping) {
+            push @biotypes, grep { $mapping->{$_} eq $accession } sort keys %{$mapping};
+        }
+    }
+
+    return join (',',@biotypes);
+}
+
+1;
diff --git a/modules/Bio/EnsEMBL/Utils/IO/GFFSerializer.pm b/modules/Bio/EnsEMBL/Utils/IO/GFFSerializer.pm
new file mode 100644
index 0000000000..345564f64b
--- /dev/null
+++ b/modules/Bio/EnsEMBL/Utils/IO/GFFSerializer.pm
@@ -0,0 +1,214 @@
+=pod
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2011 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 NAME
+
+GFFSerializer - Feature to GFF converter
+
+=head1 AUTHOR
+
+Kieron Taylor, 2011 - ktaylor@ebi.ac.uk
+
+=head1 SYNOPSIS
+
+use Bio::EnsEMBL::Utils::IO::GFFSerializer;
+use Bio::EnsEMBL::Utils::BiotypeMapper;
+
+my $ontology_adaptor = $registry->get_adaptor( 'Multi', 'Ontology', 'OntologyTerm' );
+my $biotype_mapper = Bio::EnsEMBL::Utils::BiotypeMapper->new($ontology_adaptor);
+my $serializer = Bio::EnsEMBL::Utils::IO::GFFSerializer->new($biotype_mapper,$output_fh);
+
+my $variation_feature_adaptor = $registry->get_adaptor( $config{'species'}, 'variation', 'variationfeature' );
+$serializer->print_metadata("Variation Features:");
+my $iterator = $variation_feature_adaptor->fetch_Iterator_by_Slice($slice,undef,60000);
+$serializer->print_feature_Iterator($iterator);
+
+=head1 DESCRIPTION
+
+Subclass of Serializer that can turn a feature into a line for the GFF3 format.
+Requires a BiotypeMapper in order to translate biotypes to SO terms.
+
+=cut
+
+package Bio::EnsEMBL::Utils::IO::GFFSerializer;
+use strict;
+use warnings;
+use Bio::EnsEMBL::Utils::Exception;
+use Bio::EnsEMBL::Utils::BiotypeMapper;
+use URI::Escape;
+use Bio::EnsEMBL::Utils::IO::Serializer;
+
+use base qw(Bio::EnsEMBL::Utils::IO::Serializer);
+
+# Feature strand value => GFF column 7 symbol
+my %strand_conversion = ( '1' => '+', '0' => '?', '-1' => '-');
+
+=head2 new
+
+  Constructor
+  Arg [1]    : BiotypeMapper
+  Arg [2]    : Optional File handle. A duplicate of STDOUT is used when omitted.
+  Exceptions : Throws if Arg [1] is not a Bio::EnsEMBL::Utils::BiotypeMapper,
+               or if STDOUT cannot be duplicated.
+  Returntype : Bio::EnsEMBL::Utils::IO::GFFSerializer
+
+=cut
+
+sub new {
+    my $class = shift;
+    my $self = {
+        mapper => shift,
+        filehandle => shift,
+    };
+    bless $self, $class;
+    # Read the hash slot directly: no 'mapper' accessor method is defined in
+    # this class or in the Serializer base class, so $self->mapper would die.
+    if ( ref($self->{'mapper'}) ne "Bio::EnsEMBL::Utils::BiotypeMapper" ) {
+        throw("GFF format requires an instance of Bio::EnsEMBL::Utils::BiotypeMapper to function");
+    }
+
+    if (!defined ($self->{'filehandle'})) {
+        # no file handle, let the handle point to a copy of STDOUT instead
+        open($self->{'filehandle'}, '>&', \*STDOUT)
+            or throw("Unable to duplicate STDOUT for GFF output: $!");
+        $self->{'stdout'} = 1;
+    }
+    return $self;
+}
+
+=head2 print_feature
+
+  Arg [1]    : Bio::EnsEMBL::Feature, subclass or related pseudo-feature
+  Example    : $serializer->print_feature($feature)
+  Description: Asks a feature for its summary, and prints a GFF3 entry for it
+               to the serializer's filehandle.
+               Additional attributes are handed through to column 9 of the output using exact spelling
+               and capitalisation of the feature-supplied hash.
+               A feature that cannot summarise itself produces a warning and no output.
+  Returntype : none
+
+=cut
+
+sub print_feature {
+    my $self = shift;
+    my $feature = shift;
+    my $biotype_mapper = $self->{'mapper'};
+
+    my $text_buffer = "";
+    if ($feature->can('summary_as_hash') ) {
+        my %summary = %{$feature->summary_as_hash};
+        my $row = "";
+
+        # Column 1 - seqname, the name of the sequence/chromosome the feature is on.
+        # Landmark for start below
+        if (!defined($summary{'seq_region_name'})) {$summary{'seq_region_name'} = "?";}
+        $row .= $summary{'seq_region_name'}."\t";
+
+        # Column 2 - source, complicated with Ensembl not being the originator of all data
+        $row .= "EnsEMBL\t";
+
+        # Column 3 - feature, the ontology term for the kind of feature this row is
+        my $so_term = $biotype_mapper->translate_feature_to_SO_term($feature);
+        $row .= $so_term."\t";
+
+        # Column 4 - start, the start coordinate of the feature, here shifted to chromosomal coordinates
+        # Start and end must be in ascending order for GFF. Circular genomes require the length of
+        # the circuit to be added on.
+        if ($summary{'start'} > $summary{'end'}) {
+            #assumes this is not a Compara circular sequence and can treat is as a Feature
+            if ($feature->slice() && $feature->slice()->is_circular() ) {
+                $summary{'end'} = $summary{'end'} + $feature->seq_region_length;
+            }
+            # non-circular, but end still before start
+            else {$summary{'end'} = $summary{'start'};}
+        }
+        $row .= $summary{'start'} . "\t";
+
+        # Column 5 - end, coordinates (absolute) for the end of this feature
+        $row .= $summary{'end'} . "\t";
+
+        # Column 6 - score, for variations only.
+        if (exists($summary{'score'})) {
+            $row .= $summary{'score'}."\t";
+        }
+        else {
+            $row .= ".\t";
+        }
+
+        # Column 7 - strand, up or down. A missing, undefined or unrecognised
+        # strand value is written as "." rather than an empty column.
+        my $strand = $summary{'strand'};
+        if (defined($strand) && exists($strand_conversion{$strand})) {
+            $row .= $strand_conversion{$strand}."\t";
+        }
+        else {
+            $row .= ".\t";
+        }
+
+        # Column 8 - reading frame, necessary only for Exons
+        $row .= ".\t";
+
+        # Column 9 - the 'other' section for all GFF and GVF compliant attributes
+        # We include Stable ID and biotype where possible to supplement the information in the other columns
+        delete $summary{'seq_region_start'};
+        delete $summary{'seq_region_name'};
+        delete $summary{'start'};
+        delete $summary{'end'};
+        delete $summary{'strand'};
+        delete $summary{'score'};
+
+        # Slice the hash for specific keys in GFF-friendly order
+        my @ordered_keys = qw(ID Name Alias Parent Target Gap Derives_from Note Dbxref Ontology_term Is_circular);
+        my @ordered_values = @summary{@ordered_keys};
+        while (my $key = shift @ordered_keys) {
+            my $value = shift @ordered_values;
+            # NOTE: false values (undef, "", 0) are skipped here
+            if ($value) {
+                $row .= $key."=".uri_escape($value,'\t\n\r;=%&,').";";
+            }
+            delete $summary{$key};
+        }
+
+        # Catch the remaining keys, containing whatever else the Feature provided.
+        # Keys are sorted so that attribute order is reproducible between runs.
+        foreach my $attribute ( sort keys(%summary)) {
+            if (ref $summary{$attribute} eq "ARRAY") {
+                # NOTE: list members are joined as-is, without URI escaping
+                $row .= $attribute."=".join (',',@{$summary{$attribute}}) . ";"
+            }
+            else {
+                if ($summary{$attribute}) { $row .= $attribute."=".uri_escape($summary{$attribute},'\t\n\r;=%&,') . ";"; }
+            }
+        }
+
+        # NOTE: the trailing ";" separator left by the stages above is not
+        # trimmed; the row is emitted exactly as built.
+        $text_buffer .= $row."\n";
+    }
+    else {
+        warning("Feature failed to self-summarise");
+    }
+    #filehandle is inherited
+    my $fh = $self->{'filehandle'};
+    print $fh $text_buffer;
+}
+
+=head2 print_main_header
+
+  Arg [1]    : Arrayref of slices going into the file.
+  Description: Printing the header text or metadata required for GFF,
+               using a list of slices to be written
+  Returntype : None
+
+=cut
+
+sub print_main_header {
+    my ($self, $arrayref_of_slices) = @_;
+    my $out = $self->{'filehandle'};
+
+    print $out "##gff-version 3\n";
+    # One ##sequence-region pragma per slice; stop at the first undefined entry
+    for my $region (@{$arrayref_of_slices}) {
+        unless (defined($region)) {
+            warning("Slice not defined.\n");
+            return;
+        }
+        print $out "##sequence-region ",$region->seq_region_name," ",$region->start," ",$region->end,"\n";
+    }
+}
+
+# Writes a free-text comment line ("# <text>") preceded by a blank line.
+sub print_metadata {
+    my ($self, $text) = @_;
+    my $out = $self->{'filehandle'};
+    print $out "\n# ".$text."\n";
+}
+
+
+1;
diff --git a/modules/Bio/EnsEMBL/Utils/IO/ReportSerializer.pm b/modules/Bio/EnsEMBL/Utils/IO/ReportSerializer.pm
new file mode 100644
index 0000000000..ae400328c9
--- /dev/null
+++ b/modules/Bio/EnsEMBL/Utils/IO/ReportSerializer.pm
@@ -0,0 +1,209 @@
+=pod
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2011 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 NAME
+
+Report Serializer - generating textual summary reports
+
+=head1 AUTHOR
+
+Kieron Taylor, 2011 - ktaylor@ebi.ac.uk
+
+=head1 SYNOPSIS
+
+  use Bio::EnsEMBL::Registry;
+  use Bio::EnsEMBL::Utils::IO::ReportSerializer;
+  use IO::File;
+
+  my $registry = 'Bio::EnsEMBL::Registry';
+  my $output_fh = IO::File->new($config{'output'},'w') or die;
+  my $serializer = Bio::EnsEMBL::Utils::IO::ReportSerializer->new($output_fh);
+  my $slice_adaptor = $registry->get_adaptor( 'Human', 'Core', 'Slice' );
+  my $slice = $slice_adaptor->fetch_by_toplevel_location("6:1000000..1500000");
+
+  $serializer->print_section_header($slice);
+  $serializer->print_feature_list($slice->get_all_Genes);
+
+=head1 DESCRIPTION
+
+Subclass of Serializer that can turn a feature into a text block.
+Unsuited to very large slices, because it requires a select-all approach for features.
+
+=cut
+
+package Bio::EnsEMBL::Utils::IO::ReportSerializer;
+use strict;
+use warnings;
+use Bio::EnsEMBL::Utils::Exception;
+use URI::Escape;
+use Bio::EnsEMBL::Utils::IO::Serializer;
+
+use base qw(Bio::EnsEMBL::Utils::IO::Serializer);
+
+# Strand value => display symbol.
+# NOTE(review): no sub visible in this module reads this hash - confirm whether it is still needed.
+my %strand_conversion = ( '1' => '+', '0' => '?', '-1' => '-');
+
+# Feature class => human-readable noun used in the "There are N ..." summary line.
+my %feature_conversion = ( 'Bio::EnsEMBL::Gene' => 'Gene',
+                           'Bio::EnsEMBL::Transcript' => 'Transcript',
+                           'Bio::EnsEMBL::Translation' => 'Translation',
+                           'Bio::EnsEMBL::Variation::StructuralVariationFeature' => 'Structural Variation',
+                           'Bio::EnsEMBL::Variation::VariationFeature' => 'Variation',
+                           'Bio::EnsEMBL::Funcgen::RegulatoryFeature' => 'Regulatory Feature',
+                           'Bio::EnsEMBL::Compara::ConstrainedElement' => 'Constrained Element',
+);
+
+# Hash for selecting the correct attributes of unseen features for crude summary. This hash is
+# for fallback behaviour, slicing summary hashes for a limited set of values.
+my %printables = (
+    'Bio::EnsEMBL::Gene' => ['ID','biotype','start','end'],
+    'Bio::EnsEMBL::Transcript' => ['ID','start','end'],
+    'Bio::EnsEMBL::Translation' => ['ID'],
+    'Bio::EnsEMBL::Variation::VariationFeature' => ['ID','start','end','strand','seq_region_name'],
+    'Bio::EnsEMBL::Variation::StructuralVariationFeature' => ['ID','start','end','strand','seq_region_name'],
+    'Bio::EnsEMBL::Funcgen::RegulatoryFeature' => ['ID','start','end','strand'],
+    'Bio::EnsEMBL::Compara::ConstrainedElement' => ['ID','start','end','strand','seq_region_name'],
+);
+
+=head2 print_feature
+
+  Arg [1]    : Bio::EnsEMBL::Feature, subclass or related pseudo-feature
+  Example    : $reporter->print_feature($feature)
+  Description: Prints one comma-separated line of the summary values listed
+               for the feature's class in %printables. A class with no entry
+               there produces a warning and an empty line instead of a crash.
+  Returntype : none
+
+=cut
+
+sub print_feature {
+    my $self = shift;
+    my $feature = shift;
+    my $fh = $self->{'filehandle'};
+    my $feature_type = ref($feature);
+
+    if ($feature->can('summary_as_hash') ) {
+        my %summary = %{$feature->summary_as_hash};
+        if (!exists $printables{$feature_type}) {
+            warning("No report columns defined for feature type ".$feature_type);
+        }
+        # "|| []" avoids dereferencing undef under strict refs for unknown classes
+        my @values = @summary{ @{ $printables{$feature_type} || [] } };
+        print $fh join(',',@values)."\n";
+    }
+    else {
+        warning("Feature failed to self-summarise");
+    }
+}
+
+=head2 print_feature_list
+
+  Arg [1]    : Listref of Bio::EnsEMBL::Feature, subclass or related pseudo-feature
+  Description: Relies on a list of similar features to print in a block together.
+               Overrides superclass method
+               Results are truncated after the first 100 features for brevity.
+  Example    : $reporter->print_feature_list(\@features);
+  Exceptions : Throws if the argument is not a listref (as the superclass does)
+  Returntype : none
+
+=cut
+
+sub print_feature_list {
+    my $self = shift;
+    my $feature_list = shift;
+    my $fh = $self->{'filehandle'};
+
+    if (ref($feature_list) ne 'ARRAY') {
+        throw( "print_feature_list requires a listref as argument" );
+    }
+
+    my $total = scalar(@$feature_list);
+    # The first element decides the formatting for the whole list.
+    # ref() returns '' (never undef) for an empty list or a non-reference.
+    my $feature_type = ref($feature_list->[0]);
+    my $feature_label = exists $feature_conversion{$feature_type}
+                      ? $feature_conversion{$feature_type}
+                      : "Feature";
+    my $feature_count = 0;
+
+    print $fh "There are ",$total," ",$feature_label,($total != 1) ? "s":""," in this region\n";
+    if ($total > 100 ) { print $fh "Too many to display, results truncated to the first 100\n";}
+    print $fh "\n";
+    foreach my $feature (@$feature_list) {
+        # Stop before fetching a summary that will not be shown. Checking ahead
+        # of the increment displays exactly 100 features rather than 99.
+        if ($feature_count >= 100) {last;}
+        $feature_count++;
+        my %attributes = %{$feature->summary_as_hash};
+
+        # Begin the feature-specific formatting code
+        if ($feature_type eq "Bio::EnsEMBL::Gene") {
+            print $fh "\tGene ".$feature_count.": ".$attributes{'external_name'}.",".$attributes{'ID'}."\n";
+            print $fh "\tBiotype: ".$attributes{'biotype'}."\n";
+            print $fh "\tLocation: ".$attributes{'start'}."-".$attributes{'end'}." bp\n\n";
+
+            print $fh "\tTranscripts and proteins\n";
+            foreach my $transcript (@{$feature->get_all_Transcripts}) {
+                my %tr_summary = %{$transcript->summary_as_hash};
+                print $fh "\t\t ".$tr_summary{'ID'};
+                my $translation = $transcript->translation;
+                if (defined $translation) {
+                    my %pr_summary = %{$translation->summary_as_hash};
+                    print $fh " - ".$pr_summary{'ID'}."\n\n";
+                }
+                else {print $fh " - no protein\n\n";}
+            }
+            print $fh "\n";
+        }
+        elsif ($feature_type eq "Bio::EnsEMBL::Funcgen::RegulatoryFeature") {
+            print $fh "\t".$attributes{'ID'}."\n";
+        }
+        elsif ($feature_type eq "Bio::EnsEMBL::Compara::ConstrainedElement") {
+            print $fh "\t".$attributes{'start'}."-".$attributes{'end'}."\n";
+        }
+        elsif ( $feature_type eq "Bio::EnsEMBL::Variation::StructuralVariationFeature"
+             or $feature_type eq "Bio::EnsEMBL::Variation::VariationFeature") {
+            print $fh "\tID: ".$attributes{'ID'}." Position: ".
+                $attributes{'start'}."-".$attributes{'end'}." on strand ".$attributes{'strand'}." \n";
+        }
+        else {
+            # slice favourite values out unformatted. Classes missing from
+            # %printables yield an empty value list instead of a fatal error.
+            my @values = @attributes{ @{ $printables{$feature_type} || [] } };
+            print $fh $feature_type.join(',',@values)."\n";
+
+        }
+    }
+}
+
+# Just print individuals without awareness of list size and position.
+sub print_feature_iterator {
+    my $self = shift;
+    my $feature_iterator = shift;
+    while ($feature_iterator->has_next) {
+        my $feature = $feature_iterator->next;
+        $self->print_feature($feature);
+    }
+}
+
+=head2 print_main_header
+
+  Arg [1]    : Arrayref of slices going into the file.
+  Description: Printing the header text for this report
+               Requires a slice list in order to report how many will be printed
+  Returntype : None
+
+=cut
+
+sub print_main_header {
+    my $self = shift;
+    my $arrayref_of_slices = shift;
+    my $fh = $self->{'filehandle'};
+
+    my $regions = scalar @{$arrayref_of_slices};
+    print $fh "Report for $regions region";
+    # Pluralise for every count except exactly one (so "0 regions"),
+    # matching the rule used by print_feature_list in this module.
+    if ($regions != 1) { print $fh "s";}
+    print $fh "\n\n";
+}
+
+
+=head2 print_section_header
+
+  Arg [1]    : Bio::EnsEMBL::Slice
+  Description: Prints a summary of the slice
+               Intended to be used prior to print_feature_list()
+  Returntype : None
+
+=cut
+
+sub print_section_header {
+    my $self = shift;
+    my $slice = shift;
+    my $fh = $self->{'filehandle'};
+
+    print $fh "    Region: ",$slice->seq_region_name," ",$slice->start,"-",$slice->end," bp\n\n";
+
+}
+
+# A module must end with a true value, as the sibling modules in this patch do.
+1;
diff --git a/modules/Bio/EnsEMBL/Utils/IO/Serializer.pm b/modules/Bio/EnsEMBL/Utils/IO/Serializer.pm
new file mode 100644
index 0000000000..2dff8efdd9
--- /dev/null
+++ b/modules/Bio/EnsEMBL/Utils/IO/Serializer.pm
@@ -0,0 +1,160 @@
+=pod
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2011 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 NAME
+
+Serializer - An abstract serializer for turning EnsEMBL data into other formats
+
+=head1 AUTHOR
+
+Kieron Taylor, 2011 - ktaylor@ebi.ac.uk
+
+=head1 SYNOPSIS
+
+my $serializer = Bio::EnsEMBL::Utils::IO::Serializer->new( $filehandle );
+$serializer->print_feature_list( \@list_of_features );
+
+=head1 DESCRIPTION
+
+Subclass this class to create a format-specific serializer.
+Be sure to implement print_feature at the bare minimum
+
+=cut
+
+package Bio::EnsEMBL::Utils::IO::Serializer;
+use strict;
+use warnings;
+use Bio::EnsEMBL::Utils::Exception;
+use Bio::EnsEMBL::Utils::SeqDumper;
+
+
+=head2 new
+
+  Constructor
+  Arg [1]    : Optional File handle. A duplicate of STDOUT is used when omitted.
+  Exceptions : Throws if STDOUT cannot be duplicated
+  Returntype : Bio::EnsEMBL::Utils::IO::Serializer
+
+=cut
+
+sub new {
+    my $class = shift;
+    my $self = {
+        'filehandle' => shift,
+    };
+    bless $self, $class;
+    if (!defined ($self->{'filehandle'})) {
+        # no file handle, let the handle point to a copy of STDOUT instead.
+        # The result of open is checked so a failure is reported here rather
+        # than surfacing later as a print on an unopened handle.
+        open($self->{'filehandle'}, '>&', \*STDOUT)
+            or throw("Unable to duplicate STDOUT for serializer output: $!");
+        # Remember that we own this handle so DESTROY can close it
+        $self->{'stdout'} = 1;
+    }
+    return $self;
+}
+
+=head2 DESTROY
+
+  Destructor
+  Description: Restores default state of the STDOUT filehandle as it is a copy
+               and may not flush correctly.
+=cut
+
+sub DESTROY {
+    my $self = shift;
+    # Only close handles this object opened itself (the STDOUT duplicate)
+    if ($self->{'stdout'}) {
+        close $self->{'filehandle'};
+    }
+}
+
+# Abstract: format-specific subclasses must override this method.
+sub print_feature {
+    throw( "print_feature method not implemented.");
+}
+
+=head2 print_feature_list
+
+  Arg [1]    : Listref of features
+  Description: Run print_feature on every feature in the list
+  Exceptions : Throws if the argument is not a listref
+
+=cut
+
+sub print_feature_list {
+    my $self = shift;
+    my $feature_list = shift;
+    if (ref($feature_list) eq 'ARRAY') {
+        foreach my $feature (@{$feature_list}) {
+            $self->print_feature($feature);
+        }
+    }
+    else {
+        throw( "print_feature_list requires a listref as argument" );
+    }
+}
+
+=head2 print_feature_Iterator
+
+  Arg [1]    : Bio::EnsEMBL::Utils::Iterator
+  Description: Automatically spools through an iterator for convenience
+  Exceptions : Throws if the argument has no has_next method
+  Returntype : None
+
+=cut
+
+sub print_feature_Iterator {
+    my $self = shift;
+    my $iterator = shift;
+    if ($iterator->can('has_next')) {
+        # NOTE(review): relies on the iterator's each() setting $_ to the
+        # current element for the callback - confirm against the Iterator API.
+        $iterator->each(sub {$self->print_feature($_)});
+    }
+    else {
+        throw("Supplied iterator does not look like Bio::EnsEMBL::Utils::Iterator");
+    }
+}
+
+=head2 print_metadata
+
+  Arg [1]    : String
+  Description: Pipes a custom string into the filehandle that the serializer is using
+
+=cut
+
+sub print_metadata {
+    my $self = shift;
+    my $text = shift;
+    my $fh = $self->{'filehandle'};
+    print $fh "\n".$text."\n";
+}
+
+=head2 print_main_header
+
+  Arg [1]    : Arrayref of slices going into the file.
+  Description: Printing the header text or metadata required for this file format,
+               Re-implement in the serializer.
+  Returntype : None
+
+=cut
+
+sub print_main_header {
+    my $self = shift;
+#    my $arrayref_of_slices = shift;
+#    my $fh = $self->{'filehandle'};
+    warning("No writer for headers in this format. Nothing done" );
+}
+
+=head2 print_sequence
+
+  Arg [1]    : Bio::EnsEMBL::Slice
+  Description: By default, prints a block of FASTA format sequence from the given slice
+               to the serializer's filehandle, preceded by a "##FASTA" line.
+
+=cut
+
+sub print_sequence {
+    my $self = shift;
+    my $slice = shift;
+    my $fh = $self->{'filehandle'};
+    # Write the marker to the serializer's own handle. A bare print went to
+    # the currently selected handle and so missed user-supplied output files.
+    print $fh "##FASTA\n";
+    # NOTE(review): dump() is invoked as a class method with a filehandle as
+    # the third argument - confirm this matches the SeqDumper API.
+    Bio::EnsEMBL::Utils::SeqDumper->dump( $slice, 'FASTA', $fh);
+}
+
+
+1;
-- 
GitLab