From a5c5d208e51b9f38f307dd6243448722f6d9d3cd Mon Sep 17 00:00:00 2001 From: Eduardo Eyras <eae@sanger.ac.uk> Date: Wed, 18 Jun 2003 13:24:44 +0000 Subject: [PATCH] module with some methods to convert gff into ensembl exons --- modules/Bio/EnsEMBL/Utils/GFF2Exon.pm | 368 ++++++++++++++++++++++++++ 1 file changed, 368 insertions(+) create mode 100644 modules/Bio/EnsEMBL/Utils/GFF2Exon.pm diff --git a/modules/Bio/EnsEMBL/Utils/GFF2Exon.pm b/modules/Bio/EnsEMBL/Utils/GFF2Exon.pm new file mode 100644 index 0000000000..e54e7adfe7 --- /dev/null +++ b/modules/Bio/EnsEMBL/Utils/GFF2Exon.pm @@ -0,0 +1,368 @@ +# You may distribute this module under the same terms as perl itself +# POD documentation - main docs before the code + +=head1 NAME + +GFF2Exon + +=head1 SYNOPSIS + +use Bio::EnsEMBL::Utils::GFF2Exon; +my $gff = GFF2Exon->new(-gff_version => 2); +my $exon = $gff->exon_from_gff( $gff_string ); + +=cut + +# Let the code begin... + +package Bio::EnsEMBL::Utils::GFF2Exon; + +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::Root; +@ISA = qw(Bio::EnsEMBL::Root); + + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my ($gff_version) = $self->_rearrange([qw(GFF_VERSION)],@args); + + if(($gff_version != 1) && ($gff_version != 2)) { + $self->throw("unknown version: $gff_version"); + } + $self->gff_version($gff_version); + return $self; +} + +=head2 exon_from_gff + + Usage : my $exon = $gff->exon_from_gff_string($gff_string); + Function: Sets properties of an exon object from a GFF-formatted + string. Interpretation of the string depends on the version + that has been specified at initialization. + Args : The GFF-formatted string + +=cut + +sub exon_from_gff { + my ($self, $gff_string, $trans_hash ) = @_; + + if($self->gff_version() == 1) { + return $self->_from_gff1_string($gff_string, $trans_hash); + } + else{ + return $self->_from_gff2_string($gff_string, $trans_hash); + } +} + +=head2 _from_gff1_string + + Returns : Exon + Args : The GFF-formatted string to initialize it from + +=cut + +sub _from_gff1_string { + my ($self, $gffstring, $trans_hash) = @_; + + #print STDERR "gff_string: $gffstring\n"; + + #39896 human_cdna exon 1030942 1031591 100 + 0 6604 + #39897 human_cdna exon 1034227 1034285 100 + 0 6604 + #39898 human_cdna exon 1035280 1035339 100 + 0 6604 + #39899 human_cdna exon 1035741 1035820 100 + 0 6604 + #39900 human_cdna exon 1036477 1036629 100 + 0 6604 + #243299 human_cdna exon 1037161 1037229 100 + 0 40150 + #243301 human_cdna exon 1043180 1043315 100 + 0 40150 + #243303 human_cdna exon 1048194 1048340 100 + 0 40150 + #243305 human_cdna exon 1053724 1053855 100 + 0 40150 + #243307 human_cdna exon 1054346 1054451 100 + 0 40150 + #243309 human_cdna exon 1055667 1055833 100 + 0 40150 + #243310 human_cdna exon 1057048 1057221 100 + 0 40150 + #243311 human_cdna exon 1058529 1058630 100 + 0 40150 + #243313 human_cdna exon 1061487 1062080 100 + 0 40150 + my ($seqname, + $source, + $primary, + $start, + $end, + $score, + $strand, + $frame, + @group) + = split(/\s+/, $gffstring); + + if ( !defined $frame ) { + $self->throw("[$gffstring] does not look like GFF to me"); + } + $frame = 0 unless( $frame =~ /^\d+$/); + + my $exon = Bio::EnsEMBL::Exon->new(); + $exon->seqname($seqname); + #$exon->source_tag($source); + $exon->start($start); + $exon->end($end); + my $phase = ( 3 - $frame )%3; + $exon->phase($phase); + $exon->end_phase( ( $exon->phase + $exon->length)%3 ); + if ( $score ){ + $exon->score( $score ); + } + if ( $strand eq '-' ) { $exon->strand(-1); } + if ( $strand eq '+' ) { $exon->strand(1); } + if ( $strand eq '.' ) { $exon->strand(0); } + + ############################################################ + # warning: it parses only the first element of the group + + $trans_hash->{ $exon } = $group[0]; + return $exon; +} + +=head2 _from_gff2_string + + Returns : Exon + Args : The GFF2-formatted string to initialize it from + +=cut + +sub _from_gff2_string { + my ($self, $gffstring) = @_; + chomp($gffstring); + + # according to the Sanger website, GFF2 should be single-tab separated elements, and the + # free-text at the end should contain text-translated tab symbols but no "real" tabs, + # so splitting on \t is safe, and $attribs gets the + # entire attributes field to be parsed later + + my ($seqname, $source, $primary, $start, $end, $score, $strand, $frame, @attribs) + = split(/\t+/, $gffstring); + + # just in case the rule against tab characters has been broken + my $attribs = join '', @attribs; + + if ( !defined $frame ) { + $self->throw("[$gffstring] does not look like GFF2 to me"); + } + my $exon = Bio::EnsEMBL::Exon->new(); + $exon->seqname($seqname); + $exon->source_tag($source); + $exon->analysis->gff_feature($primary); + $exon->start($start); + $exon->end($end); + $exon->phase($frame); + $exon->end_phase( ( $exon->phase + $exon->length)%3 ); + if ( $score eq '.' ) { + #$exon->score(undef); + } else { + $exon->score($score); + } + if ( $strand eq '-' ) { $exon->strand(-1); } + if ( $strand eq '+' ) { $exon->strand(1); } + if ( $strand eq '.' ) { $exon->strand(0); } + + # <Begin Inefficient Code from Mark Wilkinson> + # this routine is necessay to allow the presence of semicolons in quoted text + # Semicolons are the delimiting character for new tag/value attributes. + # it is more or less a "state" machine, with the "quoted" flag going up and down + # as we pass thorugh quotes to distinguish free-text semicolon and hash + # symbols from GFF control characters + + # remove comments field (format: #blah blah blah... at the end of the GFF line) + #$attribs =~ s/\#(.*)$//; + #my @att = split //, $attribs; # split into individual characters + #my $num = $#att; # count them + #my $flag = 0; + #my @parsed; # this is needed to hold the characters that have been parsed + + # run through each character one at a time and check it + #for (my $a = 0; $a <= $num ; $a +=1){ + # if ($att[$a] eq "\""){$flag=($flag==0)?1:0} # flag up on entering quoted text, down on leaving it + # if (($att[$a] eq ";") && $flag){$att[$a] = "INSERT_SEMICOLON_HERE"} # replace semicolon with an unusual message if the quoted text flag is up + # if (($att[$a] eq "#") && !$flag){last} # an unquoted hash symbol means the beginning of the comments field - discard + # push @parsed, $att[$a] # take the parsed character and push it onto the parsed list + # } + + #$attribs = join "", @parsed; # rejoin into a single string + + # <End Inefficient Code> Please feel free to fix this and make it more "perlish" + + # my @key_vals = split /;/, $attribs; # attributes are semicolon-delimited + + # foreach my $pair ( @key_vals ) { + # $pair =~ s/INSERT_SEMICOLON_HERE/;/g; # replace semicolons that were removed from free-text above. + # my ($blank, $key, $values) = split /^\s*([\w\d]+)\s/, $pair; # separate the key from the value + + # my @values; + + # while ($values =~ s/"(.*?)"//){ # free text is quoted, so match each free-text block + # if ($1){push @values, $1}; # and push it on to the list of values (tags may have more than one value...) + #} + + # my @othervals = split /\s+/, $values; # and what is left over should be space-separated non-free-text values + # foreach my $othervalue(@othervals){ +# if (CORE::length($othervalue) > 0){push @values, $othervalue} # get rid of any empty strings which might result from the split +# } + +# foreach my $value(@values){ +# $exon->add_tag_value($key, $value); +# } + # } + + return $exon; +} + +=head2 gff_string + +=cut + +sub gff_string{ + my ($self, $exon,$transcript) = @_; + + if($self->gff_version() == 1) { + return $self->_gff1_string($exon,$transcript); + } else { + return $self->_gff2_string($exon,$transcript); + } +} + +=head2 _gff1_string + +=cut + +sub _gff1_string{ + my ($self, $exon, $transcript) = @_; + my ($str,$source,$primary_tag,$score,$frame,$name,$strand); + + if( $exon->can('score') ) { + $score = $exon->score(); + } + $score = '.' unless defined $score; + + if( $exon->can('frame') ) { + $frame = $exon->frame(); + } + $frame = '.' unless defined $frame; + + $strand = $exon->strand(); + if(! $strand) { + $strand = "."; + } elsif( $strand == 1 ) { + $strand = '+'; + } elsif ( $exon->strand == -1 ) { + $strand = '-'; + } + + $name = $exon->seqname(); + $source = "merged"; + $primary_tag = "exon"; + + $str = join("\t", + $name, + $source, + $primary_tag, + $exon->start(), + $exon->end(), + $score, + $strand, + $frame); + + my $tag_str = $transcript->type; + $str .= "\t$tag_str"; + return $str; +} + +=head2 _gff2_string + +=cut + +sub _gff2_string{ + my ($gff, $exon) = @_; + my ($str,$score,$frame,$name,$strand); + + if( $exon->can('score') ) { + $score = $exon->score(); + } + $score = '.' unless defined $score; + + if( $exon->can('frame') ) { + $frame = $exon->frame(); + } + $frame = '.' unless defined $frame; + + $strand = $exon->strand(); + if(! $strand) { + $strand = "."; + } elsif( $strand == 1 ) { + $strand = '+'; + } elsif ( $exon->strand == -1 ) { + $strand = '-'; + } + + if( $exon->can('seqname') ) { + $name = $exon->seqname(); + $name ||= 'SEQ'; + } else { + $name = 'SEQ'; + } + + + $str = join("\t", + $name, + $exon->source_tag(), + $exon->analysis->gff_feature(), + $exon->start(), + $exon->end(), + $score, + $strand, + $frame); + + # the routine below is the only modification I made to the original + # ->gff_string routine (above) as on November 17th, 2000, the + # Sanger webpage describing GFF2 format reads: "From version 2 + # onwards, the attribute field must have a tag value structure + # following the syntax used within objects in a .ace file, + # flattened onto one line by semicolon separators. Tags must be + # standard identifiers ([A-Za-z][A-Za-z0-9_]*). Free text values + # must be quoted with double quotes". + + # MW + + #my $valuestr; + #if ($exon->all_tags){ # only play this game if it is worth playing... + # $str .= "\t"; # my interpretation of the GFF2 specification suggests the need for this additional TAB character...?? + # foreach my $tag ( $exon->all_tags ) { + # my $valuestr; # a string which will hold one or more values for this tag, with quoted free text and space-separated individual values. + # foreach my $value ( $exon->each_tag_value($tag) ) { + # if ($value =~ /[^A-Za-z0-9_]/){ + # $value =~ s/\t/\\t/g; # substitute tab and newline characters + # $value =~ s/\n/\\n/g; # to their UNIX equivalents + # $value = '"' . $value . '" '} # if the value contains anything other than valid tag/value characters, then quote it + # $valuestr .= $value . " "; # with a trailing space in case there are multiple values + # # for this tag (allowed in GFF2 and .ace format) + # } + # $str .= "$tag $valuestr ; "; # semicolon delimited with no '=' sign + # } + # chop $str; chop $str # remove the trailing semicolon and space + # } + return $str; +} + +=head2 gff_version + +=cut + +sub gff_version { + my ($self, $value) = @_; + if(defined $value && (($value == 1) || ($value == 2))) { + $self->{'GFF_VERSION'} = $value; + } + return $self->{'GFF_VERSION'}; +} + + +1; + -- GitLab