From b96de5e2f866f6464cde53e98589a7077fdbe4d2 Mon Sep 17 00:00:00 2001 From: Bronwen Aken <ba1@sanger.ac.uk> Date: Fri, 12 Oct 2007 14:53:08 +0000 Subject: [PATCH] Removed CDS parsing. Made pattern matching more strict when choosing type (gene, transcript, translation) as 'pseudogene' transcripts had previously been classified as genes. --- .../XrefParser/Flybase_dmel_GFFv3_Parser.pm | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm b/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm index 10f0b1185b..be2de20e57 100644 --- a/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm +++ b/misc-scripts/xref_mapping/XrefParser/Flybase_dmel_GFFv3_Parser.pm @@ -6,7 +6,20 @@ # First of all, it read knows what all the gene, transcript and translation types are, found in column 3 of the gff file: # Gene = gene # Transcript = mRNA ncRNA snRNA tRNA rRNA pseudogene snoRNA miRNA -# Translation = CDS protein +# Translation = protein (could include CDS here but haven't ?) +# +# ID=FBgn => flybase_gene_id +# ID=FBtr => flybase_transcript_id +# ID=FBpp => flybase_polypeptide_id +# Name=CG0123 => FlyBaseName_gene +# Name=CG0123-RA => FlyBaseName_transcript +# Name=CG0123-PA => FlyBaseName_translations +# Dbxref=FlyBase:FBan => flybase_annotation_id +# Dbxref=FlyBase_Annotation_IDs:CG0123 => gadfly_gene_cgid +# Dbxref=FlyBase_Annotation_IDs:CG0123-RA => gadfly_transcript_cgid +# Dbxref=FlyBase_Annotation_IDs:CG0123-PA => gadfly_translation_id +# Alias= => flybase_synonym +# # For each line in the gff file for the above list of genes, transcript and translations, the following happens: # The unique_id is read in from ID= (FBgn, FBtr, FBpp). This is the direct xref for all xrefs of this entry. # An xref is made for the entry, using the ID as the xref's accession. Synonyms from Alias= are added to this xref. @@ -57,7 +70,7 @@ sub new { # my @gff_obj =qw( CDS exon gene mRNA ncRNA pseudogene rRNA snRNA snoRNA tRNA ); # this array may need to change between releases so check that it's updated - my @gff_obj =qw( CDS gene mRNA ncRNA snRNA tRNA rRNA pseudogene snoRNA miRNA); + my @gff_obj =qw( gene mRNA ncRNA snRNA tRNA rRNA pseudogene snoRNA miRNA); $self->gff_object_types(\@gff_obj); # @@ -129,7 +142,7 @@ sub new { $self->source_name_prefix_modDDgene('modDD_gene'); # For Dbxref=modDD my @gene_types = qw (gene) ; - my @translation_types = qw (CDS protein); + my @translation_types = qw (protein); # The transcript_types may change from release to release so check that this list is up-to-date my @transcript_types = qw (mRNA ncRNA snRNA tRNA rRNA pseudogene snoRNA miRNA); @@ -273,17 +286,17 @@ sub set_ensembl_object_type{ my ($self,$t) = @_ ; # $t is identifier in gff for object : CDS,mRNA,gene,pseudogene,snRNA,.... for my $hc (@{ $self->gene_types } ){ - if ($t=~m/$hc/){ + if ($t=~m/^$hc$/){ return 'gene'; } } for my $hc (@{ $self->translation_types } ){ - if ($t=~m/$hc/){ + if ($t=~m/^$hc$/){ return 'translation'; } } for my $hc (@{ $self->transcript_types} ){ - if ($t=~m/$hc/){ + if ($t=~m/^$hc$/){ return 'transcript'; } } -- GitLab