Skip to content
Snippets Groups Projects
Commit e17ed1c5 authored by edgrif's avatar edgrif
Browse files

Add SO term stuff to GFF parser code.

parent 3cf73701
No related branches found
No related tags found
No related merge requests found
......@@ -28,9 +28,9 @@
* of ZMapFeatureStruct's, one for each GFF source.
*
* HISTORY:
* Last edited: Jun 15 15:20 2004 (edgrif)
* Last edited: Jun 17 16:33 2004 (edgrif)
* Created: Sat May 29 13:18:32 2004 (edgrif)
* CVS info: $Id: zmapGFF.h,v 1.1 2004-06-15 14:30:25 edgrif Exp $
* CVS info: $Id: zmapGFF.h,v 1.2 2004-06-18 11:03:21 edgrif Exp $
*-------------------------------------------------------------------
*/
#ifndef ZMAP_GFF_H
......@@ -60,10 +60,16 @@ ZMapGFFParser zMapGFFCreateParser(void) ;
/* Parse one line of GFF input. The parser's line count is incremented on every call. */
gboolean zMapGFFParseLine(ZMapGFFParser parser, char *line) ;
/* If stop_on_error is TRUE the parser will not parse any further lines after the first error. */
void zMapGFFSetStopOnError(ZMapGFFParser parser, gboolean stop_on_error) ;
/* If SO_compliant is TRUE the parser will only accept SO terms for feature types. */
void zMapGFFSetSOCompliance(ZMapGFFParser parser, gboolean SO_compliant) ;
/* Return the parsed features as a GArray.
 * NOTE(review): the name uses a lowercase "zmap" prefix, unlike the other zMapGFF functions. */
GArray *zmapGFFGetFeatures(ZMapGFFParser parser) ;
/* Return the GFF version the parser is using, as determined from the header comments. */
int zMapGFFGetVersion(ZMapGFFParser parser) ;
/* Return the line number of the last line processed (equal to the number of lines processed). */
int zMapGFFGetLineNumber(ZMapGFFParser parser) ;
/* Return the last error in glib GError format, or NULL if there has been no error. */
GError *zMapGFFGetError(ZMapGFFParser parser) ;
/* Presumably sets the parser's free_on_destroy flag (TRUE => free all feature arrays when the
 * parser is destroyed) - TODO confirm against the implementation. */
void zMapGFFSetFreeOnDestroy(ZMapGFFParser parser, gboolean free_on_destroy) ;
......
......@@ -26,13 +26,14 @@
*
* Exported functions: See ZMap/zmapGFF.h
* HISTORY:
* Last edited: Jun 15 15:27 2004 (edgrif)
* Last edited: Jun 17 16:34 2004 (edgrif)
* Created: Fri May 28 14:25:12 2004 (edgrif)
* CVS info: $Id: zmapGFF2parser.c,v 1.1 2004-06-15 14:37:35 edgrif Exp $
* CVS info: $Id: zmapGFF2parser.c,v 1.2 2004-06-18 11:03:20 edgrif Exp $
*-------------------------------------------------------------------
*/
#include <stdio.h>
#include <strings.h>
#include <errno.h>
#include <glib.h>
#include <ZMap/zmapFeature.h>
......@@ -56,7 +57,7 @@ static char *getFeatureName(char *attributes) ;
static gboolean getHomolAttrs(char *attributes, ZMapHomolType *homol_type_out,
int *start_out, int *end_out) ;
static gboolean formatType(char *feature_type, ZMapFeatureType *type_out) ;
static gboolean formatType(gboolean SO_compliant, char *feature_type, ZMapFeatureType *type_out) ;
static gboolean formatScore(char *score_str, gdouble *score_out) ;
static gboolean formatStrand(char *strand_str, ZMapStrand *strand_out) ;
static gboolean formatPhase(char *phase_str, ZMapPhase *phase_out) ;
......@@ -66,6 +67,8 @@ static void getFeatureArray(GQuark key_id, gpointer data, gpointer user_data) ;
void destroyFeatureArray(gpointer data) ;
ZMapGFFParser zMapGFFCreateParser(void)
{
ZMapGFFParser parser ;
......@@ -75,10 +78,14 @@ ZMapGFFParser zMapGFFCreateParser(void)
parser->state = ZMAPGFF_PARSE_HEADER ;
parser->error = NULL ;
parser->error_domain = g_quark_from_string(ZMAP_GFF_ERROR) ;
parser->stop_on_error = FALSE ;
parser->line_count = 0 ;
parser->SO_compliant = FALSE ;
parser->done_version = FALSE ;
parser->gff_version = 0 ;
parser->gff_version = -1 ;
parser->done_source = FALSE ;
parser->source_name = parser->source_version = NULL ;
......@@ -117,6 +124,8 @@ gboolean zMapGFFParseLine(ZMapGFFParser parser, char *line)
{
gboolean result = FALSE ;
parser->line_count++ ;
/* Look for the header information. */
if (parser->state == ZMAPGFF_PARSE_HEADER)
{
......@@ -124,8 +133,9 @@ gboolean zMapGFFParseLine(ZMapGFFParser parser, char *line)
{
/* returns FALSE for two reasons: there was a parse error, or the header section has
* finished, for the latter we need to cancel the error. */
if (parser->error)
if (parser->error && parser->stop_on_error)
{
result = FALSE ;
parser->state = ZMAPGFF_PARSE_ERROR ;
}
else
......@@ -150,7 +160,7 @@ gboolean zMapGFFParseLine(ZMapGFFParser parser, char *line)
/* THIS NEEDS WORK, ONCE I'VE SORTED OUT ALL THE PARSING STUFF...... */
if (!(result = parseBodyLine(parser, line)))
{
if (parser->error)
if (parser->error && parser->stop_on_error)
{
result = FALSE ;
parser->state = ZMAPGFF_PARSE_ERROR ;
......@@ -178,6 +188,25 @@ GArray *zmapGFFGetFeatures(ZMapGFFParser parser)
}
/* Select the parser's error policy: with stop_on_error set to TRUE no further lines of the
 * GFF file are parsed once the first error has been encountered. */
void zMapGFFSetStopOnError(ZMapGFFParser parser, gboolean stop_on_error)
{
  parser->stop_on_error = stop_on_error;
}
/* Select which feature type names are acceptable: with SO_compliant set to TRUE only SO terms
 * are accepted by the parser as feature types. */
void zMapGFFSetSOCompliance(ZMapGFFParser parser, gboolean SO_compliant)
{
  parser->SO_compliant = SO_compliant;
}
/* Return the GFF version which the parser is using. This is determined from the GFF
* input stream from the header comments. */
int zMapGFFGetVersion(ZMapGFFParser parser)
......@@ -186,6 +215,13 @@ int zMapGFFGetVersion(ZMapGFFParser parser)
}
/* Report the number of the most recently processed line (this is the same as the total
 * number of lines processed so far). */
int zMapGFFGetLineNumber(ZMapGFFParser parser)
{
  int current_line = parser->line_count;

  return current_line;
}
/* If a zMapGFFNNN function has failed then this function returns a description of the error
* in the glib GError format. If there has been no error then NULL is returned. */
GError *zMapGFFGetError(ZMapGFFParser parser)
......@@ -287,7 +323,8 @@ static gboolean parseHeaderLine(ZMapGFFParser parser, char *line)
if ((fields = sscanf(line, format_str, &version)) != 1)
{
parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER,
"Bad ##gff-version line: \"%s\"", line) ;
"Bad ##gff-version line %d: \"%s\"",
parser->line_count, line) ;
result = FALSE ;
}
else
......@@ -307,7 +344,8 @@ static gboolean parseHeaderLine(ZMapGFFParser parser, char *line)
if ((fields = sscanf(line, format_str, &program[0], &version[0])) != 2)
{
parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER,
"Bad ##source-version line: \"%s\"", line) ;
"Bad ##source-version line %d: \"%s\"",
parser->line_count, line) ;
result = FALSE ;
}
else
......@@ -329,7 +367,8 @@ static gboolean parseHeaderLine(ZMapGFFParser parser, char *line)
if ((fields = sscanf(line, format_str, &sequence_name[0], &start, &end)) != 3)
{
parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER,
"Bad ##sequence-region line: \"%s\"", line) ;
"Bad ##sequence-region line %d: \"%s\"",
parser->line_count, line) ;
result = FALSE ;
}
else
......@@ -361,7 +400,7 @@ static gboolean parseHeaderLine(ZMapGFFParser parser, char *line)
* a number. The only tricky bit is to get at the attributes and comments which have
* white space in them, this scanf format string seems to do it:
*
* format_str = "%49s%49s%49s%d%d%49s%49s%49s %999[^#] %999c"
* format_str = "%50s%50s%50s%d%d%50s%50s%50s %999[^#] %999c"
*
* " %999[^#]" Jumps white space after the last mandatory field and then gets everything up to
* the next "#", so this will fail if people put a "#" in their attributes !
......@@ -380,10 +419,10 @@ static gboolean parseBodyLine(ZMapGFFParser parser, char *line)
source[GFF_MAX_FIELD_CHARS + 1] = {''}, feature_type[GFF_MAX_FIELD_CHARS + 1] = {''},
score_str[GFF_MAX_FIELD_CHARS + 1] = {''}, strand_str[GFF_MAX_FIELD_CHARS + 1] = {''},
phase_str[GFF_MAX_FIELD_CHARS + 1] = {''},
attributes[GFF_MAX_FIELD_CHARS + 1] = {''}, comments[GFF_MAX_FIELD_CHARS + 1] = {''} ;
attributes[GFF_MAX_FREETEXT_CHARS + 1] = {''}, comments[GFF_MAX_FREETEXT_CHARS + 1] = {''} ;
int start = 0, end = 0 ;
double score = 0 ;
char *format_str = "%49s%49s%49s%d%d%49s%49s%49s %999[^#] %999c" ;
char *format_str = "%50s%50s%50s%d%d%50s%50s%50s %1000[^#] %1000c" ;
int fields ;
......@@ -394,7 +433,8 @@ static gboolean parseBodyLine(ZMapGFFParser parser, char *line)
< GFF_MANDATORY_FIELDS)
{
parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_BODY,
"Mandatory fields missing in GFF line: \"%s\"", line) ;
"Mandatory fields missing in GFF line %d: \"%s\"",
parser->line_count, line) ;
result = FALSE ;
}
else
......@@ -403,15 +443,26 @@ static gboolean parseBodyLine(ZMapGFFParser parser, char *line)
ZMapStrand strand ;
ZMapPhase phase ;
/* I'm afraid I'm not doing assembly stuff at the moment, it's not worth it....if I need
* to change this decision I can just change this section.....
* Code just silently drops these lines.
* */
if (g_ascii_strcasecmp(source, "assembly_tag") == 0)
{
return TRUE ;
}
if (strlen(sequence) == GFF_MAX_FREETEXT_CHARS
|| strlen(source) == GFF_MAX_FREETEXT_CHARS
|| (strlen(feature_type) == GFF_MAX_FREETEXT_CHARS || !formatType(feature_type, &type))
|| (strlen(feature_type) == GFF_MAX_FREETEXT_CHARS ||
!formatType(parser->SO_compliant, feature_type, &type))
|| !formatScore(score_str, &score)
|| !formatStrand(strand_str, &strand)
|| !formatPhase(phase_str, &phase))
{
parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_BODY,
"Bad format GFF line: \"%s\"", line) ;
"Bad format GFF line %d: \"%s\"",
parser->line_count, line) ;
result = FALSE ;
}
else
......@@ -438,13 +489,15 @@ static gboolean makeNewFeature(ZMapGFFParser parser, char *sequence, char *sourc
ZMapFeature feature = NULL ;
char *first_attr = NULL ;
ZMapGFFParserFeatureSet feature_set = NULL ; ;
gboolean has_name = TRUE ;
/* Look for an explicit feature name for the GFF record, if none exists use the sequence
* name itself. */
if (!(feature_name = getFeatureName(attributes)))
feature_name = sequence ;
{
feature_name = sequence ;
has_name = FALSE ;
}
/* Check if the "source" for this feature is already known, if it is then check if there
* is already a multiline feature with the same name as we will need to augment it with this data. */
......@@ -497,8 +550,13 @@ static gboolean makeNewFeature(ZMapGFFParser parser, char *sequence, char *sourc
/* THIS PIECE OF CODE WILL NEED TO BE CHANGED AS I DO MORE TYPES..... */
/* If the feature is one that must be built up from several GFF lines then add it to
* our set of such features. */
if (feature_type != ZMAPFEATURE_HOMOL)
* our set of such features. There are arcane/adhoc rules in action here, any features
* that do not have their own feature_name _cannot_ be multiline features as such features
* can _only_ be identified if they do have their own name. */
if (has_name
&& (feature_type == ZMAPFEATURE_SEQUENCE || feature_type == ZMAPFEATURE_TRANSCRIPT
|| feature_type == ZMAPFEATURE_EXON || feature_type == ZMAPFEATURE_INTRON))
{
g_datalist_set_data(&(feature_set->multiline_features), feature_name, feature) ;
}
......@@ -639,7 +697,7 @@ static char *getFeatureName(char *attributes)
{
char *feature_name = NULL ;
int attr_fields ;
char *attr_format_str = "%49s %*[\"]%50[^\"]%*[\"]%*s" ;
char *attr_format_str = "%50s %*[\"]%50[^\"]%*[\"]%*s" ;
char class[GFF_MAX_FIELD_CHARS + 1] = {''}, name[GFF_MAX_FIELD_CHARS + 1] = {''} ;
......@@ -716,51 +774,65 @@ static gboolean getHomolAttrs(char *attributes, ZMapHomolType *homol_type_out,
* Not all of these are below because I'm not sure what type they should be classified
* in to.
*
*
* If SO_compliant is TRUE then only recognised SO terms will be accepted for feature
* types, if FALSE then both SO terms and the earlier, more ad hoc names will be accepted.
*
* */
gboolean formatType(char *feature_type, ZMapFeatureType *type_out)
gboolean formatType(gboolean SO_compliant, char *feature_type, ZMapFeatureType *type_out)
{
gboolean result = FALSE ;
ZMapFeatureType type = ZMAPFEATURE_INVALID ;
if (g_ascii_strcasecmp(feature_type, "Clone") == 0
|| g_ascii_strcasecmp(feature_type, "Clone_right_end") == 0
|| g_ascii_strcasecmp(feature_type, "Clone_left_end") == 0
|| g_ascii_strcasecmp(feature_type, "utr") == 0
|| g_ascii_strcasecmp(feature_type, "experimental") == 0
|| g_ascii_strcasecmp(feature_type, "reagent") == 0
|| g_ascii_strcasecmp(feature_type, "repeat") == 0
|| g_ascii_strcasecmp(feature_type, "structural") == 0
)
/* Is feature_type a SO term. */
if (g_ascii_strcasecmp(feature_type, "trans_splice_acceptor_site") == 0)
{
type = ZMAPFEATURE_BASIC ;
type = ZMAPFEATURE_BOUNDARY ;
}
else if (g_ascii_strcasecmp(feature_type, "Pseudogene") == 0)
else if (g_ascii_strcasecmp(feature_type, "transposable_element_insertion_site") == 0)
{
/* REALLY NOT SURE ABOUT THIS CLASSIFICATION......SHOULD IT BE A TRANSCRIPT ? */
type = ZMAPFEATURE_BASIC ;
type = ZMAPFEATURE_VARIATION ;
}
else if (g_ascii_strcasecmp(feature_type, "SNP") == 0)
if (g_ascii_strcasecmp(feature_type, "region") == 0)
{
type = ZMAPFEATURE_VARIATION ;
type = ZMAPFEATURE_BASIC ;
}
else if (g_ascii_strcasecmp(feature_type, "Sequence") == 0)
else if (g_ascii_strcasecmp(feature_type, "virtual_sequence") == 0)
{
type = ZMAPFEATURE_SEQUENCE ;
}
else if (g_ascii_strcasecmp(feature_type, "transcript") == 0)
else if (g_ascii_strcasecmp(feature_type, "reagent") == 0
|| g_ascii_strcasecmp(feature_type, "clone") == 0
|| g_ascii_strcasecmp(feature_type, "clone_end") == 0
|| g_ascii_strcasecmp(feature_type, "clone_end") == 0)
{
type = ZMAPFEATURE_BASIC ;
}
else if (g_ascii_strcasecmp(feature_type, "UTR") == 0)
{
/* this should in the end be part of a transcript..... */
type = ZMAPFEATURE_BASIC ;
}
else if (g_ascii_strcasecmp(feature_type, "pseudogene") == 0)
{
/* In SO terms this is a region but we don't have a basic "region" type that includes
* exons like structure...suggests we need to remodel our feature struct.... */
type = ZMAPFEATURE_TRANSCRIPT ;
}
else if (g_ascii_strcasecmp(feature_type, "similarity") == 0)
else if (g_ascii_strcasecmp(feature_type, "experimental_result_region") == 0
|| g_ascii_strcasecmp(feature_type, "chromosomal_structural_element") == 0)
{
type = ZMAPFEATURE_HOMOL ;
type = ZMAPFEATURE_BASIC ;
}
else if (g_ascii_strcasecmp(feature_type, "trans-splice_acceptor") == 0)
else if (g_ascii_strcasecmp(feature_type, "transcript") == 0
|| g_ascii_strcasecmp(feature_type, "protein_coding_primary_transcript") == 0
|| g_ascii_strcasecmp(feature_type, "CDS") == 0)
{
type = ZMAPFEATURE_BOUNDARY ;
type = ZMAPFEATURE_TRANSCRIPT ;
}
else if (g_ascii_strcasecmp(feature_type, "coding_exon") == 0
|| g_ascii_strcasecmp(feature_type, "exon") == 0)
else if (g_ascii_strcasecmp(feature_type, "exon") == 0)
{
type = ZMAPFEATURE_EXON ;
}
......@@ -768,7 +840,79 @@ gboolean formatType(char *feature_type, ZMapFeatureType *type_out)
{
type = ZMAPFEATURE_INTRON ;
}
else if (g_ascii_strcasecmp(feature_type, "nucleotide_match") == 0
|| g_ascii_strcasecmp(feature_type, "expressed_sequence_match") == 0
|| g_ascii_strcasecmp(feature_type, "EST_match") == 0
|| g_ascii_strcasecmp(feature_type, "cDNA_match") == 0
|| g_ascii_strcasecmp(feature_type, "translated_nucleotide_match") == 0
|| g_ascii_strcasecmp(feature_type, "protein_match") == 0)
{
type = ZMAPFEATURE_HOMOL ;
}
else if (g_ascii_strcasecmp(feature_type, "repeat_region") == 0
|| g_ascii_strcasecmp(feature_type, "inverted_repeat") == 0
|| g_ascii_strcasecmp(feature_type, "tandem_repeat") == 0)
{
type = ZMAPFEATURE_BASIC ;
}
else if (g_ascii_strcasecmp(feature_type, "SNP") == 0)
{
type = ZMAPFEATURE_VARIATION ;
}
if (!SO_compliant)
{
if (g_ascii_strcasecmp(feature_type, "Clone_right_end") == 0)
{
type = ZMAPFEATURE_BASIC ;
}
else if (g_ascii_strcasecmp(feature_type, "Clone") == 0
|| g_ascii_strcasecmp(feature_type, "Clone_left_end") == 0
|| g_ascii_strcasecmp(feature_type, "utr") == 0
|| g_ascii_strcasecmp(feature_type, "experimental") == 0
|| g_ascii_strcasecmp(feature_type, "reagent") == 0
|| g_ascii_strcasecmp(feature_type, "repeat") == 0
|| g_ascii_strcasecmp(feature_type, "structural") == 0)
{
type = ZMAPFEATURE_BASIC ;
}
else if (g_ascii_strcasecmp(feature_type, "Pseudogene") == 0)
{
/* REALLY NOT SURE ABOUT THIS CLASSIFICATION......SHOULD IT BE A TRANSCRIPT ? */
type = ZMAPFEATURE_TRANSCRIPT ;
}
else if (g_ascii_strcasecmp(feature_type, "SNP") == 0)
{
type = ZMAPFEATURE_VARIATION ;
}
else if (g_ascii_strcasecmp(feature_type, "Sequence") == 0)
{
type = ZMAPFEATURE_SEQUENCE ;
}
else if (g_ascii_strcasecmp(feature_type, "transcript") == 0)
{
type = ZMAPFEATURE_TRANSCRIPT ;
}
else if (g_ascii_strcasecmp(feature_type, "similarity") == 0
|| g_ascii_strcasecmp(feature_type, "transcription") == 0)
{
type = ZMAPFEATURE_HOMOL ;
}
else if (g_ascii_strcasecmp(feature_type, "trans-splice_acceptor") == 0)
{
type = ZMAPFEATURE_BOUNDARY ;
}
else if (g_ascii_strcasecmp(feature_type, "coding_exon") == 0
|| g_ascii_strcasecmp(feature_type, "exon") == 0)
{
type = ZMAPFEATURE_EXON ;
}
else if (g_ascii_strcasecmp(feature_type, "intron") == 0)
{
type = ZMAPFEATURE_INTRON ;
}
}
if (type != ZMAPFEATURE_INVALID)
......
......@@ -25,9 +25,9 @@
* Description:
* Exported functions: See XXXXXXXXXXXXX.h
* HISTORY:
* Last edited: Jun 15 15:29 2004 (edgrif)
* Last edited: Jun 17 13:52 2004 (edgrif)
* Created: Sat May 29 13:18:32 2004 (edgrif)
* CVS info: $Id: zmapGFF_P.h,v 1.1 2004-06-15 14:37:35 edgrif Exp $
* CVS info: $Id: zmapGFF_P.h,v 1.2 2004-06-18 11:03:21 edgrif Exp $
*-------------------------------------------------------------------
*/
#ifndef ZMAP_GFF_P_H
......@@ -37,7 +37,7 @@
/* Some defines for parsing stuff....my need v2 and v3 versions of these. */
/* Some defines for parsing stuff....may need v2 and v3 versions of these. */
/* GFF_MANDATORY_FIELDS:   minimum number of fields that must be scanned from a GFF body line.
 * GFF_MAX_FIELD_CHARS:    capacity (excluding the terminator) of the fixed-field buffers
 *                         such as source, feature type, score, strand and phase.
 * GFF_MAX_FREETEXT_CHARS: capacity (excluding the terminator) of the attributes and
 *                         comments buffers. */
enum {GFF_MANDATORY_FIELDS = 8, GFF_MAX_FIELD_CHARS = 50, GFF_MAX_FREETEXT_CHARS = 1000} ;
......@@ -60,6 +60,9 @@ typedef enum
/* Some features need to be built up from multiple GFF lines so we keep associations
* of these features in arrays. The arrays are indexed via sources. These arrays are only used
* while building up the final arrays of features. */
/* For each set of features that come from a single source, we keep an array of those features
* but also a list of features that need to be built up from several GFF lines. */
......@@ -86,8 +89,15 @@ typedef struct ZMapGFFParserFeatureSetStruct_
typedef struct ZMapGFFParserStruct_
{
ZMapGFFParseState state ;
GError *error ; /* Holds recoverable error. */
GError *error ; /* Holds last parser error. */
GQuark error_domain ;
gboolean stop_on_error ; /* Stop parsing if there is an error. */
int line_count ; /* Contains number of lines processed. */
gboolean SO_compliant ; /* TRUE => use only SO terms for
feature types. */
/* Header data, need to find all this for parsing to be valid. */
gboolean done_version ;
......@@ -101,30 +111,15 @@ typedef struct ZMapGFFParserStruct_
char *sequence_name ;
int sequence_start, sequence_end ;
/* Some features need to be built up from multiple GFF lines so we keep associations
* of these features in arrays. The arrays are indexed via sources. These arrays are only used
* while building up the final arrays of features. */
GData *feature_sets ; /* A list of arrays of features, the
arrays are indexed via their
"source". Hence each array contains
features from a single source. */
GData *feature_sets ; /* A list of ZMapGFFParserFeatureSetStruct.
There is one of these structs per
"source". The struct contains among
other things an array of all
features for that source. */
gboolean free_on_destroy ; /* TRUE => free all feature arrays
when parser is destroyed. */
#ifdef ED_G_NEVER_INCLUDE_THIS_CODE
/* Let's not worry about this for now.........we could even just return the array of arrays
* dynamically by zooming through the sources and extracting all the sources.... */
/* The final array of arrays of features. Each sub-array contains features that all have the
* same source. */
GArray *feature_sets ;
#endif /* ED_G_NEVER_INCLUDE_THIS_CODE */
} ZMapGFFParserStruct ;
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment