From fa1884e3741dbc00ecf467fd5ecb4d74f598c3d5 Mon Sep 17 00:00:00 2001
From: edgrif <edgrif>
Date: Mon, 30 Nov 2009 10:47:52 +0000
Subject: [PATCH] add code to parse sequence separately and simplify handling
 of parser state/errors.

---
 src/zmapGFF/zmapGFF2parser.c | 575 +++++++++++++++++++++--------------
 1 file changed, 344 insertions(+), 231 deletions(-)

diff --git a/src/zmapGFF/zmapGFF2parser.c b/src/zmapGFF/zmapGFF2parser.c
index 4bf427579..ec333e045 100755
--- a/src/zmapGFF/zmapGFF2parser.c
+++ b/src/zmapGFF/zmapGFF2parser.c
@@ -26,9 +26,9 @@
  *              
  * Exported functions: See ZMap/zmapGFF.h
  * HISTORY:
- * Last edited: Sep 11 14:35 2009 (edgrif)
+ * Last edited: Nov 30 10:26 2009 (edgrif)
  * Created: Fri May 28 14:25:12 2004 (edgrif)
- * CVS info:   $Id: zmapGFF2parser.c,v 1.95 2009-09-11 13:50:09 edgrif Exp $
+ * CVS info:   $Id: zmapGFF2parser.c,v 1.96 2009-11-30 10:47:52 edgrif Exp $
  *-------------------------------------------------------------------
  */
 
@@ -41,20 +41,14 @@
 #include <zmapGFF_P.h>
 
 
-/* THIS FILE NEEDS WORK TO COPE WITH ALIGN/BLOCK/COORD INFO..... */
-
-#ifdef RDS_NO_TO_HACKED_CODE
-/* HACKED CODE TO FIX UP LINKS IN BLOCK->SET->FEATURE */
-static void setBlock(GQuark key_id, gpointer data, gpointer user_data) ;
-static void setSet(GQuark key_id, gpointer data, gpointer user_data) ;
-/* END OF HACKED CODE TO FIX UP LINKS IN BLOCK->SET->FEATURE */
-#endif
 
 typedef enum {NAME_FIND, NAME_USE_SOURCE, NAME_USE_SEQUENCE} NameFindType ;
 
 
+
 static gboolean parseHeaderLine(ZMapGFFParser parser, char *line) ;
 static gboolean parseBodyLine(ZMapGFFParser parser, char *line) ;
+static gboolean parseSequenceLine(ZMapGFFParser parser, char *line) ;
 static gboolean makeNewFeature(ZMapGFFParser parser, NameFindType name_find,
 			       char *sequence, char *source, char *ontology,
 			       ZMapStyleMode feature_type,
@@ -90,7 +84,6 @@ static char *getNoteText(char *attributes) ;
 
 
 
-
 /* types is the list of methods/types, call it what you will that we want to see
  * in the output, we may need to filter the incoming data stream to get this.
  * 
@@ -98,14 +91,9 @@ static char *getNoteText(char *attributes) ;
  * _not_ create any features. This means the parser can be tested/used on huge datasets
  * without having to have huge amounts of memory to hold the feature structs.
  * You can only set parse_only when you create the parser, it cannot be set later. */
-ZMapGFFParser zMapGFFCreateParser(GData *sources, gboolean parse_only)
+ZMapGFFParser zMapGFFCreateParser(void)
 {
   ZMapGFFParser parser ;
-  GQuark locus_id ;
-
-#ifdef ED_G_NEVER_INCLUDE_THIS_CODE
-  g_list_foreach(sources, stylePrintCB, NULL) ; /* debug */
-#endif /* ED_G_NEVER_INCLUDE_THIS_CODE */
 
   parser = g_new0(ZMapGFFParserStruct, 1) ;
 
@@ -113,7 +101,6 @@ ZMapGFFParser zMapGFFCreateParser(GData *sources, gboolean parse_only)
   parser->error = NULL ;
   parser->error_domain = g_quark_from_string(ZMAP_GFF_ERROR) ;
   parser->stop_on_error = FALSE ;
-  parser->parse_only = parse_only ;
 
   parser->line_count = 0 ;
   parser->SO_compliant = FALSE ;
@@ -122,15 +109,37 @@ ZMapGFFParser zMapGFFCreateParser(GData *sources, gboolean parse_only)
   parser->clip_mode = GFF_CLIP_NONE ;
   parser->clip_start = parser->clip_end = 0 ;
 
-  parser->done_header = FALSE ;
-  parser->done_version = FALSE ;
+  parser->header_flags.done_header = FALSE ;
+  parser->header_flags.done_version = FALSE ;
   parser->gff_version = -1 ;
-  parser->done_source = FALSE ;
+  parser->header_flags.done_source = FALSE ;
   parser->source_name = parser->source_version = NULL ;
-  parser->done_sequence_region = FALSE ;
+  parser->header_flags.done_sequence_region = FALSE ;
   parser->sequence_name = NULL ;
   parser->features_start = parser->features_end = 0 ;
+
+  parser->raw_line_data = g_string_sized_new(2000);
+  parser->sequence_flags.done_finished = TRUE; /* default we don't parse the dna/protein */
+
+  /* Allocated dynamically as these fields in GFF can be big. */
+  parser->attributes_str = g_string_sized_new(GFF_MAX_FREETEXT_CHARS) ;
+  parser->comments_str = g_string_sized_new(GFF_MAX_FREETEXT_CHARS) ;
+
+  return parser ;
+}
+
+
+
+/* We should do this internally with a var in the parser struct.... */
+/* This function must be called prior to parsing feature lines, it is not required
+ * for either the header lines or sequences. */
+gboolean zMapGFFParserInitForFeatures(ZMapGFFParser parser, GData *sources, gboolean parse_only)
+{
+  gboolean result = FALSE ;
+  GQuark locus_id ;
+
   parser->sources = sources ;
+  parser->parse_only = parse_only ;
 
   /* Check for Locus as one of the sources as this needs to be constructed as we go along. */
   locus_id = zMapStyleCreateID(ZMAP_FIXED_STYLE_LOCUS_NAME) ;
@@ -158,48 +167,35 @@ ZMapGFFParser zMapGFFCreateParser(GData *sources, gboolean parse_only)
       parser->free_on_destroy  = FALSE ;
     }
 
-
-  parser->parsed_sequence.raw_line_data = g_string_sized_new(2000);
-  parser->parsed_sequence.finished = TRUE; /* default we don't parse the dna/protein */
-
-  /* Allocated dynamically as these fields in GFF can be big. */
-  parser->attributes_str = g_string_sized_new(GFF_MAX_FREETEXT_CHARS) ;
-  parser->comments_str = g_string_sized_new(GFF_MAX_FREETEXT_CHARS) ;
-
-
-  return parser ;
+  return result ;
 }
 
 
-
 /* Parses a single line of GFF data, should be called repeatedly with successive lines
- * GFF data from a GFF source. This function expects to find first the GFF header and
- * then the GFF data. (See zMapGFFParseHeader() if you want to parse out the header
- * first.
+ * of GFF data from a GFF source. This function expects the line to be a null-terminated
+ * C string with any terminating newline char to already have been removed (this latter
+ * because we don't know how the line is stored so don't want to write to it).
+ *
+ * This function expects to find the GFF header in the format below, once all these
+ * required header lines have been found or a non-comment line is found it will stop.
+ * The zMapGFFParseLine() function can then be used to parse the rest of the file.
  * 
- * This function expects a null-terminated C string that contains a complete GFF line
- * (comment or non-comment line), the function expects the caller to already have removed the
- * newline char from the end of the GFF line.
+ * ##gff-version 2
+ * ##source-version EnsEMBL2GFF 1.0
+ * ##date 2009-11-13
+ * ##sequence-region chr19-03 1000000 1010000
  * 
  * Returns FALSE if there is any error in the GFF header.
- * Returns FALSE if there is an error in the GFF body and stop_on_error == TRUE.
  *
  * Once an error has been returned the parser object cannot be used anymore and
  * zMapGFFDestroyParser() should be called to free it.
  *
- */
-
-/* ISSUE: need to decide on rules for comments, can they be embedded within other gff lines, are
- * the header comments compulsory ? etc. etc. 
- * 
- * Current code assumes that the header block will be a contiguous set of header lines
- * at the top of the file and that the first non-header line marks the beginning
- * of the GFF data. If this is not true then its an error.
  */ 
-gboolean zMapGFFParseLine(ZMapGFFParser parser, char *line)
+gboolean zMapGFFParseHeader(ZMapGFFParser parser, char *line, gboolean *header_finished)
 {
   gboolean result = FALSE ;
 
+  zMapLogReturnValIfFail((parser && line && header_finished), FALSE) ;
 
   parser->line_count++ ;
 
@@ -208,55 +204,83 @@ gboolean zMapGFFParseLine(ZMapGFFParser parser, char *line)
     {
       if (!(result = parseHeaderLine(parser, line)))
 	{
-	  /* returns FALSE for two reasons: there was a parse error (note that we ignore
-	   * stop_on_error, the header _must_ be correct), or the header section has
-	   * finished - in this case we need to cancel the error and reparse the line. */
-	  if (parser->error)
+	  parser->state = ZMAPGFF_PARSE_ERROR ;
+	}
+      else
+	{
+	  if (parser->header_flags.done_header)
 	    {
-	      result = FALSE ;
-	      parser->state = ZMAPGFF_PARSE_ERROR ;
+	      parser->state = ZMAPGFF_PARSE_BODY ;
+	      *header_finished = TRUE ;
 	    }
 	  else
 	    {
-	      result = TRUE ;
-
-	      /* If we found all the header parts move on to the body. */
-	      if (parser->done_header)
-		parser->state = ZMAPGFF_PARSE_BODY ;
+	      *header_finished = FALSE ;
 	    }
 	}
     }
 
-  /* Note can only be in parse body state if header correctly parsed. */
+  return result ;
+}
+
+
+/* Parses a single line of GFF data, should be called repeatedly with successive lines
+ * of GFF data from a GFF source. This function expects the line to be a null-terminated
+ * C string with any terminating newline char to already have been removed (this latter
+ * because we don't know how the line is stored so don't want to write to it).
+ *
+ * This function expects to find sequence in the GFF format format below, once all the
+ * sequence lines have been found or a non-comment line is found it will stop.
+ * The zMapGFFParseLine() function can then be used to parse the rest of the file.
+ * 
+ * Returns FALSE if there is any error in the GFF sequence.
+ *
+ * Once an error has been returned the parser object cannot be used anymore and
+ * zMapGFFDestroyParser() should be called to free it.
+ *
+ * Returns TRUE in sequence_finished once all the sequence is parsed.
+ */ 
+gboolean zMapGFFParseSequence(ZMapGFFParser parser, char *line, gboolean *sequence_finished)
+{
+  gboolean result = FALSE ;
+
+  zMapLogReturnValIfFail((parser && line && sequence_finished), FALSE) ;
+
+  parser->line_count++ ;
+
   if (parser->state == ZMAPGFF_PARSE_BODY)
+    parser->state = ZMAPGFF_PARSE_SEQUENCE ;
+
+  if (parser->state == ZMAPGFF_PARSE_SEQUENCE)
     {
-      /* Skip over comment lines, this is a CRUDE test, probably need something more subtle. */
-      if (*(line) == '#')
-	result = TRUE ;
+      if (!(result = parseSequenceLine(parser, line)))
+	{
+	  parser->state = ZMAPGFF_PARSE_ERROR ;
+	}
       else
 	{
-	  /* THIS NEEDS WORK, ONCE I'VE SORTED OUT ALL THE PARSING STUFF...... */
-	  if (!(result = parseBodyLine(parser, line)))
+	  if (parser->sequence_flags.done_finished)
 	    {
-	      if (parser->error && parser->stop_on_error)
-		{
-		  result = FALSE ;
-		  parser->state = ZMAPGFF_PARSE_ERROR ;
-		}
+	      parser->state = ZMAPGFF_PARSE_BODY ;
+	      *sequence_finished = TRUE ;
+	    }
+	  else
+	    {
+	      *sequence_finished = FALSE ;
 	    }
 	}
     }
 
-
   return result ;
 }
 
 
 
+
 /* Parses a single line of GFF data, should be called repeatedly with successive lines
- * GFF data from a GFF source. This function expects to find the GFF header, once all
- * the required header lines have been found or a non-comment line is found it will stop.
- * The zMapGFFParseLine() function can then be used to parse the rest of the file.
+ * GFF data from a GFF source. This function expects to find first the GFF header and
+ * then the GFF data. (See zMapGFFParseHeader() if you want to parse out the header
+ * first.
  * 
  * This function expects a null-terminated C string that contains a complete GFF line
  * (comment or non-comment line), the function expects the caller to already have removed the
@@ -268,15 +292,19 @@ gboolean zMapGFFParseLine(ZMapGFFParser parser, char *line)
  * Once an error has been returned the parser object cannot be used anymore and
  * zMapGFFDestroyParser() should be called to free it.
  *
+ */
+
+/* ISSUE: need to decide on rules for comments, can they be embedded within other gff lines, are
+ * the header comments compulsory ? etc. etc. 
+ * 
  * Current code assumes that the header block will be a contiguous set of header lines
  * at the top of the file and that the first non-header line marks the beginning
  * of the GFF data. If this is not true then its an error.
  */ 
-gboolean zMapGFFParseHeader(ZMapGFFParser parser, char *line, gboolean *header_finished)
+gboolean zMapGFFParseLine(ZMapGFFParser parser, char *line)
 {
   gboolean result = FALSE ;
 
-  zMapAssert(parser && line && header_finished) ;
 
   parser->line_count++ ;
 
@@ -285,9 +313,6 @@ gboolean zMapGFFParseHeader(ZMapGFFParser parser, char *line, gboolean *header_f
     {
       if (!(result = parseHeaderLine(parser, line)))
 	{
-	  /* returns FALSE for two reasons: there was a parse error (note that we ignore
-	   * stop_on_error, the header _must_ be correct), or the header section has
-	   * finished - in this case we need to cancel the error and reparse the line. */
 	  if (parser->error)
 	    {
 	      result = FALSE ;
@@ -298,16 +323,52 @@ gboolean zMapGFFParseHeader(ZMapGFFParser parser, char *line, gboolean *header_f
 	      result = TRUE ;
 
 	      /* If we found all the header parts move on to the body. */
-	      if (parser->done_header)
+	      if (parser->header_flags.done_header)
+		parser->state = ZMAPGFF_PARSE_BODY ;
+	    }
+	}
+    }
+
+  /* Note can only be in parse body state if header correctly parsed. */
+  if (parser->state == ZMAPGFF_PARSE_BODY)
+    {
+      if (g_str_has_prefix(line, "##DNA"))
+	{
+	  parser->state = ZMAPGFF_PARSE_SEQUENCE ;
+	}
+      else if (*(line) == '#')
+	{
+	  /* Skip over comment lines, this is a CRUDE test, probably need something more subtle. */	  
+	  result = TRUE ;
+	}
+      else
+	{
+	  if (!(result = parseBodyLine(parser, line)))
+	    {
+	      if (parser->error && parser->stop_on_error)
 		{
-		  parser->state = ZMAPGFF_PARSE_BODY ;
-		  *header_finished = TRUE ;
+		  result = FALSE ;
+		  parser->state = ZMAPGFF_PARSE_ERROR ;
 		}
 	    }
 	}
+    }
+
+
+  if (parser->state == ZMAPGFF_PARSE_SEQUENCE)
+    {
+      if (!(result = parseSequenceLine(parser, line)))
+	{
+	  if (parser->error && parser->stop_on_error)
+	    {
+	      result = FALSE ;
+	      parser->state = ZMAPGFF_PARSE_ERROR ;
+	    }
+	}
       else
 	{
-	  *header_finished = FALSE ;
+	  if (parser->sequence_flags.done_finished)
+	    parser->state = ZMAPGFF_PARSE_BODY ;
 	}
     }
 
@@ -316,6 +377,15 @@ gboolean zMapGFFParseHeader(ZMapGFFParser parser, char *line, gboolean *header_f
 
 
 
+/* WE NEED A NEW FUNC THAT PARSES A WHOLE STREAM.....AND HAS THESE RULES IN IT. */
+
+/* Current code assumes that the header block will be a contiguous set of header lines
+ * at the top of the file and that the first non-header line marks the beginning
+ * of the GFF data. If this is not true then its an error. */
+
+
+
+
 
 /* Returns as much information as possible from the header comments of the gff file.
  * Note that our current parsing code makes this an all or nothing piece of code:
@@ -324,7 +394,7 @@ ZMapGFFHeader zMapGFFGetHeader(ZMapGFFParser parser)
 {
   ZMapGFFHeader header = NULL ;
 
-  if (parser->done_header)
+  if (parser->header_flags.done_header)
     {
       header = g_new0(ZMapGFFHeaderStruct, 1) ;
 
@@ -345,7 +415,7 @@ gboolean zMapGFFParserSetSequenceFlag(ZMapGFFParser parser)
 {
   gboolean set = TRUE;
 
-  parser->parsed_sequence.finished = FALSE;
+  parser->sequence_flags.done_finished = FALSE;
 
   return set;
 }
@@ -354,20 +424,18 @@ ZMapSequence zMapGFFGetSequence(ZMapGFFParser parser)
 {
   ZMapSequence sequence = NULL;
 
-  if(parser->done_header)
+  if (parser->header_flags.done_header)
     {
-      /* parsed_sequence.raw_line_data == NULL means we got to the end-XXXXXX  */
-
-      if(parser->parsed_sequence.seq_data.type != ZMAPSEQUENCE_NONE &&
-	 parser->parsed_sequence.seq_data.sequence != NULL &&
-	 parser->parsed_sequence.raw_line_data == NULL)
+      if(parser->seq_data.type != ZMAPSEQUENCE_NONE
+	 && (parser->seq_data.sequence != NULL && parser->raw_line_data == NULL))
 	{
 	  sequence = g_new0(ZMapSequenceStruct, 1);
-	  *sequence = parser->parsed_sequence.seq_data;
+	  *sequence = parser->seq_data;
 	  sequence->name = g_quark_from_string(parser->sequence_name);
+
 	  /* So we don't copy empty data */
-	  parser->parsed_sequence.seq_data.type     = ZMAPSEQUENCE_NONE;
-	  parser->parsed_sequence.seq_data.sequence = NULL; /* So it doesn't get free'd */
+	  parser->seq_data.type     = ZMAPSEQUENCE_NONE;
+	  parser->seq_data.sequence = NULL; /* So it doesn't get free'd */
 	}
     }
 
@@ -378,11 +446,15 @@ void zMapGFFFreeHeader(ZMapGFFHeader header)
 {
   zMapAssert(header) ;
 
+
+#ifdef ED_G_NEVER_INCLUDE_THIS_CODE
   g_free(header->source_name) ;
   g_free(header->source_version) ;
   g_free(header->sequence_name) ;
 
   g_free(header) ;
+#endif /* ED_G_NEVER_INCLUDE_THIS_CODE */
+
 
   return ;
 }
@@ -569,6 +641,8 @@ void zMapGFFSetFreeOnDestroy(ZMapGFFParser parser, gboolean free_on_destroy)
 
 void zMapGFFDestroyParser(ZMapGFFParser parser)
 {
+
+#ifdef ED_G_NEVER_INCLUDE_THIS_CODE
   if (parser->error)
     g_error_free(parser->error) ;
 
@@ -577,6 +651,9 @@ void zMapGFFDestroyParser(ZMapGFFParser parser)
   if (parser->source_version)
     g_free(parser->source_version) ;
 
+  if (parser->date)
+    g_free(parser->date) ;
+
   if (parser->sequence_name)
     g_free(parser->sequence_name) ;
 
@@ -589,6 +666,8 @@ void zMapGFFDestroyParser(ZMapGFFParser parser)
   g_string_free(parser->comments_str, TRUE) ;
 
   g_free(parser) ;
+#endif /* ED_G_NEVER_INCLUDE_THIS_CODE */
+
 
   return ;
 }
@@ -626,193 +705,227 @@ static gboolean parseHeaderLine(ZMapGFFParser parser, char *line)
   enum {FIELD_BUFFER_LEN = 1001} ;			    /* If you change this, change the
 							       scanf's below... */
 
-
-  if (!g_str_has_prefix(line, "##"))
-    {
-      /* If we encounter a non-header comment line and we haven't yet finished the header
-       * then its an error, otherwise we just return FALSE as the line is probably the first.
-       * line of the GFF body. */
-      if (!parser->done_header)
-	parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER,
-				    "Bad ## line %d: \"%s\"",
-				    parser->line_count, line) ;
-
-      result = FALSE ;
-    }
-  else
+  if (parser->state == ZMAPGFF_PARSE_HEADER)
     {
-      int fields = 0 ;
-      char *format_str = NULL ;
+      gboolean comment_line ;
 
-      /* There may be other header comments that we are not interested in so we just return TRUE. */
-      result = TRUE ;
+      comment_line = g_str_has_prefix(line, "##") ;
 
-      /* Note that number of fields returned by sscanf calls does not include the initial ##<word>
-       * as this is not assigned to a variable. */
-      /* this could be turned into a table driven piece of code but not worth the effort currently. */
-      if (g_str_has_prefix(line, "##gff-version")
-	  && !parser->done_version)
+      if (!parser->header_flags.done_header && !comment_line)
 	{
-	  int version ;
+	  /* If we encounter a non-header comment line and we haven't yet finished the header
+	   * then its an error. */
+	  result = FALSE ;
 
-	  fields = 1 ;
-	  format_str = "%*13s%d" ;
-	  
-	  if ((fields = sscanf(line, format_str, &version)) != 1)
-	    {
-	      parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER,
-					  "Bad ##gff-version line %d: \"%s\"",
-					  parser->line_count, line) ;
-	      result = FALSE ;
-	    }
-	  else
-	    {
-	      parser->gff_version = version ;
-	      parser->done_version = TRUE ;
-	    }
+	  parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER,
+				      "Bad ## line %d: \"%s\"",
+				      parser->line_count, line) ;
 	}
-      else if (g_str_has_prefix(line, "##source-version")
-	       && !parser->done_source)
+      else if (comment_line)
 	{
-	  char program[FIELD_BUFFER_LEN] = {'\0'}, version[FIELD_BUFFER_LEN] = {'\0'} ;
+	  int fields = 0 ;
+	  char *format_str = NULL ;
 
-	  fields = 3 ;
-	  format_str = "%*s%1000s%1000s" ;
-	  
-	  if ((fields = sscanf(line, format_str, &program[0], &version[0])) != 2)
-	    {
-	      parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER,
-					  "Bad ##source-version line %d: \"%s\"",
-					  parser->line_count, line) ;
-	      result = FALSE ;
-	    }
-	  else
-	    {
-	      parser->source_name = g_strdup(&program[0]) ;
-	      parser->source_version = g_strdup(&version[0]) ;
-	      parser->done_source = TRUE ;
-	    }
-     	}
-      else if (g_str_has_prefix(line, "##sequence-region") && !parser->done_sequence_region)
-	{
-	  char sequence_name[FIELD_BUFFER_LEN] = {'\0'} ;
-	  int start = 0, end = 0 ;
+	  /* There may be other header comments that we are not interested in so we just return TRUE. */
+	  result = TRUE ;
 
-	  fields = 4 ;
-	  format_str = "%*s%1000s%d%d" ;
-	  
-	  if ((fields = sscanf(line, format_str, &sequence_name[0], &start, &end)) != 3)
+	  /* Note that number of fields returned by sscanf calls does not include the initial ##<word>
+	   * as this is not assigned to a variable. */
+	  /* this could be turned into a table driven piece of code but not worth the effort currently. */
+	  if (g_str_has_prefix(line, "##gff-version") && !parser->header_flags.done_version)
 	    {
-	      parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER,
-					  "Bad ##sequence-region line %d: \"%s\"",
-					  parser->line_count, line) ;
-	      result = FALSE ;
-	    }
-	  else
-	    {
-	      parser->sequence_name = g_strdup(&sequence_name[0]) ;
-	      parser->features_start = start ;
-	      parser->features_end = end ;
-	      parser->done_sequence_region = TRUE ;
+	      int version ;
 
-	      /* If Clip start/end not set, they default to features start/end. */
-	      if (parser->clip_start == 0)
+	      fields = 1 ;
+	      format_str = "%*13s%d" ;
+	  
+	      if ((fields = sscanf(line, format_str, &version)) != 1)
+		{
+		  parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER,
+					      "Bad ##gff-version line %d: \"%s\"",
+					      parser->line_count, line) ;
+		  result = FALSE ;
+		}
+	      else
 		{
-		  parser->clip_start = parser->features_start ;
-		  parser->clip_end = parser->features_end ;
+		  parser->gff_version = version ;
+		  parser->header_flags.done_version = TRUE ;
 		}
 	    }
-     
-	}
-      else if(!parser->parsed_sequence.finished)
-	{
-	  if(g_str_has_prefix(line, "##Type"))
+	  else if (g_str_has_prefix(line, "##source-version") && !parser->header_flags.done_source)
 	    {
-	      char seq_type[11] = {'\0'};
-	      fields = 2;
-	      format_str = "%*6s%10s";
-	      if((fields = sscanf(line, format_str, &seq_type)) != 2)
+	      char program[FIELD_BUFFER_LEN] = {'\0'}, version[FIELD_BUFFER_LEN] = {'\0'} ;
+
+	      fields = 2 ;
+	      format_str = "%*s%1000s%1000s" ;
+	  
+	      if ((fields = sscanf(line, format_str, &program[0], &version[0])) != 2)
 		{
 		  parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER,
-					      "Bad ##Type line %d: \"%s\"", 
-					      parser->line_count, line);
+					      "Bad ##source-version line %d: \"%s\"",
+					      parser->line_count, line) ;
+		  result = FALSE ;
 		}
 	      else
 		{
-		  if(g_ascii_strcasecmp(seq_type, "DNA") == 0)
-		    {
-		      parser->parsed_sequence.seq_data.type = ZMAPSEQUENCE_DNA;
-		    }
-		  else if(g_ascii_strcasecmp(seq_type, "Protein"))
-		    {
-		      parser->parsed_sequence.seq_data.type = ZMAPSEQUENCE_PEPTIDE;
-		    }
+		  parser->source_name = g_strdup(&program[0]) ;
+		  parser->source_version = g_strdup(&version[0]) ;
+		  parser->header_flags.done_source = TRUE ;
 		}
 	    }
-
-	  if(!parser->parsed_sequence.in_sequence_block)
+	  else if (g_str_has_prefix(line, "##date") && !parser->header_flags.done_date)
 	    {
-	      gboolean in_block = FALSE;
+	      char date[FIELD_BUFFER_LEN] = {'\0'} ;
 
-	      if((parser->parsed_sequence.seq_data.type == ZMAPSEQUENCE_NONE))
+	      fields = 1 ;
+	      format_str = "%*s%1000s" ;
+
+	      if ((fields = sscanf(line, format_str, &date)) != 1)
 		{
-		  if(g_str_has_prefix(line, "##DNA"))
-		    parser->parsed_sequence.seq_data.type = ZMAPSEQUENCE_DNA;
-		  else if(g_str_has_prefix(line, "##Protein"))
-		    parser->parsed_sequence.seq_data.type = ZMAPSEQUENCE_PEPTIDE;
+		  parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER,
+					      "Bad ##date line %d: \"%s\"",
+					      parser->line_count, line) ;
+		  result = FALSE ;
 		}
-
-	      if(g_str_has_prefix(line, "##DNA") && parser->parsed_sequence.seq_data.type == ZMAPSEQUENCE_DNA)
+	      else
 		{
-		  in_block = TRUE;
+		  parser->date = g_strdup(&date[0]) ;
 		}
-	      else if(g_str_has_prefix(line, "##Protein") && parser->parsed_sequence.seq_data.type == ZMAPSEQUENCE_PEPTIDE)
+	    }
+	  else if (g_str_has_prefix(line, "##sequence-region") && !parser->header_flags.done_sequence_region)
+	    {
+	      char sequence_name[FIELD_BUFFER_LEN] = {'\0'} ;
+	      int start = 0, end = 0 ;
+
+	      fields = 4 ;
+	      format_str = "%*s%1000s%d%d" ;
+	  
+	      if ((fields = sscanf(line, format_str, &sequence_name[0], &start, &end)) != 3)
 		{
-		  in_block = TRUE;		  
+		  parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER,
+					      "Bad ##sequence-region line %d: \"%s\"",
+					      parser->line_count, line) ;
+		  result = FALSE ;
 		}
+	      else
+		{
+		  parser->sequence_name = g_strdup(&sequence_name[0]) ;
+		  parser->features_start = start ;
+		  parser->features_end = end ;
+		  parser->header_flags.done_sequence_region = TRUE ;
 
-	      parser->parsed_sequence.in_sequence_block = in_block;
+		  /* If Clip start/end not set, they default to features start/end. */
+		  if (parser->clip_start == 0)
+		    {
+		      parser->clip_start = parser->features_start ;
+		      parser->clip_end = parser->features_end ;
+		    }
+		}
+     
 	    }
-	  else if(g_str_has_prefix(line, "##end-")) /* I don't think we really need to care about matching type */
+	  else if (g_str_has_prefix(line, "##Type") && !parser->header_flags.done_type)
 	    {
-	      parser->parsed_sequence.seq_data.length   = parser->parsed_sequence.raw_line_data->len;
-	      parser->parsed_sequence.seq_data.sequence = g_string_free(parser->parsed_sequence.raw_line_data, FALSE);
-	      parser->parsed_sequence.raw_line_data     = NULL;
-	      parser->parsed_sequence.finished          = TRUE;	      
+	      /* We don't currently scan for type because we don't support any sequence
+	       * type other than DNA. */
+
+	      parser->header_flags.done_type = TRUE ;
 	    }
-	  else
+
+	  if (parser->header_flags.done_version && parser->header_flags.done_source
+	      && parser->header_flags.done_sequence_region)
 	    {
-	      char *line_ptr = line;
-	      /* must be sequence */
-	      line_ptr+=2;	/* move past ## */
-	      /* save the string */
-	      g_string_append(parser->parsed_sequence.raw_line_data, line_ptr);
+	      parser->header_flags.done_header = TRUE ;
 	    }
 	}
+    }
+
+  return result ;
+}
+
+
+/* This function expects a null-terminated C string that contains a GFF sequence line
+ * which is a special form of comment line starting with a "##".
+ *
+ * GFF version 2 format for sequence lines is:
+ *
+ * ##DNA
+ * ##CGGGCTTTCACCATGTTGGCCAGGCT...CTGCCCGCCTCGGCCTCCCA
+ * ##end-DNA
+ *
+ * We only support DNA sequences, anything else is an error.
+ *
+ * Returns FALSE (and sets parse->error) if there was a parse error,
+ * TRUE if not. parser->sequence_flags.done_finished is set to TRUE
+ * when sequence parse has finished.
+ */
+static gboolean parseSequenceLine(ZMapGFFParser parser, char *line)
+{
+  gboolean result = FALSE ;
+  enum {FIELD_BUFFER_LEN = 1001} ;			    /* If you change this, change the
+							       scanf's below... */
+
+  if (parser->state == ZMAPGFF_PARSE_SEQUENCE)
+    {
+      if (!parser->sequence_flags.done_finished && !g_str_has_prefix(line, "##"))
+	{
+	  /* If we encounter a non-sequence line and we haven't yet finished the sequence
+	   * then its an error. */
+	  result = FALSE ;
 
-      if (parser->done_version && parser->done_source && parser->done_sequence_region &&
-	  parser->parsed_sequence.finished)
+	  parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER,
+				      "Bad ## line %d: \"%s\"",
+				      parser->line_count, line) ;
+	}
+      else
 	{
-	  parser->done_header = TRUE ;
+	  result = TRUE ;
 
-	  if((parser->parsed_sequence.seq_data.type == ZMAPSEQUENCE_DNA) &&
-	     (parser->features_end - parser->features_start + 1) != parser->parsed_sequence.seq_data.length)
+	  if (g_str_has_prefix(line, "##DNA") && !parser->sequence_flags.done_start)
 	    {
-	      parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER,
-					  "##sequence-region length [%d] does not match DNA base count [%d].", 
-					  (parser->features_end - parser->features_start + 1), 
-					  parser->parsed_sequence.seq_data.length);
+	      parser->sequence_flags.done_start = TRUE ;
+	      parser->seq_data.type = ZMAPSEQUENCE_DNA ;
 	    }
+	  else if (g_str_has_prefix(line, "##end-DNA") && parser->sequence_flags.in_sequence_block)
+	    {
+	      if ((parser->features_end - parser->features_start + 1) != parser->raw_line_data->len)
+		{
+		  parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER,
+					      "##sequence-region length [%d] does not match DNA base count [%d].", 
+					      (parser->features_end - parser->features_start + 1), 
+					      parser->seq_data.length);
+
+		  g_string_free(parser->raw_line_data, TRUE) ;
+		  parser->raw_line_data = NULL ;
+		}
+	      else
+		{
+		  parser->seq_data.length = parser->raw_line_data->len ;
+		  parser->seq_data.sequence = g_string_free(parser->raw_line_data, FALSE) ;
+		  parser->raw_line_data = NULL ;
+		  parser->sequence_flags.done_finished = TRUE ;
+		}
+	    }
+	  else if (g_str_has_prefix(line, "##")
+		   && (parser->sequence_flags.done_start || parser->sequence_flags.in_sequence_block))
+	    {
+	      char *line_ptr = line ;
 
+	      parser->sequence_flags.in_sequence_block = TRUE ;
+
+	      /* must be sequence */
+	      line_ptr+=2 ;					    /* move past ## */
+	      /* save the string */
+	      g_string_append(parser->raw_line_data, line_ptr) ;
+	    }
 	}
     }
 
-
   return result ;
 }
 
 
 
+
 /* This function expects a null-terminated C string that contains a complete GFF record
  * (i.e. a non-comment line), the function expects the caller to already have removed the
  * newline char from the end of the GFF record.
-- 
GitLab