From 84bfea53ec8f36e4af0a82d0f6863e7f8c490dab Mon Sep 17 00:00:00 2001
From: rds <rds>
Date: Thu, 2 Jul 2009 22:21:32 +0000
Subject: [PATCH] implement sequence fetching from gff file

---
 src/include/ZMap/zmapGFF.h       |   6 +-
 src/zmapGFF/zmapGFF2parser.c     | 155 ++++++++++++++++++++-------
 src/zmapGFF/zmapGFF_P.h          |  12 ++-
 src/zmapServer/file/fileServer.c | 176 +++++++++++++++++++++++++++++--
 4 files changed, 301 insertions(+), 48 deletions(-)

diff --git a/src/include/ZMap/zmapGFF.h b/src/include/ZMap/zmapGFF.h
index 67b1be32a..d2dc7a0e2 100755
--- a/src/include/ZMap/zmapGFF.h
+++ b/src/include/ZMap/zmapGFF.h
@@ -28,9 +28,9 @@
  *              of ZMapFeatureStruct's, one for each GFF source.
  *              
  * HISTORY:
- * Last edited: Apr 24 11:01 2009 (edgrif)
+ * Last edited: Jul  2 21:26 2009 (rds)
  * Created: Sat May 29 13:18:32 2004 (edgrif)
- * CVS info:   $Id: zmapGFF.h,v 1.17 2009-04-24 10:36:18 edgrif Exp $
+ * CVS info:   $Id: zmapGFF.h,v 1.18 2009-07-02 22:22:53 rds Exp $
  *-------------------------------------------------------------------
  */
 #ifndef ZMAP_GFF_H
@@ -101,6 +101,8 @@ typedef struct
 ZMapGFFParser zMapGFFCreateParser(GData *sources, gboolean parse_only) ;
 gboolean zMapGFFParseHeader(ZMapGFFParser parser, char *line, gboolean *header_finished) ;
 gboolean zMapGFFParseLine(ZMapGFFParser parser, char *line) ;
+gboolean zMapGFFParserSetSequenceFlag(ZMapGFFParser parser);
+ZMapSequence zMapGFFGetSequence(ZMapGFFParser parser);
 void zMapGFFParseSetSourceHash(ZMapGFFParser parser,
 			       GHashTable *source_2_feature_set, GHashTable *source_2_sourcedata) ;
 void zMapGFFSetStopOnError(ZMapGFFParser parser, gboolean stop_on_error) ;
diff --git a/src/zmapGFF/zmapGFF2parser.c b/src/zmapGFF/zmapGFF2parser.c
index a033cdf5a..914d4ecca 100755
--- a/src/zmapGFF/zmapGFF2parser.c
+++ b/src/zmapGFF/zmapGFF2parser.c
@@ -26,9 +26,9 @@
  *              
  * Exported functions: See ZMap/zmapGFF.h
  * HISTORY:
- * Last edited: Jun 25 15:51 2009 (edgrif)
+ * Last edited: Jul  2 21:27 2009 (rds)
  * Created: Fri May 28 14:25:12 2004 (edgrif)
- * CVS info:   $Id: zmapGFF2parser.c,v 1.92 2009-06-25 14:56:28 edgrif Exp $
+ * CVS info:   $Id: zmapGFF2parser.c,v 1.93 2009-07-02 22:21:59 rds Exp $
  *-------------------------------------------------------------------
  */
 
@@ -159,6 +159,9 @@ ZMapGFFParser zMapGFFCreateParser(GData *sources, gboolean parse_only)
     }
 
 
+  parser->parsed_sequence.raw_line_data = g_string_sized_new(2000);
+  parser->parsed_sequence.finished = TRUE; /* default we don't parse the dna/protein */
+
   /* Allocated dynamically as these fields in GFF can be big. */
   parser->attributes_str = g_string_sized_new(GFF_MAX_FREETEXT_CHARS) ;
   parser->comments_str = g_string_sized_new(GFF_MAX_FREETEXT_CHARS) ;
@@ -338,6 +341,38 @@ ZMapGFFHeader zMapGFFGetHeader(ZMapGFFParser parser)
   return header ;
 }
 
+gboolean zMapGFFParserSetSequenceFlag(ZMapGFFParser parser)
+{
+  gboolean set = TRUE;
+
+  parser->parsed_sequence.finished = FALSE;
+
+  return set;
+}
+
+ZMapSequence zMapGFFGetSequence(ZMapGFFParser parser)
+{
+  ZMapSequence sequence = NULL;
+
+  if(parser->done_header)
+    {
+      /* parsed_sequence.raw_line_data == NULL means we got to the end-XXXXXX  */
+
+      if(parser->parsed_sequence.seq_data.type != ZMAPSEQUENCE_NONE &&
+	 parser->parsed_sequence.seq_data.sequence != NULL &&
+	 parser->parsed_sequence.raw_line_data == NULL)
+	{
+	  sequence = g_new0(ZMapSequenceStruct, 1);
+	  *sequence = parser->parsed_sequence.seq_data;
+	  sequence->name = g_quark_from_string(parser->sequence_name);
+	  /* So we don't copy empty data */
+	  parser->parsed_sequence.seq_data.type     = ZMAPSEQUENCE_NONE;
+	  parser->parsed_sequence.seq_data.sequence = NULL; /* So it doesn't get free'd */
+	}
+    }
+
+  return sequence;
+}
 
 void zMapGFFFreeHeader(ZMapGFFHeader header)
 {
@@ -375,39 +410,6 @@ gboolean zMapGFFGetFeatures(ZMapGFFParser parser, ZMapFeatureBlock feature_block
 }
 
 
-#ifdef RDS_NO_TO_HACKED_CODE
-/* HACKED CODE TO FIX UP LINKS IN BLOCK->SET->FEATURE */
-static void setBlock(GQuark key_id, gpointer data, gpointer user_data)
-{
-  ZMapFeatureSet feature_set = (ZMapFeatureSet)data ;
-#ifdef RDS_DONT_INCLUDE
-  ZMapFeatureBlock feature_block = (ZMapFeatureBlock)user_data ;
-
-  feature_set->parent = (ZMapFeatureAny)feature_block ;
-  zMapFeatureBlockAddFeatureSet(feature_block, feature_set);
-#endif
-
-  g_datalist_foreach(&(feature_set->features), setSet, feature_set) ;
-
-  return ;
-}
-
-static void setSet(GQuark key_id, gpointer data, gpointer user_data)
-{
-  ZMapFeature feature = (ZMapFeature)data ;
-  ZMapFeatureSet feature_set = (ZMapFeatureSet)user_data ;
-
-#ifdef RDS_DONT_INCLUDE
-  feature->parent = (ZMapFeatureAny)feature_set ;
-#endif
-  zMapFeatureSetAddFeature(feature_set, feature);
-
-  return ;
-}
-/* END OF HACKED CODE TO FIX UP LINKS IN BLOCK->SET->FEATURE */
-#endif
-
-
 /* Optionally set mappings that are keys from the GFF source to feature set and style names. */
 void zMapGFFParseSetSourceHash(ZMapGFFParser parser,
 			       GHashTable *source_2_feature_set, GHashTable *source_2_sourcedata)
@@ -722,9 +724,87 @@ static gboolean parseHeaderLine(ZMapGFFParser parser, char *line)
 	    }
      
 	}
+      else if(!parser->parsed_sequence.finished)
+	{
+	  if(g_str_has_prefix(line, "##Type"))
+	    {
+	      char seq_type[11] = {'\0'};
+	      fields = 2;
+	      format_str = "%*6s%10s";
+	      if((fields = sscanf(line, format_str, &seq_type)) != 2)
+		{
+		  parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER,
+					      "Bad ##Type line %d: \"%s\"", 
+					      parser->line_count, line);
+		}
+	      else
+		{
+		  if(g_ascii_strcasecmp(seq_type, "DNA") == 0)
+		    {
+		      parser->parsed_sequence.seq_data.type = ZMAPSEQUENCE_DNA;
+		    }
+		  else if(g_ascii_strcasecmp(seq_type, "Protein"))
+		    {
+		      parser->parsed_sequence.seq_data.type = ZMAPSEQUENCE_PEPTIDE;
+		    }
+		}
+	    }
 
-      if (parser->done_version && parser->done_source && parser->done_sequence_region)
-	parser->done_header = TRUE ;
+	  if(!parser->parsed_sequence.in_sequence_block)
+	    {
+	      gboolean in_block = FALSE;
+
+	      if((parser->parsed_sequence.seq_data.type == ZMAPSEQUENCE_NONE))
+		{
+		  if(g_str_has_prefix(line, "##DNA"))
+		    parser->parsed_sequence.seq_data.type = ZMAPSEQUENCE_DNA;
+		  else if(g_str_has_prefix(line, "##Protein"))
+		    parser->parsed_sequence.seq_data.type = ZMAPSEQUENCE_PEPTIDE;
+		}
+
+	      if(g_str_has_prefix(line, "##DNA") && parser->parsed_sequence.seq_data.type == ZMAPSEQUENCE_DNA)
+		{
+		  in_block = TRUE;
+		}
+	      else if(g_str_has_prefix(line, "##Protein") && parser->parsed_sequence.seq_data.type == ZMAPSEQUENCE_PEPTIDE)
+		{
+		  in_block = TRUE;		  
+		}
+
+	      parser->parsed_sequence.in_sequence_block = in_block;
+	    }
+	  else if(g_str_has_prefix(line, "##end-")) /* I don't think we really need to care about matching type */
+	    {
+	      parser->parsed_sequence.seq_data.length   = parser->parsed_sequence.raw_line_data->len;
+	      parser->parsed_sequence.seq_data.sequence = g_string_free(parser->parsed_sequence.raw_line_data, FALSE);
+	      parser->parsed_sequence.raw_line_data     = NULL;
+	      parser->parsed_sequence.finished          = TRUE;	      
+	    }
+	  else
+	    {
+	      char *line_ptr = line;
+	      /* must be sequence */
+	      line_ptr+=2;	/* move past ## */
+	      /* save the string */
+	      g_string_append(parser->parsed_sequence.raw_line_data, line_ptr);
+	    }
+	}
+
+      if (parser->done_version && parser->done_source && parser->done_sequence_region &&
+	  parser->parsed_sequence.finished)
+	{
+	  parser->done_header = TRUE ;
+
+	  if((parser->parsed_sequence.seq_data.type == ZMAPSEQUENCE_DNA) &&
+	     (parser->features_end - parser->features_start + 1) != parser->parsed_sequence.seq_data.length)
+	    {
+	      parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER,
+					  "##sequence-region length [%d] does not match DNA base count [%d].", 
+					  (parser->features_end - parser->features_start + 1), 
+					  parser->parsed_sequence.seq_data.length);
+	    }
+
+	}
     }
 
 
@@ -1805,7 +1885,6 @@ static gboolean getAssemblyPathAttrs(char *attributes, char **assembly_name_unus
 {
   gboolean result = TRUE ;
   char *tag_pos ;
-  int start = 0, end = 0 ;
   ZMapStrand strand ;
   int length = 0 ;
   GArray *path = NULL ;
diff --git a/src/zmapGFF/zmapGFF_P.h b/src/zmapGFF/zmapGFF_P.h
index a099777d4..815e86074 100755
--- a/src/zmapGFF/zmapGFF_P.h
+++ b/src/zmapGFF/zmapGFF_P.h
@@ -25,9 +25,9 @@
  * Description: Internal types, functions etc. for the GFF parser,
  *              currently this parser only does GFF v2.
  * HISTORY:
- * Last edited: Mar 30 11:11 2009 (edgrif)
+ * Last edited: Jul  2 12:06 2009 (rds)
  * Created: Sat May 29 13:18:32 2004 (edgrif)
- * CVS info:   $Id: zmapGFF_P.h,v 1.18 2009-04-06 13:10:53 edgrif Exp $
+ * CVS info:   $Id: zmapGFF_P.h,v 1.19 2009-07-02 22:22:08 rds Exp $
  *-------------------------------------------------------------------
  */
 #ifndef ZMAP_GFF_P_H
@@ -141,6 +141,14 @@ typedef struct ZMapGFFParserStruct_
   GString *attributes_str ;
   GString *comments_str ;
 
+  struct 
+  {
+    GString *raw_line_data;
+    ZMapSequenceStruct seq_data;
+    unsigned int in_sequence_block : 1;
+    unsigned int finished :1;
+  }parsed_sequence;
+
 
 } ZMapGFFParserStruct ;
 
diff --git a/src/zmapServer/file/fileServer.c b/src/zmapServer/file/fileServer.c
index 14eb9b6cf..048e43736 100755
--- a/src/zmapServer/file/fileServer.c
+++ b/src/zmapServer/file/fileServer.c
@@ -30,9 +30,9 @@
  *              
  * Exported functions: See ZMap/zmapServerPrototype.h
  * HISTORY:
- * Last edited: Jun 12 10:52 2009 (edgrif)
+ * Last edited: Jul  2 21:56 2009 (rds)
  * Created: Fri Sep 10 18:29:18 2004 (edgrif)
- * CVS info:   $Id: fileServer.c,v 1.38 2009-06-12 13:57:07 edgrif Exp $
+ * CVS info:   $Id: fileServer.c,v 1.39 2009-07-02 22:21:32 rds Exp $
  *-------------------------------------------------------------------
  */
 
@@ -452,17 +452,180 @@ static ZMapServerResponseType getFeatures(void *server_in, GData *styles, ZMapFe
 }
 
 
+static void eachBlockSequence(gpointer key, gpointer data, gpointer user_data)
+{
+  ZMapFeatureBlock feature_block = (ZMapFeatureBlock)data ;
+  GetFeatures get_features = (GetFeatures)user_data ;
+
+  if (get_features->result == ZMAP_SERVERRESPONSE_OK)
+    {
+      ZMapSequence sequence;
+      if(!(sequence = zMapGFFGetSequence(get_features->parser)))
+	{
+	  GError *error;
+	  error = zMapGFFGetError(get_features->parser);
+	  setErrMsg(get_features->server,
+		    g_strdup_printf("zMapGFFGetSequence() failed, error=%s",
+				    error->message));
+	  ZMAPSERVER_LOG(Warning, FILE_PROTOCOL_STR, 
+			 get_features->server->file_path,
+			 "%s", get_features->server->last_err_msg);
+	}
+      else
+	{
+	  ZMapFeatureContext context;
+	  ZMapFeatureSet feature_set;
+
+	  if(zMapFeatureDNACreateFeatureSet(feature_block, &feature_set))
+	    {
+	      ZMapFeatureTypeStyle dna_style = NULL;
+	      ZMapFeature feature;
+
+	      /* This temp style creation feels wrong, and probably is,
+	       * but we don't have the merged in default styles in here,
+	       * or so it seems... */
+	      dna_style = zMapStyleCreate(ZMAP_FIXED_STYLE_DNA_NAME, 
+					  ZMAP_FIXED_STYLE_DNA_NAME_TEXT);
+	      
+	      feature = zMapFeatureDNACreateFeature(feature_block, dna_style,
+						    sequence->sequence, sequence->length);
+	      
+	      zMapStyleDestroy(dna_style);
+	    }
+
+	  context = (ZMapFeatureContext)zMapFeatureGetParentGroup((ZMapFeatureAny)feature_block, 
+								  ZMAPFEATURE_STRUCT_CONTEXT) ;
+
+	  /* I'm going to create the three frame translation up front! */
+	  if (zMap_g_list_find_quark(context->feature_set_names, zMapStyleCreateID(ZMAP_FIXED_STYLE_3FT_NAME)))
+	    {
+	      if ((zMapFeature3FrameTranslationCreateSet(feature_block, &feature_set)))
+		{
+		  ZMapFeatureTypeStyle frame_style = NULL;
+		  
+		  frame_style = zMapStyleCreate(ZMAP_FIXED_STYLE_DNA_NAME, 
+						ZMAP_FIXED_STYLE_DNA_NAME_TEXT);
+		  
+		  zMapFeature3FrameTranslationSetCreateFeatures(feature_set, frame_style);
+		  
+		  zMapStyleDestroy(frame_style);
+		}
+	    }
+
+	  g_free(sequence);
+	}
+    }
+
+  return ;
+}
+
+/* Process all the alignments in a context. */
+static void eachAlignmentSequence(gpointer key, gpointer data, gpointer user_data)
+{
+  ZMapFeatureAlignment alignment = (ZMapFeatureAlignment)data ;
+  GetFeatures get_features = (GetFeatures)user_data ;
+
+  if (get_features->result == ZMAP_SERVERRESPONSE_OK)
+    g_hash_table_foreach(alignment->blocks, eachBlockSequence, (gpointer)get_features) ;
+
+  return ;
+}
 
 /* We don't support this for now... */
 static ZMapServerResponseType getContextSequence(void *server_in, GData *styles, ZMapFeatureContext feature_context_out)
 {
-  ZMapServerResponseType result = ZMAP_SERVERRESPONSE_OK ;
+  FileServer server = (FileServer)server_in ;
+  GetFeaturesStruct get_features ;
+  GIOStatus status ;
+  gsize terminator_pos = 0 ;
+  GError *gff_file_err = NULL ;
+  GError *error = NULL ;
 
+  get_features.result = ZMAP_SERVERRESPONSE_OK ;
+  get_features.server = (FileServer)server_in ;
 
-  result = ZMAP_SERVERRESPONSE_UNSUPPORTED ;
+  get_features.parser = zMapGFFCreateParser(styles, FALSE) ;
+							    /* FALSE => do the real parse. */
 
+  zMapGFFParserSetSequenceFlag(get_features.parser);
+  get_features.gff_line = g_string_sized_new(2000) ;	    /* Probably not many lines will be >
+							       2k chars. */
+  g_io_channel_seek_position(server->gff_file, 0, G_SEEK_SET, &gff_file_err);
 
-  return result ;
+  /* Read the header, needed for feature coord range. */
+  while ((status = g_io_channel_read_line_string(server->gff_file, get_features.gff_line,
+						 &terminator_pos,
+						 &gff_file_err)) == G_IO_STATUS_NORMAL)
+    {
+      gboolean done_header = FALSE ;
+
+      *(get_features.gff_line->str + terminator_pos) = '\0' ; /* Remove terminating newline. */
+
+      if (zMapGFFParseHeader(get_features.parser, get_features.gff_line->str, &done_header))
+	{
+	  if (done_header)
+	    break ;
+	  else
+	    get_features.gff_line = g_string_truncate(get_features.gff_line, 0) ;
+							    /* Reset line to empty. */
+	}
+      else
+	{
+	  if (!done_header)
+	    {
+	      error = zMapGFFGetError(get_features.parser) ;
+
+	      if (!error)
+		{
+		  /* SHOULD ABORT HERE.... */
+		  setErrMsg(server, 
+			    g_strdup_printf("zMapGFFParseLine() failed with no GError for line %d: %s",
+					    zMapGFFGetLineNumber(get_features.parser), get_features.gff_line->str)) ;
+		  ZMAPSERVER_LOG(Critical, FILE_PROTOCOL_STR, server->file_path,
+				 "%s", server->last_err_msg) ;
+		}
+	      else
+		{
+		  /* If the error was serious we stop processing and return the error,
+		   * otherwise we just log the error. */
+		  if (zMapGFFTerminated(get_features.parser))
+		    {
+		      get_features.result = ZMAP_SERVERRESPONSE_REQFAIL ;
+		      setErrMsg(server, g_strdup_printf("GFFTerminated! %s", error->message)) ;
+		      ZMAPSERVER_LOG(Critical, FILE_PROTOCOL_STR, server->file_path,
+				     "%s", server->last_err_msg) ;		    }
+		  else
+		    {
+		      setErrMsg(server,
+				g_strdup_printf("zMapGFFParseHeader() failed for line %d: %s",
+						zMapGFFGetLineNumber(get_features.parser),
+						get_features.gff_line->str)) ;
+		      ZMAPSERVER_LOG(Critical, FILE_PROTOCOL_STR, server->file_path,
+				     "%s", server->last_err_msg) ;
+		    }
+		}
+	      get_features.result = ZMAP_SERVERRESPONSE_REQFAIL ;
+	    }
+
+	  break ;
+	}
+    }
+
+  if (get_features.result == ZMAP_SERVERRESPONSE_OK)
+    {
+      /* Fetch all the alignment blocks for all the sequences, this all hacky right now as really.
+       * we would have to parse and reparse the file....can be done but not needed this second. */
+      g_hash_table_foreach(feature_context_out->alignments, eachAlignmentSequence, (gpointer)&get_features) ;
+
+    }
+
+
+  /* Clear up. */
+  zMapGFFDestroyParser(get_features.parser) ;
+  g_string_free(get_features.gff_line, TRUE) ;
+
+
+  return get_features.result ;
 }
 
 
@@ -556,6 +719,8 @@ static void addMapping(ZMapFeatureContext feature_context, ZMapGFFHeader header)
 
   feature_context->sequence_to_parent.p2 = feature_context->sequence_to_parent.c2
     = feature_block->block_to_sequence.q2 = feature_block->block_to_sequence.t2 ;
+
+  feature_context->length = feature_context->sequence_to_parent.c2 - feature_context->sequence_to_parent.c1 + 1;
   
   return ;
 }
@@ -618,7 +783,6 @@ static void eachBlock(gpointer key, gpointer data, gpointer user_data)
 }
 
 
-
 static gboolean sequenceRequest(FileServer server, ZMapGFFParser parser, GString* gff_line,
 				ZMapFeatureBlock feature_block)
 {
-- 
GitLab