From 84bfea53ec8f36e4af0a82d0f6863e7f8c490dab Mon Sep 17 00:00:00 2001 From: rds <rds> Date: Thu, 2 Jul 2009 22:21:32 +0000 Subject: [PATCH] implement sequence fetching from gff file --- src/include/ZMap/zmapGFF.h | 6 +- src/zmapGFF/zmapGFF2parser.c | 155 ++++++++++++++++++++------- src/zmapGFF/zmapGFF_P.h | 12 ++- src/zmapServer/file/fileServer.c | 176 +++++++++++++++++++++++++++++-- 4 files changed, 301 insertions(+), 48 deletions(-) diff --git a/src/include/ZMap/zmapGFF.h b/src/include/ZMap/zmapGFF.h index 67b1be32a..d2dc7a0e2 100755 --- a/src/include/ZMap/zmapGFF.h +++ b/src/include/ZMap/zmapGFF.h @@ -28,9 +28,9 @@ * of ZMapFeatureStruct's, one for each GFF source. * * HISTORY: - * Last edited: Apr 24 11:01 2009 (edgrif) + * Last edited: Jul 2 21:26 2009 (rds) * Created: Sat May 29 13:18:32 2004 (edgrif) - * CVS info: $Id: zmapGFF.h,v 1.17 2009-04-24 10:36:18 edgrif Exp $ + * CVS info: $Id: zmapGFF.h,v 1.18 2009-07-02 22:22:53 rds Exp $ *------------------------------------------------------------------- */ #ifndef ZMAP_GFF_H @@ -101,6 +101,8 @@ typedef struct ZMapGFFParser zMapGFFCreateParser(GData *sources, gboolean parse_only) ; gboolean zMapGFFParseHeader(ZMapGFFParser parser, char *line, gboolean *header_finished) ; gboolean zMapGFFParseLine(ZMapGFFParser parser, char *line) ; +gboolean zMapGFFParserSetSequenceFlag(ZMapGFFParser parser); +ZMapSequence zMapGFFGetSequence(ZMapGFFParser parser); void zMapGFFParseSetSourceHash(ZMapGFFParser parser, GHashTable *source_2_feature_set, GHashTable *source_2_sourcedata) ; void zMapGFFSetStopOnError(ZMapGFFParser parser, gboolean stop_on_error) ; diff --git a/src/zmapGFF/zmapGFF2parser.c b/src/zmapGFF/zmapGFF2parser.c index a033cdf5a..914d4ecca 100755 --- a/src/zmapGFF/zmapGFF2parser.c +++ b/src/zmapGFF/zmapGFF2parser.c @@ -26,9 +26,9 @@ * * Exported functions: See ZMap/zmapGFF.h * HISTORY: - * Last edited: Jun 25 15:51 2009 (edgrif) + * Last edited: Jul 2 21:27 2009 (rds) * Created: Fri May 28 14:25:12 2004 (edgrif) - * CVS info: $Id: zmapGFF2parser.c,v 1.92 2009-06-25 14:56:28 edgrif Exp $ + * CVS info: $Id: zmapGFF2parser.c,v 1.93 2009-07-02 22:21:59 rds Exp $ *------------------------------------------------------------------- */ @@ -159,6 +159,9 @@ ZMapGFFParser zMapGFFCreateParser(GData *sources, gboolean parse_only) } + parser->parsed_sequence.raw_line_data = g_string_sized_new(2000); + parser->parsed_sequence.finished = TRUE; /* default we don't parse the dna/protein */ + /* Allocated dynamically as these fields in GFF can be big. */ parser->attributes_str = g_string_sized_new(GFF_MAX_FREETEXT_CHARS) ; parser->comments_str = g_string_sized_new(GFF_MAX_FREETEXT_CHARS) ; @@ -338,6 +341,38 @@ ZMapGFFHeader zMapGFFGetHeader(ZMapGFFParser parser) return header ; } +gboolean zMapGFFParserSetSequenceFlag(ZMapGFFParser parser) +{ + gboolean set = TRUE; + + parser->parsed_sequence.finished = FALSE; + + return set; +} + +ZMapSequence zMapGFFGetSequence(ZMapGFFParser parser) +{ + ZMapSequence sequence = NULL; + + if(parser->done_header) + { + /* parsed_sequence.raw_line_data == NULL means we got to the end-XXXXXX */ + + if(parser->parsed_sequence.seq_data.type != ZMAPSEQUENCE_NONE && + parser->parsed_sequence.seq_data.sequence != NULL && + parser->parsed_sequence.raw_line_data == NULL) + { + sequence = g_new0(ZMapSequenceStruct, 1); + *sequence = parser->parsed_sequence.seq_data; + sequence->name = g_quark_from_string(parser->sequence_name); + /* So we don't copy empty data */ + parser->parsed_sequence.seq_data.type = ZMAPSEQUENCE_NONE; + parser->parsed_sequence.seq_data.sequence = NULL; /* So it doesn't get free'd */ + } + } + + return sequence; +} void zMapGFFFreeHeader(ZMapGFFHeader header) { @@ -375,39 +410,6 @@ gboolean zMapGFFGetFeatures(ZMapGFFParser parser, ZMapFeatureBlock feature_block } -#ifdef RDS_NO_TO_HACKED_CODE -/* HACKED CODE TO FIX UP LINKS IN BLOCK->SET->FEATURE */ -static void setBlock(GQuark key_id, gpointer data, gpointer user_data) -{ - ZMapFeatureSet feature_set = (ZMapFeatureSet)data ; -#ifdef RDS_DONT_INCLUDE - ZMapFeatureBlock feature_block = (ZMapFeatureBlock)user_data ; - - feature_set->parent = (ZMapFeatureAny)feature_block ; - zMapFeatureBlockAddFeatureSet(feature_block, feature_set); -#endif - - g_datalist_foreach(&(feature_set->features), setSet, feature_set) ; - - return ; -} - -static void setSet(GQuark key_id, gpointer data, gpointer user_data) -{ - ZMapFeature feature = (ZMapFeature)data ; - ZMapFeatureSet feature_set = (ZMapFeatureSet)user_data ; - -#ifdef RDS_DONT_INCLUDE - feature->parent = (ZMapFeatureAny)feature_set ; -#endif - zMapFeatureSetAddFeature(feature_set, feature); - - return ; -} -/* END OF HACKED CODE TO FIX UP LINKS IN BLOCK->SET->FEATURE */ -#endif - - /* Optionally set mappings that are keys from the GFF source to feature set and style names. */ void zMapGFFParseSetSourceHash(ZMapGFFParser parser, GHashTable *source_2_feature_set, GHashTable *source_2_sourcedata) @@ -722,9 +724,87 @@ static gboolean parseHeaderLine(ZMapGFFParser parser, char *line) } } + else if(!parser->parsed_sequence.finished) + { + if(g_str_has_prefix(line, "##Type")) + { + char seq_type[11] = {'\0'}; + fields = 2; + format_str = "%*6s%10s"; + if((fields = sscanf(line, format_str, &seq_type)) != 2) + { + parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER, + "Bad ##Type line %d: \"%s\"", + parser->line_count, line); + } + else + { + if(g_ascii_strcasecmp(seq_type, "DNA") == 0) + { + parser->parsed_sequence.seq_data.type = ZMAPSEQUENCE_DNA; + } + else if(g_ascii_strcasecmp(seq_type, "Protein")) + { + parser->parsed_sequence.seq_data.type = ZMAPSEQUENCE_PEPTIDE; + } + } + } - if (parser->done_version && parser->done_source && parser->done_sequence_region) - parser->done_header = TRUE ; + if(!parser->parsed_sequence.in_sequence_block) + { + gboolean in_block = FALSE; + + if((parser->parsed_sequence.seq_data.type == ZMAPSEQUENCE_NONE)) + { + if(g_str_has_prefix(line, "##DNA")) + parser->parsed_sequence.seq_data.type = ZMAPSEQUENCE_DNA; + else if(g_str_has_prefix(line, "##Protein")) + parser->parsed_sequence.seq_data.type = ZMAPSEQUENCE_PEPTIDE; + } + + if(g_str_has_prefix(line, "##DNA") && parser->parsed_sequence.seq_data.type == ZMAPSEQUENCE_DNA) + { + in_block = TRUE; + } + else if(g_str_has_prefix(line, "##Protein") && parser->parsed_sequence.seq_data.type == ZMAPSEQUENCE_PEPTIDE) + { + in_block = TRUE; + } + + parser->parsed_sequence.in_sequence_block = in_block; + } + else if(g_str_has_prefix(line, "##end-")) /* I don't think we really need to care about matching type */ + { + parser->parsed_sequence.seq_data.length = parser->parsed_sequence.raw_line_data->len; + parser->parsed_sequence.seq_data.sequence = g_string_free(parser->parsed_sequence.raw_line_data, FALSE); + parser->parsed_sequence.raw_line_data = NULL; + parser->parsed_sequence.finished = TRUE; + } + else + { + char *line_ptr = line; + /* must be sequence */ + line_ptr+=2; /* move past ## */ + /* save the string */ + g_string_append(parser->parsed_sequence.raw_line_data, line_ptr); + } + } + + if (parser->done_version && parser->done_source && parser->done_sequence_region && + parser->parsed_sequence.finished) + { + parser->done_header = TRUE ; + + if((parser->parsed_sequence.seq_data.type == ZMAPSEQUENCE_DNA) && + (parser->features_end - parser->features_start + 1) != parser->parsed_sequence.seq_data.length) + { + parser->error = g_error_new(parser->error_domain, ZMAP_GFF_ERROR_HEADER, + "##sequence-region length [%d] does not match DNA base count [%d].", + (parser->features_end - parser->features_start + 1), + parser->parsed_sequence.seq_data.length); + } + + } } @@ -1805,7 +1885,6 @@ static gboolean getAssemblyPathAttrs(char *attributes, char **assembly_name_unus { gboolean result = TRUE ; char *tag_pos ; - int start = 0, end = 0 ; ZMapStrand strand ; int length = 0 ; GArray *path = NULL ; diff --git a/src/zmapGFF/zmapGFF_P.h b/src/zmapGFF/zmapGFF_P.h index a099777d4..815e86074 100755 --- a/src/zmapGFF/zmapGFF_P.h +++ b/src/zmapGFF/zmapGFF_P.h @@ -25,9 +25,9 @@ * Description: Internal types, functions etc. for the GFF parser, * currently this parser only does GFF v2. * HISTORY: - * Last edited: Mar 30 11:11 2009 (edgrif) + * Last edited: Jul 2 12:06 2009 (rds) * Created: Sat May 29 13:18:32 2004 (edgrif) - * CVS info: $Id: zmapGFF_P.h,v 1.18 2009-04-06 13:10:53 edgrif Exp $ + * CVS info: $Id: zmapGFF_P.h,v 1.19 2009-07-02 22:22:08 rds Exp $ *------------------------------------------------------------------- */ #ifndef ZMAP_GFF_P_H @@ -141,6 +141,14 @@ typedef struct ZMapGFFParserStruct_ GString *attributes_str ; GString *comments_str ; + struct + { + GString *raw_line_data; + ZMapSequenceStruct seq_data; + unsigned int in_sequence_block : 1; + unsigned int finished :1; + }parsed_sequence; + } ZMapGFFParserStruct ; diff --git a/src/zmapServer/file/fileServer.c b/src/zmapServer/file/fileServer.c index 14eb9b6cf..048e43736 100755 --- a/src/zmapServer/file/fileServer.c +++ b/src/zmapServer/file/fileServer.c @@ -30,9 +30,9 @@ * * Exported functions: See ZMap/zmapServerPrototype.h * HISTORY: - * Last edited: Jun 12 10:52 2009 (edgrif) + * Last edited: Jul 2 21:56 2009 (rds) * Created: Fri Sep 10 18:29:18 2004 (edgrif) - * CVS info: $Id: fileServer.c,v 1.38 2009-06-12 13:57:07 edgrif Exp $ + * CVS info: $Id: fileServer.c,v 1.39 2009-07-02 22:21:32 rds Exp $ *------------------------------------------------------------------- */ @@ -452,17 +452,180 @@ static ZMapServerResponseType getFeatures(void *server_in, GData *styles, ZMapFe } +static void eachBlockSequence(gpointer key, gpointer data, gpointer user_data) +{ + ZMapFeatureBlock feature_block = (ZMapFeatureBlock)data ; + GetFeatures get_features = (GetFeatures)user_data ; + + if (get_features->result == ZMAP_SERVERRESPONSE_OK) + { + ZMapSequence sequence; + if(!(sequence = zMapGFFGetSequence(get_features->parser))) + { + GError *error; + error = zMapGFFGetError(get_features->parser); + setErrMsg(get_features->server, + g_strdup_printf("zMapGFFGetSequence() failed, error=%s", + error->message)); + ZMAPSERVER_LOG(Warning, FILE_PROTOCOL_STR, + get_features->server->file_path, + "%s", get_features->server->last_err_msg); + } + else + { + ZMapFeatureContext context; + ZMapFeatureSet feature_set; + + if(zMapFeatureDNACreateFeatureSet(feature_block, &feature_set)) + { + ZMapFeatureTypeStyle dna_style = NULL; + ZMapFeature feature; + + /* This temp style creation feels wrong, and probably is, + * but we don't have the merged in default styles in here, + * or so it seems... */ + dna_style = zMapStyleCreate(ZMAP_FIXED_STYLE_DNA_NAME, + ZMAP_FIXED_STYLE_DNA_NAME_TEXT); + + feature = zMapFeatureDNACreateFeature(feature_block, dna_style, + sequence->sequence, sequence->length); + + zMapStyleDestroy(dna_style); + } + + context = (ZMapFeatureContext)zMapFeatureGetParentGroup((ZMapFeatureAny)feature_block, + ZMAPFEATURE_STRUCT_CONTEXT) ; + + /* I'm going to create the three frame translation up front! */ + if (zMap_g_list_find_quark(context->feature_set_names, zMapStyleCreateID(ZMAP_FIXED_STYLE_3FT_NAME))) + { + if ((zMapFeature3FrameTranslationCreateSet(feature_block, &feature_set))) + { + ZMapFeatureTypeStyle frame_style = NULL; + + frame_style = zMapStyleCreate(ZMAP_FIXED_STYLE_DNA_NAME, + ZMAP_FIXED_STYLE_DNA_NAME_TEXT); + + zMapFeature3FrameTranslationSetCreateFeatures(feature_set, frame_style); + + zMapStyleDestroy(frame_style); + } + } + + g_free(sequence); + } + } + + return ; +} + +/* Process all the alignments in a context. */ +static void eachAlignmentSequence(gpointer key, gpointer data, gpointer user_data) +{ + ZMapFeatureAlignment alignment = (ZMapFeatureAlignment)data ; + GetFeatures get_features = (GetFeatures)user_data ; + + if (get_features->result == ZMAP_SERVERRESPONSE_OK) + g_hash_table_foreach(alignment->blocks, eachBlockSequence, (gpointer)get_features) ; + + return ; +} /* We don't support this for now... */ static ZMapServerResponseType getContextSequence(void *server_in, GData *styles, ZMapFeatureContext feature_context_out) { - ZMapServerResponseType result = ZMAP_SERVERRESPONSE_OK ; + FileServer server = (FileServer)server_in ; + GetFeaturesStruct get_features ; + GIOStatus status ; + gsize terminator_pos = 0 ; + GError *gff_file_err = NULL ; + GError *error = NULL ; + get_features.result = ZMAP_SERVERRESPONSE_OK ; + get_features.server = (FileServer)server_in ; - result = ZMAP_SERVERRESPONSE_UNSUPPORTED ; + get_features.parser = zMapGFFCreateParser(styles, FALSE) ; + /* FALSE => do the real parse. */ + zMapGFFParserSetSequenceFlag(get_features.parser); + get_features.gff_line = g_string_sized_new(2000) ; /* Probably not many lines will be > + 2k chars. */ + g_io_channel_seek_position(server->gff_file, 0, G_SEEK_SET, &gff_file_err); - return result ; + /* Read the header, needed for feature coord range. */ + while ((status = g_io_channel_read_line_string(server->gff_file, get_features.gff_line, + &terminator_pos, + &gff_file_err)) == G_IO_STATUS_NORMAL) + { + gboolean done_header = FALSE ; + + *(get_features.gff_line->str + terminator_pos) = '\0' ; /* Remove terminating newline. */ + + if (zMapGFFParseHeader(get_features.parser, get_features.gff_line->str, &done_header)) + { + if (done_header) + break ; + else + get_features.gff_line = g_string_truncate(get_features.gff_line, 0) ; + /* Reset line to empty. */ + } + else + { + if (!done_header) + { + error = zMapGFFGetError(get_features.parser) ; + + if (!error) + { + /* SHOULD ABORT HERE.... */ + setErrMsg(server, + g_strdup_printf("zMapGFFParseLine() failed with no GError for line %d: %s", + zMapGFFGetLineNumber(get_features.parser), get_features.gff_line->str)) ; + ZMAPSERVER_LOG(Critical, FILE_PROTOCOL_STR, server->file_path, + "%s", server->last_err_msg) ; + } + else + { + /* If the error was serious we stop processing and return the error, + * otherwise we just log the error. */ + if (zMapGFFTerminated(get_features.parser)) + { + get_features.result = ZMAP_SERVERRESPONSE_REQFAIL ; + setErrMsg(server, g_strdup_printf("GFFTerminated! %s", error->message)) ; + ZMAPSERVER_LOG(Critical, FILE_PROTOCOL_STR, server->file_path, + "%s", server->last_err_msg) ; } + else + { + setErrMsg(server, + g_strdup_printf("zMapGFFParseHeader() failed for line %d: %s", + zMapGFFGetLineNumber(get_features.parser), + get_features.gff_line->str)) ; + ZMAPSERVER_LOG(Critical, FILE_PROTOCOL_STR, server->file_path, + "%s", server->last_err_msg) ; + } + } + get_features.result = ZMAP_SERVERRESPONSE_REQFAIL ; + } + + break ; + } + } + + if (get_features.result == ZMAP_SERVERRESPONSE_OK) + { + /* Fetch all the alignment blocks for all the sequences, this all hacky right now as really. + * we would have to parse and reparse the file....can be done but not needed this second. */ + g_hash_table_foreach(feature_context_out->alignments, eachAlignmentSequence, (gpointer)&get_features) ; + + } + + + /* Clear up. */ + zMapGFFDestroyParser(get_features.parser) ; + g_string_free(get_features.gff_line, TRUE) ; + + + return get_features.result ; } @@ -556,6 +719,8 @@ static void addMapping(ZMapFeatureContext feature_context, ZMapGFFHeader header) feature_context->sequence_to_parent.p2 = feature_context->sequence_to_parent.c2 = feature_block->block_to_sequence.q2 = feature_block->block_to_sequence.t2 ; + + feature_context->length = feature_context->sequence_to_parent.c2 - feature_context->sequence_to_parent.c1 + 1; return ; } @@ -618,7 +783,6 @@ static void eachBlock(gpointer key, gpointer data, gpointer user_data) } - static gboolean sequenceRequest(FileServer server, ZMapGFFParser parser, GString* gff_line, ZMapFeatureBlock feature_block) { -- GitLab