diff --git a/app/__init__.py b/app/__init__.py index 828f756c7a82616a0bd8c78891116ac83edcd6f5..60825ae9437b98b0d52ccc2ae3a2a3f260069d0b 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -11,7 +11,6 @@ from app.blueprints.swagger_description.swagger_description_blueprint import SWA from app.blueprints.es_proxy.controllers.es_proxy_controller import ES_PROXY_BLUEPRINT from app.blueprints.properties_config.controllers.properties_config_controller import PROPERTIES_CONFIG_BLUEPRINT from app.blueprints.contexts.controllers.contexts_controller import CONTEXTS_BLUEPRINT -from app.blueprints.search_parser.controllers.search_parser_controller import SEARCH_PARSER_BLUEPRINT from app.blueprints.search.controllers.search_controller import SEARCH_BLUEPRINT from app.blueprints.url_shortening.controllers.url_shortening_controller import URL_SHORTENING_BLUEPRINT from app.blueprints.element_usage_blueprint.controllers.element_usage_controller import ELEMENT_USAGE_BLUEPRINT @@ -21,7 +20,6 @@ from app.blueprints.entities_join.controllers.entities_join_controller import EN from app.blueprints.eubopen.search.controller import EUBOPEN_SEARCH_BLUEPRINT from app.blueprints.eubopen.visualisations.controller import EUBOPEN_VISUALISATIONS_BLUEPRINT from app.blueprints.eubopen.miscellaneous.controller import EUBOPEN_MISC_BLUEPRINT -from app.blueprints.usage_statistics.controllers.usage_statistics_controller import USAGE_STATISTICS_BLUEPRINT from app.blueprints.authorisation.controller import ACCESS_CONTROL_AUTH_BLUEPRINT @@ -67,10 +65,6 @@ def create_app(): 'blueprint': CONTEXTS_BLUEPRINT, 'url': '/contexts' }, - { - 'blueprint': SEARCH_PARSER_BLUEPRINT, - 'url': '/search_parsing' - }, { 'blueprint': SEARCH_BLUEPRINT, 'url': '/search' @@ -107,10 +101,6 @@ def create_app(): 'blueprint': EUBOPEN_MISC_BLUEPRINT, 'url': '/eubopen/miscellaneous' }, - { - 'blueprint': USAGE_STATISTICS_BLUEPRINT, - 'url': '/usage_statistics' - }, { 'blueprint': ACCESS_CONTROL_AUTH_BLUEPRINT, 'url': '/authorisation' diff --git a/app/blueprints/search_parser/__init__.py b/app/blueprints/search_parser/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/app/blueprints/search_parser/controllers/__init__.py b/app/blueprints/search_parser/controllers/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/app/blueprints/search_parser/controllers/marshmallow_schemas.py b/app/blueprints/search_parser/controllers/marshmallow_schemas.py deleted file mode 100644 index 137f0edbea249fad391e48d7b8966cb4a503e808..0000000000000000000000000000000000000000 --- a/app/blueprints/search_parser/controllers/marshmallow_schemas.py +++ /dev/null @@ -1,15 +0,0 @@ -""" -Marshmallow schemas for validating the queries to the search parser -""" -from marshmallow import fields - -from app.common_marshmallow.common_schemas import CommonSchema - - -class ParseFreeTextSearchRequest(CommonSchema): - """ - Class with the schema for parsing a free text search - """ - search_term = fields.String(required=True) - es_indexes = fields.String(required=True) - selected_es_index = fields.String() diff --git a/app/blueprints/search_parser/controllers/search_parser_controller.py b/app/blueprints/search_parser/controllers/search_parser_controller.py deleted file mode 100644 index 3a2313d5251ecbf7b0a5eb71c45ed754ce6ac9e5..0000000000000000000000000000000000000000 --- a/app/blueprints/search_parser/controllers/search_parser_controller.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -Search parser controller -""" -from flask import Blueprint, jsonify, request - -from app.request_validation.decorators import validate_form_with -from app.blueprints.search_parser.controllers import marshmallow_schemas -from app.blueprints.search_parser.services import search_parsing_service -from app.http_cache import http_cache_utils -from app.common_marshmallow import utils as schema_utils -from app.usage_statistics.decorators import record_response_final_status -from app.request_error_logging_decorators import log_and_return_internal_server_error - -SEARCH_PARSER_BLUEPRINT = Blueprint('search_parsing', __name__) - - -# move to /search/free_text when creating new search -@SEARCH_PARSER_BLUEPRINT.route('/parse_free_text_search', methods=['POST']) -@validate_form_with(marshmallow_schemas.ParseFreeTextSearchRequest) -@record_response_final_status() -@log_and_return_internal_server_error() -def parse_free_text_search(): - """ - :return: the json response with the query to use in elasticsearch - """ - - form_data = request.form - - search_term = form_data.get('search_term') - es_indexes = form_data.get('es_indexes') - selected_es_index = form_data.get('selected_es_index') - is_test = schema_utils.parse_boolean_param(form_data, 'is_test') - - raw_response = search_parsing_service.parse_search(search_term, es_indexes, selected_es_index, is_test) - http_response = jsonify(raw_response) - http_cache_utils.add_cache_headers_to_response(http_response) - return http_response diff --git a/app/blueprints/search_parser/services/__init__.py b/app/blueprints/search_parser/services/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/app/blueprints/search_parser/services/search_parsing_service.py b/app/blueprints/search_parser/services/search_parsing_service.py deleted file mode 100644 index 2f067a554c123c34a1fa7febfc49488d5e85d382..0000000000000000000000000000000000000000 --- a/app/blueprints/search_parser/services/search_parsing_service.py +++ /dev/null @@ -1,32 +0,0 @@ -""" -Search Parsing Service -""" -import time - -from app.free_text_parsing import free_text_parser -from app.usage_statistics import statistics_saver - - -class SearchParsingServiceError(Exception): - """ - Class for errors in this module - """ - - -def parse_search(search_term, es_indexes, selected_es_index, is_test=False): - """ - :param search_term: Term to parse - :param es_indexes: indexes in which the search will be done, separated by a comma - :param selected_es_index: es index to focus on - :param is_test: tells if it is a test request - :return: the query to send to elasticsearch based on the search term provided - """ - start_time = time.time() - response_dict = free_text_parser.parse_search(search_term, es_indexes, selected_es_index) - end_time = time.time() - time_taken = end_time - start_time - - if not is_test: - statistics_saver.save_free_text_search_record(time_taken) - - return response_dict diff --git a/app/blueprints/usage_statistics/__init__.py b/app/blueprints/usage_statistics/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/app/blueprints/usage_statistics/controllers/__init__.py b/app/blueprints/usage_statistics/controllers/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/app/blueprints/usage_statistics/controllers/marshmallow_schemas.py b/app/blueprints/usage_statistics/controllers/marshmallow_schemas.py deleted file mode 100644 index 7ca233409b6319cabf09d740b7004aaa851c98d1..0000000000000000000000000000000000000000 --- a/app/blueprints/usage_statistics/controllers/marshmallow_schemas.py +++ /dev/null @@ -1,14 +0,0 @@ -""" -Schemas to validate the input of custom statistics endpoint -""" -from marshmallow import fields, validate - -from app.common_marshmallow.common_schemas import CommonSchema - - -class StructureSearchJobStatistics(CommonSchema): - """ - Class that the schema for saving statistics for the structure search jobs - """ - search_type = fields.String(required=True, validate=validate.OneOf(['SIMILARITY', 'SUBSTRUCTURE', 'CONNECTIVITY'])) - time_taken = fields.Number(required=True, validate=validate.Range(min=0)) diff --git a/app/blueprints/usage_statistics/controllers/usage_statistics_controller.py b/app/blueprints/usage_statistics/controllers/usage_statistics_controller.py deleted file mode 100644 index f49478d60a05bec993be685fe8d2683d1951dd59..0000000000000000000000000000000000000000 --- a/app/blueprints/usage_statistics/controllers/usage_statistics_controller.py +++ /dev/null @@ -1,32 +0,0 @@ -""" -Controller for storing custom statistics in eubopen -""" -from flask import Blueprint, request, jsonify - -from utils import request_parameters -from app.blueprints.usage_statistics.controllers import marshmallow_schemas -from app.request_validation.decorators import validate_form_with -from app.usage_statistics.decorators import record_response_final_status -from app.request_error_logging_decorators import log_and_return_internal_server_error -from app.blueprints.usage_statistics.service import usage_statistics_service -from app.authorisation.decorators import special_access_token_required - -USAGE_STATISTICS_BLUEPRINT = Blueprint('usage_statistics', __name__) - - -@USAGE_STATISTICS_BLUEPRINT.route('/register_structure_search', methods=['POST']) -@special_access_token_required -@validate_form_with(marshmallow_schemas.StructureSearchJobStatistics) -@record_response_final_status() -@log_and_return_internal_server_error() -def register_structure_search(): - """ - Registers that a structure search was made - """ - form_data = request.form - - search_type = request_parameters.sanitise_parameter(form_data.get('search_type')) - time_taken = request_parameters.sanitise_parameter(form_data.get('time_taken')) - json_result = usage_statistics_service.register_structure_search(search_type, time_taken) - - return jsonify(json_result) diff --git a/app/blueprints/usage_statistics/service/__init__.py b/app/blueprints/usage_statistics/service/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/app/blueprints/usage_statistics/service/usage_statistics_service.py b/app/blueprints/usage_statistics/service/usage_statistics_service.py deleted file mode 100644 index 62ceedec3fbbc7417f3b4651ac2f183dd8ab3497..0000000000000000000000000000000000000000 --- a/app/blueprints/usage_statistics/service/usage_statistics_service.py +++ /dev/null @@ -1,17 +0,0 @@ -""" -Services for the usage statistics endpoints -""" -from app.usage_statistics import statistics_saver - - -def register_structure_search(search_type, time_taken): - """ - Registers the structure search event - :param search_type: search type (SIMILARITY, SUBSTRUCTURE, CONNECTIVITY) - :param time_taken: time taken to run the search - :return : a simple dict stating that the event was saved - """ - statistics_saver.save_similarity_search_record(search_type, time_taken) - return { - "msg": "Event saved successfully" - } diff --git a/app/free_text_parsing/__init__.py b/app/free_text_parsing/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/app/free_text_parsing/free_text_parser.py b/app/free_text_parsing/free_text_parser.py deleted file mode 100644 index 690850307af37502ffc644c22f6870bfa3936c92..0000000000000000000000000000000000000000 --- a/app/free_text_parsing/free_text_parser.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -Entry module for the free text parsing package -""" -import re - -import arpeggio - -from app.free_text_parsing.parser import PARSER -from app.free_text_parsing.query_builder.query_builder import QueryBuilder -from app.free_text_parsing.terms_visitor import TERMS_VISITOR -from app.cache import cache -from app import app_logging -from app.config import RUN_CONFIG - - -def parse_query_str(query_string: str): - """ - :param query_string: the text term to parse - :return: the es query to apply to es - """ - if len(query_string.strip()) == 0: - return {} - query_string = re.sub(r'[\s&&[^\n]]+', ' ', query_string) - parse_tree = PARSER.parse(query_string) - result = arpeggio.visit_parse_tree(parse_tree, TERMS_VISITOR) - return result - - -def parse_search(search_term, es_indexes, selected_es_index): - """ - :param search_term: Term to parse - :param es_indexes: indexes in which the search will be done, separated by a comma - :param selected_es_index: es index to focus on - :return: the query to send to elasticsearch based on the search term provided - """ - cache_key = f'{search_term}-{es_indexes}-{selected_es_index}-{RUN_CONFIG.get("cache_key_suffix")}' - app_logging.debug(f'cache_key: {cache_key}') - - cache_response = cache.fail_proof_get(key=cache_key) - if cache_response is not None: - app_logging.debug(f'results were cached') - return cache_response - - app_logging.debug(f'results were not cached') - - parsed_query = parse_query_str(search_term) - indexes_list = es_indexes.split(',') - best_queries, sorted_indexes_by_score = QueryBuilder.get_best_es_query(parsed_query, indexes_list, selected_es_index) - - response_dict = { - 'parsed_query': parsed_query, - 'best_es_base_queries': best_queries, - 'sorted_indexes_by_score': sorted_indexes_by_score - } - - seconds_valid = RUN_CONFIG.get('es_proxy_cache_seconds') - cache.fail_proof_set(key=cache_key, value=response_dict, timeout=seconds_valid) - - return response_dict diff --git a/app/free_text_parsing/grammar/__init__.py b/app/free_text_parsing/grammar/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/app/free_text_parsing/grammar/common.py b/app/free_text_parsing/grammar/common.py deleted file mode 100644 index c205adfc8f0c54c42aba08ed3d1f55879b068668..0000000000000000000000000000000000000000 --- a/app/free_text_parsing/grammar/common.py +++ /dev/null @@ -1,67 +0,0 @@ -from arpeggio import RegExMatch as _ -from arpeggio import And, OneOrMore, ZeroOrMore, EOF - - -def reverse_regex_or_clause(regex_or_clause: str) -> str: - """ - Reverses a regex or clause - :param regex_or_clause: an or regex clause eg. H|He|Hf|Zn|As - :return: reversed regex clause eg. Zn|Hf|He|H|As - """ - reversed_clause = '|'.join(reversed(sorted(regex_or_clause.split('|')))) - return reversed_clause - - -def term_end_lookahead(): - return And([space, ')', EOF]) - - -def space(): - return _(r'\s') - - -def space_sequence(): - return _(r'\s+') - - -def not_space(): - return _(r'[^\s]') - - -def not_space_sequence(): - return _(r'[^\s]+') - - -def ascii_letter(): - return _(r'[A-Za-z]') - - -def ascii_letter_sequence(): - return _(r'[A-Za-z]+') - - -def integer_number(): - return _(r'\d+') - - -def digit(): - return _(r'\d') - - -def float_number(): - return _(r'\d+\.\d+') - - -def non_space_or_parenthesis_sequence(): - return _(r'[^\s()\[\]]') - - -def correctly_parenthesised_non_space_char_sequence(): - return ( - OneOrMore([ - non_space_or_parenthesis_sequence, - ('(', correctly_parenthesised_non_space_char_sequence, ')'), - ('[', correctly_parenthesised_non_space_char_sequence, ']') - ] - ) - ) \ No newline at end of file diff --git a/app/free_text_parsing/grammar/fasta.py b/app/free_text_parsing/grammar/fasta.py deleted file mode 100644 index 64883b6b3cf99ea6ec6b6df9c4210c4232705352..0000000000000000000000000000000000000000 --- a/app/free_text_parsing/grammar/fasta.py +++ /dev/null @@ -1,26 +0,0 @@ -""" -Grammar for fasta format -""" -from arpeggio import Optional, OneOrMore, ZeroOrMore, Not, Repetition -from arpeggio import RegExMatch as _ - -from app.free_text_parsing.grammar import common - - -def sequence_part(): - return _(r'[A-Z-*]') - - -def protein_sequence(): - min_ten = (sequence_part,)*10 - continuation = (OneOrMore(sequence_part), ) - complete_tuple = min_ten + continuation - return complete_tuple - - -def sequence_id(): - return '>', _(r'[^\n]*'), '\n' - - -def fasta(): - return Optional(sequence_id), protein_sequence(), common.term_end_lookahead diff --git a/app/free_text_parsing/grammar/inchi.py b/app/free_text_parsing/grammar/inchi.py deleted file mode 100644 index 827bf412e8c6ee61fd3822dcc87ece4187a1056c..0000000000000000000000000000000000000000 --- a/app/free_text_parsing/grammar/inchi.py +++ /dev/null @@ -1,33 +0,0 @@ -""" -Grammar for inchi strings -""" -import re - -from arpeggio import OneOrMore, Optional, And, EOF -from arpeggio import RegExMatch as _ - -from app.free_text_parsing.grammar import common - - -def version(): - return common.integer_number, Optional(common.ascii_letter_sequence) - - -def layer(): - return '/', common.correctly_parenthesised_non_space_char_sequence - - -def inchi(): - return 'InChI=', version, OneOrMore(layer), common.term_end_lookahead - - -__INCHI_KEY_REGEX_STR = r'[A-Z]{14}-[A-Z]{10}-[A-Z]' -__INCHI_KEY_REGEX = re.compile(__INCHI_KEY_REGEX_STR) - - -def inchi_key(): - return _(__INCHI_KEY_REGEX_STR), common.term_end_lookahead - - -def is_inchi_key(inchi_key_str: str): - return __INCHI_KEY_REGEX.match(inchi_key_str) is not None diff --git a/app/free_text_parsing/grammar/search_parser.py b/app/free_text_parsing/grammar/search_parser.py deleted file mode 100644 index 46ab7b5562e47ddd90d32a7a7ce0294424ff1351..0000000000000000000000000000000000000000 --- a/app/free_text_parsing/grammar/search_parser.py +++ /dev/null @@ -1,585 +0,0 @@ -from arpeggio import Optional, PTNodeVisitor, OneOrMore, ZeroOrMore, EOF -from arpeggio import RegExMatch as _ -from arpeggio import ParserPython -import arpeggio - -import glados.grammar.common as common -import glados.grammar.smiles as smiles -import glados.grammar.inchi as inchi -import glados.grammar.fasta as fasta -import re -import json -import operator -import requests -import urllib.parse -import traceback -import time -import threading -import sys -from glados.usage_statistics import glados_server_statistics -from glados.models import ESSearchRecord - -from django.http import HttpResponse -from glados.es.query_builder import QueryBuilder -from urllib.parse import urlparse -from django.conf import settings - -BASE_EBI_URL = 'https://www.ebi.ac.uk' - -UNICHEM_DS = None -UNICHEM_DS_LAST_LOAD = 0 -UNICHEM_LOADING_THREAD = None -UNICHEM_LOADING_THREAD_LOCK = threading.Lock() - -CHEMBL_ENTITIES = { - 'target': 'targets', - 'compound': 'compounds', - 'molecule': 'compounds', - 'document': 'documents', - 'assay': 'assays', - 'cell': 'cells', - 'tissue': 'tissues' -} - -WS_PARSED_URL = urlparse(settings.WS_URL) -WS_DOMAIN = WS_PARSED_URL.scheme + '://' + WS_PARSED_URL.netloc -WS_BASE_PATH = WS_PARSED_URL.path - - -# noinspection PyBroadException -def __load_unichem_data(): - global UNICHEM_DS, UNICHEM_LOADING_THREAD, UNICHEM_DS_LAST_LOAD - try: - print('Loading UNICHEM data . . .') - UNICHEM_DS = {} - req = requests.get( - url=BASE_EBI_URL + '/unichem/rest/src_ids/', - headers={'Accept': 'application/json'}, - timeout=5 - ) - json_resp = req.json() - for ds_i in json_resp: - ds_id_i = ds_i['src_id'] - req_i = requests.get(url=BASE_EBI_URL + '/unichem/rest/sources/{0}'.format(ds_id_i), - headers={'Accept': 'application/json'}) - UNICHEM_DS[ds_id_i] = req_i.json()[0] - UNICHEM_DS_LAST_LOAD = time.time() - print(' . . . UNICHEM data loaded!') - except: - print('Error, UNICHEM data is not available!', file=sys.stderr) - UNICHEM_LOADING_THREAD = None - UNICHEM_LOADING_THREAD_LOCK.release() - - -# noinspection PyBroadException -def load_unichem_data(wait=True): - global UNICHEM_LOADING_THREAD, UNICHEM_LOADING_THREAD_LOCK, UNICHEM_DS_LAST_LOAD - if time.time() - UNICHEM_DS_LAST_LOAD > 2*pow(60, 2) and UNICHEM_LOADING_THREAD is None: - try: - UNICHEM_LOADING_THREAD_LOCK.acquire() - if UNICHEM_LOADING_THREAD is None: - UNICHEM_LOADING_THREAD = threading.Thread(target=__load_unichem_data, - name='unichem-data-loader', daemon=True) - UNICHEM_LOADING_THREAD.start() - except: - UNICHEM_LOADING_THREAD_LOCK.release() - if wait: - if UNICHEM_LOADING_THREAD is not None: - UNICHEM_LOADING_THREAD.join() - return UNICHEM_DS is not None - return UNICHEM_DS is not None - - -# load_unichem_data(False) - - -def get_unichem_cross_reference_link_data(src_id: str, cross_reference_id: str) -> dict: - global UNICHEM_DS - if load_unichem_data(): - link_data = { - 'cross_reference_id': cross_reference_id, - 'cross_reference_link': None, - 'cross_reference_label': 'Unknown in UniChem' - } - if src_id in UNICHEM_DS: - ds = UNICHEM_DS[src_id] - if ds['base_id_url_available'] == '1': - link_data['cross_reference_link'] = ds['base_id_url'] + cross_reference_id - link_data['cross_reference_label'] = ds['name_label'] - return link_data - - -def property_term(): - return( - Optional(['+', '-']), - json_property_path_segment, ZeroOrMore('.', json_property_path_segment), ':', - [ - ('"', _('[^"]+'), '"'), - ("'", _("[^']+"), "'"), - ("(", _("[^\(\)]+"), ")"), - common.correctly_parenthesised_non_space_char_sequence - ], - common.term_end_lookahead - ) - - -def json_property_path_segment(): - return OneOrMore(_(r'[a-z0-9_\-]')) - - -def single_term(): - return common.correctly_parenthesised_non_space_char_sequence, common.term_end_lookahead - - -def exact_match_term(): - return ( - [ - ( - Optional(['+', '-']), - [ - ('"', _(r'((\\")|[^"])+'), '"'), - ("'", _(r"((\\')|[^'])+"), "'") - ] - ), - ( - ['+', '-'], common.correctly_parenthesised_non_space_char_sequence - ) - ], - common.term_end_lookahead - ) - - -def expression_term(): - return [parenthesised_expression, - smiles.smiles, - inchi.inchi_key, inchi.inchi, - fasta.fasta, - property_term, - exact_match_term, - single_term] - - -def parenthesised_expression(): - return '(', expression, ')', common.term_end_lookahead - - -def expression(): - return \ - ( - Optional(common.space_sequence), - expression_term, - ZeroOrMore( - # Optional( - # (common.space_sequence, _(r'and|or', ignore_case=True)) - # ), - common.space_sequence, - expression_term, - common.term_end_lookahead - ), - Optional(common.space_sequence) - ) - - -parser = ParserPython(expression, skipws=False) - - -__CHEMBL_REGEX_STR = r'^chembl[^\d\s]{0,2}([\d]+)[^\d\s]{0,2}$' -CHEMBL_REGEX = re.compile(__CHEMBL_REGEX_STR, flags=re.IGNORECASE) -__DOI_REGEX_STR = r'^(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>|])\S)+)$' -DOI_REGEX = re.compile(__DOI_REGEX_STR) -INTEGER_REGEX = re.compile(r'^\d+$') - - -def adjust_exact_term(exact_term: str) -> str: - if exact_term[-1] == '"': - return exact_term - elif exact_term[-1] == "'": - first_char = 1 - prefix = "" - if exact_term[0] == '+' or exact_term[0] == '-': - first_char = 2 - prefix = exact_term[0] - return prefix+'"'+exact_term[first_char:-1].replace(r"\'", r'\"')+'"' - else: - return exact_term[0]+'"'+exact_term[1:]+'"' - - -def get_chembl_id_dict(chembl_id, cross_references=[], include_in_query=True, score=None): - return { - 'chembl_id': chembl_id, - 'cross_references': cross_references, - 'include_in_query': include_in_query, - 'score': score - } - - -def get_chembl_id_list_dict(chembl_ids, cross_references=[], include_in_query=True): - return [ - get_chembl_id_dict( - chembl_id_i, - cross_references[i] if i < len(cross_references) else [], - include_in_query - ) - for i, chembl_id_i in enumerate(chembl_ids) - ] - - -def check_chembl(term_dict: dict): - re_match = CHEMBL_REGEX.match(term_dict['term']) - if re_match is not None: - chembl_id_num = re_match.group(1) - term_dict['references'].append( - { - 'type': 'chembl_id', - 'label': 'ChEMBL ID', - 'chembl_ids': [ - get_chembl_id_dict('CHEMBL{0}'.format(chembl_id_num)) - ], - 'include_in_query': True - } - ) - - -def check_integer(term_dict: dict): - re_match = INTEGER_REGEX.match(term_dict['term']) - if re_match is not None: - term_dict['references'].append( - { - 'type': 'integer_chembl_id', - 'label': 'Integer as ChEMBL ID', - 'chembl_ids': [ - get_chembl_id_dict('CHEMBL{0}'.format(term_dict['term'])) - ], - 'include_in_query': True - } - ) - - -def check_chembl_entities(term_dict: dict): - term = term_dict['term'].lower() - if len(term) > 0 and term[-1] == 's': - term = term[0:-1] - if term in CHEMBL_ENTITIES: - term_dict['chembl_entity'] = CHEMBL_ENTITIES[term] - - -def check_doi(term_dict: dict): - re_match = DOI_REGEX.match(term_dict['term']) - if re_match is not None: - try: - chembl_ids = [] - response = requests.get( - '{es_url}/{index_prefix}document/_search'.format( - es_url=settings.ELASTICSEARCH_EXTERNAL_URL, - index_prefix=settings.CHEMBL_ES_INDEX_PREFIX - ), - json= - { - 'size': 10, - '_source': 'document_chembl_id', - 'query': { - 'term': { - 'doi': { - 'value': term_dict['term'] - } - } - } - }, - timeout=5 - ) - json_response = response.json() - for hit_i in json_response['hits']['hits']: - chembl_ids.append(hit_i['_source']['document_chembl_id']) - if chembl_ids: - term_dict['references'].append( - { - 'type': 'doi', - 'label': 'DOI (Digital Object Identifier)', - 'chembl_ids': get_chembl_id_list_dict(chembl_ids), - 'include_in_query': True, - 'chembl_entity': 'document' - } - ) - except: - traceback.print_exc() - - -def check_inchi(term_dict: dict, term_is_inchi_key=False): - try: - chembl_ids = [] - response = requests.get( - '{es_url}/{index_prefix}molecule/_search'.format( - es_url=settings.ELASTICSEARCH_EXTERNAL_URL, - index_prefix=settings.CHEMBL_ES_INDEX_PREFIX - ), - json= - { - 'size': 10, - '_source': 'molecule_chembl_id', - 'query': { - 'term': { - 'molecule_structures.standard_inchi'+('_key' if term_is_inchi_key else ''): { - 'value': term_dict['term'] - } - } - } - }, - timeout=5 - ) - json_response = response.json() - for hit_i in json_response['hits']['hits']: - chembl_ids.append(hit_i['_source']['molecule_chembl_id']) - if chembl_ids: - term_dict['references'].append( - { - 'type': 'inchi'+('_key' if term_is_inchi_key else ''), - 'label': 'InChI'+(' Key' if term_is_inchi_key else ''), - 'chembl_ids': get_chembl_id_list_dict(chembl_ids), - 'include_in_query': True, - 'chembl_entity': 'compound' - } - ) - except: - traceback.print_exc() - - -def check_smiles(term_dict: dict): - global WS_BASE_PATH - try: - chembl_ids = [] - next_url_path = '{ws_path}/molecule.json?molecule_structures__canonical_smiles__flexmatch={smiles}'\ - .format(ws_path=WS_BASE_PATH, smiles=urllib.parse.quote(term_dict['term'])) - while next_url_path: - response = requests.get( - WS_DOMAIN + next_url_path, - headers={'Accept': 'application/json'}, - timeout=5 - ) - json_response = response.json() - if 'error_message' in json_response: - return None - for molecule_i in json_response['molecules']: - chembl_ids.append(molecule_i['molecule_chembl_id']) - next_url_path = json_response['page_meta']['next'] - if chembl_ids: - term_dict['references'].append( - { - 'type': 'smiles', - 'label': 'SMILES', - 'chembl_ids': get_chembl_id_list_dict(chembl_ids), - 'include_in_query': True, - 'chembl_entity': 'compound' - } - ) - except: - traceback.print_exc() - - -# noinspection PyBroadException -def check_unichem(term_dict: dict): - pass - # try: - # response = requests.get( - # BASE_EBI_URL+'/unichem/rest/orphanIdMap/{0}' - # .format(urllib.parse.quote(term_dict['term'])), - # headers={'Accept': 'application/json'}, - # timeout=5 - # ) - # json_response = response.json() - # if 'error' in json_response: - # return None - # chembl_ids = [] - # unichem_not_in_chembl_cross_refs = [] - # for unichem_src_i in json_response: - # cross_references = [] - # chembl_id_i = None - # for link_i in json_response[unichem_src_i]: - # if link_i['src_id'] == '1': - # chembl_id_i = link_i['src_compound_id'] - # elif unichem_src_i == '1': - # chembl_id_i = term_dict['term'] - # cross_references.append( - # get_unichem_cross_reference_link_data(link_i['src_id'], link_i['src_compound_id']) - # ) - # cross_references_dict = { - # 'from': get_unichem_cross_reference_link_data(unichem_src_i, term_dict['term']), - # 'also_in': cross_references - # } - # if chembl_id_i is not None: - # chembl_ids.append(get_chembl_id_dict(chembl_id_i, [cross_references_dict])) - # else: - # unichem_not_in_chembl_cross_refs.append(cross_references_dict) - # - # if len(chembl_ids) > 0 or len(unichem_not_in_chembl_cross_refs) > 0: - # term_dict['references'].append( - # { - # 'type': 'unichem', - # 'label': 'UniChem', - # 'chembl_ids': chembl_ids, - # 'not_in_chembl': unichem_not_in_chembl_cross_refs, - # 'include_in_query': False, - # 'chembl_entity': 'compound' - # } - # ) - # except: - # traceback.print_exc() - - -class TermsVisitor(PTNodeVisitor): - - def __init__(self): - super().__init__() - - def visit__default__(self, node, children): - """ - Called if no visit method is defined for the node. - - Args: - node(ParseTreeNode): - children(processed children ParseTreeNode-s): - """ - if isinstance(node, arpeggio.Terminal): - return arpeggio.text(node) - else: - # term = ''.join([str(child_i) for child_i in children]) - # check_unichem(term) - return ''.join([str(child_i) for child_i in children]) - - def visit_expression_term(self, node, children): - return children[0] - - def visit_parenthesised_expression(self, node, children): - return children[1] - - def visit_expression(self, node, children): - exp = {'or': []} - previous_single_term_lc = None - for child_i in children: - str_child_i_lc = str(child_i).strip().lower() - term_dict = None - if len(str_child_i_lc) > 0: - - if str_child_i_lc == 'and' or str_child_i_lc == 'or': - term_dict = self.get_term_dict(str(child_i).strip(), include_in_query=False) - check_unichem(term_dict) - last_term_is_and_group = len(exp['or']) > 0 and type(exp['or'][-1]) == dict and 'and' in exp['or'][-1] - if str_child_i_lc == 'and' and not last_term_is_and_group: - if len(exp['or']) > 0: - exp['or'][-1] = {'and': [exp['or'][-1], term_dict]} - else: - exp['or'].append({'and': [term_dict]}) - elif last_term_is_and_group and (str_child_i_lc == 'and' or previous_single_term_lc == 'and'): - if term_dict: - exp['or'][-1]['and'].append(term_dict) - else: - exp['or'][-1]['and'].append(child_i) - else: - if term_dict: - exp['or'].append(term_dict) - else: - exp['or'].append(child_i) - previous_single_term_lc = str_child_i_lc - if len(exp['or']) == 1: - return exp['or'][0] - return exp - - @staticmethod - def get_term_dict(term: str, include_in_query=True) -> dict: - return { - 'term': term, - 'include_in_query': include_in_query, - 'references': [], - 'exact_match_term': False, - 'filter_term': False, - 'chembl_entity': None - } - - def visit_smiles(self, node, children): - term = ''.join(children) - include_in_query = len(term) <= 4 - term_dict = self.get_term_dict(term, include_in_query=include_in_query) - check_smiles(term_dict) - if include_in_query: - check_unichem(term_dict) - if inchi.is_inchi_key(term): - check_inchi(term_dict) - return term_dict - - def visit_inchi(self, node, children): - term = ''.join(children) - term_dict = self.get_term_dict(term, include_in_query=False) - check_inchi(term_dict) - return term_dict - - def visit_inchi_key(self, node, children): - term = ''.join(children) - term_dict = self.get_term_dict(term, include_in_query=False) - check_inchi(term_dict, term_is_inchi_key=True) - return term_dict - - def visit_fasta(self, node, children): - term = ''.join(children) - term_dict = self.get_term_dict(term, include_in_query=False) - # check_fasta(term_dict) - return term_dict - - def visit_property_term(self, node, children): - term = ''.join(children) - term_dict = self.get_term_dict(term) - term_dict['filter_term'] = True - return term_dict - - def visit_exact_match_term(self, node, children): - term = ''.join(children) - term = adjust_exact_term(term) - term_dict = self.get_term_dict(term) - term_dict['exact_match_term'] = True - return term_dict - - def visit_single_term(self, node, children): - term = ''.join(children) - term_lc = term.lower() - if term_lc == 'or' or term_lc == 'and': - return term - term_dict = self.get_term_dict(term) - check_unichem(term_dict) - check_chembl(term_dict) - check_integer(term_dict) - check_doi(term_dict) - check_chembl_entities(term_dict) - return term_dict - - -def parse_query_str(query_string: str): - if len(query_string.strip()) == 0: - return {} - query_string = re.sub(r'[\s&&[^\n]]+', ' ', query_string) - pt = parser.parse(query_string) - result = arpeggio.visit_parse_tree(pt, TermsVisitor()) - return result - - -def parse_url_search(request): - if request.method == 'GET': - return HttpResponse('INVALID USAGE! PLEASE USE POST!', status=400) - elif request.method == 'POST': - - start_time = time.time() - query_string = request.POST.get('query_string', '') - indexes_str = request.POST.get('es_indexes', '') - indexes = indexes_str.split(',') - selected_es_index = request.POST.get('selected_es_index', None) - - parsed_query = parse_query_str(query_string) - best_queries, sorted_indexes_by_score = QueryBuilder.get_best_es_query(parsed_query, indexes, selected_es_index) - - response_dict = { - 'parsed_query': parsed_query, - 'best_es_base_queries': best_queries, - 'sorted_indexes_by_score': sorted_indexes_by_score - } - end_time = time.time() - time_taken = end_time - start_time - glados_server_statistics.record_search(ESSearchRecord.FREE_TEXT, time_taken) - - return HttpResponse(json.dumps(response_dict)) diff --git a/app/free_text_parsing/grammar/smiles.py b/app/free_text_parsing/grammar/smiles.py deleted file mode 100644 index f0db5b831728909077c170dec2e4a59e94cd2d01..0000000000000000000000000000000000000000 --- a/app/free_text_parsing/grammar/smiles.py +++ /dev/null @@ -1,135 +0,0 @@ -""" -Gramar for smiles srings -""" -from arpeggio import Optional, ZeroOrMore, And, EOF -from arpeggio import RegExMatch as _ - -from app.free_text_parsing.grammar import common - - -def aliphatic_organic(): - return _(common.reverse_regex_or_clause(r'B|C|N|O|S|P|F|Cl|Br|I')) - - -def aromatic_organic(): - return _(common.reverse_regex_or_clause(r'b|c|n|o|s|p')) - - -def element_symbol(): - return _( - common.reverse_regex_or_clause( - r'H|He|' - r'Li|Be|B|C|N|O|F|Ne|' - r'Na|Mg|Al|Si|P|S|Cl|Ar|' - r'K|Ca|Sc|Ti|V|Cr|Mn|Fe|Co|Ni|Cu|Zn|Ga|Ge|As|Se|Br|Kr|' - r'Rb|Sr|Y|Zr|Nb|Mo|Tc|Ru|Rh|Pd|Ag|Cd|In|Sn|Sb|Te|I|Xe|' - r'Cs|Ba|Hf|Ta|W|Re|Os|Ir|Pt|Au|Hg|Tl|Pb|Bi|Po|At|Rn|' - r'Fr|Ra|Rf|Db|Sg|Bh|Hs|Mt|Ds|Rg|Cn|Fl|Lv|' - r'La|Ce|Pr|Nd|Pm|Sm|Eu|Gd|Tb|Dy|Ho|Er|Tm|Yb|Lu|' - r'Ac|Th|Pa|U|Np|Pu|Am|Cm|Bk|Cf|Es|Fm|Md|No|Lr' - ) - ) - - -def aromatic_symbol(): - return _(common.reverse_regex_or_clause(r'b|c|n|o|p|s|se|as')) - - -def bracket_atom_symbol(): - return [element_symbol, aromatic_symbol, r'*'] - - -def chiral(): - return [ - '@@', '@TH1', '@TH2', '@AL1', '@AL2', '@SP1', '@SP2', '@SP3', - ('@TB', common.digit, Optional(common.digit)), - ('@OH', common.digit, Optional(common.digit)), '@' - ] - - -def h_count(): - return 'H', Optional(common.digit) - - -def charge(): - return [ - ( - '-', - Optional([ - '-', - (common.digit, Optional(common.digit)) - ]) - ), - ( - '+', - Optional([ - '+', - (common.digit, Optional(common.digit)) - ]) - ) - ] - - -def atom_class(): - return ':', common.integer_number - - -def bracket_atom(): - """ - http://opensmiles.org/opensmiles.html - bracket_atom ::= '[' isotope? symbol chiral? hcount? charge? class? ']' - """ - return r'[', Optional(common.integer_number), bracket_atom_symbol, \ - Optional(chiral), Optional(h_count), Optional(charge), Optional(atom_class), r']' - - -def atom(): - return [bracket_atom, aliphatic_organic, aromatic_organic, '*'] - - -def dot(): - return '.' - - -def bond(): - return ['-', '=', '#', '$', ':', '/', '\\'] - - -def branched_atom(): - return atom, Optional(first_ring_bond, ZeroOrMore(next_ring_bond)), ZeroOrMore(branch) - - -def first_ring_bond(): - return Optional(bond), \ - [ - (common.digit, Optional(common.digit)), - ('%', common.digit, common.digit) - ] - - -def next_ring_bond(): - return [ - (bond, (common.digit, Optional(common.digit))), - (Optional(bond), '%', common.digit, common.digit) - ] - - -def branch(): - return '(', Optional([bond, dot]), chain, ')' - - -def chain(): - return \ - ( - branched_atom, - ZeroOrMore( - ( - Optional([bond, dot]), branched_atom - ) - ) - ) - - -def smiles(): - return chain, common.term_end_lookahead - diff --git a/app/free_text_parsing/grammar/test/__init__.py b/app/free_text_parsing/grammar/test/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/app/free_text_parsing/grammar/test/test_grammar.py b/app/free_text_parsing/grammar/test/test_grammar.py deleted file mode 100644 index 323bcd721d7e550f787b603d27b54790ea644eb4..0000000000000000000000000000000000000000 --- a/app/free_text_parsing/grammar/test/test_grammar.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -Module that tests the grammar -""" -import unittest - -from arpeggio import ParserPython - -from app.free_text_parsing.grammar.test import utils -from app.free_text_parsing.grammar import smiles -from app.free_text_parsing.grammar import inchi -from app.free_text_parsing.grammar import fasta - - -class GrammarTester(unittest.TestCase): - - def setUp(self): - print('Running Test: {0}'.format(self._testMethodName)) - - def tearDown(self): - print('Test {0}'.format('Passed!' if self._outcome.success else 'Failed!')) - - def try_parse_failure(self, parser: ParserPython, text_to_parse: str): - # noinspection PyBroadException - try: - parser.parse(text_to_parse) - self.fail("Should not parse {0} using the {1} parser!".format(text_to_parse, parser.__class__)) - except: - pass - - def try_parsing(self, parser: ParserPython, text_to_parse: str): - # noinspection PyBroadException - try: - parser.parse(text_to_parse) - except: - self.fail("Could not parse {0} using the {1} parser!".format(text_to_parse, parser.__class__)) - - # ------------------------------------------------------------------------------------------------------------------ - # Simple Grammars Tests - # ------------------------------------------------------------------------------------------------------------------ - - def test_smiles_parsing(self): - # noinspection PyBroadException - try: - parser = ParserPython(smiles.smiles, skipws=False) - except: - self.fail("Could not instantiate the SMILES parser!") - if parser: - for smiles_i in utils.SMILES_EXAMPLES: - self.try_parsing(parser, smiles_i) - for non_smiles_i in utils.NON_SMILES_EXAMPLES: - self.try_parse_failure(parser, non_smiles_i) - - def test_inchi_parsing(self): - # noinspection PyBroadException - try: - parser = ParserPython(inchi.inchi, skipws=False) - except: - self.fail("Could not instantiate the InChI parser!") - if parser: - for inchi_i in utils.INCHI_EXAMPLES: - self.try_parsing(parser, inchi_i) - - def test_inchi_key_parsing(self): - # noinspection PyBroadException - try: - parser = ParserPython(inchi.inchi_key, skipws=False) - except: - self.fail("Could not instantiate the InChI KEY parser!") - if parser: - for inchi_key_i in utils.INCHI_KEY_EXAMPLES: - self.try_parsing(parser, inchi_key_i) - - def test_fasta_parsing(self): - # noinspection PyBroadException - try: - parser = ParserPython(fasta.fasta, skipws=False) - except: - self.fail("Could not instantiate the FASTA parser!") - if parser: - for fasta_sequence_i in utils.FASTA_SAMPLES: - self.try_parsing(parser, fasta_sequence_i) - - # ------------------------------------------------------------------------------------------------------------------ - # Simple Grammars Tests - # ------------------------------------------------------------------------------------------------------------------ - - def test_parenthesization(self): - pass diff --git a/app/free_text_parsing/grammar/test/utils.py b/app/free_text_parsing/grammar/test/utils.py deleted file mode 100644 index af71141b7638b0756f341dc7569f1c8204c56e5b..0000000000000000000000000000000000000000 --- a/app/free_text_parsing/grammar/test/utils.py +++ /dev/null @@ -1,186 +0,0 @@ - -# Longest molecule SMILES found in ChEMBL -LONGEST_CHEMBL_SMILES = r'CCCCCCCCCCCCCCCC[NH2+]OC(CO)C(O)C(OC1OC(CO)C(O)C(O)C1O)C(O)CO.CCCCCCCCCCCCCCCC[NH2+' \ - r']OC(CO)C(O)C(OC2OC(CO)C(O)C(O)C2O)C(O)CO.CCCCCCCCCCCCCCCC[NH2+]OC(CO)C(O)C(OC3OC(CO' \ - r')C(O)C(O)C3O)C(O)CO.CCCCCCCCCCCCCCCC[NH2+]OC(CO)C(O)C(OC4OC(CO)C(O)C(O)C4O)C(O)CO.C' \ - r'CCCCCCCCCCCCCCC[NH2+]OC(CO)C(O)C(OC5OC(CO)C(O)C(O)C5O)C(O)CO.CCCCCCCCCCCCCCCC[NH2+]' \ - r'OC(CO)C(O)C(OC6OC(CO)C(O)C(O)C6O)C(O)CO.CCCCCCCCCCCCCCCC[NH2+]OC(CO)C(O)C(OC7OC(CO)' \ - r'C(O)C(O)C7O)C(O)CO.CCCCCCCCCCCCCCCC[NH2+]OC(CO)C(O)C(OC8OC(CO)C(O)C(O)C8O)C(O)CO.CC' \ - r'CCCCCCCCCCCCCC[NH2+]OC(CO)C(O)C(OC9OC(CO)C(O)C(O)C9O)C(O)CO.CCCCCCCCCCCCCCCC[NH2+]O' \ - r'C(CO)C(O)C(OC%10OC(CO)C(O)C(O)C%10O)C(O)CO.CCCCCCCCCCCCCCCC[NH2+]OC(CO)C(O)C(OC%11O' \ - r'C(CO)C(O)C(O)C%11O)C(O)CO.CCCCCCCCCCCCCCCC[NH2+]OC(CO)C(O)C(OC%12OC(CO)C(O)C(O)C%12' \ - r'O)C(O)CO.CCCCCCCCCC(C(=O)NCCc%13ccc(OP(=S)(Oc%14ccc(CCNC(=O)C(CCCCCCCCC)P(=O)(O)[O-' \ - r'])cc%14)N(C)\N=C\c%15ccc(Op%16(Oc%17ccc(\C=N\N(C)P(=S)(Oc%18ccc(CCNC(=O)C(CCCCCCCCC' \ - r')P(=O)(O)[O-])cc%18)Oc%19ccc(CCNC(=O)C(CCCCCCCCC)P(=O)(O)[O-])cc%19)cc%17)np(Oc%20c' \ - r'cc(\C=N\N(C)P(=S)(Oc%21ccc(CCNC(=O)C(CCCCCCCCC)P(=O)(O)[O-])cc%21)Oc%22ccc(CCNC(=O)' \ - r'C(CCCCCCCCC)P(=O)(O)[O-])cc%22)cc%20)(Oc%23ccc(\C=N\N(C)P(=S)(Oc%24ccc(CCNC(=O)C(CC' \ - r'CCCCCCC)P(=O)(O)[O-])cc%24)Oc%25ccc(CCNC(=O)C(CCCCCCCCC)P(=O)(O)[O-])cc%25)cc%23)np' \ - r'(Oc%26ccc(\C=N\N(C)P(=S)(Oc%27ccc(CCNC(=O)C(CCCCCCCCC)P(=O)(O)[O-])cc%27)Oc%28ccc(C' \ - r'CNC(=O)C(CCCCCCCCC)P(=O)(O)[O-])cc%28)cc%26)(Oc%29ccc(\C=N\N(C)P(=S)(Oc%30ccc(CCNC(' \ - r'=O)C(CCCCCCCCC)P(=O)(O)[O-])cc%30)Oc%31ccc(CCNC(=O)C(CCCCCCCCC)P(=O)(O)[O-])cc%31)c' \ - r'c%29)n%16)cc%15)cc%13)P(=O)(O)[O-]' - - -SMILES_EXAMPLES = \ - [ - r'COc1ccc2[C@@H]3[C@H](COc2c1)C(C)(C)OC4=C3C(=O)C(=O)C5=C4OC(C)(C)[C@@H]6COc7cc(OC)ccc7[C@H]56', - r'C\C(=C\C(=O)O)\C=C\C=C(/C)\C=C\C1=C(C)CCCC1(C)C', - r'COC1(CN2CCC1CC2)C#CC(C#N)(c3ccccc3)c4ccccc4', - r'CN1C\C(=C/c2ccc(C)cc2)\C3=C(C1)C(C(=C(N)O3)C#N)c4ccc(C)cc4', - r'COc1ccc2[C@@H]3[C@H](COc2c1)C(C)(C)OC4=C3C(=O)C(=O)C5=C4OC(C)(C)[C@@H]6COc7cc(OC)ccc7[C@H]56', - r'CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@H]3CCCN3C(=O)C(CCCCN)CCCCN)C(C)(C)C)C(=O)O', - r'Cc1nnc2CN=C(c3ccccc3)c4cc(Cl)ccc4-n12', - r'CN1C(=O)CN=C(c2ccccc2)c3cc(Cl)ccc13', - r'c', - r'ccc', - r'cs', - r'ccs', - r'CC(C)(N)Cc1ccccc1', - r'CCCc1nn(C)c2C(=O)NC(=Nc12)c3cc(ccc3OCC)S(=O)(=O)N4CCN(C)CC4.OC(=O)CC(O)(CC(=O)O)C(=O)O', - r'CCOc1ccc(cc1)c2nn3nc(SC)ccc3c2c4ccc(cc4)S(=O)(=O)N', - r'COc1cc2ncnc(Nc3cc(NC(=O)c4ccccc4)ccc3C)c2cc1OC', - r'CC(=O)Nc1cccc(c1)c2cc(ccc2COCc3cncn3Cc4ccc(cc4)C#N)C#N', - r'COc1cccc(c1)c2cc(ccc2COCc3cncn3Cc4ccc(nc4)C#N)C#N', - r'Cc1cc(C)nc(NC(=O)CCc2c[nH]c3ccccc23)c1', - r'COc1cc2ncnc(Nc3ccc(C)c(NC(=O)c4ccccc4)c3)c2cc1OC', - r'COc1ccc2c(c3ccc(cc3)S(=O)(=O)C)c(nn2n1)c4ccc(F)cc4', - r'Cc1ccccc1COc2ccc(cc2)S(=O)(=O)N3CCC[C@@](C)(O)[C@@H]3C(=O)NO', - r'Cc1csc(c1)C(=O)NNC(=O)CN2CCCN(Cc3ccc(Cl)cc3)CC2', - r'Clc1ccc(OCc2ccccc2)c(\C=C\3/SC(=O)NC3=O)c1', - LONGEST_CHEMBL_SMILES - ] - -NON_SMILES_EXAMPLES = \ - [ - r'P55072123412345345', - r'P55072', - ] - -# Longest molecule InChI found in ChEMBL -LONGEST_CHEMBL_INCHIS=[ - r'InChI=1S/C271H337N109O172P28/c1-96-35-355(263(406)346-227(96)385)156-13-106(132(502-156)47-476-563(434,435)539-11' - r'8-26-170(370-84-311-187-217(370)324-252(281)336-237(187)395)513-142(118)56-484-554(416,417)526-105-12-155(354-10-' - r'9-154(272)321-262(354)405)499-129(105)44-473-561(430,431)538-117-25-169(369-83-310-186-216(369)323-251(280)335-23' - r'6(186)394)512-141(117)55-472-553(414,415)525-104-11-168(498-128(104)43-381)368-82-309-185-215(368)322-250(279)334' - r'-235(185)393)527-555(418,419)474-45-130-107(14-157(500-130)356-36-97(2)228(386)347-264(356)407)528-556(420,421)48' - r'5-57-143-121(29-173(514-143)373-87-314-190-220(373)327-255(284)339-240(190)398)543-571(450,451)481-52-138-114(22-' - r'165(509-138)365-79-305-181-205(275)293-73-299-211(181)365)535-567(442,443)490-61-147-119(27-171(518-147)371-85-31' - r'2-188-218(371)325-253(282)337-238(188)396)541-565(438,439)478-49-134-110(17-160(504-134)359-39-100(5)231(389)350-' - r'267(359)410)530-558(424,425)487-59-145-123(31-175(516-145)375-89-316-192-222(375)329-257(286)341-242(192)400)544-' - r'572(452,453)482-53-139-115(23-166(510-139)366-80-306-182-206(276)294-74-300-212(182)366)536-568(444,445)491-62-14' - r'8-120(28-172(519-148)372-86-313-189-219(372)326-254(283)338-239(189)397)540-564(436,437)477-48-133-109(16-159(503' - r'-133)358-38-99(4)230(388)349-266(358)409)529-557(422,423)486-58-144-122(30-174(515-144)374-88-315-191-221(374)328' - r'-256(285)340-241(191)399)542-570(448,449)480-51-137-113(21-164(508-137)364-78-304-180-204(274)292-72-298-210(180)' - r'364)534-566(440,441)479-50-136-112(20-163(507-136)363-77-303-179-203(273)291-71-297-209(179)363)533-562(432,433)4' - r'75-46-131-108(15-158(501-131)357-37-98(3)229(387)348-265(357)408)531-559(426,427)489-64-150-126(34-178(521-150)37' - r'8-92-319-195-225(378)332-260(289)344-245(195)403)547-575(458,459)495-67-152-199(197(382)247(522-152)379-93-308-18' - r'4-208(278)296-76-302-214(184)379)550-577(462,463)496-68-153-200(198(383)248(523-153)380-94-320-196-226(380)333-26' - r'1(290)345-246(196)404)549-576(460,461)494-66-135-111(18-161(505-135)360-40-101(6)232(390)351-268(360)411)532-560(' - r'428,429)488-60-146-124(32-176(517-146)376-90-317-193-223(376)330-258(287)342-243(193)401)545-573(454,455)483-54-1' - r'40-116(24-167(511-140)367-81-307-183-207(277)295-75-301-213(183)367)537-569(446,447)492-63-149-125(33-177(520-149' - r')377-91-318-194-224(377)331-259(288)343-244(194)402)546-574(456,457)493-65-151-127(19-162(506-151)361-41-102(7)23' - r'3(391)352-269(361)412)548-579(466,467)552-580(468,469)551-578(464,465)497-70-271-69-470-95-471-201(202(271)384)24' - r'9(524-271)362-42-103(8)234(392)353-270(362)413/h9-10,35-42,71-94,104-153,155-178,197-202,247-249,381-384H,11-34,4' - r'3-70,95H2,1-8H3,(H,414,415)(H,416,417)(H,418,419)(H,420,421)(H,422,423)(H,424,425)(H,426,427)(H,428,429)(H,430,43' - r'1)(H,432,433)(H,434,435)(H,436,437)(H,438,439)(H,440,441)(H,442,443)(H,444,445)(H,446,447)(H,448,449)(H,450,451)(' - r'H,452,453)(H,454,455)(H,456,457)(H,458,459)(H,460,461)(H,462,463)(H,464,465)(H,466,467)(H,468,469)(H2,272,321,405' - r')(H2,273,291,297)(H2,274,292,298)(H2,275,293,299)(H2,276,294,300)(H2,277,295,301)(H2,278,296,302)(H,346,385,406)(' - r'H,347,386,407)(H,348,387,408)(H,349,388,409)(H,350,389,410)(H,351,390,411)(H,352,391,412)(H,353,392,413)(H3,279,3' - r'22,334,393)(H3,280,323,335,394)(H3,281,324,336,395)(H3,282,325,337,396)(H3,283,326,338,397)(H3,284,327,339,398)(H' - r'3,285,328,340,399)(H3,286,329,341,400)(H3,287,330,342,401)(H3,288,331,343,402)(H3,289,332,344,403)(H3,290,333,345' - r',404)/t104-,105-,106-,107-,108-,109-,110-,111-,112-,113-,114-,115-,116-,117-,118-,119-,120-,121-,122-,123-,124-,1' - r'25-,126-,127-,128+,129+,130+,131+,132+,133+,134+,135+,136+,137+,138+,139+,140+,141+,142+,143+,144+,145+,146+,147+' - r',148+,149+,150+,151+,152+,153+,155+,156+,157+,158+,159+,160+,161+,162+,163+,164+,165+,166+,167+,168+,169+,170+,17' - r'1+,172+,173+,174+,175+,176+,177+,178+,197+,198+,199+,200+,201?,202-,247+,248+,249+,271+/m0/s1', - r'InChI=1S/C270H336N110O171P28/c1-95-35-355(262(406)346-226(95)385)155-13-105(131(501-155)47-475-562(434,435)538-11' - r'7-26-169(370-83-310-186-216(370)324-251(280)336-236(186)395)512-141(117)56-483-553(416,417)525-104-12-154(354-10-' - r'9-153(271)321-261(354)405)498-128(104)44-472-560(430,431)537-116-25-168(369-82-309-185-215(369)323-250(279)335-23' - r'5(185)394)511-140(116)55-471-552(414,415)524-103-11-167(497-127(103)43-381)368-81-308-184-214(368)322-249(278)334' - r'-234(184)393)526-554(418,419)473-45-129-106(14-156(499-129)356-36-96(2)227(386)347-263(356)407)527-555(420,421)48' - r'4-57-142-120(29-172(513-142)373-86-313-189-219(373)327-254(283)339-239(189)398)542-570(450,451)480-52-137-113(22-' - r'164(508-137)365-78-304-180-204(274)292-72-298-210(180)365)534-566(442,443)489-61-146-118(27-170(517-146)371-84-31' - r'1-187-217(371)325-252(281)337-237(187)396)540-564(438,439)477-49-133-109(17-159(503-133)359-39-99(5)230(389)350-2' - r'66(359)410)529-557(424,425)486-59-144-122(31-174(515-144)375-88-315-191-221(375)329-256(285)341-241(191)400)543-5' - r'71(452,453)481-53-138-114(23-165(509-138)366-79-305-181-205(275)293-73-299-211(181)366)535-567(444,445)490-62-147' - r'-119(28-171(518-147)372-85-312-188-218(372)326-253(282)338-238(188)397)539-563(436,437)476-48-132-108(16-158(502-' - r'132)358-38-98(4)229(388)349-265(358)409)528-556(422,423)485-58-143-121(30-173(514-143)374-87-314-190-220(374)328-' - r'255(284)340-240(190)399)541-569(448,449)479-51-136-112(21-163(507-136)364-77-303-179-203(273)291-71-297-209(179)3' - r'64)533-565(440,441)478-50-135-111(20-162(506-135)363-76-302-178-202(272)290-70-296-208(178)363)532-561(432,433)47' - r'4-46-130-107(15-157(500-130)357-37-97(3)228(387)348-264(357)408)530-558(426,427)488-64-149-125(34-177(520-149)378' - r'-91-318-194-224(378)332-259(288)344-244(194)403)546-574(458,459)494-67-151-198(196(382)246(521-151)379-92-307-183' - r'-207(277)295-75-301-213(183)379)549-576(462,463)495-68-152-199(197(383)247(522-152)380-93-319-195-225(380)333-260' - r'(289)345-245(195)404)548-575(460,461)493-66-134-110(18-160(504-134)360-40-100(6)231(390)351-267(360)411)531-559(4' - r'28,429)487-60-145-123(32-175(516-145)376-89-316-192-222(376)330-257(286)342-242(192)401)544-572(454,455)482-54-13' - r'9-115(24-166(510-139)367-80-306-182-206(276)294-74-300-212(182)367)536-568(446,447)491-63-148-124(33-176(519-148)' - r'377-90-317-193-223(377)331-258(287)343-243(193)402)545-573(456,457)492-65-150-126(19-161(505-150)361-41-101(7)232' - r'(391)352-268(361)412)547-578(466,467)551-579(468,469)550-577(464,465)496-69-270-201(384)200(470-94-320-270)248(52' - r'3-270)362-42-102(8)233(392)353-269(362)413/h9-10,35-42,70-93,103-152,154-177,196-201,246-248,320,381-384H,11-34,4' - r'3-69,94H2,1-8H3,(H,414,415)(H,416,417)(H,418,419)(H,420,421)(H,422,423)(H,424,425)(H,426,427)(H,428,429)(H,430,43' - r'1)(H,432,433)(H,434,435)(H,436,437)(H,438,439)(H,440,441)(H,442,443)(H,444,445)(H,446,447)(H,448,449)(H,450,451)(' - r'H,452,453)(H,454,455)(H,456,457)(H,458,459)(H,460,461)(H,462,463)(H,464,465)(H,466,467)(H,468,469)(H2,271,321,405' - r')(H2,272,290,296)(H2,273,291,297)(H2,274,292,298)(H2,275,293,299)(H2,276,294,300)(H2,277,295,301)(H,346,385,406)(' - r'H,347,386,407)(H,348,387,408)(H,349,388,409)(H,350,389,410)(H,351,390,411)(H,352,391,412)(H,353,392,413)(H3,278,3' - r'22,334,393)(H3,279,323,335,394)(H3,280,324,336,395)(H3,281,325,337,396)(H3,282,326,338,397)(H3,283,327,339,398)(H' - r'3,284,328,340,399)(H3,285,329,341,400)(H3,286,330,342,401)(H3,287,331,343,402)(H3,288,332,344,403)(H3,289,333,345' - r',404)/t103-,104-,105-,106-,107-,108-,109-,110-,111-,112-,113-,114-,115-,116-,117-,118-,119-,120-,121-,122-,123-,1' - r'24-,125-,126-,127+,128+,129+,130+,131+,132+,133+,134+,135+,136+,137+,138+,139+,140+,141+,142+,143+,144+,145+,146+' - r',147+,148+,149+,150+,151+,152+,154+,155+,156+,157+,158+,159+,160+,161+,162+,163+,164+,165+,166+,167+,168+,169+,17' - r'0+,171+,172+,173+,174+,175+,176+,177+,196+,197+,198+,199+,200?,201-,246+,247+,248+,270+/m0/s1' -] - -INCHI_EXAMPLES = \ - [ - 'InChI=1S/C21H20N4O3S2/c1-3-28-16-8-4-15(5-9-16)21-20(14-6-10-17(11-7-14)30(22,26)27)18-12-13-19(29-2)23-25(18)' - '24-21/h4-13H,3H2,1-2H3,(H2,22,26,27)', - 'InChI=1S/C24H22N4O3/c1-15-9-10-17(27-24(29)16-7-5-4-6-8-16)11-19(15)28-23-18-12-21(30-2)22(31-3)13-20(18)25-14' - '-26-23/h4-14H,1-3H3,(H,27,29)(H,25,26,28)', - 'InChI=1S/C28H23N5O2/c1-20(34)32-26-4-2-3-24(12-26)28-11-23(14-30)9-10-25(28)17-35-18-27-15-31-19-33(27)16-22-7' - '-5-21(13-29)6-8-22/h2-12,15,19H,16-18H2,1H3,(H,32,34)', - 'InChI=1S/C26H21N5O2/c1-32-25-4-2-3-21(10-25)26-9-19(11-27)5-7-22(26)16-33-17-24-14-29-18-31(24)15-20-6-8-23(12' - '-28)30-13-20/h2-10,13-14,18H,15-17H2,1H3', - 'InChI=1S/C18H19N3O/c1-12-9-13(2)20-17(10-12)21-18(22)8-7-14-11-19-16-6-4-3-5-15(14)16/h3-6,9-11,19H,7-8H2,1-2H' - '3,(H,20,21,22)', - 'InChI=1S/C24H22N4O3/c1-15-9-10-17(11-19(15)28-24(29)16-7-5-4-6-8-16)27-23-18-12-21(30-2)22(31-3)13-20(18)25-14' - '-26-23/h4-14H,1-3H3,(H,28,29)(H,25,26,27)', - 'InChI=1S/C20H16FN3O3S/c1-27-18-12-11-17-19(13-5-9-16(10-6-13)28(2,25)26)20(23-24(17)22-18)14-3-7-15(21)8-4-14/' - 'h3-12H,1-2H3', - 'InChI=1S/C21H26N2O6S/c1-15-6-3-4-7-16(15)14-29-17-8-10-18(11-9-17)30(27,28)23-13-5-12-21(2,25)19(23)20(24)22-2' - '6/h3-4,6-11,19,25-26H,5,12-14H2,1-2H3,(H,22,24)/t19-,21+/m0/s1', - 'InChI=1S/C20H25ClN4O2S/c1-15-11-18(28-14-15)20(27)23-22-19(26)13-25-8-2-7-24(9-10-25)12-16-3-5-17(21)6-4-16/h3' - '-6,11,14H,2,7-10,12-13H2,1H3,(H,22,26)(H,23,27)', - 'InChI=1S/C17H12ClNO3S/c18-13 -6-7-14(22-10-11-4-2-1-3-5-11)12(8-13)9-15-16(20)19-17(21)23-15/h1-9H,10H2,(H,19,' - '20,21)/b15-9-' - ] + LONGEST_CHEMBL_INCHIS - -INCHI_KEY_EXAMPLES = \ - [ - 'WQGOWYVBXVBECY-UHFFFAOYSA-N', - 'UVUISNWETBUMTN-UHFFFAOYSA-N', - 'NUPPXHWHRPUCIJ-UHFFFAOYSA-N', - 'JEFGVHRSOIKILD-UHFFFAOYSA-N', - 'UYAJKEYVAXYMNW-UHFFFAOYSA-N', - 'IYBAQZSLFMVGQZ-UHFFFAOYSA-N', - 'TZIFVBRWFVXOKP-UHFFFAOYSA-N', - 'FYNYGUYHKZWCAU-PZJWPPBQSA-N', - 'WVLXVIZBRPTUKG-UHFFFAOYSA-N', - 'OVELZDQGMGPZBH-DHDCSXOGSA-N' - ] - -FASTA_SEQUENCE_A = 'ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDK' \ - 'DGNGYISAAELRHVMTNLGEKLTDEEVDEMIREADIDGDGQVNYEEFVQMMTAK*' - -FASTA_SEQUENCE_B = 'LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLVEWIWGGFSVDKATLNRFFAFHFIL' \ - 'PFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLGLLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYA' \ - 'ILRSVPNKLGGVLALFLSIVILGLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGXIENY' - -FASTA_SEQUENCE_A_ID = '>MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken\n' - -FASTA_SEQUENCE_B_ID = '>gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]\n' - -FASTA_SAMPLES = \ - [ - FASTA_SEQUENCE_A, - FASTA_SEQUENCE_B, - FASTA_SEQUENCE_A_ID + FASTA_SEQUENCE_A, - FASTA_SEQUENCE_B_ID + FASTA_SEQUENCE_B - ] diff --git a/app/free_text_parsing/parser.py b/app/free_text_parsing/parser.py deleted file mode 100644 index 0d7cceb04a530f2a0902e0650533e0455dd18507..0000000000000000000000000000000000000000 --- a/app/free_text_parsing/parser.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -Moudle that sets up the Appreggio parser -""" -from arpeggio import ParserPython, Optional, PTNodeVisitor, OneOrMore, ZeroOrMore, EOF -from arpeggio import RegExMatch as _ - -from app.free_text_parsing.grammar import common -from app.free_text_parsing.grammar import smiles -from app.free_text_parsing.grammar import inchi - - -def single_term(): - return common.correctly_parenthesised_non_space_char_sequence, common.term_end_lookahead - - -def exact_match_term(): - return ( - [ - ( - Optional(['+', '-']), - [ - ('"', _(r'((\\")|[^"])+'), '"'), - ("'", _(r"((\\')|[^'])+"), "'") - ] - ), - ( - ['+', '-'], common.correctly_parenthesised_non_space_char_sequence - ) - ], - common.term_end_lookahead - ) - - -def json_property_path_segment(): - return OneOrMore(_(r'[a-z0-9_\-]')) - - -def property_term(): - return ( - Optional(['+', '-']), - json_property_path_segment, ZeroOrMore('.', json_property_path_segment), ':', - [ - ('"', _('[^"]+'), '"'), - ("'", _("[^']+"), "'"), - ("(", _("[^\(\)]+"), ")"), - common.correctly_parenthesised_non_space_char_sequence - ], - common.term_end_lookahead - ) - - -def parenthesised_expression(): - return '(', expression, ')', common.term_end_lookahead - - -def expression_term(): - return [parenthesised_expression, - smiles.smiles, - inchi.inchi_key, inchi.inchi, - property_term, - exact_match_term, - single_term] - - -def expression(): - """ - :return: - """ - - return \ - ( - Optional(common.space_sequence), - expression_term, - ZeroOrMore( - # Optional( - # (common.space_sequence, _(r'and|or', ignore_case=True)) - # ), - common.space_sequence, - expression_term, - common.term_end_lookahead - ), - Optional(common.space_sequence) - ) - - -PARSER = ParserPython(expression, skipws=False) diff --git a/app/free_text_parsing/query_builder/__init__.py b/app/free_text_parsing/query_builder/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/app/free_text_parsing/query_builder/es_models.py b/app/free_text_parsing/query_builder/es_models.py deleted file mode 100644 index da352c83f33c12cc3cf73e17aea4743f9ca937b9..0000000000000000000000000000000000000000 --- a/app/free_text_parsing/query_builder/es_models.py +++ /dev/null @@ -1,28 +0,0 @@ -from typing import List -from app.es_data import es_data - -class ElasticSearchMultiSearchQuery: - - def __init__(self, index, body): - self.index = index - self.body = body - - -DATA_CONNECTION = 'data' - -def do_multi_search(queries: List[ElasticSearchMultiSearchQuery]): - try: - - multi_search_body = [] - for query_i in queries: - multi_search_body.append({'index': query_i.index}) - if query_i.body is None: - query_i.body = {} - query_i.body['track_total_hits'] = True - multi_search_body.append(query_i.body) - - result = es_data.do_multisearch(body=multi_search_body) - - return result - except Exception as e: - raise Exception('ERROR: can\'t retrieve elastic search data!') \ No newline at end of file diff --git a/app/free_text_parsing/query_builder/query_builder.py b/app/free_text_parsing/query_builder/query_builder.py deleted file mode 100644 index 58a9e5e865367042d806f432bc2350e9aaccd1a0..0000000000000000000000000000000000000000 --- a/app/free_text_parsing/query_builder/query_builder.py +++ /dev/null @@ -1,280 +0,0 @@ -from app.free_text_parsing.query_builder.es_models import ElasticSearchMultiSearchQuery, do_multi_search - - -# ---------------------------------------------------------------------------------------------------------------------- -# This class implements the functionalities to build an Elastic Search query -# ---------------------------------------------------------------------------------------------------------------------- -class QueryBuilder: - - TEXT_FIELDS_BOOSTS = [ - "*.std_analyzed^1.6", - "*.eng_analyzed^0.8", - "*.ws_analyzed^1.4", - "*.keyword^2", - "*.lower_case_keyword^1.5", - "*.alphanumeric_lowercase_keyword^1.3" - ] - - TEXT_PHRASE_PREFIX_BOOSTS = [ - "*.std_analyzed^1.6", - "*.eng_analyzed^0.8", - "*.ws_analyzed^1.4", - "*.lower_case_keyword^1.5", - "*.alphanumeric_lowercase_keyword^1.3" - ] - - ID_FIELDS_BOOSTS = [ - "*.entity_id^2", - "*.id_reference^1.5", - "*.chembl_id^2", - "*.chembl_id_reference^1.5" - ] - - @staticmethod - def get_es_text_term_queries(query_string, fuzzy, minimum_should_match): - queries = [ - { - 'multi_match': { - 'type': 'most_fields', - 'fields': QueryBuilder.TEXT_FIELDS_BOOSTS, - 'query': query_string, - 'minimum_should_match': '{0}%'.format(minimum_should_match), - 'boost': 10, - 'fuzziness': 'AUTO' if fuzzy else 0 - } - }, - { - 'multi_match': { - 'type': 'best_fields', - 'fields': QueryBuilder.TEXT_FIELDS_BOOSTS, - 'query': query_string, - 'minimum_should_match': '{0}%'.format(minimum_should_match), - 'boost': 2, - 'fuzziness': 'AUTO' if fuzzy else 0 - } - } - ] - if not'fuzzy': - queries.append( - { - 'multi_match': { - 'type': 'phrase', - 'fields': QueryBuilder.TEXT_FIELDS_BOOSTS, - 'query': query_string, - 'minimum_should_match': '{0}%'.format(minimum_should_match), - 'boost': 1.5 - } - } - ) - queries.append( - { - 'multi_match': { - 'type': 'phrase_prefix', - 'fields': QueryBuilder.TEXT_PHRASE_PREFIX_BOOSTS, - 'query': query_string, - 'minimum_should_match': '{0}%'.format(minimum_should_match) - } - } - ) - return queries - - @staticmethod - def get_es_id_term_queries(terms, fuzzy, minimum_should_match): - queries = [] - for term_i in terms: - if len(term_i) >= 3: - queries.append( - { - 'multi_match': { - 'type': 'most_fields', - 'fields': QueryBuilder.ID_FIELDS_BOOSTS, - 'query': term_i, - 'fuzziness': 'AUTO' if fuzzy else 0, - 'boost': 10 - } - } - ) - return queries - - @staticmethod - def get_es_query_for(chembl_ids, terms, filter_terms, sub_queries, fuzzy, minimum_should_match, - boosted_es_keys, cur_es_key, is_or=True): - - query_string = ' '.join(terms) - filter_terms_joined = ' AND '.join(filter_terms) - query = { - 'bool': { - 'boost': 10**9 if cur_es_key in boosted_es_keys else 1, - 'must': { - 'bool': { - 'should': [], - 'must': [] - }, - } - } - } - bool_query = 'should' if is_or else 'must' - if query_string: - query['bool']['must']['bool'][bool_query] += QueryBuilder.get_es_text_term_queries( - query_string, fuzzy, minimum_should_match - ) - query['bool']['must']['bool'][bool_query] += QueryBuilder.get_es_id_term_queries( - terms, fuzzy, minimum_should_match - ) - - if chembl_ids: - delta = 0.3/len(chembl_ids) - chembl_ids_et = [] - for i, c_id_i in enumerate(chembl_ids): - chembl_ids_et.append('"{0}"^{1}'.format(c_id_i, (1.3-i*delta))) - if len(chembl_ids_et) > 0: - query['bool']['must']['bool'][bool_query].append( - { - 'query_string': { - 'fields': QueryBuilder.TEXT_FIELDS_BOOSTS + QueryBuilder.ID_FIELDS_BOOSTS, - 'query': ' '.join(chembl_ids_et), - 'allow_leading_wildcard': False, - 'fuzziness': 0, - # 'use_dis_max': False, - } - } - ) - - if filter_terms_joined: - query['bool']['filter'] = [] - query['bool']['filter'].append( - { - 'query_string': { - 'fields': QueryBuilder.TEXT_FIELDS_BOOSTS + QueryBuilder.ID_FIELDS_BOOSTS, - 'query': filter_terms_joined - } - } - ) - if sub_queries: - query['bool']['must']['bool'][bool_query] = query['bool']['must']['bool'][bool_query] + sub_queries - return query - - @staticmethod - def build_parsed_query_recursive(cur_parsed_query, chembl_ids, terms, filter_terms, fuzzy, minimum_should_match, - boosted_es_keys, cur_es_key): - - # Query tree leafs - if 'term' in cur_parsed_query: - # TODO - # if cur_parsed_query['chembl_entity'] and cur_parsed_query['chembl_entity'] in 'glados.Settings.SEARCH_PATH_2_ES_KEY': - # boosted_es_keys.add('glados.Settings.SEARCH_PATH_2_ES_KEY[cur_parsed_query['chembl_entity']]') - # return None - if cur_parsed_query['include_in_query']: - if cur_parsed_query['exact_match_term']: - terms.append(cur_parsed_query['term']) - filter_terms.append(cur_parsed_query['term']) - elif cur_parsed_query['filter_term']: - filter_terms.append(cur_parsed_query['term']) - else: - terms.append(cur_parsed_query['term']) - for ref_i in cur_parsed_query['references']: - if ref_i['include_in_query']: - for chembl_id_i in ref_i['chembl_ids']: - if chembl_id_i['include_in_query']: - chembl_ids.append(chembl_id_i['chembl_id']) - return None - - chembl_ids = [] - terms = [] - filter_terms = [] - boosted_es_keys = set() - - next_terms = [] - cur_type = None - if 'or' in cur_parsed_query: - next_terms = cur_parsed_query['or'] - cur_type = 'or' - if 'and' in cur_parsed_query: - next_terms = cur_parsed_query['and'] - cur_type = 'and' - - inner_queries = [] - for term_i in next_terms: - term_query = QueryBuilder.build_parsed_query_recursive( - term_i, chembl_ids, terms, filter_terms, fuzzy, minimum_should_match, boosted_es_keys, cur_es_key - ) - if term_query: - inner_queries.append(term_query) - return QueryBuilder.get_es_query_for( - chembl_ids, terms, filter_terms, inner_queries, fuzzy, minimum_should_match, boosted_es_keys, cur_es_key, - cur_type == 'or' - ) - - @staticmethod - def get_es_query_for_json_query(json_query, cur_es_key='', fuzzy=False, minimum_should_match=100): - chembl_ids = [] - terms = [] - filter_terms = [] - boosted_es_keys = set() - es_query = QueryBuilder.build_parsed_query_recursive( - json_query, chembl_ids, terms, filter_terms, fuzzy, minimum_should_match, boosted_es_keys, cur_es_key - ) - if not es_query: - es_query = QueryBuilder.get_es_query_for( - chembl_ids, terms, filter_terms, [], fuzzy, minimum_should_match, boosted_es_keys, cur_es_key - ) - return es_query - - @staticmethod - def get_best_es_query(json_query, indexes: list, cur_es_key=None): - es_base_queries = [] - cur_min_should_match = 100 - - while cur_min_should_match > 0: - es_query_i = QueryBuilder.get_es_query_for_json_query(json_query, cur_es_key, False, - cur_min_should_match) - es_base_queries.append(es_query_i) - cur_min_should_match -= 20 - - es_base_queries.append( - QueryBuilder.get_es_query_for_json_query( - json_query, cur_es_key, True, 40 - ) - ) - queries = [] - for index in indexes: - for es_query_i in es_base_queries: - # it is necessary to request at least 1 document to get the max_score value - queries.append(ElasticSearchMultiSearchQuery(index, { - 'size': 1, - '_source': ['_id'], - 'query': es_query_i - })) - - results = do_multi_search(queries)['responses'] - - best_queries = {} - for i, es_index_i in enumerate(indexes): - best_query_i = None - best_query_i_total = 0 - best_query_i_score = 0 + (len(indexes)-i)/(10**(len(es_base_queries)+1)) - j = 0 - while best_query_i is None and j < len(es_base_queries): - if results[i*len(es_base_queries) + j]['hits']['total']['value'] > 0: - best_query_i = es_base_queries[j] - best_query_i_total = results[i * len(es_base_queries) + j]['hits']['total']['value'] - best_query_i_score += results[i * len(es_base_queries) + j]['hits']['max_score']/(10**j) - if es_index_i == 'chembl_target': - best_query_i_score *= 100 - if es_index_i == 'chembl_molecule': - best_query_i_score *= 1000 - j += 1 - if best_query_i is None: - best_query_i = es_base_queries[0] - best_queries[es_index_i] = { - 'query': best_query_i, - 'total': best_query_i_total, - 'max_score': best_query_i_score - } - - sorted_indexes_by_score = sorted(best_queries.keys(), key=lambda key: best_queries[key]['max_score'], reverse=True) - for i, es_index_i in enumerate(indexes): - best_queries[es_index_i]['max_score'] -= (len(indexes)-i)/(10**(len(es_base_queries)+1)) - best_queries[es_index_i]['max_score'] = round(best_queries[es_index_i]['max_score'], len(es_base_queries)) - - return best_queries, sorted_indexes_by_score diff --git a/app/free_text_parsing/terms_visitor.py b/app/free_text_parsing/terms_visitor.py deleted file mode 100644 index 20e80354ad32d7e2d1cfa974dd915ed8170b97e2..0000000000000000000000000000000000000000 --- a/app/free_text_parsing/terms_visitor.py +++ /dev/null @@ -1,353 +0,0 @@ -""" -Terms Visitor -""" -import urllib -import traceback -import re - -from arpeggio import PTNodeVisitor -import arpeggio -import requests -from urllib.parse import urlparse - -from app.free_text_parsing.grammar import inchi -from app.config import RUN_CONFIG - -WS_URL = 'https://www.ebi.ac.uk/chembl/api/data' -WS_PARSED_URL = urlparse(WS_URL) -WS_DOMAIN = WS_PARSED_URL.scheme + '://' + WS_PARSED_URL.netloc - -ELASTICSEARCH_EXTERNAL_URL = 'https://www.ebi.ac.uk/chembl/glados-es' -CHEMBL_ES_INDEX_PREFIX = 'chembl_' - -__CHEMBL_REGEX_STR = r'^chembl[^\d\s]{0,2}([\d]+)[^\d\s]{0,2}$' -CHEMBL_REGEX = re.compile(__CHEMBL_REGEX_STR, flags=re.IGNORECASE) - -__DOI_REGEX_STR = r'^(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>|])\S)+)$' -DOI_REGEX = re.compile(__DOI_REGEX_STR) - -INTEGER_REGEX = re.compile(r'^\d+$') - -CHEMBL_ENTITIES = { - 'target': 'targets', - 'compound': 'compounds', - 'molecule': 'compounds', - 'document': 'documents', - 'assay': 'assays', - 'cell': 'cells', - 'tissue': 'tissues' -} - - -def check_chembl_entities(term_dict: dict): - term = term_dict['term'].lower() - if len(term) > 0 and term[-1] == 's': - term = term[0:-1] - if term in CHEMBL_ENTITIES: - term_dict['chembl_entity'] = CHEMBL_ENTITIES[term] - - -def check_doi(term_dict: dict): - re_match = DOI_REGEX.match(term_dict['term']) - if re_match is not None: - try: - chembl_ids = [] - response = requests.get( - '{es_url}/{index_prefix}document/_search'.format( - es_url=ELASTICSEARCH_EXTERNAL_URL, - index_prefix=CHEMBL_ES_INDEX_PREFIX - ), - json= - { - 'size': 10, - '_source': 'document_chembl_id', - 'query': { - 'term': { - 'doi': { - 'value': term_dict['term'] - } - } - } - }, - timeout=5 - ) - json_response = response.json() - for hit_i in json_response['hits']['hits']: - chembl_ids.append(hit_i['_source']['document_chembl_id']) - if chembl_ids: - term_dict['references'].append( - { - 'type': 'doi', - 'label': 'DOI (Digital Object Identifier)', - 'chembl_ids': get_chembl_id_list_dict(chembl_ids), - 'include_in_query': True, - 'chembl_entity': 'document' - } - ) - except: - traceback.print_exc() - - -def check_integer(term_dict: dict): - re_match = INTEGER_REGEX.match(term_dict['term']) - if re_match is not None: - term_dict['references'].append( - { - 'type': 'integer_chembl_id', - 'label': 'Integer as ChEMBL ID', - 'chembl_ids': [ - get_chembl_id_dict('CHEMBL{0}'.format(term_dict['term'])) - ], - 'include_in_query': True - } - ) - - -def check_chembl(term_dict: dict): - re_match = CHEMBL_REGEX.match(term_dict['term']) - if re_match is not None: - chembl_id_num = re_match.group(1) - term_dict['references'].append( - { - 'type': 'chembl_id', - 'label': 'ChEMBL ID', - 'chembl_ids': [ - get_chembl_id_dict('CHEMBL{0}'.format(chembl_id_num)) - ], - 'include_in_query': True - } - ) - - -def adjust_exact_term(exact_term: str) -> str: - if exact_term[-1] == '"': - return exact_term - elif exact_term[-1] == "'": - first_char = 1 - prefix = "" - if exact_term[0] == '+' or exact_term[0] == '-': - first_char = 2 - prefix = exact_term[0] - return prefix+'"'+exact_term[first_char:-1].replace(r"\'", r'\"')+'"' - else: - return exact_term[0]+'"'+exact_term[1:]+'"' - - -def check_inchi(term_dict: dict, term_is_inchi_key=False): - try: - chembl_ids = [] - response = requests.get( - '{es_url}/{index_prefix}molecule/_search'.format( - es_url=ELASTICSEARCH_EXTERNAL_URL, - index_prefix=CHEMBL_ES_INDEX_PREFIX - ), - json= - { - 'size': 10, - '_source': 'molecule_chembl_id', - 'query': { - 'term': { - 'molecule_structures.standard_inchi'+('_key' if term_is_inchi_key else ''): { - 'value': term_dict['term'] - } - } - } - }, - timeout=5 - ) - json_response = response.json() - for hit_i in json_response['hits']['hits']: - chembl_ids.append(hit_i['_source']['molecule_chembl_id']) - if chembl_ids: - term_dict['references'].append( - { - 'type': 'inchi'+('_key' if term_is_inchi_key else ''), - 'label': 'InChI'+(' Key' if term_is_inchi_key else ''), - 'chembl_ids': get_chembl_id_list_dict(chembl_ids), - 'include_in_query': True, - 'chembl_entity': 'compound' - } - ) - except: - traceback.print_exc() - - -def check_unichem(term_dict: dict): - pass - -def get_chembl_id_dict(chembl_id, cross_references=[], include_in_query=True, score=None): - return { - 'chembl_id': chembl_id, - 'cross_references': cross_references, - 'include_in_query': include_in_query, - 'score': score - } - -def get_chembl_id_list_dict(chembl_ids, cross_references=[], include_in_query=True): - return [ - get_chembl_id_dict( - chembl_id_i, - cross_references[i] if i < len(cross_references) else [], - include_in_query - ) - for i, chembl_id_i in enumerate(chembl_ids) - ] - -def check_smiles(term_dict: dict): - - ws_base_path = RUN_CONFIG.get('chembl_api').get('ws_url') - try: - chembl_ids = [] - next_url_path = '{ws_path}/molecule.json?molecule_structures__canonical_smiles__flexmatch={smiles}'\ - .format(ws_path=ws_base_path, smiles=urllib.parse.quote(term_dict['term'])) - while next_url_path: - response = requests.get( - WS_DOMAIN + next_url_path, - headers={'Accept': 'application/json'}, - timeout=5 - ) - json_response = response.json() - if 'error_message' in json_response: - return None - for molecule_i in json_response['molecules']: - chembl_ids.append(molecule_i['molecule_chembl_id']) - next_url_path = json_response['page_meta']['next'] - if chembl_ids: - term_dict['references'].append( - { - 'type': 'smiles', - 'label': 'SMILES', - 'chembl_ids': get_chembl_id_list_dict(chembl_ids), - 'include_in_query': True, - 'chembl_entity': 'compound' - } - ) - except: - traceback.print_exc() - - -class TermsVisitor(PTNodeVisitor): - - def __init__(self): - super().__init__() - - def visit__default__(self, node, children): - """ - Called if no visit method is defined for the node. - - Args: - node(ParseTreeNode): - children(processed children ParseTreeNode-s): - """ - if isinstance(node, arpeggio.Terminal): - return arpeggio.text(node) - else: - # term = ''.join([str(child_i) for child_i in children]) - # check_unichem(term) - return ''.join([str(child_i) for child_i in children]) - - def visit_expression_term(self, node, children): - return children[0] - - def visit_parenthesised_expression(self, node, children): - return children[1] - - def visit_expression(self, node, children): - exp = {'or': []} - previous_single_term_lc = None - for child_i in children: - str_child_i_lc = str(child_i).strip().lower() - term_dict = None - if len(str_child_i_lc) > 0: - - if str_child_i_lc == 'and' or str_child_i_lc == 'or': - term_dict = self.get_term_dict(str(child_i).strip(), include_in_query=False) - check_unichem(term_dict) - last_term_is_and_group = len(exp['or']) > 0 and type(exp['or'][-1]) == dict and 'and' in exp['or'][-1] - if str_child_i_lc == 'and' and not last_term_is_and_group: - if len(exp['or']) > 0: - exp['or'][-1] = {'and': [exp['or'][-1], term_dict]} - else: - exp['or'].append({'and': [term_dict]}) - elif last_term_is_and_group and (str_child_i_lc == 'and' or previous_single_term_lc == 'and'): - if term_dict: - exp['or'][-1]['and'].append(term_dict) - else: - exp['or'][-1]['and'].append(child_i) - else: - if term_dict: - exp['or'].append(term_dict) - else: - exp['or'].append(child_i) - previous_single_term_lc = str_child_i_lc - if len(exp['or']) == 1: - return exp['or'][0] - return exp - - @staticmethod - def get_term_dict(term: str, include_in_query=True) -> dict: - return { - 'term': term, - 'include_in_query': include_in_query, - 'references': [], - 'exact_match_term': False, - 'filter_term': False, - 'chembl_entity': None - } - - def visit_smiles(self, node, children): - term = ''.join(children) - include_in_query = len(term) <= 4 - term_dict = self.get_term_dict(term, include_in_query=include_in_query) - check_smiles(term_dict) - if include_in_query: - check_unichem(term_dict) - if inchi.is_inchi_key(term): - check_inchi(term_dict) - return term_dict - - def visit_inchi(self, node, children): - term = ''.join(children) - term_dict = self.get_term_dict(term, include_in_query=False) - check_inchi(term_dict) - return term_dict - - def visit_inchi_key(self, node, children): - term = ''.join(children) - term_dict = self.get_term_dict(term, include_in_query=False) - check_inchi(term_dict, term_is_inchi_key=True) - return term_dict - - def visit_fasta(self, node, children): - term = ''.join(children) - term_dict = self.get_term_dict(term, include_in_query=False) - # check_fasta(term_dict) - return term_dict - - def visit_property_term(self, node, children): - term = ''.join(children) - term_dict = self.get_term_dict(term) - term_dict['filter_term'] = True - return term_dict - - def visit_exact_match_term(self, node, children): - term = ''.join(children) - term = adjust_exact_term(term) - term_dict = self.get_term_dict(term) - term_dict['exact_match_term'] = True - return term_dict - - def visit_single_term(self, node, children): - term = ''.join(children) - term_lc = term.lower() - if term_lc == 'or' or term_lc == 'and': - return term - term_dict = self.get_term_dict(term) - check_unichem(term_dict) - check_chembl(term_dict) - check_integer(term_dict) - check_doi(term_dict) - check_chembl_entities(term_dict) - return term_dict - -TERMS_VISITOR = TermsVisitor() \ No newline at end of file diff --git a/functional_tests/run_functional_tests.py b/functional_tests/run_functional_tests.py index e348d93af6ffb1f7b72f773b8a1d5842b198e047..4e69882c0f794f368e2bdb75422fb83b1885a1ba 100755 --- a/functional_tests/run_functional_tests.py +++ b/functional_tests/run_functional_tests.py @@ -8,7 +8,7 @@ import os from specific_tests import fun_test_simple_query, fun_test_property_config, \ fun_test_group_config, fun_test_facets_group_config, fun_test_id_properties, \ - fun_test_search_parsing, fun_test_url_shortening, fun_test_element_usage, \ + fun_test_url_shortening, fun_test_element_usage, \ fun_test_go_slim_target_classification, fun_test_in_vivo_assay_classification, \ fun_test_drug_indications_by_phase, \ fun_test_organism_taxonomy_target_classification, fun_test_protein_target_classification, \ @@ -50,7 +50,7 @@ def run(): fun_test_get_es_data, fun_test_get_document_by_custom_id_prop, fun_test_id_properties, - fun_test_get_context_data, fun_test_search_parsing, fun_test_url_shortening, + fun_test_get_context_data, fun_test_url_shortening, fun_test_element_usage, fun_test_go_slim_target_classification, fun_test_in_vivo_assay_classification, fun_test_drug_indications_by_phase, diff --git a/functional_tests/specific_tests/fun_test_search_parsing.py b/functional_tests/specific_tests/fun_test_search_parsing.py deleted file mode 100644 index cf61e0111760095537fce0342308ee27f16f6212..0000000000000000000000000000000000000000 --- a/functional_tests/specific_tests/fun_test_search_parsing.py +++ /dev/null @@ -1,35 +0,0 @@ -# pylint: disable=import-error,unused-argument -""" -Module that tests parsing a search term -""" -import requests - -from specific_tests import utils - - -def run_test(server_base_url, delayed_jobs_server_base_path): - """ - Tests that parsing a search term works - :param server_base_url: base url of the running server. E.g. http://127.0.0.1:5000 - :param delayed_jobs_server_base_path: base path for the delayed_jobs - """ - - print('-------------------------------------------') - print('Testing parsing s search') - print('-------------------------------------------') - - term_params = { - 'search_term': 'MDCK', - 'es_indexes': 'chembl_molecule,chembl_target,chembl_assay,chembl_document,chembl_cell_line,chembl_tissue', - 'is_test': True, - } - - url = f'{server_base_url}/search_parsing/parse_free_text_search' - - request = requests.post(url, data=term_params) - - status_code = request.status_code - print(f'status_code: {status_code}') - response_text = request.text - utils.print_es_response(response_text) - assert status_code == 200, 'The request failed!'