Commit 46012ec1 authored by David Mendez's avatar David Mendez
Browse files

Merge branch 'staging' into 'master'

Do Entities Join in Backend

See merge request !70
parents cfcb8c53 323f60c1
......@@ -137,4 +137,7 @@ app/static/
/config.yml
/config_playground.yml
output.csv
\ No newline at end of file
output.csv
deleteme
deleteme.json
\ No newline at end of file
......@@ -16,6 +16,7 @@ from app.blueprints.url_shortening.controllers.url_shortening_controller import
from app.blueprints.element_usage_blueprint.controllers.element_usage_controller import ELEMENT_USAGE_BLUEPRINT
from app.blueprints.visualisation_data.controllers.visualisation_data_controller import VISUALISATION_DATA_BLUEPRINT
from app.blueprints.utils.controllers.utils_controller import UTILS_BLUEPRINT
from app.blueprints.entities_join.controllers.entities_join_controller import ENTITIES_JOIN_BLUEPRINT
def create_app():
......@@ -43,15 +44,56 @@ def create_app():
CACHE.cache._client.behaviors['retry_timeout'] = 10
CACHE.cache._client.behaviors['retry_timeout'] = 600
flask_app.register_blueprint(SWAGGER_BLUEPRINT, url_prefix=f'{base_path}/swagger')
flask_app.register_blueprint(ES_PROXY_BLUEPRINT, url_prefix=f'{base_path}/es_data')
flask_app.register_blueprint(PROPERTIES_CONFIG_BLUEPRINT, url_prefix=f'{base_path}/properties_configuration')
flask_app.register_blueprint(CONTEXTS_BLUEPRINT, url_prefix=f'{base_path}/contexts')
flask_app.register_blueprint(SEARCH_PARSER_BLUEPRINT, url_prefix=f'{base_path}/search_parsing')
flask_app.register_blueprint(URL_SHORTENING_BLUEPRINT, url_prefix=f'{base_path}/url_shortening')
flask_app.register_blueprint(ELEMENT_USAGE_BLUEPRINT, url_prefix=f'{base_path}/frontend_element_usage')
flask_app.register_blueprint(VISUALISATION_DATA_BLUEPRINT, url_prefix=f'{base_path}/visualisations')
flask_app.register_blueprint(UTILS_BLUEPRINT, url_prefix=f'{base_path}/utils')
blueprints_urls = [
{
'blueprint': SWAGGER_BLUEPRINT,
'url': '/swagger'
},
{
'blueprint': ES_PROXY_BLUEPRINT,
'url': '/es_data'
},
{
'blueprint': PROPERTIES_CONFIG_BLUEPRINT,
'url': '/properties_configuration'
},
{
'blueprint': CONTEXTS_BLUEPRINT,
'url': '/contexts'
},
{
'blueprint': SEARCH_PARSER_BLUEPRINT,
'url': '/search_parsing'
},
{
'blueprint': URL_SHORTENING_BLUEPRINT,
'url': '/url_shortening'
},
{
'blueprint': ELEMENT_USAGE_BLUEPRINT,
'url': '/frontend_element_usage'
},
{
'blueprint': VISUALISATION_DATA_BLUEPRINT,
'url': '/visualisations'
},
{
'blueprint': UTILS_BLUEPRINT,
'url': '/utils'
},
{
'blueprint': ENTITIES_JOIN_BLUEPRINT,
'url': '/entities_join'
}
]
for blueprint_desc in blueprints_urls:
blueprint = blueprint_desc['blueprint']
url = blueprint_desc['url']
flask_app.register_blueprint(blueprint, url_prefix=f'{base_path}/{url}')
print(f'{url} LOADED')
print(f'LOADED {len(blueprints_urls)} BLUEPRINTS')
return flask_app
......
"""
The blueprint used for handling requests to do joins of entities
"""
from flask import Blueprint, jsonify, abort, request
from app.request_validation.decorators import validate_form_with
from app.blueprints.entities_join.controllers import marshmallow_schemas
from app.blueprints.entities_join.services import entities_join_service
from app import app_logging
from utils import request_parameters
ENTITIES_JOIN_BLUEPRINT = Blueprint('entities_join', __name__)
@ENTITIES_JOIN_BLUEPRINT.route('/get_link_to_related_items', methods=['POST'])
@validate_form_with(marshmallow_schemas.EntitiesJoinQuery)
def get_link_to_related_items():
    """
    Handles the POST requests asking for the tiny url hash that points to the items related to a selection.

    Every parameter is read from the request form and passed through request_parameters.sanitise_parameter
    before being handed to the entities join service.

    :return: the service response as JSON. Aborts with a 500 if the service raises EntitiesJoinServiceError
    """
    form_data = request.form

    def read_form_parameter(parameter_name):
        """Reads one parameter from the submitted form and sanitises it."""
        return request_parameters.sanitise_parameter(form_data.get(parameter_name))

    state_template = read_form_parameter('destination_entity_browser_state_template')
    origin_entity = read_form_parameter('entity_from')
    destination_entity = read_form_parameter('entity_to')
    raw_query = read_form_parameter('es_query')
    description = read_form_parameter('selection_description')
    origin_hash = read_form_parameter('previous_hash')

    # previous_hash is not part of the debug output
    logged_parameters = (
        ('destination_entity_browser_state_template', state_template),
        ('entity_from', origin_entity),
        ('entity_to', destination_entity),
        ('es_query', raw_query),
        ('selection_description', description),
    )
    for label, parameter_value in logged_parameters:
        app_logging.debug(f'{label}: {parameter_value}')

    try:
        json_response = entities_join_service.get_tiny_hash_to_related_items(
            state_template, origin_entity, destination_entity, raw_query, description, origin_hash)
        return jsonify(json_response)
    except entities_join_service.EntitiesJoinServiceError as error:
        app_logging.debug(str(error))
        abort(500, f'Internal server error: {str(error)}')
"""
Schemas to validate the input of entities join Endpoint
"""
from marshmallow import Schema, fields
class EntitiesJoinQuery(Schema):
    """
    Schema describing the form parameters accepted by the entities join endpoint
    """
    # Parameters declared as required strings
    destination_entity_browser_state_template = fields.String(required=True)
    entity_from = fields.String(required=True)
    entity_to = fields.String(required=True)
    es_query = fields.String(required=True)
    selection_description = fields.String(required=True)
    # Optional parameter: it is not declared with required=True
    previous_hash = fields.String()
"""
Service that handles the requests to the entities join
"""
from app.entities_joiner import entities_joiner
class EntitiesJoinServiceError(Exception):
    """Base class for exceptions in this file. Raised when the entities joiner reports an error."""
def get_tiny_hash_to_related_items(destination_entity_browser_state_template,
                                   entity_from, entity_to, es_query,
                                   selection_description, previous_hash):
    """
    Delegates the join to the entities joiner and wraps the resulting hash in a dict.

    :param destination_entity_browser_state_template: template for building the resulting browser url
    :param entity_from: source entity of the items
    :param entity_to: destination entity of the join
    :param es_query: query in elasticsearch for the dataset
    :param selection_description: stringified javascript object describing the selection of items in the dataset
    :param previous_hash: hash of the state that originated this join of entities
    :return: a dict with the tiny hash under the key 'tiny_hash'
    :raises EntitiesJoinServiceError: when the entities joiner raises EntitiesJoinerError
    """
    join_arguments = (destination_entity_browser_state_template, entity_from, entity_to, es_query,
                      selection_description, previous_hash)
    try:
        generated_hash = entities_joiner.get_tiny_hash_to_related_items(*join_arguments)
    except entities_joiner.EntitiesJoinerError as error:
        raise EntitiesJoinServiceError(error)

    return {'tiny_hash': generated_hash}
......@@ -8,7 +8,7 @@ from app.blueprints.es_proxy.controllers import marshmallow_schemas
from app.blueprints.es_proxy.services import es_proxy_service
from app import app_logging
from app.http_cache import http_cache_utils
from utils import request_parameters
ES_PROXY_BLUEPRINT = Blueprint('es_proxy', __name__)
......@@ -21,10 +21,10 @@ def get_es_data():
"""
form_data = request.form
index_name = sanitise_parameter(form_data.get('index_name'))
raw_es_query = sanitise_parameter(form_data.get('es_query'))
raw_context = sanitise_parameter(form_data.get('context_obj'))
raw_contextual_sort_data = sanitise_parameter(form_data.get('contextual_sort_data'))
index_name = request_parameters.sanitise_parameter(form_data.get('index_name'))
raw_es_query = request_parameters.sanitise_parameter(form_data.get('es_query'))
raw_context = request_parameters.sanitise_parameter(form_data.get('context_obj'))
raw_contextual_sort_data = request_parameters.sanitise_parameter(form_data.get('contextual_sort_data'))
app_logging.debug(f'index_name: {index_name}')
app_logging.debug(f'raw_es_query: {raw_es_query}')
......@@ -65,14 +65,3 @@ def get_es_doc(index_name, doc_id):
abort(500, msg=f'Internal server error: {str(error)}')
except es_proxy_service.ESDataNotFoundError as error:
abort(404)
def sanitise_parameter(param_value):
    """
    Turns the texts 'null' and 'undefined' into None; in some cases javascript produces those values.

    :param param_value: value of the parameter
    :return: None if param_value is 'null' or 'undefined', the value received otherwise
    """
    javascript_empty_values = ('null', 'undefined')
    return None if param_value in javascript_empty_values else param_value
"""
Module with functions to make joins of entities
"""
import json
import base64
from datetime import datetime
import hashlib
from app.entities_joiner import standardisation
from app.entities_joiner import ids_loader
from app.url_shortening import url_shortener
from app.config import RUN_CONFIG
from app.usage_statistics import statistics_saver
from app import app_logging
from app import cache
class EntitiesJoinerError(Exception):
    """Base class for exceptions in this file. Raised when join parameters are invalid or not configured."""
def get_tiny_hash_to_related_items(destination_entity_browser_state_template,
                                   raw_origin_entity, raw_destination_entity, raw_es_query,
                                   raw_selection_description, previous_hash):
    """
    Produces the hash of the state that shows the items related to a selection, using the cache when possible.

    :param destination_entity_browser_state_template: template for building the resulting browser url
    :param raw_origin_entity: text with the origin entity of the items
    :param raw_destination_entity: text with the destination entity of the join
    :param raw_es_query: stringified query in elasticsearch for the dataset
    :param raw_selection_description: stringified javascript object describing the selection of items in the dataset
    :param previous_hash: hash of the state that originated this join of entities
    :return: the hash to the link with the generated state
    """
    cache_key = get_cache_key(destination_entity_browser_state_template, raw_origin_entity, raw_destination_entity,
                              raw_es_query, raw_selection_description, previous_hash)
    app_logging.debug(f'entities join cache_key: {cache_key}')

    cached_hash = cache.fail_proof_get(key=cache_key)
    if cached_hash is not None:
        # Cache hit: record it in the statistics, with no id count, and answer straight away
        save_statistics(raw_origin_entity, raw_destination_entity, None, True)
        return cached_hash

    entities = parse_origin_destination_parameters(raw_origin_entity, raw_destination_entity)
    origin_entity = entities['parsed_origin_entity']
    destination_entity = entities['parsed_destination_entity']

    join_parameters = parse_join_parameters(origin_entity, destination_entity, raw_selection_description)
    ids = ids_loader.load_ids_for_query(
        json.loads(raw_es_query),
        join_parameters['selection_description'],
        join_parameters['origin_property'],
        join_parameters['index_name'],
    )

    query_parameters = parse_query_parameters(origin_entity, destination_entity)
    join_state_hash = get_join_state_hash(
        ids,
        query_parameters['destination_query_generator'],
        query_parameters['settings_path'],
        query_parameters['browser_name'],
        destination_entity_browser_state_template,
        previous_hash,
    )

    cache_seconds = RUN_CONFIG.get('entities_join_cache_seconds')
    cache.fail_proof_set(key=cache_key, value=join_state_hash, timeout=cache_seconds)
    save_statistics(raw_origin_entity, raw_destination_entity, len(ids), False)

    return join_state_hash
def parse_origin_destination_parameters(raw_origin_entity, raw_destination_entity):
    """
    Parses the texts of the origin and destination entities, fails if they are equal or not recognised.

    :param raw_origin_entity: text with the origin entity of the items
    :param raw_destination_entity: text with the destination entity of the join
    :return: a dict with the keys 'parsed_origin_entity' and 'parsed_destination_entity'
    :raises EntitiesJoinerError: when both entities are the same or one of them is not a possible value
    """
    if raw_destination_entity == raw_origin_entity:
        raise EntitiesJoinerError(
            f'entity_to ({raw_destination_entity}) and entity_from ({raw_origin_entity}) cannot be the same!')

    try:
        origin_entity = standardisation.PossibleOriginEntities(raw_origin_entity)
    except ValueError as error:
        possible_origins = [item.value for item in standardisation.PossibleOriginEntities]
        raise EntitiesJoinerError(f'entity_from: {str(error)}. Possible values are {possible_origins}')

    try:
        destination_entity = standardisation.PossibleDestinationEntities(raw_destination_entity)
    except ValueError as error:
        possible_destinations = [item.value for item in standardisation.PossibleDestinationEntities]
        raise EntitiesJoinerError(f'entity_to: {str(error)}. Possible values are {possible_destinations}')

    return {
        'parsed_origin_entity': origin_entity,
        'parsed_destination_entity': destination_entity,
    }
def parse_selection_description_dict(raw_selection_description):
    """
    Parses the selection description from the text entered, fails if invalid.

    Every validation failure is reported as EntitiesJoinerError, including text that is not valid JSON,
    JSON that is not an object and a missing 'selectionMode', so callers only need to handle that exception.

    :param raw_selection_description: stringified javascript object describing the selection of items in the dataset
    :return: a dict with the selection description entered
    :raises EntitiesJoinerError: when the description cannot be parsed or is not a valid selection
    """
    try:
        selection_description_dict = json.loads(raw_selection_description)
    except (TypeError, ValueError) as error:
        # TypeError covers a None input, ValueError covers json.JSONDecodeError
        raise EntitiesJoinerError(f'selection_description is not valid JSON: {str(error)}') from error

    if not isinstance(selection_description_dict, dict):
        raise EntitiesJoinerError('selection_description must be a JSON object')

    possible_modes = [item.value for item in standardisation.SelectionModes]
    if 'selectionMode' not in selection_description_dict:
        raise EntitiesJoinerError(f'selectionMode: is missing. Possible values are {possible_modes}')

    try:
        parsed_selection_mode = standardisation.SelectionModes(selection_description_dict['selectionMode'])
    except ValueError as error:
        raise EntitiesJoinerError(
            f'selectionMode: {str(error)}. '
            f'Possible values are {possible_modes}'
        ) from error

    # Selecting no items except some only makes sense when at least one exception is listed
    if parsed_selection_mode == standardisation.SelectionModes.NO_ITEMS_EXCEPT:
        if not selection_description_dict.get('exceptions'):
            raise EntitiesJoinerError(
                f'When selection mode is {parsed_selection_mode} there must be at least one exception')

    return selection_description_dict
def parse_join_parameters(parsed_origin_entity, parsed_destination_entity, raw_selection_description):
    """
    Parses the parameters for the join, fails if they are invalid.

    :param parsed_origin_entity: origin entity of the join
    :param parsed_destination_entity: destination entity of the join
    :param raw_selection_description: stringified javascript object describing the selection
    :return: a dict with the keys 'index_name', 'origin_property' and 'selection_description'
    :raises EntitiesJoinerError: when there is no index name or origin property for the entities,
        or when the selection description is invalid
    """
    index_name = standardisation.get_index_name_for_origin_entity(parsed_origin_entity)
    fail_if_null(index_name, 'index name', parsed_origin_entity, parsed_destination_entity)

    origin_property = standardisation.get_origin_property(parsed_origin_entity, parsed_destination_entity)
    # The destination entity itself is passed (not its name as a text) so the error message shows the real entity
    fail_if_null(origin_property, 'origin property', parsed_origin_entity, parsed_destination_entity)

    selection_description = parse_selection_description_dict(raw_selection_description)

    return {
        'index_name': index_name,
        'origin_property': origin_property,
        'selection_description': selection_description,
    }
def parse_query_parameters(parsed_origin_entity, parsed_destination_entity):
    """
    Parses the parameters needed to build the join query, fails if any of them is not configured.

    :param parsed_origin_entity: origin entity of the join
    :param parsed_destination_entity: destination entity of the join
    :return: a dict with the keys 'destination_query_generator', 'settings_path' and 'browser_name'
    """
    entities = (parsed_origin_entity, parsed_destination_entity)

    query_generator = standardisation.get_destination_query_generator(*entities)
    fail_if_null(query_generator, 'destination query generator', *entities)

    destination_settings_path = standardisation.get_settings_path_for_destination_entity(parsed_destination_entity)
    fail_if_null(destination_settings_path, 'settings path', *entities)

    destination_browser_name = standardisation.get_browser_names_for_destination_entity(parsed_destination_entity)
    fail_if_null(destination_browser_name, 'browser name', *entities)

    return {
        'destination_query_generator': query_generator,
        'settings_path': destination_settings_path,
        'browser_name': destination_browser_name,
    }
def get_cache_key(destination_entity_browser_state_template,
                  entity_from, entity_to, raw_es_query,
                  raw_selection_description, previous_hash):
    """
    Builds the cache key for a join: the base64 text of the sha256 digest of all the parameters joined by '-',
    followed by the 'cache_key_suffix' value of the run config.

    :param destination_entity_browser_state_template: template for building the resulting browser url
    :param entity_from: source entity of the items
    :param entity_to: destination entity of the join
    :param raw_es_query: stringified query in elasticsearch for the dataset
    :param raw_selection_description: stringified javascript object describing the selection of items in the dataset
    :param previous_hash: hash of the state that originated this join of entities
    :return: a cache key corresponding to the parameters given
    """
    key_parts = (
        destination_entity_browser_state_template,
        entity_from,
        entity_to,
        raw_es_query,
        raw_selection_description,
        previous_hash,
        RUN_CONFIG.get("cache_key_suffix"),
    )
    query_identifier = '-'.join(f'{part}' for part in key_parts)

    digest = hashlib.sha256(query_identifier.encode('utf-8')).digest()
    return base64.b64encode(digest).decode('utf-8')
def fail_if_null(value, value_name, parsed_origin_entity, parsed_destination_entity):
    """
    Fails if the value is null and uses the rest of the parameters to build an error message.

    Both entities are shown in the message through their 'value' attribute when they have one; anything
    without that attribute (for example a plain text) is shown as it is instead of causing an AttributeError.

    :param value: value to check
    :param value_name: name of the value for the error message
    :param parsed_origin_entity: parsed from entity
    :param parsed_destination_entity: parsed to entity
    :raises EntitiesJoinerError: when value is None
    """
    if value is not None:
        return

    origin_label = getattr(parsed_origin_entity, 'value', parsed_origin_entity)
    destination_label = getattr(parsed_destination_entity, 'value', parsed_destination_entity)
    raise EntitiesJoinerError(f'There is no {value_name} configured for queries from {origin_label} '
                              f'to {destination_label}')
def get_join_state_hash(ids, destination_query_generator, settings_path, browser_name,
                        destination_entity_browser_state_template,
                        previous_hash):
    """
    Builds the state of the destination browser for the join and returns the hash of its shortened url.

    :param ids: list of ids for the join
    :param destination_query_generator: function that generates the join query from a list of ids
    :param settings_path: settings path to be used by the user interface
    :param browser_name: name of the destination browser used to build the desired url
    :param destination_entity_browser_state_template: template to build the url for the destination entity browser
    :param previous_hash: hash of the state that originated this join of entities
    :return: the hash for the query state used for the join
    """
    origin_state = {
        'type': 'ENTITIES_JOIN',
        'previous_hash': previous_hash,
    }
    list_state = {
        'settings_path': settings_path,
        'custom_query': destination_query_generator(ids),
        'use_custom_query': True,
        'search_term': "",
        'at_least_one_facet_is_selected': False,
    }
    desired_state = {'origin': origin_state, 'list': list_state}

    # The state travels in the url as the base64 text of its JSON representation
    encoded_state = base64.b64encode(json.dumps(desired_state).encode()).decode()

    state_url = destination_entity_browser_state_template.replace('<BROWSER_NAME>', browser_name)
    state_url = state_url.replace('<GENERATED_STATE>', encoded_state)

    # Only the piece between the first and the second '#' of the url is shortened
    url_fragment = state_url.split("#")[1]
    return get_destination_url_hash(f'#{url_fragment}')
def get_destination_url_hash(hashable_part):
    """
    Shortens the hashable part of the destination url and returns the hash obtained.

    :param hashable_part: part of the generated destination url that is passed to the url shortener
    :return: the value under the key 'hash' of the url shortener response
    """
    return url_shortener.shorten_url(hashable_part)['hash']
def save_statistics(raw_entity_from, raw_entity_to, num_ids, is_cached):
    """
    Saves a record describing the join in the entities join statistics index.

    :param raw_entity_from: source entity of the items
    :param raw_entity_to: destination entity of the join
    :param num_ids: number of ids joined
    :param is_cached: boolean indicating if the join is cached or not
    """
    # Imported here because only the datetime class is guaranteed to be imported at the top of the file
    from datetime import timezone

    # An aware UTC datetime is required: timestamp() treats a naive datetime (like the one from utcnow())
    # as local time, which shifts the epoch milliseconds on any host that is not configured in UTC.
    request_date_millis = datetime.now(timezone.utc).timestamp() * 1000

    statistics_document = {
        'entity_from': raw_entity_from,
        'entity_to': raw_entity_to,
        'num_ids': num_ids,
        'request_date': request_date_millis,
        'is_cached': is_cached
    }
    index_name = RUN_CONFIG.get('usage_statistics').get('entities_join_statistics_index')
    statistics_saver.save_record_to_elasticsearch(statistics_document, index_name)
"""
Module with functions that help to load the ids for the entities joiner
"""
from app.entities_joiner import standardisation
from app.es_data import es_data
from utils import dict_property_access
def load_ids_for_query(es_query, selection_description, from_property, index_name):
    """
    Scans the index with the ids query and collects the value of from_property of every document returned.

    :param es_query: query for the dataset
    :param selection_description: dict describing the selection
    :param from_property: property to get to do the join
    :param index_name: name of the index to query
    :return: a list of ids for the query and the selection description indicated.
    """
    ids_query = get_ids_query(es_query, selection_description, from_property)
    return [
        dict_property_access.get_property_value(hit['_source'], from_property)
        for hit in es_data.get_es_scanner(index_name, ids_query)
    ]
def get_ids_query(es_query, selection_description, from_property):
    """
    Chooses the ids query that corresponds to the selection mode and the exceptions of the selection.

    :param es_query: query for the dataset
    :param selection_description: dict describing the selection
    :param from_property: property to get to do the join
    :return: the query to use to get the ids depending on the selection description
    """
    chosen_mode = standardisation.SelectionModes(selection_description['selectionMode'])
    exceptions = selection_description.get('exceptions', [])

    if chosen_mode != standardisation.SelectionModes.ALL_ITEMS_EXCEPT:
        # Selecting none except some
        return get_ids_query_for_no_items_except_some(es_query, from_property, exceptions)

    if len(exceptions) > 0:
        return get_ids_query_for_all_items_except_some(es_query, from_property, exceptions)

    return get_ids_query_for_all_items(es_query, from_property)
def get_ids_query_for_all_items(es_query, from_property):
    """
    Builds the ids query that keeps the dataset query as it is and only asks for the join property.

    :param es_query: query for the dataset
    :param from_property: property to get to do the join
    :return: the ids query for all items
    """
    ids_query = dict(query=es_query.get('query'))
    ids_query['_source'] = [from_property]
    return ids_query
def get_ids_query_for_all_items_except_some(es_query, from_property, exceptions):
    """
    Builds the ids query that takes all the items of the dataset except the ones listed in exceptions.

    The dataset query is copied, so the es_query received is never modified. A dataset query that is not a
    bool query is placed inside the 'must' of a new bool query, because a query object can only hold one
    top level clause. A missing dataset query produces a bool query with only the 'must_not' clause.

    :param es_query: query for the dataset
    :param from_property: property to get to do the join
    :param exceptions: selection exceptions
    :return: the ids query for all items except some
    """
    # Imported here so the block does not depend on the imports at the top of the file
    import copy

    dataset_query = copy.deepcopy(es_query.get('query')) or {}

    # Any clause other than 'bool' cannot be a sibling of 'bool', wrap the whole query instead
    if set(dataset_query) - {'bool'}:
        dataset_query = {'bool': {'must': [dataset_query]}}

    bool_clause = dataset_query.get('bool')
    if bool_clause is None:
        bool_clause = {}

    excluded_clauses = bool_clause.get('must_not')
    if excluded_clauses is None:
        excluded_clauses = []
    elif not isinstance(excluded_clauses, list):
        # must_not also accepts a single clause object, turn it into a list to be able to add one more
        excluded_clauses = [excluded_clauses]

    excluded_clauses.append({
        'terms': {
            from_property: exceptions
        }
    })
    bool_clause['must_not'] = excluded_clauses
    dataset_query['bool'] = bool_clause

    return {
        'query': dataset_query,
        "_source": [from_property],
    }
def get_ids_query_for_no_items_except_some(es_query, from_property, exceptions):
"""
:param es_query: query for the dataset
:param from_property: property to get to to the join
:param exceptions: selection exceptions