Commit 5f89da6c authored by David Mendez
Browse files

Add module that identifies the separator

parent 9c67dcd3
"""
Module that identifies the separator of ids automatically
"""
import re
# Maximum number of leading characters of the raw input that get inspected
SAMPLE_MAX_SIZE = 200
# Token substituted for the ids found in the sample before looking for the separator
ID_PLACEHOLDER = 'I_AM_STILL_ALIVE'
# Value reported as the separator when the ids are separated by new line characters
NEW_LINE_NAME = '__NEW_LINE__'
def get_separator(raw_ids, parsed_from):
    """
    Guess which separator was typed between the ids.

    Only the first SAMPLE_MAX_SIZE characters are inspected. The first two ids found there are
    swapped for ID_PLACEHOLDER and the detection helpers are tried in order until one answers.

    :param raw_ids: a string with the ids as entered by the user
    :param parsed_from: parsed from corresponding to the entered entity; only used in the printed message
    :return: the separator used in the string with the ids, '' when none could be identified
    """
    print(f'Getting separator automatically for {parsed_from}')

    # every "from" shares the same id shape for now
    id_pattern = r'CHEMBL\d+'

    # slicing keeps short inputs whole and caps long ones, so huge inputs stay cheap
    head = raw_ids[:SAMPLE_MAX_SIZE]
    sample = re.sub(id_pattern, ID_PLACEHOLDER, head, count=2)

    # ordered from the most specific guess to the least specific one
    strategies = (
        get_separator_from_sandwich_match,           # text between two ids
        get_separator_from_sandwich_newlines_match,  # new lines between two ids
        get_separator_from_positive_lookbehind,      # text after a single id
    )
    for strategy in strategies:
        candidate = strategy(sample)
        if candidate is not None:
            return candidate

    # nothing matched: default to an empty string
    return ''
def get_separator_from_sandwich_match(sample):
    """
    Look for a separator sandwiched between two id placeholders on the same line.

    The search is not anchored to the start of the sample, so characters that precede the first
    placeholder (for example leading spaces) do not prevent the separator from being found.

    :param sample: sample text in which the ids have been replaced by ID_PLACEHOLDER
    :return: the text between the first two placeholders, None if not match.
    """
    placeholder = re.escape(ID_PLACEHOLDER)
    # '.' does not match new line characters, so ids separated by new lines never match here.
    # The lazy quantifier stops at the nearest following placeholder.
    pattern = fr'{placeholder}(.+?){placeholder}'
    match = re.search(pattern, sample)
    if match is None:
        return None
    return match.group(1)
def get_separator_from_sandwich_newlines_match(sample):
    """
    Check whether two id placeholders are separated only by new line characters.

    The search is not anchored to the start of the sample, so characters that precede the first
    placeholder (for example leading spaces) do not prevent the detection.

    :param sample: sample text in which the ids have been replaced by ID_PLACEHOLDER
    :return: NEW_LINE_NAME when one or more '\\r' / '\\n' characters sit between two placeholders,
        None if not match.
    """
    placeholder = re.escape(ID_PLACEHOLDER)
    # any run of carriage returns / line feeds counts, which covers \n, \r and \r\n
    pattern = fr'{placeholder}[\r\n]+{placeholder}'
    if re.search(pattern, sample) is None:
        return None
    return NEW_LINE_NAME
def get_separator_from_positive_lookbehind(sample):
    """
    Take whatever follows the first id placeholder as the separator.

    If another placeholder appears later in the sample, the separator stops right before it,
    so the returned value never contains the placeholder token itself.

    :param sample: sample text in which the ids have been replaced by ID_PLACEHOLDER
    :return: the text after the first placeholder (up to the next placeholder, if any),
        None if the sample has no placeholder.
    """
    # the zero-width lookbehind matches at the position right after the first placeholder
    pattern = fr'(?<={re.escape(ID_PLACEHOLDER)})'
    match = re.search(pattern, sample)
    if match is None:
        return None

    remainder = sample[match.start():]
    # drop a following placeholder and anything after it; it is not part of the separator
    separator, _, _ = remainder.partition(ID_PLACEHOLDER)
    return separator
"""
Module to test the automatic separator identifier
"""
import unittest
from app.chembl_utils.auto_separator import separator_identifier
class AutomaticSeparatorTest(unittest.TestCase):
    """
    Checks for the automatic separator identifier
    """

    def _check_separator(self, raw_text, expected):
        """
        Run the identifier on raw_text and compare the answer with the expected separator
        """
        parsed_from = 'MOLECULE_CHEMBL_IDS'
        obtained = separator_identifier.get_separator(raw_text, parsed_from)
        self.assertEqual(expected, obtained, msg='The separator was not identified correctly!')

    def test_identifies_separator_1(self):
        """
        Ids separated by a single comma
        """
        test_raw_ids = 'CHEMBL3548189,CHEMBL3041241,CHEMBL1198499,CHEMBL1616766,CHEMBL3552333,CHEMBL3040769,' \
                       'CHEMBL1179177,CHEMBL1208242,CHEMBL2105295,CHEMBL1623649,CHEMBL3990326,CHEMBL3551968,' \
                       'CHEMBL3815178,CHEMBL1624921,CHEMBL1621119,CHEMBL1183111,CHEMBL354497,CHEMBL546869,' \
                       'CHEMBL1789284,CHEMBL1739771,CHEMBL3553885,CHEMBL3140231,CHEMBL3991129'
        self._check_separator(test_raw_ids, ',')

    def test_identifies_separator_2(self):
        """
        Ids separated by a whole word
        """
        test_raw_ids = 'CHEMBL3548189GLADOSCHEMBL3041241GLADOSCHEMBL1198499GLADOSCHEMBL1616766GLADOSCHEMBL3552333' \
                       'GLADOSCHEMBL3040769GLADOSCHEMBL1179177GLADOSCHEMBL1208242GLADOSCHEMBL2105295GLADOS' \
                       'CHEMBL1623649GLADOSCHEMBL3990326GLADOSCHEMBL3551968GLADOSCHEMBL3815178GLADOSCHEMBL1624921' \
                       'GLADOSCHEMBL1621119GLADOSCHEMBL1183111GLADOSCHEMBL354497GLADOSCHEMBL546869GLADOSCHEMBL1789284' \
                       'GLADOSCHEMBL1739771GLADOSCHEMBL3553885GLADOSCHEMBL3140231GLADOSCHEMBL3991129'
        self._check_separator(test_raw_ids, 'GLADOS')

    def test_identifies_separator_3(self):
        """
        Ids separated by a run of several punctuation characters
        """
        test_raw_ids = 'CHEMBL3548189;;;;CHEMBL3041241;;;;CHEMBL1198499;;;;CHEMBL1616766;;;;CHEMBL3552333;;;;' \
                       'CHEMBL3040769;;;;CHEMBL1179177;;;;CHEMBL1208242;;;;CHEMBL2105295;;;;CHEMBL1623649;;;;' \
                       'CHEMBL3990326;;;;CHEMBL3551968;;;;CHEMBL3815178;;;;CHEMBL1624921;;;;CHEMBL1621119;;;;' \
                       'CHEMBL1183111;;;;CHEMBL354497;;;;CHEMBL546869;;;;CHEMBL1789284;;;;CHEMBL1739771;;;;' \
                       'CHEMBL3553885;;;;CHEMBL3140231;;;;CHEMBL3991129'
        self._check_separator(test_raw_ids, ';;;;')

    def test_identifies_separator_4(self):
        """
        Ids separated by carriage return + line feed are reported with the new line name
        """
        test_raw_ids = 'CHEMBL3548189\r\nCHEMBL3041241\r\nCHEMBL1198499\r\nCHEMBL1616766\r\nCHEMBL3552333\r\n' \
                       'CHEMBL3040769\r\nCHEMBL1179177\r\nCHEMBL1208242\r\nCHEMBL2105295\r\nCHEMBL1623649\r\n' \
                       'CHEMBL3990326\r\nCHEMBL3551968\r\nCHEMBL3815178\r\nCHEMBL1624921\r\nCHEMBL1621119\r\n' \
                       'CHEMBL1183111\r\nCHEMBL354497\r\nCHEMBL546869\r\nCHEMBL1789284\r\nCHEMBL1739771\r\n' \
                       'CHEMBL3553885\r\nCHEMBL3140231\r\nCHEMBL3991129'
        self._check_separator(test_raw_ids, '__NEW_LINE__')

    def test_identifies_separator_5(self):
        """
        A single id followed by a trailing comma
        """
        self._check_separator('CHEMBL3548189,', ',')

    def test_identifies_separator_6(self):
        """
        A single id followed by arbitrary text: all the trailing text is the separator
        """
        self._check_separator(
            'CHEMBL3548189,My descent is the story of everyman',
            ',My descent is the story of everyman',
        )

    def test_identifies_separator_7(self):
        """
        A single id with nothing after it gives an empty separator
        """
        self._check_separator('CHEMBL3548189', '')

    def test_identifies_separator_8(self):
        """
        Text without any id at all gives an empty separator
        """
        test_raw_ids = 'The warming sun returns again\n'\
                       'And melts away the snow\n'\
                       'The sea is freed from icy chains\n'\
                       'Winter is letting go\n'
        self._check_separator(test_raw_ids, '')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment