Commit a0f20fe8 authored by David Mendez
Browse files

Merge branch 'staging' into 'master'

Ids identifier: remove annoying blank characters from sample

See merge request !66
parents 3f5ed135 8b79b1b2
......@@ -23,6 +23,8 @@ def get_separator(raw_ids, parsed_from):
if len(sample) > SAMPLE_MAX_SIZE:
sample = sample[:SAMPLE_MAX_SIZE]
sample = sample.replace('\ufeff', '').replace('\u200c', '')
sample = re.sub(id_pattern, ID_PLACEHOLDER, sample, count=2)
separator = get_separator_from_sandwich_match(sample)
......
......@@ -138,3 +138,26 @@ class AutomaticSeparatorTest(unittest.TestCase):
separator_got = separator_identifier.get_separator(test_raw_ids, parsed_from)
self.assertEqual(separator_must_be, separator_got, msg='The separator was not identified correctly!')
def test_identifies_separator_11(self):
    """
    Test that it identifies the new line separator when the raw ids start with
    an invisible U+FEFF (byte order mark) character
    """
    parsed_from = 'MOLECULE_CHEMBL_IDS'
    # The leading '\ufeff' is the kind of invisible character that ends up in the
    # analysed sample and must not get in the way of detecting the separator.
    raw_ids_with_bom = '\ufeffCHEMBL4250919\nCHEMBL347836\nCHEMBL4247846\nCHEMBL25\nCHEMBL190\nCHEMBL59'

    expected_separator = '__NEW_LINE__'
    identified_separator = separator_identifier.get_separator(raw_ids_with_bom, parsed_from)

    self.assertEqual(expected_separator, identified_separator, msg='The separator was not identified correctly!')
def test_identifies_separator_12(self):
    """
    Test that it identifies the new line separator when invisible U+FEFF and
    U+200C characters are scattered before and after the ids
    """
    parsed_from = 'MOLECULE_CHEMBL_IDS'
    # Every id except the last one is wrapped in some mix of '\ufeff' and '\u200c';
    # the only real separator between ids is still the new line.
    raw_ids_with_invisible_chars = (
        '\ufeffCHEMBL4250919\ufeff\n\u200cCHEMBL347836\u200c\ufeff\n\u200cCHEMBL4247846\ufeff\u200c\n'
        'CHEMBL25\u200c\ufeff\nCHEMBL190\u200c\ufeff\nCHEMBL59'
    )

    expected_separator = '__NEW_LINE__'
    identified_separator = separator_identifier.get_separator(raw_ids_with_invisible_chars, parsed_from)

    self.assertEqual(expected_separator, identified_separator, msg='The separator was not identified correctly!')
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment