Commit 7c20cada authored by Xiao Yang's avatar Xiao Yang

add hypothesis script to fetch raw annotations

parent f6670cdf
......@@ -37,6 +37,7 @@ Each article contains 3 core entity types, manually annotated by curators: Gene/
- ```annotations.py```: Python script used to extract annotations from raw [Hypothes.is](https://web.hypothes.is) annotations.
- ```generate_IOB_dataset.py```: Python script used to convert JSON format annotations to IOB tagging format.
- ```generate_json_dataset.py```: Python script used to extract annotations to JSON format.
- ```hypothesis.py```: Python script used to fetch raw [Hypothes.is](https://web.hypothes.is) annotations.
## License
To be discussed
......
import requests
import json
from collections import defaultdict
import csv
import re
import yaml
from typing import Iterator, Dict, Any, List
import os
import logging
# Configure the root logger at DEBUG level, so debug messages from this script
# (and from any imported library that logs) are emitted.
logging.basicConfig(level=logging.DEBUG)
# Module-level logger used by the class and functions in this file.
logger = logging.getLogger(__name__)
class APIconfig():
    """
    Hypothes.is API settings loaded from a YAML config file

    The file must provide the keys ``TOKEN``, ``GROUPS_IDS`` and ``WHITE_LIST``; they are
    exposed as the attributes ``api_token``, ``groups_ids`` and ``white_list``.
    """

    def __init__(self, config_path: str) -> None:
        """
        read the settings stored in the config file
        :param config_path: path of the YAML config file
        :type config_path: str
        :raises FileNotFoundError: logged, then re-raised, when the file does not exist
        :raises KeyError: when one of the expected keys is absent from the file
        """
        logger.info(f'opening config file {config_path}...')
        try:
            with open(config_path, 'r') as config_file:
                settings = yaml.safe_load(config_file)
        except FileNotFoundError as missing:
            # keep a trace in the log, but let the caller decide what to do
            logger.exception(str(missing))
            raise
        self.api_token = settings['TOKEN']
        self.groups_ids = settings['GROUPS_IDS']
        self.white_list = settings['WHITE_LIST']
def retrieve_annotations(group_id: str, api_token: str) -> Iterator[Dict[str, Any]]:
    """
    retrieve annotations of a group by its group ID

    The search endpoint is queried page by page; every row of every page is passed through
    extract_annotations and yielded. Once the last page has been consumed, the number of
    annotations extracted and the total reported by the API are printed.
    :param group_id: hypothes.is group ID
    :type group_id: str
    :param api_token: Hypothes.is user API token
    :type api_token: str
    :return: hypothes.is annotations
    :rtype: Iterator
    :raises requests.HTTPError: when the API answers with an error status code
    :raises requests.Timeout: when the API does not answer within the timeout
    """
    SEARCH_URL = 'https://hypothes.is/api/search'
    # use sort and a search_after for pagination
    SORT_BY = 'id'
    # set num of annotations to retrieve per request, maximum 200
    LIMIT = 50
    # seconds to wait for the server before giving up on a request
    TIMEOUT = 30
    # output response in JSON format
    headers = {
        'Authorization': 'Bearer {}'.format(api_token),
        'Content-Type': 'application/json;charset=utf-8'}
    payload = {'group': group_id, 'sort': SORT_BY, 'limit': LIMIT}

    def fetch_page() -> Dict[str, Any]:
        # one request per page; fail loudly on HTTP errors instead of on a confusing
        # KeyError when the error body has no 'total'/'rows'
        response = requests.get(SEARCH_URL, params=payload, headers=headers, timeout=TIMEOUT)
        response.raise_for_status()
        return response.json()

    page = fetch_page()
    total = page['total']
    counter = 0
    while page['rows']:
        annotations = page['rows']
        counter += len(annotations)
        for anno in annotations:
            yield extract_annotations(anno)
        # the id of the last row of this page is the cursor of the next page
        payload['search_after'] = annotations[-1]['id']
        page = fetch_page()
    print(f"{counter} of {total} annotations extracted...")
def extract_annotations(annotation: Dict[str, Any]) -> Dict[str, Any]:
    """
    extract details from each hypothesis annotation

    Only the first target of the annotation is inspected. Its TextQuoteSelector is stored
    under 'annotation' and its TextPositionSelector under 'text_position'; either stays
    None when the target has no selector of that type.
    :param annotation: Hypothes.is annotation object
    :type annotation: dict
    :return: extracted annotation details
    :rtype: dict
    :raises KeyError: logged, then re-raised, when an expected field is missing
    """
    try:
        details = {
            'id': annotation['id'],
            'group': annotation['group'],
            'source': annotation['target'][0]['source'],
            'annotation': None,
            'text_position': None,
            'created': annotation['created'],
        }
        for selector in annotation['target'][0]['selector']:
            selector_type = selector['type']
            if selector_type == 'TextQuoteSelector':
                details['annotation'] = selector
            elif selector_type == 'TextPositionSelector':
                details['text_position'] = selector
        # the free-text body is renamed to 'comment', the other fields keep their name
        for out_key, in_key in (('comment', 'text'), ('user', 'user'),
                                ('document', 'document'), ('tags', 'tags')):
            details[out_key] = annotation[in_key]
    except KeyError as missing:
        logger.exception(str(missing))
        raise
    return details
def group_by_pmcid(annotations: List[Dict[str, Any]]) -> Dict[str, List[Dict]]:
    """
    group annotations by pmcid

    The grouping key is the second-to-last '/'-separated piece of the stripped 'source'
    URL of each annotation (presumably the PMCID of a URL that ends with a slash).
    :param annotations: extracted annotations
    :type annotations: list
    :return: extracted annotations, grouped by pmcid, in order of first appearance
    :rtype: dict
    """
    grouped = defaultdict(list)
    for entry in annotations:
        url_pieces = entry['source'].strip().split('/')
        grouped[url_pieces[-2]].append(entry)
    return grouped
def retrieve_extracted_annotations(group_id: str, api_token: str) -> Dict[str, List[Dict]]:
    """
    fetch every annotation of a group and group them by pmcid
    :param group_id: hypothes.is group ID
    :type group_id: str
    :param api_token: Hypothes.is user API token
    :type api_token: str
    :return: extracted annotations grouped by pmcid
    :rtype: dict
    """
    logger.info(f'retrieve annotations for group id {group_id}...')
    # the generator is drained completely before the grouping starts
    fetched = list(retrieve_annotations(group_id, api_token))
    return group_by_pmcid(fetched)
def flatten_annotation(annotation: Dict[str, Any]) -> Dict[str, Any]:
    """
    convert annotation object to flat key-value pairs without nested dictionary

    All tags are concatenated, curly braces and the '[ALL]' marker are dropped, and the
    remaining bracketed names are joined with commas. 'all' is 'yes' when the concatenated
    tags contain 'ALL'; 'origin_tags' is the first tag, untouched.
    :param annotation: extracted annotation, as returned by extract_annotations
    :type annotation: dict
    :return: one flat row, keyed by the csv column names
    :rtype: dict
    """
    raw_tags = annotation['tags']
    if len(raw_tags) > 1:
        logger.warning(f"multiple tags found: {annotation['tags']} \n")

    if raw_tags:
        merged = ''.join(raw_tags)
    else:
        merged = 'N/A'
    mentions_all = 'ALL' in merged
    # braces are removed first, so an '[ALL]' marker interrupted by a brace is dropped too
    cleaned = merged.replace('{', '').replace('}', '').replace('[ALL]', '')
    tag_names = filter(None, re.split(r'[\[\]]+', cleaned))

    # convert hypothes.is JSON response into flat csv rows
    row = {
        'id': annotation['id'],
        'group_id': annotation['group'],
        'source': annotation['source'],
    }
    quote = annotation['annotation']
    row.update(exact=quote['exact'], prefix=quote['prefix'],
               suffix=quote['suffix'], anno_type=quote['type'])
    position = annotation['text_position']
    row.update(position_type=position['type'], start=position['start'],
               end=position['end'])
    row['tags'] = ','.join(tag_names)
    row['all'] = 'yes' if mentions_all else 'no'
    row['origin_tags'] = raw_tags[0] if raw_tags else ''
    row['comment'] = annotation['comment']
    row['user'] = annotation['user']
    row['title'] = annotation['document']['title'][0]
    row['created'] = annotation['created']
    return row
def to_csv(annotations: List[Dict[str, Any]], fname: str) -> None:
    """
    write annotations to csv files

    Each annotation is flattened with flatten_annotation and written as one row below a
    header line. The file name and the number of annotations are printed.
    :param annotations: extracted annotations to write
    :type annotations: list
    :param fname: path of the csv file to create (overwritten if it exists)
    :type fname: str
    :return: None
    :rtype: None
    """
    headers = ['id', 'group_id', 'source', 'exact', 'prefix', 'suffix', 'anno_type',
               'position_type', 'start', 'end', 'tags', 'all', 'origin_tags', 'comment', 'user', 'title', 'created']
    # newline='' hands line-ending control to the csv module (no doubled line breaks on
    # Windows, embedded newlines kept intact); utf-8 keeps non-ASCII annotation text
    # writable whatever the platform default encoding is
    with open(fname, 'w', newline='', encoding='utf-8') as f:
        dict_writer = csv.DictWriter(f, fieldnames=headers)
        dict_writer.writeheader()
        print(fname, f'{len(annotations)} annotations')
        dict_writer.writerows(flatten_annotation(anno) for anno in annotations)
def retrieve_groups_annotations(config: APIconfig, write: Dict[str, str], pmcid_blacklist=None) -> Dict[str, Any]:
    """
    retrieve all group annotations and optionally save them

    With format 'csv', one file per pmcid is written to <dir>/csv/<group name>/; pmcids in
    pmcid_blacklist, or absent from a non-empty config.white_list, are skipped. With format
    'json', everything retrieved is dumped, unfiltered, to
    <dir>/json/extracted_annotations.json. Missing directories are created.
    :param config: API settings; groups_ids is used as a mapping of group id to group name
    :type config: APIconfig
    :param write: output settings with the keys 'dir' and 'format' ('csv' or 'json');
        nothing is written when it is empty or None
    :type write: dict
    :param pmcid_blacklist: pmcids that must not be written as csv
    :type pmcid_blacklist: list
    :return: annotations grouped by pmcid, for every group id
    :rtype: dict
    :raises KeyError: logged, then re-raised, when write lacks 'dir' or 'format'
    :raises ValueError: when the format is neither 'csv' nor 'json'
    """
    if write:
        # check the output settings up front, so a bad setting fails before any download
        try:
            write_dir = write['dir']
            out_format = write['format']
        except KeyError as err:
            logger.exception(str(err))
            raise
        if out_format not in ('csv', 'json'):
            raise ValueError(f'Wrong format: {out_format}, use csv or json')

    group_gannotations = {}
    for group_id in config.groups_ids:
        group_gannotations[group_id] = retrieve_extracted_annotations(group_id, config.api_token)

    if not write:
        return group_gannotations

    if out_format == 'csv':
        logger.info('save annotations as csv files...')
        for group_id, by_pmcid in group_gannotations.items():
            group_name = config.groups_ids[group_id]
            group_dir = os.path.join(write_dir, 'csv', str(group_name))
            # makedirs also creates the missing parents (<dir> and <dir>/csv)
            os.makedirs(group_dir, exist_ok=True)
            for pmcid, pmcid_annotations in by_pmcid.items():
                if pmcid_blacklist and pmcid in pmcid_blacklist:
                    continue
                if config.white_list and (pmcid not in config.white_list):
                    continue
                fname = os.path.join(group_dir, f'{pmcid}-{group_name}-{group_id}.csv')
                to_csv(pmcid_annotations, fname)
    else:
        json_dir = os.path.join(write_dir, 'json')
        os.makedirs(json_dir, exist_ok=True)
        with open(os.path.join(json_dir, 'extracted_annotations.json'), 'w') as f:
            json.dump(group_gannotations, f)
    return group_gannotations
def retrieve_groups_annotations_stats(config: APIconfig, pmcid_blacklist=None) -> Dict[str, Any]:
    """
    retrieve all group annotations and count them per group

    For every group id the statistics hold 'total count' (annotations kept), 'rel count'
    (kept annotations having one of the tags AMB, ABM, ABG, YGD or NGD) and 'pmcid' (set of
    pmcids seen). As in retrieve_groups_annotations, an annotation is skipped when its
    pmcid is in pmcid_blacklist or is absent from a non-empty config.white_list. The
    statistics are printed per group and once more for all groups.
    :param config: API settings (groups_ids, api_token and white_list are used)
    :type config: APIconfig
    :param pmcid_blacklist: pmcids to leave out of the statistics
    :type pmcid_blacklist: list
    :return: statistics for every group id
    :rtype: dict
    """
    REL_TAGS = ('AMB', 'ABM', 'ABG', 'YGD', 'NGD')
    white_list = config.white_list
    group_gannotations = {}
    for group_id in config.groups_ids:
        stats = {'total count': 0, 'rel count': 0, 'pmcid': set()}
        group_gannotations[group_id] = stats
        for annotation in retrieve_annotations(group_id, api_token=config.api_token):
            pmcid = annotation['source'].strip().split('/')[-2]
            if pmcid_blacklist and pmcid in pmcid_blacklist:
                continue
            # an empty or missing white list means "no filtering"
            if white_list and pmcid not in white_list:
                continue
            stats['pmcid'].add(pmcid)
            stats['total count'] += 1
            # a tag counts only when it is exactly one of the annotation's tags
            if any(tag in annotation['tags'] for tag in REL_TAGS):
                stats['rel count'] += 1
        print(stats)
    print(group_gannotations)
    return group_gannotations
if __name__ == '__main__':
    # script entry point: load the settings, then print annotation statistics per group
    api_cfg = APIconfig('setting.config')
    WRITE_DIR = "annotation/full"
    # the API token is a secret credential, so it is deliberately not echoed here
    print(api_cfg.groups_ids, len(api_cfg.white_list))
    # pilot
    # retrieve_groups_annotations(api_cfg, write={'format': 'csv', 'dir': 'annotation'}, pmcid_blacklist=['PMC5573731'])
    # batch
    # retrieve_groups_annotations(api_cfg, write={'format': 'csv', 'dir': WRITE_DIR})
    retrieve_groups_annotations_stats(api_cfg)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment