Commit 7c20cada authored by Xiao Yang's avatar Xiao Yang

add hypothesis script to fetch raw annotations

parent f6670cdf
......@@ -37,6 +37,7 @@ Each article contains 3 core entity types, manually annotated by curators: Gene/
- ``````: Python script used to extract annotations from raw [Hypothesis](https://hypothes.is) annotations.
- ``````: Python script used to convert JSON format annotations to IOB tagging format.
- ``````: Python script used to extract annotations to JSON format.
- ``````: Python script used to fetch raw [Hypothesis](https://hypothes.is) annotations.
## License
To be discussed
import requests
import json
from collections import defaultdict
import csv
import re
import yaml
from typing import Iterator, Dict, Any, List
import os
import logging
logger = logging.getLogger(__name__)
class APIconfig():
    """
    settings loaded from a YAML config file

    The file must define three top-level keys:

    - ``TOKEN``: user API token, stored as ``api_token``
    - ``GROUPS_IDS``: stored as ``groups_ids``; the rest of this file iterates
      it for group IDs and looks up a group name per ID, so a mapping of
      group ID -> group name is expected
    - ``WHITE_LIST``: stored as ``white_list``; the rest of this file tests
      PMCIDs for membership in it
    """

    def __init__(self, config_path: str) -> None:
        """
        read the config file and expose its values as attributes

        :param config_path: path to the YAML config file
        :type config_path: str
        :raises FileNotFoundError: if ``config_path`` does not exist
        :raises KeyError: if a required key is missing from the config
        """
        # keep the try body limited to the only statement that can raise
        # FileNotFoundError
        try:
            logger.info(f'opening config file {config_path}...')
            with open(config_path, 'r') as f:
                config = yaml.safe_load(f)
        except FileNotFoundError as err:
            # NOTE(review): the original handler body is not visible in the
            # source; logging and re-raising is an assumption - confirm.
            logger.error(f'config file not found: {err}')
            raise
        self.api_token = config['TOKEN']
        self.groups_ids = config['GROUPS_IDS']
        self.white_list = config['WHITE_LIST']
def retrieve_annotations(group_id: str, api_token: str) -> Iterator[Dict[str, Any]]:
    """
    retrieve annotations of a group by its group ID

    Requests ``LIMIT`` annotations at a time, sorted by ``id``, and keeps
    asking for the page after the last ID seen until an empty page comes
    back. Every raw row is passed through ``extract_annotations`` before
    being yielded.

    :param group_id: group ID
    :type group_id: str
    :param api_token: user API token
    :type api_token: str
    :return: annotations
    :rtype: Iterator
    :raises requests.HTTPError: if the API answers with an error status
    """
    # NOTE(review): the endpoint literal is empty in the visible source (an
    # empty URL cannot be requested). The query parameters used below match
    # the Hypothesis search API, so its public endpoint is assumed - confirm.
    SEARCH_URL = 'https://api.hypothes.is/api/search'
    # use sort and a search_after for pagination
    SORT_BY = 'id'
    # set num of annotations to retrieve per request, maximum 200
    LIMIT = 50
    # output response in JSON format
    headers = {
        'Authorization': 'Bearer {}'.format(api_token),
        'Content-Type': 'application/json;charset=utf-8'}
    payload = {'group': group_id, 'sort': SORT_BY, 'limit': LIMIT}

    total = None
    counter = 0
    while True:
        response = requests.get(SEARCH_URL, params=payload, headers=headers)
        # fail loudly on HTTP errors instead of on a missing JSON key later
        response.raise_for_status()
        # parse the body once per page
        body = response.json()
        if total is None:
            # the total is taken from the first page only
            total = body['total']
        rows = body['rows']
        if not rows:
            break
        for row in rows:
            counter += 1
            yield extract_annotations(row)
        # the next page starts after the last annotation of this page
        payload['search_after'] = rows[-1]['id']
    print(f"{counter} of {total} annotations extracted...")
def extract_annotations(annotation: Dict[str, Any]) -> Dict[str, Any]:
    """
    extract details from each hypothesis annotation

    Only the first entry of ``annotation['target']`` is read. From its
    selectors, the ``TextQuoteSelector`` is stored under ``'annotation'`` and
    the ``TextPositionSelector`` under ``'text_position'``; either stays
    ``None`` when no selector of that type is present.

    :param annotation: annotation object
    :type annotation: dict
    :return: extracted annotation details
    :rtype: dict
    :raises KeyError: if the annotation lacks one of the expected fields
    """
    anno = {}
    try:
        target = annotation['target'][0]
        anno['id'] = annotation['id']
        anno['group'] = annotation['group']
        anno['source'] = target['source']
        anno['annotation'] = None
        anno['text_position'] = None
        anno['created'] = annotation['created']
        for selector in target['selector']:
            if selector['type'] == 'TextQuoteSelector':
                anno['annotation'] = selector
            elif selector['type'] == 'TextPositionSelector':
                anno['text_position'] = selector
        anno['comment'] = annotation['text']
        anno['user'] = annotation['user']
        anno['document'] = annotation['document']
        anno['tags'] = annotation['tags']
    except KeyError as err:
        # NOTE(review): the original handler body is not visible in the
        # source. Re-raising is assumed, because a half-filled record would
        # only fail later with a less helpful error - confirm.
        logger.error(f"annotation {annotation.get('id')} is missing field {err}")
        raise
    return anno
def group_by_pmcid(annotations: List[Dict[str, Any]]) -> Dict[str, List[Dict]]:
    """
    group annotations by pmcid

    The pmcid is taken as the second-to-last ``/``-separated component of the
    annotation's ``'source'`` URL after stripping whitespace, so the URL is
    expected to end with ``<pmcid>/``.

    :param annotations: extracted annotations
    :type annotations: list
    :return: extracted annotations and grouped by pmcid
    :rtype: dict
    """
    pmcid_annotations = defaultdict(list)
    for anno in annotations:
        pmcid = anno['source'].strip().split('/')[-2]
        # store the annotation under its pmcid; without this the result is
        # always empty
        pmcid_annotations[pmcid].append(anno)
    return pmcid_annotations
def retrieve_extracted_annotations(group_id: str, api_token: str) -> Dict[str, List[Dict]]:
    """Given group ID and api token, return the annotations that are grouped by pmcids"""
    logger.info(f'retrieve annotations for group id {group_id}...')
    # drain the generator so all pages are fetched before grouping
    annotations = list(retrieve_annotations(group_id, api_token))
    return group_by_pmcid(annotations)
def flatten_annotation(annotation: Dict[str, Any]) -> Dict[str, Any]:
    """
    convert annotation object to flat key-value pairs without nested dictionary

    Tags are concatenated, ``{``/``}`` and the ``[ALL]`` marker are removed,
    and the remaining bracketed names are joined with commas. Whether the
    concatenated tags contained ``ALL`` is reported separately in the
    ``'all'`` column.

    :param annotation: annotation with the keys this function reads: id,
        group, source, annotation, text_position, tags, comment, user,
        document, created
    :type annotation: dict
    :return: one flat row, keyed by the csv column names
    :rtype: dict
    """
    # convert JSON response into flat csv rows
    raw_tags = annotation['tags']
    if len(raw_tags) > 1:
        logger.warning(f"multiple tags found: {annotation['tags']} \n")
    tags = ''.join(raw_tags) if raw_tags else 'N/A'
    # must be checked before the '[ALL]' marker is stripped below
    all_tag = 'yes' if 'ALL' in tags else 'no'
    tags = tags.replace('{', '')
    tags = tags.replace('}', '')
    tags = tags.replace('[ALL]', '')
    # '[A][B]' -> 'A,B'
    tags = ','.join([t for t in re.split(pattern=r'[\[\]]+', string=tags) if t])

    # either selector may be None when the annotation had no selector of that
    # type; emit empty cells instead of failing on None
    quote = annotation['annotation'] or {}
    position = annotation['text_position'] or {}
    # the title list may be absent or empty
    titles = annotation['document'].get('title') or ['']

    return {'id': annotation['id'],
            'group_id': annotation['group'],
            'source': annotation['source'],
            'exact': quote.get('exact', ''),
            'prefix': quote.get('prefix', ''),
            'suffix': quote.get('suffix', ''),
            'anno_type': quote.get('type', ''),
            'position_type': position.get('type', ''),
            'start': position.get('start', ''),
            'end': position.get('end', ''),
            'tags': tags,
            'all': all_tag,
            'origin_tags': raw_tags[0] if raw_tags else '',
            'comment': annotation['comment'],
            'user': annotation['user'],
            'title': titles[0],
            'created': annotation['created']
            }
def to_csv(annotations: List[Dict[str, Any]], fname: str) -> None:
    """
    write annotations to csv files

    Writes a header row followed by one row per annotation; each annotation
    is flattened with ``flatten_annotation`` first.

    :param annotations: extracted annotations to write
    :type annotations: list
    :param fname: path of the csv file to create (overwritten if it exists)
    :type fname: str
    """
    headers = ['id', 'group_id', 'source', 'exact', 'prefix', 'suffix', 'anno_type',
               'position_type', 'start', 'end', 'tags', 'all', 'origin_tags', 'comment', 'user', 'title', 'created']
    # newline='' lets the csv module control line endings, as its
    # documentation requires
    with open(fname, 'w', newline='', encoding='utf-8') as f:
        dictWriter = csv.DictWriter(f, fieldnames=headers)
        dictWriter.writeheader()
        print(fname, f'{len(annotations)} annotations')
        for anno in annotations:
            # NOTE(review): the loop body is not visible in the source;
            # flatten_annotation yields exactly these columns, so it is
            # assumed to be the row source - confirm.
            dictWriter.writerow(flatten_annotation(anno))
def retrieve_groups_annotations(config: APIconfig, write: Dict[str, str], pmcid_blacklist=None) -> Dict[str, Any]:
    """
    retrieve all group annotations

    Fetches the annotations of every group in ``config.groups_ids`` and,
    when ``write`` is truthy, saves them either as one csv file per
    pmcid and group or as a single JSON file.

    :param config: API settings (token, groups, white list)
    :type config: APIconfig
    :param write: output options with keys ``'dir'`` (base directory) and
        ``'format'`` (``'csv'`` or ``'json'``); a falsy value skips writing
    :type write: dict
    :param pmcid_blacklist: pmcids to leave out of the csv output
    :type pmcid_blacklist: container of str, optional
    :return: annotations grouped by pmcid, keyed by group ID
    :rtype: dict
    :raises KeyError: if ``write`` lacks ``'dir'`` or ``'format'``
    :raises ValueError: if the requested format is neither csv nor json
    """
    group_gannotations = {}
    for group_id in config.groups_ids:
        group_gannotations[group_id] = retrieve_extracted_annotations(group_id, config.api_token)

    if not write:
        return group_gannotations

    try:
        write_dir = write['dir']
        out_format = write['format']
    except KeyError as err:
        # NOTE(review): the original handler body is not visible in the
        # source; re-raising with a clearer message is an assumption.
        raise KeyError(f"write options are missing required key {err}") from err

    if out_format == 'csv':
        # makedirs also creates the intermediate 'csv' directory
        for group_name in config.groups_ids.values():
            os.makedirs(os.path.join(write_dir, f'csv/{group_name}'), exist_ok=True)
        logger.info(f'save annotations as csv files...')
        for group_id, by_pmcid in group_gannotations.items():
            group_name = config.groups_ids[group_id]
            group_dir = os.path.join(write_dir, f'csv/{group_name}')
            for pmcid, pmcid_annotations in by_pmcid.items():
                if pmcid_blacklist and pmcid in pmcid_blacklist:
                    continue
                # an empty white list means "no restriction"
                if config.white_list and (pmcid not in config.white_list):
                    continue
                fname = os.path.join(group_dir, f'{pmcid}-{group_name}-{group_id}.csv')
                to_csv(pmcid_annotations, fname)
    elif out_format == 'json':
        # the black list and white list are not applied to the JSON dump
        write_dir = os.path.join(write_dir, 'json')
        os.makedirs(write_dir, exist_ok=True)
        with open(os.path.join(write_dir, 'extracted_annotations.json'), 'w') as f:
            json.dump(group_gannotations, f)
    else:
        raise ValueError(f'Wrong format: {out_format}, use csv or json')
    return group_gannotations
def retrieve_groups_annotations_stats(config: APIconfig, pmcid_blacklist=None) -> Dict[str, Any]:
    """
    collect annotation counts for all groups

    For every group the result holds ``'total count'`` (annotations kept),
    ``'rel count'`` (kept annotations carrying one of the tags AMB, ABM,
    ABG, YGD or NGD) and ``'pmcid'`` (the set of pmcids seen).

    :param config: API settings (token, groups, white list)
    :type config: APIconfig
    :param pmcid_blacklist: pmcids to leave out of the counts
    :type pmcid_blacklist: container of str, optional
    :return: statistics keyed by group ID
    :rtype: dict
    """
    REL_TAGS = ('AMB', 'ABM', 'ABG', 'YGD', 'NGD')
    group_gannotations = {}
    for group_id in config.groups_ids:
        stats = {'total count': 0, 'rel count': 0, 'pmcid': set()}
        group_gannotations[group_id] = stats
        for annotation in retrieve_annotations(group_id, api_token=config.api_token):
            pmcid = annotation['source'].strip().split('/')[-2]
            # NOTE(review): the bodies of the filters are not visible in the
            # source; skipping is assumed, and the black list / empty white
            # list handling mirrors retrieve_groups_annotations - confirm.
            if pmcid_blacklist and pmcid in pmcid_blacklist:
                continue
            if config.white_list and pmcid not in config.white_list:
                continue
            stats['total count'] += 1
            # NOTE(review): filling the set is assumed; the source only
            # shows it being initialised - confirm.
            stats['pmcid'].add(pmcid)
            # membership is tested against the tag list, i.e. whole tags
            if any(tag in annotation['tags'] for tag in REL_TAGS):
                stats['rel count'] += 1
    return group_gannotations
if __name__ == '__main__':
    # load the token, groups and white list from the local config file
    api_cfg = APIconfig('setting.config')
    WRITE_DIR = "annotation/full"
    # the API token is a secret, so it is deliberately not printed
    print(api_cfg.groups_ids, len(api_cfg.white_list or []))
    # pilot
    # retrieve_groups_annotations(api_cfg, write={'format': 'csv', 'dir': 'annotation'}, pmcid_blacklist=['PMC5573731'])
    # batch
    # retrieve_groups_annotations(api_cfg, write={'format': 'csv', 'dir': WRITE_DIR})
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment