Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
E
EuropePMC-Corpus
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Locked Files
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Iterations
Merge Requests
0
Merge Requests
0
Requirements
Requirements
List
Security & Compliance
Security & Compliance
Dependency List
License Compliance
Operations
Operations
Metrics
Incidents
Packages & Registries
Packages & Registries
Package Registry
Container Registry
Analytics
Analytics
Code Review
Insights
Issue
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Literature-services
public-projects
EuropePMC-Corpus
Commits
7c20cada
Commit
7c20cada
authored
Aug 14, 2020
by
Xiao Yang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add hypothesis script to fetch raw annotations
parent
f6670cdf
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
276 additions
and
0 deletions
+276
-0
README.md
README.md
+1
-0
src/hypothesis.py
src/hypothesis.py
+275
-0
No files found.
README.md
View file @
7c20cada
...
...
@@ -37,6 +37,7 @@ Each article contains 3 core entity types, manually annotated by curators: Gene/
- ```annotations.py```: Python script used to extract annotations from raw [Hypothes.is](https://web.hypothes.is) annotations.
- ```generate_IOB_dataset.py```: Python script used to convert JSON format annotations to IOB tagging format.
- ```generate_json_dataset.py```: Python script used to extract annotations to JSON format.
- ```hypothesis.py```: Python script used to fetch raw [Hypothes.is](https://web.hypothes.is) annotations.
## License
To be discussed
...
...
src/hypothesis.py
0 → 100644
View file @
7c20cada
import
requests
import
json
from
collections
import
defaultdict
import
csv
import
re
import
yaml
from
typing
import
Iterator
,
Dict
,
Any
,
List
import
os
import
logging
# Module-level logger used by every function in this file; the logging
# system is configured to emit records from DEBUG upwards.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)
class APIconfig:
    """
    Settings for the Hypothes.is API, loaded from a YAML file.

    The file must provide the keys ``TOKEN``, ``GROUPS_IDS`` and
    ``WHITE_LIST``; they are exposed as the attributes ``api_token``,
    ``groups_ids`` and ``white_list``.
    NOTE(review): other functions in this file index ``groups_ids`` by group
    ID and call ``.values()`` on it, so it is presumably a mapping of group ID
    to group name -- confirm against the config file.
    """

    def __init__(self, config_path: str) -> None:
        """
        read the YAML config file and store its values on the instance

        :param config_path: path of the YAML config file
        :type config_path: str
        :raises FileNotFoundError: if the config file does not exist (logged, then re-raised)
        :raises KeyError: if one of the expected keys is missing
        """
        logger.info(f'opening config file {config_path}...')
        try:
            with open(config_path, 'r') as handle:
                settings = yaml.safe_load(handle)
        except FileNotFoundError as missing:
            # record the traceback, then let the caller deal with it
            logger.exception(str(missing))
            raise

        self.api_token = settings['TOKEN']
        self.groups_ids = settings['GROUPS_IDS']
        self.white_list = settings['WHITE_LIST']
def retrieve_annotations(group_id: str, api_token: str, limit: int = 50,
                         timeout: float = 30.0) -> Iterator[Dict[str, Any]]:
    """
    retrieve annotations of a group by its group ID

    Pages through the Hypothes.is search endpoint, sorting by annotation id
    and passing the id of the last row of each page as ``search_after`` to
    fetch the next page. Each row is passed through ``extract_annotations``
    before being yielded. Being a generator, no request is sent until the
    first item is requested.

    :param group_id: hypothes.is group ID
    :type group_id: str
    :param api_token: Hypothes.is user API token
    :type api_token: str
    :param limit: num of annotations to retrieve per request, maximum 200
    :type limit: int
    :param timeout: seconds to wait for each HTTP request before giving up
    :type timeout: float
    :return: hypothes.is annotations
    :rtype: Iterator
    :raises requests.HTTPError: if the API answers with an error status code
    """
    search_url = 'https://hypothes.is/api/search'
    # use sort and a search_after for pagination
    sort_by = 'id'

    # output response in JSON format
    headers = {
        'Authorization': 'Bearer {}'.format(api_token),
        'Content-Type': 'application/json;charset=utf-8',
    }
    payload = {'group': group_id, 'sort': sort_by, 'limit': limit}

    total = None
    counter = 0
    while True:
        response = requests.get(search_url, params=payload, headers=headers, timeout=timeout)
        # fail loudly on e.g. an invalid token instead of a confusing KeyError below
        response.raise_for_status()
        # decode the body once per page
        body = response.json()
        if total is None:
            # total reported by the first page, used for the sanity check below
            total = body['total']

        annotations = body['rows']
        if not annotations:
            break

        counter += len(annotations)
        for anno in annotations:
            yield extract_annotations(anno)

        # the next page starts right after the last annotation of this one
        payload['search_after'] = annotations[-1]['id']

    if counter != total:
        logger.warning(f"Expected {total} annotations, but extracted {counter}")
    print(f"{counter} of {total} annotations extracted...")
def extract_annotations(annotation: Dict[str, Any]) -> Dict[str, Any]:
    """
    extract details from each hypothesis annotation

    Only the first entry of ``annotation['target']`` is used. The
    ``TextQuoteSelector`` is stored under ``'annotation'`` and the
    ``TextPositionSelector`` under ``'text_position'``; either stays ``None``
    when the target has no such selector, including when the target has no
    ``'selector'`` entry at all.

    :param annotation: Hypothes.is annotation object
    :type annotation: dict
    :return: extracted annotation details
    :rtype: dict
    :raises KeyError: if a required field is missing (logged, then re-raised)
    :raises IndexError: if the target list is empty (logged, then re-raised)
    """
    try:
        target = annotation['target'][0]
        anno = {
            'id': annotation['id'],
            'group': annotation['group'],
            'source': target['source'],
            'annotation': None,
            'text_position': None,
            'created': annotation['created'],
        }
        # a target without selectors keeps the None defaults set above
        for selector in target.get('selector') or []:
            if selector['type'] == 'TextQuoteSelector':
                anno['annotation'] = selector
            elif selector['type'] == 'TextPositionSelector':
                anno['text_position'] = selector
        anno['comment'] = annotation['text']
        anno['user'] = annotation['user']
        anno['document'] = annotation['document']
        anno['tags'] = annotation['tags']
    except (KeyError, IndexError) as err:
        logger.exception(str(err))
        raise
    return anno
def group_by_pmcid(annotations: List[Dict[str, Any]]) -> Dict[str, List[Dict]]:
    """
    bucket extracted annotations by the pmcid found in their source

    The pmcid is taken to be the second-to-last ``'/'``-separated piece of
    the whitespace-stripped ``'source'`` value of each annotation.

    :param annotations: extracted annotations
    :type annotations: list
    :return: mapping of pmcid to the annotations sharing it, in input order
    :rtype: dict
    """
    buckets = defaultdict(list)
    for entry in annotations:
        url_parts = entry['source'].strip().split('/')
        buckets[url_parts[-2]].append(entry)
    return buckets
def retrieve_extracted_annotations(group_id: str, api_token: str) -> Dict[str, List[Dict]]:
    """
    Fetch every annotation of one group and bucket the result by pmcid.

    :param group_id: hypothes.is group ID
    :type group_id: str
    :param api_token: Hypothes.is user API token
    :type api_token: str
    :return: extracted annotations grouped by pmcid
    :rtype: dict
    """
    logger.info(f'retrieve annotations for group id {group_id}...')
    # drain the generator completely before grouping
    fetched = list(retrieve_annotations(group_id, api_token))
    return group_by_pmcid(fetched)
def flatten_annotation(annotation: Dict[str, Any]) -> Dict[str, Any]:
    """
    convert annotation object to flat key-value pairs without nested dictionary

    The tags are concatenated, curly braces and the ``[ALL]`` marker are
    removed, and the remaining square-bracketed pieces are joined with commas.
    ``'all'`` is ``'yes'`` when the concatenated tags contain ``ALL``.
    Selector fields are ``None`` when the annotation has no quote/position
    selector, and the title is ``''`` when the document carries none.

    :param annotation: extracted annotation (as built by extract_annotations)
    :type annotation: dict
    :return: flat mapping whose keys are the csv column names
    :rtype: dict
    """
    # convert hypothes.is JSON response into flat csv rows
    if len(annotation['tags']) > 1:
        logger.warning(f"multiple tags found: {annotation['tags']}\n")

    # NOTE(review): tags are concatenated without a separator, so two tags
    # that are not wrapped in brackets would merge into one -- confirm tags
    # are always bracketed.
    tags = ''.join(annotation['tags']) if annotation['tags'] else 'N/A'
    all_tag = 'yes' if 'ALL' in tags else 'no'

    # braces must go first so that e.g. '[{ALL}]' is recognised as '[ALL]'
    tags = tags.replace('{', '').replace('}', '').replace('[ALL]', '')
    tags = ','.join(t for t in re.split(pattern=r'[\[\]]+', string=tags) if t)

    # either selector may be None; fall back to empty mappings so that the
    # row is still produced (with empty cells) instead of raising TypeError
    quote = annotation['annotation'] or {}
    position = annotation['text_position'] or {}
    titles = annotation['document'].get('title') or ['']

    return {
        'id': annotation['id'],
        'group_id': annotation['group'],
        'source': annotation['source'],
        'exact': quote.get('exact'),
        'prefix': quote.get('prefix'),
        'suffix': quote.get('suffix'),
        'anno_type': quote.get('type'),
        'position_type': position.get('type'),
        'start': position.get('start'),
        'end': position.get('end'),
        'tags': tags,
        'all': all_tag,
        'origin_tags': annotation['tags'][0] if annotation['tags'] else '',
        'comment': annotation['comment'],
        'user': annotation['user'],
        'title': titles[0],
        'created': annotation['created'],
    }
def to_csv(annotations: List[Dict[str, Any]], fname: str) -> None:
    """
    write annotations to csv files

    Writes a header row followed by one flattened row per annotation and
    prints the file name together with the number of annotations.

    :param annotations: extracted annotations to write
    :type annotations: list
    :param fname: path of the csv file to create (overwritten if present)
    :type fname: str
    :return: None
    :rtype: None
    """
    headers = [
        'id', 'group_id', 'source', 'exact', 'prefix', 'suffix', 'anno_type',
        'position_type', 'start', 'end', 'tags', 'all', 'origin_tags',
        'comment', 'user', 'title', 'created',
    ]
    # newline='' is required by the csv module so that it controls the line
    # endings itself; utf-8 keeps non-ASCII text writable on every platform
    with open(fname, 'w', newline='', encoding='utf-8') as f:
        dict_writer = csv.DictWriter(f, fieldnames=headers)
        dict_writer.writeheader()
        print(fname, f'{len(annotations)} annotations')
        dict_writer.writerows(flatten_annotation(anno) for anno in annotations)
def
retrieve_groups_annotations
(
config
:
APIconfig
,
write
:
Dict
[
str
,
str
],
pmcid_blacklist
=
None
)
->
Dict
[
str
,
Any
]:
"""
retireve all group annotations
:param config:
:type config:
:param write:
:type write:
:param pmcid_blacklist:
:type pmcid_blacklist:
:return:
:rtype:
"""
group_gannotations
=
{}
for
group_id
in
config
.
groups_ids
:
pmcid_annotations
=
retrieve_extracted_annotations
(
group_id
,
config
.
api_token
)
group_gannotations
[
group_id
]
=
pmcid_annotations
if
write
:
try
:
write_dir
=
write
[
'dir'
]
format
=
write
[
'format'
]
except
KeyError
as
err
:
logger
.
exception
(
str
(
err
))
raise
if
format
==
'csv'
:
for
group_name
in
config
.
groups_ids
.
values
():
group_dir
=
os
.
path
.
join
(
write_dir
,
f'csv/
{
group_name
}
'
)
if
not
os
.
path
.
exists
(
group_dir
):
os
.
mkdir
(
group_dir
)
logger
.
info
(
'save annotations as csv files...'
)
for
group_id
in
group_gannotations
:
group_dir
=
os
.
path
.
join
(
write_dir
,
f'csv/
{
config
.
groups_ids
[
group_id
]
}
'
)
for
pmcid
,
pmcid_annotations
in
group_gannotations
[
group_id
].
items
():
if
pmcid_blacklist
and
pmcid
in
pmcid_blacklist
:
continue
if
config
.
white_list
and
(
pmcid
not
in
config
.
white_list
):
continue
fname
=
os
.
path
.
join
(
group_dir
,
f'
{
pmcid
}
-
{
config
.
groups_ids
[
group_id
]
}
-
{
group_id
}
.csv'
)
to_csv
(
pmcid_annotations
,
fname
)
elif
format
==
'json'
:
write_dir
=
os
.
path
.
join
(
write_dir
,
'json'
)
if
not
os
.
path
.
exists
(
write_dir
):
os
.
mkdir
(
write_dir
)
with
open
(
os
.
path
.
join
(
write_dir
,
'extracted_annotations.json'
),
'w'
)
as
f
:
json
.
dump
(
group_gannotations
,
f
)
else
:
raise
ValueError
(
f'Wrong format:
{
format
}
, use csv or json'
)
return
group_gannotations
def retrieve_groups_annotations_stats(config: 'APIconfig', pmcid_blacklist=None) -> Dict[str, Any]:
    """
    count the annotations of every group and print the statistics

    For each group the result holds ``'total count'`` (annotations kept),
    ``'rel count'`` (kept annotations carrying one of the tags ``AMB``,
    ``ABM``, ``ABG``, ``YGD`` or ``NGD``) and ``'pmcid'`` (set of pmcids
    seen). Annotations whose pmcid is in ``pmcid_blacklist`` or, when
    ``config.white_list`` is non-empty, missing from it are not counted.

    :param config: API settings (``groups_ids``, ``api_token``, ``white_list`` are read)
    :type config: APIconfig
    :param pmcid_blacklist: pmcids to leave out of the statistics
    :type pmcid_blacklist: collection of str or None
    :return: statistics keyed by group ID
    :rtype: dict
    """
    # NOTE(review): 'AMB' and 'ABM' are both listed in the original check --
    # confirm that both spellings are intended.
    rel_tags = ('AMB', 'ABM', 'ABG', 'YGD', 'NGD')

    group_gannotations = {}
    for group_id in config.groups_ids:
        stats = {'total count': 0, 'rel count': 0, 'pmcid': set()}
        group_gannotations[group_id] = stats
        for annotation in retrieve_annotations(group_id, api_token=config.api_token):
            # second-to-last '/'-separated piece of the source is the pmcid
            pmcid = annotation['source'].strip().split('/')[-2]
            if pmcid_blacklist and pmcid in pmcid_blacklist:
                continue
            # same rule as retrieve_groups_annotations: empty white list = no filter
            if config.white_list and (pmcid not in config.white_list):
                continue
            stats['pmcid'].add(pmcid)
            stats['total count'] += 1
            if any(tag in annotation['tags'] for tag in rel_tags):
                stats['rel count'] += 1
        print(stats)
    print(group_gannotations)
    return group_gannotations
if __name__ == '__main__':
    # Script entry point: load the settings and print per-group statistics.
    api_cfg = APIconfig('setting.config')
    WRITE_DIR = "annotation/full"
    # The API token is a secret, so it is deliberately not echoed to stdout.
    print(api_cfg.groups_ids, len(api_cfg.white_list))
    # pilot
    # retrieve_groups_annotations(api_cfg, write={'format': 'csv', 'dir': 'annotation'}, pmcid_blacklist=['PMC5573731'])
    # batch
    # retrieve_groups_annotations(api_cfg, write={'format': 'csv', 'dir': WRITE_DIR})
    retrieve_groups_annotations_stats(api_cfg)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment