Commit 850ceb6f authored by Lukas Pravda's avatar Lukas Pravda
Browse files

Refactoring

* Factor out sanitization process
* add logging
parent c1caee7b
......@@ -4,44 +4,42 @@ name: ccdutils documentation
on:
push:
branches:
branches:
- master
pull_request:
branches:
branches:
- master
jobs:
documentation:
name: Generate documentation
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
persist-credentials: false
- name: Set up python
uses: actions/setup-python@v1
with:
python-version: 3.7
- name: Set up environment and generate docs
uses: s-weigand/setup-conda@v1
with:
activate-conda: true
conda-channels: conda-forge
- run: |
- uses: actions/checkout@v2
with:
persist-credentials: false
- name: Set up python
uses: actions/setup-python@v1
with:
python-version: 3.7
- name: Set up environment and generate docs
uses: s-weigand/setup-conda@v1
with:
activate-conda: true
conda-channels: conda-forge
- run: |
conda install rdkit=2020.03.6
pip install -e ".[docs]"
- run: |
pip install -e ".[docs]"
- run: |
cd doc
make html
- name: Deploy pages
uses: peaceiris/actions-gh-pages@v3
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: ./doc/_build/html
- name: Deploy pages
uses: peaceiris/actions-gh-pages@v3
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: ./doc/_build/html
......@@ -18,21 +18,21 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
persist-credentials: false
- name: Set up python
uses: actions/setup-python@v1
with:
python-version: 3.7
- name: Set up environment and run pytest
uses: s-weigand/setup-conda@v1
with:
activate-conda: true
conda-channels: conda-forge
- run: |
conda install rdkit=2020.03.6
pip install -e ".[tests]"
- run: pytest --cov=pdbeccdutils
- uses: actions/checkout@v2
with:
persist-credentials: false
- name: Set up python
uses: actions/setup-python@v1
with:
python-version: 3.7
- name: Set up environment and run pytest
uses: s-weigand/setup-conda@v1
with:
activate-conda: true
conda-channels: conda-forge
- run: |
conda install rdkit=2020.03.6
pip install -e ".[tests]"
- run: pytest --cov=pdbeccdutils
......@@ -9,20 +9,19 @@ before_script:
test:
script:
- pytest --cov=pdbeccdutils
- pytest --cov=pdbeccdutils
only:
- master
- master
pages:
script:
- cd doc
- make html
- mv _build/html/ ../public/
- cd doc
- make html
- mv _build/html/ ../public/
artifacts:
paths:
- public
- public
only:
- master
- master
![GitHub](https://img.shields.io/github/license/pdbeurope/ccdutils) ![PYPi](https://img.shields.io/pypi/v/pdbeccdutils?color=green&style=flat) ![ccdutils documentation](https://github.com/PDBeurope/ccdutils/workflows/ccdutils%20documentation/badge.svg) ![ccdutils tests](https://github.com/PDBeurope/ccdutils/workflows/ccdutils%20tests/badge.svg)
[![CodeFactor](https://www.codefactor.io/repository/github/pdbeurope/ccdutils/badge/master)](https://www.codefactor.io/repository/github/pdbeurope/ccdutils/overview/master) ![PYPi](https://img.shields.io/pypi/v/pdbeccdutils?color=green&style=flat) ![GitHub](https://img.shields.io/github/license/pdbeurope/ccdutils) ![ccdutils documentation](https://github.com/PDBeurope/ccdutils/workflows/ccdutils%20documentation/badge.svg) ![ccdutils tests](https://github.com/PDBeurope/ccdutils/workflows/ccdutils%20tests/badge.svg)
# pdbeccdutils
......@@ -45,8 +43,7 @@
## TODO list
* Port the rest of the important functionality implemented by Oliver
* Add more unit/regression tests to get at least higher code coverage.
* Add more unit/regression tests to get higher code coverage.
* Further improvements of the documentation.
## Notes
......@@ -67,7 +64,7 @@ Otherwise it cannot pick `rdkit` module. `sphinx_rtd_theme` is a theme providing
* Generate *.rst* files to be included as a part of the documentation. Inside the directory `pdbeccdutils/doc` run the following commands to generate documentation.
* Alternatively, use the `recommonmark` package along with the proper configuration to get the Markdown working.
Use the following to generate the initial markup files to be used by sphinx. This needs to be rerun whenever another sub-package is added.
```console
......
......@@ -99,7 +99,7 @@ html_logo = "logo.png"
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
#html_css_files = ["css/styles.css"]
# html_css_files = ["css/styles.css"]
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
......@@ -191,7 +191,8 @@ def setup(app):
)
app.add_transform(AutoStructify)
app.connect(
"autodoc-process-docstring", no_namedtuple_attrib_docstring,
"autodoc-process-docstring",
no_namedtuple_attrib_docstring,
)
......@@ -205,4 +206,3 @@ def no_namedtuple_attrib_docstring(app, what, name, obj, options, lines):
# endregion
......@@ -26,17 +26,20 @@ of molecules. The basic use can be as easy as this:
"""
import os
import sys
from datetime import date
from typing import Dict, List, NamedTuple
import rdkit
from pdbecif.mmcif_io import MMCIF2Dict
from pdbeccdutils.core.component import Component
from pdbeccdutils.core.exceptions import CCDUtilsError
from pdbeccdutils.core.models import CCDProperties, Descriptor, ReleaseStatus
from pdbeccdutils.helpers import collection_ext, conversions
from pdbeccdutils.core.models import (
CCDProperties,
ConformerType,
Descriptor,
ReleaseStatus,
)
from pdbeccdutils.helpers import collection_ext, conversions, mol_tools, logging
from pdbecif.mmcif_io import MMCIF2Dict
class CCDReaderResult(NamedTuple):
......@@ -52,11 +55,13 @@ class CCDReaderResult(NamedTuple):
warnings (list[str]): A list of any warnings
found while reading the CCD. If no warnings found `warnings`
will be empty.
sanitized (bool): Whether or not the molecule was sanitized
"""
warnings: List[str]
errors: List[str]
component: Component
sanitized: bool
def read_pdb_cif_file(path_to_cif: str, sanitize: bool = True) -> CCDReaderResult:
......@@ -110,11 +115,10 @@ def read_pdb_components_file(
for k, v in MMCIF2Dict().parse(path_to_cif).items():
try:
result_bag[k] = _parse_pdb_mmcif(v)
result_bag[k] = _parse_pdb_mmcif(v, sanitize)
except CCDUtilsError as e:
print(
f"ERROR: Data block {k} not processed. Reason: ({str(e)}).",
file=sys.stderr,
logging.logger.error(
f"ERROR: Data block {k} not processed. Reason: ({str(e)})."
)
return result_bag
......@@ -134,8 +138,9 @@ def _parse_pdb_mmcif(cif_dict, sanitize=True):
CCDReaderResult: internal representation with the results
of parsing and Mol object.
"""
warnings = list()
errors = list()
warnings = []
errors = []
sanitized = False
mol = rdkit.Chem.RWMol()
atoms_dict = _preprocess_pdb_parser_output(cif_dict, "_chem_comp_atom", warnings)
......@@ -153,12 +158,17 @@ def _parse_pdb_mmcif(cif_dict, sanitize=True):
_parse_pdb_bonds(mol, bonds_dict, atoms_dict, errors)
_handle_implicit_hydrogens(mol)
if sanitize:
sanitized = mol_tools.sanitize(mol)
descriptors = _parse_pdb_descriptors(descriptors_dict, "descriptor")
descriptors += _parse_pdb_descriptors(identifiers_dict, "identifier")
properties = _parse_pdb_properties(properties_dict)
comp = Component(mol.GetMol(), cif_dict, properties, descriptors, sanitize=sanitize)
reader_result = CCDReaderResult(warnings=warnings, errors=errors, component=comp)
comp = Component(mol.GetMol(), cif_dict, properties, descriptors)
reader_result = CCDReaderResult(
warnings=warnings, errors=errors, component=comp, sanitized=sanitized
)
return reader_result
......@@ -212,20 +222,23 @@ def _parse_pdb_conformers(mol, atoms):
if not atoms:
return
ideal = _setup_pdb_conformer(atoms, "pdbx_model_Cartn_{}_ideal")
model = _setup_pdb_conformer(atoms, "model_Cartn_{}")
ideal = _setup_pdb_conformer(
atoms, "pdbx_model_Cartn_{}_ideal", ConformerType.Ideal.name
)
model = _setup_pdb_conformer(atoms, "model_Cartn_{}", ConformerType.Model.name)
mol.AddConformer(ideal, assignId=True)
mol.AddConformer(model, assignId=True)
def _setup_pdb_conformer(atoms, label):
def _setup_pdb_conformer(atoms, label, name):
"""
Setup a conformer
Args:
atoms (dict): mmcif category with the atom info.
label (str): Namespace with the [x,y,z] coordinates.
name (str): Conformer name.
Returns:
rdkit.Chem.rdchem.Conformer: Conformer of the component.
......@@ -243,6 +256,8 @@ def _setup_pdb_conformer(atoms, label):
atom_position = rdkit.Chem.rdGeometry.Point3D(x, y, z)
conformer.SetAtomPosition(i, atom_position)
conformer.SetProp("name", name)
return conformer
......@@ -352,6 +367,11 @@ def _parse_pdb_properties(chem_comp):
rel_status = chem_comp["pdbx_release_status"][0]
rel_status = ReleaseStatus.from_str(chem_comp["pdbx_release_status"][0])
weight = (
0.0
if chem_comp["formula_weight"][0] == "?"
else float(chem_comp["formula_weight"][0])
)
properties = CCDProperties(
id=chem_comp["id"][0],
......@@ -359,9 +379,7 @@ def _parse_pdb_properties(chem_comp):
formula=chem_comp["formula"][0],
modified_date=d,
pdbx_release_status=rel_status,
weight=0.0
if chem_comp["formula_weight"][0] == "?"
else float(chem_comp["formula_weight"][0]),
weight=weight,
)
return properties
......
......@@ -28,15 +28,15 @@ import math
import xml.etree.ElementTree as ET
from collections import OrderedDict
from typing import List
from xml.dom import minidom
import pdbeccdutils
import pdbecif.mmcif_io as mmcif
import rdkit
import pdbeccdutils
from defusedxml import minidom
from pdbeccdutils.core.component import Component
from pdbeccdutils.core.exceptions import CCDUtilsError
from pdbeccdutils.core.models import ConformerType
from pdbeccdutils.helpers.logging import logger
def write_molecule(
......@@ -165,24 +165,24 @@ def to_sdf_str(
Returns:
str: String representation of the component in the SDF format
"""
(mol_to_save, conf_id, conf_type) = _prepate_structure(
component, remove_hs, conf_type
)
(mol_to_save, _, conf_type) = _prepate_structure(component, remove_hs, conf_type)
mol_block = []
mappings = {}
if conf_type == ConformerType.AllConformers:
conformers = [ConformerType.Model, ConformerType.Ideal, ConformerType.Computed]
else:
conformers = [conf_type]
try:
for conf in conformers:
for c in conformers:
try:
conf_id = -1
if c != ConformerType.AllConformers:
conf_id = component.get_conformer(c).GetId()
block = [
f"{component.id} - {conf.name} conformer",
rdkit.Chem.MolToMolBlock(
mol_to_save, confId=component.conformers_mapping[conf]
).strip(),
f"{component.id} - {c.name} conformer",
rdkit.Chem.MolToMolBlock(mol_to_save, confId=conf_id).strip(),
"$$$$\n",
]
mol_block += block
......@@ -192,8 +192,7 @@ def to_sdf_str(
else:
raise CCDUtilsError(f"Error writing SDF file - {e}")
except Exception:
mappings = {m.name: component.conformers_mapping[m] for m in conformers}
mol_block = _to_sdf_str_fallback(mol_to_save, component.id, mappings)
mol_block = _to_sdf_str_fallback(mol_to_save, component.id, conformers)
return "\n".join(mol_block)
......@@ -544,11 +543,16 @@ def _prepate_structure(component, remove_hs, conf_type):
tuple(rdkit.Mol,int,ConformerType): mol along with properties
to be exported.
"""
conf_id = (
0
if conf_type == ConformerType.Depiction
else component.conformers_mapping[conf_type]
)
conf_id = -1 # this is AllConformers options
if conf_type == ConformerType.Depiction:
conf_id = 0
else:
for c in component.mol.GetConformers():
if c.GetProp("name") == conf_type.name:
conf_id = c.GetId()
break
mol_to_save = (
component.mol2D if conf_type == ConformerType.Depiction else component.mol
)
......@@ -794,11 +798,14 @@ def _get_cml_bond_type(bond_order):
"""
if bond_order == rdkit.Chem.rdchem.BondType.SINGLE:
return "1"
elif bond_order == rdkit.Chem.rdchem.BondType.DOUBLE:
if bond_order == rdkit.Chem.rdchem.BondType.DOUBLE:
return "2"
elif bond_order == rdkit.Chem.rdchem.BondType.TRIPLE:
if bond_order == rdkit.Chem.rdchem.BondType.TRIPLE:
return "3"
elif bond_order == rdkit.Chem.rdchem.BondType.AROMATIC:
if bond_order == rdkit.Chem.rdchem.BondType.AROMATIC:
return "A"
else:
return str(bond_order)
......@@ -823,13 +830,14 @@ def _get_ccd_cif_bond_stereo(bond):
rdkit.Chem.rdchem.BondStereo.STEREOCIS,
):
return "E"
elif stereo in (
if stereo in (
rdkit.Chem.rdchem.BondStereo.STEREOZ,
rdkit.Chem.rdchem.BondStereo.STEREOTRANS,
):
return "Z"
else:
return "N"
return "N"
def _get_ccd_cif_bond_type(bond):
......@@ -848,16 +856,20 @@ def _get_ccd_cif_bond_type(bond):
if bond_order == rdkit.Chem.rdchem.BondType.SINGLE:
return "SING"
elif bond_order == rdkit.Chem.rdchem.BondType.DOUBLE:
if bond_order == rdkit.Chem.rdchem.BondType.DOUBLE:
return "DOUB"
elif bond_order == rdkit.Chem.rdchem.BondType.TRIPLE:
if bond_order == rdkit.Chem.rdchem.BondType.TRIPLE:
return "TRIP"
elif bond_order == rdkit.Chem.rdchem.BondType.AROMATIC:
if bond_order == rdkit.Chem.rdchem.BondType.AROMATIC:
return "AROM"
elif bond_order == rdkit.Chem.rdchem.BondType.QUADRUPLE:
if bond_order == rdkit.Chem.rdchem.BondType.QUADRUPLE:
return "QUAD"
else:
return "SING"
return "SING"
def _get_ccd_cif_chiral_type(atom):
......@@ -877,10 +889,11 @@ def _get_ccd_cif_chiral_type(atom):
if chiral_type == rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW:
return "R"
elif chiral_type == rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW:
if chiral_type == rdkit.Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW:
return "S"
else:
return "N"
return "N"
def _get_atom_coord(component, at_id, conformer_type):
......@@ -895,39 +908,49 @@ def _get_atom_coord(component, at_id, conformer_type):
Returns:
rdkit.Geometry.rdGeometry.Point3D: 3D coordinates of the atom.
"""
conf_id = component.conformers_mapping[conformer_type]
conformer = None
for c in component.mol.GetConformers():
if c.GetProp("name") == conformer_type.name:
conformer = c
break
return component.mol.GetConformer(conf_id).GetAtomPosition(at_id)
return conformer.GetAtomPosition(at_id)
# region fallbacks
def _to_sdf_str_fallback(mol, ccd_id, mappings):
def _to_sdf_str_fallback(mol, ccd_id, conformers):
"""Fallback method to generate SDF file in case the default one in
RDKit fails.
Args:
mol (rdkit.Chem.rdchem.Mol): rdkit mol to be exported
ccd_id (str): component id.
mappings (dict of str): Deemed mappings to be exported
conformers (list of ConformerType): List of conformer types
to be exported.
Returns:
list of str: SDF representation of the component
"""
content = []
for k, v in mappings.items():
try:
rdkit_conformer = mol.GetConformer(v)
except ValueError:
continue
for c in conformers:
rdkit_conformer = None
if c == ConformerType.AllConformers:
rdkit_conformer = mol.GetConformer()
else:
for conf in mol.GetConformers():
if conf.GetProp("name") == c.name:
rdkit_conformer = conf
break
atom_count = mol.GetNumAtoms()
bond_count = mol.GetNumBonds()
content += [
f"{ccd_id} - {k} conformer",
f"{ccd_id} - {c.name} conformer",
" RDKit 3D",
"\n" f"{atom_count:>3}{bond_count:3} 0 0 0 0 0 0 0 0999 V2000",
]
......@@ -1041,20 +1064,20 @@ def __charge_to_sdf(charge):
"""
if charge == -3:
return "7"
elif charge == -2:
if charge == -2:
return "6"
elif charge == -1:
if charge == -1:
return "5"
elif charge == 0:
if charge == 0:
return "0"
elif charge == 1:
if charge == 1:
return "+1"
elif charge == 2:
if charge == 2:
return "+2"
elif charge == 3:
if charge == 3:
return "+4"
else:
return "0"
return "0"
def __bond_stereo_to_sdf(bond):
......@@ -1095,14 +1118,14 @@ def __bond_type_to_sdf(bond):
if bond_order == rdkit.Chem.rdchem.BondType.SINGLE:
return "1"
elif bond_order == rdkit.Chem.rdchem.BondType.DOUBLE:
if bond_order == rdkit.Chem.rdchem.BondType.DOUBLE:
return "2"
elif bond_order == rdkit.Chem.rdchem.BondType.TRIPLE:
if bond_order == rdkit.Chem.rdchem.BondType.TRIPLE:
return "3"
elif bond_order == rdkit.Chem.rdchem.BondType.AROMATIC:
if bond_order == rdkit.Chem.rdchem.BondType.AROMATIC:
return "A"
else:
return "0"
return "0"
def __post_process_cif_category(cif_copy, category_name):
......@@ -1382,11 +1405,11 @@ def _add_rdkit_conformer_cif(component, cif_copy, remove_hs):
the CIF file.
remove_hs (boolean): Whether or not hydrogen atoms should be written
"""
try:
conformer = component.mol.GetConformer(
component.conformers_mapping[ConformerType.Computed]
)
conformer = component.get_conformer(ConformerType.Computed)
except ValueError:
logger.warning("Computed conformer does not exist.")
return # no conformer nothing to write, we quit
category = "_pdbe_chem_comp_rdkit_conformer"
......
......@@ -17,9 +17,7 @@
import json
import re
import sys