Commit 0387f0d4 authored by Lukas Pravda's avatar Lukas Pravda
Browse files

Merge branch 'dev'

parents 98ee0a75 ee3fe336
......@@ -33,9 +33,8 @@ jobs:
activate-conda: true
conda-channels: conda-forge
- run: |
conda install rdkit
pip install -e .
pip install pytest-cov sphinx sphinx_rtd_theme recommonmark sphinx-autodoc-typehints sphinx-markdown-tables
conda install rdkit=2020.03.6
pip install -e ".[docs]"
- run: |
cd doc
......
......@@ -2,7 +2,15 @@
name: ccdutils tests
on: [push, pull_request]
on:
push:
branches:
- master
- dev
pull_request:
branches:
- master
- dev
jobs:
pytest:
......@@ -25,7 +33,6 @@ jobs:
activate-conda: true
conda-channels: conda-forge
- run: |
conda install rdkit
pip install -e .
pip install pytest-cov
conda install rdkit=2020.03.6
pip install -e ".[tests]"
- run: pytest --cov=pdbeccdutils
clean:
rm -rf build/* && rm -rf dist/* && rm -rf pdbeccdutils.egg-info && rm -rf htmlcov && rm .coverage
test-report:
pytest --cov=pdbeccdutils --cov-report=html
package:
pytest;\
python setup.py bdist_wheel;\
pip uninstall pdbeccdutils;\
pip install -e .;\
find dist -name "pdbeccdutils-*-py3*" -exec pip install {} \;
make upload:
twine upload dist/* --verbose
make pypi: package upload clean
......@@ -7,7 +7,7 @@
* The tools use:
* [RDKit](http://www.rdkit.org/) for chemistry
* [PDBeCIF](https://gitlab.com/glenveegee/PDBeCIF.git) cif parser.
* [PDBeCIF](https://github.com/PDBeurope/pdbecif) cif parser.
* [scipy](https://www.scipy.org/) for depiction quality check.
* [numpy](https://www.numpy.org/) for molecular scaling.
......@@ -37,15 +37,15 @@
* Generation of 3D conformations.
* Fragment library search.
* Chemical scaffolds (Murcko scaffold, Murcko general, BRICS).
* Lightweight implementation of parity method by Jon Tyczak.
* Lightweight implementation of [parity method](https://doi.org/10.1016/j.str.2018.02.009) by Jon Tyczak.
* RDKit molecular properties per component.
* UniChem mapping.
## TODO list
* Port rest of the important functionality implemented by Oliver
* Add more unit/regression tests to get at least 100% code coverage.
* Further improvement of the documentation.
* Add more unit/regression tests to get at least higher code coverage.
* Further improvements of the documentation.
## Notes
......@@ -76,4 +76,4 @@ Use this to re-generate the documentation from the doc/ directory:
```console
make html
```
\ No newline at end of file
```
__version__ = '0.5.7'
__version__ = "0.5.8"
......@@ -92,24 +92,32 @@ def compare_molecules(template, query, thresh=0.01, exact_match=False):
query_atoms = query.GetNumAtoms()
min_num_atoms = min(template_atoms, query_atoms)
max_sim_score = float(min_num_atoms) / float(template_atoms + query_atoms - min_num_atoms)
max_sim_score = float(min_num_atoms) / float(
template_atoms + query_atoms - min_num_atoms
)
if max_sim_score < thresh:
return ParityResult({}, 0.0)
if not exact_match:
mcs_graph = rdFMCS.FindMCS([template, query],
bondCompare=rdFMCS.BondCompare.CompareAny,
atomCompare=rdFMCS.AtomCompare.CompareAny,
timeout=40,
completeRingsOnly=True)
mcs_graph = rdFMCS.FindMCS(
[template, query],
bondCompare=rdFMCS.BondCompare.CompareAny,
atomCompare=rdFMCS.AtomCompare.CompareAny,
timeout=40,
completeRingsOnly=True,
)
else:
mcs_graph = rdFMCS.FindMCS([template, query],
bondCompare=rdFMCS.BondCompare.CompareOrderExact,
atomCompare=rdFMCS.AtomCompare.CompareElements,
timeout=40,
completeRingsOnly=True)
substructure, sim_score = _generate_sim_score(template, query, mcs_graph.smartsString)
mcs_graph = rdFMCS.FindMCS(
[template, query],
bondCompare=rdFMCS.BondCompare.CompareOrderExact,
atomCompare=rdFMCS.AtomCompare.CompareElements,
timeout=40,
completeRingsOnly=True,
)
substructure, sim_score = _generate_sim_score(
template, query, mcs_graph.smartsString
)
return ParityResult(substructure, sim_score)
......@@ -76,14 +76,16 @@ def read_pdb_cif_file(path_to_cif: str, sanitize: bool = True) -> CCDReaderResul
the internal representation of the component.
"""
if not os.path.isfile(path_to_cif):
raise ValueError('File \'{}\' does not exists'.format(path_to_cif))
raise ValueError("File '{}' does not exists".format(path_to_cif))
cif_dict = list(MMCIF2Dict().parse(path_to_cif).values())[0]
return _parse_pdb_mmcif(cif_dict, sanitize)
def read_pdb_components_file(path_to_cif: str, sanitize: bool = True) -> Dict[str, CCDReaderResult]:
def read_pdb_components_file(
path_to_cif: str, sanitize: bool = True
) -> Dict[str, CCDReaderResult]:
"""
Process multiple compounds stored in the wwPDB CCD
`components.cif` file.
......@@ -102,7 +104,7 @@ def read_pdb_components_file(path_to_cif: str, sanitize: bool = True) -> Dict[st
the components in the `components.cif` file.
"""
if not os.path.isfile(path_to_cif):
raise ValueError('File \'{}\' does not exists'.format(path_to_cif))
raise ValueError("File '{}' does not exists".format(path_to_cif))
result_bag = {}
......@@ -110,7 +112,10 @@ def read_pdb_components_file(path_to_cif: str, sanitize: bool = True) -> Dict[st
try:
result_bag[k] = _parse_pdb_mmcif(v)
except CCDUtilsError as e:
print(f'ERROR: Data block {k} not processed. Reason: ({str(e)}).', file=sys.stderr)
print(
f"ERROR: Data block {k} not processed. Reason: ({str(e)}).",
file=sys.stderr,
)
return result_bag
......@@ -133,19 +138,23 @@ def _parse_pdb_mmcif(cif_dict, sanitize=True):
errors = list()
mol = rdkit.Chem.RWMol()
atoms_dict = _preprocess_pdb_parser_output(cif_dict, '_chem_comp_atom', warnings)
bonds_dict = _preprocess_pdb_parser_output(cif_dict, '_chem_comp_bond', warnings)
identifiers_dict = _preprocess_pdb_parser_output(cif_dict, '_pdbx_chem_comp_identifier', warnings)
descriptors_dict = _preprocess_pdb_parser_output(cif_dict, '_pdbx_chem_comp_descriptor', warnings)
properties_dict = _preprocess_pdb_parser_output(cif_dict, '_chem_comp', warnings)
atoms_dict = _preprocess_pdb_parser_output(cif_dict, "_chem_comp_atom", warnings)
bonds_dict = _preprocess_pdb_parser_output(cif_dict, "_chem_comp_bond", warnings)
identifiers_dict = _preprocess_pdb_parser_output(
cif_dict, "_pdbx_chem_comp_identifier", warnings
)
descriptors_dict = _preprocess_pdb_parser_output(
cif_dict, "_pdbx_chem_comp_descriptor", warnings
)
properties_dict = _preprocess_pdb_parser_output(cif_dict, "_chem_comp", warnings)
_parse_pdb_atoms(mol, atoms_dict)
_parse_pdb_conformers(mol, atoms_dict)
_parse_pdb_bonds(mol, bonds_dict, atoms_dict, errors)
_handle_implicit_hydrogens(mol)
descriptors = _parse_pdb_descriptors(descriptors_dict, 'descriptor')
descriptors += _parse_pdb_descriptors(identifiers_dict, 'identifier')
descriptors = _parse_pdb_descriptors(descriptors_dict, "descriptor")
descriptors += _parse_pdb_descriptors(identifiers_dict, "identifier")
properties = _parse_pdb_properties(properties_dict)
comp = Component(mol.GetMol(), cif_dict, properties, descriptors, sanitize=sanitize)
......@@ -167,23 +176,23 @@ def _parse_pdb_atoms(mol, atoms):
if not atoms:
return
for i in range(len(atoms['atom_id'])):
element = atoms['type_symbol'][i]
for i in range(len(atoms["atom_id"])):
element = atoms["type_symbol"][i]
element = element if len(element) == 1 else element[0] + element[1].lower()
isotope = None
if element == 'D':
element = 'H'
if element == "D":
element = "H"
isotope = 2
elif element == 'X':
element = '*'
elif element == "X":
element = "*"
atom = rdkit.Chem.Atom(element)
#atom.SetChiralTag(_atom_chiral_tag(atoms['pdbx_stereo_config'][i]))
atom.SetProp('name', atoms['atom_id'][i])
atom.SetProp('alt_name', atoms['alt_atom_id'][i])
atom.SetBoolProp('leaving_atom', atoms['pdbx_leaving_atom_flag'][i] == "Y")
atom.SetFormalCharge(conversions.str_to_int(atoms['charge'][i]))
# atom.SetChiralTag(_atom_chiral_tag(atoms['pdbx_stereo_config'][i]))
atom.SetProp("name", atoms["atom_id"][i])
atom.SetProp("alt_name", atoms["alt_atom_id"][i])
atom.SetBoolProp("leaving_atom", atoms["pdbx_leaving_atom_flag"][i] == "Y")
atom.SetFormalCharge(conversions.str_to_int(atoms["charge"][i]))
if isotope is not None:
atom.SetIsotope(isotope)
......@@ -203,8 +212,8 @@ def _parse_pdb_conformers(mol, atoms):
if not atoms:
return
ideal = _setup_pdb_conformer(atoms, 'pdbx_model_Cartn_{}_ideal')
model = _setup_pdb_conformer(atoms, 'model_Cartn_{}')
ideal = _setup_pdb_conformer(atoms, "pdbx_model_Cartn_{}_ideal")
model = _setup_pdb_conformer(atoms, "model_Cartn_{}")
mol.AddConformer(ideal, assignId=True)
mol.AddConformer(model, assignId=True)
......@@ -224,12 +233,12 @@ def _setup_pdb_conformer(atoms, label):
if not atoms:
return
conformer = rdkit.Chem.Conformer(len(atoms['atom_id']))
conformer = rdkit.Chem.Conformer(len(atoms["atom_id"]))
for i in range(len(atoms['atom_id'])):
x = conversions.str_to_float(atoms[label.format(('x'))][i])
y = conversions.str_to_float(atoms[label.format(('y'))][i])
z = conversions.str_to_float(atoms[label.format(('z'))][i])
for i in range(len(atoms["atom_id"])):
x = conversions.str_to_float(atoms[label.format(("x"))][i])
y = conversions.str_to_float(atoms[label.format(("y"))][i])
z = conversions.str_to_float(atoms[label.format(("z"))][i])
atom_position = rdkit.Chem.rdGeometry.Point3D(x, y, z)
conformer.SetAtomPosition(i, atom_position)
......@@ -250,18 +259,26 @@ def _parse_pdb_bonds(mol, bonds, atoms, errors):
if not bonds:
return
for i in range(len(bonds['atom_id_1'])):
atom_1 = collection_ext.find_element_in_list(atoms['atom_id'], bonds['atom_id_1'][i])
atom_2 = collection_ext.find_element_in_list(atoms['atom_id'], bonds['atom_id_2'][i])
bond_order = _bond_pdb_order(bonds['value_order'][i])
for i in range(len(bonds["atom_id_1"])):
atom_1 = collection_ext.find_element_in_list(
atoms["atom_id"], bonds["atom_id_1"][i]
)
atom_2 = collection_ext.find_element_in_list(
atoms["atom_id"], bonds["atom_id_2"][i]
)
bond_order = _bond_pdb_order(bonds["value_order"][i])
if any(a is None for a in [atom_1, atom_2, bond_order]):
errors.append('Problem with the {}-th bond in the _chem_comp_bond group'.format(i + 1))
errors.append(
"Problem with the {}-th bond in the _chem_comp_bond group".format(i + 1)
)
try:
mol.AddBond(atom_1, atom_2, bond_order)
except RuntimeError:
errors.append(f'Duplicit bond {bonds["atom_id_1"][i]}-{bonds["atom_id_2"][i]}')
errors.append(
f'Duplicit bond {bonds["atom_id_1"][i]}-{bonds["atom_id_2"][i]}'
)
def _handle_implicit_hydrogens(mol):
......@@ -285,7 +302,7 @@ def _handle_implicit_hydrogens(mol):
atom.SetNoImplicit(no_Hs)
def _parse_pdb_descriptors(pdbx_chem_comp_descriptors, label='descriptor'):
def _parse_pdb_descriptors(pdbx_chem_comp_descriptors, label="descriptor"):
"""
Parse useful information from _pdbx_chem_comp_* category
......@@ -304,9 +321,11 @@ def _parse_pdb_descriptors(pdbx_chem_comp_descriptors, label='descriptor'):
return descriptors
for i in range(len(pdbx_chem_comp_descriptors[label])):
d = Descriptor(type=pdbx_chem_comp_descriptors['type'][i],
program=pdbx_chem_comp_descriptors['program'][i],
value=pdbx_chem_comp_descriptors[label][i])
d = Descriptor(
type=pdbx_chem_comp_descriptors["type"][i],
program=pdbx_chem_comp_descriptors["program"][i],
value=pdbx_chem_comp_descriptors[label][i],
)
descriptors.append(d)
return descriptors
......@@ -324,18 +343,26 @@ def _parse_pdb_properties(chem_comp):
"""
properties = None
if chem_comp:
mod_date = chem_comp['pdbx_modified_date'][0].split('-')
d = date(1970, 1, 1) if mod_date[0] == '?' else date(int(mod_date[0]), int(mod_date[1]), int(mod_date[2]))
rel_status = chem_comp['pdbx_release_status'][0]
rel_status = ReleaseStatus.from_str(chem_comp['pdbx_release_status'][0])
properties = CCDProperties(id=chem_comp['id'][0],
name=chem_comp['name'][0],
formula=chem_comp['formula'][0],
modified_date=d,
pdbx_release_status=rel_status,
weight=0.0 if chem_comp['formula_weight'][0] == '?' else float(chem_comp['formula_weight'][0]))
mod_date = chem_comp["pdbx_modified_date"][0].split("-")
d = (
date(1970, 1, 1)
if mod_date[0] == "?"
else date(int(mod_date[0]), int(mod_date[1]), int(mod_date[2]))
)
rel_status = chem_comp["pdbx_release_status"][0]
rel_status = ReleaseStatus.from_str(chem_comp["pdbx_release_status"][0])
properties = CCDProperties(
id=chem_comp["id"][0],
name=chem_comp["name"][0],
formula=chem_comp["formula"][0],
modified_date=d,
pdbx_release_status=rel_status,
weight=0.0
if chem_comp["formula_weight"][0] == "?"
else float(chem_comp["formula_weight"][0]),
)
return properties
......@@ -354,13 +381,15 @@ def _preprocess_pdb_parser_output(dictionary, label, warnings):
dict: unified dictionary for structure parsing.
"""
if label not in dictionary:
warnings.append('Namespace {} does not exist.'.format(label))
warnings.append("Namespace {} does not exist.".format(label))
return []
check_element = list(dictionary[label].keys())[0]
values = (dictionary[label]
if isinstance(dictionary[label][check_element], list)
else {k: [v] for k, v in dictionary[label].items()})
values = (
dictionary[label]
if isinstance(dictionary[label][check_element], list)
else {k: [v] for k, v in dictionary[label].items()}
)
return values
......@@ -374,11 +403,11 @@ def _bond_pdb_order(value_order):
Returns:
rdkit.Chem.rdchem.BondType: -- bond type
"""
if value_order == 'SING':
if value_order == "SING":
return rdkit.Chem.rdchem.BondType(1)
if value_order == 'DOUB':
if value_order == "DOUB":
return rdkit.Chem.rdchem.BondType(2)
if value_order == 'TRIP':
if value_order == "TRIP":
return rdkit.Chem.rdchem.BondType(3)
return None
......@@ -393,12 +422,14 @@ def _atom_chiral_tag(tag):
Returns:
rdkit.Chem.ChiralType: Chiral center in RDKit language.
"""
if tag == 'N':
if tag == "N":
return rdkit.Chem.ChiralType.CHI_UNSPECIFIED
if tag == 'S':
if tag == "S":
return rdkit.Chem.ChiralType.CHI_TETRAHEDRAL_CCW
if tag == 'R':
if tag == "R":
return rdkit.Chem.ChiralType.CHI_TETRAHEDRAL_CW
return rdkit.Chem.ChiralType.CHI_UNSPECIFIED
# endregion parse mmcif
......@@ -116,7 +116,7 @@ def to_pdb_str(
)
info = rdkit.Chem.rdchem.AtomPDBResidueInfo()
info.SetResidueName(f'{component.id:>3}')
info.SetResidueName(f"{component.id:>3}")
info.SetTempFactor(20.0)
info.SetOccupancy(1.0)
info.SetChainId("A")
......@@ -929,8 +929,7 @@ def _to_sdf_str_fallback(mol, ccd_id, mappings):
content += [
f"{ccd_id} - {k} conformer",
" RDKit 3D",
"\n"
f"{atom_count:>3}{bond_count:3} 0 0 0 0 0 0 0 0999 V2000",
"\n" f"{atom_count:>3}{bond_count:3} 0 0 0 0 0 0 0 0999 V2000",
]
for i in range(0, atom_count):
......
......@@ -88,7 +88,6 @@ class Component:
if sanitize:
self._sanitization_issues = self._sanitize()
if descriptors is not None:
self._descriptors = descriptors
......
......@@ -43,7 +43,11 @@ class DepictionManager:
PubChem templates can be downloaded using PubChemDownloader class.
"""
def __init__(self, pubchem_templates_path: str = '', general_templates_path: str = config.general_templates) -> None:
def __init__(
self,
pubchem_templates_path: str = "",
general_templates_path: str = config.general_templates,
) -> None:
"""
Initialize component which does the ligand depiction.
If Nones is provided as parameters just the defalt RDKit
......@@ -63,16 +67,20 @@ class DepictionManager:
self.coordgen_params.coordgenScaling = 50 / 1.5
self.coordgen_params.templateFileDir = config.coordgen_templates
self.pubchem_templates = pubchem_templates_path if os.path.isdir(pubchem_templates_path) else ''
self.pubchem_templates = (
pubchem_templates_path if os.path.isdir(pubchem_templates_path) else ""
)
self.templates: Dict[str, rdkit.Chem.rdchem.Mol] = OrderedDict()
if os.path.isdir(general_templates_path):
for k in sorted(os.listdir(general_templates_path)):
template = self._load_template(os.path.join(general_templates_path, k))
template_name = k.split('.')[0]
template_name = k.split(".")[0]
self.templates[template_name] = template
def depict_molecule(self, het_id: str, mol: rdkit.Chem.rdchem.Mol) -> DepictionResult:
def depict_molecule(
self, het_id: str, mol: rdkit.Chem.rdchem.Mol
) -> DepictionResult:
"""
Given input molecule tries to generate its depictions.
......@@ -95,7 +103,11 @@ class DepictionManager:
rdkitMol = Chem.RWMol(temp_mol).GetMol()
results = []
pubchem_res = self._get_2D_by_pubchem(het_id, pubchemMol) if self.pubchem_templates else None
pubchem_res = (
self._get_2D_by_pubchem(het_id, pubchemMol)
if self.pubchem_templates
else None
)
template_res = self._get_2D_by_template(templateMol) if self.templates else []
rdkit_res = self._get_2D_by_rdkit(rdkitMol)
......@@ -111,11 +123,12 @@ class DepictionManager:
if results:
to_return = results[0]
fix_conformer(to_return.mol.GetConformer(0))
return to_return
return DepictionResult(source=DepictionSource.Failed, template_name='', mol=None, score=1000)
return DepictionResult(
source=DepictionSource.Failed, template_name="", mol=None, score=1000
)
def _get_pubchem_template_path(self, het_id):
"""Get path to the PubChem template if it exists.
......@@ -126,9 +139,9 @@ class DepictionManager:
Returns:
str: Path to the PubChem layout.
"""
path = os.path.join(self.pubchem_templates, f'{het_id}.sdf')
path = os.path.join(self.pubchem_templates, f"{het_id}.sdf")
return path if os.path.isfile(path) else ''
return path if os.path.isfile(path) else ""
def _load_template(self, path):
"""
......@@ -145,14 +158,14 @@ class DepictionManager:
rdkit.Chem.rdchem.Mol: RDKit representation of the template
"""
mol = Chem.RWMol()
extension = os.path.basename(path).split('.')[1]
extension = os.path.basename(path).split(".")[1]
if extension == 'sdf':
if extension == "sdf":
mol = Chem.MolFromMolFile(path, sanitize=True, removeHs=True)
elif extension == 'pdb':
elif extension == "pdb":
mol = Chem.MolFromPDBFile(path, sanitize=True, removeHs=True)
else:
raise ValueError('Unsupported molecule type \'{}\''.format(extension))
raise ValueError("Unsupported molecule type '{}'".format(extension))
p = Chem.AdjustQueryParameters()
p.makeAtomsGeneric = True
......@@ -175,9 +188,13 @@ class DepictionManager:
try:
rdCoordGen.AddCoords(mol, self.coordgen_params)
flaws = DepictionValidator(mol).depiction_score()
return DepictionResult(source=DepictionSource.RDKit, template_name=None, mol=mol, score=flaws)
return DepictionResult(
source=DepictionSource.RDKit, template_name=None, mol=mol, score=flaws
)
except Exception:
return DepictionResult(source=DepictionSource.Failed, template_name=None, mol=None, score=1000)
return DepictionResult(
source=DepictionSource.Failed, template_name=None, mol=None, score=1000
)
def _get_2D_by_pubchem(self, code, mol):
"""
......@@ -198,11 +215,18 @@ class DepictionManager:
if mol.HasSubstructMatch(template):
AllChem.GenerateDepictionMatching2DStructure(mol, template)
flaws = DepictionValidator(mol).depiction_score()
return DepictionResult(source=DepictionSource.PubChem, template_name=code, mol=mol, score=flaws)
return DepictionResult(
source=DepictionSource.PubChem,
template_name=code,
mol=mol,
score=flaws,
)
except Exception as e:
print(str(e), file=sys.stderr)
return DepictionResult(source=DepictionSource.Failed, template_name=None, mol=None, score=1000)
return DepictionResult(
source=DepictionSource.Failed, template_name=None, mol=None, score=1000
)
def _get_2D_by_template(self, mol):
"""
......@@ -222,8 +246,14 @@ class DepictionManager:
if temp_mol.HasSubstructMatch(template):
AllChem.GenerateDepictionMatching2DStructure(temp_mol, template)
flaws = DepictionValidator(temp_mol).depiction_score()
results.append(DepictionResult(source=DepictionSource.Template,
template_name=key, mol=temp_mol, score=flaws))
results.append(
DepictionResult(
source=DepictionSource.Template,
template_name=key,
mol=temp_mol,
score=flaws,
)
)
except Exception:
pass # if it fails it fails, but generally it wont
......@@ -240,7 +270,10 @@ class DepictionValidator:
self.conformer = mol.GetConformer()
self.bonds = self.mol.GetBonds()
atoms = [self.conformer.GetAtomPosition(i