Commit 92b2b3f7 authored by Lukas Pravda's avatar Lukas Pravda
Browse files

RELEASE 0.3

This is a minor release:

Highlights:
 * Refactored internals of the pdbeccdutils.core
 * Use of CoordGen library -> better images, less failures
 * Introduction of Scaffolding (by Abhik)
 * Extended documentation
 * Added unit tests
parent a14efae7
include pdbeccdutils/data/generic_templates/*
include pdbeccdutils/data/coordgen_templates/*
include pdbeccdutils/fragment_library.tsv
from pdbeccdutils.scripts.murcko_scaffold_cif_cli import main
if __name__ == "__main__":
main()
\ No newline at end of file
......@@ -22,9 +22,9 @@ copyright = '2018, Protein Data Bank in Europe'
author = 'Protein Data Bank in Europe'
# The short X.Y version
version = '0.2'
version = '0.3'
# The full version, including alpha/beta/rc tags
release = '0.2.1'
release = '0.3'
# endregion
......
......@@ -4,16 +4,17 @@
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
```
# pdbeccdutils: Small molecules chemistry toolkit
# pdbeccdutils: Small molecules chemistry toolkit
## User guide
Getting started with pdbeccdutils
```eval_rst
.. toctree::
:maxdepth: 1
guide/intro
guide/install
guide/depictions
......
# pdbeccdutils.core
## pdbeccdutils.core.ccd_reader
```eval_rst
......@@ -18,7 +17,7 @@
## pdbeccdutils.core.structure_writer
```eval_rst
.. automodule:: pdbeccdutils.core.structure_writer
.. automodule:: pdbeccdutils.core.ccd_writer
:members:
```
......@@ -41,4 +40,4 @@
```eval_rst
.. automodule:: pdbeccdutils.core.exceptions
:members:
```
\ No newline at end of file
```
\ No newline at end of file
# pdbeccdutils.scripts
## Protein-ligand interaction pipeline
This script is used to generate protein-ligand interactions information
for bound molecules from mmcif files
......@@ -17,7 +18,7 @@ Journal of Molecular Biology, 429(3), 365–371.
The refactored version can be installed from source using:
```console
git clone -b development https://github.com/lpravda/arpeggio.git
git clone -b development https://github.com/lpravda/arpeggio.git
pip install -e arpeggio
```
......@@ -27,13 +28,22 @@ pip install -e arpeggio
```
## PDBeChem pipeline
```eval_rst
.. automodule:: pdbeccdutils.scripts.process_components_cif_cli
:members:
```
```
## Setup PubChem library
## Setup pubchem library
```eval_rst
.. automodule:: pdbeccdutils.scripts.setup_pubchem_library_cli
:members:
```
```
## Get MURCKO scaffolds
```eval_rst
.. automodule:: pdbeccdutils.scripts.murcko_scaffold_cif_cli
:members:
```
\ No newline at end of file
__version__ = '0.2'
__version__ = '0.3'
......@@ -2,21 +2,39 @@
Jon Tyczak to Abhik to be used by the cofactors pipeline.
"""
import rdkit
from rdkit import Chem
from rdkit.Chem import rdFMCS
from typing import NamedTuple
ParityResult = NamedTuple('ParityResult',
[('template_atoms', int),
('query_atoms', int),
('match_count', int),
('similarity_score', float)])
ParityResult.__doc__ = """
NamedTuple for the result of parity method along with the details
necessary for calculating the similarity score.
Args:
template_atoms (int): Number of template atoms.
query_atoms (int): Number of query molecule atoms.
match_count (int): Size o the common subgaph match.
similarity_score (float): Calculate similarity score.
"""
def _get_matches(mol, smarts):
"""Gets the subgraph match for the parity method.
Args:
mol (rdkit.chem.rdchem.Mol): Molecule to be queried
mol (rdkit.Chem.Mol): Molecule to be queried
smarts (str): Molecule representation in SMARTS.
Returns:
int: Molecular subgraph matches.
"""
patt = rdkit.Chem.MolFromSmarts(smarts)
patt = Chem.MolFromSmarts(smarts)
matches = mol.GetSubstructMatches(patt, uniquify=False)
return matches
......@@ -26,8 +44,8 @@ def _generate_sim_score(template, query, smarts):
smarts string returns similarity score.
Args:
template (rdkit.chem.rdchem.Mol): template molecule.
query (rdkit.Chem.rdchem.Mol): query molecule.
template (rdkit.Chem.Mol): template molecule.
query (rdkit.Chem.Mol): query molecule.
smarts (str): common subgraph in the SMARTS format.
Returns:
......@@ -36,9 +54,9 @@ def _generate_sim_score(template, query, smarts):
"""
if template.GetNumAtoms() == 1:
smarts = rdkit.Chem.MolToSmarts(template)
smarts = Chem.MolToSmarts(template)
elif query.GetNumAtoms() == 1:
smarts = rdkit.Chem.MolToSmarts(query)
smarts = Chem.MolToSmarts(query)
if smarts is None:
best_matches = 0
best_sim_score = 0.0
......@@ -66,43 +84,35 @@ def _generate_sim_score(template, query, smarts):
return (best_matches, best_sim_score)
def compare_molecules(template, query, thresh):
def compare_molecules(template, query, thresh=0.01):
"""Given the two molecules calculates their similarity score.
If expected similarity score is lower than the threshold nothing
is calculated.
Args:
template (rdkit.Chem.rdchem.Mol): Template molecule
query (rdkit.Chem.rdchem.Mol): Query molecule
thresh (float): Threshold score for the match to be considered.
template (rdkit.Chem.Mol): Template molecule
query (rdkit.Chem.Mol): Query molecule
thresh (float, optional): Defaults to 0.01: Threshold score for
the match to be considered.
Returns:
(int, int, int, float):
int - template atom count
int - query atom count
int - size of common subgraph
float - similarity score
ParityResult: Result of the PARITY comparison.
"""
sim_arr = ['', '', 0, 0]
sim_arr[0] = template.GetNumAtoms()
sim_arr[1] = query.GetNumAtoms()
templ_atoms = template.GetNumAtoms()
query_atoms = query.GetNumAtoms()
min_num_atoms = min(sim_arr[0], sim_arr[1])
max_sim_score = float(min_num_atoms) / float(sim_arr[0] + sim_arr[1] - min_num_atoms)
min_num_atoms = min(templ_atoms, query_atoms)
max_sim_score = float(min_num_atoms) / float(templ_atoms + query_atoms - min_num_atoms)
if max_sim_score < thresh:
return sim_arr
return ParityResult(templ_atoms, query_atoms, 0, 0)
mcs_graph = rdkit.Chem.rdFMCS.FindMCS([template, query],
bondCompare=rdkit.Chem.rdFMCS.BondCompare.CompareAny,
atomCompare=rdkit.Chem.rdFMCS.AtomCompare.CompareAny,
timeout=40,
completeRingsOnly=True)
mcs_graph = rdFMCS.FindMCS([template, query],
bondCompare=rdFMCS.BondCompare.CompareAny,
atomCompare=rdFMCS.AtomCompare.CompareAny,
timeout=40,
completeRingsOnly=True)
matches, sim_score = _generate_sim_score(template, query, mcs_graph.smartsString)
sim_arr[2] = sim_score
sim_arr[3] = matches
return sim_arr
return ParityResult(templ_atoms, query_atoms, matches, sim_score)
......@@ -42,7 +42,7 @@ CCDReaderResult = NamedTuple('CCDReaderResult',
('component', Component)])
CCDReaderResult.__doc__ = """
Namedtuple for the result of reading an individual PDB chemical
NamedTuple for the result of reading an individual PDB chemical
component definition (CCD).
Args:
......
......@@ -22,12 +22,15 @@ from typing import Any, Dict, List, Optional, Tuple
import rdkit
import rdkit.Chem.Draw as Draw
from rdkit.Chem import BRICS
from rdkit.Chem.Scaffolds import MurckoScaffold
import pdbeccdutils.helpers.drawing as drawing
from pdbeccdutils.core.depictions import DepictionManager
from pdbeccdutils.core.depictions import DepictionManager, DepictionResult
from pdbeccdutils.core.exceptions import CCDUtilsError
from pdbeccdutils.core.fragment_library import FragmentLibrary
from pdbeccdutils.core.models import (ConformerType, Descriptor, Properties, ReleaseStatus)
from pdbeccdutils.core.models import (ConformerType, Descriptor, Properties,
ReleaseStatus, ScaffoldingMethod)
from pdbeccdutils.helpers.io_grabber import IOGrabber
METALS_SMART = '[Li,Na,K,Rb,Cs,F,Be,Mg,Ca,Sr,Ba,Ra,Sc,Ti,V,Cr,Mn,Fe,Co,Ni,Cu,Zn,Al,Ga,Y,Zr,Nb,Mo,'\
......@@ -262,7 +265,7 @@ class Component:
return False
return True
def compute_2d(self, manager: DepictionManager, remove_hs: bool=True) -> rdkit.Chem.Mol:
def compute_2d(self, manager: DepictionManager, remove_hs: bool=True) -> DepictionResult:
"""Compute 2d depiction of the component using DepictionManager
instance.
......@@ -273,20 +276,17 @@ class Component:
hydrogens prior to depiction.
Returns:
rdkit.Chem.Mol: 2D depiction of the ligand.
DepictionResult: Object with the details about depiction process.
"""
mol_copy = rdkit.Chem.RWMol(self.mol)
if remove_hs:
mol_copy = rdkit.Chem.RemoveHs(mol_copy, updateExplicitCount=True, sanitize=False)
rdkit.Chem.SanitizeMol(mol_copy, catchErrors=True)
result_log = manager.depict_molecule(self._id, mol_copy)
self._2dmol = result_log.mol
if result_log is not None:
# add conformer
self._2dmol = result_log.mol
return result_log
else:
return None
return result_log
def export_2d_svg(self, file_name: str, width: int=500, names: bool=False,
atom_highlight: Dict[Any, Tuple]={}, bond_highlight: Dict[Tuple, Tuple]={}):
......@@ -465,6 +465,28 @@ class Component:
return matches_found
def get_scaffolds(self, scaffolding_method=ScaffoldingMethod.MurckoScaffold):
"""[summary]
scaffolding_method (ScaffoldingMethod, optional):
Defaults to MurckoScaffold. Scaffolding method to use
Returns:
list of rdkit.Mol: Scaffolds found in the component.
"""
scaffold = None
if scaffolding_method == ScaffoldingMethod.MurckoScaffold:
scaffold = [(MurckoScaffold.GetScaffoldForMol(self.mol))]
elif scaffolding_method == ScaffoldingMethod.MurckoGeneric:
scaffold = [(MurckoScaffold.MakeScaffoldGeneric(self.mol))]
elif scaffolding_method == ScaffoldingMethod.Brics:
scaffold = BRICS.BRICSDecompose(self.mol)
scaffold = list(map(lambda l: rdkit.Chem.MolFromSmiles(l), scaffold))
return scaffold
def _fix_molecule(self, rwmol):
"""
Single molecule sanitization process. Presently, only valence
......@@ -550,7 +572,7 @@ class Component:
try:
copy = rdkit.Chem.Draw.rdMolDraw2D.PrepareMolForDrawing(self._2dmol, wedgeBonds=True,
kekulize=True, addChiralHs=True)
except RuntimeError:
except (RuntimeError, ValueError):
copy = rdkit.Chem.Draw.rdMolDraw2D.PrepareMolForDrawing(self._2dmol, wedgeBonds=False,
kekulize=True, addChiralHs=True)
......
......@@ -17,12 +17,14 @@
import os
import numpy
import rdkit
from rdkit import Chem
from rdkit import Geometry
from rdkit.Chem import rdCoordGen
from rdkit.Chem import AllChem
from scipy.spatial import KDTree
import pdbeccdutils.utils.config as config
from pdbeccdutils.core.models import DepictionSource, DepictionResult, Mapping
from pdbeccdutils.core.models import DepictionSource, DepictionResult
class DepictionManager:
......@@ -48,8 +50,11 @@ class DepictionManager:
for depicting ligand e.g. porphyring rings.
"""
self.coordgen_params = rdCoordGen.CoordGenParams()
self.coordgen_params.coordgenScaling = 50 / 1.5
self.coordgen_params.templateFileDir = config.coordgen_templates
self.pubchem_templates = []
self.pubchem_templates = pubchem_templates_path if os.path.isdir(pubchem_templates_path) else ''
self.substructures = []
if os.path.isdir(general_templates_path):
......@@ -57,98 +62,7 @@ class DepictionManager:
self._load_template(os.path.join(general_templates_path, k))
for k in os.listdir(general_templates_path)}
if os.path.isdir(pubchem_templates_path):
self.pubchem_templates = {k.split('.')[0]:
os.path.join(pubchem_templates_path, k)
for k in os.listdir(pubchem_templates_path)}
def _load_template(self, path):
"""
Loads a template molecule with 2D coordinates
Args:
path (str): path to the model molecule in *.sdf,
or *.pdb format
Raises:
ValueError: if unsuported format is used: sdf|pdb
Returns:
rdkit.Chem.rdchem.Mol: RDKit representation of the template
"""
mol = rdkit.Chem.RWMol()
extension = os.path.basename(path).split('.')[1]
if extension == 'sdf':
mol = rdkit.Chem.MolFromMolFile(path, removeHs=True)
elif extension == 'pdb':
mol = rdkit.Chem.MolFromPDBFile(path, removeHs=True)
else:
raise ValueError('Unsupported molecule type \'{}\''.format(extension))
[x.SetAtomicNum(6) for x in mol.GetAtoms()]
[x.SetBondType(rdkit.Chem.BondType.UNSPECIFIED) for x in mol.GetBonds()]
return mol
def _anonymization(self, mol):
"""
Converts all molecule atoms into carbons and all bonds into
single ones. This is used for the substructure matching step.
Original mapping of both is returned.
Args:
mol (rdkit.Chem.rdchem.Mol): molecule to be anonymized
Returns:
Mappings: original mapping between atoms and bonds
of the molecule
"""
molAtomMapping = {atom.GetIdx(): atom.GetAtomicNum() for atom in mol.GetAtoms()}
molBondMapping = {bond.GetIdx(): bond.GetBondType() for bond in mol.GetBonds()}
[x.SetAtomicNum(6) for x in mol.GetAtoms()]
[x.SetBondType(rdkit.Chem.BondType.SINGLE) for x in mol.GetBonds()]
return Mapping(atom_mapping=molAtomMapping, bond_mapping=molBondMapping)
def _deanonymization(self, mol, mappings):
"""
Based on the original mapping restores atom elements and
bond types.
Args:
mol (rdkit.Chem.rdchem.Mol): molecule to be restored
mappings (Mapping): Original bond and atom info
"""
[x.SetAtomicNum(mappings.atom_mapping[x.GetIdx()]) for x in mol.GetAtoms()]
[x.SetBondType(mappings.bond_mapping[x.GetIdx()]) for x in mol.GetBonds()]
def _rescale_molecule(self, mol, factor):
"""
Rescale molecule coords to a given factor
Args:
mol (rdkit.Chem.rdchem.Mol) molecule to be rescaled
factor (float): rescaling factor
"""
matrix = numpy.zeros((4, 4), numpy.float)
for i in range(3):
matrix[i, i] = factor
matrix[3, 3] = 1
rdkit
import rdkit
# those two lines are here because of very weird underetministic
# import issue on Linux.
# essentially if they are removed, protein-interactions pipeline
# fails with segmentation fault.
# Will be removed in future when stable data structure is present
rdkit.Chem.AllChem.TransformMol(mol, matrix)
def depict_molecule(self, id, mol):
def depict_molecule(self, het_id, mol):
"""
Given input molecule tries to generate its depictions.
......@@ -159,20 +73,19 @@ class DepictionManager:
Arguments:
id (str): id of the ligand
mol (Chem.rdchem.Mol): molecule to be depicted
mol (rdkit.Chem.Mol): molecule to be depicted
Returns:
pdbeccdutils.utils.DepictionResult: Summary of the ligand
depiction process.
DepictionResult: Summary of the ligand depiction process.
"""
temp_mol = rdkit.Chem.RWMol(mol)
mappings = self._anonymization(temp_mol)
templateMol = rdkit.Chem.RWMol(temp_mol).GetMol()
pubchemMol = rdkit.Chem.RWMol(temp_mol).GetMol()
rdkitMol = rdkit.Chem.RWMol(temp_mol).GetMol()
temp_mol = Chem.RWMol(mol)
templateMol = Chem.RWMol(temp_mol).GetMol()
pubchemMol = Chem.RWMol(temp_mol).GetMol()
rdkitMol = Chem.RWMol(temp_mol).GetMol()
results = []
pubchem_res = self._get_2D_by_pubchem(id, pubchemMol) if self.pubchem_templates else None
pubchem_res = self._get_2D_by_pubchem(het_id, pubchemMol) if len(self.pubchem_templates) > 0 else None
template_res = self._get_2D_by_template(templateMol) if self.substructures else []
rdkit_res = self._get_2D_by_rdkit(rdkitMol)
......@@ -186,27 +99,71 @@ class DepictionManager:
results.sort(key=lambda l: (l.score, l.source))
if len(results) > 0:
self._deanonymization(results[0].mol, mappings)
return results[0]
else:
return DepictionResult(source=DepictionSource.Failed, template_name='', mol=None, score=1000)
def _get_pubchem_template_path(self, het_id):
"""Get path to the PubChem template if it exists.
Args:
het_id (str): Ligand in.
Returns:
str: Path to the PubChem layout.
"""
path = os.path.join(self.pubchem_templates, f'{het_id}.sdf')
return path if os.path.isfile(path) else ''
def _load_template(self, path):
"""
Loads a template molecule with 2D coordinates
Args:
path (str): path to the model molecule in *.sdf,
or *.pdb format
Raises:
ValueError: if unsuported format is used: sdf|pdb
Returns:
rdkit.Chem.Mol: RDKit representation of the template
"""
mol = Chem.RWMol()
extension = os.path.basename(path).split('.')[1]
if extension == 'sdf':
mol = Chem.MolFromMolFile(path, sanitize=True, removeHs=True)
elif extension == 'pdb':
mol = Chem.MolFromPDBFile(path, sanitize=True, removeHs=True)
else:
raise ValueError('Unsupported molecule type \'{}\''.format(extension))
p = Chem.AdjustQueryParameters()
p.makeAtomsGeneric = True
p.makeBondsGeneric = True
mol = Chem.AdjustQueryProperties(mol, p)
return mol
def _get_2D_by_rdkit(self, mol):
"""
Get depiction done using solely the default RDKit functionality.
Args:
mol (rdkit.Chem.rdchem.Mol): Mol to be depicted
mol (rdkit.Chem.Mol): Mol to be depicted
Returns:
DepictionResult: Depiction with some usefull metadata
"""
try:
rdkit.Chem.AllChem.GenerateDepictionMatching3DStructure(mol, mol)
rdCoordGen.AddCoords(mol, self.coordgen_params)
flaws = DepictionValidator(mol).depiction_score()
return DepictionResult(source=DepictionSource.RdKit, template_name=None, mol=mol, score=flaws)
return DepictionResult(source=DepictionSource.RDKit, template_name=None, mol=mol, score=flaws)
except Exception:
return None
return DepictionResult(source=DepictionSource.Failed, template_name=None, mol=None, score=1000)
def _get_2D_by_pubchem(self, id, mol):
"""
......@@ -220,36 +177,36 @@ class DepictionManager:
DepictionResult: Depiction with some usefull metadata
"""
try:
if id in self.pubchem_templates:
template = self._load_template(self.pubchem_templates[id])
self._rescale_molecule(template, 1.5)
template_path = self._get_pubchem_template_path(id)
if len(template_path) > 0:
template = self._load_template(template_path)
if mol.HasSubstructMatch(template):
rdkit.Chem.AllChem.GenerateDepictionMatching2DStructure(mol, template)
AllChem.GenerateDepictionMatching2DStructure(mol, template)
flaws = DepictionValidator(mol).depiction_score()
return DepictionResult(source=DepictionSource.Pubchem, template_name=id, mol=mol, score=flaws)
except Exception:
pass
return DepictionResult(source=DepictionSource.PubChem, template_name=id, mol=mol, score=flaws)
except Exception as e:
print(str(e))
return None
return DepictionResult(source=DepictionSource.Failed, template_name=None, mol=None, score=1000)
def _get_2D_by_template(self, mol):
"""
Depict ligand using user-provided templates
Args:
mol (rdkit.Chem.rdchem.Mol): Mol to be depicted
mol (rdkit.Chem.Mol): Mol to be depicted
Returns:
list of pdbecccdutils.DepictionResult: Depictions with their
list of DepictionResult: Depictions with their
quality and metadata.
"""
results = list()