"""
Generate canonical identifiers for chemical databases, specifically quantum
chemical data.
"""
from copy import deepcopy
import cmiles
import warnings
from .utils import has_openeye, has_rdkit
if has_openeye:
import openeye as oe
if has_rdkit:
import rdkit as rd
[docs]def get_molecule_ids(molecule_input, toolkit='openeye', strict=True, **kwargs):
"""
Generate a dictionary of canonical identifiers
The `molecule_input` can be either a JSON serialised molecule (see `QCSchema <https://molssi-qc-schema.readthedocs.io/en/latest/index.html#>`_)
or an isomeric SMILES with all steroechemistry defined.
Required fields for the QCSchema molecule:
1. symbols
2. geometry
3. connectivity
Parameters
----------
molecule_input: dict or str
A JSON serialized QC molecule or an isomeric SMILES
toolkit: str, optional, default 'openeye'
toolkit to use for canonicalization. Currently supports `openeye` and `rdkit`
strict: bool, optional. Default True
If true, will raise an exception if SMILES is missing explicit H.
**permute_xyz: bool, optional, default False
**Only use if input molecule is in QCSchema format**.
If True, the geometry will be permuted to reflect the canononical
atom order in the mapped SMILES. ``get_molecule_ids`` will return the permuted QCSchema. ``cmiles`` identifiers will be in
the `identifiers` field
If False, the map indices in the mapped SMILES will reflect the order of the atoms in the input QCSchema.
Returns
-------
dict
If ``permute_xyz=True``, will return permuted qcschema with cmiles identifiers in `identifiers` field.
"""
# check input and convert to oe or rdkit mol
molecule = cmiles.utils.load_molecule(molecule_input, toolkit=toolkit, **kwargs)
# check for map. If map exists, remove. We only want maps generated with cmiles
if cmiles.utils.has_atom_map(molecule):
cmiles.utils.remove_atom_map(molecule)
# check for explicit hydrogen
if strict and not cmiles.utils.has_explicit_hydrogen(molecule):
raise ValueError("Input molecule is missing explicit hydrogen")
# Check for fully defined stereochemsitry
if not cmiles.utils.has_stereo_defined(molecule):
raise ValueError("Input SMILES must have full stereochemistry defined")
molecule_ids = {}
backend_toolkit = cmiles.utils._set_toolkit(molecule)
molecule_ids['canonical_smiles'] = backend_toolkit.mol_to_smiles(molecule,
isomeric=False,
explicit_hydrogen=False,
mapped=False)
molecule_ids['canonical_isomeric_smiles'] = backend_toolkit.mol_to_smiles(molecule,
isomeric=True,
explicit_hydrogen=False,
mapped=False)
molecule_ids['canonical_explicit_hydrogen_smiles'] = backend_toolkit.mol_to_smiles(molecule,
isomeric=False,
explicit_hydrogen=True,
mapped=False)
molecule_ids['canonical_isomeric_explicit_hydrogen_smiles'] = backend_toolkit.mol_to_smiles(molecule,
isomeric=True,
explicit_hydrogen=True,
mapped=False)
molecule_ids['canonical_isomeric_explicit_hydrogen_mapped_smiles'] = backend_toolkit.mol_to_smiles(molecule, isomeric=True,
explicit_hydrogen=True,
mapped=True)
molecule_ids['molecular_formula'] = cmiles.utils.mol_to_hill_molecular_formula(molecule)
inchi = get_inchi_and_key(molecule)
if inchi:
molecule_ids['standard_inchi'] = inchi[0]
molecule_ids['inchi_key'] = inchi[-1]
if cmiles.utils.has_rdkit:
molecule_ids['unique_tautomer_representation'] = standardize_tautomer(molecule_ids['canonical_isomeric_smiles'])
if cmiles.utils.has_openeye:
molecule_ids['unique_protomer_representation'] = get_unique_protomer(molecule)
molecule_ids['provenance'] = 'cmiles_' + cmiles.__version__ + '_{}_'.format(backend_toolkit) + \
backend_toolkit.toolkit.__version__
try:
if kwargs['permute_xyz']:
permuted_json_mol = cmiles.utils.permute_qcschema(molecule_input, molecule_ids, toolkit=toolkit)
return permuted_json_mol
else:
return molecule_ids
except KeyError:
return molecule_ids
[docs]def get_inchi_and_key(molecule):
"""
Generate inchi and inchikey. Uses RDKit which uses the inchi API
Parameters
----------
molecule: rdkit.Chem.Mol
If an `oechem.OEMol` is provided, will convert it to an `rdkit.Chem.Mol`
Returns
-------
tuple (inchi, inchi_key)
"""
# Todo can use the InChI code directly here
# Make sure molecule is rdkit mol
if not has_rdkit:
warnings.warn("rdkit is not installed. cmiles ids will not include inchi and inchikey")
return
if not isinstance(molecule, rd.Chem.Mol):
molecule = rd.Chem.MolFromSmiles(oe.oechem.OEMolToSmiles(molecule))
inchi = rd.Chem.MolToInchi(molecule)
inchi_key = rd.Chem.MolToInchiKey(molecule)
return inchi, inchi_key
[docs]def get_iupac(molecule):
"""
Generate IUPAC name
Parameters
----------
molecule :
`oechem.OEMol`
Returns
-------
str:
iupac name
Notes
-----
Will only be generated if has openeye license
"""
if not has_openeye:
raise ImportError("OpenEye is not installed. You can use the canonicalization='rdkit' to use the RDKit backend"
"The Conda recipe for cmiles installs rdkit")
from openeye import oeiupac
if not oeiupac.OEIUPACIsLicensed():
raise ImportError("Must have OEIUPAC license!")
return oeiupac.OECreateIUPACName(molecule)
[docs]def get_unique_protomer(molecule):
"""
Generate unique protomer for all tuatomers and charge states of the moelcule.
**Requires openeye license**
Parameters
----------
molecule: oechem.OEMol
Will convert `rdkit.Chem.Mol` to `oechem.OEMol` if openeye is installed and license is valid
Returns
-------
str
unique protomer
"""
molecule = deepcopy(molecule)
# This only works for OpenEye
# Todo There might be a way to use different layers of InChI for this purpose.
# Not all tautomers are recognized as the same by InChI so it won't capture all tautomers.
# But if we use only the formula and connectivity level we might be able to capture a larger subset.
#
if has_openeye:
from openeye import oequacpac, oechem
else:
raise RuntimeError("Must have OpenEye for unique protomer feature")
if not oechem.OEChemIsLicensed():
raise ImportError("Must have OEChem license!")
if not oequacpac.OEQuacPacIsLicensed():
raise ImportError("Must have OEQuacPac license!")
if has_rdkit:
if isinstance(molecule, rd.Chem.rdchem.Mol):
# convert to openeye molecule
# Maybe we shouldn't do this.
smiles = rd.Chem.MolToSmiles(molecule)
molecule = oechem.OEMol()
oechem.OESmilesToMol(molecule, smiles)
molecule_copy = deepcopy(molecule)
oequacpac.OEGetUniqueProtomer(molecule_copy, molecule)
return oechem.OEMolToSmiles(molecule_copy)
[docs]def standardize_tautomer(iso_can_smi):
"""
Standardize tautomer to one universal tautomer.
Parameters
----------
iso_can_smi: str
isomeric SMILES
Returns
-------
str:
standardized tautomer
Notes
-----
Does not standardize for ionization states.
In some cases preforms better than `oequacpac.OEGetUniqueProtomer`.
See `notebook <https://github.com/openforcefield/cmiles/blob/master/notebooks/Tautomers.ipynb>`_
"""
if has_rdkit:
from rdkit.Chem import MolStandardize
else:
raise ImportError("Must have rdkit installed to use this function")
std_tautomer = MolStandardize.canonicalize_tautomer_smiles(iso_can_smi)
return std_tautomer