Source code for cmiles.generator

"""
Generate canonical identifiers for chemical databases, specifically quantum
chemical data.
"""
from copy import deepcopy
import cmiles
import warnings
from .utils import has_openeye, has_rdkit

if has_openeye:
    import openeye as oe
if has_rdkit:
    import rdkit as rd


[docs]def get_molecule_ids(molecule_input, toolkit='openeye', strict=True, **kwargs): """ Generate a dictionary of canonical identifiers The `molecule_input` can be either a JSON serialised molecule (see `QCSchema <https://molssi-qc-schema.readthedocs.io/en/latest/index.html#>`_) or an isomeric SMILES with all steroechemistry defined. Required fields for the QCSchema molecule: 1. symbols 2. geometry 3. connectivity Parameters ---------- molecule_input: dict or str A JSON serialized QC molecule or an isomeric SMILES toolkit: str, optional, default 'openeye' toolkit to use for canonicalization. Currently supports `openeye` and `rdkit` strict: bool, optional. Default True If true, will raise an exception if SMILES is missing explicit H. **permute_xyz: bool, optional, default False **Only use if input molecule is in QCSchema format**. If True, the geometry will be permuted to reflect the canononical atom order in the mapped SMILES. ``get_molecule_ids`` will return the permuted QCSchema. ``cmiles`` identifiers will be in the `identifiers` field If False, the map indices in the mapped SMILES will reflect the order of the atoms in the input QCSchema. Returns ------- dict If ``permute_xyz=True``, will return permuted qcschema with cmiles identifiers in `identifiers` field. """ # check input and convert to oe or rdkit mol molecule = cmiles.utils.load_molecule(molecule_input, toolkit=toolkit, **kwargs) # check for map. If map exists, remove. We only want maps generated with cmiles if cmiles.utils.has_atom_map(molecule): cmiles.utils.remove_atom_map(molecule) # check for explicit hydrogen if strict and not cmiles.utils.has_explicit_hydrogen(molecule): raise ValueError("Input molecule is missing explicit hydrogen") # Check for fully defined stereochemsitry if not cmiles.utils.has_stereo_defined(molecule): raise ValueError("Input SMILES must have full stereochemistry defined") molecule_ids = {} backend_toolkit = cmiles.utils._set_toolkit(molecule) molecule_ids['canonical_smiles'] = backend_toolkit.mol_to_smiles(molecule, isomeric=False, explicit_hydrogen=False, mapped=False) molecule_ids['canonical_isomeric_smiles'] = backend_toolkit.mol_to_smiles(molecule, isomeric=True, explicit_hydrogen=False, mapped=False) molecule_ids['canonical_explicit_hydrogen_smiles'] = backend_toolkit.mol_to_smiles(molecule, isomeric=False, explicit_hydrogen=True, mapped=False) molecule_ids['canonical_isomeric_explicit_hydrogen_smiles'] = backend_toolkit.mol_to_smiles(molecule, isomeric=True, explicit_hydrogen=True, mapped=False) molecule_ids['canonical_isomeric_explicit_hydrogen_mapped_smiles'] = backend_toolkit.mol_to_smiles(molecule, isomeric=True, explicit_hydrogen=True, mapped=True) molecule_ids['molecular_formula'] = cmiles.utils.mol_to_hill_molecular_formula(molecule) inchi = get_inchi_and_key(molecule) if inchi: molecule_ids['standard_inchi'] = inchi[0] molecule_ids['inchi_key'] = inchi[-1] if cmiles.utils.has_rdkit: molecule_ids['unique_tautomer_representation'] = standardize_tautomer(molecule_ids['canonical_isomeric_smiles']) if cmiles.utils.has_openeye: molecule_ids['unique_protomer_representation'] = get_unique_protomer(molecule) molecule_ids['provenance'] = 'cmiles_' + cmiles.__version__ + '_{}_'.format(backend_toolkit) + \ backend_toolkit.toolkit.__version__ try: if kwargs['permute_xyz']: permuted_json_mol = cmiles.utils.permute_qcschema(molecule_input, molecule_ids, toolkit=toolkit) return permuted_json_mol else: return molecule_ids except KeyError: return molecule_ids
[docs]def get_inchi_and_key(molecule): """ Generate inchi and inchikey. Uses RDKit which uses the inchi API Parameters ---------- molecule: rdkit.Chem.Mol If an `oechem.OEMol` is provided, will convert it to an `rdkit.Chem.Mol` Returns ------- tuple (inchi, inchi_key) """ # Todo can use the InChI code directly here # Make sure molecule is rdkit mol if not has_rdkit: warnings.warn("rdkit is not installed. cmiles ids will not include inchi and inchikey") return if not isinstance(molecule, rd.Chem.Mol): molecule = rd.Chem.MolFromSmiles(oe.oechem.OEMolToSmiles(molecule)) inchi = rd.Chem.MolToInchi(molecule) inchi_key = rd.Chem.MolToInchiKey(molecule) return inchi, inchi_key
[docs]def get_iupac(molecule): """ Generate IUPAC name Parameters ---------- molecule : `oechem.OEMol` Returns ------- str: iupac name Notes ----- Will only be generated if has openeye license """ if not has_openeye: raise ImportError("OpenEye is not installed. You can use the canonicalization='rdkit' to use the RDKit backend" "The Conda recipe for cmiles installs rdkit") from openeye import oeiupac if not oeiupac.OEIUPACIsLicensed(): raise ImportError("Must have OEIUPAC license!") return oeiupac.OECreateIUPACName(molecule)
[docs]def get_unique_protomer(molecule): """ Generate unique protomer for all tuatomers and charge states of the moelcule. **Requires openeye license** Parameters ---------- molecule: oechem.OEMol Will convert `rdkit.Chem.Mol` to `oechem.OEMol` if openeye is installed and license is valid Returns ------- str unique protomer """ molecule = deepcopy(molecule) # This only works for OpenEye # Todo There might be a way to use different layers of InChI for this purpose. # Not all tautomers are recognized as the same by InChI so it won't capture all tautomers. # But if we use only the formula and connectivity level we might be able to capture a larger subset. # if has_openeye: from openeye import oequacpac, oechem else: raise RuntimeError("Must have OpenEye for unique protomer feature") if not oechem.OEChemIsLicensed(): raise ImportError("Must have OEChem license!") if not oequacpac.OEQuacPacIsLicensed(): raise ImportError("Must have OEQuacPac license!") if has_rdkit: if isinstance(molecule, rd.Chem.rdchem.Mol): # convert to openeye molecule # Maybe we shouldn't do this. smiles = rd.Chem.MolToSmiles(molecule) molecule = oechem.OEMol() oechem.OESmilesToMol(molecule, smiles) molecule_copy = deepcopy(molecule) oequacpac.OEGetUniqueProtomer(molecule_copy, molecule) return oechem.OEMolToSmiles(molecule_copy)
[docs]def standardize_tautomer(iso_can_smi): """ Standardize tautomer to one universal tautomer. Parameters ---------- iso_can_smi: str isomeric SMILES Returns ------- str: standardized tautomer Notes ----- Does not standardize for ionization states. In some cases preforms better than `oequacpac.OEGetUniqueProtomer`. See `notebook <https://github.com/openforcefield/cmiles/blob/master/notebooks/Tautomers.ipynb>`_ """ if has_rdkit: from rdkit.Chem import MolStandardize else: raise ImportError("Must have rdkit installed to use this function") std_tautomer = MolStandardize.canonicalize_tautomer_smiles(iso_can_smi) return std_tautomer