RDKit
Open-source cheminformatics and machine learning.
SmilesWrite.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2002-2021 Greg Landrum and other RDKit contributors
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef RD_SMILESWRITE_H_012020
12 #define RD_SMILESWRITE_H_012020
13 
14 #include <string>
15 #include <vector>
16 #include <memory>
17 #include <cstdint>
18 #include <limits>
19 
20 namespace RDKit {
21 class Atom;
22 class Bond;
23 class ROMol;
24 
25 struct RDKIT_SMILESPARSE_EXPORT SmilesWriteParams {  // options consumed by the *Smiles writer overloads that take a SmilesWriteParams
26  bool doIsomericSmiles =
27  true; /**< include stereochemistry and isotope information */
28  bool doKekule = false; /**< kekulize the molecule before generating the SMILES
29  and output single/double bonds. NOTE that the output
30  is not canonical and that this will throw an
31  exception if the molecule cannot be kekulized. */
32  bool canonical = true; /**< generate canonical SMILES */
33  bool allBondsExplicit = false; /**< include symbols for all bonds */
34  bool allHsExplicit = false; /**< provide hydrogen counts for every atom */
35  bool doRandom = false; /**< randomize the output order. The resulting SMILES
36  is not canonical */
37  int rootedAtAtom = -1; /**< make sure the SMILES starts at the specified
38  atom. The resulting SMILES is not canonical */
39 };
40 namespace SmilesWrite {
41 
42 enum CXSmilesFields : uint32_t {  // single-bit flags; passed as the std::uint32_t `flags` argument of getCXExtensions() / MolToCXSmiles() (presumably combined with bitwise OR)
43  CX_NONE = 0,  // no bits set
44  CX_ATOM_LABELS = 1 << 0,
46  CX_COORDS = 1 << 2,  // NOTE(review): bit 1 has no enumerator in this listing and listing line 45 is absent -- an entry may have been lost; confirm against upstream
47  CX_RADICALS = 1 << 3,
48  CX_ATOM_PROPS = 1 << 4,
49  CX_LINKNODES = 1 << 5,
51  CX_SGROUPS = 1 << 7,  // NOTE(review): bit 6 has no enumerator in this listing and listing line 50 is absent -- confirm against upstream
52  CX_POLYMER = 1 << 8,
53  CX_BOND_CFG = 1 << 9,
54  CX_ALL = 0x7fffffff,  // low 31 bits set; the default `flags` value of getCXExtensions() and MolToCXSmiles()
56 };  // NOTE(review): listing line 55 is absent -- a trailing enumerator may be missing; confirm against upstream
57 
58 //! \brief returns the cxsmiles data for a molecule
59 RDKIT_SMILESPARSE_EXPORT std::string getCXExtensions(
60  const ROMol &mol, std::uint32_t flags = CXSmilesFields::CX_ALL);
61 
62 //! \brief returns true if the atom number is in the SMILES organic subset
63 RDKIT_SMILESPARSE_EXPORT bool inOrganicSubset(int atomicNumber);
64 
65 //! \brief returns the SMILES for an atom
66 /*!
67  \param atom : the atom to work with
68  \param doKekule : we're doing kekulized smiles (e.g. don't use
69  lower case for the atom label)
70  \param bondIn : the bond we came into the atom on (unused)
71  \param allHsExplicit : if true, hydrogen counts will be provided for every
72  atom.
73  \param isomericSmiles : if true, isomeric SMILES will be generated
74 */
75 RDKIT_SMILESPARSE_EXPORT std::string GetAtomSmiles(const Atom *atom,
76  bool doKekule = false,
77  const Bond *bondIn = nullptr,
78  bool allHsExplicit = false,
79  bool isomericSmiles = true);
80 
81 //! \brief returns the SMILES for a bond
82 /*!
83  \param bond : the bond to work with
84  \param atomToLeftIdx : the index of the atom preceding \c bond
85  in the SMILES
86  \param doKekule : we're doing kekulized smiles (e.g. write out
87  bond orders for aromatic bonds)
88  \param allBondsExplicit : if true, symbols will be included for all bonds.
89 */
90 RDKIT_SMILESPARSE_EXPORT std::string GetBondSmiles(
91  const Bond *bond, int atomToLeftIdx = -1, bool doKekule = false,
92  bool allBondsExplicit = false);
93 } // namespace SmilesWrite
94 
95 //! \brief returns canonical SMILES for a molecule
96 RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(
97  const ROMol &mol, const SmilesWriteParams &params);
98 
99 //! \brief returns canonical SMILES for a molecule
100 /*!
101  \param mol : the molecule in question.
102  \param doIsomericSmiles : include stereochemistry and isotope information
103  in the SMILES
104 
105  \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds) NOTE that
106  this will throw an exception if the molecule cannot be kekulized.
107 
108  \param rootedAtAtom : make sure the SMILES starts at the specified atom.
109  The resulting SMILES is not, of course, canonical.
110  \param canonical : if false, no attempt will be made to canonicalize the
111  SMILES
112  \param allBondsExplicit : if true, symbols will be included for all bonds.
113  \param allHsExplicit : if true, hydrogen counts will be provided for every
114  atom.
115  */
116 inline std::string MolToSmiles(const ROMol &mol, bool doIsomericSmiles = true,
117  bool doKekule = false, int rootedAtAtom = -1,
118  bool canonical = true,
119  bool allBondsExplicit = false,
120  bool allHsExplicit = false,
121  bool doRandom = false) {  // doRandom: randomize the output order (the result is then not canonical)
122  SmilesWriteParams ps;  // repack the flat arguments for the SmilesWriteParams overload
123  ps.doIsomericSmiles = doIsomericSmiles;
124  ps.doKekule = doKekule;
125  ps.rootedAtAtom = rootedAtAtom;
126  ps.canonical = canonical;
127  ps.allBondsExplicit = allBondsExplicit;
128  ps.allHsExplicit = allHsExplicit;
129  ps.doRandom = doRandom;
130  return MolToSmiles(mol, ps);
131 }
132 
133 //! \brief returns a vector of random SMILES for a molecule (may contain
134 //! duplicates)
135 /*!
136  \param mol : the molecule in question.
137  \param numSmiles : the number of SMILES to return
138  \param randomSeed : if >0, will be used to seed the random number generator
139  \param doIsomericSmiles : include stereochemistry and isotope information
140  in the SMILES
141  \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
142  \param allBondsExplicit : if true, symbols will be included for all bonds.
143  \param allHsExplicit : if true, hydrogen counts will be provided for every
144  atom.
145  */
146 RDKIT_SMILESPARSE_EXPORT std::vector<std::string> MolToRandomSmilesVect(
147  const ROMol &mol, unsigned int numSmiles, unsigned int randomSeed = 0,
148  bool doIsomericSmiles = true, bool doKekule = false,
149  bool allBondsExplicit = false, bool allHsExplicit = false);
150 
151 //! \brief returns canonical SMILES for part of a molecule
152 RDKIT_SMILESPARSE_EXPORT std::string MolFragmentToSmiles(
153  const ROMol &mol, const SmilesWriteParams &params,
154  const std::vector<int> &atomsToUse,
155  const std::vector<int> *bondsToUse = nullptr,
156  const std::vector<std::string> *atomSymbols = nullptr,
157  const std::vector<std::string> *bondSymbols = nullptr);
158 
159 //! \brief returns canonical SMILES for part of a molecule
160 /*!
161  \param mol : the molecule in question.
162  \param atomsToUse : indices of the atoms in the fragment
163  \param bondsToUse : indices of the bonds in the fragment. If this is not
164  provided,
165  all bonds between the atoms in atomsToUse will be included
166  \param atomSymbols : symbols to use for the atoms in the output SMILES
167  \param bondSymbols : symbols to use for the bonds in the output SMILES
168  \param doIsomericSmiles : include stereochemistry and isotope information
169  in the SMILES
170  \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
171  \param rootedAtAtom : make sure the SMILES starts at the specified atom.
172  The resulting SMILES is not, of course, canonical.
173  \param canonical : if false, no attempt will be made to canonicalize the
174  SMILES
175  \param allBondsExplicit : if true, symbols will be included for all bonds.
176  \param allHsExplicit : if true, hydrogen counts will be provided for every
177  atom.
178  \note this overload has no doRandom argument; to randomize the output
179  order, set doRandom on the SmilesWriteParams overload instead.
180 
181  \b NOTE: the bondSymbols are *not* currently used in the canonicalization.
182 
183  */
184 inline std::string MolFragmentToSmiles(
185  const ROMol &mol, const std::vector<int> &atomsToUse,
186  const std::vector<int> *bondsToUse = nullptr,
187  const std::vector<std::string> *atomSymbols = nullptr,
188  const std::vector<std::string> *bondSymbols = nullptr,
189  bool doIsomericSmiles = true, bool doKekule = false, int rootedAtAtom = -1,
190  bool canonical = true, bool allBondsExplicit = false,
191  bool allHsExplicit = false) {
192  SmilesWriteParams ps;  // repack the flat arguments; ps.doRandom is not set here and keeps its default
193  ps.doIsomericSmiles = doIsomericSmiles;
194  ps.doKekule = doKekule;
195  ps.rootedAtAtom = rootedAtAtom;
196  ps.canonical = canonical;
197  ps.allBondsExplicit = allBondsExplicit;
198  ps.allHsExplicit = allHsExplicit;
199  return MolFragmentToSmiles(mol, ps, atomsToUse, bondsToUse, atomSymbols,
200  bondSymbols);
201 }
202 
203 //! \brief returns canonical CXSMILES for a molecule
204 RDKIT_SMILESPARSE_EXPORT std::string MolToCXSmiles(
205  const ROMol &mol, const SmilesWriteParams &ps,
206  std::uint32_t flags = SmilesWrite::CXSmilesFields::CX_ALL);
207 
208 //! \brief returns canonical CXSMILES for a molecule
209 /*!
210  \param mol : the molecule in question.
211  \param doIsomericSmiles : include stereochemistry and isotope information
212  in the SMILES
213  \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
214  \param rootedAtAtom : make sure the SMILES starts at the specified atom.
215  The resulting SMILES is not, of course, canonical.
216  \param canonical : if false, no attempt will be made to canonicalize the
217  SMILES
218  \param allBondsExplicit : if true, symbols will be included for all bonds.
219  \param allHsExplicit : if true, hydrogen counts will be provided for every
220  atom.
221  */
222 inline std::string MolToCXSmiles(const ROMol &mol, bool doIsomericSmiles = true,
223  bool doKekule = false, int rootedAtAtom = -1,
224  bool canonical = true,
225  bool allBondsExplicit = false,
226  bool allHsExplicit = false,
227  bool doRandom = false) {  // doRandom: randomize the output order (the result is then not canonical)
228  SmilesWriteParams ps;  // repack the flat arguments for the SmilesWriteParams overload
229  ps.doIsomericSmiles = doIsomericSmiles;
230  ps.doKekule = doKekule;
231  ps.rootedAtAtom = rootedAtAtom;
232  ps.canonical = canonical;
233  ps.allBondsExplicit = allBondsExplicit;
234  ps.allHsExplicit = allHsExplicit;
235  ps.doRandom = doRandom;
236  return MolToCXSmiles(mol, ps);  // `flags` is omitted, so that overload's default (CX_ALL) applies
237 }
238 
239 //! \brief returns canonical CXSMILES for part of a molecule
240 RDKIT_SMILESPARSE_EXPORT std::string MolFragmentToCXSmiles(
241  const ROMol &mol, const SmilesWriteParams &params,
242  const std::vector<int> &atomsToUse,
243  const std::vector<int> *bondsToUse = nullptr,
244  const std::vector<std::string> *atomSymbols = nullptr,
245  const std::vector<std::string> *bondSymbols = nullptr);
246 
247 //! \brief returns canonical CXSMILES for part of a molecule
248 /*!
249  \param mol : the molecule in question.
250  \param atomsToUse : indices of the atoms in the fragment
251  \param bondsToUse : indices of the bonds in the fragment. If this is not
252  provided,
253  all bonds between the atoms in atomsToUse will be included
254  \param atomSymbols : symbols to use for the atoms in the output SMILES
255  \param bondSymbols : symbols to use for the bonds in the output SMILES
256  \param doIsomericSmiles : include stereochemistry and isotope information
257  in the SMILES
258  \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
259  \param rootedAtAtom : make sure the SMILES starts at the specified atom.
260  The resulting SMILES is not, of course, canonical.
261  \param canonical : if false, no attempt will be made to canonicalize the
262  SMILES
263  \param allBondsExplicit : if true, symbols will be included for all bonds.
264  \param allHsExplicit : if true, hydrogen counts will be provided for every
265  atom.
266 
267  \b NOTE: the bondSymbols are *not* currently used in the canonicalization.
268 
269  */
270 inline std::string MolFragmentToCXSmiles(
271  const ROMol &mol, const std::vector<int> &atomsToUse,
272  const std::vector<int> *bondsToUse = nullptr,
273  const std::vector<std::string> *atomSymbols = nullptr,
274  const std::vector<std::string> *bondSymbols = nullptr,
275  bool doIsomericSmiles = true, bool doKekule = false, int rootedAtAtom = -1,
276  bool canonical = true, bool allBondsExplicit = false,
277  bool allHsExplicit = false) {
278  SmilesWriteParams ps;  // repack the flat arguments; ps.doRandom is not set here and keeps its default
279  ps.doIsomericSmiles = doIsomericSmiles;
280  ps.doKekule = doKekule;
281  ps.rootedAtAtom = rootedAtAtom;
282  ps.canonical = canonical;
283  ps.allBondsExplicit = allBondsExplicit;
284  ps.allHsExplicit = allHsExplicit;
285  return MolFragmentToCXSmiles(mol, ps, atomsToUse, bondsToUse, atomSymbols,
286  bondSymbols);
287 }
288 
289 } // namespace RDKit
290 #endif
The class for representing atoms.
Definition: Atom.h:68
class for representing a bond
Definition: Bond.h:47
#define RDKIT_SMILESPARSE_EXPORT
Definition: export.h:457
RDKIT_SMILESPARSE_EXPORT bool inOrganicSubset(int atomicNumber)
returns true if the atom number is in the SMILES organic subset
RDKIT_SMILESPARSE_EXPORT std::string GetBondSmiles(const Bond *bond, int atomToLeftIdx=-1, bool doKekule=false, bool allBondsExplicit=false)
returns the SMILES for a bond
RDKIT_SMILESPARSE_EXPORT std::string getCXExtensions(const ROMol &mol, std::uint32_t flags=CXSmilesFields::CX_ALL)
returns the cxsmiles data for a molecule
RDKIT_SMILESPARSE_EXPORT std::string GetAtomSmiles(const Atom *atom, bool doKekule=false, const Bond *bondIn=nullptr, bool allHsExplicit=false, bool isomericSmiles=true)
returns the SMILES for an atom
Std stuff.
Definition: Abbreviations.h:19
RDKIT_SMILESPARSE_EXPORT std::vector< std::string > MolToRandomSmilesVect(const ROMol &mol, unsigned int numSmiles, unsigned int randomSeed=0, bool doIsomericSmiles=true, bool doKekule=false, bool allBondsExplicit=false, bool allHsExplicit=false)
returns a vector of random SMILES for a molecule (may contain duplicates)
RDKIT_SMILESPARSE_EXPORT std::string MolFragmentToSmiles(const ROMol &mol, const SmilesWriteParams &params, const std::vector< int > &atomsToUse, const std::vector< int > *bondsToUse=nullptr, const std::vector< std::string > *atomSymbols=nullptr, const std::vector< std::string > *bondSymbols=nullptr)
returns canonical SMILES for part of a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, const SmilesWriteParams &params)
returns canonical SMILES for a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolToCXSmiles(const ROMol &mol, const SmilesWriteParams &ps, std::uint32_t flags=SmilesWrite::CXSmilesFields::CX_ALL)
returns canonical CXSMILES for a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolFragmentToCXSmiles(const ROMol &mol, const SmilesWriteParams &params, const std::vector< int > &atomsToUse, const std::vector< int > *bondsToUse=nullptr, const std::vector< std::string > *atomSymbols=nullptr, const std::vector< std::string > *bondSymbols=nullptr)
returns canonical CXSMILES for part of a molecule