RDKit
Open-source cheminformatics and machine learning.
FileParserUtils.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2010-2022 Greg Landrum and other RDKit contributors
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef RD_FILEPARSERUTILS_H
12 #define RD_FILEPARSERUTILS_H
13 
14 #include <string>
15 #include <iostream>
17 #include <boost/lexical_cast.hpp>
18 #include <boost/algorithm/string.hpp>
19 #include <boost/format.hpp>
21 #include "FileParserUtils.h"
22 #include <string_view>
23 
24 namespace RDKit {
25 class RWMol;
26 class Conformer;
27 
28 namespace FileParserUtils {
//! Returns \c orig with leading and trailing strip characters removed.
/*!
  \param orig        the text to trim
  \param stripChars  the set of characters removed from both ends
                     (space, tab, carriage return and newline by default)
  \return a view of the trimmed text. It refers to the storage viewed by
          \c orig, so it is only valid while that storage is. If \c orig is
          empty or contains nothing but strip characters, an empty view is
          returned.
*/
RDKIT_FILEPARSERS_EXPORT inline std::string_view strip(
    std::string_view orig, std::string stripChars = " \t\r\n") {
  const auto first = orig.find_first_not_of(stripChars);
  if (first == std::string_view::npos) {
    // nothing survives the trim
    return std::string_view("");
  }
  // a kept character exists, so this search cannot fail
  const auto last = orig.find_last_not_of(stripChars);
  return orig.substr(first, last - first + 1);
}
41 
42 template <typename T>
43 T stripSpacesAndCast(std::string_view input, bool acceptSpaces = false) {
44  auto trimmed = strip(input, " ");
45  if (acceptSpaces && trimmed.empty()) {
46  return 0;
47  } else {
48  return boost::lexical_cast<T>(trimmed);
49  }
50 }
//! \overload
/*!
  Convenience overload for std::string input; forwards to the
  std::string_view version.

  The view is built from the string itself, so its length is
  \c input.size(). Going through \c c_str() would rescan the buffer for a
  terminator and silently drop everything after an embedded NUL.
*/
template <typename T>
T stripSpacesAndCast(const std::string &input, bool acceptSpaces = false) {
  return stripSpacesAndCast<T>(std::string_view(input), acceptSpaces);
}
55 RDKIT_FILEPARSERS_EXPORT int toInt(const std::string &input,
56  bool acceptSpaces = true);
57 RDKIT_FILEPARSERS_EXPORT unsigned int toUnsigned(const std::string &input,
58  bool acceptSpaces = true);
59 RDKIT_FILEPARSERS_EXPORT double toDouble(const std::string &input,
60  bool acceptSpaces = true);
61 RDKIT_FILEPARSERS_EXPORT int toInt(const std::string_view input,
62  bool acceptSpaces = true);
63 RDKIT_FILEPARSERS_EXPORT unsigned int toUnsigned(std::string_view input,
64  bool acceptSpaces = true);
65 RDKIT_FILEPARSERS_EXPORT double toDouble(const std::string_view input,
66  bool acceptSpaces = true);
67 
68 // parses info from a V3000 CTAB into a molecule
70  int confId = -1);
71 // reads a line from an MDL v3K CTAB
72 RDKIT_FILEPARSERS_EXPORT std::string getV3000Line(std::istream *inStream,
73  unsigned int &line);
74 
75 // nAtoms and nBonds are ignored on input, set on output
77  std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf,
78  bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds,
79  bool strictParsing = true, bool expectMEND = true);
80 
81 // nAtoms and nBonds are used
83  std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf,
84  bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds,
85  bool strictParsing = true);
86 
87 //! finishes up the processing (sanitization, etc.) of a molecule read from
88 //! CTAB
90  bool chiralityPossible,
91  bool sanitize, bool removeHs);
92 
93 //! Deprecated, please use QueryOps::replaceAtomWithQueryAtom instead
95 
96 //! applies a particular property to the atoms as an atom property list
97 template <typename T>
98 void applyMolListPropToAtoms(ROMol &mol, const std::string &pn,
99  const std::string &prefix,
100  const std::string &missingValueMarker = "n/a") {
101  std::string atompn = pn.substr(prefix.size());
102  std::string strVect = mol.getProp<std::string>(pn);
103  std::vector<std::string> tokens;
104  boost::split(tokens, strVect, boost::is_any_of(" \t\n"),
105  boost::token_compress_on);
106  if (tokens.size() < mol.getNumAtoms()) {
108  << "Property list " << pn << " too short, only " << tokens.size()
109  << " elements found. Ignoring it." << std::endl;
110  return;
111  }
112  std::string mv = missingValueMarker;
113  size_t first_token = 0;
114  if (tokens.size() == mol.getNumAtoms() + 1 && tokens[0].front() == '[' &&
115  tokens[0].back() == ']') {
116  mv = std::string(tokens[0].begin() + 1, tokens[0].end() - 1);
117  first_token = 1;
118  }
119  if (mv.empty()) {
120  BOOST_LOG(rdWarningLog) << "Missing value marker for property " << pn
121  << " is empty." << std::endl;
122  }
123  for (size_t i = first_token; i < tokens.size(); ++i) {
124  if (tokens[i] != mv) {
125  unsigned int atomid = i - first_token;
126  try {
127  T apv = boost::lexical_cast<T>(tokens[i]);
128  mol.getAtomWithIdx(atomid)->setProp(atompn, apv);
129  } catch (const boost::bad_lexical_cast &) {
131  << "Value " << tokens[i] << " for property " << pn << " of atom "
132  << atomid << " can not be parsed. Ignoring it." << std::endl;
133  }
134  }
135  }
136 }
137 
138 //! applies all properties matching a particular prefix as an atom property
139 //! list
140 template <typename T>
141 void applyMolListPropsToAtoms(ROMol &mol, const std::string &prefix,
142  const std::string missingValueMarker = "n/a") {
143  for (auto pn : mol.getPropList()) {
144  if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
145  applyMolListPropToAtoms<T>(mol, pn, prefix, missingValueMarker);
146  }
147  }
148 }
149 static const std::string atomPropPrefix = "atom.";
150 //! if the property name matches our rules for atom property lists, we'll
151 //! apply it to the atoms
153  ROMol &mol, const std::string pn,
154  const std::string &missingValueMarker = "n/a") {
155  if (pn.find(atomPropPrefix) == 0 && pn.length() > atomPropPrefix.length()) {
156  std::string prefix = atomPropPrefix + "prop.";
157  if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
158  applyMolListPropToAtoms<std::string>(mol, pn, prefix, missingValueMarker);
159  } else {
160  prefix = atomPropPrefix + "iprop.";
161  if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
162  applyMolListPropToAtoms<std::int64_t>(mol, pn, prefix,
163  missingValueMarker);
164  } else {
165  prefix = atomPropPrefix + "dprop.";
166  if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
167  applyMolListPropToAtoms<double>(mol, pn, prefix, missingValueMarker);
168  } else {
169  prefix = atomPropPrefix + "bprop.";
170  if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
171  applyMolListPropToAtoms<bool>(mol, pn, prefix, missingValueMarker);
172  }
173  }
174  }
175  }
176  }
177 }
178 //! loops over all properties and applies the ones that match the rules for
179 //! atom property lists to the atoms
181  ROMol &mol, const std::string &missingValueMarker = "n/a") {
182  for (auto pn : mol.getPropList()) {
183  processMolPropertyList(mol, pn, missingValueMarker);
184  }
185 }
186 template <typename T>
187 std::string getAtomPropertyList(ROMol &mol, const std::string &atomPropName,
188  std::string missingValueMarker = "",
189  unsigned int lineSize = 190) {
190  std::string res;
191  std::string propVal;
192  if (!missingValueMarker.empty()) {
193  propVal += boost::str(boost::format("[%s] ") % missingValueMarker);
194  } else {
195  missingValueMarker = "n/a";
196  }
197  for (const auto &atom : mol.atoms()) {
198  std::string apVal = missingValueMarker;
199  if (atom->hasProp(atomPropName)) {
200  T tVal = atom->getProp<T>(atomPropName);
201  apVal = boost::lexical_cast<std::string>(tVal);
202  // seems like this should work, but it doesn't:
203  // atom->getProp(atomPropName,apVal);
204  }
205  if (propVal.length() + apVal.length() + 1 >= lineSize) {
206  // remove trailing space:
207  propVal.pop_back();
208  res += propVal + "\n";
209  propVal = "";
210  }
211  propVal += apVal + " ";
212  }
213  if (!propVal.empty()) {
214  // remove the trailing space:
215  propVal.pop_back();
216  res += propVal;
217  }
218  return res;
219 }
221  ROMol &mol, const std::string &atomPropName,
222  const std::string &missingValueMarker = "", unsigned int lineSize = 190) {
223  std::string molPropName = "atom.iprop." + atomPropName;
224  mol.setProp(molPropName,
225  getAtomPropertyList<boost::int64_t>(
226  mol, atomPropName, missingValueMarker, lineSize));
227 }
229  ROMol &mol, const std::string &atomPropName,
230  const std::string &missingValueMarker = "", unsigned int lineSize = 190) {
231  std::string molPropName = "atom.dprop." + atomPropName;
232  mol.setProp(molPropName,
233  getAtomPropertyList<double>(mol, atomPropName, missingValueMarker,
234  lineSize));
235 }
237  ROMol &mol, const std::string &atomPropName,
238  const std::string &missingValueMarker = "", unsigned int lineSize = 190) {
239  std::string molPropName = "atom.bprop." + atomPropName;
240  mol.setProp(molPropName,
241  getAtomPropertyList<bool>(mol, atomPropName, missingValueMarker,
242  lineSize));
243 }
245  ROMol &mol, const std::string &atomPropName,
246  const std::string &missingValueMarker = "", unsigned int lineSize = 190) {
247  std::string molPropName = "atom.prop." + atomPropName;
248  mol.setProp(molPropName,
249  getAtomPropertyList<std::string>(mol, atomPropName,
250  missingValueMarker, lineSize));
251 }
252 
253 } // namespace FileParserUtils
254 } // namespace RDKit
255 
256 #endif
#define BOOST_LOG(__arg__)
Definition: RDLog.h:110
RDKIT_RDGENERAL_EXPORT RDLogger rdWarningLog
The class for representing atoms.
Definition: Atom.h:68
The class for representing 2D or 3D conformation of a molecule.
Definition: Conformer.h:45
void getProp(const std::string &key, T &res) const
allows retrieval of a particular property value
Definition: RDProps.h:107
void setProp(const std::string &key, T val, bool computed=false) const
sets a property value
Definition: RDProps.h:77
STR_VECT getPropList(bool includePrivate=true, bool includeComputed=true) const
returns a list with the names of our properties
Definition: RDProps.h:45
unsigned int getNumAtoms() const
returns our number of atoms
Definition: ROMol.h:415
CXXAtomIterator< MolGraph, Atom * > atoms()
C++11 Range iterator.
Definition: ROMol.h:277
Atom * getAtomWithIdx(unsigned int idx)
returns a pointer to a particular Atom
RWMol is a molecule class that is intended to be edited.
Definition: RWMol.h:32
#define RDKIT_FILEPARSERS_EXPORT
Definition: export.h:161
void processMolPropertyList(ROMol &mol, const std::string pn, const std::string &missingValueMarker="n/a")
RDKIT_FILEPARSERS_EXPORT std::string getV3000CTAB(const ROMol &tmol, int confId=-1)
RDKIT_FILEPARSERS_EXPORT void finishMolProcessing(RWMol *res, bool chiralityPossible, bool sanitize, bool removeHs)
void createAtomDoublePropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=190)
RDKIT_FILEPARSERS_EXPORT double toDouble(const std::string &input, bool acceptSpaces=true)
void createAtomIntPropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=190)
RDKIT_FILEPARSERS_EXPORT int toInt(const std::string &input, bool acceptSpaces=true)
RDKIT_FILEPARSERS_EXPORT Atom * replaceAtomWithQueryAtom(RWMol *mol, Atom *atom)
Deprecated, please use QueryOps::replaceAtomWithQueryAtom instead.
T stripSpacesAndCast(std::string_view input, bool acceptSpaces=false)
void createAtomStringPropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=190)
void applyMolListPropToAtoms(ROMol &mol, const std::string &pn, const std::string &prefix, const std::string &missingValueMarker="n/a")
applies a particular property to the atoms as an atom property list
std::string getAtomPropertyList(ROMol &mol, const std::string &atomPropName, std::string missingValueMarker="", unsigned int lineSize=190)
RDKIT_FILEPARSERS_EXPORT std::string_view strip(std::string_view orig, std::string stripChars=" \t\r\n")
void applyMolListPropsToAtoms(ROMol &mol, const std::string &prefix, const std::string missingValueMarker="n/a")
RDKIT_FILEPARSERS_EXPORT bool ParseV3000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf, bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds, bool strictParsing=true, bool expectMEND=true)
void processMolPropertyLists(ROMol &mol, const std::string &missingValueMarker="n/a")
RDKIT_FILEPARSERS_EXPORT bool ParseV2000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf, bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds, bool strictParsing=true)
static const std::string atomPropPrefix
RDKIT_FILEPARSERS_EXPORT std::string getV3000Line(std::istream *inStream, unsigned int &line)
RDKIT_FILEPARSERS_EXPORT unsigned int toUnsigned(const std::string &input, bool acceptSpaces=true)
void createAtomBoolPropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=190)
RDKIT_GRAPHMOL_EXPORT ROMol * removeHs(const ROMol &mol, bool implicitOnly=false, bool updateExplicitCount=false, bool sanitize=true)
returns a copy of a molecule with hydrogens removed
Std stuff.
Definition: Abbreviations.h:19