RDKit
Open-source cheminformatics and machine learning.
MHFP.h
Go to the documentation of this file.
1 //
2 // 2019, Daniel Probst, Reymond Group @ University of Bern
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 
11 /*! \file MHFP.h
12 
13 */
14 #include <RDGeneral/export.h>
15 #ifndef RD_MHFPFPS_H
16 #define RD_MHFPFPS_H
17 #include <string>
18 #include <vector>
19 #include <GraphMol/ROMol.h>
21 
22 class SparseBitVect;
23 namespace RDKit {
24 namespace MHFPFingerprints {
25 const std::string mhfpFingerprintVersion = "1.0.0";
26 
27 namespace FNV {
28 const uint32_t prime = 0x01000193;
29 const uint32_t seed = 0x811C9DC5;
30 
31 //! A simple implementation of the Fowler–Noll–Vo hash function.
32 inline uint32_t hash(const std::string& str, uint32_t hash = seed) {
33  const unsigned char* ptr = (const unsigned char*)str.c_str();
34  size_t len = str.length();
35 
36  while (len--) {
37  hash = (*ptr++ ^ hash) * prime;
38  }
39 
40  return hash;
41 };
42 } // namespace FNV
43 
45  public:
46  //! Constructor
47  /*!
48  \brief Construct a MHFPEncoder
49 
50  The MHFPEncoder class is instantieted with a given number of permutations
51  and a seed. Fingerprints / minhashes created with a different number of
52  permutations or a different seed are not compatible.
53 
54  \param n_permutations the number of permutations used to create hash
55  functions. This will be the dimensionality of the resulting vector.
56  Default: <tt>2048</tt>.
57  \param seed a random seed. Default: <tt>42</tt>.
58  */
59  MHFPEncoder(unsigned int n_permutations = 2048, unsigned int seed = 42);
60 
61  /*!
62  \brief Creates a MinHash from a vector of strings.
63 
64  This method is exposed in order to enable advanced usage of this MHFP
65  implementation such as customizing the properties that are hashed in order
66  to create an MHFP instance. In theory, any number of values that can be
67  represented as strings can be minhashed. This method is called
68  by MHFPEncoder::Encode.
69 
70  \param vec a vector containg strings (e.g. the smiles shingling of a
71  molecule).
72 
73  \returns the MinHash of the input.
74  */
75  std::vector<uint32_t> FromStringArray(const std::vector<std::string>& vec);
76 
77  /*!
78  \brief Creates a MinHash from a list of unsigned integers.
79 
80  This method is exposed in order to enable advanced usage of this MHFP
81  implementation such as MinHashing a sparse array generated by another
82  fingerprint (e.g. Morgan / ECFP).
83 
84  \param vec a vector containg unsigned integers.
85 
86  \returns the MinHash of the input.
87  */
88  std::vector<uint32_t> FromArray(const std::vector<uint32_t>& vec);
89 
90  /*!
91  \brief Creates a molecular shingling based on circular substructures.
92 
93  A molecular shingling is a vector of SMILES that were extracted from and
94  represent a molecule. This method extracts substructures centered at each
95  atom of the molecule with different radii. A molecule with 10 atoms will
96  generate <tt>10 * 3</tt> shingles when a radius of <tt>3</tt> is chosen.
97 
98  \param radius the maximum radius of the substructure that is generated at
99  each atom. Default: <tt>3</tt>.
100  \param rings whether the rings (SSSR) are extrected from the molecule and
101  added to the shingling. Given the molecule
102  <tt>"C1CCCCCC1C(=O)C"</tt>, "<tt>C1CCCCCC1"</tt> would be added
103  to the shingling. Default: <tt>true</tt>.
104  \param isomeric whether the SMILES added to the shingling are isomeric.
105  Default: <tt>false</tt>.
106  \param kekulize whether the SMILES added to the shingling are kekulized.
107  Default: <tt>true</tt>. NOTE that this will throw an exception if
108  the molecule cannot be kekulized.
109  \param min_radius the minimum radius that is used to extract n-grams.
110  Default: <tt>1</tt>.
111 
112  \returns the shingling of a molecule.
113  */
114  std::vector<std::string> CreateShingling(const ROMol& mol,
115  unsigned char radius = 3,
116  bool rings = true,
117  bool isomeric = false,
118  bool kekulize = false,
119  unsigned char min_radius = 1);
120 
121  //! \overload
122  std::vector<std::string> CreateShingling(const std::string& smiles,
123  unsigned char radius = 3,
124  bool rings = true,
125  bool isomeric = false,
126  bool kekulize = false,
127  unsigned char min_radius = 1);
128 
129  /*!
130  \brief Creates a MinHash vector from a molecule.
131 
132  This methods is a wrapper around MHFPEncoder::CreateShingling and
133  MHFPEncoder::FromStringArray. When a vector of molecules or SMILES is passed
134  and RDKit was compiled with OpenMP, it is parallelized and will speed up by
135  a factor of the number of cores.
136 
137  \param radius the maximum radius of the substructure that is generated at
138  each atom. Default: <tt>3</tt>.
139  \param rings whether the rings (SSSR) are extrected from the molecule and
140  added to the shingling. Given the molecule
141  <tt>"C1CCCCCC1C(=O)C"</tt>, "<tt>C1CCCCCC1"</tt> would be added
142  to the shingling. Default: <tt>true</tt>.
143  \param isomeric whether the SMILES added to the shingling are isomeric.
144  Default: <tt>false</tt>.
145  \param kekulize whether the SMILES added to the shingling are kekulized.
146  Default: <tt>true</tt>. NOTE that this will throw an exception if
147  the molecule cannot be kekulized.
148  \param min_radius the minimum radius that is used to extract n-grams.
149  Default: <tt>1</tt>.
150 
151  \returns the MHFP fingerprint.
152  */
153  std::vector<uint32_t> Encode(ROMol& mol, unsigned char radius = 3,
154  bool rings = true, bool isomeric = false,
155  bool kekulize = false,
156  unsigned char min_radius = 1);
157 
158  //! \overload
159  std::vector<std::vector<uint32_t>> Encode(std::vector<ROMol>& mols,
160  unsigned char radius = 3,
161  bool rings = true,
162  bool isomeric = false,
163  bool kekulize = false,
164  unsigned char min_radius = 1);
165 
166  //! \overload
167  std::vector<uint32_t> Encode(std::string& smiles, unsigned char radius = 3,
168  bool rings = true, bool isomeric = false,
169  bool kekulize = false,
170  unsigned char min_radius = 1);
171 
172  //! \overload
173  std::vector<std::vector<uint32_t>> Encode(std::vector<std::string>& smiles,
174  unsigned char radius = 3,
175  bool rings = true,
176  bool isomeric = false,
177  bool kekulize = false,
178  unsigned char min_radius = 1);
179 
180  /*!
181  \brief Creates a binary fingerprint based on circular sub-SMILES.
182 
183  Creates a binary fingerprint similar to ECFP. However, instead of using
184  a Morgan-style hashing, circular n-grams (sub-SMILES) are created, hashed
185  directly and folded.
186 
187  \param radius the maximum radius of the substructure that is generated at
188  each atom. Default: <tt>3</tt>.
189  \param rings whether the rings (SSSR) are extrected from the molecule and
190  added to the shingling. Given the molecule
191  <tt>"C1CCCCCC1C(=O)C"</tt>, "<tt>C1CCCCCC1"</tt> would be added
192  to the shingling. Default: <tt>true</tt>.
193  \param isomeric whether the SMILES added to the shingling are isomeric.
194  Default: <tt>false</tt>.
195  \param kekulize whether the SMILES added to the shingling are kekulized.
196  Default: <tt>true</tt>. NOTE that this will throw an exception if
197  the molecule cannot be kekulized.
198  \param min_radius the minimum radius that is used to extract n-grams.
199  Default: <tt>1</tt>.
200  \param length the length into which the fingerprint is folded.
201  Default: <tt>2048</tt>.
202 
203  \returns the SECFP fingerprint.
204  */
205  ExplicitBitVect EncodeSECFP(ROMol& mol, unsigned char radius = 3,
206  bool rings = true, bool isomeric = false,
207  bool kekulize = false,
208  unsigned char min_radius = 1,
209  size_t length = 2048);
210 
211  //! \overload
212  std::vector<ExplicitBitVect> EncodeSECFP(
213  std::vector<ROMol>& mols, unsigned char radius = 3, bool rings = true,
214  bool isomeric = false, bool kekulize = false,
215  unsigned char min_radius = 1, size_t length = 2048);
216 
217  //! \overload
218  ExplicitBitVect EncodeSECFP(std::string& smiles, unsigned char radius = 3,
219  bool rings = true, bool isomeric = false,
220  bool kekulize = false,
221  unsigned char min_radius = 1,
222  size_t length = 2048);
223 
224  //! \overload
225  std::vector<ExplicitBitVect> EncodeSECFP(
226  std::vector<std::string>& smiles, unsigned char radius = 3,
227  bool rings = true, bool isomeric = false, bool kekulize = false,
228  unsigned char min_radius = 1, size_t length = 2048);
229 
230  /*!
231  \brief Calculates the Hamming distance between two MHFP
232  fingerprints.
233 
234  \param a an MHFP fingerprint vector.
235  \param b an MHFP fingerprint vector.
236 
237  \returns the Hamming distance between the two fingerprints.
238  */
239  static double Distance(const std::vector<uint32_t>& a,
240  const std::vector<uint32_t>& b) {
241  size_t mismatches = 0;
242 
243  for (size_t i = 0; i < a.size(); i++) {
244  if (a[i] != b[i]) {
245  mismatches++;
246  }
247  }
248 
249  return mismatches / (double)a.size();
250  }
251 
252  private:
253  //! The fastest mod implementation.
254  uint64_t FastMod(const uint64_t input, const uint64_t ceil) {
255  return input >= ceil ? input % ceil : input;
256  }
257 
258  ExplicitBitVect Fold(const std::vector<uint32_t>& vec,
259  uint32_t length = 2048) {
260  ExplicitBitVect ebv(length);
261  for (size_t i = 0; i < vec.size(); i++) {
262  ebv.setBit(vec[i] % length);
263  }
264  return ebv;
265  }
266 
267  std::vector<uint32_t> HashShingling(std::vector<std::string> vec) {
268  std::vector<uint32_t> result(vec.size());
269  for (size_t i = 0; i < vec.size(); i++) {
270  result[i] = FNV::hash(vec[i]);
271  }
272  return result;
273  }
274 
275  unsigned int n_permutations_, seed_;
276  uint64_t prime_ = 2305843009213693951UL;
277  uint32_t max_hash_ = 4294967295;
278  std::vector<uint32_t> perms_a_;
279  std::vector<uint32_t> perms_b_;
280 };
281 
282 } // namespace MHFPFingerprints
283 } // namespace RDKit
284 
285 #endif
Defines the primary molecule class ROMol as well as associated typedefs.
a class for bit vectors that are densely occupied
static double Distance(const std::vector< uint32_t > &a, const std::vector< uint32_t > &b)
Calculates the Hamming distance between two MHFP fingerprints.
Definition: MHFP.h:239
std::vector< uint32_t > FromStringArray(const std::vector< std::string > &vec)
Creates a MinHash from a vector of strings.
std::vector< std::string > CreateShingling(const ROMol &mol, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1)
Creates a molecular shingling based on circular substructures.
std::vector< std::vector< uint32_t > > Encode(std::vector< std::string > &smiles, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1)
This is an overloaded member function, provided for convenience. It differs from the above function o...
ExplicitBitVect EncodeSECFP(std::string &smiles, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1, size_t length=2048)
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::vector< std::vector< uint32_t > > Encode(std::vector< ROMol > &mols, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1)
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::vector< ExplicitBitVect > EncodeSECFP(std::vector< std::string > &smiles, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1, size_t length=2048)
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::vector< std::string > CreateShingling(const std::string &smiles, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1)
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::vector< uint32_t > Encode(std::string &smiles, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1)
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::vector< uint32_t > FromArray(const std::vector< uint32_t > &vec)
Creates a MinHash from a list of unsigned integers.
ExplicitBitVect EncodeSECFP(ROMol &mol, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1, size_t length=2048)
Creates a binary fingerprint based on circular sub-SMILES.
MHFPEncoder(unsigned int n_permutations=2048, unsigned int seed=42)
Constructor.
std::vector< ExplicitBitVect > EncodeSECFP(std::vector< ROMol > &mols, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1, size_t length=2048)
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::vector< uint32_t > Encode(ROMol &mol, unsigned char radius=3, bool rings=true, bool isomeric=false, bool kekulize=false, unsigned char min_radius=1)
Creates a MinHash vector from a molecule.
a class for bit vectors that are sparsely occupied.
Definition: SparseBitVect.h:34
#define RDKIT_FINGERPRINTS_EXPORT
Definition: export.h:177
const uint32_t seed
Definition: MHFP.h:29
const uint32_t prime
Definition: MHFP.h:28
uint32_t hash(const std::string &str, uint32_t hash=seed)
A simple implementation of the Fowler–Noll–Vo hash function.
Definition: MHFP.h:32
const std::string mhfpFingerprintVersion
Definition: MHFP.h:25
Std stuff.
Definition: Abbreviations.h:19