Skip to content

Commit 7ec685d

Browse files
committed
Split atoms by atomic mass in the canon algorithm only when requested via an option, rather than doing this by default, which would lead to incorrect results in "unique SMILES"
1 parent b6351be commit 7ec685d

File tree

3 files changed

+112
-6
lines changed

3 files changed

+112
-6
lines changed

base/standard/src/main/java/org/openscience/cdk/graph/invariant/Canon.java

Lines changed: 64 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
import java.io.IOException;
3434
import java.io.InputStreamReader;
3535
import java.util.Arrays;
36-
import java.util.Collections;
3736
import java.util.Comparator;
3837

3938
/**
@@ -102,6 +101,25 @@ private Canon(int[][] g, long[] partition, boolean[] hydrogens, boolean symOnly)
102101
symmetry = refine(labelling, hydrogens);
103102
}
104103

104+
/**
105+
* Compute the canonical labels for the provided structure. The labelling
106+
* does not consider isomer information or stereochemistry. The current
107+
* implementation does not fully distinguish all structure topologies
108+
* but in practice performs well in the majority of cases. A complete
109+
* canonical labelling can be obtained using the {@link InChINumbersTools}
110+
* but is computationally much more expensive.
111+
*
112+
* @param container structure
113+
* @param g adjacency list graph representation
114+
* @param opts canonical generation options see {@link CanonOpts}
115+
* @return the canonical labelling
116+
* @see EquivalentClassPartitioner
117+
* @see InChINumbersTools
118+
*/
119+
public static long[] label(IAtomContainer container, int[][] g, int opts) {
120+
return label(container, g, basicInvariants(container, g, opts));
121+
}
122+
105123
/**
106124
* Compute the canonical labels for the provided structure. The labelling
107125
* does not consider isomer information or stereochemistry. The current
@@ -117,7 +135,7 @@ private Canon(int[][] g, long[] partition, boolean[] hydrogens, boolean symOnly)
117135
* @see InChINumbersTools
118136
*/
119137
public static long[] label(IAtomContainer container, int[][] g) {
120-
return label(container, g, basicInvariants(container, g));
138+
return label(container, g, CanonOpts.Default);
121139
}
122140

123141
/**
@@ -182,13 +200,32 @@ public static long[] label(IAtomContainer container,
182200
*
183201
* @param container structure
184202
* @param g adjacency list graph representation
203+
* @param opts canonical generation options see {@link CanonOpts}
185204
* @return symmetry classes
186205
* @see EquivalentClassPartitioner
187206
*/
207+
public static long[] symmetry(IAtomContainer container, int[][] g, int opts) {
208+
return new Canon(g, basicInvariants(container, g, opts), terminalHydrogens(container, g), true).symmetry;
209+
}
210+
211+
/**
212+
* Compute the symmetry classes for the provided structure. There are known
213+
* examples where symmetry is incorrectly found. The {@link
214+
* EquivalentClassPartitioner} gives more accurate symmetry perception but
215+
* this method is very quick and in practice successfully partitions the
216+
* majority of chemical structures.
217+
*
218+
* @param container structure
219+
* @param g adjacency list graph representation
220+
* @return symmetry classes
221+
* @see EquivalentClassPartitioner
222+
* @see #basicInvariants(IAtomContainer, int[][], int)
223+
*/
188224
public static long[] symmetry(IAtomContainer container, int[][] g) {
189-
return new Canon(g, basicInvariants(container, g), terminalHydrogens(container, g), true).symmetry;
225+
return symmetry(container, g, CanonOpts.Default);
190226
}
191227

228+
192229
/**
193230
* Internal - refine invariants to a canonical labelling and
194231
* symmetry classes.
@@ -304,6 +341,18 @@ private long primeProduct(int[] ws, long[] ranks, boolean[] hydrogens) {
304341
return prod;
305342
}
306343

344+
/**
345+
* See {@link #basicInvariants(IAtomContainer, int[][], int)}.
346+
* @param container an atom container to generate labels for
347+
* @param graph graph representation (adjacency list)
348+
349+
* @return the initial invariants
350+
* @see #basicInvariants(IAtomContainer, int[][], int)
351+
*/
352+
public static long[] basicInvariants(IAtomContainer container, int[][] graph) {
353+
return basicInvariants(container, graph, CanonOpts.Default);
354+
}
355+
307356
/**
308357
* Generate the initial invariants for each atom in the {@code container}.
309358
* The labels use the invariants described in {@cdk.cite WEI89}.
@@ -328,11 +377,12 @@ private long primeProduct(int[] ws, long[] ranks, boolean[] hydrogens) {
328377
*
329378
* @param container an atom container to generate labels for
330379
* @param graph graph representation (adjacency list)
380+
* @param flav bit mask canon flavor (see {@link CanonOpts})
331381
* @return initial invariants
332382
* @throws NullPointerException an atom had unset atomic number, hydrogen
333383
* count or formal charge
334384
*/
335-
public static long[] basicInvariants(IAtomContainer container, int[][] graph) {
385+
public static long[] basicInvariants(IAtomContainer container, int[][] graph, int flav) {
336386

337387
long[] labels = new long[graph.length];
338388

@@ -362,6 +412,16 @@ public static long[] basicInvariants(IAtomContainer container, int[][] graph) {
362412
label <<= 4; // hydrogen count <= 15 (4 bits)
363413
label |= impH + expH & 0xf;
364414

415+
// use atomic mass to split ties (if the flavour requests it); we can't do this
416+
// by default because "unique" smiles doesn't include the isotopic mass
417+
// so splitting on something that doesn't appear in the output would not
418+
// function correctly
419+
// n.b. the comparator based invariants are much more flexible still
420+
if ((flav & CanonOpts.AtomicMass) != 0 && atom.getMassNumber() != null) {
421+
label <<= 10;
422+
label |= atom.getMassNumber();
423+
}
424+
365425
labels[v] = label;
366426
}
367427
return labels;
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
/*
2+
* Copyright (c) 2021 John Mayfield
3+
*
4+
* Contact: cdk-devel@lists.sourceforge.net
5+
*
6+
* This program is free software; you can redistribute it and/or modify it
7+
* under the terms of the GNU Lesser General Public License as published by
8+
* the Free Software Foundation; either version 2.1 of the License, or (at
9+
* your option) any later version. All we ask is that proper credit is given
10+
* for our work, which includes - but is not limited to - adding the above
11+
* copyright notice to the beginning of your source code files, and to any
12+
* copyright notice that you may distribute with programs based on this work.
13+
*
14+
* This program is distributed in the hope that it will be useful, but WITHOUT
15+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
17+
* License for more details.
18+
*
19+
* You should have received a copy of the GNU Lesser General Public License
20+
* along with this program; if not, write to the Free Software
21+
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22+
*/
23+
24+
package org.openscience.cdk.graph.invariant;
25+
26+
/**
27+
* Basic flavor options to tweak canonical invariants. Note that these deliberately mirror some fields
28+
* from the {@link SmiFlavor} settings.
29+
*/
30+
public class CanonOpts {
31+
/**
32+
* Default canon flavour options.
33+
*/
34+
public static final int Default = 0;
35+
/**
36+
* Distinguish atoms based on atomic mass.
37+
*/
38+
public static final int AtomicMass = 0x008;
39+
}

base/test-standard/src/test/java/org/openscience/cdk/graph/invariant/CanonTest.java

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424

2525
package org.openscience.cdk.graph.invariant;
2626

27-
import org.junit.Assert;
2827
import org.junit.Test;
2928
import org.openscience.cdk.CDKConstants;
3029
import org.openscience.cdk.graph.GraphUtil;
@@ -34,7 +33,6 @@
3433
import org.openscience.cdk.smiles.SmiFlavor;
3534
import org.openscience.cdk.smiles.SmilesGenerator;
3635
import org.openscience.cdk.smiles.SmilesParser;
37-
import org.xmlcml.euclid.Int;
3836

3937
import java.util.Comparator;
4038

@@ -134,6 +132,15 @@ public void bridgingExplicitHydrogensAreIncluded() throws Exception {
134132
assertThat(mask, is(new boolean[]{false, false, false, false}));
135133
}
136134

135+
@Test
136+
public void isotopeFlavor() throws Exception {
137+
IAtomContainer m = smi("CC[13CH3]");
138+
long[] symmetry = Canon.symmetry(m, GraphUtil.toAdjList(m), CanonOpts.Default);
139+
assertThat(symmetry, is(new long[]{1,3,1}));
140+
long[] symmetry2 = Canon.symmetry(m, GraphUtil.toAdjList(m), CanonOpts.AtomicMass);
141+
assertThat(symmetry2, is(new long[]{1,2,3}));
142+
}
143+
137144
@Test
138145
public void explicitHydrogensIonsAreIncluded() throws Exception {
139146
IAtomContainer m = smi("[H+]");

0 commit comments

Comments
 (0)