Skip to content

Commit 56777bc

Browse files
committed
Handle some extra cases of isotope and charge parsing.
1 parent c3e90ed commit 56777bc

File tree

2 files changed

+109
-60
lines changed

2 files changed

+109
-60
lines changed

tool/formula/src/main/java/org/openscience/cdk/tools/manipulator/MolecularFormulaManipulator.java

Lines changed: 88 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -630,11 +630,22 @@ private static boolean isDigit(char c) {
630630
return c >= '0' && c <= '9';
631631
}
632632

633+
private static boolean isSign(char c) {
634+
return c == '+' || c == '-' || c == MINUS;
635+
}
636+
633637
// helper class for parsing MFs
634638
private static final class CharIter {
635639
int pos;
636640
String str;
637641

642+
643+
/**
 * Creates an iterator over {@code str} whose cursor starts at {@code pos}.
 *
 * @param pos index of the first character to be read
 * @param str the string to iterate over
 */
public CharIter(int pos, String str) {
    this.str = str;
    this.pos = pos;
}
647+
648+
638649
char next() {
639650
return pos == str.length() ? '\0' : str.charAt(pos++);
640651
}
@@ -654,6 +665,20 @@ int nextUInt() {
654665
return res;
655666
}
656667

668+
Elements nextElement() {
669+
char c1 = next();
670+
if (!isUpper(c1)) {
671+
if (c1 != '\0') pos--;
672+
return null;
673+
}
674+
char c2 = next();
675+
if (!isLower(c2)) {
676+
if (c2 != '\0') pos--;
677+
return Elements.ofString("" + c1);
678+
}
679+
return Elements.ofString("" + c1 + c2);
680+
}
681+
657682
boolean nextIf(char c) {
658683
if (str.charAt(pos) == c) {
659684
pos++;
@@ -675,18 +700,14 @@ private static boolean parseIsotope(CharIter iter,
675700
mass = iter.nextUInt();
676701
if (mass < 0)
677702
return false;
703+
elem = iter.nextElement(); // optional
678704
if (!iter.nextIf(']'))
679705
return false;
680706
}
681-
char c1 = iter.next();
682-
char c2 = iter.next();
683-
if (!isLower(c2)) {
684-
// could use a switch, see SMARTS parser
685-
elem = Elements.ofString("" + c1);
686-
if (c2 != '\0')
687-
iter.pos--;
688-
} else {
689-
elem = Elements.ofString("" + c1 + c2);
707+
if (elem == null) {
708+
elem = iter.nextElement();
709+
if (elem == null)
710+
return false;
690711
}
691712
count = iter.nextUInt();
692713
if (count < 0)
@@ -730,13 +751,19 @@ private static IMolecularFormula getMolecularFormula(String stringMF, IMolecular
730751
// Extract charge from String when contains []X- format
731752
Integer charge = null;
732753
if ((stringMF.contains("[") && stringMF.contains("]")) && (stringMF.contains("+") || stringMF.contains(HYPHEN_STR) || stringMF.contains(MINUS_STR))) {
733-
charge = extractCharge(stringMF);
734-
stringMF = cleanMFfromCharge(stringMF);
754+
int pos = findChargePosition(stringMF);
755+
if (pos >= 0) {
756+
charge = parseCharge(new CharIter(pos, stringMF));
757+
stringMF = stringMF.substring(0, pos);
758+
if (stringMF.charAt(0) == '[' &&
759+
stringMF.charAt(stringMF.length()-1) == ']')
760+
stringMF = stringMF.substring(1, stringMF.length()-1);
761+
}
735762
}
736763
if (stringMF.isEmpty())
737764
return null;
738765
int len = stringMF.length();
739-
CharIter iter = new CharIter();
766+
CharIter iter = new CharIter(0, stringMF);
740767
iter.str = stringMF;
741768
while (iter.pos < len) {
742769
if (!parseIsotope(iter, formula, assumeMajorIsotope)) {
@@ -750,57 +777,58 @@ private static IMolecularFormula getMolecularFormula(String stringMF, IMolecular
750777
return formula;
751778
}
752779

753-
/**
754-
* Extract the molecular formula when it is defined with charge. e.g. [O3S]2-.
755-
*
756-
* @param formula The formula to inspect
757-
* @return The corrected formula
758-
*/
759-
private static String cleanMFfromCharge(String formula) {
760-
if (!(formula.contains("[") && formula.contains("]"))) return formula;
761-
boolean startBreak = false;
762-
String finalFormula = "";
763-
for (int f = 0; f < formula.length(); f++) {
764-
char thisChar = formula.charAt(f);
765-
if (thisChar == '[') {
766-
// start
767-
startBreak = true;
768-
} else if (thisChar == ']') {
780+
781+
/**
 * Parses a charge starting at the iterator's current position.
 *
 * An unsigned number is read first, then the next character is read and
 * checked for a sign ('+', HYPHEN or MINUS). If no number was read before
 * the sign a second attempt is made after it, and if that also fails the
 * magnitude defaults to 1. If the first character read was not a sign,
 * one further character is read and checked.
 *
 * @param iter the character iterator, positioned where the charge begins
 * @return sign multiplied by magnitude; 0 when no sign character was found
 */
private static int parseCharge(CharIter iter) {
782+
int sign = 0;
783+
// a negative value from nextUInt() is treated below as "no number present"
int number = iter.nextUInt();
784+
// first chance for the sign: number-before-sign form or a bare sign
switch (iter.next()) {
785+
case '+':
786+
sign = +1;
787+
break;
788+
case HYPHEN:
789+
case MINUS:
790+
sign = -1;
769791
break;
770-
} else if (startBreak) finalFormula += thisChar;
771792
}
772-
return finalFormula;
793+
// no leading number: try to read one after the sign
if (number < 0)
794+
number = iter.nextUInt();
795+
// still no number: a bare sign means a magnitude of one
if (number < 0)
796+
number = 1;
797+
// second chance for the sign when the first character read was not one
if (sign == 0) {
798+
switch (iter.next()) {
799+
case '+':
800+
sign = +1;
801+
break;
802+
case HYPHEN:
803+
case MINUS:
804+
sign = -1;
805+
break;
806+
}
807+
}
808+
// sign is still 0 if no sign character was seen, giving a result of 0
return sign * number;
773809
}
774810

775811
/**
776-
* Extract the charge given a molecular formula format [O3S]2-.
812+
* Extract the charge position given a molecular formula format [O3S]2-.
777813
*
778814
* @param formula The formula to inspect
779-
* @return The charge
815+
* @return The charge position in the string
780816
*/
781-
private static int extractCharge(String formula) {
782-
783-
if (!(formula.contains("[") && formula.contains("]") && (formula.contains("+") || formula.contains(HYPHEN_STR) || formula.contains(MINUS_STR))))
784-
return 0;
785-
786-
boolean finishBreak = false;
787-
String multiple = "";
788-
for (int f = 0; f < formula.length(); f++) {
789-
char thisChar = formula.charAt(f);
790-
if (thisChar == ']') {
791-
// finish
792-
finishBreak = true;
793-
} else if (thisChar == HYPHEN || thisChar == MINUS) {
794-
multiple = HYPHEN + multiple;
795-
break;
796-
} else if (thisChar == '+') {
797-
break;
798-
} else if (finishBreak) {
799-
multiple += thisChar;
800-
}
801-
}
802-
if (multiple.isEmpty() || multiple.equals(HYPHEN_STR) || multiple.equals(MINUS_STR)) multiple += 1;
803-
return Integer.valueOf(multiple);
817+
private static int findChargePosition(String formula) {
818+
int end = formula.length() - 1;
819+
int pos = end;
820+
while (pos >= 0 && isSign(formula.charAt(pos)))
821+
pos--;
822+
int mark1 = pos;
823+
while (pos >= 0 && isDigit(formula.charAt(pos)))
824+
pos--;
825+
int mark2 = pos;
826+
while (pos >= 0 && isSign(formula.charAt(pos)))
827+
pos--;
828+
if (pos == mark2 && formula.charAt(pos) != ']')
829+
pos = mark1; // not a charge CH3- we sucked up a number
830+
if (pos == 0) return -1;
831+
return pos+1;
804832
}
805833

806834
/**
@@ -1359,18 +1387,18 @@ private static String breakExtractor(String formula) {
13591387
boolean finalBreak = false;
13601388

13611389
int innerMostBracket = formula.lastIndexOf("(");
1362-
1390+
13631391
if (innerMostBracket<0)
13641392
return formula;
1365-
1393+
13661394
String finalformula = formula.substring(0, innerMostBracket);
13671395
String multipliedformula = "";
13681396
String formulaEnd = "";
13691397
String multiple = "";
1370-
1398+
13711399
for (int f = innerMostBracket + 1; f < formula.length(); f++) {
13721400
char thisChar = formula.charAt(f);
1373-
1401+
13741402
if ( finalBreak ) {
13751403
if ( isDigit(thisChar) ){
13761404
multiple += thisChar;
@@ -1386,7 +1414,7 @@ private static String breakExtractor(String formula) {
13861414
}
13871415
}
13881416
finalformula += muliplier(multipliedformula, multiple.isEmpty() ? 1:Integer.valueOf(multiple)) + formulaEnd;
1389-
1417+
13901418
if (finalformula.contains("("))
13911419
return breakExtractor(finalformula);
13921420
else

tool/formula/src/test/java/org/openscience/cdk/tools/manipulator/MolecularFormulaManipulatorTest.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1461,4 +1461,25 @@ public void getMostAbundantFe100() {
14611461
Assert.assertThat(MolecularFormulaManipulator.getMass(mf, MostAbundant),
14621462
closeTo(4731.154, 0.001));
14631463
}
1464+
1465+
public void roundtrip(String mfStr, String expected) {
1466+
IChemObjectBuilder bldr = SilentChemObjectBuilder.getInstance();
1467+
IMolecularFormula mf =
1468+
MolecularFormulaManipulator.getMolecularFormula(mfStr, bldr);
1469+
String actual = getString(mf, false, true);
1470+
Assert.assertEquals(expected, actual);
1471+
}
1472+
1473+
@Test public void testIsotopeAndChargeParsing() {
1474+
// proffered input
1475+
roundtrip("[[2]H2]-", "[[2]H2]-");
1476+
// missing outer brackets, isotope+element in square brackets
1477+
roundtrip("[[2H]2]-", "[[2]H2]-");
1478+
roundtrip("[[2H]2]+", "[[2]H2]+"); // [2H]2+ is ambiguous
1479+
// missing outer brackets, isotope in square brackets
1480+
roundtrip("[2]H2-", "[[2]H2]-");
1481+
roundtrip("[2]H2+", "[[2]H2]+");
1482+
// +2 => 2+ with brackets
1483+
roundtrip("[2H]2+2", "[[2]H2]2+");
1484+
}
14641485
}

0 commit comments

Comments
 (0)