@@ -630,11 +630,22 @@ private static boolean isDigit(char c) {
630630 return c >= '0' && c <= '9' ;
631631 }
632632
633+ private static boolean isSign (char c ) {
634+ return c == '+' || c == '-' || c == MINUS ;
635+ }
636+
633637 // helper class for parsing MFs
634638 private static final class CharIter {
635639 int pos ;
636640 String str ;
637641
642+
/**
 * Creates an iterator over {@code str} whose first read happens at index
 * {@code pos}.
 *
 * @param pos index at which reading starts
 * @param str the string to iterate over
 */
public CharIter(int pos, String str) {
    this.str = str;
    this.pos = pos;
}
647+
648+
638649 char next () {
639650 return pos == str .length () ? '\0' : str .charAt (pos ++);
640651 }
@@ -654,6 +665,20 @@ int nextUInt() {
654665 return res ;
655666 }
656667
668+ Elements nextElement () {
669+ char c1 = next ();
670+ if (!isUpper (c1 )) {
671+ if (c1 != '\0' ) pos --;
672+ return null ;
673+ }
674+ char c2 = next ();
675+ if (!isLower (c2 )) {
676+ if (c2 != '\0' ) pos --;
677+ return Elements .ofString ("" + c1 );
678+ }
679+ return Elements .ofString ("" + c1 + c2 );
680+ }
681+
657682 boolean nextIf (char c ) {
658683 if (str .charAt (pos ) == c ) {
659684 pos ++;
@@ -675,18 +700,14 @@ private static boolean parseIsotope(CharIter iter,
675700 mass = iter .nextUInt ();
676701 if (mass < 0 )
677702 return false ;
703+ elem = iter .nextElement (); // optional
678704 if (!iter .nextIf (']' ))
679705 return false ;
680706 }
681- char c1 = iter .next ();
682- char c2 = iter .next ();
683- if (!isLower (c2 )) {
684- // could use a switch, see SMARTS parser
685- elem = Elements .ofString ("" + c1 );
686- if (c2 != '\0' )
687- iter .pos --;
688- } else {
689- elem = Elements .ofString ("" + c1 + c2 );
707+ if (elem == null ) {
708+ elem = iter .nextElement ();
709+ if (elem == null )
710+ return false ;
690711 }
691712 count = iter .nextUInt ();
692713 if (count < 0 )
@@ -730,13 +751,19 @@ private static IMolecularFormula getMolecularFormula(String stringMF, IMolecular
730751 // Extract charge from String when contains []X- format
731752 Integer charge = null ;
732753 if ((stringMF .contains ("[" ) && stringMF .contains ("]" )) && (stringMF .contains ("+" ) || stringMF .contains (HYPHEN_STR ) || stringMF .contains (MINUS_STR ))) {
733- charge = extractCharge (stringMF );
734- stringMF = cleanMFfromCharge (stringMF );
754+ int pos = findChargePosition (stringMF );
755+ if (pos >= 0 ) {
756+ charge = parseCharge (new CharIter (pos , stringMF ));
757+ stringMF = stringMF .substring (0 , pos );
758+ if (stringMF .charAt (0 ) == '[' &&
759+ stringMF .charAt (stringMF .length ()-1 ) == ']' )
760+ stringMF = stringMF .substring (1 , stringMF .length ()-1 );
761+ }
735762 }
736763 if (stringMF .isEmpty ())
737764 return null ;
738765 int len = stringMF .length ();
739- CharIter iter = new CharIter ();
766+ CharIter iter = new CharIter (0 , stringMF );
740767 iter .str = stringMF ;
741768 while (iter .pos < len ) {
742769 if (!parseIsotope (iter , formula , assumeMajorIsotope )) {
@@ -750,57 +777,58 @@ private static IMolecularFormula getMolecularFormula(String stringMF, IMolecular
750777 return formula ;
751778 }
752779
753- /**
754- * Extract the molecular formula when it is defined with charge. e.g. [O3S]2-.
755- *
756- * @param formula The formula to inspect
757- * @return The corrected formula
758- */
759- private static String cleanMFfromCharge (String formula ) {
760- if (!(formula .contains ("[" ) && formula .contains ("]" ))) return formula ;
761- boolean startBreak = false ;
762- String finalFormula = "" ;
763- for (int f = 0 ; f < formula .length (); f ++) {
764- char thisChar = formula .charAt (f );
765- if (thisChar == '[' ) {
766- // start
767- startBreak = true ;
768- } else if (thisChar == ']' ) {
780+
781+ private static int parseCharge (CharIter iter ) {
782+ int sign = 0 ;
783+ int number = iter .nextUInt ();
784+ switch (iter .next ()) {
785+ case '+' :
786+ sign = +1 ;
787+ break ;
788+ case HYPHEN :
789+ case MINUS :
790+ sign = -1 ;
769791 break ;
770- } else if (startBreak ) finalFormula += thisChar ;
771792 }
772- return finalFormula ;
793+ if (number < 0 )
794+ number = iter .nextUInt ();
795+ if (number < 0 )
796+ number = 1 ;
797+ if (sign == 0 ) {
798+ switch (iter .next ()) {
799+ case '+' :
800+ sign = +1 ;
801+ break ;
802+ case HYPHEN :
803+ case MINUS :
804+ sign = -1 ;
805+ break ;
806+ }
807+ }
808+ return sign * number ;
773809 }
774810
775811 /**
776- * Extract the charge given a molecular formula format [O3S]2-.
812+ * Extract the charge position given a molecular formula format [O3S]2-.
777813 *
778814 * @param formula The formula to inspect
779- * @return The charge
815+ * @return The charge position in the string
780816 */
781- private static int extractCharge (String formula ) {
782-
783- if (!(formula .contains ("[" ) && formula .contains ("]" ) && (formula .contains ("+" ) || formula .contains (HYPHEN_STR ) || formula .contains (MINUS_STR ))))
784- return 0 ;
785-
786- boolean finishBreak = false ;
787- String multiple = "" ;
788- for (int f = 0 ; f < formula .length (); f ++) {
789- char thisChar = formula .charAt (f );
790- if (thisChar == ']' ) {
791- // finish
792- finishBreak = true ;
793- } else if (thisChar == HYPHEN || thisChar == MINUS ) {
794- multiple = HYPHEN + multiple ;
795- break ;
796- } else if (thisChar == '+' ) {
797- break ;
798- } else if (finishBreak ) {
799- multiple += thisChar ;
800- }
801- }
802- if (multiple .isEmpty () || multiple .equals (HYPHEN_STR ) || multiple .equals (MINUS_STR )) multiple += 1 ;
803- return Integer .valueOf (multiple );
817+ private static int findChargePosition (String formula ) {
818+ int end = formula .length () - 1 ;
819+ int pos = end ;
820+ while (pos >= 0 && isSign (formula .charAt (pos )))
821+ pos --;
822+ int mark1 = pos ;
823+ while (pos >= 0 && isDigit (formula .charAt (pos )))
824+ pos --;
825+ int mark2 = pos ;
826+ while (pos >= 0 && isSign (formula .charAt (pos )))
827+ pos --;
828+ if (pos == mark2 && formula .charAt (pos ) != ']' )
829+ pos = mark1 ; // not a charge CH3- we sucked up a number
830+ if (pos == 0 ) return -1 ;
831+ return pos +1 ;
804832 }
805833
806834 /**
@@ -1359,18 +1387,18 @@ private static String breakExtractor(String formula) {
13591387 boolean finalBreak = false ;
13601388
13611389 int innerMostBracket = formula .lastIndexOf ("(" );
1362-
1390+
13631391 if (innerMostBracket <0 )
13641392 return formula ;
1365-
1393+
13661394 String finalformula = formula .substring (0 , innerMostBracket );
13671395 String multipliedformula = "" ;
13681396 String formulaEnd = "" ;
13691397 String multiple = "" ;
1370-
1398+
13711399 for (int f = innerMostBracket + 1 ; f < formula .length (); f ++) {
13721400 char thisChar = formula .charAt (f );
1373-
1401+
13741402 if ( finalBreak ) {
13751403 if ( isDigit (thisChar ) ){
13761404 multiple += thisChar ;
@@ -1386,7 +1414,7 @@ private static String breakExtractor(String formula) {
13861414 }
13871415 }
13881416 finalformula += muliplier (multipliedformula , multiple .isEmpty () ? 1 :Integer .valueOf (multiple )) + formulaEnd ;
1389-
1417+
13901418 if (finalformula .contains ("(" ))
13911419 return breakExtractor (finalformula );
13921420 else
0 commit comments