Skip to content

Commit 80588e4

Browse files
wangjunboJunbo wang
authored andcommitted
[SPARK-32559][SQL] Fix the trim logic did't handle ASCII control characters correctly
### What changes were proposed in this pull request? The trim logic in Cast expression introduced in #29375 trim ASCII control characters unexpectly. Before this patch ![image](https://github.com/apache/spark/assets/25627922/ca6a2fb1-2143-4264-84d1-70b6bb755ec7) And hive ![image](https://github.com/apache/spark/assets/25627922/017aaa4a-133e-4396-9694-79f03f027bbe) ### Why are the changes needed? The behavior described above doesn't consistent with the behavior of Hive ### Does this PR introduce _any_ user-facing change? Yes ### How was this patch tested? add ut Closes #41535 from Kwafoor/trim_bugfix. Lead-authored-by: wangjunbo <wangjunbo@qiyi.com> Co-authored-by: Junbo wang <1042815068@qq.com> Signed-off-by: Kent Yao <yao@apache.org>
1 parent 37ab190 commit 80588e4

3 files changed

Lines changed: 22 additions & 6 deletions

File tree

common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,14 @@ private UTF8String copyUTF8String(int start, int end) {
499499
return UTF8String.fromBytes(newBytes);
500500
}
501501

502+
/**
503+
* Determines if the specified character (Unicode code point) is white space or an ISO control
504+
* character according to Java.
505+
*/
506+
private boolean isWhitespaceOrISOControl(int codePoint) {
507+
return Character.isWhitespace(codePoint) || Character.isISOControl(codePoint);
508+
}
509+
502510
/**
503511
* Trims space characters (ASCII 32) from both ends of this string.
504512
*
@@ -535,14 +543,14 @@ public UTF8String trim() {
535543
public UTF8String trimAll() {
536544
int s = 0;
537545
// skip all of the whitespaces in the left side
538-
while (s < this.numBytes && Character.isWhitespace(getByte(s))) s++;
546+
while (s < this.numBytes && isWhitespaceOrISOControl(getByte(s))) s++;
539547
if (s == this.numBytes) {
540548
// Everything trimmed
541549
return EMPTY_UTF8;
542550
}
543551
// skip all of the whitespaces in the right side
544552
int e = this.numBytes - 1;
545-
while (e > s && Character.isWhitespace(getByte(e))) e--;
553+
while (e > s && isWhitespaceOrISOControl(getByte(e))) e--;
546554
if (s == 0 && e == numBytes - 1) {
547555
// Nothing trimmed
548556
return this;
@@ -1131,11 +1139,11 @@ public boolean toLong(LongWrapper toLongResult) {
11311139

11321140
private boolean toLong(LongWrapper toLongResult, boolean allowDecimal) {
11331141
int offset = 0;
1134-
while (offset < this.numBytes && Character.isWhitespace(getByte(offset))) offset++;
1142+
while (offset < this.numBytes && isWhitespaceOrISOControl(getByte(offset))) offset++;
11351143
if (offset == this.numBytes) return false;
11361144

11371145
int end = this.numBytes - 1;
1138-
while (end > offset && Character.isWhitespace(getByte(end))) end--;
1146+
while (end > offset && isWhitespaceOrISOControl(getByte(end))) end--;
11391147

11401148
byte b = getByte(offset);
11411149
final boolean negative = b == '-';
@@ -1228,11 +1236,11 @@ public boolean toInt(IntWrapper intWrapper) {
12281236

12291237
private boolean toInt(IntWrapper intWrapper, boolean allowDecimal) {
12301238
int offset = 0;
1231-
while (offset < this.numBytes && Character.isWhitespace(getByte(offset))) offset++;
1239+
while (offset < this.numBytes && isWhitespaceOrISOControl(getByte(offset))) offset++;
12321240
if (offset == this.numBytes) return false;
12331241

12341242
int end = this.numBytes - 1;
1235-
while (end > offset && Character.isWhitespace(getByte(end))) end--;
1243+
while (end > offset && isWhitespaceOrISOControl(getByte(end))) end--;
12361244

12371245
byte b = getByte(offset);
12381246
final boolean negative = b == '-';

common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,10 @@ public void trims() {
229229
assertEquals(fromString("1"), fromString("1").trim());
230230
assertEquals(fromString("1"), fromString("1\t").trimAll());
231231

232+
assertEquals(fromString("1中文").toString(), fromString("1中文").trimAll().toString());
233+
assertEquals(fromString("1"), fromString("1\u0003").trimAll());
234+
assertEquals(fromString("1"), fromString("1\u007F").trimAll());
235+
232236
assertEquals(fromString("hello"), fromString(" hello ").trim());
233237
assertEquals(fromString("hello "), fromString(" hello ").trimLeft());
234238
assertEquals(fromString(" hello"), fromString(" hello ").trimRight());

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,10 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper {
127127
stringToDate(UTF8String.fromString(s))
128128
}
129129

130+
test("SPARK-32559: string to date trim Control Characters") {
131+
assert(toDate("MIDDLE\u0003") === None)
132+
}
133+
130134
test("string to date") {
131135
assert(toDate("2015-01-28").get === days(2015, 1, 28))
132136
assert(toDate("2015").get === days(2015, 1, 1))

0 commit comments

Comments
 (0)