Skip to content

Commit 2838cc9

Browse files
isnotinvainasingh
authored and committed
Split out version checks to separate files, add some tests
1 parent 5af9142 commit 2838cc9

7 files changed

Lines changed: 344 additions & 56 deletions

File tree

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.parquet;
20+
21+
import org.apache.parquet.VersionParser.ParsedVersion;
22+
23+
/**
24+
* There was a bug (PARQUET-251) that caused the statistics metadata
25+
* for binary columns to be corrupted in the write path.
26+
*
27+
* This class is used to detect whether a file was written with this bug,
28+
* and thus it's statistics should be ignored / not trusted.
29+
*/
30+
public class CorruptStatistics {
31+
private static final Log LOG = Log.getLog(CorruptStatistics.class);
32+
33+
// the version in which the bug described by jira: PARQUET-251 was fixed
34+
// the bug involved writing invalid binary statistics, so stats written prior to this
35+
// fix must be ignored / assumed invalid
36+
private static final SemanticVersion PARQUET_251_FIXED_VERSION = new SemanticVersion(1, 8, 0);
37+
38+
/**
39+
* Decides if the statistics from a file created by createdBy (the created_by field from parquet format)
40+
* should be ignored because they are potentially corrupt.
41+
*/
42+
public static boolean shouldIgnoreStatistics(String createdBy) {
43+
44+
if (Strings.isNullOrEmpty(createdBy)) {
45+
// created_by is not populated, which could have been caused by
46+
// parquet-mr during the same time as PARQUET-251, see PARQUET-297
47+
LOG.info("Ignoring statistics because created_by is null or empty! See PARQUET-251 and PARQUET-297");
48+
return true;
49+
}
50+
51+
try {
52+
ParsedVersion version = VersionParser.parse(createdBy);
53+
54+
if (!"parquet-mr".equals(version.application)) {
55+
// assume other applications don't have this bug
56+
return false;
57+
}
58+
59+
SemanticVersion semver = SemanticVersion.parse(version.semver);
60+
if (semver.compareTo(PARQUET_251_FIXED_VERSION) < 0) {
61+
LOG.info("Ignoring statistics because this file was created prior to "
62+
+ PARQUET_251_FIXED_VERSION
63+
+ ", see PARQUET-251" );
64+
return true;
65+
}
66+
67+
// this file was created after the fix
68+
return false;
69+
} catch (RuntimeException e) {
70+
// couldn't parse the created_by field, log what went wrong, don't trust the stats,
71+
// but don't make this fatal.
72+
LOG.warn("Ignoring statistics because created_by could not be parsed (see PARQUET-251): " + createdBy, e);
73+
return true;
74+
}
75+
}
76+
}
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.parquet;
20+
21+
import java.util.regex.Matcher;
22+
import java.util.regex.Pattern;
23+
24+
/**
25+
* Very basic semver parser, only pays attention to major, minor, and patch numbers.
26+
* Attempts to do a little bit of validation that the version string is valid, but
27+
* is not a full implementation of the semver spec.
28+
*
29+
* NOTE: compareTo only respects major, minor, and patch (ignores rc numbers, SNAPSHOT, etc)
30+
*/
31+
public final class SemanticVersion implements Comparable<SemanticVersion> {
32+
// (major).(minor).(patch)[(rc)(rcnum)]?(-(SNAPSHOT))?
33+
private static final String FORMAT = "^(\\d+)\\.(\\d+)\\.(\\d+)((.*)(\\d+))?(\\-(.*))?$";
34+
private static final Pattern PATTERN = Pattern.compile(FORMAT);
35+
36+
public final int major;
37+
public final int minor;
38+
public final int patch;
39+
40+
public SemanticVersion(int major, int minor, int patch) {
41+
Preconditions.checkArgument(major >= 0, "major must be >= 0");
42+
Preconditions.checkArgument(minor >= 0, "minor must be >= 0");
43+
Preconditions.checkArgument(patch >= 0, "patch must be >= 0");
44+
45+
this.major = major;
46+
this.minor = minor;
47+
this.patch = patch;
48+
}
49+
50+
public static SemanticVersion parse(String version) {
51+
Matcher matcher = PATTERN.matcher(version);
52+
53+
Preconditions.checkArgument(matcher.matches(), "" + version + " does not match format " + FORMAT);
54+
55+
int major = Integer.valueOf(matcher.group(1));
56+
int minor = Integer.valueOf(matcher.group(2));
57+
int patch = Integer.valueOf(matcher.group(3));
58+
59+
return new SemanticVersion(major, minor, patch);
60+
}
61+
62+
@Override
63+
public int compareTo(SemanticVersion o) {
64+
int cmp;
65+
66+
cmp = Integer.compare(major, o.major);
67+
if (cmp != 0) {
68+
return cmp;
69+
}
70+
71+
cmp = Integer.compare(minor, o.minor);
72+
if (cmp != 0) {
73+
return cmp;
74+
}
75+
76+
return Integer.compare(patch, o.patch);
77+
}
78+
79+
@Override
80+
public boolean equals(Object o) {
81+
if (this == o) return true;
82+
if (o == null || getClass() != o.getClass()) return false;
83+
84+
SemanticVersion that = (SemanticVersion) o;
85+
return compareTo(that) == 0;
86+
}
87+
88+
@Override
89+
public int hashCode() {
90+
int result = major;
91+
result = 31 * result + minor;
92+
result = 31 * result + patch;
93+
return result;
94+
}
95+
96+
@Override
97+
public String toString() {
98+
return major + "." + minor + "." + patch;
99+
}
100+
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.parquet;
20+
21+
import java.util.regex.Matcher;
22+
import java.util.regex.Pattern;
23+
24+
import static org.apache.parquet.Preconditions.checkArgument;
25+
26+
/**
27+
* Parses a parquet Version string
28+
*/
29+
public class VersionParser {
30+
// example: parquet-mr version 1.8.0rc2-SNAPSHOT (build ddb469afac70404ea63b72ed2f07a911a8592ff7)
31+
public static final String FORMAT = "(.+) version (.+) \\(build (.+)\\)";
32+
public static final Pattern PATTERN = Pattern.compile(FORMAT);
33+
34+
public static class ParsedVersion {
35+
public final String application;
36+
public final String semver;
37+
public final String appBuildHash;
38+
39+
public ParsedVersion(String application, String semver, String appBuildHash) {
40+
checkArgument(!Strings.isNullOrEmpty(application), "application cannont be null or empty");
41+
checkArgument(!Strings.isNullOrEmpty(semver), "semver cannont be null or empty");
42+
checkArgument(!Strings.isNullOrEmpty(appBuildHash), "appBuildHash cannont be null or empty");
43+
this.application = application;
44+
this.semver = semver;
45+
this.appBuildHash = appBuildHash;
46+
}
47+
}
48+
49+
public static ParsedVersion parse(String createdBy) {
50+
Matcher matcher = PATTERN.matcher(createdBy);
51+
checkArgument(matcher.matches(), "Could not parse created_by: " + createdBy + " using format: " + FORMAT);
52+
return new ParsedVersion(matcher.group(1), matcher.group(2), matcher.group(3));
53+
}
54+
55+
}
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
package org.apache.parquet;
2+
3+
import org.junit.Test;
4+
5+
import static org.junit.Assert.assertFalse;
6+
import static org.junit.Assert.assertTrue;
7+
8+
/*
9+
* Licensed to the Apache Software Foundation (ASF) under one
10+
* or more contributor license agreements. See the NOTICE file
11+
* distributed with this work for additional information
12+
* regarding copyright ownership. The ASF licenses this file
13+
* to you under the Apache License, Version 2.0 (the
14+
* "License"); you may not use this file except in compliance
15+
* with the License. You may obtain a copy of the License at
16+
*
17+
* http://www.apache.org/licenses/LICENSE-2.0
18+
*
19+
* Unless required by applicable law or agreed to in writing,
20+
* software distributed under the License is distributed on an
21+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
22+
* KIND, either express or implied. See the License for the
23+
* specific language governing permissions and limitations
24+
* under the License.
25+
*/
26+
public class CorruptStatisticsTest {
27+
@Test
28+
public void testCorruptStatistics() {
29+
assertTrue(CorruptStatistics.shouldIgnoreStatistics("parquet-mr version 1.6.0 (build abcd)"));
30+
assertTrue(CorruptStatistics.shouldIgnoreStatistics("parquet-mr version 1.4.2 (build abcd)"));
31+
assertTrue(CorruptStatistics.shouldIgnoreStatistics("parquet-mr version 1.6.100 (build abcd)"));
32+
assertTrue(CorruptStatistics.shouldIgnoreStatistics("parquet-mr version 1.7.999 (build abcd)"));
33+
assertTrue(CorruptStatistics.shouldIgnoreStatistics("parquet-mr version 1.6.22rc99 (build abcd)"));
34+
assertTrue(CorruptStatistics.shouldIgnoreStatistics("parquet-mr version 1.6.22rc99-SNAPSHOT (build abcd)"));
35+
assertTrue(CorruptStatistics.shouldIgnoreStatistics("parquet-mr version 1.6.1-SNAPSHOT (build abcd)"));
36+
assertTrue(CorruptStatistics.shouldIgnoreStatistics("unparseable string"));
37+
assertTrue(CorruptStatistics.shouldIgnoreStatistics("parquet-mr version 1.6.0t-01-abcdefg (build abcd)"));
38+
39+
assertFalse(CorruptStatistics.shouldIgnoreStatistics("imapla version 1.6.0 (build abcd)"));
40+
assertFalse(CorruptStatistics.shouldIgnoreStatistics("imapla version 1.10.0 (build abcd)"));
41+
assertFalse(CorruptStatistics.shouldIgnoreStatistics("parquet-mr version 1.8.0 (build abcd)"));
42+
assertFalse(CorruptStatistics.shouldIgnoreStatistics("parquet-mr version 1.8.1 (build abcd)"));
43+
assertFalse(CorruptStatistics.shouldIgnoreStatistics("parquet-mr version 1.8.1rc3 (build abcd)"));
44+
assertFalse(CorruptStatistics.shouldIgnoreStatistics("parquet-mr version 1.8.1rc3-SNAPSHOT (build abcd)"));
45+
assertFalse(CorruptStatistics.shouldIgnoreStatistics("parquet-mr version 1.9.0 (build abcd)"));
46+
assertFalse(CorruptStatistics.shouldIgnoreStatistics("parquet-mr version 2.0.0 (build abcd)"));
47+
assertFalse(CorruptStatistics.shouldIgnoreStatistics("parquet-mr version 1.9.0t-01-abcdefg (build abcd)"));
48+
}
49+
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package org.apache.parquet;
2+
3+
import org.junit.Test;
4+
5+
import static org.junit.Assert.assertEquals;
6+
import static org.junit.Assert.assertTrue;
7+
8+
/*
9+
* Licensed to the Apache Software Foundation (ASF) under one
10+
* or more contributor license agreements. See the NOTICE file
11+
* distributed with this work for additional information
12+
* regarding copyright ownership. The ASF licenses this file
13+
* to you under the Apache License, Version 2.0 (the
14+
* "License"); you may not use this file except in compliance
15+
* with the License. You may obtain a copy of the License at
16+
*
17+
* http://www.apache.org/licenses/LICENSE-2.0
18+
*
19+
* Unless required by applicable law or agreed to in writing,
20+
* software distributed under the License is distributed on an
21+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
22+
* KIND, either express or implied. See the License for the
23+
* specific language governing permissions and limitations
24+
* under the License.
25+
*/
26+
public class SemanticVersionTest {
27+
@Test
28+
public void testCompare() {
29+
assertTrue(new SemanticVersion(1, 8, 1).compareTo(new SemanticVersion(1, 8, 1)) == 0);
30+
assertTrue(new SemanticVersion(1, 8, 0).compareTo(new SemanticVersion(1, 8, 1)) < 0);
31+
assertTrue(new SemanticVersion(1, 8, 2).compareTo(new SemanticVersion(1, 8, 1)) > 0);
32+
33+
assertTrue(new SemanticVersion(1, 8, 1).compareTo(new SemanticVersion(1, 8, 1)) == 0);
34+
assertTrue(new SemanticVersion(1, 8, 0).compareTo(new SemanticVersion(1, 8, 1)) < 0);
35+
assertTrue(new SemanticVersion(1, 8, 2).compareTo(new SemanticVersion(1, 8, 1)) > 0);
36+
37+
assertTrue(new SemanticVersion(1, 7, 0).compareTo(new SemanticVersion(1, 8, 0)) < 0);
38+
assertTrue(new SemanticVersion(1, 9, 0).compareTo(new SemanticVersion(1, 8, 0)) > 0);
39+
40+
assertTrue(new SemanticVersion(0, 0, 0).compareTo(new SemanticVersion(1, 0, 0)) < 0);
41+
assertTrue(new SemanticVersion(2, 0, 0).compareTo(new SemanticVersion(1, 0, 0)) > 0);
42+
43+
assertTrue(new SemanticVersion(1, 8, 100).compareTo(new SemanticVersion(1, 9, 0)) < 0);
44+
}
45+
46+
@Test
47+
public void testParse() {
48+
assertEquals(new SemanticVersion(1, 8, 0), SemanticVersion.parse("1.8.0"));
49+
assertEquals(new SemanticVersion(1, 8, 0), SemanticVersion.parse("1.8.0rc3"));
50+
assertEquals(new SemanticVersion(1, 8, 0), SemanticVersion.parse("1.8.0rc3-SNAPSHOT"));
51+
assertEquals(new SemanticVersion(1, 8, 0), SemanticVersion.parse("1.8.0-SNAPSHOT"));
52+
}
53+
}

parquet-common/src/test/java/org/apache/parquet/VersionTest.java

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import java.util.regex.Matcher;
2222
import java.util.regex.Pattern;
2323

24+
import org.apache.parquet.VersionParser.ParsedVersion;
2425
import org.junit.Test;
2526

2627
import static org.junit.Assert.assertEquals;
@@ -50,13 +51,10 @@ public void testVersion() {
5051

5152
@Test
5253
public void testFullVersion() {
53-
// example: parquet-mr version 1.8.0rc2-SNAPSHOT (build ddb469afac70404ea63b72ed2f07a911a8592ff7)
54-
String regex = "parquet-mr version (.*) \\(build (.*)\\)";
55-
Pattern pattern = Pattern.compile(regex);
56-
Matcher m = pattern.matcher(Version.FULL_VERSION);
57-
assertTrue(Version.FULL_VERSION + " did not match " + pattern, m.matches());
58-
assertVersionValid(m.group(1));
59-
assertEquals(Version.VERSION_NUMBER, m.group(1));
60-
assertFalse(m.group(2).isEmpty());
54+
ParsedVersion version = VersionParser.parse(Version.FULL_VERSION);
55+
56+
assertVersionValid(version.semver);
57+
assertEquals(Version.VERSION_NUMBER, version.semver);
58+
assertEquals("parquet-mr", version.application);
6159
}
6260
}

0 commit comments

Comments
 (0)