Skip to content

Commit 3de6ee2

Browse files
committed
Fix schema pruning when used in where clause.
1 parent 1b1711e commit 3de6ee2

2 files changed

Lines changed: 26 additions & 1 deletion

File tree

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruning.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
package org.apache.spark.sql.execution.datasources.parquet
1919

20-
import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeReference, Expression, NamedExpression}
20+
import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeReference, Expression, IsNotNull, IsNull, NamedExpression}
2121
import org.apache.spark.sql.catalyst.planning.PhysicalOperation
2222
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
2323
import org.apache.spark.sql.catalyst.rules.Rule
@@ -196,6 +196,7 @@ private[sql] object ParquetSchemaPruning extends Rule[LogicalPlan] {
196196
*/
197197
private def getRootFields(expr: Expression): Seq[RootField] = {
198198
expr match {
199+
case IsNotNull(_: Attribute) | IsNull(_: Attribute) => Seq.empty
199200
case att: Attribute =>
200201
RootField(StructField(att.name, att.dataType, att.nullable), derivedFromAtt = true) :: Nil
201202
case SelectedField(field) => RootField(field, derivedFromAtt = false) :: Nil

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,30 @@ class ParquetSchemaPruningSuite
155155
Row(null) :: Row(null) :: Nil)
156156
}
157157

158+
testSchemaPruning("select a single complex field and in where clause") {
159+
val query = sql("select name.first from contacts where name.first = 'Jane'")
160+
checkScan(query, "struct<name:struct<first:string>>")
161+
checkAnswer(query, Row("Jane") :: Nil)
162+
}
163+
164+
testSchemaPruning("select a single complex field array and in clause") {
165+
val query = sql("select friends.middle from contacts where friends.first[0] = 'Susan'")
166+
checkScan(query,
167+
"struct<friends:array<struct<first:string,middle:string>>>")
168+
checkAnswer(query.orderBy("id"),
169+
Row(Array("Z.")) :: Nil)
170+
}
171+
172+
testSchemaPruning("select a single complex field from a map entry and in clause") {
173+
val query =
174+
sql("select relatives[\"brother\"].middle from contacts " +
175+
"where relatives[\"brother\"].first = 'John'")
176+
checkScan(query,
177+
"struct<relatives:map<string,struct<first:string,middle:string>>>")
178+
checkAnswer(query.orderBy("id"),
179+
Row("Y.") :: Nil)
180+
}
181+
158182
private def testSchemaPruning(testName: String)(testThunk: => Unit) {
159183
withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") {
160184
test(s"Spark vectorized reader - without partition data column - $testName") {

0 commit comments

Comments
 (0)