@@ -25,6 +25,7 @@ import org.apache.hadoop.fs.Path
 import org.apache.spark.sql.{CometTestBase, DataFrame, Row}
 import org.apache.spark.sql.catalyst.optimizer.EliminateSorts
 import org.apache.spark.sql.comet.CometHashAggregateExec
+import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
 import org.apache.spark.sql.functions.{count_distinct, sum}
 import org.apache.spark.sql.internal.SQLConf
@@ -89,6 +90,37 @@ class CometAggregateSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }
 
+  // based on Spark's SQLWindowFunctionSuite test of the same name
+  test("window function: partition and order expressions") {
+    for (shuffleMode <- Seq("auto", "native", "jvm")) {
+      withSQLConf(CometConf.COMET_SHUFFLE_MODE.key -> shuffleMode) {
+        val df =
+          Seq((1, "a", 5), (2, "a", 6), (3, "b", 7), (4, "b", 8), (5, "c", 9), (6, "c", 10)).toDF(
+            "month",
+            "area",
+            "product")
+        df.createOrReplaceTempView("windowData")
+        val df2 = sql("""
+            |select month, area, product, sum(product + 1) over (partition by 1 order by 2)
+            |from windowData
+          """.stripMargin)
+        checkSparkAnswer(df2)
+        val cometShuffles = collect(df2.queryExecution.executedPlan) {
+          case _: CometShuffleExchangeExec => true
+        }
+        if (shuffleMode == "jvm") {
+          assert(cometShuffles.length == 1)
+        } else {
+          // we fall back to Spark for shuffle because we do not support
+          // native shuffle with a LocalTableScan input, and we do not fall
+          // back to Comet columnar shuffle due to
+          // https://github.com/apache/datafusion-comet/issues/1248
+          assert(cometShuffles.isEmpty)
+        }
+      }
+    }
+  }
+
   test("multiple column distinct count") {
     withSQLConf(
       CometConf.COMET_ENABLED.key -> "true",