apache · mihailotim-db · Aug 4, 2025
diff --git a/...catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DeduplicateRelations.scala b/...catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DeduplicateRelations.scala
@@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.trees.TreeNodeTag
 import org.apache.spark.sql.catalyst.trees.TreePattern._
+import org.apache.spark.sql.internal.SQLConf
 
 object DeduplicateRelations extends Rule[LogicalPlan] {
   val PROJECT_FOR_EXPRESSION_ID_DEDUPLICATION =
@@ -244,8 +245,17 @@ object DeduplicateRelations extends Rule[LogicalPlan] {
       if (planChanged) {
         if (planWithNewSubquery.childrenResolved) {
           val planWithNewChildren = planWithNewSubquery.withNewChildren(newChildren.toSeq)
+          val childrenOutputLookup = AttributeSet.fromAttributeSets(newChildren.map(_.outputSet))
+          val childrenOutput = newChildren.flatMap(_.output)
           val attrMap = AttributeMap(plan.children.flatMap(_.output)
-            .zip(newChildren.flatMap(_.output)).filter { case (a1, a2) => a1.exprId != a2.exprId })
+            .zip(childrenOutput).filter { case (a1, a2) => a1.exprId != a2.exprId })
+          val preventDeduplicationIfOldExprIdStillExists =
+            conf.getConf(SQLConf.DONT_DEDUPLICATE_EXPRESSION_IF_EXPR_ID_IN_OUTPUT)
+          val missingAttributeMap = AttributeMap(attrMap.filter {
+            case (oldAttribute, _) =>
+              !preventDeduplicationIfOldExprIdStillExists ||
+              !childrenOutputLookup.contains(oldAttribute)
+          })
           if (attrMap.isEmpty) {
             planWithNewChildren
           } else {
@@ -289,7 +299,7 @@ object DeduplicateRelations extends Rule[LogicalPlan] {
                   rightDeserializer = newRightDes, leftGroup = newLeftGroup,
                   rightGroup = newRightGroup, leftAttr = newLeftAttr, rightAttr = newRightAttr,
                   leftOrder = newLeftOrder, rightOrder = newRightOrder)
-              case _ => planWithNewChildren.rewriteAttrs(attrMap)
+              case _ => planWithNewChildren.rewriteAttrs(missingAttributeMap)
             }
           }
         } else {

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -241,6 +241,15 @@ object SQLConf {
     }
   }
 
+  val DONT_DEDUPLICATE_EXPRESSION_IF_EXPR_ID_IN_OUTPUT =
+    buildConf("spark.sql.analyzer.dontDeduplicateExpressionIfExprIdInOutput")
+    .internal()
+    .doc(
+      "DeduplicateRelations shouldn't remap expressions to new ExprIds if old ExprId still " +
+      "exists in output.")
+    .booleanConf
+    .createWithDefault(true)
+
   val UNION_IS_RESOLVED_WHEN_DUPLICATES_PER_CHILD_RESOLVED =
     buildConf("spark.sql.analyzer.unionIsResolvedWhenDuplicatesPerChildResolved")
     .internal()

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala
@@ -18,8 +18,8 @@
 package org.apache.spark.sql
 
 import org.apache.spark.api.python.PythonEvalType
-import org.apache.spark.sql.catalyst.expressions.{Alias, Ascending, AttributeReference, PythonUDF, SortOrder}
-import org.apache.spark.sql.catalyst.plans.logical.{Expand, Generate, ScriptInputOutputSchema, ScriptTransformation, Window => WindowPlan}
+import org.apache.spark.sql.catalyst.expressions.{Alias, Ascending, AttributeReference, EqualTo, NamedExpression, PythonUDF, SortOrder}
+import org.apache.spark.sql.catalyst.plans.logical.{Expand, Generate, Join, ScriptInputOutputSchema, ScriptTransformation, Window => WindowPlan}
 import org.apache.spark.sql.classic.{Dataset => DatasetImpl}
 import org.apache.spark.sql.expressions.Window
 import org.apache.spark.sql.functions.{col, count, explode, sum, year}
@@ -499,4 +499,29 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession {
       assert(df1.join(df2, $"t1.i" === $"t2.i").cache().count() == 1)
     }
   }
+
+  test("SPARK-53098: `DeduplicateRelations` shouldn't remap expressions if old `ExprId` still " +
+  "exists in output") {
+    val df1 = Seq[TestData](TestData(1, "sales")).toDS()
+    val df2 = Seq[TestData](TestData(1, "sales")).toDS()
+    val df3 = df1.join(df2, df1("key") === df2("key")).select(df1("*"))
+    for (conf <- Seq(true, false)) {
+      withSQLConf(
+        // Disable auto-resolution of ambiguity because we want to test behavior before
+        // `resolveSelfJoinCondition` fully kicks in (while we still have ambiguous join condition)
+        SQLConf.DATAFRAME_SELF_JOIN_AUTO_RESOLVE_AMBIGUITY.key -> "false",
+        SQLConf.DONT_DEDUPLICATE_EXPRESSION_IF_EXPR_ID_IN_OUTPUT.key -> conf.toString
+      ) {
+        val analyzedPlan =
+          df1.join(df3, df1.col("key") === df3.col("key"), "left_outer").queryExecution.analyzed
+          .asInstanceOf[Join]
+        val joinCondition = analyzedPlan.condition.get.asInstanceOf[EqualTo]
+        val leftBranchExprId = joinCondition.left.asInstanceOf[NamedExpression]
+        val rightBranchExprId = joinCondition.right.asInstanceOf[NamedExpression]
+        assert(leftBranchExprId === rightBranchExprId)
+        assert(analyzedPlan.left.outputSet.contains(leftBranchExprId) == conf)
+        assert(analyzedPlan.right.outputSet.contains(leftBranchExprId) != conf)
+      }
+    }
+  }
 }