Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions native-engine/auron-planner/proto/auron.proto
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ message PhysicalPlanNode {
OrcScanExecNode orc_scan = 25;
KafkaScanExecNode kafka_scan = 26;
OrcSinkExecNode orc_sink = 27;
CoalesceExecNode coalesce = 29;
}
}

Expand Down Expand Up @@ -779,6 +780,11 @@ message KafkaScanExecNode {
string mock_data_json_array = 9;
}

// Plan node for a coalesce: carries the child plan and the requested number
// of output partitions.
message CoalesceExecNode {
  // Child plan whose output is coalesced.
  PhysicalPlanNode input = 1;
  // Requested number of partitions. Named in lower_snake_case like the other
  // fields in this file; the field number is unchanged, so the wire format is
  // unaffected.
  int32 num_partitions = 2;
}

enum KafkaFormat {
JSON = 0;
PROTOBUF = 1;
Expand Down
8 changes: 8 additions & 0 deletions native-engine/auron-planner/src/planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ use datafusion_ext_plans::{
agg_exec::AggExec,
broadcast_join_build_hash_map_exec::BroadcastJoinBuildHashMapExec,
broadcast_join_exec::BroadcastJoinExec,
coalesce_exec::CoalesceExec,
debug_exec::DebugExec,
empty_partitions_exec::EmptyPartitionsExec,
expand_exec::ExpandExec,
Expand Down Expand Up @@ -576,6 +577,13 @@ impl PhysicalPlanner {
schema,
)))
}
PhysicalPlanType::Coalesce(coalesce) => {
let input: Arc<dyn ExecutionPlan> = convert_box_required!(self, coalesce.input)?;
Ok(Arc::new(CoalesceExec::new(
input,
coalesce.num_partitions as usize,
)))
}
PhysicalPlanType::CoalesceBatches(coalesce_batches) => {
let input: Arc<dyn ExecutionPlan> =
convert_box_required!(self, coalesce_batches.input)?;
Expand Down
122 changes: 122 additions & 0 deletions native-engine/datafusion-ext-plans/src/coalesce_exec.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::{any::Any, fmt::Formatter, sync::Arc};

use arrow::datatypes::SchemaRef;
use datafusion::{
common::Result,
execution::context::TaskContext,
physical_expr::EquivalenceProperties,
physical_plan::{
DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties,
SendableRecordBatchStream, Statistics,
execution_plan::{Boundedness, EmissionType},
metrics::{ExecutionPlanMetricsSet, MetricsSet},
},
};
use futures::StreamExt;
use once_cell::sync::OnceCell;

use crate::common::execution_context::ExecutionContext;

/// Physical plan node for a coalesce over a single child plan.
///
/// In this file the node forwards the batches of its input unchanged (see
/// `execute`); `num_partitions` is stored and shown in the display string but
/// is not consulted by `execute` or `properties`.
#[derive(Debug)]
pub struct CoalesceExec {
/// Child plan whose batches are forwarded.
input: Arc<dyn ExecutionPlan>,
/// Requested partition count, as passed to `new`.
num_partitions: usize,
/// Metrics set handed to the execution context and exposed via `metrics()`.
metrics: ExecutionPlanMetricsSet,
/// Plan properties, computed on the first `properties()` call and then reused.
props: OnceCell<PlanProperties>,
}

impl CoalesceExec {
pub fn new(input: Arc<dyn ExecutionPlan>, num_partitions: usize) -> Self {
Self {
input,
num_partitions,
metrics: ExecutionPlanMetricsSet::new(),
props: OnceCell::new(),
}
}
}

impl DisplayAs for CoalesceExec {
    /// Renders the node as `CoalesceExec[num_partitions=N]`; the requested
    /// display format is ignored, so every format yields the same text.
    fn fmt_as(&self, _t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result {
        f.write_fmt(format_args!(
            "CoalesceExec[num_partitions={}]",
            self.num_partitions
        ))
    }
}

impl ExecutionPlan for CoalesceExec {
fn name(&self) -> &str {
"CoalesceExec"
}

fn as_any(&self) -> &dyn Any {
self
}

fn schema(&self) -> SchemaRef {
self.input.schema()
}

fn properties(&self) -> &PlanProperties {
self.props.get_or_init(|| {
PlanProperties::new(
EquivalenceProperties::new(self.schema()),
self.input.output_partitioning().clone(),
EmissionType::Both,
Boundedness::Bounded,
)
})
}

fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
vec![&self.input]
}

fn with_new_children(
self: Arc<Self>,
children: Vec<Arc<dyn ExecutionPlan>>,
) -> Result<Arc<dyn ExecutionPlan>> {
Ok(Arc::new(CoalesceExec::new(
children[0].clone(),
self.num_partitions,
)))
}

fn execute(
&self,
partition: usize,
context: Arc<TaskContext>,
) -> Result<SendableRecordBatchStream> {
let exec_ctx = ExecutionContext::new(context, partition, self.schema(), &self.metrics);
let mut input = exec_ctx.execute(&self.input)?;
Ok(
exec_ctx.output_with_sender("Coalesce", move |sender| async move {
while let Some(batch) = input.next().await.transpose()? {
sender.send(batch).await;
}
Ok(())
}),
)
}

fn metrics(&self) -> Option<MetricsSet> {
Some(self.metrics.clone_inner())
}

fn statistics(&self) -> Result<Statistics> {
todo!()
}
}
1 change: 1 addition & 0 deletions native-engine/datafusion-ext-plans/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ pub mod agg;
pub mod agg_exec;
pub mod broadcast_join_build_hash_map_exec;
pub mod broadcast_join_exec;
pub mod coalesce_exec;
pub mod debug_exec;
pub mod empty_partitions_exec;
pub mod expand_exec;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,9 @@ class ShimsImpl extends Shims with Logging {
override def createNativeFilterExec(condition: Expression, child: SparkPlan): NativeFilterBase =
NativeFilterExec(condition, child)

/** Creates the version-specific [[NativeCoalesceExec]] for `child` with `numPartitions`. */
override def createNativeCoalesceExec(
    numPartitions: Int,
    child: SparkPlan): NativeCoalesceBase =
  NativeCoalesceExec(numPartitions, child)

override def createNativeGenerateExec(
generator: Generator,
requiredChildOutput: Seq[Attribute],
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.execution.auron.plan

import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.SparkPlan

import org.apache.auron.sparkver

/**
 * Concrete coalesce plan node built on [[NativeCoalesceBase]].
 *
 * It adds only the Spark-version-specific child replacement hooks and reports
 * the child's output attributes as its own.
 */
case class NativeCoalesceExec(numPartitions: Int, override val child: SparkPlan)
    extends NativeCoalesceBase(numPartitions, child) {

  // The node exposes exactly the attributes produced by its child.
  override def output: Seq[Attribute] = child.output

  // Variant selected for Spark 3.0 / 3.1: swap in the first of the new children.
  @sparkver("3.0 / 3.1")
  override def withNewChildren(newChildren: Seq[SparkPlan]): SparkPlan = {
    val replacement = newChildren.head
    this.copy(child = replacement)
  }

  // Variant selected for Spark 3.2 and later: swap in the single new child.
  @sparkver("3.2 / 3.3 / 3.4 / 3.5 / 4.0 / 4.1")
  override protected def withNewChildInternal(newChild: SparkPlan): SparkPlan =
    this.copy(child = newChild)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.auron

import org.apache.spark.sql.{AuronQueryTest, Row}
import org.apache.spark.sql.execution.auron.plan.NativeCoalesceExec

class AuronNativeCoalesceExecSuite extends AuronQueryTest with BaseAuronSQLSuite {
import testImplicits._

test("test CoalesceExec to native") {
withSQLConf("spark.auron.enable.coalesce" -> "true") {
Seq((1, 2, "test test"))
.toDF("c1", "c2", "part")
.createOrReplaceTempView("coalesce_table1")
val df = {
spark.sql("select /*+ coalesce(2)*/ a.c1, a.c2 from coalesce_table1 a ")
}
df.show()

checkAnswer(df, Seq(Row(1, 2)))
val test = collectFirst(df.queryExecution.executedPlan) {
case coalesceExec: NativeCoalesceExec =>
coalesceExec
}
println(test.get)

assert(collectFirst(df.queryExecution.executedPlan) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This asserts that a NativeCoalesceExec node appears in the plan, but never checks the one thing coalesce actually does — the output partition count. With a single-row, single-partition input, coalesce(2) collapses to a trivial case, so the firstParent and index issues above (and any future coalescer bug) would still pass green here.

Could the test cover the cases that exercise the grouping? Two that would catch the bugs above:

// many input partitions coalesced down, full row set preserved
val df = spark.range(0, 100, 1, numPartitions = 8).coalesce(2)
assert(df.rdd.getNumPartitions == 2)
checkAnswer(df.toDF(), (0 until 100).map(Row(_)))

// empty input still yields numPartitions partitions (the contract the :51 comment describes)
assert(spark.emptyDataFrame.coalesce(1).rdd.getNumPartitions == 1)

case coalesceExec: NativeCoalesceExec =>
coalesceExec
}.isDefined)
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,11 @@ public class SparkAuronConfiguration extends AuronConfiguration {
.withDescription("Enable AggregateExec operation conversion to native Auron implementations.")
.withDefaultValue(true);

/**
 * Switch for converting CoalesceExec to the native Auron implementation; defaults to true.
 *
 * NOTE(review): the identifier is spelled COALESEC (sic) while the key uses "coalesce".
 * Renaming it would also require updating every reference to this constant.
 */
public static final ConfigOption<Boolean> ENABLE_COALESEC = new SQLConfOption<>(Boolean.class)
.withKey("auron.enable.coalesce")
.withCategory("Operator Supports")
.withDescription("Enable CoalesceExec operation conversion to native Auron implementations.")
.withDefaultValue(true);
public static final ConfigOption<Boolean> ENABLE_EXPAND = new SQLConfOption<>(Boolean.class)
.withKey("auron.enable.expand")
.withCategory("Operator Supports")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ object AuronConvertStrategy extends Logging {
e.setTagValue(convertStrategyTag, AlwaysConvert)
case e: GenerateExec if isNative(e.child) =>
e.setTagValue(convertStrategyTag, AlwaysConvert)
case e: CoalesceExec if isNative(e.child) =>
e.setTagValue(convertStrategyTag, AlwaysConvert)
case e: ObjectHashAggregateExec if isNative(e.child) =>
e.setTagValue(convertStrategyTag, AlwaysConvert)
case e: LocalTableScanExec =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ object AuronConverters extends Logging {
SparkAuronConfiguration.ENABLE_TAKE_ORDERED_AND_PROJECT.get()
def enableCollectLimit: Boolean = SparkAuronConfiguration.ENABLE_COLLECT_LIMIT.get()
def enableAggr: Boolean = SparkAuronConfiguration.ENABLE_AGGR.get()
// Whether CoalesceExec conversion is enabled; reads SparkAuronConfiguration.ENABLE_COALESEC on each call.
def enableCoalesec: Boolean = SparkAuronConfiguration.ENABLE_COALESEC.get()
def enableExpand: Boolean = SparkAuronConfiguration.ENABLE_EXPAND.get()
def enableWindow: Boolean = SparkAuronConfiguration.ENABLE_WINDOW.get()
def enableWindowGroupLimit: Boolean = SparkAuronConfiguration.ENABLE_WINDOW_GROUP_LIMIT.get()
Expand Down Expand Up @@ -252,6 +253,10 @@ object AuronConverters extends Logging {
}
convertedAgg

case e: CoalesceExec if enableCoalesec => // coalesce
val convertedCoalesce = tryConvert(e, convertCoalesceExec)
convertedCoalesce

case e: ObjectHashAggregateExec if enableAggr => // object hash aggregate
val convertedAgg = tryConvert(e, convertObjectHashAggregateExec)
if (!e.getTagValue(convertibleTag).contains(true)) {
Expand Down Expand Up @@ -370,6 +375,8 @@ object AuronConverters extends Logging {
"Conversion disabled: spark.auron.enable.local.table.scan=false."
case _: DataWritingCommandExec if !enableDataWriting =>
"Conversion disabled: spark.auron.enable.data.writing=false."
case _: CoalesceExec if !enableCoalesec =>
"Conversion disabled: spark.auron.enable.coalesce=false."
case _ =>
s"${exec.getClass.getSimpleName} is not supported yet."
}
Expand Down Expand Up @@ -807,6 +814,10 @@ object AuronConverters extends Logging {
Shims.get.createNativeCollectLimitExec(limit, offset, exec.child)
}

/**
 * Converts a Spark `CoalesceExec` into its native counterpart by handing the
 * node's partition count and child plan to the active shims.
 */
def convertCoalesceExec(exec: CoalesceExec): SparkPlan = {
  val shims = Shims.get
  val targetPartitions = exec.numPartitions
  val childPlan = exec.child
  shims.createNativeCoalesceExec(targetPartitions, childPlan)
}

def convertHashAggregateExec(exec: HashAggregateExec): SparkPlan = {
// split non-trivial children exprs in partial-agg to a ProjectExec
// for enabling filter-project optimization in native side
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import org.apache.spark.Partitioner
import org.apache.spark.SparkContext
import org.apache.spark.TaskContext
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.{CoalescedRDDPartition, RDD}
import org.apache.spark.sql.catalyst.InternalRow

import org.apache.auron.metric.SparkMetricNode
Expand Down Expand Up @@ -80,6 +80,33 @@ class NativeRDD(
}
}

class CoalesceNativeRDD(
@transient private val rddSparkContext: SparkContext,
rddDependencies: Seq[Dependency[_]],
partitions: Array[Partition],
@transient private val nativePlan: (Partition, TaskContext) => PhysicalPlanNode,
friendlyName: String)
extends NativeRDD(
rddSparkContext,
metrics = SparkMetricNode(Map.empty, Seq(), None),
rddPartitions = partitions,
rddPartitioner = None,
rddDependencies,
rddShuffleReadFull = false,
nativePlan,
friendlyName)
with Logging
with Serializable {

override protected def getPartitions: Array[Partition] = partitions

override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the coalesce meant to run natively, or as JVM iterator concatenation? The PR ships two implementations, and as written the native one never runs.

This compute override concatenates the parent partition iterators directly in the JVM. The base NativeRDD.compute (NativeRDD.scala:67-80) is the only thing that actually executes a native plan — it calls NativeHelper.executeNativePlan(...) — and this override never does. So the .setCoalesce(...) plan built in NativeCoalesceBase.doExecuteNative (:60-64 and :73-76) is constructed, serialized into the RDD, and then never invoked. That leaves the whole native chain off the execution path: coalesce_exec.rs, the CoalesceExecNode proto message, and the PhysicalPlanType::Coalesce arm in planner.rs:580-586. The coalesce ends up performed purely by JVM iterator concatenation, the same way Spark's own CoalescedRDD would.

And even if the native node were reached, CoalesceExec::execute (coalesce_exec.rs:73-113) is a pass-through — it streams input batches out unchanged, properties() returns the input's partitioning untouched, and num_partitions is stored but only ever read in fmt_as. So it neither repartitions nor does anything the input stream wouldn't already do. For contrast, NativeUnionBase.doExecuteNative wraps a real UnionExecNode in a plain NativeRDD with no compute override, so union genuinely executes natively.

The two designs are mutually exclusive, and right now the PR carries both scaffolds and lands neither cleanly:

  • If the intent is native — coalesce N input partitions into one task, then run the rest of the native pipeline over the concatenated stream — then this override severs that, and CoalesceExec::execute needs to actually do the coalescing rather than pass through.
  • If the intent is JVM concatenation, then coalesce_exec.rs, the proto field, and the planner arm are dead code that could be dropped, and the ~245-line vendored DefaultPartitionCoalescer (NativeCoalesceBase.scala:90-335) is doing partition layout for a coalesce that Spark's own CoalescedRDD could also provide — worth weighing whether the copy earns its maintenance cost in that case.

Which direction were you aiming for? The inline questions below are the correctness issues that hold regardless of which way it goes.

split.asInstanceOf[CoalescedRDDPartition].parents.iterator.flatMap { parentPartition =>
firstParent[InternalRow].iterator(parentPartition, context)
}
}
}

class EmptyNativeRDD(@transient private val rddSparkContext: SparkContext)
extends NativeRDD(
rddSparkContext = rddSparkContext,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ abstract class Shims {
offset: Int,
child: SparkPlan): NativeCollectLimitBase

/** Builds the NativeCoalesceBase plan node for `child` with the given `numPartitions`. */
def createNativeCoalesceExec(numPartitions: Int, child: SparkPlan): NativeCoalesceBase

def createNativeParquetInsertIntoHiveTableExec(
cmd: InsertIntoHiveTable,
child: SparkPlan): NativeParquetInsertIntoHiveTableBase
Expand Down
Loading
Loading