Union 原理-源码（spark3.0）_spark union原理-CSDN博客

本文深入探讨Spark的Union操作，解释如何处理不同RDD的合并。当所有RDD都定义了分区器且分区器相同时，创建PartitionerAwareUnionRDD，否则采用UnionRDD。PartitionerAwareUnionRDD基于OneToOneDependency；UnionRDD在RDD数量不超过10时按顺序拼接各父RDD的分区，依赖关系为RangeDependency。

这是一个transformation（转换）算子，而不是action算子，并且不会发生shuffle。首先会将多个上游的RDD放入到一个集合中，然后获取所有RDD的分区器，判断是否每个RDD都定义了分区器并且分区器都相同，如果是则返回PartitionerAwareUnionRDD，否则返回UnionRDD；

PartitionerAwareUnionRDD 对每个父RDD的依赖都是OneToOneDependency，是一个窄依赖，会使用父RDD的分区器

UnionRDD 中会判断RDD的个数是否超过了10个，没超过时类似于把各RDD的分区按顺序合并到一起；它的依赖是自定义的，传给父类构造器的依赖关系为Nil，实际计算依赖关系时使用的是RangeDependency，也是一个窄依赖

RDD类（下面第一个union方法位于RDD类中，后两个union重载位于SparkContext类中）

  /**
   * Return the union of this RDD and another one. Any identical elements will appear multiple
   * times (use `.distinct()` to eliminate them).
   *
   * Delegates to the `union` method of this RDD's `sc`, passing this RDD first and `other` second.
   *
   * @param other the RDD to combine with this one
   * @return the RDD produced by `sc.union(this, other)`
   */
  def union(other: RDD[T]): RDD[T] = withScope {
    sc.union(this, other)
  }

  /**
   * Build the union of RDDs supplied as variable-length arguments.
   *
   * @param first the first RDD; always present
   * @param rest any further RDDs, in order
   * @return the result of the sequence-based `union` overload applied to `first` followed by `rest`
   */
  def union[T: ClassTag](first: RDD[T], rest: RDD[T]*): RDD[T] = withScope {
    // Gather every input into a single sequence, keeping `first` at the head.
    val allRdds: Seq[RDD[T]] = first +: rest
    union(allRdds)
  }

  /**
   * Build the union of a list of RDDs.
   *
   * RDDs that report no partitions are dropped first. If every remaining RDD defines a
   * partitioner and they all share one and the same partitioner, the result is a
   * `PartitionerAwareUnionRDD`; in every other case a `UnionRDD` is returned.
   *
   * @param rdds the RDDs to combine
   * @return a single RDD built from the RDDs in `rdds` that have at least one partition
   */
  def union[T: ClassTag](rdds: Seq[RDD[T]]): RDD[T] = withScope {
    // Only RDDs that actually have partitions take part in the union.
    val populated = rdds.filter(_.partitions.nonEmpty)
    // Distinct partitioners among them; RDDs without a partitioner contribute nothing here.
    val distinctPartitioners = populated.flatMap(_.partitioner).toSet
    val everyRddPartitioned = populated.forall(_.partitioner.isDefined)

    if (everyRddPartitioned && distinctPartitioners.size == 1) {
      // All inputs are partitioned by one shared partitioner: use the partitioner-aware union.
      new PartitionerAwareUnionRDD(this, populated)
    } else {
      // No single shared partitioner: fall back to the general-purpose union.
      new UnionRDD(this, populated)
    }
  }

PartitionerAwareUnionRDD类

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}
import org.apache.spark.util.Utils

/**
 * Class representing partitions of PartitionerAwareUnionRDD, which maintains the list of
 * corresponding partitions of parent RDDs.
 *
 * @param rdds the parent RDDs; the field is marked `@transient`
 * @param index index of this partition, also used to select the partition of each parent RDD
 */
private[spark]
class PartitionerAwareUnionRDDPartition(
    @transient val rdds: Seq[RDD[_]],
    override val index: Int
  ) extends Partition {
  // The partition at `index` of every parent RDD, in the same order as `rdds`.
  // Declared as a var because writeObject reassigns it right before serialization.
  var parents = rdds.map(_.partitions(index)).toArray

  // The hash code is simply the partition index.
  override def hashCode(): Int = index

  // Equality is left to the inherited implementation.
  override def equals(other: Any): Boolean = super.equals(other)

  // Serialization hook: recompute `parents` from the parent RDDs, then write the object's
  // fields through the stream's default mechanism.
  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent partition at the time of task serialization
    parents = rdds.map(_.partitions(index)).toArray
    oos.defaultWriteObject()
  }
}

/**
 * An RDD that merges several parent RDDs sharing one partitioner into a single RDD that keeps
 * that partitioner. Given m parents with p partitions each, the result has p partitions, and
 * partition i of the result yields the elements of partition i of every parent, in parent order.
 * The preferred location of each result partition is the host that occurs most often among the
 * current preferred locations of the corresponding parent partitions.
 */
private[spark]
class PartitionerAwareUnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]]
    // One OneToOneDependency per parent RDD, i.e. a narrow dependency on each of them.
  ) extends RDD[T](sc, rdds.map(parent => new OneToOneDependency(parent))) {
  require(rdds.nonEmpty)
  require(rdds.forall(_.partitioner.isDefined))
  require(rdds.flatMap(_.partitioner).toSet.size == 1,
    "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner))

  // All parents share the same partitioner (checked above), so the first one's is reused.
  override val partitioner = rdds.head.partitioner

  /** Creates one partition per partition of the shared partitioner. */
  override def getPartitions: Array[Partition] = {
    val partitionCount = partitioner.get.numPartitions
    Array.tabulate[Partition](partitionCount) { index =>
      new PartitionerAwareUnionRDDPartition(rdds, index)
    }
  }

  /** Returns the host preferred by the largest number of parent partitions, if any. */
  override def getPreferredLocations(s: Partition): Seq[String] = {
    logDebug("Finding preferred location for " + this + ", partition " + s.index)
    val unionPartition = s.asInstanceOf[PartitionerAwareUnionRDDPartition]
    // Collect the current preferred hosts of every corresponding parent partition.
    val candidateHosts = rdds.zip(unionPartition.parents).flatMap { case (parent, split) =>
      val hosts = currPrefLocs(parent, split)
      logDebug("Location of " + parent + " partition " + split.index + " = " + hosts)
      hosts
    }
    val location =
      if (candidateHosts.nonEmpty) {
        // Pick the host that appears most often among the candidates.
        Some(candidateHosts.groupBy(identity).maxBy { case (_, hits) => hits.length }._1)
      } else {
        None
      }
    logDebug("Selected location for " + this + ", partition " + s.index + " = " + location)
    location.toSeq
  }

  /** Chains the iterators of the corresponding partition of every parent RDD, in order. */
  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val unionPartition = s.asInstanceOf[PartitionerAwareUnionRDDPartition]
    val parentsWithSplits = rdds.zip(unionPartition.parents)
    parentsWithSplits.iterator.flatMap { case (parent, split) =>
      parent.iterator(split, context)
    }
  }

  /** Clears the inherited dependencies and drops the reference to the parent RDDs. */
  override def clearDependencies(): Unit = {
    super.clearDependencies()
    rdds = null
  }

  // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones)
  private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = {
    val taskLocations = rdd.context.getPreferredLocs(rdd, part.index)
    taskLocations.map(_.host)
  }
}

UnionRDD类