From 16290c8770bb45404c7206450ca52a173245b26d Mon Sep 17 00:00:00 2001
From: Enrico Minack
Date: Mon, 25 Nov 2024 18:45:55 +0100
Subject: [PATCH] Make the number of fallback storage sub-directories configurable

---
 .../spark/internal/config/package.scala      | 14 ++++-
 .../spark/storage/FallbackStorage.scala      | 35 +++++++----
 .../spark/storage/FallbackStorageSuite.scala | 60 ++++++++++++++++++-
 3 files changed, 96 insertions(+), 13 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala
index 0e19143411e96..29f89dc9f23ab 100644
--- a/core/src/main/scala/org/apache/spark/internal/config/package.scala
+++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala
@@ -578,12 +578,24 @@ package object config {
     ConfigBuilder("spark.storage.decommission.fallbackStorage.path")
       .doc("The location for fallback storage during block manager decommissioning. " +
         "For example, `s3a://spark-storage/`. In case of empty, fallback storage is disabled. " +
-        "The storage should be managed by TTL because Spark will not clean it up.")
+        "The storage should be managed by TTL because Spark will not clean it up, " +
+        "unless spark.storage.decommission.fallbackStorage.cleanUp is true.")
       .version("3.1.0")
       .stringConf
       .checkValue(_.endsWith(java.io.File.separator), "Path should end with separator.")
       .createOptional
 
+  private[spark] val STORAGE_DECOMMISSION_FALLBACK_STORAGE_SUBPATHS =
+    ConfigBuilder("spark.storage.decommission.fallbackStorage.subPaths")
+      .doc("When this is 0, the fallback storage puts all files of one shuffle into a single " +
+        "directory. When this is larger than 0, it distributes the files across this number " +
+        "of subdirectories, based on a hash of the file name.")
+      .version("4.0.0")
+      .intConf
+      .checkValue(_ >= 0, "The number of subdirectories must be 0 or larger.")
+      .createWithDefault(Int.MaxValue)
+
+
   private[spark] val STORAGE_DECOMMISSION_FALLBACK_STORAGE_CLEANUP =
     ConfigBuilder("spark.storage.decommission.fallbackStorage.cleanUp")
       .doc("If true, Spark cleans up its fallback storage data during shutting down.")
diff --git a/core/src/main/scala/org/apache/spark/storage/FallbackStorage.scala b/core/src/main/scala/org/apache/spark/storage/FallbackStorage.scala
index 0f2bfaede4454..888f2a0330e33 100644
--- a/core/src/main/scala/org/apache/spark/storage/FallbackStorage.scala
+++ b/core/src/main/scala/org/apache/spark/storage/FallbackStorage.scala
@@ -30,12 +30,13 @@ import org.apache.spark.{SparkConf, SparkException}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.internal.{Logging, MDC}
 import org.apache.spark.internal.LogKeys._
-import org.apache.spark.internal.config.{STORAGE_DECOMMISSION_FALLBACK_STORAGE_CLEANUP, STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH}
+import org.apache.spark.internal.config.{STORAGE_DECOMMISSION_FALLBACK_STORAGE_CLEANUP, STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH, STORAGE_DECOMMISSION_FALLBACK_STORAGE_SUBPATHS}
 import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer}
 import org.apache.spark.network.util.JavaUtils
 import org.apache.spark.rpc.{RpcAddress, RpcEndpointRef, RpcTimeout}
 import org.apache.spark.shuffle.{IndexShuffleBlockResolver, ShuffleBlockInfo}
 import org.apache.spark.shuffle.IndexShuffleBlockResolver.NOOP_REDUCE_ID
+import org.apache.spark.storage.FallbackStorage.getPath
 import org.apache.spark.util.Utils
 
 /**
@@ -62,17 +63,15 @@ private[storage] class FallbackStorage(conf: SparkConf) extends Logging {
         val indexFile = 
r.getIndexFile(shuffleId, mapId) if (indexFile.exists()) { - val hash = JavaUtils.nonNegativeHash(indexFile.getName) fallbackFileSystem.copyFromLocalFile( new Path(Utils.resolveURI(indexFile.getAbsolutePath)), - new Path(fallbackPath, s"$appId/$shuffleId/$hash/${indexFile.getName}")) + getPath(conf, appId, shuffleId, indexFile.getName)) val dataFile = r.getDataFile(shuffleId, mapId) if (dataFile.exists()) { - val hash = JavaUtils.nonNegativeHash(dataFile.getName) fallbackFileSystem.copyFromLocalFile( new Path(Utils.resolveURI(dataFile.getAbsolutePath)), - new Path(fallbackPath, s"$appId/$shuffleId/$hash/${dataFile.getName}")) + getPath(conf, appId, shuffleId, dataFile.getName)) } // Report block statuses @@ -90,8 +89,7 @@ private[storage] class FallbackStorage(conf: SparkConf) extends Logging { } def exists(shuffleId: Int, filename: String): Boolean = { - val hash = JavaUtils.nonNegativeHash(filename) - fallbackFileSystem.exists(new Path(fallbackPath, s"$appId/$shuffleId/$hash/$filename")) + fallbackFileSystem.exists(getPath(conf, appId, shuffleId, filename)) } } @@ -155,6 +153,23 @@ private[spark] object FallbackStorage extends Logging { FALLBACK_BLOCK_MANAGER_ID, blockId, StorageLevel.DISK_ONLY, memSize = 0, dataLength) } + /** + * Provide the Path for a shuffle file. + */ + private[storage] def getPath(conf: SparkConf, + appId: String, + shuffleId: Int, + filename: String): Path = { + val fallbackPath = new Path(conf.get(STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH).get) + val subPaths = conf.get(STORAGE_DECOMMISSION_FALLBACK_STORAGE_SUBPATHS) + if (subPaths > 0) { + val hash = JavaUtils.nonNegativeHash(filename) % subPaths + new Path(fallbackPath, s"$appId/$shuffleId/$hash/$filename") + } else { + new Path(fallbackPath, s"$appId/$shuffleId/$filename") + } + } + /** * Read a ManagedBuffer. 
*/ @@ -176,8 +191,7 @@ private[spark] object FallbackStorage extends Logging { } val name = ShuffleIndexBlockId(shuffleId, mapId, NOOP_REDUCE_ID).name - val hash = JavaUtils.nonNegativeHash(name) - val indexFile = new Path(fallbackPath, s"$appId/$shuffleId/$hash/$name") + val indexFile = getPath(conf, appId, shuffleId, name) val start = startReduceId * 8L val end = endReduceId * 8L Utils.tryWithResource(fallbackFileSystem.open(indexFile)) { inputStream => @@ -187,8 +201,7 @@ private[spark] object FallbackStorage extends Logging { index.skip(end - (start + 8L)) val nextOffset = index.readLong() val name = ShuffleDataBlockId(shuffleId, mapId, NOOP_REDUCE_ID).name - val hash = JavaUtils.nonNegativeHash(name) - val dataFile = new Path(fallbackPath, s"$appId/$shuffleId/$hash/$name") + val dataFile = getPath(conf, appId, shuffleId, name) val size = nextOffset - offset logDebug(s"To byte array $size") val array = new Array[Byte](size.toInt) diff --git a/core/src/test/scala/org/apache/spark/storage/FallbackStorageSuite.scala b/core/src/test/scala/org/apache/spark/storage/FallbackStorageSuite.scala index 6c51bd4ff2e2f..2f4b85ba9c3a3 100644 --- a/core/src/test/scala/org/apache/spark/storage/FallbackStorageSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/FallbackStorageSuite.scala @@ -23,7 +23,7 @@ import scala.concurrent.duration._ import scala.util.Random import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FSDataInputStream, LocalFileSystem, Path, PositionedReadable, Seekable} +import org.apache.hadoop.fs.{FSDataInputStream, LocalFileSystem, Path, PathFilter, PositionedReadable, Seekable} import org.mockito.{ArgumentMatchers => mc} import org.mockito.Mockito.{mock, never, verify, when} import org.scalatest.concurrent.Eventually.{eventually, interval, timeout} @@ -39,6 +39,7 @@ import org.apache.spark.scheduler.ExecutorDecommissionInfo import org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend import org.apache.spark.shuffle.{IndexShuffleBlockResolver, ShuffleBlockInfo} import org.apache.spark.shuffle.IndexShuffleBlockResolver.NOOP_REDUCE_ID +import org.apache.spark.util.HadoopFSUtils import org.apache.spark.util.Utils.tryWithResource class FallbackStorageSuite extends SparkFunSuite with LocalSparkContext { @@ -293,6 +294,62 @@ class FallbackStorageSuite extends SparkFunSuite with LocalSparkContext { } } + Seq(0, 1, 2, 4, 1024, Int.MaxValue).foreach { subPaths => + test(s"Get path for filename with $subPaths subdirectories") { + val conf = getSparkConf(2, 2).set(STORAGE_DECOMMISSION_FALLBACK_STORAGE_SUBPATHS, subPaths) + val path = conf.get(STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH).get + val appId = "app-id" + val shuffleId = 123 + val filename = "the-file" + val actual = FallbackStorage.getPath(conf, appId, shuffleId, filename) + val expected = if (subPaths == 0) { + new Path(s"${path}/$appId/$shuffleId/$filename") + } else { + new Path(s"${path}/$appId/$shuffleId/${1049883992 % subPaths}/$filename") + } + assert(actual == expected) + } + } + + Seq(0, 1, 2, 4, 1024, Int.MaxValue).foreach { subPaths => + test(s"Control number of sub-directories ($subPaths)") { + val conf = getSparkConf(2, 2).set(STORAGE_DECOMMISSION_FALLBACK_STORAGE_SUBPATHS, subPaths) + sc = new SparkContext(conf) + withSpark(sc) { sc => + TestUtils.waitUntilExecutorsUp(sc, 2, 60000) + val rdd1 = sc.parallelize(1 to 10, 10) + val rdd2 = rdd1.map(x => (x % 2, 1)) + val rdd3 = rdd2.reduceByKey(_ + _) + assert(rdd3.count() === 2) + + // Decommission all + val sched = 
sc.schedulerBackend.asInstanceOf[StandaloneSchedulerBackend] + sc.getExecutorIds().foreach { + sched.decommissionExecutor(_, ExecutorDecommissionInfo(""), false) + } + + // We expect two files per partition, with ten partitions + val files = 0 until 10 flatMap (idx => Seq( + s"shuffle_0_${idx}_0.index", s"shuffle_0_${idx}_0.data") + ) + val fallbackStorage = new FallbackStorage(sc.getConf) + // Uploading is completed on decommissioned executors + eventually(timeout(20.seconds), interval(1.seconds)) { + files.foreach { file => assert(fallbackStorage.exists(0, file)) } + } + + // Check number of subdirectories + val path = conf.get(STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH).get + val noopFilter = new PathFilter { + override def accept(path: Path): Boolean = true + } + val subDirs = HadoopFSUtils.listFiles(new Path(path), sc.hadoopConfiguration, noopFilter) + .flatMap(_._2.map(_.getPath.getParent)).toSet.toList + assert(subDirs.length == Math.max(1, Math.min(subPaths, 20)), subDirs.mkString(", ")) + } + } + } + CompressionCodec.shortCompressionCodecNames.keys.foreach { codec => test(s"$codec - Newly added executors should access old data from remote storage") { sc = new SparkContext(getSparkConf(2, 0).set(IO_COMPRESSION_CODEC, codec)) @@ -335,6 +392,7 @@ class FallbackStorageSuite extends SparkFunSuite with LocalSparkContext { } } } + class ReadPartialInputStream(val in: FSDataInputStream) extends InputStream with Seekable with PositionedReadable { override def read: Int = in.read
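Illustration (not part of the patch): a minimal, self-contained sketch of the directory layout implied by the new spark.storage.decommission.fallbackStorage.subPaths setting, mirroring the branching of the getPath helper added above. The object name, the sample app id and file names, and the simplified hash stand-in are made up for this example; the patched code uses JavaUtils.nonNegativeHash instead.

object FallbackLayoutSketch {
  // Simplified stand-in for JavaUtils.nonNegativeHash (illustrative only).
  private def nonNegativeHash(s: String): Int = s.hashCode & Int.MaxValue

  // Same branching as the patched FallbackStorage.getPath: with subPaths == 0 all files of a
  // shuffle share one directory; otherwise each file lands in sub-directory hash(filename) % subPaths.
  def relativePath(appId: String, shuffleId: Int, filename: String, subPaths: Int): String = {
    if (subPaths > 0) {
      val hash = nonNegativeHash(filename) % subPaths
      s"$appId/$shuffleId/$hash/$filename"
    } else {
      s"$appId/$shuffleId/$filename"
    }
  }

  def main(args: Array[String]): Unit = {
    val files = Seq("shuffle_0_0_0.index", "shuffle_0_0_0.data", "shuffle_0_1_0.index")
    Seq(0, 4).foreach { subPaths =>
      println(s"subPaths = $subPaths")
      files.foreach(f => println("  " + relativePath("app-20241125-0001", 0, f, subPaths)))
    }
  }
}

With the patch applied, the option would be supplied like any other Spark configuration, for example --conf spark.storage.decommission.fallbackStorage.subPaths=64 on spark-submit; the default of Int.MaxValue keeps one hash-based sub-directory per distinct hash value, which closely matches the previous layout.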