G-Research · EnricoMi · Mar 7, 2024 · Mar 11, 2024 · Mar 11, 2024 · Mar 11, 2024
diff --git a/core/src/main/scala/org/apache/spark/internal/config/UI.scala b/core/src/main/scala/org/apache/spark/internal/config/UI.scala
@@ -221,7 +221,7 @@ private[spark] object UI {
     .createOptional
 
   val CUSTOM_EXECUTOR_LOG_URL = ConfigBuilder("spark.ui.custom.executor.log.url")
-    .doc("Specifies custom spark executor log url for supporting external log service instead of " +
+    .doc("Specifies custom Spark executor log url for supporting external log service instead of " +
       "using cluster managers' application log urls in the Spark UI. Spark will support " +
       "some path variables via patterns which can vary on cluster manager. Please check the " +
       "documentation for your cluster manager to see which patterns are supported, if any. " +

diff --git a/docs/configuration.md b/docs/configuration.md
@@ -1658,15 +1658,13 @@ Apart from these, the following properties are also available, and may be useful
   <td><code>spark.ui.custom.executor.log.url</code></td>
   <td>(none)</td>
   <td>
-    Specifies custom spark executor log URL for supporting external log service instead of using cluster
+    Specifies custom Spark executor log URL for supporting external log service instead of using cluster
     managers' application log URLs in Spark UI. Spark will support some path variables via patterns
     which can vary on cluster manager. Please check the documentation for your cluster manager to
     see which patterns are supported, if any. <p/>
     Please note that this configuration also replaces original log urls in event log,
     which will be also effective when accessing the application on history server. The new log urls must be
     permanent, otherwise you might have dead link for executor log urls.
-    <p/>
-    For now, only YARN mode supports this configuration
   </td>
   <td>3.0.0</td>
 </tr>

diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md
@@ -1662,6 +1662,30 @@ See the [configuration page](configuration.html) for information on Spark config
   </td>
   <td>3.3.0</td>
 </tr>
+<tr>
+  <td><code>spark.ui.custom.executor.log.url</code></td>
+  <td>(none)</td>
+  <td>
+    Specifies custom Spark executor log URL for supporting external log service instead of using cluster
+    managers' application log URLs in Spark UI. Spark will support some path variables via patterns
+    which can vary on cluster manager. Spark Kubernetes cluster manager supports the following path variables:
+    <ul>
+      <li><code>APP_ID</code>: The unique application id</li>
+      <li><code>EXECUTOR_ID</code>: The executor id (a positive integer larger than zero)</li>
+      <li><code>HOSTNAME</code>: The name of the host where the executor runs</li>
+      <li><code>KUBERNETES_NAMESPACE</code>: The namespace where the executor pods run</li>
+      <li><code>KUBERNETES_POD_NAME</code>: The name of the pod that contains the executor</li>
+      <li><code>FILE_NAME</code>: The name of the log, which is always <code>"log"</code></li>
+    </ul>
+    Please note that this configuration also replaces original log urls in event log,
+    which will be also effective when accessing the application on history server. The new log urls must be
+    permanent, otherwise you might have dead link for executor log urls. <p/>
+    Example: Config value <code>"https://my.custom.url/logs?app={{APP_ID}}&amp;executor={{EXECUTOR_ID}}"</code>
+    adds for application <code>"app-example-123"</code> and executor 1 this link to the Spark UI:
+    <code>https://my.custom.url/logs?app=app-example-123&amp;executor=1</code>
+  </td>
+  <td>4.0.0</td>
+</tr>
 </table>
 
 #### Pod template properties

diff --git a/...ore/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesExecutorBackend.scala b/...ore/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesExecutorBackend.scala
@@ -18,6 +18,7 @@ package org.apache.spark.scheduler.cluster.k8s
 
 import org.apache.spark._
 import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.deploy.k8s.Config.KUBERNETES_NAMESPACE
 import org.apache.spark.deploy.worker.WorkerWatcher
 import org.apache.spark.executor.CoarseGrainedExecutorBackend
 import org.apache.spark.internal.Logging
@@ -28,6 +29,47 @@ import org.apache.spark.rpc._
 import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._
 import org.apache.spark.util.Utils
 
+/**
+ * Custom implementation of CoarseGrainedExecutorBackend for Kubernetes resource manager.
+ * This class provides kubernetes executor attributes.
+ */
+private[spark] class KubernetesExecutorBackend(
+    rpcEnv: RpcEnv,
+    appId: String,
+    driverUrl: String,
+    executorId: String,
+    bindAddress: String,
+    hostname: String,
+    podName: String,
+    cores: Int,
+    env: SparkEnv,
+    resourcesFile: Option[String],
+    resourceProfile: ResourceProfile)
+  extends CoarseGrainedExecutorBackend(
+    rpcEnv,
+    driverUrl,
+    executorId,
+    bindAddress,
+    hostname,
+    cores,
+    env,
+    resourcesFile,
+    resourceProfile) with Logging {
+
+  override def extractAttributes: Map[String, String] = {
+    // give precedence to attributes extracted by overridden method
+    Map(
+      "LOG_FILES" -> "log",
+      "APP_ID" -> appId,
+      "EXECUTOR_ID" -> executorId,
+      "HOSTNAME" -> hostname,
+      "KUBERNETES_NAMESPACE" -> env.conf.get(KUBERNETES_NAMESPACE),
+      "KUBERNETES_POD_NAME" -> podName
+    ) ++ super.extractAttributes
+  }
+
+}
+
 private[spark] object KubernetesExecutorBackend extends Logging {
 
   // Message used internally to start the executor when the driver successfully accepted the
@@ -49,8 +91,8 @@ private[spark] object KubernetesExecutorBackend extends Logging {
   def main(args: Array[String]): Unit = {
     val createFn: (RpcEnv, Arguments, SparkEnv, ResourceProfile, String) =>
       CoarseGrainedExecutorBackend = { case (rpcEnv, arguments, env, resourceProfile, execId) =>
-        new CoarseGrainedExecutorBackend(rpcEnv, arguments.driverUrl, execId,
-        arguments.bindAddress, arguments.hostname, arguments.cores,
+        new KubernetesExecutorBackend(rpcEnv, arguments.appId, arguments.driverUrl, execId,
+        arguments.bindAddress, arguments.hostname, arguments.podName, arguments.cores,
         env, arguments.resourcesFileOpt, resourceProfile)
     }
     run(parseArguments(args, this.getClass.getCanonicalName.stripSuffix("$")), createFn)

diff --git a/...rc/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesExecutorBackendSuite.scala b/...rc/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesExecutorBackendSuite.scala
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.scheduler.cluster.k8s
+
+import org.mockito.Mockito.mock
+import org.mockito.Mockito.when
+
+import org.apache.spark.{SparkConf, SparkEnv, SparkFunSuite}
+import org.apache.spark.resource.ResourceProfile
+import org.apache.spark.rpc.RpcEnv
+
+class KubernetesExecutorBackendSuite extends SparkFunSuite {
+  test("extract log urls and attributes") {
+    val mockRpcEnv = mock(classOf[RpcEnv])
+    val mockSparkEnv = mock(classOf[SparkEnv])
+    val conf = new SparkConf()
+    when(mockSparkEnv.conf).thenReturn(conf)
+    val mockResourceProfile = mock(classOf[ResourceProfile])
+
+    val backend = new KubernetesExecutorBackend(
+      mockRpcEnv, "app-id", "driver-url", "executor-id", "bind-address", "hostname", "pod-name",
+      1, mockSparkEnv, None, mockResourceProfile)
+
+    val expectedKubernetesAttributes = Map(
+      "LOG_FILES" -> "log",
+      "APP_ID" -> "app-id",
+      "EXECUTOR_ID" -> "executor-id",
+      "HOSTNAME" -> "hostname",
+      "KUBERNETES_NAMESPACE" -> "default",
+      "KUBERNETES_POD_NAME" -> "pod-name"
+    )
+
+    assert(backend.extractLogUrls === Map.empty)
+    assert(backend.extractAttributes === expectedKubernetesAttributes)
+
+    withEnvs(
+      "SPARK_LOG_URL_STDOUT" -> "https://my.custom.url/logs/stdout",
+      "SPARK_LOG_URL_STDERR" -> "https://my.custom.url/logs/stderr",
+      "SPARK_EXECUTOR_ATTRIBUTE_ENV_1" -> "val1",
+      "SPARK_EXECUTOR_ATTRIBUTE_ENV_2" -> "val2") {
+      assert(backend.extractLogUrls === Map(
+        "stdout" -> "https://my.custom.url/logs/stdout",
+        "stderr" -> "https://my.custom.url/logs/stderr"
+      ))
+      assert(backend.extractAttributes === Map(
+        "ENV_1" -> "val1", "ENV_2" -> "val2"
+      ) ++ expectedKubernetesAttributes)
+    }
+
+    // env vars have precedence
+    withEnvs(
+      "SPARK_EXECUTOR_ATTRIBUTE_LOG_FILES" -> "env-log",
+      "SPARK_EXECUTOR_ATTRIBUTE_APP_ID" -> "env-app-id",
+      "SPARK_EXECUTOR_ATTRIBUTE_EXECUTOR_ID" -> "env-exec-id",
+      "SPARK_EXECUTOR_ATTRIBUTE_HOSTNAME" -> "env-hostname",
+      "SPARK_EXECUTOR_ATTRIBUTE_KUBERNETES_NAMESPACE" -> "env-namespace",
+      "SPARK_EXECUTOR_ATTRIBUTE_KUBERNETES_POD_NAME" -> "env-pod-name") {
+      assert(backend.extractAttributes === Map(
+        "LOG_FILES" -> "env-log",
+        "APP_ID" -> "env-app-id",
+        "EXECUTOR_ID" -> "env-exec-id",
+        "HOSTNAME" -> "env-hostname",
+        "KUBERNETES_NAMESPACE" -> "env-namespace",
+        "KUBERNETES_POD_NAME" -> "env-pod-name"
+      ))
+    }
+
+    // namespace 'default' above is the config default, here we set a value
+    conf.set("spark.kubernetes.namespace", "my-namespace")
+    assert(backend.extractAttributes === expectedKubernetesAttributes ++
+      Map("KUBERNETES_NAMESPACE" -> "my-namespace"))
+  }
+
+  private def withEnvs(pairs: (String, String)*)(f: => Unit): Unit = {
+    val readonlyEnv = System.getenv()
+    val field = readonlyEnv.getClass.getDeclaredField("m")
+    field.setAccessible(true)
+    val modifiableEnv = field.get(readonlyEnv).asInstanceOf[java.util.Map[String, String]]
+    try {
+      for ((k, v) <- pairs) {
+        assert(!modifiableEnv.containsKey(k))
+        modifiableEnv.put(k, v)
+      }
+      f
+    } finally {
+      for ((k, _) <- pairs) {
+        modifiableEnv.remove(k)
+      }
+    }
+  }
+}