From 6d908dd76dd9de6dbfe089b2fb1410474978b345 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Tue, 20 Aug 2024 16:29:07 -0700 Subject: [PATCH 01/20] migrate --- .../resources/texera-compiler-web-config.yml | 42 +++++++++ .../web/TexeraCompilerWebApplication.scala | 85 +++++++++++++++++++ .../web/TexeraCompilerWebConfiguration.java | 6 ++ .../WorkflowCompilationResource.scala | 67 +++++++++++++++ .../common/workflow/PhysicalPlan.scala | 2 +- .../common/workflow/WorkflowCompiler.scala | 72 ++++++++++++++++ 6 files changed, 273 insertions(+), 1 deletion(-) create mode 100644 core/amber/src/main/resources/texera-compiler-web-config.yml create mode 100644 core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraCompilerWebApplication.scala create mode 100644 core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraCompilerWebConfiguration.java create mode 100644 core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala diff --git a/core/amber/src/main/resources/texera-compiler-web-config.yml b/core/amber/src/main/resources/texera-compiler-web-config.yml new file mode 100644 index 00000000000..f76a6ac5052 --- /dev/null +++ b/core/amber/src/main/resources/texera-compiler-web-config.yml @@ -0,0 +1,42 @@ +server: + # modify applicationContextPath if you want the root path to be the name of the application + # for example, set it to /twitter, then the url will become texera.ics.uci.edu:port/twitter + applicationContextPath: / + applicationConnectors: + - type: http + port: 9090 + adminConnectors: + - type: http + port: 9091 + requestLog: + type: classic + timeZone: UTC + appenders: + - type: console + - type: file + currentLogFilename: ../log/access-texera-compiler.log + threshold: ALL + queueSize: 512 + discardingThreshold: 0 + archive: true + archivedLogFilenamePattern: ../log/access-texera-compiler-%d{yyyy-MM-dd}.log.gz + archivedFileCount: 7 + bufferSize: 8KiB + immediateFlush: true +logging: + level: INFO + loggers: + "io.dropwizard": INFO + appenders: + - type: console + logFormat: "[%date{ISO8601}] [%level] [%logger] [%thread] - %msg %n" + - type: file + currentLogFilename: ../log/texera-compiler-server.log + threshold: ALL + queueSize: 512 + discardingThreshold: 0 + archive: false + timeZone: UTC + logFormat: "[%date{ISO8601}] [%level] [%logger] [%thread] - %msg %n" + bufferSize: 8KiB + immediateFlush: true \ No newline at end of file diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraCompilerWebApplication.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraCompilerWebApplication.scala new file mode 100644 index 00000000000..a0ec0e08c10 --- /dev/null +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraCompilerWebApplication.scala @@ -0,0 +1,85 @@ +package edu.uci.ics.texera.web + +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import com.github.toastshaman.dropwizard.auth.jwt.JwtAuthFilter +import com.typesafe.scalalogging.LazyLogging +import edu.uci.ics.amber.engine.common.AmberConfig +import edu.uci.ics.texera.Utils +import edu.uci.ics.texera.web.TexeraWebApplication.parseArgs +import edu.uci.ics.texera.web.auth.JwtAuth.jwtConsumer +import edu.uci.ics.texera.web.auth.{ + GuestAuthFilter, + SessionUser, + UserAuthenticator, + UserRoleAuthorizer +} +import edu.uci.ics.texera.web.resource.WorkflowCompilationResource +import io.dropwizard.auth.{AuthDynamicFeature, AuthValueFactoryProvider} +import io.dropwizard.setup.{Bootstrap, Environment} +import org.eclipse.jetty.server.session.SessionHandler +import 
org.glassfish.jersey.server.filter.RolesAllowedDynamicFeature + +object TexeraCompilerWebApplication { + def main(args: Array[String]): Unit = { + val argMap = parseArgs(args) + + new TexeraCompilerWebApplication().run( + "server", + Utils.amberHomePath + .resolve("src") + .resolve("main") + .resolve("resources") + .resolve("texera-compiler-web-config.yml") + .toString + ) + } +} + +class TexeraCompilerWebApplication + extends io.dropwizard.Application[TexeraCompilerWebConfiguration] + with LazyLogging { + override def initialize(bootstrap: Bootstrap[TexeraCompilerWebConfiguration]): Unit = { + // register scala module to dropwizard default object mapper + bootstrap.getObjectMapper.registerModule(DefaultScalaModule) + } + + override def run( + configuration: TexeraCompilerWebConfiguration, + environment: Environment + ): Unit = { + // serve backend at /api/texera + environment.jersey.setUrlPattern("/api/texera/*") + + // register SessionHandler + environment.jersey.register(classOf[SessionHandler]) + environment.servlets.setSessionHandler(new SessionHandler) + environment.jersey.register(classOf[WorkflowCompilationResource]) + + if (AmberConfig.isUserSystemEnabled) { + // register JWT Auth layer + environment.jersey.register( + new AuthDynamicFeature( + new JwtAuthFilter.Builder[SessionUser]() + .setJwtConsumer(jwtConsumer) + .setRealm("realm") + .setPrefix("Bearer") + .setAuthenticator(UserAuthenticator) + .setAuthorizer(UserRoleAuthorizer) + .buildAuthFilter() + ) + ) + } else { + // register Guest Auth layer + environment.jersey.register( + new AuthDynamicFeature( + new GuestAuthFilter.Builder().setAuthorizer(UserRoleAuthorizer).buildAuthFilter() + ) + ) + } + + environment.jersey.register( + new AuthValueFactoryProvider.Binder[SessionUser](classOf[SessionUser]) + ) + environment.jersey.register(classOf[RolesAllowedDynamicFeature]) + } +} diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraCompilerWebConfiguration.java b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraCompilerWebConfiguration.java new file mode 100644 index 00000000000..7aa01abcbd3 --- /dev/null +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraCompilerWebConfiguration.java @@ -0,0 +1,6 @@ +package edu.uci.ics.texera.web; + +import io.dropwizard.Configuration; + +public class TexeraCompilerWebConfiguration extends Configuration { +} diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala new file mode 100644 index 00000000000..9755925eb14 --- /dev/null +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala @@ -0,0 +1,67 @@ +package edu.uci.ics.texera.web.resource + +import com.typesafe.scalalogging.LazyLogging +import edu.uci.ics.amber.engine.common.virtualidentity.WorkflowIdentity +import edu.uci.ics.texera.Utils +import edu.uci.ics.texera.web.auth.SessionUser +import edu.uci.ics.texera.web.model.http.response.SchemaPropagationResponse +import edu.uci.ics.texera.web.model.websocket.request.LogicalPlanPojo +import edu.uci.ics.texera.workflow.common.WorkflowContext +import edu.uci.ics.texera.workflow.common.tuple.schema.Attribute +import edu.uci.ics.texera.workflow.common.workflow.{PhysicalPlan, WorkflowCompiler} +import io.dropwizard.auth.Auth +import org.jooq.types.UInteger + +import javax.annotation.security.RolesAllowed +import javax.ws.rs.{Consumes, POST, Path, PathParam, Produces} +import 
javax.ws.rs.core.MediaType + +case class WorkflowCompilationResponse( + physicalPlan: PhysicalPlan, + operatorInputSchemas: Map[String, List[Option[List[Attribute]]]], + operatorErrors: Map[String, String] +) + +@Consumes(Array(MediaType.APPLICATION_JSON)) +@Produces(Array(MediaType.APPLICATION_JSON)) +@Path("/compilation") +class WorkflowCompilationResource extends LazyLogging { + @POST + @Path("/{wid}") + @RolesAllowed(Array("REGULAR", "ADMIN")) + def suggestAutocompleteSchema( + workflowStr: String, + @PathParam("wid") wid: UInteger, + @Auth sessionUser: SessionUser + ): WorkflowCompilationResponse = { + val logicalPlanPojo = Utils.objectMapper.readValue(workflowStr, classOf[LogicalPlanPojo]) + + val context = new WorkflowContext( + userId = Option(sessionUser.getUser.getUid), + workflowId = WorkflowIdentity(wid.toString.toLong) + ) + + // compile the pojo + val workflowCompilationResult = new WorkflowCompiler(context).cleanCompile(logicalPlanPojo) + + // return the result + WorkflowCompilationResponse( + physicalPlan = workflowCompilationResult.physicalPlan, + operatorInputSchemas = workflowCompilationResult.operatorIdToInputSchemas.map { + case (operatorIdentity, schemas) => + val opId = operatorIdentity.id + val attributes = schemas.map { schema => + if (schema.isEmpty) + None + else + Some(schema.get.attributes) + } + + (opId, attributes) + }, + operatorErrors = workflowCompilationResult.operatorIdToError.map { + case (operatorIdentity, error) => (operatorIdentity.id, error.toString) + } + ) + } +} diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/PhysicalPlan.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/PhysicalPlan.scala index df16fe76be1..560d4cfcbfe 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/PhysicalPlan.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/PhysicalPlan.scala @@ -19,7 +19,7 @@ import org.jgrapht.util.SupplierUtil import scala.jdk.CollectionConverters.{IteratorHasAsScala, ListHasAsScala, SetHasAsScala} object PhysicalPlan { - + def empty: PhysicalPlan = PhysicalPlan(operators = Set.empty, links = Set.empty) def apply(context: WorkflowContext, logicalPlan: LogicalPlan): PhysicalPlan = { var physicalPlan = PhysicalPlan(operators = Set.empty, links = Set.empty) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala index 1e36c8b1a70..54923cebe55 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala @@ -15,16 +15,53 @@ import edu.uci.ics.texera.web.workflowruntimestate.WorkflowAggregatedState.FAILE import edu.uci.ics.texera.web.workflowruntimestate.WorkflowFatalError import edu.uci.ics.texera.workflow.common.WorkflowContext import edu.uci.ics.texera.workflow.common.storage.OpResultStorage +import edu.uci.ics.texera.workflow.common.tuple.schema.{Attribute, Schema} import edu.uci.ics.texera.workflow.operators.sink.managed.ProgressiveSinkOpDesc import edu.uci.ics.texera.workflow.operators.visualization.VisualizationConstants import java.time.Instant +import scala.collection.mutable import scala.collection.mutable.ArrayBuffer +case class WorkflowCompilationResult( + physicalPlan: PhysicalPlan, + operatorIdToInputSchemas: Map[OperatorIdentity, 
List[Option[Schema]]], + operatorIdToError: Map[OperatorIdentity, WorkflowFatalError] +) + class WorkflowCompiler( context: WorkflowContext ) extends LazyLogging { + def compileToLogicalPlan( + logicalPlanPojo: LogicalPlanPojo + ): (LogicalPlan, Map[OperatorIdentity, WorkflowFatalError]) = { + val errorList = new ArrayBuffer[(OperatorIdentity, Throwable)]() + val opIdToError = mutable.Map[OperatorIdentity, WorkflowFatalError]() + + var logicalPlan: LogicalPlan = LogicalPlan(logicalPlanPojo) + logicalPlan = SinkInjectionTransformer.transform( + logicalPlanPojo.opsToViewResult, + logicalPlan + ) + + logicalPlan.propagateWorkflowSchema(context, Some(errorList)) + // report compilation errors + if (errorList.nonEmpty) { + errorList.foreach { + case (opId, err) => + logger.error("error occurred in logical plan compilation", err) + opIdToError += (opId -> WorkflowFatalError( + COMPILATION_ERROR, + Timestamp(Instant.now), + err.toString, + getStackTraceWithAllCauses(err), + opId.id + )) + } + } + (logicalPlan, opIdToError.toMap) + } def compileLogicalPlan( logicalPlanPojo: LogicalPlanPojo, executionStateStore: ExecutionStateStore @@ -92,6 +129,41 @@ class WorkflowCompiler( ) } + def cleanCompile( + logicalPlanPojo: LogicalPlanPojo + ): WorkflowCompilationResult = { + val (logicalPlan, opIdToError) = compileToLogicalPlan(logicalPlanPojo) + if (opIdToError.nonEmpty) { + // encounter error during compile the logical plan pojo to logical plan, + // so directly return empty physical plan, empty schema map and error + return WorkflowCompilationResult( + physicalPlan = PhysicalPlan.empty, + operatorIdToInputSchemas = Map.empty, + operatorIdToError = opIdToError + ) + } + // the PhysicalPlan with topology expanded. + val physicalPlan = PhysicalPlan(context, logicalPlan) + + // Extract physical input schemas, excluding internal ports + val physicalInputSchemas = physicalPlan.operators.map { physicalOp => + physicalOp.id -> physicalOp.inputPorts.values + .filterNot(_._1.id.internal) + .map { + case (port, _, schema) => port.id -> schema.toOption + } + } + + // Group the physical input schemas by their logical operator ID and consolidate the schemas + val opIdToInputSchemas = physicalInputSchemas + .groupBy(_._1.logicalOpId) + .view + .mapValues(_.flatMap(_._2).toList.sortBy(_._1.id).map(_._2)) + .toMap + + WorkflowCompilationResult(physicalPlan, opIdToInputSchemas, Map.empty) + } + private def assignSinkStorage( logicalPlan: LogicalPlan, context: WorkflowContext, From c39bf9cb9fa8f8ef43c361e0b407f26e523c12bc Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Wed, 21 Aug 2024 09:28:32 -0700 Subject: [PATCH 02/20] organize codes --- .../WorkflowCompilationResource.scala | 4 +- .../resource/WorkflowWebsocketResource.scala | 1 + .../common/workflow/WorkflowCompiler.scala | 89 +++++++++++-------- 3 files changed, 54 insertions(+), 40 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala index 9755925eb14..32630f54267 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala @@ -29,7 +29,7 @@ class WorkflowCompilationResource extends LazyLogging { @POST @Path("/{wid}") @RolesAllowed(Array("REGULAR", "ADMIN")) - def suggestAutocompleteSchema( + def compileWorkflow( workflowStr: String, @PathParam("wid") wid: UInteger, 
@Auth sessionUser: SessionUser @@ -42,7 +42,7 @@ class WorkflowCompilationResource extends LazyLogging { ) // compile the pojo - val workflowCompilationResult = new WorkflowCompiler(context).cleanCompile(logicalPlanPojo) + val workflowCompilationResult = new WorkflowCompiler(context).compileToPhysicalPlan(logicalPlanPojo) // return the result WorkflowCompilationResponse( diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowWebsocketResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowWebsocketResource.scala index 071c8ba839a..8519906f5e7 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowWebsocketResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowWebsocketResource.scala @@ -91,6 +91,7 @@ class WorkflowWebsocketResource extends LazyLogging { sessionState.send(modifyLogicResponse) } case editingTimeCompilationRequest: EditingTimeCompilationRequest => + // TODO: remove this after separating the workflow compiler as a standalone service val stateStore = if (executionStateOpt.isDefined) { val currentState = executionStateOpt.get.executionStateStore.metadataStore.getState.state diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala index 54923cebe55..3db345e1db7 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala @@ -33,6 +33,11 @@ class WorkflowCompiler( context: WorkflowContext ) extends LazyLogging { + /** + * Compile the workflow to logical plan and errors(if any) + * @param logicalPlanPojo the pojo parsed from workflow str provided by user + * @return LogicalPlan, and a Map from OpId to Op's error(this map is empty if there is no error) + */ def compileToLogicalPlan( logicalPlanPojo: LogicalPlanPojo ): (LogicalPlan, Map[OperatorIdentity, WorkflowFatalError]) = { @@ -62,11 +67,53 @@ class WorkflowCompiler( } (logicalPlan, opIdToError.toMap) } + + /** + * Compile a workflow to physical plan, along with the schema propagation result and error(if any) + * + * @param logicalPlanPojo the pojo parsed from workflow str provided by user + * @return WorkflowCompilationResult, containing the physical plan, input schemas per op and error per op + */ + def compileToPhysicalPlan( + logicalPlanPojo: LogicalPlanPojo + ): WorkflowCompilationResult = { + val (logicalPlan, opIdToError) = compileToLogicalPlan(logicalPlanPojo) + if (opIdToError.nonEmpty) { + // encounter errors during compile pojo to logical plan, + // so directly return empty physical plan, schema map and non-empty error map + return WorkflowCompilationResult( + physicalPlan = PhysicalPlan.empty, + operatorIdToInputSchemas = Map.empty, + operatorIdToError = opIdToError + ) + } + // from logical plan to physical plan + val physicalPlan = PhysicalPlan(context, logicalPlan) + + // Extract physical input schemas, excluding internal ports + val physicalInputSchemas = physicalPlan.operators.map { physicalOp => + physicalOp.id -> physicalOp.inputPorts.values + .filterNot(_._1.id.internal) + .map { + case (port, _, schema) => port.id -> schema.toOption + } + } + + // Group the physical input schemas by their logical operator ID and consolidate the schemas + val opIdToInputSchemas = physicalInputSchemas + .groupBy(_._1.logicalOpId) + .view + 
.mapValues(_.flatMap(_._2).toList.sortBy(_._1.id).map(_._2)) + .toMap + + WorkflowCompilationResult(physicalPlan, opIdToInputSchemas, Map.empty) + } + def compileLogicalPlan( logicalPlanPojo: LogicalPlanPojo, executionStateStore: ExecutionStateStore ): LogicalPlan = { - + // TODO: remove this function after separating compiler as a standalone service val errorList = new ArrayBuffer[(OperatorIdentity, Throwable)]() // remove previous error state executionStateStore.metadataStore.updateState { metadataStore => @@ -109,10 +156,11 @@ class WorkflowCompiler( opResultStorage: OpResultStorage, executionStateStore: ExecutionStateStore ): Workflow = { + // TODO: remove this function after separating compiler as a standalone service // generate a LogicalPlan. The logical plan is the injected with all necessary sinks val logicalPlan = compileLogicalPlan(logicalPlanPojo, executionStateStore) - // assign the storage location to sink operators + // TODO: push the sink storage assignment directly on physical plan in workflow execution service assignSinkStorage( logicalPlan, context, @@ -129,41 +177,6 @@ class WorkflowCompiler( ) } - def cleanCompile( - logicalPlanPojo: LogicalPlanPojo - ): WorkflowCompilationResult = { - val (logicalPlan, opIdToError) = compileToLogicalPlan(logicalPlanPojo) - if (opIdToError.nonEmpty) { - // encounter error during compile the logical plan pojo to logical plan, - // so directly return empty physical plan, empty schema map and error - return WorkflowCompilationResult( - physicalPlan = PhysicalPlan.empty, - operatorIdToInputSchemas = Map.empty, - operatorIdToError = opIdToError - ) - } - // the PhysicalPlan with topology expanded. - val physicalPlan = PhysicalPlan(context, logicalPlan) - - // Extract physical input schemas, excluding internal ports - val physicalInputSchemas = physicalPlan.operators.map { physicalOp => - physicalOp.id -> physicalOp.inputPorts.values - .filterNot(_._1.id.internal) - .map { - case (port, _, schema) => port.id -> schema.toOption - } - } - - // Group the physical input schemas by their logical operator ID and consolidate the schemas - val opIdToInputSchemas = physicalInputSchemas - .groupBy(_._1.logicalOpId) - .view - .mapValues(_.flatMap(_._2).toList.sortBy(_._1.id).map(_._2)) - .toMap - - WorkflowCompilationResult(physicalPlan, opIdToInputSchemas, Map.empty) - } - private def assignSinkStorage( logicalPlan: LogicalPlan, context: WorkflowContext, @@ -171,7 +184,7 @@ class WorkflowCompiler( reuseStorageSet: Set[OperatorIdentity] = Set() ): Unit = { // create a JSON object that holds pointers to the workflow's results in Mongo - // TODO in the future, will extract this logic from here when we need pointers to the stats storage + // TODO: move it to the execution service, and change the 1st parameter from LogicalPlan to PhysicalPlan val resultsJSON = objectMapper.createObjectNode() val sinksPointers = objectMapper.createArrayNode() // assign storage to texera-managed sinks before generating exec config From 9c4200221e07bc944861d3569795df622ab87988 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Thu, 22 Aug 2024 08:57:42 -0700 Subject: [PATCH 03/20] remove sessions --- .../web/TexeraCompilerWebConfiguration.java | 6 --- ...la => TexeraWorkflowCompilerService.scala} | 45 ++++--------------- ...rkflowCompilerWebServiceConfiguration.java | 6 +++ .../WorkflowCompilationResource.scala | 3 -- 4 files changed, 14 insertions(+), 46 deletions(-) delete mode 100644 core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraCompilerWebConfiguration.java rename 
core/amber/src/main/scala/edu/uci/ics/texera/web/{TexeraCompilerWebApplication.scala => TexeraWorkflowCompilerService.scala} (51%) create mode 100644 core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilerWebServiceConfiguration.java diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraCompilerWebConfiguration.java b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraCompilerWebConfiguration.java deleted file mode 100644 index 7aa01abcbd3..00000000000 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraCompilerWebConfiguration.java +++ /dev/null @@ -1,6 +0,0 @@ -package edu.uci.ics.texera.web; - -import io.dropwizard.Configuration; - -public class TexeraCompilerWebConfiguration extends Configuration { -} diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraCompilerWebApplication.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilerService.scala similarity index 51% rename from core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraCompilerWebApplication.scala rename to core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilerService.scala index a0ec0e08c10..3c7c7266224 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraCompilerWebApplication.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilerService.scala @@ -19,11 +19,11 @@ import io.dropwizard.setup.{Bootstrap, Environment} import org.eclipse.jetty.server.session.SessionHandler import org.glassfish.jersey.server.filter.RolesAllowedDynamicFeature -object TexeraCompilerWebApplication { +object TexeraWorkflowCompilerService { def main(args: Array[String]): Unit = { val argMap = parseArgs(args) - new TexeraCompilerWebApplication().run( + new TexeraWorkflowCompilerService().run( "server", Utils.amberHomePath .resolve("src") @@ -35,51 +35,22 @@ object TexeraCompilerWebApplication { } } -class TexeraCompilerWebApplication - extends io.dropwizard.Application[TexeraCompilerWebConfiguration] +class TexeraWorkflowCompilerService + extends io.dropwizard.Application[TexeraWorkflowCompilerWebServiceConfiguration] with LazyLogging { - override def initialize(bootstrap: Bootstrap[TexeraCompilerWebConfiguration]): Unit = { + override def initialize(bootstrap: Bootstrap[TexeraWorkflowCompilerWebServiceConfiguration]): Unit = { // register scala module to dropwizard default object mapper bootstrap.getObjectMapper.registerModule(DefaultScalaModule) } override def run( - configuration: TexeraCompilerWebConfiguration, - environment: Environment + configuration: TexeraWorkflowCompilerWebServiceConfiguration, + environment: Environment ): Unit = { // serve backend at /api/texera environment.jersey.setUrlPattern("/api/texera/*") - // register SessionHandler - environment.jersey.register(classOf[SessionHandler]) - environment.servlets.setSessionHandler(new SessionHandler) + // register the compilation endpoint environment.jersey.register(classOf[WorkflowCompilationResource]) - - if (AmberConfig.isUserSystemEnabled) { - // register JWT Auth layer - environment.jersey.register( - new AuthDynamicFeature( - new JwtAuthFilter.Builder[SessionUser]() - .setJwtConsumer(jwtConsumer) - .setRealm("realm") - .setPrefix("Bearer") - .setAuthenticator(UserAuthenticator) - .setAuthorizer(UserRoleAuthorizer) - .buildAuthFilter() - ) - ) - } else { - // register Guest Auth layer - environment.jersey.register( - new AuthDynamicFeature( - new GuestAuthFilter.Builder().setAuthorizer(UserRoleAuthorizer).buildAuthFilter() - ) - ) - } - - 
environment.jersey.register( - new AuthValueFactoryProvider.Binder[SessionUser](classOf[SessionUser]) - ) - environment.jersey.register(classOf[RolesAllowedDynamicFeature]) } } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilerWebServiceConfiguration.java b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilerWebServiceConfiguration.java new file mode 100644 index 00000000000..75d04bcbc9b --- /dev/null +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilerWebServiceConfiguration.java @@ -0,0 +1,6 @@ +package edu.uci.ics.texera.web; + +import io.dropwizard.Configuration; + +public class TexeraWorkflowCompilerWebServiceConfiguration extends Configuration { +} diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala index 32630f54267..c3accc2c0f9 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala @@ -28,16 +28,13 @@ case class WorkflowCompilationResponse( class WorkflowCompilationResource extends LazyLogging { @POST @Path("/{wid}") - @RolesAllowed(Array("REGULAR", "ADMIN")) def compileWorkflow( workflowStr: String, @PathParam("wid") wid: UInteger, - @Auth sessionUser: SessionUser ): WorkflowCompilationResponse = { val logicalPlanPojo = Utils.objectMapper.readValue(workflowStr, classOf[LogicalPlanPojo]) val context = new WorkflowContext( - userId = Option(sessionUser.getUser.getUid), workflowId = WorkflowIdentity(wid.toString.toLong) ) From 4889821fa415ecba1840142bc95e61ebc35637e8 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Thu, 22 Aug 2024 16:12:22 -0700 Subject: [PATCH 04/20] fix null issue --- .../operators/sink/managed/ProgressiveSinkOpDesc.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/sink/managed/ProgressiveSinkOpDesc.java b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/sink/managed/ProgressiveSinkOpDesc.java index b73c5b32eb1..552a6996617 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/sink/managed/ProgressiveSinkOpDesc.java +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/sink/managed/ProgressiveSinkOpDesc.java @@ -58,7 +58,9 @@ public class ProgressiveSinkOpDesc extends SinkOpDesc { @Override public PhysicalOp getPhysicalOp(WorkflowIdentity workflowId, ExecutionIdentity executionId) { - final SinkStorageWriter writer = storage.getStorageWriter(); + // Since during workflow compilation phase, the storage can be null, the writer should also be null + // the writer will be set property when workflow execution service receives the physical plan + final SinkStorageWriter writer = (storage != null) ? 
storage.getStorageWriter() : null; return PhysicalOp.localPhysicalOp( workflowId, executionId, From 11cf8847ed7a174bef3b42a74fcb4f6a36c67676 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Fri, 23 Aug 2024 19:25:41 -0700 Subject: [PATCH 05/20] resolving comments --- .../deploysemantics/PhysicalOpPojo.scala | 66 +++++++++++++++++++ .../web/TexeraWorkflowCompilerService.scala | 8 ++- .../request/WorkflowExecuteRequest.scala | 7 ++ .../WorkflowCompilationResource.scala | 21 ++++-- .../metadata/PropertyNameConstants.scala | 13 ++++ .../common/workflow/WorkflowCompiler.scala | 4 +- 6 files changed, 108 insertions(+), 11 deletions(-) create mode 100644 core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOpPojo.scala diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOpPojo.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOpPojo.scala new file mode 100644 index 00000000000..a6b29afbf84 --- /dev/null +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOpPojo.scala @@ -0,0 +1,66 @@ +package edu.uci.ics.amber.engine.architecture.deploysemantics + +import edu.uci.ics.amber.engine.architecture.deploysemantics.locationpreference.LocationPreference +import edu.uci.ics.amber.engine.common.virtualidentity.{ + ExecutionIdentity, + PhysicalOpIdentity, + WorkflowIdentity +} +import edu.uci.ics.amber.engine.common.workflow.{InputPort, OutputPort, PhysicalLink, PortIdentity} +import edu.uci.ics.texera.workflow.common.metadata.PropertyNameConstants +import edu.uci.ics.texera.workflow.common.tuple.schema.Schema +import edu.uci.ics.texera.workflow.common.workflow.PartitionInfo +import org.codehaus.jackson.annotate.JsonProperty + +object PhysicalOpPojo { + def apply(physicalOp: PhysicalOp): PhysicalOpPojo = { + + val result = new PhysicalOpPojo() + result.id = physicalOp.id + result.workflowId = physicalOp.workflowId + result.executionId = physicalOp.executionId + result.parallelizable = physicalOp.parallelizable + result.locationPreference = physicalOp.locationPreference + result.partitionRequirement = physicalOp.partitionRequirement + result.inputPorts = physicalOp.inputPorts + result.outputPorts = physicalOp.outputPorts + result.isOneToManyOp = physicalOp.isOneToManyOp + result.suggestedWorkerNum = physicalOp.suggestedWorkerNum + + result + } +} + +class PhysicalOpPojo extends Serializable { + + @JsonProperty(PropertyNameConstants.OPERATOR_ID) + var id: PhysicalOpIdentity = _ + + @JsonProperty(PropertyNameConstants.WORKFLOW_ID) + var workflowId: WorkflowIdentity = _ + + @JsonProperty(PropertyNameConstants.EXECUTION_ID) + var executionId: ExecutionIdentity = _ + + @JsonProperty(PropertyNameConstants.PARALLELIZABLE) + var parallelizable: Boolean = _ + + @JsonProperty(PropertyNameConstants.LOCATION_PREFERENCE) + var locationPreference: Option[LocationPreference] = _ + + @JsonProperty(PropertyNameConstants.PARTITION_REQUIREMENT) + var partitionRequirement: List[Option[PartitionInfo]] = _ + + @JsonProperty(PropertyNameConstants.INPUT_PORTS) + var inputPorts: Map[PortIdentity, (InputPort, List[PhysicalLink], Either[Throwable, Schema])] = _ + + @JsonProperty(PropertyNameConstants.OUTPUT_PORTS) + var outputPorts: Map[PortIdentity, (OutputPort, List[PhysicalLink], Either[Throwable, Schema])] = + _ + + @JsonProperty(PropertyNameConstants.IS_ONE_TO_MANY_OP) + var isOneToManyOp: Boolean = _ + + 
@JsonProperty(PropertyNameConstants.SUGGESTED_WORKER_NUM) + var suggestedWorkerNum: Option[Int] = _ +} diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilerService.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilerService.scala index 3c7c7266224..87f7066ace4 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilerService.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilerService.scala @@ -38,14 +38,16 @@ object TexeraWorkflowCompilerService { class TexeraWorkflowCompilerService extends io.dropwizard.Application[TexeraWorkflowCompilerWebServiceConfiguration] with LazyLogging { - override def initialize(bootstrap: Bootstrap[TexeraWorkflowCompilerWebServiceConfiguration]): Unit = { + override def initialize( + bootstrap: Bootstrap[TexeraWorkflowCompilerWebServiceConfiguration] + ): Unit = { // register scala module to dropwizard default object mapper bootstrap.getObjectMapper.registerModule(DefaultScalaModule) } override def run( - configuration: TexeraWorkflowCompilerWebServiceConfiguration, - environment: Environment + configuration: TexeraWorkflowCompilerWebServiceConfiguration, + environment: Environment ): Unit = { // serve backend at /api/texera environment.jersey.setUrlPattern("/api/texera/*") diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala index 04e94187628..db5ec44c1fb 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala @@ -1,6 +1,8 @@ package edu.uci.ics.texera.web.model.websocket.request import com.fasterxml.jackson.databind.annotation.JsonDeserialize +import edu.uci.ics.amber.engine.architecture.deploysemantics.PhysicalOpPojo +import edu.uci.ics.amber.engine.common.workflow.PhysicalLink import edu.uci.ics.texera.workflow.common.operators.LogicalOp import edu.uci.ics.texera.workflow.common.workflow.{LogicalLink, WorkflowSettings} @@ -24,3 +26,8 @@ case class LogicalPlanPojo( opsToViewResult: List[String], opsToReuseResult: List[String] ) + +case class PhysicalPlanPojo( + operators: List[PhysicalOpPojo], + links: List[PhysicalLink] +) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala index c3accc2c0f9..82a3462169b 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala @@ -1,11 +1,12 @@ package edu.uci.ics.texera.web.resource import com.typesafe.scalalogging.LazyLogging +import edu.uci.ics.amber.engine.architecture.deploysemantics.PhysicalOpPojo import edu.uci.ics.amber.engine.common.virtualidentity.WorkflowIdentity import edu.uci.ics.texera.Utils import edu.uci.ics.texera.web.auth.SessionUser import edu.uci.ics.texera.web.model.http.response.SchemaPropagationResponse -import edu.uci.ics.texera.web.model.websocket.request.LogicalPlanPojo +import edu.uci.ics.texera.web.model.websocket.request.{LogicalPlanPojo, PhysicalPlanPojo} import edu.uci.ics.texera.workflow.common.WorkflowContext import 
edu.uci.ics.texera.workflow.common.tuple.schema.Attribute import edu.uci.ics.texera.workflow.common.workflow.{PhysicalPlan, WorkflowCompiler} @@ -17,7 +18,7 @@ import javax.ws.rs.{Consumes, POST, Path, PathParam, Produces} import javax.ws.rs.core.MediaType case class WorkflowCompilationResponse( - physicalPlan: PhysicalPlan, + physicalPlan: PhysicalPlanPojo, operatorInputSchemas: Map[String, List[Option[List[Attribute]]]], operatorErrors: Map[String, String] ) @@ -30,7 +31,7 @@ class WorkflowCompilationResource extends LazyLogging { @Path("/{wid}") def compileWorkflow( workflowStr: String, - @PathParam("wid") wid: UInteger, + @PathParam("wid") wid: UInteger ): WorkflowCompilationResponse = { val logicalPlanPojo = Utils.objectMapper.readValue(workflowStr, classOf[LogicalPlanPojo]) @@ -39,11 +40,19 @@ class WorkflowCompilationResource extends LazyLogging { ) // compile the pojo - val workflowCompilationResult = new WorkflowCompiler(context).compileToPhysicalPlan(logicalPlanPojo) - + val workflowCompilationResult = + new WorkflowCompiler(context).compileToPhysicalPlan(logicalPlanPojo) + // get the physical plan from the compilation result + val physicalPlan = workflowCompilationResult.physicalPlan + // convert the physical plan to pojo, which is serializable + val physicalPlanPojo = PhysicalPlanPojo( + // the reason of using PhysicalOpPojo is because some fields in PhysicalOp is not serializable + physicalPlan.operators.map(op => PhysicalOpPojo(op)).toList, + physicalPlan.links.toList + ) // return the result WorkflowCompilationResponse( - physicalPlan = workflowCompilationResult.physicalPlan, + physicalPlan = physicalPlanPojo, operatorInputSchemas = workflowCompilationResult.operatorIdToInputSchemas.map { case (operatorIdentity, schemas) => val opId = operatorIdentity.id diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/metadata/PropertyNameConstants.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/metadata/PropertyNameConstants.scala index b21cd851788..5184afb433b 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/metadata/PropertyNameConstants.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/metadata/PropertyNameConstants.scala @@ -20,4 +20,17 @@ object PropertyNameConstants { // logical plan property names final val RESULT_ATTRIBUTE_NAME = "resultAttribute" final val SPAN_LIST_NAME = "spanListName" final val TABLE_NAME = "tableName" + + // physical plan property names + final val WORKFLOW_ID = "workflowID" + final val EXECUTION_ID = "executionID" + final val PARALLELIZABLE = "parallelizable" + final val LOCATION_PREFERENCE = "locationPreference" + final val PARTITION_REQUIREMENT = "partitionRequirement" + // derivePartition is a function type that cannot be serialized + final val INPUT_PORTS = "inputPorts" + final val OUTPUT_PORTS = "outputPorts" + // propagateSchema is a function type that cannot be serialized + final val IS_ONE_TO_MANY_OP = "isOneToManyOp" + final val SUGGESTED_WORKER_NUM = "suggestedWorkerNum" } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala index 3db345e1db7..c394ea93841 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala @@ -75,8 +75,8 @@ class WorkflowCompiler( * @return 
WorkflowCompilationResult, containing the physical plan, input schemas per op and error per op */ def compileToPhysicalPlan( - logicalPlanPojo: LogicalPlanPojo - ): WorkflowCompilationResult = { + logicalPlanPojo: LogicalPlanPojo + ): WorkflowCompilationResult = { val (logicalPlan, opIdToError) = compileToLogicalPlan(logicalPlanPojo) if (opIdToError.nonEmpty) { // encounter errors during compile pojo to logical plan, From 3cb86f654ebfa56a9ccbf2af94a6c672d0d881ae Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Sun, 25 Aug 2024 17:03:07 +0800 Subject: [PATCH 06/20] resolving comments --- ...yml => texera-compiling-service-web-config.yml} | 6 +++--- ...eraWorkflowCompilerWebServiceConfiguration.java | 6 ------ ....scala => TexeraWorkflowCompilingService.scala} | 14 +++++++------- ...exeraWorkflowCompilingServiceConfiguration.java | 6 ++++++ 4 files changed, 16 insertions(+), 16 deletions(-) rename core/amber/src/main/resources/{texera-compiler-web-config.yml => texera-compiling-service-web-config.yml} (83%) delete mode 100644 core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilerWebServiceConfiguration.java rename core/amber/src/main/scala/edu/uci/ics/texera/web/{TexeraWorkflowCompilerService.scala => TexeraWorkflowCompilingService.scala} (79%) create mode 100644 core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingServiceConfiguration.java diff --git a/core/amber/src/main/resources/texera-compiler-web-config.yml b/core/amber/src/main/resources/texera-compiling-service-web-config.yml similarity index 83% rename from core/amber/src/main/resources/texera-compiler-web-config.yml rename to core/amber/src/main/resources/texera-compiling-service-web-config.yml index f76a6ac5052..6a5ee49a047 100644 --- a/core/amber/src/main/resources/texera-compiler-web-config.yml +++ b/core/amber/src/main/resources/texera-compiling-service-web-config.yml @@ -14,12 +14,12 @@ server: appenders: - type: console - type: file - currentLogFilename: ../log/access-texera-compiler.log + currentLogFilename: ../log/access.log threshold: ALL queueSize: 512 discardingThreshold: 0 archive: true - archivedLogFilenamePattern: ../log/access-texera-compiler-%d{yyyy-MM-dd}.log.gz + archivedLogFilenamePattern: ../log/access-%d{yyyy-MM-dd}.log.gz archivedFileCount: 7 bufferSize: 8KiB immediateFlush: true @@ -31,7 +31,7 @@ logging: - type: console logFormat: "[%date{ISO8601}] [%level] [%logger] [%thread] - %msg %n" - type: file - currentLogFilename: ../log/texera-compiler-server.log + currentLogFilename: ../log/texera-workflow-compiling-service.log threshold: ALL queueSize: 512 discardingThreshold: 0 diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilerWebServiceConfiguration.java b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilerWebServiceConfiguration.java deleted file mode 100644 index 75d04bcbc9b..00000000000 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilerWebServiceConfiguration.java +++ /dev/null @@ -1,6 +0,0 @@ -package edu.uci.ics.texera.web; - -import io.dropwizard.Configuration; - -public class TexeraWorkflowCompilerWebServiceConfiguration extends Configuration { -} diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilerService.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala similarity index 79% rename from core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilerService.scala rename to 
core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala index 87f7066ace4..7aab5983558 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilerService.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala @@ -19,34 +19,34 @@ import io.dropwizard.setup.{Bootstrap, Environment} import org.eclipse.jetty.server.session.SessionHandler import org.glassfish.jersey.server.filter.RolesAllowedDynamicFeature -object TexeraWorkflowCompilerService { +object TexeraWorkflowCompilingService { def main(args: Array[String]): Unit = { val argMap = parseArgs(args) - new TexeraWorkflowCompilerService().run( + new TexeraWorkflowCompilingService().run( "server", Utils.amberHomePath .resolve("src") .resolve("main") .resolve("resources") - .resolve("texera-compiler-web-config.yml") + .resolve("texera-compiling-service-web-config.yml") .toString ) } } -class TexeraWorkflowCompilerService - extends io.dropwizard.Application[TexeraWorkflowCompilerWebServiceConfiguration] +class TexeraWorkflowCompilingService + extends io.dropwizard.Application[TexeraWorkflowCompilingServiceConfiguration] with LazyLogging { override def initialize( - bootstrap: Bootstrap[TexeraWorkflowCompilerWebServiceConfiguration] + bootstrap: Bootstrap[TexeraWorkflowCompilingServiceConfiguration] ): Unit = { // register scala module to dropwizard default object mapper bootstrap.getObjectMapper.registerModule(DefaultScalaModule) } override def run( - configuration: TexeraWorkflowCompilerWebServiceConfiguration, + configuration: TexeraWorkflowCompilingServiceConfiguration, environment: Environment ): Unit = { // serve backend at /api/texera diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingServiceConfiguration.java b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingServiceConfiguration.java new file mode 100644 index 00000000000..65f3a60fcaf --- /dev/null +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingServiceConfiguration.java @@ -0,0 +1,6 @@ +package edu.uci.ics.texera.web; + +import io.dropwizard.Configuration; + +public class TexeraWorkflowCompilingServiceConfiguration extends Configuration { +} From c0ea4fbb6ad255f1fe35300a5686c64e9930680d Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Thu, 29 Aug 2024 16:36:44 +0800 Subject: [PATCH 07/20] try to change physical op --- .../deploysemantics/PhysicalOp.scala | 576 ++++++++++-------- 1 file changed, 330 insertions(+), 246 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala index 1945e9135eb..f8a82cd1e35 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala @@ -2,32 +2,22 @@ package edu.uci.ics.amber.engine.architecture.deploysemantics import akka.actor.Deploy import akka.remote.RemoteScope +import com.fasterxml.jackson.annotation.JsonProperty import com.typesafe.scalalogging.LazyLogging import edu.uci.ics.amber.engine.architecture.common.AkkaActorService import edu.uci.ics.amber.engine.architecture.controller.execution.OperatorExecution -import edu.uci.ics.amber.engine.architecture.deploysemantics.layer.{ - OpExecInitInfo, - OpExecInitInfoWithCode -} -import 
edu.uci.ics.amber.engine.architecture.deploysemantics.locationpreference.{ - AddressInfo, - LocationPreference, - PreferController, - RoundRobinPreference -} +import edu.uci.ics.amber.engine.architecture.deploysemantics.layer.{OpExecInitInfo, OpExecInitInfoWithCode} +import edu.uci.ics.amber.engine.architecture.deploysemantics.locationpreference.{AddressInfo, LocationPreference, PreferController, RoundRobinPreference} import edu.uci.ics.amber.engine.architecture.pythonworker.PythonWorkflowWorker import edu.uci.ics.amber.engine.architecture.scheduling.config.OperatorConfig import edu.uci.ics.amber.engine.architecture.worker.WorkflowWorker -import edu.uci.ics.amber.engine.architecture.worker.WorkflowWorker.{ - FaultToleranceConfig, - StateRestoreConfig, - WorkerReplayInitialization -} +import edu.uci.ics.amber.engine.architecture.worker.WorkflowWorker.{FaultToleranceConfig, StateRestoreConfig, WorkerReplayInitialization} import edu.uci.ics.amber.engine.common.VirtualIdentityUtils import edu.uci.ics.amber.engine.common.virtualidentity._ import edu.uci.ics.amber.engine.common.workflow.{InputPort, OutputPort, PhysicalLink, PortIdentity} import edu.uci.ics.texera.workflow.common.tuple.schema.Schema import edu.uci.ics.texera.workflow.common.workflow._ +import net.minidev.json.annotate.JsonIgnore import org.jgrapht.graph.{DefaultEdge, DirectedAcyclicGraph} import org.jgrapht.traverse.TopologicalOrderIterator @@ -68,19 +58,20 @@ object PhysicalOp { ) def sourcePhysicalOp( - physicalOpId: PhysicalOpIdentity, - workflowId: WorkflowIdentity, - executionId: ExecutionIdentity, - opExecInitInfo: OpExecInitInfo - ): PhysicalOp = - PhysicalOp( - physicalOpId, - workflowId, - executionId, - opExecInitInfo, + physicalOpId: PhysicalOpIdentity, + workflowId: WorkflowIdentity, + executionId: ExecutionIdentity, + opExecInitInfo: OpExecInitInfo + ): PhysicalOp = { + new PhysicalOp( + id = physicalOpId, + workflowId = workflowId, + executionId = executionId, + opExecInitInfo = opExecInitInfo, parallelizable = false, locationPreference = Option(new PreferController()) ) + } def oneToOnePhysicalOp( workflowId: WorkflowIdentity, @@ -96,12 +87,18 @@ object PhysicalOp { ) def oneToOnePhysicalOp( - physicalOpId: PhysicalOpIdentity, - workflowId: WorkflowIdentity, - executionId: ExecutionIdentity, - opExecInitInfo: OpExecInitInfo - ): PhysicalOp = - PhysicalOp(physicalOpId, workflowId, executionId, opExecInitInfo = opExecInitInfo) + physicalOpId: PhysicalOpIdentity, + workflowId: WorkflowIdentity, + executionId: ExecutionIdentity, + opExecInitInfo: OpExecInitInfo + ): PhysicalOp = { + new PhysicalOp( + id = physicalOpId, + workflowId = workflowId, + executionId = executionId, + opExecInitInfo = opExecInitInfo + ) + } def manyToOnePhysicalOp( workflowId: WorkflowIdentity, @@ -117,16 +114,16 @@ object PhysicalOp { ) def manyToOnePhysicalOp( - physicalOpId: PhysicalOpIdentity, - workflowId: WorkflowIdentity, - executionId: ExecutionIdentity, - opExecInitInfo: OpExecInitInfo - ): PhysicalOp = { - PhysicalOp( - physicalOpId, - workflowId, - executionId, - opExecInitInfo, + physicalOpId: PhysicalOpIdentity, + workflowId: WorkflowIdentity, + executionId: ExecutionIdentity, + opExecInitInfo: OpExecInitInfo + ): PhysicalOp = { + new PhysicalOp( + id = physicalOpId, + workflowId = workflowId, + executionId = executionId, + opExecInitInfo = opExecInitInfo, parallelizable = false, partitionRequirement = List(Option(SinglePartition())), derivePartition = _ => SinglePartition() @@ -157,62 +154,61 @@ object PhysicalOp { } } -case class 
PhysicalOp( - // the identifier of this PhysicalOp - id: PhysicalOpIdentity, - // the workflow id number - workflowId: WorkflowIdentity, - // the execution id number - executionId: ExecutionIdentity, - // information regarding initializing an operator executor instance - opExecInitInfo: OpExecInitInfo, - // preference of parallelism - parallelizable: Boolean = true, - // preference of worker placement - locationPreference: Option[LocationPreference] = None, - // requirement of partition policy (hash/range/single/none) on inputs - partitionRequirement: List[Option[PartitionInfo]] = List(), - // derive the output partition info given the input partitions - // if not specified, by default the output partition is the same as input partition - derivePartition: List[PartitionInfo] => PartitionInfo = inputParts => inputParts.head, - // input/output ports of the physical operator - // for operators with multiple input/output ports: must set these variables properly - inputPorts: Map[PortIdentity, (InputPort, List[PhysicalLink], Either[Throwable, Schema])] = - Map.empty, - outputPorts: Map[PortIdentity, (OutputPort, List[PhysicalLink], Either[Throwable, Schema])] = - Map.empty, - // schema propagation function - propagateSchema: SchemaPropagationFunc = SchemaPropagationFunc(schemas => schemas), - isOneToManyOp: Boolean = false, - // hint for number of workers - suggestedWorkerNum: Option[Int] = None -) extends LazyLogging { - - // all the "dependee" links are also blocking +class PhysicalOp( + @JsonProperty("id") var id: PhysicalOpIdentity, + @JsonProperty("workflowId") var workflowId: WorkflowIdentity, + @JsonProperty("executionId") var executionId: ExecutionIdentity, + @JsonIgnore var opExecInitInfo: OpExecInitInfo, + @JsonProperty("parallelizable") var parallelizable: Boolean = true, + @JsonProperty("locationPreference") var locationPreference: Option[LocationPreference] = None, + @JsonProperty("partitionRequirement") var partitionRequirement: List[Option[PartitionInfo]] = List(), + @JsonIgnore var derivePartition: List[PartitionInfo] => PartitionInfo = inputParts => inputParts.head, + @JsonProperty("inputPorts") var inputPorts: Map[PortIdentity, (InputPort, List[PhysicalLink], Either[Throwable, Schema])] = Map.empty, + @JsonProperty("outputPorts") var outputPorts: Map[PortIdentity, (OutputPort, List[PhysicalLink], Either[Throwable, Schema])] = Map.empty, + @JsonIgnore var propagateSchema: SchemaPropagationFunc = SchemaPropagationFunc(schemas => schemas), + @JsonProperty("isOneToManyOp") var isOneToManyOp: Boolean = false, + @JsonProperty("suggestedWorkerNum") var suggestedWorkerNum: Option[Int] = None + ) extends LazyLogging { + + // Auxiliary constructor to allow creation with fewer parameters + def this( + id: PhysicalOpIdentity, + workflowId: WorkflowIdentity, + executionId: ExecutionIdentity, + opExecInitInfo: OpExecInitInfo + ) = { + this( + id, + workflowId, + executionId, + opExecInitInfo, + parallelizable = true, + locationPreference = None, + partitionRequirement = List(), + derivePartition = inputParts => inputParts.head, + inputPorts = Map.empty, + outputPorts = Map.empty, + propagateSchema = SchemaPropagationFunc(schemas => schemas), + isOneToManyOp = false, + suggestedWorkerNum = None + ) + } + + // Other methods and helper functions + private lazy val dependeeInputs: List[PortIdentity] = inputPorts.values - .flatMap({ + .flatMap { case (port, _, _) => port.dependencies - }) + } .toList .distinct private lazy val isInitWithCode: Boolean = 
opExecInitInfo.isInstanceOf[OpExecInitInfoWithCode] - /** - * Helper functions related to compile-time operations - */ - - def isSourceOperator: Boolean = { - inputPorts.isEmpty - } + def isSourceOperator: Boolean = inputPorts.isEmpty - /** - * Helper function used to determine whether the input link is a materialized link. - */ - def isSinkOperator: Boolean = { - outputPorts.forall(port => port._2._2.isEmpty) - } + def isSinkOperator: Boolean = outputPorts.forall(port => port._2._2.isEmpty) def isPythonBased: Boolean = { opExecInitInfo match { @@ -224,203 +220,314 @@ case class PhysicalOp( } def getPythonCode: String = { - val (code, _) = - opExecInitInfo.asInstanceOf[OpExecInitInfoWithCode].codeGen(0, 0) + val (code, _) = opExecInitInfo.asInstanceOf[OpExecInitInfoWithCode].codeGen(0, 0) code } - /** - * creates a copy with the location preference information - */ def withLocationPreference(preference: Option[LocationPreference]): PhysicalOp = { - this.copy(locationPreference = preference) + new PhysicalOp( + id, + workflowId, + executionId, + opExecInitInfo, + parallelizable, + preference, + partitionRequirement, + derivePartition, + inputPorts, + outputPorts, + propagateSchema, + isOneToManyOp, + suggestedWorkerNum + ) } - /** - * Creates a copy of the PhysicalOp with the specified input ports. Each input port is associated - * with an empty list of links and a None schema, reflecting the absence of predefined connections - * and schema information. - * - * @param inputs A list of InputPort instances to set as the new input ports. - * @return A new instance of PhysicalOp with the input ports updated. - */ def withInputPorts(inputs: List[InputPort]): PhysicalOp = { - this.copy(inputPorts = - inputs - .map(input => - input.id -> (input, List - .empty[PhysicalLink], Left(new SchemaNotAvailableException("schema is not available"))) - ) - .toMap + val newInputPorts = inputs.map(input => + input.id -> (input, List.empty[PhysicalLink], Left(new SchemaNotAvailableException("schema is not available"))) + ).toMap + new PhysicalOp( + id, + workflowId, + executionId, + opExecInitInfo, + parallelizable, + locationPreference, + partitionRequirement, + derivePartition, + newInputPorts, + outputPorts, + propagateSchema, + isOneToManyOp, + suggestedWorkerNum ) } - /** - * Creates a copy of the PhysicalOp with the specified output ports. Each output port is - * initialized with an empty list of links and a None schema, indicating - * the absence of outbound connections and schema details at this stage. - * - * @param outputs A list of OutputPort instances to set as the new output ports. - * @return A new instance of PhysicalOp with the output ports updated. - */ def withOutputPorts(outputs: List[OutputPort]): PhysicalOp = { - this.copy(outputPorts = - outputs - .map(output => - output.id -> (output, List - .empty[PhysicalLink], Left(new SchemaNotAvailableException("schema is not available"))) - ) - .toMap + val newOutputPorts = outputs.map(output => + output.id -> (output, List.empty[PhysicalLink], Left(new SchemaNotAvailableException("schema is not available"))) + ).toMap + new PhysicalOp( + id, + workflowId, + executionId, + opExecInitInfo, + parallelizable, + locationPreference, + partitionRequirement, + derivePartition, + inputPorts, + newOutputPorts, + propagateSchema, + isOneToManyOp, + suggestedWorkerNum ) } - /** - * creates a copy with suggested worker number. This is only to be used by Python UDF operators. 
- */ def withSuggestedWorkerNum(workerNum: Int): PhysicalOp = { - this.copy(suggestedWorkerNum = Some(workerNum)) + new PhysicalOp( + id, + workflowId, + executionId, + opExecInitInfo, + parallelizable, + locationPreference, + partitionRequirement, + derivePartition, + inputPorts, + outputPorts, + propagateSchema, + isOneToManyOp, + Some(workerNum) + ) } - /** - * creates a copy with the partition requirements - */ def withPartitionRequirement(partitionRequirements: List[Option[PartitionInfo]]): PhysicalOp = { - this.copy(partitionRequirement = partitionRequirements) + new PhysicalOp( + id, + workflowId, + executionId, + opExecInitInfo, + parallelizable, + locationPreference, + partitionRequirements, + derivePartition, + inputPorts, + outputPorts, + propagateSchema, + isOneToManyOp, + suggestedWorkerNum + ) } - /** - * creates a copy with the partition info derive function - */ def withDerivePartition(derivePartition: List[PartitionInfo] => PartitionInfo): PhysicalOp = { - this.copy(derivePartition = derivePartition) + new PhysicalOp( + id, + workflowId, + executionId, + opExecInitInfo, + parallelizable, + locationPreference, + partitionRequirement, + derivePartition, + inputPorts, + outputPorts, + propagateSchema, + isOneToManyOp, + suggestedWorkerNum + ) } - /** - * creates a copy with the parallelizable specified - */ - def withParallelizable(parallelizable: Boolean): PhysicalOp = - this.copy(parallelizable = parallelizable) + def withParallelizable(parallelizable: Boolean): PhysicalOp = { + new PhysicalOp( + id, + workflowId, + executionId, + opExecInitInfo, + parallelizable, + locationPreference, + partitionRequirement, + derivePartition, + inputPorts, + outputPorts, + propagateSchema, + isOneToManyOp, + suggestedWorkerNum + ) + } + + def withIsOneToManyOp(isOneToManyOp: Boolean): PhysicalOp = { + new PhysicalOp( + id, + workflowId, + executionId, + opExecInitInfo, + parallelizable, + locationPreference, + partitionRequirement, + derivePartition, + inputPorts, + outputPorts, + propagateSchema, + isOneToManyOp, + suggestedWorkerNum + ) + } - /** - * creates a copy with the specified property that whether this operator is one-to-many - */ - def withIsOneToManyOp(isOneToManyOp: Boolean): PhysicalOp = - this.copy(isOneToManyOp = isOneToManyOp) - - /** - * Creates a copy of the PhysicalOp with the schema of a specified input port updated. - * The schema can either be a successful schema definition or an error represented as a Throwable. - * - * @param portId The identity of the port to update. - * @param schema The new schema, or error, to be associated with the port, encapsulated within an Either. - * A Right value represents a successful schema, while a Left value represents an error (Throwable). - * @return A new instance of PhysicalOp with the updated input port schema or error information. 
- */ private def withInputSchema( - portId: PortIdentity, - schema: Either[Throwable, Schema] - ): PhysicalOp = { - this.copy(inputPorts = inputPorts.updatedWith(portId) { - case Some((port, links, _)) => Some((port, links, schema)) - case None => None - }) + portId: PortIdentity, + schema: Either[Throwable, Schema] + ): PhysicalOp = { + new PhysicalOp( + id, + workflowId, + executionId, + opExecInitInfo, + parallelizable, + locationPreference, + partitionRequirement, + derivePartition, + inputPorts.updated(portId, inputPorts(portId).copy(_3 = schema)), + outputPorts, + propagateSchema, + isOneToManyOp, + suggestedWorkerNum + ) } - /** - * Creates a copy of the PhysicalOp with the schema of a specified output port updated. - * Similar to `withInputSchema`, the schema can either represent a successful schema definition - * or an error, encapsulated as an Either type. - * - * @param portId The identity of the port to update. - * @param schema The new schema, or error, to be associated with the port, encapsulated within an Either. - * A Right value indicates a successful schema, while a Left value indicates an error (Throwable). - * @return A new instance of PhysicalOp with the updated output port schema or error information. - */ private def withOutputSchema( - portId: PortIdentity, - schema: Either[Throwable, Schema] - ): PhysicalOp = { - this.copy(outputPorts = outputPorts.updatedWith(portId) { - case Some((port, links, _)) => Some((port, links, schema)) - case None => None - }) + portId: PortIdentity, + schema: Either[Throwable, Schema] + ): PhysicalOp = { + new PhysicalOp( + id, + workflowId, + executionId, + opExecInitInfo, + parallelizable, + locationPreference, + partitionRequirement, + derivePartition, + inputPorts, + outputPorts.updated(portId, outputPorts(portId).copy(_3 = schema)), + propagateSchema, + isOneToManyOp, + suggestedWorkerNum + ) } - /** - * creates a copy with the schema propagation function. 
- */ def withPropagateSchema(func: SchemaPropagationFunc): PhysicalOp = { - this.copy(propagateSchema = func) + new PhysicalOp( + id, + workflowId, + executionId, + opExecInitInfo, + parallelizable, + locationPreference, + partitionRequirement, + derivePartition, + inputPorts, + outputPorts, + func, + isOneToManyOp, + suggestedWorkerNum + ) } - /** - * creates a copy with an additional input link specified on an input port - */ def addInputLink(link: PhysicalLink): PhysicalOp = { assert(link.toOpId == id) assert(inputPorts.contains(link.toPortId)) val (port, existingLinks, schema) = inputPorts(link.toPortId) val newLinks = existingLinks :+ link - this.copy( - inputPorts = inputPorts + (link.toPortId -> (port, newLinks, schema)) + new PhysicalOp( + id, + workflowId, + executionId, + opExecInitInfo, + parallelizable, + locationPreference, + partitionRequirement, + derivePartition, + inputPorts + (link.toPortId -> (port, newLinks, schema)), + outputPorts, + propagateSchema, + isOneToManyOp, + suggestedWorkerNum ) } - /** - * creates a copy with an additional output link specified on an output port - */ def addOutputLink(link: PhysicalLink): PhysicalOp = { assert(link.fromOpId == id) assert(outputPorts.contains(link.fromPortId)) val (port, existingLinks, schema) = outputPorts(link.fromPortId) val newLinks = existingLinks :+ link - this.copy( - outputPorts = outputPorts + (link.fromPortId -> (port, newLinks, schema)) + new PhysicalOp( + id, + workflowId, + executionId, + opExecInitInfo, + parallelizable, + locationPreference, + partitionRequirement, + derivePartition, + inputPorts, + outputPorts + (link.fromPortId -> (port, newLinks, schema)), + propagateSchema, + isOneToManyOp, + suggestedWorkerNum ) } - /** - * creates a copy with a removed input link - */ def removeInputLink(linkToRemove: PhysicalLink): PhysicalOp = { val portId = linkToRemove.toPortId val (port, existingLinks, schema) = inputPorts(portId) - this.copy( - inputPorts = - inputPorts + (portId -> (port, existingLinks.filter(link => link != linkToRemove), schema)) + new PhysicalOp( + id, + workflowId, + executionId, + opExecInitInfo, + parallelizable, + locationPreference, + partitionRequirement, + derivePartition, + inputPorts + (portId -> (port, existingLinks.filter(link => link != linkToRemove), schema)), + outputPorts, + propagateSchema, + isOneToManyOp, + suggestedWorkerNum ) } - /** - * creates a copy with a removed output link - */ def removeOutputLink(linkToRemove: PhysicalLink): PhysicalOp = { val portId = linkToRemove.fromPortId val (port, existingLinks, schema) = outputPorts(portId) - this.copy( - outputPorts = - outputPorts + (portId -> (port, existingLinks.filter(link => link != linkToRemove), schema)) + new PhysicalOp( + id, + workflowId, + executionId, + opExecInitInfo, + parallelizable, + locationPreference, + partitionRequirement, + derivePartition, + inputPorts, + outputPorts + (portId -> (port, existingLinks.filter(link => link != linkToRemove), schema)), + propagateSchema, + isOneToManyOp, + suggestedWorkerNum ) } - /** - * creates a copy with an input schema updated, and if all input schemas are available, propagate - * the schema change to output schemas. - * @param newInputSchema optionally provide a schema for an input port. 
- */ def propagateSchema(newInputSchema: Option[(PortIdentity, Schema)] = None): PhysicalOp = { - // Update the input schema if a new one is provided val updatedOp = newInputSchema.foldLeft(this) { case (op, (portId, schema)) => op.withInputSchema(portId, Right(schema)) } - // Extract input schemas, checking if all are defined val inputSchemas = updatedOp.inputPorts.collect { case (portId, (_, _, Right(schema))) => portId -> schema } if (updatedOp.inputPorts.size == inputSchemas.size) { - // All input schemas are available, propagate to output schema val schemaPropagationResult = Try(propagateSchema.func(inputSchemas)) schemaPropagationResult match { case Success(schemaMapping) => @@ -429,20 +536,15 @@ case class PhysicalOp( op.withOutputSchema(portId, Right(schema)) } case Failure(exception) => - // apply the exception to all output ports in case of failure updatedOp.outputPorts.keys.foldLeft(updatedOp) { (op, portId) => op.withOutputSchema(portId, Left(exception)) } } } else { - // Not all input schemas are defined, return the updated operation without changes updatedOp } } - /** - * returns all output links. Optionally, if a specific portId is provided, returns the links connected to that portId. - */ def getOutputLinks(portId: PortIdentity): List[PhysicalLink] = { outputPorts.values .flatMap(_._2) @@ -450,9 +552,6 @@ case class PhysicalOp( .toList } - /** - * returns all input links. Optionally, if a specific portId is provided, returns the links connected to that portId. - */ def getInputLinks(portIdOpt: Option[PortIdentity] = None): List[PhysicalLink] = { inputPorts.values .flatMap(_._2) @@ -465,33 +564,20 @@ case class PhysicalOp( ) } - /** - * Tells whether the input port the link connects to is depended by another input . - */ def isInputLinkDependee(link: PhysicalLink): Boolean = { dependeeInputs.contains(link.toPortId) } - /** - * Tells whether the output on this link is blocking i.e. the operator doesn't output anything till this link - * outputs all its tuples. - */ def isOutputLinkBlocking(link: PhysicalLink): Boolean = { this.outputPorts(link.fromPortId)._1.blocking } - /** - * Some operators process their inputs in a particular order. Eg: 2 phase hash join first - * processes the build input, then the probe input. 
- */ def getInputLinksInProcessingOrder: List[PhysicalLink] = { - val dependencyDag = { - new DirectedAcyclicGraph[PhysicalLink, DefaultEdge](classOf[DefaultEdge]) - } + val dependencyDag = new DirectedAcyclicGraph[PhysicalLink, DefaultEdge](classOf[DefaultEdge]) inputPorts.values .map(_._1) .flatMap(port => port.dependencies.map(dependee => port.id -> dependee)) - .foreach({ + .foreach { case (depender: PortIdentity, dependee: PortIdentity) => val upstreamLink = getInputLinks(Some(dependee)).head val downstreamLink = getInputLinks(Some(depender)).head @@ -502,7 +588,7 @@ case class PhysicalOp( dependencyDag.addVertex(downstreamLink) } dependencyDag.addEdge(upstreamLink, downstreamLink) - }) + } val topologicalIterator = new TopologicalOrderIterator[PhysicalLink, DefaultEdge](dependencyDag) val processingOrder = new ArrayBuffer[PhysicalLink]() @@ -513,12 +599,12 @@ case class PhysicalOp( } def build( - controllerActorService: AkkaActorService, - operatorExecution: OperatorExecution, - operatorConfig: OperatorConfig, - stateRestoreConfig: Option[StateRestoreConfig], - replayLoggingConfig: Option[FaultToleranceConfig] - ): Unit = { + controllerActorService: AkkaActorService, + operatorExecution: OperatorExecution, + operatorConfig: OperatorConfig, + stateRestoreConfig: Option[StateRestoreConfig], + replayLoggingConfig: Option[FaultToleranceConfig] + ): Unit = { val addressInfo = AddressInfo( controllerActorService.getClusterNodeAddresses, controllerActorService.self.path.address @@ -541,8 +627,6 @@ case class PhysicalOp( ) ) } - // Note: At this point, we don't know if the actor is fully initialized. - // Thus, the ActorRef returned from `controllerActorService.actorOf` is ignored. controllerActorService.actorOf( workflowWorker.withDeploy(Deploy(scope = RemoteScope(preferredAddress))) ) From 19428e1c377431f1009afd4a8b54e3e788a9c88a Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Thu, 29 Aug 2024 16:40:36 +0800 Subject: [PATCH 08/20] Revert "try to change physical op" This reverts commit c31a0ff6b50c867ecc7a21af81bf0766a0b9ffab. 
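The revert below restores PhysicalOp as a case class, so every immutable "withX" helper can stay a one-line this.copy(...) instead of re-listing the constructor arguments. A minimal, self-contained sketch of that pattern, with illustrative names only (OpSketch is not the real PhysicalOp API):

    // copy(...) carries over every field that is not explicitly overridden
    final case class OpSketch(
        id: String,
        parallelizable: Boolean = true,
        suggestedWorkerNum: Option[Int] = None
    ) {
      def withSuggestedWorkerNum(n: Int): OpSketch = copy(suggestedWorkerNum = Some(n))
      def withParallelizable(p: Boolean): OpSketch = copy(parallelizable = p)
    }

    object OpSketchDemo extends App {
      // prints OpSketch(scan,false,Some(4)) -- id survives both copies untouched
      println(OpSketch(id = "scan").withParallelizable(false).withSuggestedWorkerNum(4))
    }

With a plain class, each helper has to pass all thirteen constructor arguments to new PhysicalOp(...) by hand, which is the boilerplate this revert removes.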
--- .../deploysemantics/PhysicalOp.scala | 576 ++++++++---------- 1 file changed, 246 insertions(+), 330 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala index f8a82cd1e35..1945e9135eb 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala @@ -2,22 +2,32 @@ package edu.uci.ics.amber.engine.architecture.deploysemantics import akka.actor.Deploy import akka.remote.RemoteScope -import com.fasterxml.jackson.annotation.JsonProperty import com.typesafe.scalalogging.LazyLogging import edu.uci.ics.amber.engine.architecture.common.AkkaActorService import edu.uci.ics.amber.engine.architecture.controller.execution.OperatorExecution -import edu.uci.ics.amber.engine.architecture.deploysemantics.layer.{OpExecInitInfo, OpExecInitInfoWithCode} -import edu.uci.ics.amber.engine.architecture.deploysemantics.locationpreference.{AddressInfo, LocationPreference, PreferController, RoundRobinPreference} +import edu.uci.ics.amber.engine.architecture.deploysemantics.layer.{ + OpExecInitInfo, + OpExecInitInfoWithCode +} +import edu.uci.ics.amber.engine.architecture.deploysemantics.locationpreference.{ + AddressInfo, + LocationPreference, + PreferController, + RoundRobinPreference +} import edu.uci.ics.amber.engine.architecture.pythonworker.PythonWorkflowWorker import edu.uci.ics.amber.engine.architecture.scheduling.config.OperatorConfig import edu.uci.ics.amber.engine.architecture.worker.WorkflowWorker -import edu.uci.ics.amber.engine.architecture.worker.WorkflowWorker.{FaultToleranceConfig, StateRestoreConfig, WorkerReplayInitialization} +import edu.uci.ics.amber.engine.architecture.worker.WorkflowWorker.{ + FaultToleranceConfig, + StateRestoreConfig, + WorkerReplayInitialization +} import edu.uci.ics.amber.engine.common.VirtualIdentityUtils import edu.uci.ics.amber.engine.common.virtualidentity._ import edu.uci.ics.amber.engine.common.workflow.{InputPort, OutputPort, PhysicalLink, PortIdentity} import edu.uci.ics.texera.workflow.common.tuple.schema.Schema import edu.uci.ics.texera.workflow.common.workflow._ -import net.minidev.json.annotate.JsonIgnore import org.jgrapht.graph.{DefaultEdge, DirectedAcyclicGraph} import org.jgrapht.traverse.TopologicalOrderIterator @@ -58,20 +68,19 @@ object PhysicalOp { ) def sourcePhysicalOp( - physicalOpId: PhysicalOpIdentity, - workflowId: WorkflowIdentity, - executionId: ExecutionIdentity, - opExecInitInfo: OpExecInitInfo - ): PhysicalOp = { - new PhysicalOp( - id = physicalOpId, - workflowId = workflowId, - executionId = executionId, - opExecInitInfo = opExecInitInfo, + physicalOpId: PhysicalOpIdentity, + workflowId: WorkflowIdentity, + executionId: ExecutionIdentity, + opExecInitInfo: OpExecInitInfo + ): PhysicalOp = + PhysicalOp( + physicalOpId, + workflowId, + executionId, + opExecInitInfo, parallelizable = false, locationPreference = Option(new PreferController()) ) - } def oneToOnePhysicalOp( workflowId: WorkflowIdentity, @@ -87,18 +96,12 @@ object PhysicalOp { ) def oneToOnePhysicalOp( - physicalOpId: PhysicalOpIdentity, - workflowId: WorkflowIdentity, - executionId: ExecutionIdentity, - opExecInitInfo: OpExecInitInfo - ): PhysicalOp = { - new PhysicalOp( - id = physicalOpId, - workflowId = workflowId, - executionId = executionId, - opExecInitInfo 
= opExecInitInfo - ) - } + physicalOpId: PhysicalOpIdentity, + workflowId: WorkflowIdentity, + executionId: ExecutionIdentity, + opExecInitInfo: OpExecInitInfo + ): PhysicalOp = + PhysicalOp(physicalOpId, workflowId, executionId, opExecInitInfo = opExecInitInfo) def manyToOnePhysicalOp( workflowId: WorkflowIdentity, @@ -114,16 +117,16 @@ object PhysicalOp { ) def manyToOnePhysicalOp( - physicalOpId: PhysicalOpIdentity, - workflowId: WorkflowIdentity, - executionId: ExecutionIdentity, - opExecInitInfo: OpExecInitInfo - ): PhysicalOp = { - new PhysicalOp( - id = physicalOpId, - workflowId = workflowId, - executionId = executionId, - opExecInitInfo = opExecInitInfo, + physicalOpId: PhysicalOpIdentity, + workflowId: WorkflowIdentity, + executionId: ExecutionIdentity, + opExecInitInfo: OpExecInitInfo + ): PhysicalOp = { + PhysicalOp( + physicalOpId, + workflowId, + executionId, + opExecInitInfo, parallelizable = false, partitionRequirement = List(Option(SinglePartition())), derivePartition = _ => SinglePartition() @@ -154,61 +157,62 @@ object PhysicalOp { } } -class PhysicalOp( - @JsonProperty("id") var id: PhysicalOpIdentity, - @JsonProperty("workflowId") var workflowId: WorkflowIdentity, - @JsonProperty("executionId") var executionId: ExecutionIdentity, - @JsonIgnore var opExecInitInfo: OpExecInitInfo, - @JsonProperty("parallelizable") var parallelizable: Boolean = true, - @JsonProperty("locationPreference") var locationPreference: Option[LocationPreference] = None, - @JsonProperty("partitionRequirement") var partitionRequirement: List[Option[PartitionInfo]] = List(), - @JsonIgnore var derivePartition: List[PartitionInfo] => PartitionInfo = inputParts => inputParts.head, - @JsonProperty("inputPorts") var inputPorts: Map[PortIdentity, (InputPort, List[PhysicalLink], Either[Throwable, Schema])] = Map.empty, - @JsonProperty("outputPorts") var outputPorts: Map[PortIdentity, (OutputPort, List[PhysicalLink], Either[Throwable, Schema])] = Map.empty, - @JsonIgnore var propagateSchema: SchemaPropagationFunc = SchemaPropagationFunc(schemas => schemas), - @JsonProperty("isOneToManyOp") var isOneToManyOp: Boolean = false, - @JsonProperty("suggestedWorkerNum") var suggestedWorkerNum: Option[Int] = None - ) extends LazyLogging { - - // Auxiliary constructor to allow creation with fewer parameters - def this( - id: PhysicalOpIdentity, - workflowId: WorkflowIdentity, - executionId: ExecutionIdentity, - opExecInitInfo: OpExecInitInfo - ) = { - this( - id, - workflowId, - executionId, - opExecInitInfo, - parallelizable = true, - locationPreference = None, - partitionRequirement = List(), - derivePartition = inputParts => inputParts.head, - inputPorts = Map.empty, - outputPorts = Map.empty, - propagateSchema = SchemaPropagationFunc(schemas => schemas), - isOneToManyOp = false, - suggestedWorkerNum = None - ) - } - - // Other methods and helper functions - +case class PhysicalOp( + // the identifier of this PhysicalOp + id: PhysicalOpIdentity, + // the workflow id number + workflowId: WorkflowIdentity, + // the execution id number + executionId: ExecutionIdentity, + // information regarding initializing an operator executor instance + opExecInitInfo: OpExecInitInfo, + // preference of parallelism + parallelizable: Boolean = true, + // preference of worker placement + locationPreference: Option[LocationPreference] = None, + // requirement of partition policy (hash/range/single/none) on inputs + partitionRequirement: List[Option[PartitionInfo]] = List(), + // derive the output partition info given the input 
partitions + // if not specified, by default the output partition is the same as input partition + derivePartition: List[PartitionInfo] => PartitionInfo = inputParts => inputParts.head, + // input/output ports of the physical operator + // for operators with multiple input/output ports: must set these variables properly + inputPorts: Map[PortIdentity, (InputPort, List[PhysicalLink], Either[Throwable, Schema])] = + Map.empty, + outputPorts: Map[PortIdentity, (OutputPort, List[PhysicalLink], Either[Throwable, Schema])] = + Map.empty, + // schema propagation function + propagateSchema: SchemaPropagationFunc = SchemaPropagationFunc(schemas => schemas), + isOneToManyOp: Boolean = false, + // hint for number of workers + suggestedWorkerNum: Option[Int] = None +) extends LazyLogging { + + // all the "dependee" links are also blocking private lazy val dependeeInputs: List[PortIdentity] = inputPorts.values - .flatMap { + .flatMap({ case (port, _, _) => port.dependencies - } + }) .toList .distinct private lazy val isInitWithCode: Boolean = opExecInitInfo.isInstanceOf[OpExecInitInfoWithCode] - def isSourceOperator: Boolean = inputPorts.isEmpty + /** + * Helper functions related to compile-time operations + */ + + def isSourceOperator: Boolean = { + inputPorts.isEmpty + } - def isSinkOperator: Boolean = outputPorts.forall(port => port._2._2.isEmpty) + /** + * Helper function used to determine whether the input link is a materialized link. + */ + def isSinkOperator: Boolean = { + outputPorts.forall(port => port._2._2.isEmpty) + } def isPythonBased: Boolean = { opExecInitInfo match { @@ -220,314 +224,203 @@ class PhysicalOp( } def getPythonCode: String = { - val (code, _) = opExecInitInfo.asInstanceOf[OpExecInitInfoWithCode].codeGen(0, 0) + val (code, _) = + opExecInitInfo.asInstanceOf[OpExecInitInfoWithCode].codeGen(0, 0) code } + /** + * creates a copy with the location preference information + */ def withLocationPreference(preference: Option[LocationPreference]): PhysicalOp = { - new PhysicalOp( - id, - workflowId, - executionId, - opExecInitInfo, - parallelizable, - preference, - partitionRequirement, - derivePartition, - inputPorts, - outputPorts, - propagateSchema, - isOneToManyOp, - suggestedWorkerNum - ) + this.copy(locationPreference = preference) } + /** + * Creates a copy of the PhysicalOp with the specified input ports. Each input port is associated + * with an empty list of links and a None schema, reflecting the absence of predefined connections + * and schema information. + * + * @param inputs A list of InputPort instances to set as the new input ports. + * @return A new instance of PhysicalOp with the input ports updated. + */ def withInputPorts(inputs: List[InputPort]): PhysicalOp = { - val newInputPorts = inputs.map(input => - input.id -> (input, List.empty[PhysicalLink], Left(new SchemaNotAvailableException("schema is not available"))) - ).toMap - new PhysicalOp( - id, - workflowId, - executionId, - opExecInitInfo, - parallelizable, - locationPreference, - partitionRequirement, - derivePartition, - newInputPorts, - outputPorts, - propagateSchema, - isOneToManyOp, - suggestedWorkerNum + this.copy(inputPorts = + inputs + .map(input => + input.id -> (input, List + .empty[PhysicalLink], Left(new SchemaNotAvailableException("schema is not available"))) + ) + .toMap ) } + /** + * Creates a copy of the PhysicalOp with the specified output ports. 
Each output port is + * initialized with an empty list of links and a None schema, indicating + * the absence of outbound connections and schema details at this stage. + * + * @param outputs A list of OutputPort instances to set as the new output ports. + * @return A new instance of PhysicalOp with the output ports updated. + */ def withOutputPorts(outputs: List[OutputPort]): PhysicalOp = { - val newOutputPorts = outputs.map(output => - output.id -> (output, List.empty[PhysicalLink], Left(new SchemaNotAvailableException("schema is not available"))) - ).toMap - new PhysicalOp( - id, - workflowId, - executionId, - opExecInitInfo, - parallelizable, - locationPreference, - partitionRequirement, - derivePartition, - inputPorts, - newOutputPorts, - propagateSchema, - isOneToManyOp, - suggestedWorkerNum + this.copy(outputPorts = + outputs + .map(output => + output.id -> (output, List + .empty[PhysicalLink], Left(new SchemaNotAvailableException("schema is not available"))) + ) + .toMap ) } + /** + * creates a copy with suggested worker number. This is only to be used by Python UDF operators. + */ def withSuggestedWorkerNum(workerNum: Int): PhysicalOp = { - new PhysicalOp( - id, - workflowId, - executionId, - opExecInitInfo, - parallelizable, - locationPreference, - partitionRequirement, - derivePartition, - inputPorts, - outputPorts, - propagateSchema, - isOneToManyOp, - Some(workerNum) - ) + this.copy(suggestedWorkerNum = Some(workerNum)) } + /** + * creates a copy with the partition requirements + */ def withPartitionRequirement(partitionRequirements: List[Option[PartitionInfo]]): PhysicalOp = { - new PhysicalOp( - id, - workflowId, - executionId, - opExecInitInfo, - parallelizable, - locationPreference, - partitionRequirements, - derivePartition, - inputPorts, - outputPorts, - propagateSchema, - isOneToManyOp, - suggestedWorkerNum - ) + this.copy(partitionRequirement = partitionRequirements) } + /** + * creates a copy with the partition info derive function + */ def withDerivePartition(derivePartition: List[PartitionInfo] => PartitionInfo): PhysicalOp = { - new PhysicalOp( - id, - workflowId, - executionId, - opExecInitInfo, - parallelizable, - locationPreference, - partitionRequirement, - derivePartition, - inputPorts, - outputPorts, - propagateSchema, - isOneToManyOp, - suggestedWorkerNum - ) - } - - def withParallelizable(parallelizable: Boolean): PhysicalOp = { - new PhysicalOp( - id, - workflowId, - executionId, - opExecInitInfo, - parallelizable, - locationPreference, - partitionRequirement, - derivePartition, - inputPorts, - outputPorts, - propagateSchema, - isOneToManyOp, - suggestedWorkerNum - ) + this.copy(derivePartition = derivePartition) } - def withIsOneToManyOp(isOneToManyOp: Boolean): PhysicalOp = { - new PhysicalOp( - id, - workflowId, - executionId, - opExecInitInfo, - parallelizable, - locationPreference, - partitionRequirement, - derivePartition, - inputPorts, - outputPorts, - propagateSchema, - isOneToManyOp, - suggestedWorkerNum - ) - } + /** + * creates a copy with the parallelizable specified + */ + def withParallelizable(parallelizable: Boolean): PhysicalOp = + this.copy(parallelizable = parallelizable) + /** + * creates a copy with the specified property that whether this operator is one-to-many + */ + def withIsOneToManyOp(isOneToManyOp: Boolean): PhysicalOp = + this.copy(isOneToManyOp = isOneToManyOp) + + /** + * Creates a copy of the PhysicalOp with the schema of a specified input port updated. 
+ * The schema can either be a successful schema definition or an error represented as a Throwable. + * + * @param portId The identity of the port to update. + * @param schema The new schema, or error, to be associated with the port, encapsulated within an Either. + * A Right value represents a successful schema, while a Left value represents an error (Throwable). + * @return A new instance of PhysicalOp with the updated input port schema or error information. + */ private def withInputSchema( - portId: PortIdentity, - schema: Either[Throwable, Schema] - ): PhysicalOp = { - new PhysicalOp( - id, - workflowId, - executionId, - opExecInitInfo, - parallelizable, - locationPreference, - partitionRequirement, - derivePartition, - inputPorts.updated(portId, inputPorts(portId).copy(_3 = schema)), - outputPorts, - propagateSchema, - isOneToManyOp, - suggestedWorkerNum - ) + portId: PortIdentity, + schema: Either[Throwable, Schema] + ): PhysicalOp = { + this.copy(inputPorts = inputPorts.updatedWith(portId) { + case Some((port, links, _)) => Some((port, links, schema)) + case None => None + }) } + /** + * Creates a copy of the PhysicalOp with the schema of a specified output port updated. + * Similar to `withInputSchema`, the schema can either represent a successful schema definition + * or an error, encapsulated as an Either type. + * + * @param portId The identity of the port to update. + * @param schema The new schema, or error, to be associated with the port, encapsulated within an Either. + * A Right value indicates a successful schema, while a Left value indicates an error (Throwable). + * @return A new instance of PhysicalOp with the updated output port schema or error information. + */ private def withOutputSchema( - portId: PortIdentity, - schema: Either[Throwable, Schema] - ): PhysicalOp = { - new PhysicalOp( - id, - workflowId, - executionId, - opExecInitInfo, - parallelizable, - locationPreference, - partitionRequirement, - derivePartition, - inputPorts, - outputPorts.updated(portId, outputPorts(portId).copy(_3 = schema)), - propagateSchema, - isOneToManyOp, - suggestedWorkerNum - ) + portId: PortIdentity, + schema: Either[Throwable, Schema] + ): PhysicalOp = { + this.copy(outputPorts = outputPorts.updatedWith(portId) { + case Some((port, links, _)) => Some((port, links, schema)) + case None => None + }) } + /** + * creates a copy with the schema propagation function. 
+ */ def withPropagateSchema(func: SchemaPropagationFunc): PhysicalOp = { - new PhysicalOp( - id, - workflowId, - executionId, - opExecInitInfo, - parallelizable, - locationPreference, - partitionRequirement, - derivePartition, - inputPorts, - outputPorts, - func, - isOneToManyOp, - suggestedWorkerNum - ) + this.copy(propagateSchema = func) } + /** + * creates a copy with an additional input link specified on an input port + */ def addInputLink(link: PhysicalLink): PhysicalOp = { assert(link.toOpId == id) assert(inputPorts.contains(link.toPortId)) val (port, existingLinks, schema) = inputPorts(link.toPortId) val newLinks = existingLinks :+ link - new PhysicalOp( - id, - workflowId, - executionId, - opExecInitInfo, - parallelizable, - locationPreference, - partitionRequirement, - derivePartition, - inputPorts + (link.toPortId -> (port, newLinks, schema)), - outputPorts, - propagateSchema, - isOneToManyOp, - suggestedWorkerNum + this.copy( + inputPorts = inputPorts + (link.toPortId -> (port, newLinks, schema)) ) } + /** + * creates a copy with an additional output link specified on an output port + */ def addOutputLink(link: PhysicalLink): PhysicalOp = { assert(link.fromOpId == id) assert(outputPorts.contains(link.fromPortId)) val (port, existingLinks, schema) = outputPorts(link.fromPortId) val newLinks = existingLinks :+ link - new PhysicalOp( - id, - workflowId, - executionId, - opExecInitInfo, - parallelizable, - locationPreference, - partitionRequirement, - derivePartition, - inputPorts, - outputPorts + (link.fromPortId -> (port, newLinks, schema)), - propagateSchema, - isOneToManyOp, - suggestedWorkerNum + this.copy( + outputPorts = outputPorts + (link.fromPortId -> (port, newLinks, schema)) ) } + /** + * creates a copy with a removed input link + */ def removeInputLink(linkToRemove: PhysicalLink): PhysicalOp = { val portId = linkToRemove.toPortId val (port, existingLinks, schema) = inputPorts(portId) - new PhysicalOp( - id, - workflowId, - executionId, - opExecInitInfo, - parallelizable, - locationPreference, - partitionRequirement, - derivePartition, - inputPorts + (portId -> (port, existingLinks.filter(link => link != linkToRemove), schema)), - outputPorts, - propagateSchema, - isOneToManyOp, - suggestedWorkerNum + this.copy( + inputPorts = + inputPorts + (portId -> (port, existingLinks.filter(link => link != linkToRemove), schema)) ) } + /** + * creates a copy with a removed output link + */ def removeOutputLink(linkToRemove: PhysicalLink): PhysicalOp = { val portId = linkToRemove.fromPortId val (port, existingLinks, schema) = outputPorts(portId) - new PhysicalOp( - id, - workflowId, - executionId, - opExecInitInfo, - parallelizable, - locationPreference, - partitionRequirement, - derivePartition, - inputPorts, - outputPorts + (portId -> (port, existingLinks.filter(link => link != linkToRemove), schema)), - propagateSchema, - isOneToManyOp, - suggestedWorkerNum + this.copy( + outputPorts = + outputPorts + (portId -> (port, existingLinks.filter(link => link != linkToRemove), schema)) ) } + /** + * creates a copy with an input schema updated, and if all input schemas are available, propagate + * the schema change to output schemas. + * @param newInputSchema optionally provide a schema for an input port. 
+ */ def propagateSchema(newInputSchema: Option[(PortIdentity, Schema)] = None): PhysicalOp = { + // Update the input schema if a new one is provided val updatedOp = newInputSchema.foldLeft(this) { case (op, (portId, schema)) => op.withInputSchema(portId, Right(schema)) } + // Extract input schemas, checking if all are defined val inputSchemas = updatedOp.inputPorts.collect { case (portId, (_, _, Right(schema))) => portId -> schema } if (updatedOp.inputPorts.size == inputSchemas.size) { + // All input schemas are available, propagate to output schema val schemaPropagationResult = Try(propagateSchema.func(inputSchemas)) schemaPropagationResult match { case Success(schemaMapping) => @@ -536,15 +429,20 @@ class PhysicalOp( op.withOutputSchema(portId, Right(schema)) } case Failure(exception) => + // apply the exception to all output ports in case of failure updatedOp.outputPorts.keys.foldLeft(updatedOp) { (op, portId) => op.withOutputSchema(portId, Left(exception)) } } } else { + // Not all input schemas are defined, return the updated operation without changes updatedOp } } + /** + * returns all output links. Optionally, if a specific portId is provided, returns the links connected to that portId. + */ def getOutputLinks(portId: PortIdentity): List[PhysicalLink] = { outputPorts.values .flatMap(_._2) @@ -552,6 +450,9 @@ class PhysicalOp( .toList } + /** + * returns all input links. Optionally, if a specific portId is provided, returns the links connected to that portId. + */ def getInputLinks(portIdOpt: Option[PortIdentity] = None): List[PhysicalLink] = { inputPorts.values .flatMap(_._2) @@ -564,20 +465,33 @@ class PhysicalOp( ) } + /** + * Tells whether the input port the link connects to is depended by another input . + */ def isInputLinkDependee(link: PhysicalLink): Boolean = { dependeeInputs.contains(link.toPortId) } + /** + * Tells whether the output on this link is blocking i.e. the operator doesn't output anything till this link + * outputs all its tuples. + */ def isOutputLinkBlocking(link: PhysicalLink): Boolean = { this.outputPorts(link.fromPortId)._1.blocking } + /** + * Some operators process their inputs in a particular order. Eg: 2 phase hash join first + * processes the build input, then the probe input. 
+ */ def getInputLinksInProcessingOrder: List[PhysicalLink] = { - val dependencyDag = new DirectedAcyclicGraph[PhysicalLink, DefaultEdge](classOf[DefaultEdge]) + val dependencyDag = { + new DirectedAcyclicGraph[PhysicalLink, DefaultEdge](classOf[DefaultEdge]) + } inputPorts.values .map(_._1) .flatMap(port => port.dependencies.map(dependee => port.id -> dependee)) - .foreach { + .foreach({ case (depender: PortIdentity, dependee: PortIdentity) => val upstreamLink = getInputLinks(Some(dependee)).head val downstreamLink = getInputLinks(Some(depender)).head @@ -588,7 +502,7 @@ class PhysicalOp( dependencyDag.addVertex(downstreamLink) } dependencyDag.addEdge(upstreamLink, downstreamLink) - } + }) val topologicalIterator = new TopologicalOrderIterator[PhysicalLink, DefaultEdge](dependencyDag) val processingOrder = new ArrayBuffer[PhysicalLink]() @@ -599,12 +513,12 @@ class PhysicalOp( } def build( - controllerActorService: AkkaActorService, - operatorExecution: OperatorExecution, - operatorConfig: OperatorConfig, - stateRestoreConfig: Option[StateRestoreConfig], - replayLoggingConfig: Option[FaultToleranceConfig] - ): Unit = { + controllerActorService: AkkaActorService, + operatorExecution: OperatorExecution, + operatorConfig: OperatorConfig, + stateRestoreConfig: Option[StateRestoreConfig], + replayLoggingConfig: Option[FaultToleranceConfig] + ): Unit = { val addressInfo = AddressInfo( controllerActorService.getClusterNodeAddresses, controllerActorService.self.path.address @@ -627,6 +541,8 @@ class PhysicalOp( ) ) } + // Note: At this point, we don't know if the actor is fully initialized. + // Thus, the ActorRef returned from `controllerActorService.actorOf` is ignored. controllerActorService.actorOf( workflowWorker.withDeploy(Deploy(scope = RemoteScope(preferredAddress))) ) From 7536bd5396a629fcf57440527ae400e37d407a0c Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Thu, 29 Aug 2024 17:03:51 +0800 Subject: [PATCH 09/20] remove op pojo --- .../deploysemantics/PhysicalOp.scala | 48 ++++++-------- .../deploysemantics/PhysicalOpPojo.scala | 66 ------------------- .../web/TexeraWorkflowCompilingService.scala | 10 ++- .../request/WorkflowExecuteRequest.scala | 4 +- .../WorkflowCompilationResource.scala | 2 +- 5 files changed, 26 insertions(+), 104 deletions(-) delete mode 100644 core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOpPojo.scala diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala index 1945e9135eb..d302a9fbc05 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala @@ -2,27 +2,16 @@ package edu.uci.ics.amber.engine.architecture.deploysemantics import akka.actor.Deploy import akka.remote.RemoteScope +import com.fasterxml.jackson.annotation.{JsonIgnore, JsonProperty} import com.typesafe.scalalogging.LazyLogging import edu.uci.ics.amber.engine.architecture.common.AkkaActorService import edu.uci.ics.amber.engine.architecture.controller.execution.OperatorExecution -import edu.uci.ics.amber.engine.architecture.deploysemantics.layer.{ - OpExecInitInfo, - OpExecInitInfoWithCode -} -import edu.uci.ics.amber.engine.architecture.deploysemantics.locationpreference.{ - AddressInfo, - LocationPreference, - PreferController, - RoundRobinPreference -} 
+import edu.uci.ics.amber.engine.architecture.deploysemantics.layer.{OpExecInitInfo, OpExecInitInfoWithCode} +import edu.uci.ics.amber.engine.architecture.deploysemantics.locationpreference.{AddressInfo, LocationPreference, PreferController, RoundRobinPreference} import edu.uci.ics.amber.engine.architecture.pythonworker.PythonWorkflowWorker import edu.uci.ics.amber.engine.architecture.scheduling.config.OperatorConfig import edu.uci.ics.amber.engine.architecture.worker.WorkflowWorker -import edu.uci.ics.amber.engine.architecture.worker.WorkflowWorker.{ - FaultToleranceConfig, - StateRestoreConfig, - WorkerReplayInitialization -} +import edu.uci.ics.amber.engine.architecture.worker.WorkflowWorker.{FaultToleranceConfig, StateRestoreConfig, WorkerReplayInitialization} import edu.uci.ics.amber.engine.common.VirtualIdentityUtils import edu.uci.ics.amber.engine.common.virtualidentity._ import edu.uci.ics.amber.engine.common.workflow.{InputPort, OutputPort, PhysicalLink, PortIdentity} @@ -159,34 +148,34 @@ object PhysicalOp { case class PhysicalOp( // the identifier of this PhysicalOp - id: PhysicalOpIdentity, + @JsonProperty id: PhysicalOpIdentity, // the workflow id number - workflowId: WorkflowIdentity, + @JsonProperty workflowId: WorkflowIdentity, // the execution id number - executionId: ExecutionIdentity, + @JsonProperty executionId: ExecutionIdentity, // information regarding initializing an operator executor instance - opExecInitInfo: OpExecInitInfo, + @JsonIgnore opExecInitInfo: OpExecInitInfo, // preference of parallelism - parallelizable: Boolean = true, + @JsonProperty parallelizable: Boolean = true, // preference of worker placement - locationPreference: Option[LocationPreference] = None, + @JsonProperty locationPreference: Option[LocationPreference] = None, // requirement of partition policy (hash/range/single/none) on inputs - partitionRequirement: List[Option[PartitionInfo]] = List(), + @JsonProperty partitionRequirement: List[Option[PartitionInfo]] = List(), // derive the output partition info given the input partitions // if not specified, by default the output partition is the same as input partition - derivePartition: List[PartitionInfo] => PartitionInfo = inputParts => inputParts.head, + @JsonIgnore derivePartition: List[PartitionInfo] => PartitionInfo = inputParts => inputParts.head, // input/output ports of the physical operator // for operators with multiple input/output ports: must set these variables properly - inputPorts: Map[PortIdentity, (InputPort, List[PhysicalLink], Either[Throwable, Schema])] = + @JsonProperty inputPorts: Map[PortIdentity, (InputPort, List[PhysicalLink], Either[Throwable, Schema])] = Map.empty, - outputPorts: Map[PortIdentity, (OutputPort, List[PhysicalLink], Either[Throwable, Schema])] = + @JsonProperty outputPorts: Map[PortIdentity, (OutputPort, List[PhysicalLink], Either[Throwable, Schema])] = Map.empty, // schema propagation function - propagateSchema: SchemaPropagationFunc = SchemaPropagationFunc(schemas => schemas), - isOneToManyOp: Boolean = false, + @JsonIgnore propagateSchema: SchemaPropagationFunc = SchemaPropagationFunc(schemas => schemas), + @JsonProperty isOneToManyOp: Boolean = false, // hint for number of workers - suggestedWorkerNum: Option[Int] = None -) extends LazyLogging { + @JsonProperty suggestedWorkerNum: Option[Int] = None +) extends LazyLogging with Serializable { // all the "dependee" links are also blocking private lazy val dependeeInputs: List[PortIdentity] = @@ -223,6 +212,7 @@ case class PhysicalOp( } } + @JsonIgnore 
def getPythonCode: String = { val (code, _) = opExecInitInfo.asInstanceOf[OpExecInitInfoWithCode].codeGen(0, 0) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOpPojo.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOpPojo.scala deleted file mode 100644 index a6b29afbf84..00000000000 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOpPojo.scala +++ /dev/null @@ -1,66 +0,0 @@ -package edu.uci.ics.amber.engine.architecture.deploysemantics - -import edu.uci.ics.amber.engine.architecture.deploysemantics.locationpreference.LocationPreference -import edu.uci.ics.amber.engine.common.virtualidentity.{ - ExecutionIdentity, - PhysicalOpIdentity, - WorkflowIdentity -} -import edu.uci.ics.amber.engine.common.workflow.{InputPort, OutputPort, PhysicalLink, PortIdentity} -import edu.uci.ics.texera.workflow.common.metadata.PropertyNameConstants -import edu.uci.ics.texera.workflow.common.tuple.schema.Schema -import edu.uci.ics.texera.workflow.common.workflow.PartitionInfo -import org.codehaus.jackson.annotate.JsonProperty - -object PhysicalOpPojo { - def apply(physicalOp: PhysicalOp): PhysicalOpPojo = { - - val result = new PhysicalOpPojo() - result.id = physicalOp.id - result.workflowId = physicalOp.workflowId - result.executionId = physicalOp.executionId - result.parallelizable = physicalOp.parallelizable - result.locationPreference = physicalOp.locationPreference - result.partitionRequirement = physicalOp.partitionRequirement - result.inputPorts = physicalOp.inputPorts - result.outputPorts = physicalOp.outputPorts - result.isOneToManyOp = physicalOp.isOneToManyOp - result.suggestedWorkerNum = physicalOp.suggestedWorkerNum - - result - } -} - -class PhysicalOpPojo extends Serializable { - - @JsonProperty(PropertyNameConstants.OPERATOR_ID) - var id: PhysicalOpIdentity = _ - - @JsonProperty(PropertyNameConstants.WORKFLOW_ID) - var workflowId: WorkflowIdentity = _ - - @JsonProperty(PropertyNameConstants.EXECUTION_ID) - var executionId: ExecutionIdentity = _ - - @JsonProperty(PropertyNameConstants.PARALLELIZABLE) - var parallelizable: Boolean = _ - - @JsonProperty(PropertyNameConstants.LOCATION_PREFERENCE) - var locationPreference: Option[LocationPreference] = _ - - @JsonProperty(PropertyNameConstants.PARTITION_REQUIREMENT) - var partitionRequirement: List[Option[PartitionInfo]] = _ - - @JsonProperty(PropertyNameConstants.INPUT_PORTS) - var inputPorts: Map[PortIdentity, (InputPort, List[PhysicalLink], Either[Throwable, Schema])] = _ - - @JsonProperty(PropertyNameConstants.OUTPUT_PORTS) - var outputPorts: Map[PortIdentity, (OutputPort, List[PhysicalLink], Either[Throwable, Schema])] = - _ - - @JsonProperty(PropertyNameConstants.IS_ONE_TO_MANY_OP) - var isOneToManyOp: Boolean = _ - - @JsonProperty(PropertyNameConstants.SUGGESTED_WORKER_NUM) - var suggestedWorkerNum: Option[Int] = _ -} diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala index 7aab5983558..1e39a3503cd 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala @@ -1,18 +1,16 @@ package edu.uci.ics.texera.web +import com.fasterxml.jackson.databind.module.SimpleModule import com.fasterxml.jackson.module.scala.DefaultScalaModule import 
com.github.toastshaman.dropwizard.auth.jwt.JwtAuthFilter import com.typesafe.scalalogging.LazyLogging +import edu.uci.ics.amber.engine.architecture.deploysemantics.PhysicalOp import edu.uci.ics.amber.engine.common.AmberConfig import edu.uci.ics.texera.Utils import edu.uci.ics.texera.web.TexeraWebApplication.parseArgs import edu.uci.ics.texera.web.auth.JwtAuth.jwtConsumer -import edu.uci.ics.texera.web.auth.{ - GuestAuthFilter, - SessionUser, - UserAuthenticator, - UserRoleAuthorizer -} +import edu.uci.ics.texera.web.auth.{GuestAuthFilter, SessionUser, UserAuthenticator, UserRoleAuthorizer} +import edu.uci.ics.texera.web.model.serializer.PhysicalOpSerializer import edu.uci.ics.texera.web.resource.WorkflowCompilationResource import io.dropwizard.auth.{AuthDynamicFeature, AuthValueFactoryProvider} import io.dropwizard.setup.{Bootstrap, Environment} diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala index db5ec44c1fb..22903002d3d 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala @@ -1,7 +1,7 @@ package edu.uci.ics.texera.web.model.websocket.request import com.fasterxml.jackson.databind.annotation.JsonDeserialize -import edu.uci.ics.amber.engine.architecture.deploysemantics.PhysicalOpPojo +import edu.uci.ics.amber.engine.architecture.deploysemantics.{PhysicalOp, PhysicalOpPojo} import edu.uci.ics.amber.engine.common.workflow.PhysicalLink import edu.uci.ics.texera.workflow.common.operators.LogicalOp import edu.uci.ics.texera.workflow.common.workflow.{LogicalLink, WorkflowSettings} @@ -28,6 +28,6 @@ case class LogicalPlanPojo( ) case class PhysicalPlanPojo( - operators: List[PhysicalOpPojo], + operators: List[PhysicalOp], links: List[PhysicalLink] ) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala index 82a3462169b..666a92655d7 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala @@ -47,7 +47,7 @@ class WorkflowCompilationResource extends LazyLogging { // convert the physical plan to pojo, which is serializable val physicalPlanPojo = PhysicalPlanPojo( // the reason of using PhysicalOpPojo is because some fields in PhysicalOp is not serializable - physicalPlan.operators.map(op => PhysicalOpPojo(op)).toList, + physicalPlan.operators.toList, physicalPlan.links.toList ) // return the result From 2dd5e8a2fdd7429d44317295614fa72c107fb58e Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Fri, 30 Aug 2024 20:02:11 +0800 Subject: [PATCH 10/20] fmt --- .../deploysemantics/PhysicalOp.scala | 36 ++++++++++++++----- .../web/TexeraWorkflowCompilingService.scala | 7 +++- .../request/WorkflowExecuteRequest.scala | 2 +- .../WorkflowCompilationResource.scala | 11 ++---- 4 files changed, 36 insertions(+), 20 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala index d302a9fbc05..4750880543b 100644 --- 
a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala @@ -6,12 +6,24 @@ import com.fasterxml.jackson.annotation.{JsonIgnore, JsonProperty} import com.typesafe.scalalogging.LazyLogging import edu.uci.ics.amber.engine.architecture.common.AkkaActorService import edu.uci.ics.amber.engine.architecture.controller.execution.OperatorExecution -import edu.uci.ics.amber.engine.architecture.deploysemantics.layer.{OpExecInitInfo, OpExecInitInfoWithCode} -import edu.uci.ics.amber.engine.architecture.deploysemantics.locationpreference.{AddressInfo, LocationPreference, PreferController, RoundRobinPreference} +import edu.uci.ics.amber.engine.architecture.deploysemantics.layer.{ + OpExecInitInfo, + OpExecInitInfoWithCode +} +import edu.uci.ics.amber.engine.architecture.deploysemantics.locationpreference.{ + AddressInfo, + LocationPreference, + PreferController, + RoundRobinPreference +} import edu.uci.ics.amber.engine.architecture.pythonworker.PythonWorkflowWorker import edu.uci.ics.amber.engine.architecture.scheduling.config.OperatorConfig import edu.uci.ics.amber.engine.architecture.worker.WorkflowWorker -import edu.uci.ics.amber.engine.architecture.worker.WorkflowWorker.{FaultToleranceConfig, StateRestoreConfig, WorkerReplayInitialization} +import edu.uci.ics.amber.engine.architecture.worker.WorkflowWorker.{ + FaultToleranceConfig, + StateRestoreConfig, + WorkerReplayInitialization +} import edu.uci.ics.amber.engine.common.VirtualIdentityUtils import edu.uci.ics.amber.engine.common.virtualidentity._ import edu.uci.ics.amber.engine.common.workflow.{InputPort, OutputPort, PhysicalLink, PortIdentity} @@ -163,19 +175,25 @@ case class PhysicalOp( @JsonProperty partitionRequirement: List[Option[PartitionInfo]] = List(), // derive the output partition info given the input partitions // if not specified, by default the output partition is the same as input partition - @JsonIgnore derivePartition: List[PartitionInfo] => PartitionInfo = inputParts => inputParts.head, + @JsonIgnore derivePartition: List[PartitionInfo] => PartitionInfo = inputParts => + inputParts.head, // input/output ports of the physical operator // for operators with multiple input/output ports: must set these variables properly - @JsonProperty inputPorts: Map[PortIdentity, (InputPort, List[PhysicalLink], Either[Throwable, Schema])] = - Map.empty, - @JsonProperty outputPorts: Map[PortIdentity, (OutputPort, List[PhysicalLink], Either[Throwable, Schema])] = - Map.empty, + @JsonProperty inputPorts: Map[ + PortIdentity, + (InputPort, List[PhysicalLink], Either[Throwable, Schema]) + ] = Map.empty, + @JsonProperty outputPorts: Map[ + PortIdentity, + (OutputPort, List[PhysicalLink], Either[Throwable, Schema]) + ] = Map.empty, // schema propagation function @JsonIgnore propagateSchema: SchemaPropagationFunc = SchemaPropagationFunc(schemas => schemas), @JsonProperty isOneToManyOp: Boolean = false, // hint for number of workers @JsonProperty suggestedWorkerNum: Option[Int] = None -) extends LazyLogging with Serializable { +) extends LazyLogging + with Serializable { // all the "dependee" links are also blocking private lazy val dependeeInputs: List[PortIdentity] = diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala index 1e39a3503cd..8710a51c931 100644 --- 
a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala @@ -9,7 +9,12 @@ import edu.uci.ics.amber.engine.common.AmberConfig import edu.uci.ics.texera.Utils import edu.uci.ics.texera.web.TexeraWebApplication.parseArgs import edu.uci.ics.texera.web.auth.JwtAuth.jwtConsumer -import edu.uci.ics.texera.web.auth.{GuestAuthFilter, SessionUser, UserAuthenticator, UserRoleAuthorizer} +import edu.uci.ics.texera.web.auth.{ + GuestAuthFilter, + SessionUser, + UserAuthenticator, + UserRoleAuthorizer +} import edu.uci.ics.texera.web.model.serializer.PhysicalOpSerializer import edu.uci.ics.texera.web.resource.WorkflowCompilationResource import io.dropwizard.auth.{AuthDynamicFeature, AuthValueFactoryProvider} diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala index 22903002d3d..b86031773b9 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala @@ -1,7 +1,7 @@ package edu.uci.ics.texera.web.model.websocket.request import com.fasterxml.jackson.databind.annotation.JsonDeserialize -import edu.uci.ics.amber.engine.architecture.deploysemantics.{PhysicalOp, PhysicalOpPojo} +import edu.uci.ics.amber.engine.architecture.deploysemantics.PhysicalOp import edu.uci.ics.amber.engine.common.workflow.PhysicalLink import edu.uci.ics.texera.workflow.common.operators.LogicalOp import edu.uci.ics.texera.workflow.common.workflow.{LogicalLink, WorkflowSettings} diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala index 666a92655d7..bcd5e8e9db2 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala @@ -1,19 +1,14 @@ package edu.uci.ics.texera.web.resource import com.typesafe.scalalogging.LazyLogging -import edu.uci.ics.amber.engine.architecture.deploysemantics.PhysicalOpPojo import edu.uci.ics.amber.engine.common.virtualidentity.WorkflowIdentity import edu.uci.ics.texera.Utils -import edu.uci.ics.texera.web.auth.SessionUser -import edu.uci.ics.texera.web.model.http.response.SchemaPropagationResponse import edu.uci.ics.texera.web.model.websocket.request.{LogicalPlanPojo, PhysicalPlanPojo} import edu.uci.ics.texera.workflow.common.WorkflowContext import edu.uci.ics.texera.workflow.common.tuple.schema.Attribute import edu.uci.ics.texera.workflow.common.workflow.{PhysicalPlan, WorkflowCompiler} -import io.dropwizard.auth.Auth import org.jooq.types.UInteger -import javax.annotation.security.RolesAllowed import javax.ws.rs.{Consumes, POST, Path, PathParam, Produces} import javax.ws.rs.core.MediaType @@ -43,12 +38,11 @@ class WorkflowCompilationResource extends LazyLogging { val workflowCompilationResult = new WorkflowCompiler(context).compileToPhysicalPlan(logicalPlanPojo) // get the physical plan from the compilation result - val physicalPlan = workflowCompilationResult.physicalPlan // convert the physical plan to pojo, which is serializable val physicalPlanPojo = PhysicalPlanPojo( // the 
reason of using PhysicalOpPojo is because some fields in PhysicalOp is not serializable - physicalPlan.operators.toList, - physicalPlan.links.toList + workflowCompilationResult.physicalPlan.operators.toList, + workflowCompilationResult.physicalPlan.links.toList ) // return the result WorkflowCompilationResponse( @@ -62,7 +56,6 @@ class WorkflowCompilationResource extends LazyLogging { else Some(schema.get.attributes) } - (opId, attributes) }, operatorErrors = workflowCompilationResult.operatorIdToError.map { From 2bd48b1228bd7af1161556b0aa5ca8576b706d7a Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Tue, 3 Sep 2024 10:49:52 +0800 Subject: [PATCH 11/20] add jwt token --- .../web/TexeraWorkflowCompilingService.scala | 31 ++++++++++++++++--- .../WorkflowCompilationResource.scala | 2 ++ 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala index 8710a51c931..521bb82c921 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala @@ -1,10 +1,8 @@ package edu.uci.ics.texera.web -import com.fasterxml.jackson.databind.module.SimpleModule import com.fasterxml.jackson.module.scala.DefaultScalaModule import com.github.toastshaman.dropwizard.auth.jwt.JwtAuthFilter import com.typesafe.scalalogging.LazyLogging -import edu.uci.ics.amber.engine.architecture.deploysemantics.PhysicalOp import edu.uci.ics.amber.engine.common.AmberConfig import edu.uci.ics.texera.Utils import edu.uci.ics.texera.web.TexeraWebApplication.parseArgs @@ -15,11 +13,9 @@ import edu.uci.ics.texera.web.auth.{ UserAuthenticator, UserRoleAuthorizer } -import edu.uci.ics.texera.web.model.serializer.PhysicalOpSerializer import edu.uci.ics.texera.web.resource.WorkflowCompilationResource import io.dropwizard.auth.{AuthDynamicFeature, AuthValueFactoryProvider} import io.dropwizard.setup.{Bootstrap, Environment} -import org.eclipse.jetty.server.session.SessionHandler import org.glassfish.jersey.server.filter.RolesAllowedDynamicFeature object TexeraWorkflowCompilingService { @@ -57,5 +53,32 @@ class TexeraWorkflowCompilingService // register the compilation endpoint environment.jersey.register(classOf[WorkflowCompilationResource]) + + // Add JWT Auth layer (without session) + if (AmberConfig.isUserSystemEnabled) { + environment.jersey.register( + new AuthDynamicFeature( + new JwtAuthFilter.Builder[SessionUser]() // Renamed from SessionUser to AuthenticatedUser + .setJwtConsumer(jwtConsumer) + .setRealm("realm") + .setPrefix("Bearer") + .setAuthenticator(UserAuthenticator) + .setAuthorizer(UserRoleAuthorizer) + .buildAuthFilter() + ) + ) + } else { + // register Guest Auth layer (if applicable) + environment.jersey.register( + new AuthDynamicFeature( + new GuestAuthFilter.Builder().setAuthorizer(UserRoleAuthorizer).buildAuthFilter() + ) + ) + } + + environment.jersey.register( + new AuthValueFactoryProvider.Binder[SessionUser](classOf[SessionUser]) // Updated here as well + ) + environment.jersey.register(classOf[RolesAllowedDynamicFeature]) } } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala index bcd5e8e9db2..62a42ffa913 100644 --- 
a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala @@ -9,6 +9,7 @@ import edu.uci.ics.texera.workflow.common.tuple.schema.Attribute import edu.uci.ics.texera.workflow.common.workflow.{PhysicalPlan, WorkflowCompiler} import org.jooq.types.UInteger +import javax.annotation.security.RolesAllowed import javax.ws.rs.{Consumes, POST, Path, PathParam, Produces} import javax.ws.rs.core.MediaType @@ -20,6 +21,7 @@ case class WorkflowCompilationResponse( @Consumes(Array(MediaType.APPLICATION_JSON)) @Produces(Array(MediaType.APPLICATION_JSON)) +@RolesAllowed(Array("REGULAR", "ADMIN")) @Path("/compilation") class WorkflowCompilationResource extends LazyLogging { @POST From 8329c1a21eff167a57f3ed93b1ce1428643ff9b9 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Wed, 4 Sep 2024 16:03:10 +0800 Subject: [PATCH 12/20] make json ignore work --- .../deploysemantics/PhysicalOp.scala | 72 ++++++++++++++----- .../WorkflowCompilationResource.scala | 5 +- .../common/workflow/PhysicalPlan.scala | 24 ++++++- 3 files changed, 78 insertions(+), 23 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala index 4750880543b..6248a76495d 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala @@ -2,7 +2,12 @@ package edu.uci.ics.amber.engine.architecture.deploysemantics import akka.actor.Deploy import akka.remote.RemoteScope -import com.fasterxml.jackson.annotation.{JsonIgnore, JsonProperty} +import com.fasterxml.jackson.annotation.{ + JsonAutoDetect, + JsonIgnore, + JsonIgnoreProperties, + JsonProperty +} import com.typesafe.scalalogging.LazyLogging import edu.uci.ics.amber.engine.architecture.common.AkkaActorService import edu.uci.ics.amber.engine.architecture.controller.execution.OperatorExecution @@ -158,44 +163,54 @@ object PhysicalOp { } } +// @JsonIgnore is not working when directly annotated to fields of a case class +// https://stackoverflow.com/questions/40482904/jsonignore-doesnt-work-in-scala-case-class +@JsonIgnoreProperties( + Array( + "opExecInitInfo", // function type, ignore it + "derivePartition", // function type, ignore it + "inputPorts", // may contain very long stacktrace, ignore it + "outputPorts", // same reason with above + "propagateSchema" // function type, so ignore it + ) +) case class PhysicalOp( // the identifier of this PhysicalOp - @JsonProperty id: PhysicalOpIdentity, + id: PhysicalOpIdentity, // the workflow id number - @JsonProperty workflowId: WorkflowIdentity, + workflowId: WorkflowIdentity, // the execution id number - @JsonProperty executionId: ExecutionIdentity, + executionId: ExecutionIdentity, // information regarding initializing an operator executor instance - @JsonIgnore opExecInitInfo: OpExecInitInfo, + opExecInitInfo: OpExecInitInfo, // preference of parallelism - @JsonProperty parallelizable: Boolean = true, + parallelizable: Boolean = true, // preference of worker placement - @JsonProperty locationPreference: Option[LocationPreference] = None, + locationPreference: Option[LocationPreference] = None, // requirement of partition policy (hash/range/single/none) on inputs - @JsonProperty partitionRequirement: 
List[Option[PartitionInfo]] = List(), + partitionRequirement: List[Option[PartitionInfo]] = List(), // derive the output partition info given the input partitions // if not specified, by default the output partition is the same as input partition - @JsonIgnore derivePartition: List[PartitionInfo] => PartitionInfo = inputParts => - inputParts.head, + derivePartition: List[PartitionInfo] => PartitionInfo = inputParts => inputParts.head, // input/output ports of the physical operator // for operators with multiple input/output ports: must set these variables properly - @JsonProperty inputPorts: Map[ + inputPorts: Map[ PortIdentity, (InputPort, List[PhysicalLink], Either[Throwable, Schema]) ] = Map.empty, - @JsonProperty outputPorts: Map[ + outputPorts: Map[ PortIdentity, (OutputPort, List[PhysicalLink], Either[Throwable, Schema]) ] = Map.empty, // schema propagation function - @JsonIgnore propagateSchema: SchemaPropagationFunc = SchemaPropagationFunc(schemas => schemas), - @JsonProperty isOneToManyOp: Boolean = false, + propagateSchema: SchemaPropagationFunc = SchemaPropagationFunc(schemas => schemas), + isOneToManyOp: Boolean = false, // hint for number of workers - @JsonProperty suggestedWorkerNum: Option[Int] = None -) extends LazyLogging - with Serializable { + suggestedWorkerNum: Option[Int] = None +) extends LazyLogging { // all the "dependee" links are also blocking + @JsonIgnore private lazy val dependeeInputs: List[PortIdentity] = inputPorts.values .flatMap({ @@ -204,12 +219,13 @@ case class PhysicalOp( .toList .distinct + @JsonIgnore private lazy val isInitWithCode: Boolean = opExecInitInfo.isInstanceOf[OpExecInitInfoWithCode] /** * Helper functions related to compile-time operations */ - + @JsonIgnore def isSourceOperator: Boolean = { inputPorts.isEmpty } @@ -217,10 +233,12 @@ case class PhysicalOp( /** * Helper function used to determine whether the input link is a materialized link. */ + @JsonIgnore def isSinkOperator: Boolean = { outputPorts.forall(port => port._2._2.isEmpty) } + @JsonIgnore def isPythonBased: Boolean = { opExecInitInfo match { case opExecInfo: OpExecInitInfoWithCode => @@ -240,6 +258,7 @@ case class PhysicalOp( /** * creates a copy with the location preference information */ + @JsonIgnore def withLocationPreference(preference: Option[LocationPreference]): PhysicalOp = { this.copy(locationPreference = preference) } @@ -252,6 +271,7 @@ case class PhysicalOp( * @param inputs A list of InputPort instances to set as the new input ports. * @return A new instance of PhysicalOp with the input ports updated. */ + @JsonIgnore def withInputPorts(inputs: List[InputPort]): PhysicalOp = { this.copy(inputPorts = inputs @@ -271,6 +291,7 @@ case class PhysicalOp( * @param outputs A list of OutputPort instances to set as the new output ports. * @return A new instance of PhysicalOp with the output ports updated. */ + @JsonIgnore def withOutputPorts(outputs: List[OutputPort]): PhysicalOp = { this.copy(outputPorts = outputs @@ -285,6 +306,7 @@ case class PhysicalOp( /** * creates a copy with suggested worker number. This is only to be used by Python UDF operators. 
*/ + @JsonIgnore def withSuggestedWorkerNum(workerNum: Int): PhysicalOp = { this.copy(suggestedWorkerNum = Some(workerNum)) } @@ -292,6 +314,7 @@ case class PhysicalOp( /** * creates a copy with the partition requirements */ + @JsonIgnore def withPartitionRequirement(partitionRequirements: List[Option[PartitionInfo]]): PhysicalOp = { this.copy(partitionRequirement = partitionRequirements) } @@ -299,6 +322,7 @@ case class PhysicalOp( /** * creates a copy with the partition info derive function */ + @JsonIgnore def withDerivePartition(derivePartition: List[PartitionInfo] => PartitionInfo): PhysicalOp = { this.copy(derivePartition = derivePartition) } @@ -306,12 +330,14 @@ case class PhysicalOp( /** * creates a copy with the parallelizable specified */ + @JsonIgnore def withParallelizable(parallelizable: Boolean): PhysicalOp = this.copy(parallelizable = parallelizable) /** * creates a copy with the specified property that whether this operator is one-to-many */ + @JsonIgnore def withIsOneToManyOp(isOneToManyOp: Boolean): PhysicalOp = this.copy(isOneToManyOp = isOneToManyOp) @@ -324,6 +350,7 @@ case class PhysicalOp( * A Right value represents a successful schema, while a Left value represents an error (Throwable). * @return A new instance of PhysicalOp with the updated input port schema or error information. */ + @JsonIgnore private def withInputSchema( portId: PortIdentity, schema: Either[Throwable, Schema] @@ -344,6 +371,7 @@ case class PhysicalOp( * A Right value indicates a successful schema, while a Left value indicates an error (Throwable). * @return A new instance of PhysicalOp with the updated output port schema or error information. */ + @JsonIgnore private def withOutputSchema( portId: PortIdentity, schema: Either[Throwable, Schema] @@ -357,6 +385,7 @@ case class PhysicalOp( /** * creates a copy with the schema propagation function. */ + @JsonIgnore def withPropagateSchema(func: SchemaPropagationFunc): PhysicalOp = { this.copy(propagateSchema = func) } @@ -364,6 +393,7 @@ case class PhysicalOp( /** * creates a copy with an additional input link specified on an input port */ + @JsonIgnore def addInputLink(link: PhysicalLink): PhysicalOp = { assert(link.toOpId == id) assert(inputPorts.contains(link.toPortId)) @@ -377,6 +407,7 @@ case class PhysicalOp( /** * creates a copy with an additional output link specified on an output port */ + @JsonIgnore def addOutputLink(link: PhysicalLink): PhysicalOp = { assert(link.fromOpId == id) assert(outputPorts.contains(link.fromPortId)) @@ -451,6 +482,7 @@ case class PhysicalOp( /** * returns all output links. Optionally, if a specific portId is provided, returns the links connected to that portId. */ + @JsonIgnore def getOutputLinks(portId: PortIdentity): List[PhysicalLink] = { outputPorts.values .flatMap(_._2) @@ -461,6 +493,7 @@ case class PhysicalOp( /** * returns all input links. Optionally, if a specific portId is provided, returns the links connected to that portId. */ + @JsonIgnore def getInputLinks(portIdOpt: Option[PortIdentity] = None): List[PhysicalLink] = { inputPorts.values .flatMap(_._2) @@ -476,6 +509,7 @@ case class PhysicalOp( /** * Tells whether the input port the link connects to is depended by another input . */ + @JsonIgnore def isInputLinkDependee(link: PhysicalLink): Boolean = { dependeeInputs.contains(link.toPortId) } @@ -484,6 +518,7 @@ case class PhysicalOp( * Tells whether the output on this link is blocking i.e. the operator doesn't output anything till this link * outputs all its tuples. 
*/ + @JsonIgnore def isOutputLinkBlocking(link: PhysicalLink): Boolean = { this.outputPorts(link.fromPortId)._1.blocking } @@ -492,6 +527,7 @@ case class PhysicalOp( * Some operators process their inputs in a particular order. Eg: 2 phase hash join first * processes the build input, then the probe input. */ + @JsonIgnore def getInputLinksInProcessingOrder: List[PhysicalLink] = { val dependencyDag = { new DirectedAcyclicGraph[PhysicalLink, DefaultEdge](classOf[DefaultEdge]) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala index 62a42ffa913..583b39a1548 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala @@ -14,7 +14,7 @@ import javax.ws.rs.{Consumes, POST, Path, PathParam, Produces} import javax.ws.rs.core.MediaType case class WorkflowCompilationResponse( - physicalPlan: PhysicalPlanPojo, + physicalPlan: PhysicalPlan, operatorInputSchemas: Map[String, List[Option[List[Attribute]]]], operatorErrors: Map[String, String] ) @@ -48,7 +48,8 @@ class WorkflowCompilationResource extends LazyLogging { ) // return the result WorkflowCompilationResponse( - physicalPlan = physicalPlanPojo, +// physicalPlan = physicalPlanPojo, + physicalPlan = workflowCompilationResult.physicalPlan, operatorInputSchemas = workflowCompilationResult.operatorIdToInputSchemas.map { case (operatorIdentity, schemas) => val opId = operatorIdentity.id diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/PhysicalPlan.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/PhysicalPlan.scala index 560d4cfcbfe..471effe18a6 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/PhysicalPlan.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/PhysicalPlan.scala @@ -1,5 +1,6 @@ package edu.uci.ics.texera.workflow.common.workflow +import com.fasterxml.jackson.annotation.{JsonIgnore, JsonProperty} import com.typesafe.scalalogging.LazyLogging import edu.uci.ics.amber.engine.architecture.deploysemantics.PhysicalOp import edu.uci.ics.amber.engine.common.VirtualIdentityUtils @@ -67,11 +68,11 @@ case class PhysicalPlan( links: Set[PhysicalLink] ) extends LazyLogging { - @transient private lazy val operatorMap: Map[PhysicalOpIdentity, PhysicalOp] = + @JsonIgnore @transient private lazy val operatorMap: Map[PhysicalOpIdentity, PhysicalOp] = operators.map(o => (o.id, o)).toMap // the dag will be re-computed again once it reaches the coordinator. 
- @transient lazy val dag: DirectedAcyclicGraph[PhysicalOpIdentity, PhysicalLink] = { + @JsonIgnore @transient lazy val dag: DirectedAcyclicGraph[PhysicalOpIdentity, PhysicalLink] = { val jgraphtDag = new DirectedAcyclicGraph[PhysicalOpIdentity, PhysicalLink]( null, // vertexSupplier SupplierUtil.createSupplier(classOf[PhysicalLink]), // edgeSupplier @@ -83,11 +84,13 @@ case class PhysicalPlan( jgraphtDag } - @transient lazy val maxChains: Set[Set[PhysicalLink]] = this.getMaxChains + @JsonIgnore @transient lazy val maxChains: Set[Set[PhysicalLink]] = this.getMaxChains + @JsonIgnore def getSourceOperatorIds: Set[PhysicalOpIdentity] = operatorMap.keys.filter(op => dag.inDegreeOf(op) == 0).toSet + @JsonIgnore def getPhysicalOpsOfLogicalOp(logicalOpId: OperatorIdentity): List[PhysicalOp] = { topologicalIterator() .filter(physicalOpId => physicalOpId.logicalOpId == logicalOpId) @@ -95,11 +98,13 @@ case class PhysicalPlan( .toList } + @JsonIgnore def getOperator(physicalOpId: PhysicalOpIdentity): PhysicalOp = operatorMap(physicalOpId) /** * returns a sub-plan that contains the specified operators and the links connected within these operators */ + @JsonIgnore def getSubPlan(subOperators: Set[PhysicalOpIdentity]): PhysicalPlan = { val newOps = operators.filter(op => subOperators.contains(op.id)) val newLinks = @@ -109,18 +114,22 @@ case class PhysicalPlan( PhysicalPlan(newOps, newLinks) } + @JsonIgnore def getUpstreamPhysicalOpIds(physicalOpId: PhysicalOpIdentity): Set[PhysicalOpIdentity] = { dag.incomingEdgesOf(physicalOpId).asScala.map(e => dag.getEdgeSource(e)).toSet } + @JsonIgnore def getUpstreamPhysicalLinks(physicalOpId: PhysicalOpIdentity): Set[PhysicalLink] = { links.filter(l => l.toOpId == physicalOpId) } + @JsonIgnore def getDownstreamPhysicalLinks(physicalOpId: PhysicalOpIdentity): Set[PhysicalLink] = { links.filter(l => l.fromOpId == physicalOpId) } + @JsonIgnore def topologicalIterator(): Iterator[PhysicalOpIdentity] = { new TopologicalOrderIterator(dag).asScala } @@ -157,9 +166,11 @@ case class PhysicalPlan( this.copy(operators = (operatorMap + (physicalOp.id -> physicalOp)).values.toSet) } + @JsonIgnore def getPhysicalOpByWorkerId(workerId: ActorVirtualIdentity): PhysicalOp = getOperator(VirtualIdentityUtils.getPhysicalOpId(workerId)) + @JsonIgnore def getLinksBetween( from: PhysicalOpIdentity, to: PhysicalOpIdentity @@ -168,6 +179,7 @@ case class PhysicalPlan( } + @JsonIgnore def getOutputPartitionInfo( link: PhysicalLink, upstreamPartitionInfo: PartitionInfo, @@ -206,10 +218,12 @@ case class PhysicalPlan( } } + @JsonIgnore private def isMaterializedLink(link: PhysicalLink): Boolean = { getOperator(link.toOpId).isSinkOperator } + @JsonIgnore def getNonMaterializedBlockingAndDependeeLinks: Set[PhysicalLink] = { operators .flatMap { physicalOp => @@ -230,6 +244,7 @@ case class PhysicalPlan( } } + @JsonIgnore def getDependeeLinks: Set[PhysicalLink] = { operators .flatMap { physicalOp => @@ -249,6 +264,7 @@ case class PhysicalPlan( /** * create a DAG similar to the physical DAG but with all dependee links removed. */ + @JsonIgnore def getDependeeLinksRemovedDAG: PhysicalPlan = { this.copy(operators, links.diff(getDependeeLinks)) } @@ -260,6 +276,7 @@ case class PhysicalPlan( * * @return All non-blocking links that are not bridges. 
*/ + @JsonIgnore def getNonBridgeNonBlockingLinks: Set[PhysicalLink] = { val bridges = new BiconnectivityInspector[PhysicalOpIdentity, PhysicalLink](this.dag).getBridges.asScala @@ -284,6 +301,7 @@ case class PhysicalPlan( * * @return All the maximal chains of this physical plan, where each chain is represented as a set of links. */ + @JsonIgnore private def getMaxChains: Set[Set[PhysicalLink]] = { val dijkstra = new AllDirectedPaths[PhysicalOpIdentity, PhysicalLink](this.dag) val chains = this.dag From 173c22934adfe7fe6716214560fef4e1085edc29 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Wed, 4 Sep 2024 16:15:49 +0800 Subject: [PATCH 13/20] remove physical pojo --- .../websocket/request/WorkflowExecuteRequest.scala | 7 +------ .../web/resource/WorkflowCompilationResource.scala | 12 ++---------- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala index b86031773b9..927d9c09eca 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala @@ -25,9 +25,4 @@ case class LogicalPlanPojo( links: List[LogicalLink], opsToViewResult: List[String], opsToReuseResult: List[String] -) - -case class PhysicalPlanPojo( - operators: List[PhysicalOp], - links: List[PhysicalLink] -) +) \ No newline at end of file diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala index 583b39a1548..55cf3857e37 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala @@ -3,7 +3,7 @@ package edu.uci.ics.texera.web.resource import com.typesafe.scalalogging.LazyLogging import edu.uci.ics.amber.engine.common.virtualidentity.WorkflowIdentity import edu.uci.ics.texera.Utils -import edu.uci.ics.texera.web.model.websocket.request.{LogicalPlanPojo, PhysicalPlanPojo} +import edu.uci.ics.texera.web.model.websocket.request.LogicalPlanPojo import edu.uci.ics.texera.workflow.common.WorkflowContext import edu.uci.ics.texera.workflow.common.tuple.schema.Attribute import edu.uci.ics.texera.workflow.common.workflow.{PhysicalPlan, WorkflowCompiler} @@ -36,19 +36,11 @@ class WorkflowCompilationResource extends LazyLogging { workflowId = WorkflowIdentity(wid.toString.toLong) ) - // compile the pojo + // compile the pojo using WorkflowCompiler val workflowCompilationResult = new WorkflowCompiler(context).compileToPhysicalPlan(logicalPlanPojo) - // get the physical plan from the compilation result - // convert the physical plan to pojo, which is serializable - val physicalPlanPojo = PhysicalPlanPojo( - // the reason of using PhysicalOpPojo is because some fields in PhysicalOp is not serializable - workflowCompilationResult.physicalPlan.operators.toList, - workflowCompilationResult.physicalPlan.links.toList - ) // return the result WorkflowCompilationResponse( -// physicalPlan = physicalPlanPojo, physicalPlan = workflowCompilationResult.physicalPlan, operatorInputSchemas = workflowCompilationResult.operatorIdToInputSchemas.map { case (operatorIdentity, schemas) => From 
41f43b7b4287e637d0b143d19dc7aee26f70c585 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Wed, 4 Sep 2024 16:47:50 +0800 Subject: [PATCH 14/20] remove redundant json ignore --- .../deploysemantics/PhysicalOp.scala | 25 +----------------- .../request/WorkflowExecuteRequest.scala | 2 +- .../common/workflow/PhysicalPlan.scala | 26 ++++--------------- 3 files changed, 7 insertions(+), 46 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala index 6248a76495d..09f5cf727f9 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala @@ -210,7 +210,6 @@ case class PhysicalOp( ) extends LazyLogging { // all the "dependee" links are also blocking - @JsonIgnore private lazy val dependeeInputs: List[PortIdentity] = inputPorts.values .flatMap({ @@ -219,13 +218,11 @@ case class PhysicalOp( .toList .distinct - @JsonIgnore private lazy val isInitWithCode: Boolean = opExecInitInfo.isInstanceOf[OpExecInitInfoWithCode] /** * Helper functions related to compile-time operations */ - @JsonIgnore def isSourceOperator: Boolean = { inputPorts.isEmpty } @@ -233,12 +230,10 @@ case class PhysicalOp( /** * Helper function used to determine whether the input link is a materialized link. */ - @JsonIgnore def isSinkOperator: Boolean = { outputPorts.forall(port => port._2._2.isEmpty) } - @JsonIgnore def isPythonBased: Boolean = { opExecInitInfo match { case opExecInfo: OpExecInitInfoWithCode => @@ -248,7 +243,7 @@ case class PhysicalOp( } } - @JsonIgnore + @JsonIgnore // this is needed to prevent the serialization issue def getPythonCode: String = { val (code, _) = opExecInitInfo.asInstanceOf[OpExecInitInfoWithCode].codeGen(0, 0) @@ -258,7 +253,6 @@ case class PhysicalOp( /** * creates a copy with the location preference information */ - @JsonIgnore def withLocationPreference(preference: Option[LocationPreference]): PhysicalOp = { this.copy(locationPreference = preference) } @@ -271,7 +265,6 @@ case class PhysicalOp( * @param inputs A list of InputPort instances to set as the new input ports. * @return A new instance of PhysicalOp with the input ports updated. */ - @JsonIgnore def withInputPorts(inputs: List[InputPort]): PhysicalOp = { this.copy(inputPorts = inputs @@ -291,7 +284,6 @@ case class PhysicalOp( * @param outputs A list of OutputPort instances to set as the new output ports. * @return A new instance of PhysicalOp with the output ports updated. */ - @JsonIgnore def withOutputPorts(outputs: List[OutputPort]): PhysicalOp = { this.copy(outputPorts = outputs @@ -306,7 +298,6 @@ case class PhysicalOp( /** * creates a copy with suggested worker number. This is only to be used by Python UDF operators. 
*/ - @JsonIgnore def withSuggestedWorkerNum(workerNum: Int): PhysicalOp = { this.copy(suggestedWorkerNum = Some(workerNum)) } @@ -314,7 +305,6 @@ case class PhysicalOp( /** * creates a copy with the partition requirements */ - @JsonIgnore def withPartitionRequirement(partitionRequirements: List[Option[PartitionInfo]]): PhysicalOp = { this.copy(partitionRequirement = partitionRequirements) } @@ -322,7 +312,6 @@ case class PhysicalOp( /** * creates a copy with the partition info derive function */ - @JsonIgnore def withDerivePartition(derivePartition: List[PartitionInfo] => PartitionInfo): PhysicalOp = { this.copy(derivePartition = derivePartition) } @@ -330,14 +319,12 @@ case class PhysicalOp( /** * creates a copy with the parallelizable specified */ - @JsonIgnore def withParallelizable(parallelizable: Boolean): PhysicalOp = this.copy(parallelizable = parallelizable) /** * creates a copy with the specified property that whether this operator is one-to-many */ - @JsonIgnore def withIsOneToManyOp(isOneToManyOp: Boolean): PhysicalOp = this.copy(isOneToManyOp = isOneToManyOp) @@ -350,7 +337,6 @@ case class PhysicalOp( * A Right value represents a successful schema, while a Left value represents an error (Throwable). * @return A new instance of PhysicalOp with the updated input port schema or error information. */ - @JsonIgnore private def withInputSchema( portId: PortIdentity, schema: Either[Throwable, Schema] @@ -371,7 +357,6 @@ case class PhysicalOp( * A Right value indicates a successful schema, while a Left value indicates an error (Throwable). * @return A new instance of PhysicalOp with the updated output port schema or error information. */ - @JsonIgnore private def withOutputSchema( portId: PortIdentity, schema: Either[Throwable, Schema] @@ -385,7 +370,6 @@ case class PhysicalOp( /** * creates a copy with the schema propagation function. */ - @JsonIgnore def withPropagateSchema(func: SchemaPropagationFunc): PhysicalOp = { this.copy(propagateSchema = func) } @@ -393,7 +377,6 @@ case class PhysicalOp( /** * creates a copy with an additional input link specified on an input port */ - @JsonIgnore def addInputLink(link: PhysicalLink): PhysicalOp = { assert(link.toOpId == id) assert(inputPorts.contains(link.toPortId)) @@ -407,7 +390,6 @@ case class PhysicalOp( /** * creates a copy with an additional output link specified on an output port */ - @JsonIgnore def addOutputLink(link: PhysicalLink): PhysicalOp = { assert(link.fromOpId == id) assert(outputPorts.contains(link.fromPortId)) @@ -482,7 +464,6 @@ case class PhysicalOp( /** * returns all output links. Optionally, if a specific portId is provided, returns the links connected to that portId. */ - @JsonIgnore def getOutputLinks(portId: PortIdentity): List[PhysicalLink] = { outputPorts.values .flatMap(_._2) @@ -493,7 +474,6 @@ case class PhysicalOp( /** * returns all input links. Optionally, if a specific portId is provided, returns the links connected to that portId. */ - @JsonIgnore def getInputLinks(portIdOpt: Option[PortIdentity] = None): List[PhysicalLink] = { inputPorts.values .flatMap(_._2) @@ -509,7 +489,6 @@ case class PhysicalOp( /** * Tells whether the input port the link connects to is depended by another input . */ - @JsonIgnore def isInputLinkDependee(link: PhysicalLink): Boolean = { dependeeInputs.contains(link.toPortId) } @@ -518,7 +497,6 @@ case class PhysicalOp( * Tells whether the output on this link is blocking i.e. the operator doesn't output anything till this link * outputs all its tuples. 
*/ - @JsonIgnore def isOutputLinkBlocking(link: PhysicalLink): Boolean = { this.outputPorts(link.fromPortId)._1.blocking } @@ -527,7 +505,6 @@ case class PhysicalOp( * Some operators process their inputs in a particular order. Eg: 2 phase hash join first * processes the build input, then the probe input. */ - @JsonIgnore def getInputLinksInProcessingOrder: List[PhysicalLink] = { val dependencyDag = { new DirectedAcyclicGraph[PhysicalLink, DefaultEdge](classOf[DefaultEdge]) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala index 927d9c09eca..818e5e2fb0d 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala @@ -25,4 +25,4 @@ case class LogicalPlanPojo( links: List[LogicalLink], opsToViewResult: List[String], opsToReuseResult: List[String] -) \ No newline at end of file +) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/PhysicalPlan.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/PhysicalPlan.scala index 471effe18a6..62a5ccc8942 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/PhysicalPlan.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/PhysicalPlan.scala @@ -1,6 +1,6 @@ package edu.uci.ics.texera.workflow.common.workflow -import com.fasterxml.jackson.annotation.{JsonIgnore, JsonProperty} +import com.fasterxml.jackson.annotation.JsonIgnore import com.typesafe.scalalogging.LazyLogging import edu.uci.ics.amber.engine.architecture.deploysemantics.PhysicalOp import edu.uci.ics.amber.engine.common.VirtualIdentityUtils @@ -68,11 +68,11 @@ case class PhysicalPlan( links: Set[PhysicalLink] ) extends LazyLogging { - @JsonIgnore @transient private lazy val operatorMap: Map[PhysicalOpIdentity, PhysicalOp] = + @transient private lazy val operatorMap: Map[PhysicalOpIdentity, PhysicalOp] = operators.map(o => (o.id, o)).toMap // the dag will be re-computed again once it reaches the coordinator. 
- @JsonIgnore @transient lazy val dag: DirectedAcyclicGraph[PhysicalOpIdentity, PhysicalLink] = { + @transient lazy val dag: DirectedAcyclicGraph[PhysicalOpIdentity, PhysicalLink] = { val jgraphtDag = new DirectedAcyclicGraph[PhysicalOpIdentity, PhysicalLink]( null, // vertexSupplier SupplierUtil.createSupplier(classOf[PhysicalLink]), // edgeSupplier @@ -84,13 +84,11 @@ case class PhysicalPlan( jgraphtDag } - @JsonIgnore @transient lazy val maxChains: Set[Set[PhysicalLink]] = this.getMaxChains + @transient lazy val maxChains: Set[Set[PhysicalLink]] = this.getMaxChains - @JsonIgnore def getSourceOperatorIds: Set[PhysicalOpIdentity] = operatorMap.keys.filter(op => dag.inDegreeOf(op) == 0).toSet - @JsonIgnore def getPhysicalOpsOfLogicalOp(logicalOpId: OperatorIdentity): List[PhysicalOp] = { topologicalIterator() .filter(physicalOpId => physicalOpId.logicalOpId == logicalOpId) @@ -98,13 +96,11 @@ case class PhysicalPlan( .toList } - @JsonIgnore def getOperator(physicalOpId: PhysicalOpIdentity): PhysicalOp = operatorMap(physicalOpId) /** * returns a sub-plan that contains the specified operators and the links connected within these operators */ - @JsonIgnore def getSubPlan(subOperators: Set[PhysicalOpIdentity]): PhysicalPlan = { val newOps = operators.filter(op => subOperators.contains(op.id)) val newLinks = @@ -114,22 +110,18 @@ case class PhysicalPlan( PhysicalPlan(newOps, newLinks) } - @JsonIgnore def getUpstreamPhysicalOpIds(physicalOpId: PhysicalOpIdentity): Set[PhysicalOpIdentity] = { dag.incomingEdgesOf(physicalOpId).asScala.map(e => dag.getEdgeSource(e)).toSet } - @JsonIgnore def getUpstreamPhysicalLinks(physicalOpId: PhysicalOpIdentity): Set[PhysicalLink] = { links.filter(l => l.toOpId == physicalOpId) } - @JsonIgnore def getDownstreamPhysicalLinks(physicalOpId: PhysicalOpIdentity): Set[PhysicalLink] = { links.filter(l => l.fromOpId == physicalOpId) } - @JsonIgnore def topologicalIterator(): Iterator[PhysicalOpIdentity] = { new TopologicalOrderIterator(dag).asScala } @@ -166,11 +158,9 @@ case class PhysicalPlan( this.copy(operators = (operatorMap + (physicalOp.id -> physicalOp)).values.toSet) } - @JsonIgnore def getPhysicalOpByWorkerId(workerId: ActorVirtualIdentity): PhysicalOp = getOperator(VirtualIdentityUtils.getPhysicalOpId(workerId)) - @JsonIgnore def getLinksBetween( from: PhysicalOpIdentity, to: PhysicalOpIdentity @@ -179,7 +169,6 @@ case class PhysicalPlan( } - @JsonIgnore def getOutputPartitionInfo( link: PhysicalLink, upstreamPartitionInfo: PartitionInfo, @@ -218,12 +207,10 @@ case class PhysicalPlan( } } - @JsonIgnore private def isMaterializedLink(link: PhysicalLink): Boolean = { getOperator(link.toOpId).isSinkOperator } - @JsonIgnore def getNonMaterializedBlockingAndDependeeLinks: Set[PhysicalLink] = { operators .flatMap { physicalOp => @@ -244,7 +231,6 @@ case class PhysicalPlan( } } - @JsonIgnore def getDependeeLinks: Set[PhysicalLink] = { operators .flatMap { physicalOp => @@ -264,7 +250,7 @@ case class PhysicalPlan( /** * create a DAG similar to the physical DAG but with all dependee links removed. */ - @JsonIgnore + @JsonIgnore // this is needed to prevent the serialization issue def getDependeeLinksRemovedDAG: PhysicalPlan = { this.copy(operators, links.diff(getDependeeLinks)) } @@ -276,7 +262,6 @@ case class PhysicalPlan( * * @return All non-blocking links that are not bridges. 
*/ - @JsonIgnore def getNonBridgeNonBlockingLinks: Set[PhysicalLink] = { val bridges = new BiconnectivityInspector[PhysicalOpIdentity, PhysicalLink](this.dag).getBridges.asScala @@ -301,7 +286,6 @@ case class PhysicalPlan( * * @return All the maximal chains of this physical plan, where each chain is represented as a set of links. */ - @JsonIgnore private def getMaxChains: Set[Set[PhysicalLink]] = { val dijkstra = new AllDirectedPaths[PhysicalOpIdentity, PhysicalLink](this.dag) val chains = this.dag From d02cdbdefc0ba233fe3c735c0f4ae59fe51873ed Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Wed, 4 Sep 2024 22:06:19 +0800 Subject: [PATCH 15/20] try resolve comments --- .../web/TexeraWorkflowCompilingService.scala | 8 +------ .../WorkflowCompilationResource.scala | 2 +- .../common/workflow/PhysicalPlan.scala | 1 - .../common/workflow/WorkflowCompiler.scala | 23 +++++++++++++------ 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala index 521bb82c921..0864bfd942e 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala @@ -7,12 +7,7 @@ import edu.uci.ics.amber.engine.common.AmberConfig import edu.uci.ics.texera.Utils import edu.uci.ics.texera.web.TexeraWebApplication.parseArgs import edu.uci.ics.texera.web.auth.JwtAuth.jwtConsumer -import edu.uci.ics.texera.web.auth.{ - GuestAuthFilter, - SessionUser, - UserAuthenticator, - UserRoleAuthorizer -} +import edu.uci.ics.texera.web.auth.{GuestAuthFilter, SessionUser, UserRoleAuthorizer} import edu.uci.ics.texera.web.resource.WorkflowCompilationResource import io.dropwizard.auth.{AuthDynamicFeature, AuthValueFactoryProvider} import io.dropwizard.setup.{Bootstrap, Environment} @@ -62,7 +57,6 @@ class TexeraWorkflowCompilingService .setJwtConsumer(jwtConsumer) .setRealm("realm") .setPrefix("Bearer") - .setAuthenticator(UserAuthenticator) .setAuthorizer(UserRoleAuthorizer) .buildAuthFilter() ) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala index 55cf3857e37..71cdb8c0641 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala @@ -14,7 +14,7 @@ import javax.ws.rs.{Consumes, POST, Path, PathParam, Produces} import javax.ws.rs.core.MediaType case class WorkflowCompilationResponse( - physicalPlan: PhysicalPlan, + physicalPlan: Option[PhysicalPlan], operatorInputSchemas: Map[String, List[Option[List[Attribute]]]], operatorErrors: Map[String, String] ) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/PhysicalPlan.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/PhysicalPlan.scala index 62a5ccc8942..b168c82a6b2 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/PhysicalPlan.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/PhysicalPlan.scala @@ -20,7 +20,6 @@ import org.jgrapht.util.SupplierUtil import scala.jdk.CollectionConverters.{IteratorHasAsScala, ListHasAsScala, SetHasAsScala} object PhysicalPlan { - def empty: 
PhysicalPlan = PhysicalPlan(operators = Set.empty, links = Set.empty) def apply(context: WorkflowContext, logicalPlan: LogicalPlan): PhysicalPlan = { var physicalPlan = PhysicalPlan(operators = Set.empty, links = Set.empty) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala index c394ea93841..d890b5431cd 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala @@ -24,7 +24,7 @@ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer case class WorkflowCompilationResult( - physicalPlan: PhysicalPlan, + physicalPlan: Option[PhysicalPlan], // if physical plan is none, the compilation is failed operatorIdToInputSchemas: Map[OperatorIdentity, List[Option[Schema]]], operatorIdToError: Map[OperatorIdentity, WorkflowFatalError] ) @@ -82,7 +82,7 @@ class WorkflowCompiler( // encounter errors during compile pojo to logical plan, // so directly return empty physical plan, schema map and non-empty error map return WorkflowCompilationResult( - physicalPlan = PhysicalPlan.empty, + physicalPlan = None, operatorIdToInputSchemas = Map.empty, operatorIdToError = opIdToError ) @@ -106,14 +106,17 @@ class WorkflowCompiler( .mapValues(_.flatMap(_._2).toList.sortBy(_._1.id).map(_._2)) .toMap - WorkflowCompilationResult(physicalPlan, opIdToInputSchemas, Map.empty) + WorkflowCompilationResult(Some(physicalPlan), opIdToInputSchemas, Map.empty) } + /** + * After separating the compiler as a standalone service, this function needs to be removed. + */ + @Deprecated def compileLogicalPlan( logicalPlanPojo: LogicalPlanPojo, executionStateStore: ExecutionStateStore ): LogicalPlan = { - // TODO: remove this function after separating compiler as a standalone service val errorList = new ArrayBuffer[(OperatorIdentity, Throwable)]() // remove previous error state executionStateStore.metadataStore.updateState { metadataStore => @@ -151,16 +154,19 @@ class WorkflowCompiler( logicalPlan } + /** + * After separating the compiler as a standalone service, this function needs to be removed. + * The sink storage assignment needs to be pushed to the standalone workflow execution service. + */ + @Deprecated def compile( logicalPlanPojo: LogicalPlanPojo, opResultStorage: OpResultStorage, executionStateStore: ExecutionStateStore ): Workflow = { - // TODO: remove this function after separating compiler as a standalone service // generate a LogicalPlan. 
The logical plan is the injected with all necessary sinks val logicalPlan = compileLogicalPlan(logicalPlanPojo, executionStateStore) - // TODO: push the sink storage assignment directly on physical plan in workflow execution service assignSinkStorage( logicalPlan, context, @@ -177,6 +183,10 @@ class WorkflowCompiler( ) } + /** + * Once standalone compiler is done, move this function to the execution service, and change the 1st parameter from LogicalPlan to PhysicalPlan + */ + @Deprecated private def assignSinkStorage( logicalPlan: LogicalPlan, context: WorkflowContext, @@ -184,7 +194,6 @@ class WorkflowCompiler( reuseStorageSet: Set[OperatorIdentity] = Set() ): Unit = { // create a JSON object that holds pointers to the workflow's results in Mongo - // TODO: move it to the execution service, and change the 1st parameter from LogicalPlan to PhysicalPlan val resultsJSON = objectMapper.createObjectNode() val sinksPointers = objectMapper.createArrayNode() // assign storage to texera-managed sinks before generating exec config From 33c80956caccdaad5de792cb42a198961fa1e231 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Wed, 4 Sep 2024 23:59:23 +0800 Subject: [PATCH 16/20] resolve comments --- .../web/TexeraWorkflowCompilingService.scala | 8 +++++- .../resource/SchemaPropagationResource.scala | 5 ++++ .../WorkflowCompilationResource.scala | 2 +- .../common/workflow/WorkflowCompiler.scala | 28 ++++++------------- 4 files changed, 22 insertions(+), 21 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala index 0864bfd942e..521bb82c921 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala @@ -7,7 +7,12 @@ import edu.uci.ics.amber.engine.common.AmberConfig import edu.uci.ics.texera.Utils import edu.uci.ics.texera.web.TexeraWebApplication.parseArgs import edu.uci.ics.texera.web.auth.JwtAuth.jwtConsumer -import edu.uci.ics.texera.web.auth.{GuestAuthFilter, SessionUser, UserRoleAuthorizer} +import edu.uci.ics.texera.web.auth.{ + GuestAuthFilter, + SessionUser, + UserAuthenticator, + UserRoleAuthorizer +} import edu.uci.ics.texera.web.resource.WorkflowCompilationResource import io.dropwizard.auth.{AuthDynamicFeature, AuthValueFactoryProvider} import io.dropwizard.setup.{Bootstrap, Environment} @@ -57,6 +62,7 @@ class TexeraWorkflowCompilingService .setJwtConsumer(jwtConsumer) .setRealm("realm") .setPrefix("Bearer") + .setAuthenticator(UserAuthenticator) .setAuthorizer(UserRoleAuthorizer) .buildAuthFilter() ) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/SchemaPropagationResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/SchemaPropagationResource.scala index 128a2eb5154..cfdc11c540a 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/SchemaPropagationResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/SchemaPropagationResource.scala @@ -14,11 +14,16 @@ import javax.annotation.security.RolesAllowed import javax.ws.rs._ import javax.ws.rs.core.MediaType +/** + * The SchemaPropagation functionality will be included by the standalone compiling service + */ +@Deprecated @Consumes(Array(MediaType.APPLICATION_JSON)) @Produces(Array(MediaType.APPLICATION_JSON)) @Path("/queryplan") class SchemaPropagationResource extends LazyLogging { + 
@Deprecated @POST @Path("/autocomplete/{wid}") @RolesAllowed(Array("REGULAR", "ADMIN")) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala index 71cdb8c0641..87748d12f67 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/resource/WorkflowCompilationResource.scala @@ -38,7 +38,7 @@ class WorkflowCompilationResource extends LazyLogging { // compile the pojo using WorkflowCompiler val workflowCompilationResult = - new WorkflowCompiler(context).compileToPhysicalPlan(logicalPlanPojo) + new WorkflowCompiler(context).compile(logicalPlanPojo) // return the result WorkflowCompilationResponse( physicalPlan = workflowCompilationResult.physicalPlan, diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala index d890b5431cd..8935e7958a9 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala @@ -34,13 +34,15 @@ class WorkflowCompiler( ) extends LazyLogging { /** - * Compile the workflow to logical plan and errors(if any) + * Compile a workflow to physical plan, along with the schema propagation result and error(if any) + * * @param logicalPlanPojo the pojo parsed from workflow str provided by user - * @return LogicalPlan, and a Map from OpId to Op's error(this map is empty if there is no error) + * @return WorkflowCompilationResult, containing the physical plan, input schemas per op and error per op */ - def compileToLogicalPlan( + def compile( logicalPlanPojo: LogicalPlanPojo - ): (LogicalPlan, Map[OperatorIdentity, WorkflowFatalError]) = { + ): WorkflowCompilationResult = { + // first compile the pojo to logical plan val errorList = new ArrayBuffer[(OperatorIdentity, Throwable)]() val opIdToError = mutable.Map[OperatorIdentity, WorkflowFatalError]() @@ -51,7 +53,7 @@ class WorkflowCompiler( ) logicalPlan.propagateWorkflowSchema(context, Some(errorList)) - // report compilation errors + // map compilation errors with op id if (errorList.nonEmpty) { errorList.foreach { case (opId, err) => @@ -65,26 +67,14 @@ class WorkflowCompiler( )) } } - (logicalPlan, opIdToError.toMap) - } - /** - * Compile a workflow to physical plan, along with the schema propagation result and error(if any) - * - * @param logicalPlanPojo the pojo parsed from workflow str provided by user - * @return WorkflowCompilationResult, containing the physical plan, input schemas per op and error per op - */ - def compileToPhysicalPlan( - logicalPlanPojo: LogicalPlanPojo - ): WorkflowCompilationResult = { - val (logicalPlan, opIdToError) = compileToLogicalPlan(logicalPlanPojo) if (opIdToError.nonEmpty) { // encounter errors during compile pojo to logical plan, - // so directly return empty physical plan, schema map and non-empty error map + // so directly return None as physical plan, schema map and non-empty error map return WorkflowCompilationResult( physicalPlan = None, operatorIdToInputSchemas = Map.empty, - operatorIdToError = opIdToError + operatorIdToError = opIdToError.toMap ) } // from logical plan to physical plan From e854605dc9fd551e764c783514685d4ac0ea23e2 Mon Sep 17 00:00:00 2001 From: 
Jiadong Bai Date: Thu, 5 Sep 2024 08:35:00 +0800 Subject: [PATCH 17/20] resolve comments --- .../architecture/deploysemantics/PhysicalOp.scala | 12 ++++-------- .../texera/web/TexeraWorkflowCompilingService.scala | 5 ++--- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala index 09f5cf727f9..1fedb610959 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala @@ -194,14 +194,10 @@ case class PhysicalOp( derivePartition: List[PartitionInfo] => PartitionInfo = inputParts => inputParts.head, // input/output ports of the physical operator // for operators with multiple input/output ports: must set these variables properly - inputPorts: Map[ - PortIdentity, - (InputPort, List[PhysicalLink], Either[Throwable, Schema]) - ] = Map.empty, - outputPorts: Map[ - PortIdentity, - (OutputPort, List[PhysicalLink], Either[Throwable, Schema]) - ] = Map.empty, + inputPorts: Map[PortIdentity, (InputPort, List[PhysicalLink], Either[Throwable, Schema])] = + Map.empty, + outputPorts: Map[PortIdentity, (OutputPort, List[PhysicalLink], Either[Throwable, Schema])] = + Map.empty, // schema propagation function propagateSchema: SchemaPropagationFunc = SchemaPropagationFunc(schemas => schemas), isOneToManyOp: Boolean = false, diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala index 521bb82c921..33c483d0222 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/TexeraWorkflowCompilingService.scala @@ -58,7 +58,7 @@ class TexeraWorkflowCompilingService if (AmberConfig.isUserSystemEnabled) { environment.jersey.register( new AuthDynamicFeature( - new JwtAuthFilter.Builder[SessionUser]() // Renamed from SessionUser to AuthenticatedUser + new JwtAuthFilter.Builder[SessionUser]() .setJwtConsumer(jwtConsumer) .setRealm("realm") .setPrefix("Bearer") @@ -68,7 +68,6 @@ class TexeraWorkflowCompilingService ) ) } else { - // register Guest Auth layer (if applicable) environment.jersey.register( new AuthDynamicFeature( new GuestAuthFilter.Builder().setAuthorizer(UserRoleAuthorizer).buildAuthFilter() @@ -77,7 +76,7 @@ class TexeraWorkflowCompilingService } environment.jersey.register( - new AuthValueFactoryProvider.Binder[SessionUser](classOf[SessionUser]) // Updated here as well + new AuthValueFactoryProvider.Binder[SessionUser](classOf[SessionUser]) ) environment.jersey.register(classOf[RolesAllowedDynamicFeature]) } From ad39949b63750c8265a21b0b675a628541824169 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Thu, 5 Sep 2024 08:38:33 +0800 Subject: [PATCH 18/20] recover changes --- .../engine/architecture/deploysemantics/PhysicalOp.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala index 1fedb610959..67000e14fdd 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala +++ 
b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala @@ -195,9 +195,9 @@ case class PhysicalOp( // input/output ports of the physical operator // for operators with multiple input/output ports: must set these variables properly inputPorts: Map[PortIdentity, (InputPort, List[PhysicalLink], Either[Throwable, Schema])] = - Map.empty, + Map.empty, outputPorts: Map[PortIdentity, (OutputPort, List[PhysicalLink], Either[Throwable, Schema])] = - Map.empty, + Map.empty, // schema propagation function propagateSchema: SchemaPropagationFunc = SchemaPropagationFunc(schemas => schemas), isOneToManyOp: Boolean = false, From ec3f5defdbf0e1713116b7709ac878de84b0006c Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Thu, 5 Sep 2024 08:41:37 +0800 Subject: [PATCH 19/20] fix a typo --- .../workflow/operators/sink/managed/ProgressiveSinkOpDesc.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/sink/managed/ProgressiveSinkOpDesc.java b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/sink/managed/ProgressiveSinkOpDesc.java index 552a6996617..900ee307ee4 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/sink/managed/ProgressiveSinkOpDesc.java +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/operators/sink/managed/ProgressiveSinkOpDesc.java @@ -59,7 +59,7 @@ public class ProgressiveSinkOpDesc extends SinkOpDesc { @Override public PhysicalOp getPhysicalOp(WorkflowIdentity workflowId, ExecutionIdentity executionId) { // Since during workflow compilation phase, the storage can be null, the writer should also be null - // the writer will be set property when workflow execution service receives the physical plan + // the writer will be set properly when workflow execution service receives the physical plan final SinkStorageWriter writer = (storage != null) ? 
storage.getStorageWriter() : null; return PhysicalOp.localPhysicalOp( workflowId, From f15a724a5013a3bf892ed08c8820e030d3f4b279 Mon Sep 17 00:00:00 2001 From: Jiadong Bai Date: Fri, 6 Sep 2024 07:56:47 +0800 Subject: [PATCH 20/20] fmt --- .../engine/architecture/deploysemantics/PhysicalOp.scala | 7 +------ .../model/websocket/request/WorkflowExecuteRequest.scala | 2 -- .../texera/workflow/common/workflow/WorkflowCompiler.scala | 2 +- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala index 67000e14fdd..581e9ebcd4d 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/deploysemantics/PhysicalOp.scala @@ -2,12 +2,7 @@ package edu.uci.ics.amber.engine.architecture.deploysemantics import akka.actor.Deploy import akka.remote.RemoteScope -import com.fasterxml.jackson.annotation.{ - JsonAutoDetect, - JsonIgnore, - JsonIgnoreProperties, - JsonProperty -} +import com.fasterxml.jackson.annotation.{JsonIgnore, JsonIgnoreProperties} import com.typesafe.scalalogging.LazyLogging import edu.uci.ics.amber.engine.architecture.common.AkkaActorService import edu.uci.ics.amber.engine.architecture.controller.execution.OperatorExecution diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala b/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala index 818e5e2fb0d..04e94187628 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/web/model/websocket/request/WorkflowExecuteRequest.scala @@ -1,8 +1,6 @@ package edu.uci.ics.texera.web.model.websocket.request import com.fasterxml.jackson.databind.annotation.JsonDeserialize -import edu.uci.ics.amber.engine.architecture.deploysemantics.PhysicalOp -import edu.uci.ics.amber.engine.common.workflow.PhysicalLink import edu.uci.ics.texera.workflow.common.operators.LogicalOp import edu.uci.ics.texera.workflow.common.workflow.{LogicalLink, WorkflowSettings} diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala index 8935e7958a9..341bfc25f52 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/common/workflow/WorkflowCompiler.scala @@ -15,7 +15,7 @@ import edu.uci.ics.texera.web.workflowruntimestate.WorkflowAggregatedState.FAILE import edu.uci.ics.texera.web.workflowruntimestate.WorkflowFatalError import edu.uci.ics.texera.workflow.common.WorkflowContext import edu.uci.ics.texera.workflow.common.storage.OpResultStorage -import edu.uci.ics.texera.workflow.common.tuple.schema.{Attribute, Schema} +import edu.uci.ics.texera.workflow.common.tuple.schema.Schema import edu.uci.ics.texera.workflow.operators.sink.managed.ProgressiveSinkOpDesc import edu.uci.ics.texera.workflow.operators.visualization.VisualizationConstants