From 6dab9db479361646ac95467dd618959b39656d93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Boschi?= Date: Mon, 2 Oct 2023 17:36:12 +0200 Subject: [PATCH] Document webcrawler source and python (#511) --- .../k8s/agents/PythonCodeAgentProvider.java | 54 ++++++ .../agents/WebCrawlerSourceAgentProvider.java | 182 ++++++++++++++++++ .../impl/k8s/PythonCodeAgentsTest.java | 12 +- .../agents/PythonCodeAgentProviderTest.java | 69 +++++++ 4 files changed, 311 insertions(+), 6 deletions(-) create mode 100644 langstream-k8s-runtime/langstream-k8s-runtime-core/src/test/java/ai/langstream/runtime/impl/k8s/agents/PythonCodeAgentProviderTest.java diff --git a/langstream-k8s-runtime/langstream-k8s-runtime-core/src/main/java/ai/langstream/runtime/impl/k8s/agents/PythonCodeAgentProvider.java b/langstream-k8s-runtime/langstream-k8s-runtime-core/src/main/java/ai/langstream/runtime/impl/k8s/agents/PythonCodeAgentProvider.java index ac7db6e92..a634bc0be 100644 --- a/langstream-k8s-runtime/langstream-k8s-runtime-core/src/main/java/ai/langstream/runtime/impl/k8s/agents/PythonCodeAgentProvider.java +++ b/langstream-k8s-runtime/langstream-k8s-runtime-core/src/main/java/ai/langstream/runtime/impl/k8s/agents/PythonCodeAgentProvider.java @@ -15,6 +15,8 @@ */ package ai.langstream.runtime.impl.k8s.agents; +import ai.langstream.api.doc.AgentConfig; +import ai.langstream.api.doc.ConfigProperty; import ai.langstream.api.model.AgentConfiguration; import ai.langstream.api.runtime.ComponentType; import ai.langstream.impl.agents.AbstractComposableAgentProvider; @@ -43,4 +45,56 @@ protected final ComponentType getComponentType(AgentConfiguration agentConfigura "Unsupported agent type: " + agentConfiguration.getType()); }; } + + @Override + protected Class getAgentConfigModelClass(String type) { + return switch (type) { + case "python-source" -> PythonSourceConfig.class; + case "python-sink" -> PythonSinkConfig.class; + case "python-processor", "python-function" -> PythonProcessorConfig.class; + default -> throw new IllegalArgumentException("Unsupported agent type: " + type); + }; + } + + @Override + protected boolean isAgentConfigModelAllowUnknownProperties(String type) { + return true; + } + + @AgentConfig( + name = "Python custom source", + description = + """ + Run a your own Python source. + All the configuration properties are available to in the class init method. + """) + public static class PythonSourceConfig extends PythonConfig {} + + @AgentConfig( + name = "Python custom sink", + description = + """ + Run a your own Python sink. + All the configuration properties are available to in the class init method. + """) + public static class PythonSinkConfig extends PythonConfig {} + + @AgentConfig( + name = "Python custom processor", + description = + """ + Run a your own Python processor. + All the configuration properties are available to in the class init method. + """) + public static class PythonProcessorConfig extends PythonConfig {} + + public static class PythonConfig { + @ConfigProperty( + description = + """ + Python class name to instantiate. This class must be present in the application's "python" files. + """, + required = true) + private String className; + } } diff --git a/langstream-k8s-runtime/langstream-k8s-runtime-core/src/main/java/ai/langstream/runtime/impl/k8s/agents/WebCrawlerSourceAgentProvider.java b/langstream-k8s-runtime/langstream-k8s-runtime-core/src/main/java/ai/langstream/runtime/impl/k8s/agents/WebCrawlerSourceAgentProvider.java index 07a22bc30..b6f4eac03 100644 --- a/langstream-k8s-runtime/langstream-k8s-runtime-core/src/main/java/ai/langstream/runtime/impl/k8s/agents/WebCrawlerSourceAgentProvider.java +++ b/langstream-k8s-runtime/langstream-k8s-runtime-core/src/main/java/ai/langstream/runtime/impl/k8s/agents/WebCrawlerSourceAgentProvider.java @@ -15,12 +15,16 @@ */ package ai.langstream.runtime.impl.k8s.agents; +import ai.langstream.api.doc.AgentConfig; +import ai.langstream.api.doc.ConfigProperty; import ai.langstream.api.model.AgentConfiguration; import ai.langstream.api.runtime.ComponentType; import ai.langstream.impl.agents.AbstractComposableAgentProvider; import ai.langstream.runtime.impl.k8s.KubernetesClusterRuntime; +import com.fasterxml.jackson.annotation.JsonProperty; import java.util.List; import java.util.Set; +import lombok.Data; import lombok.extern.slf4j.Slf4j; /** Implements support for WebCrawler Source Agents. */ @@ -35,4 +39,182 @@ public WebCrawlerSourceAgentProvider() { protected final ComponentType getComponentType(AgentConfiguration agentConfiguration) { return ComponentType.SOURCE; } + + @Data + @AgentConfig( + name = "Web crawler source", + description = + """ + Crawl a website and extract the content of the pages. + """) + public static class Config { + @ConfigProperty( + description = + """ + Configuration for handling the agent status. + The name of the bucket. + """, + defaultValue = "langstream-source") + private String bucketName; + + @ConfigProperty( + description = + """ + Configuration for handling the agent status. + The S3 endpoint.""", + defaultValue = "http://minio-endpoint.-not-set:9090") + private String endpoint; + + @ConfigProperty( + description = + """ + Configuration for handling the agent status. + Access key for the S3 server. + """, + defaultValue = "minioadmin") + @JsonProperty("access-key") + private String accessKey; + + @ConfigProperty( + description = + """ + Configuration for handling the agent status. + Secret key for the S3 server. + """, + defaultValue = "minioadmin") + @JsonProperty("secret-key") + private String secretKey; + + @ConfigProperty( + description = + """ + Configuration for handling the agent status. + Region for the S3 server. + """) + private String region = ""; + + @ConfigProperty( + description = + """ + Domains that the crawler is allowed to access. + """) + @JsonProperty("allowed-domains") + private Set allowedDomains; + + @ConfigProperty( + description = + """ + Paths that the crawler is not allowed to access. + """) + @JsonProperty("forbidden-paths") + private Set forbiddenPaths; + + @ConfigProperty( + description = + """ + Maximum number of URLs that can be crawled. + """, + defaultValue = "1000") + @JsonProperty("max-urls") + private int maxUrls; + + @ConfigProperty( + description = + """ + Maximum depth of the crawl. + """, + defaultValue = "50") + @JsonProperty("max-depth") + private int maxDepth; + + @ConfigProperty( + description = + """ + Whether to scan the HTML documents to find links to other pages. + """, + defaultValue = "true") + @JsonProperty("handle-robots-file") + private boolean handleRobotsFile; + + @ConfigProperty( + description = + """ + Whether to scan HTML documents for links to other sites. + """, + defaultValue = "true") + @JsonProperty("scan-html-documents") + private boolean scanHtmlDocuments; + + @ConfigProperty( + description = + """ + The starting URLs for the crawl. + """) + @JsonProperty("seed-urls") + private Set seedUrls; + + @ConfigProperty( + description = + """ + Time interval between reindexing of the pages. + """, + defaultValue = (60 * 60 * 24) + "") + @JsonProperty("reindex-interval-seconds") + private int reindexIntervalSeconds; + + @ConfigProperty( + description = + """ + Maximum number of unflushed pages before the agent persists the crawl data. + """, + defaultValue = "100") + @JsonProperty("max-unflushed-pages") + private int maxUnflushedPages; + + @ConfigProperty( + description = + """ + Minimum time between two requests to the same domain. (in milliseconds) + """, + defaultValue = "500") + @JsonProperty("min-time-between-requests") + private int minTimeBetweenRequests; + + @ConfigProperty( + description = + """ + User agent to use for the requests. + """, + defaultValue = + "Mozilla/5.0 (compatible; LangStream.ai/0.1; +https://langstream.ai)") + @JsonProperty("user-agent") + private String userAgent; + + @ConfigProperty( + description = + """ + Maximum number of errors allowed before stopping. + """, + defaultValue = "5") + @JsonProperty("max-error-count") + private int maxErrorCount; + + @ConfigProperty( + description = + """ + Timeout for HTTP requests. (in milliseconds) + """, + defaultValue = "10000") + @JsonProperty("http-timeout") + private int httpTimeout; + + @ConfigProperty( + description = + """ + Whether to handle cookies. + """, + defaultValue = "true") + @JsonProperty("handle-cookies") + private boolean handleCookies; + } } diff --git a/langstream-k8s-runtime/langstream-k8s-runtime-core/src/test/java/ai/langstream/runtime/impl/k8s/PythonCodeAgentsTest.java b/langstream-k8s-runtime/langstream-k8s-runtime-core/src/test/java/ai/langstream/runtime/impl/k8s/PythonCodeAgentsTest.java index 0e7c88203..27de6392a 100644 --- a/langstream-k8s-runtime/langstream-k8s-runtime-core/src/test/java/ai/langstream/runtime/impl/k8s/PythonCodeAgentsTest.java +++ b/langstream-k8s-runtime/langstream-k8s-runtime-core/src/test/java/ai/langstream/runtime/impl/k8s/PythonCodeAgentsTest.java @@ -63,14 +63,14 @@ public void testConfigurePythonAgents() throws Exception { id: "source1" type: "python-source" configuration: - agent.class: my.python.module.MyClass + className: my.python.module.MyClass config1: value1 config2: value2 - name: "process1" id: "process1" type: "python-processor" configuration: - agent.class: my.python.module.MyClass + className: my.python.module.MyClass config1: value1 config2: value2 composable: false @@ -78,7 +78,7 @@ public void testConfigurePythonAgents() throws Exception { id: "sink1" type: "python-sink" configuration: - agent.class: my.python.module.MyClass + className: my.python.module.MyClass config1: value1 config2: value2 """), @@ -111,7 +111,7 @@ public void testConfigurePythonAgents() throws Exception { DefaultAgentNode step = (DefaultAgentNode) agentImplementation; Map configuration = step.getConfiguration(); log.info("Configuration: {}", configuration); - assertEquals("my.python.module.MyClass", configuration.get("agent.class")); + assertEquals("my.python.module.MyClass", configuration.get("className")); assertEquals("value1", configuration.get("config1")); assertEquals("value2", configuration.get("config2")); assertEquals(ComponentType.SOURCE, step.getComponentType()); @@ -124,7 +124,7 @@ public void testConfigurePythonAgents() throws Exception { DefaultAgentNode step = (DefaultAgentNode) agentImplementation; Map configuration = step.getConfiguration(); log.info("Configuration: {}", configuration); - assertEquals("my.python.module.MyClass", configuration.get("agent.class")); + assertEquals("my.python.module.MyClass", configuration.get("className")); assertEquals("value1", configuration.get("config1")); assertEquals("value2", configuration.get("config2")); assertEquals(ComponentType.PROCESSOR, step.getComponentType()); @@ -136,7 +136,7 @@ public void testConfigurePythonAgents() throws Exception { DefaultAgentNode step = (DefaultAgentNode) agentImplementation; Map configuration = step.getConfiguration(); log.info("Configuration: {}", configuration); - assertEquals("my.python.module.MyClass", configuration.get("agent.class")); + assertEquals("my.python.module.MyClass", configuration.get("className")); assertEquals("value1", configuration.get("config1")); assertEquals("value2", configuration.get("config2")); assertEquals(ComponentType.SINK, step.getComponentType()); diff --git a/langstream-k8s-runtime/langstream-k8s-runtime-core/src/test/java/ai/langstream/runtime/impl/k8s/agents/PythonCodeAgentProviderTest.java b/langstream-k8s-runtime/langstream-k8s-runtime-core/src/test/java/ai/langstream/runtime/impl/k8s/agents/PythonCodeAgentProviderTest.java new file mode 100644 index 000000000..893d0afa8 --- /dev/null +++ b/langstream-k8s-runtime/langstream-k8s-runtime-core/src/test/java/ai/langstream/runtime/impl/k8s/agents/PythonCodeAgentProviderTest.java @@ -0,0 +1,69 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ai.langstream.runtime.impl.k8s.agents; + +import static org.junit.jupiter.api.Assertions.*; + +import lombok.SneakyThrows; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +class PythonCodeAgentProviderTest { + @ParameterizedTest + @ValueSource(strings = {"python-source", "python-sink", "python-processor", "python-function"}) + @SneakyThrows + public void testValidation(String type) { + validate( + """ + topics: [] + pipeline: + - name: "python1" + type: "%s" + configuration: + a-field: "val" + """ + .formatted(type), + "Found error on an agent configuration (agent: 'python1', type: '%s'). Property 'className' is required" + .formatted(type)); + validate( + """ + topics: [] + pipeline: + - name: "python1" + type: "%s" + configuration: {} + """ + .formatted(type), + "Found error on an agent configuration (agent: 'python1', type: '%s'). Property 'className' is required" + .formatted(type)); + validate( + """ + topics: [] + pipeline: + - name: "python1" + type: "%s" + configuration: + className: my.class + other: true + """ + .formatted(type), + null); + } + + private void validate(String pipeline, String expectErrMessage) throws Exception { + AgentValidationTestUtil.validate(pipeline, expectErrMessage); + } +}