Skip to content

Commit

Permalink
Document webcrawler source and python (#511)
Browse files Browse the repository at this point in the history
  • Loading branch information
nicoloboschi authored Oct 2, 2023
1 parent 07425de commit 6dab9db
Show file tree
Hide file tree
Showing 4 changed files with 311 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
*/
package ai.langstream.runtime.impl.k8s.agents;

import ai.langstream.api.doc.AgentConfig;
import ai.langstream.api.doc.ConfigProperty;
import ai.langstream.api.model.AgentConfiguration;
import ai.langstream.api.runtime.ComponentType;
import ai.langstream.impl.agents.AbstractComposableAgentProvider;
Expand Down Expand Up @@ -43,4 +45,56 @@ protected final ComponentType getComponentType(AgentConfiguration agentConfigura
"Unsupported agent type: " + agentConfiguration.getType());
};
}

@Override
protected Class getAgentConfigModelClass(String type) {
return switch (type) {
case "python-source" -> PythonSourceConfig.class;
case "python-sink" -> PythonSinkConfig.class;
case "python-processor", "python-function" -> PythonProcessorConfig.class;
default -> throw new IllegalArgumentException("Unsupported agent type: " + type);
};
}

@Override
protected boolean isAgentConfigModelAllowUnknownProperties(String type) {
return true;
}

@AgentConfig(
name = "Python custom source",
description =
"""
Run a your own Python source.
All the configuration properties are available to in the class init method.
""")
public static class PythonSourceConfig extends PythonConfig {}

@AgentConfig(
name = "Python custom sink",
description =
"""
Run a your own Python sink.
All the configuration properties are available to in the class init method.
""")
public static class PythonSinkConfig extends PythonConfig {}

@AgentConfig(
name = "Python custom processor",
description =
"""
Run a your own Python processor.
All the configuration properties are available to in the class init method.
""")
public static class PythonProcessorConfig extends PythonConfig {}

public static class PythonConfig {
@ConfigProperty(
description =
"""
Python class name to instantiate. This class must be present in the application's "python" files.
""",
required = true)
private String className;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,16 @@
*/
package ai.langstream.runtime.impl.k8s.agents;

import ai.langstream.api.doc.AgentConfig;
import ai.langstream.api.doc.ConfigProperty;
import ai.langstream.api.model.AgentConfiguration;
import ai.langstream.api.runtime.ComponentType;
import ai.langstream.impl.agents.AbstractComposableAgentProvider;
import ai.langstream.runtime.impl.k8s.KubernetesClusterRuntime;
import com.fasterxml.jackson.annotation.JsonProperty;
import java.util.List;
import java.util.Set;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;

/** Implements support for WebCrawler Source Agents. */
Expand All @@ -35,4 +39,182 @@ public WebCrawlerSourceAgentProvider() {
protected final ComponentType getComponentType(AgentConfiguration agentConfiguration) {
return ComponentType.SOURCE;
}

@Data
@AgentConfig(
name = "Web crawler source",
description =
"""
Crawl a website and extract the content of the pages.
""")
public static class Config {
@ConfigProperty(
description =
"""
Configuration for handling the agent status.
The name of the bucket.
""",
defaultValue = "langstream-source")
private String bucketName;

@ConfigProperty(
description =
"""
Configuration for handling the agent status.
The S3 endpoint.""",
defaultValue = "http://minio-endpoint.-not-set:9090")
private String endpoint;

@ConfigProperty(
description =
"""
Configuration for handling the agent status.
Access key for the S3 server.
""",
defaultValue = "minioadmin")
@JsonProperty("access-key")
private String accessKey;

@ConfigProperty(
description =
"""
Configuration for handling the agent status.
Secret key for the S3 server.
""",
defaultValue = "minioadmin")
@JsonProperty("secret-key")
private String secretKey;

@ConfigProperty(
description =
"""
Configuration for handling the agent status.
Region for the S3 server.
""")
private String region = "";

@ConfigProperty(
description =
"""
Domains that the crawler is allowed to access.
""")
@JsonProperty("allowed-domains")
private Set<String> allowedDomains;

@ConfigProperty(
description =
"""
Paths that the crawler is not allowed to access.
""")
@JsonProperty("forbidden-paths")
private Set<String> forbiddenPaths;

@ConfigProperty(
description =
"""
Maximum number of URLs that can be crawled.
""",
defaultValue = "1000")
@JsonProperty("max-urls")
private int maxUrls;

@ConfigProperty(
description =
"""
Maximum depth of the crawl.
""",
defaultValue = "50")
@JsonProperty("max-depth")
private int maxDepth;

@ConfigProperty(
description =
"""
Whether to scan the HTML documents to find links to other pages.
""",
defaultValue = "true")
@JsonProperty("handle-robots-file")
private boolean handleRobotsFile;

@ConfigProperty(
description =
"""
Whether to scan HTML documents for links to other sites.
""",
defaultValue = "true")
@JsonProperty("scan-html-documents")
private boolean scanHtmlDocuments;

@ConfigProperty(
description =
"""
The starting URLs for the crawl.
""")
@JsonProperty("seed-urls")
private Set<String> seedUrls;

@ConfigProperty(
description =
"""
Time interval between reindexing of the pages.
""",
defaultValue = (60 * 60 * 24) + "")
@JsonProperty("reindex-interval-seconds")
private int reindexIntervalSeconds;

@ConfigProperty(
description =
"""
Maximum number of unflushed pages before the agent persists the crawl data.
""",
defaultValue = "100")
@JsonProperty("max-unflushed-pages")
private int maxUnflushedPages;

@ConfigProperty(
description =
"""
Minimum time between two requests to the same domain. (in milliseconds)
""",
defaultValue = "500")
@JsonProperty("min-time-between-requests")
private int minTimeBetweenRequests;

@ConfigProperty(
description =
"""
User agent to use for the requests.
""",
defaultValue =
"Mozilla/5.0 (compatible; LangStream.ai/0.1; +https://langstream.ai)")
@JsonProperty("user-agent")
private String userAgent;

@ConfigProperty(
description =
"""
Maximum number of errors allowed before stopping.
""",
defaultValue = "5")
@JsonProperty("max-error-count")
private int maxErrorCount;

@ConfigProperty(
description =
"""
Timeout for HTTP requests. (in milliseconds)
""",
defaultValue = "10000")
@JsonProperty("http-timeout")
private int httpTimeout;

@ConfigProperty(
description =
"""
Whether to handle cookies.
""",
defaultValue = "true")
@JsonProperty("handle-cookies")
private boolean handleCookies;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -63,22 +63,22 @@ public void testConfigurePythonAgents() throws Exception {
id: "source1"
type: "python-source"
configuration:
agent.class: my.python.module.MyClass
className: my.python.module.MyClass
config1: value1
config2: value2
- name: "process1"
id: "process1"
type: "python-processor"
configuration:
agent.class: my.python.module.MyClass
className: my.python.module.MyClass
config1: value1
config2: value2
composable: false
- name: "sink1"
id: "sink1"
type: "python-sink"
configuration:
agent.class: my.python.module.MyClass
className: my.python.module.MyClass
config1: value1
config2: value2
"""),
Expand Down Expand Up @@ -111,7 +111,7 @@ public void testConfigurePythonAgents() throws Exception {
DefaultAgentNode step = (DefaultAgentNode) agentImplementation;
Map<String, Object> configuration = step.getConfiguration();
log.info("Configuration: {}", configuration);
assertEquals("my.python.module.MyClass", configuration.get("agent.class"));
assertEquals("my.python.module.MyClass", configuration.get("className"));
assertEquals("value1", configuration.get("config1"));
assertEquals("value2", configuration.get("config2"));
assertEquals(ComponentType.SOURCE, step.getComponentType());
Expand All @@ -124,7 +124,7 @@ public void testConfigurePythonAgents() throws Exception {
DefaultAgentNode step = (DefaultAgentNode) agentImplementation;
Map<String, Object> configuration = step.getConfiguration();
log.info("Configuration: {}", configuration);
assertEquals("my.python.module.MyClass", configuration.get("agent.class"));
assertEquals("my.python.module.MyClass", configuration.get("className"));
assertEquals("value1", configuration.get("config1"));
assertEquals("value2", configuration.get("config2"));
assertEquals(ComponentType.PROCESSOR, step.getComponentType());
Expand All @@ -136,7 +136,7 @@ public void testConfigurePythonAgents() throws Exception {
DefaultAgentNode step = (DefaultAgentNode) agentImplementation;
Map<String, Object> configuration = step.getConfiguration();
log.info("Configuration: {}", configuration);
assertEquals("my.python.module.MyClass", configuration.get("agent.class"));
assertEquals("my.python.module.MyClass", configuration.get("className"));
assertEquals("value1", configuration.get("config1"));
assertEquals("value2", configuration.get("config2"));
assertEquals(ComponentType.SINK, step.getComponentType());
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Copyright DataStax, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ai.langstream.runtime.impl.k8s.agents;

import static org.junit.jupiter.api.Assertions.*;

import lombok.SneakyThrows;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

class PythonCodeAgentProviderTest {
@ParameterizedTest
@ValueSource(strings = {"python-source", "python-sink", "python-processor", "python-function"})
@SneakyThrows
public void testValidation(String type) {
validate(
"""
topics: []
pipeline:
- name: "python1"
type: "%s"
configuration:
a-field: "val"
"""
.formatted(type),
"Found error on an agent configuration (agent: 'python1', type: '%s'). Property 'className' is required"
.formatted(type));
validate(
"""
topics: []
pipeline:
- name: "python1"
type: "%s"
configuration: {}
"""
.formatted(type),
"Found error on an agent configuration (agent: 'python1', type: '%s'). Property 'className' is required"
.formatted(type));
validate(
"""
topics: []
pipeline:
- name: "python1"
type: "%s"
configuration:
className: my.class
other: true
"""
.formatted(type),
null);
}

private void validate(String pipeline, String expectErrMessage) throws Exception {
AgentValidationTestUtil.validate(pipeline, expectErrMessage);
}
}

0 comments on commit 6dab9db

Please sign in to comment.