LangStream · eolivelli · Nov 30, 2023 · Nov 30, 2023
diff --git a/...ream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java b/...ream-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/WebCrawlerSource.java
@@ -115,11 +115,13 @@ public void init(Map<String, Object> configuration) throws Exception {
         String userAgent = getString("user-agent", DEFAULT_USER_AGENT, configuration);
         int maxErrorCount = getInt("max-error-count", 5, configuration);
         int httpTimeout = getInt("http-timeout", 10000, configuration);
+        boolean allowNonHtmlContents = getBoolean("allow-non-html-contents", false, configuration);
 
         boolean handleCookies = getBoolean("handle-cookies", true, configuration);
 
         log.info("allowed-domains: {}", allowedDomains);
         log.info("forbidden-paths: {}", forbiddenPaths);
+        log.info("allow-non-html-contents: {}", allowNonHtmlContents);
         log.info("seed-urls: {}", seedUrls);
         log.info("max-urls: {}", maxUrls);
         log.info("max-depth: {}", maxDepth);
@@ -133,6 +135,7 @@ public void init(Map<String, Object> configuration) throws Exception {
         WebCrawlerConfiguration webCrawlerConfiguration =
                 WebCrawlerConfiguration.builder()
                         .allowedDomains(allowedDomains)
+                        .allowNonHtmlContents(allowNonHtmlContents)
                         .maxUrls(maxUrls)
                         .maxDepth(maxDepth)
                         .forbiddenPaths(forbiddenPaths)

diff --git a/...am-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawler.java b/...am-agent-webcrawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawler.java
@@ -204,7 +204,7 @@ public boolean runCycle() throws Exception {
             // we did something
             return true;
         } catch (UnsupportedMimeTypeException notHtml) {
-            if (configuration.isAllowNonHtmlContent()) {
+            if (configuration.isAllowNonHtmlContents()) {
                 log.info(
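-                        "Url {} lead to a {} content-type document. allow-not-html-content is true, so we are processing it",
+                        "Url {} lead to a {} content-type document. allow-non-html-contents is true, so we are processing it",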
                         current,

diff --git a/...rawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawlerConfiguration.java b/...rawler/src/main/java/ai/langstream/agents/webcrawler/crawler/WebCrawlerConfiguration.java
@@ -39,7 +39,7 @@ public class WebCrawlerConfiguration {
     @Builder.Default private boolean handleCookies = true;
     @Builder.Default private boolean handleRobotsFile = true;
     @Builder.Default private boolean scanHtmlDocuments = true;
-    @Builder.Default private boolean allowNonHtmlContent = false;
+    @Builder.Default private boolean allowNonHtmlContents = false;
 
     @Builder.Default private Set<String> allowedTags = Set.of("a");
 

diff --git a/...gent-webcrawler/src/test/java/ai/langstream/agents/webcrawler/crawler/WebCrawlerTest.java b/...gent-webcrawler/src/test/java/ai/langstream/agents/webcrawler/crawler/WebCrawlerTest.java
@@ -345,7 +345,7 @@ void testBinaryContent(WireMockRuntimeInfo vmRuntimeInfo) throws Exception {
         WebCrawlerConfiguration configuration =
                 WebCrawlerConfiguration.builder()
                         .allowedDomains(Set.of(vmRuntimeInfo.getHttpBaseUrl()))
-                        .allowNonHtmlContent(true)
+                        .allowNonHtmlContents(true)
                         .handleRobotsFile(false)
                         .maxErrorCount(5)
                         .build();

diff --git a/...re/src/main/java/ai/langstream/runtime/impl/k8s/agents/WebCrawlerSourceAgentProvider.java b/...re/src/main/java/ai/langstream/runtime/impl/k8s/agents/WebCrawlerSourceAgentProvider.java
@@ -203,6 +203,15 @@ public static class Config {
         @JsonProperty("scan-html-documents")
         private boolean scanHtmlDocuments;
 
+        @ConfigProperty(
+                description =
+                        """
+                Whether to emit non-HTML documents to the pipeline (e.g. PDF files).
+                                """,
+                defaultValue = "false")
+        @JsonProperty("allow-non-html-contents")
+        private boolean allowNonHtmlContents;
+
         @ConfigProperty(
                 description =
                         """

diff --git a/...runtime/langstream-runtime-impl/src/test/java/ai/langstream/kafka/WebCrawlerSourceIT.java b/...runtime/langstream-runtime-impl/src/test/java/ai/langstream/kafka/WebCrawlerSourceIT.java
@@ -93,6 +93,7 @@ public void test(WireMockRuntimeInfo vmRuntimeInfo) throws Exception {
                                     output: "${globals.output-topic}"
                                     configuration:\s
                                         seed-urls: ["%s/index.html"]
+                                        allow-non-html-contents: true
                                         allowed-domains: ["%s"]
                                         state-storage: disk
                                 """