From d6f82632c862b800142a86b2f740b075e484f4b8 Mon Sep 17 00:00:00 2001
From: Ted Benson <edward.benson@gmail.com>
Date: Tue, 1 Oct 2024 16:15:32 -0400
Subject: [PATCH] Fixed stray tests and added a few more

---
 fixtures/crawlers.yml                 | 13 ++++-
 src/index.ts                          | 20 ++------
 src/patterns.json                     | 48 ++++++++++++++---
 tests/spec/__snapshots__/test.ts.snap |  4 --
 tests/spec/test.ts                    | 74 +++++----------------------
 5 files changed, 72 insertions(+), 87 deletions(-)

diff --git a/fixtures/crawlers.yml b/fixtures/crawlers.yml
index ba62474..679699f 100644
--- a/fixtures/crawlers.yml
+++ b/fixtures/crawlers.yml
@@ -1,8 +1,19 @@
 Google Notebook LLM:
   - Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)GoogleOther
+Google Extended:
+  - Google-Extended
 OpenAI SearchBot:
   - Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot
 OpenAI ChatGPT User:
   - Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot
 OpenAI GPTBot:
-  - Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot
\ No newline at end of file
+  - Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot
+ClaudBot:
+  - Claude-Web/1.0 (web crawler; +https://www.anthropic.com/; bots@anthropic.com)
+  - ClaudeBot
+  - anthropic-ai
+  - Claude-Web
+PerplexityBot:
+  - PerplexityBot
+Cohere:
+  - cohere-ai
diff --git a/src/index.ts b/src/index.ts
index 8a1c073..a6cd06d 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -1,23 +1,17 @@
 import patternsList from "./patterns.json";
 import { fullPattern } from "./pattern";
 
-/**
- * Naive bot pattern.
- */
-const naivePattern = /bot|crawl|http|lighthouse|scan|search|spider/i;
-
 let pattern: RegExp;
-export function getPattern(): RegExp {
+export function getPattern(): RegExp | null {
 	if (pattern instanceof RegExp) {
 		return pattern;
 	}
 	try {
 		// Build this RegExp dynamically to avoid syntax errors in older engines.
-		pattern = new RegExp(fullPattern, "i");
+		return new RegExp(fullPattern, "i");
 	} catch (error) {
-		pattern = naivePattern;
+		return null;
 	}
-	return pattern;
 }
 
 /**
@@ -25,17 +19,11 @@ export function getPattern(): RegExp {
  */
 export const list: string[] = patternsList.map((pattern) => pattern.pattern);
 
-/**
- * Check if the given user agent includes a bot pattern. Naive implementation (less accurate).
- */
-export const isaiNaive = (userAgent?: string | null): boolean =>
-	Boolean(userAgent) && naivePattern.test(userAgent);
-
 /**
  * Check if the given user agent includes a bot pattern.
  */
 export function isai(userAgent?: string | null): boolean {
-	return Boolean(userAgent) && getPattern().test(userAgent);
+	return Boolean(userAgent) && getPattern()?.test(userAgent) || false;
 }
 
 /**
diff --git a/src/patterns.json b/src/patterns.json
index 2ac2353..5ccd88c 100644
--- a/src/patterns.json
+++ b/src/patterns.json
@@ -10,7 +10,7 @@
     ]
   },
   {
-    "pattern": "https://openai.com/gptbot", 
+    "pattern": "openai.*bot", 
     "confidence": 1.0,
     "reasonsFor": [
       "Published user agent substring by OpenAI"
@@ -19,21 +19,57 @@
     ]
   },
   {
-    "pattern": "https://openai.com/searchbot", 
+    "pattern": "^ClaudeBot", 
     "confidence": 1.0,
     "reasonsFor": [
-      "Published user agent substring by OpenAI"
+      "Reported Anthropic Claude user agent string"
     ],
     "reasonsAgainst": [
     ]
   },
   {
-    "pattern": "https://openai.com/bot", 
+    "pattern": "^anthropic", 
     "confidence": 1.0,
     "reasonsFor": [
-      "Published user agent substring by OpenAI"
+      "Reported Anthropic Claude user agent string"
+    ],
+    "reasonsAgainst": [
+    ]
+  },
+  {
+    "pattern": "^Claude-Web", 
+    "confidence": 1.0,
+    "reasonsFor": [
+      "Reported Anthropic Claude user agent string"
+    ],
+    "reasonsAgainst": [
+    ]
+  },
+  {
+    "pattern": "^PerplexityBot", 
+    "confidence": 1.0,
+    "reasonsFor": [
+      "Reported Perplexity bot string" 
+    ],
+    "reasonsAgainst": [
+    ]
+  },
+  {
+    "pattern": "^cohere-ai", 
+    "confidence": 1.0,
+    "reasonsFor": [
+      "Reported Cohere bot string" 
+    ],
+    "reasonsAgainst": [
+    ]
+  },
+  {
+    "pattern": "^Google-Extended", 
+    "confidence": 1.0,
+    "reasonsFor": [
+      "Reported Google Bard bot string" 
     ],
     "reasonsAgainst": [
     ]
-  }
+  }  
 ]
diff --git a/tests/spec/__snapshots__/test.ts.snap b/tests/spec/__snapshots__/test.ts.snap
index fdceb4e..acef6ed 100644
--- a/tests/spec/__snapshots__/test.ts.snap
+++ b/tests/spec/__snapshots__/test.ts.snap
@@ -6,10 +6,6 @@ exports[`isai module interface interface is as expected 1`] = `
     "list",
     "Array",
   ],
-  [
-    "isaiNaive",
-    "Function",
-  ],
   [
     "createisai",
     "Function",
diff --git a/tests/spec/test.ts b/tests/spec/test.ts
index 0523057..de9e243 100644
--- a/tests/spec/test.ts
+++ b/tests/spec/test.ts
@@ -2,7 +2,6 @@ import {
 	getPattern,
 	list,
 	isai,
-	isaiNaive,
 	isaiMatch,
 	isaiMatches,
 	isaiPattern,
@@ -19,16 +18,6 @@ const AI_USER_AGENT_EXAMPLE =
 const BROWSER_USER_AGENT_EXAMPLE =
 	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91 Safari/537.36";
 
-const USER_AGENT_COMMON = [
-	"Ada Chat Bot/1.0 Request Block",
-	"Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
-	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4590.2 Safari/537.36 Chrome-Lighthouse",
-];
-const USER_AGENT_GOTCHAS = [
-	"Mozilla/5.0 (Linux; Android 10; CUBOT_X30) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.85 Mobile Safari/537.36",
-	"PS4Application libhttp/1.000 (PS4) CoreMedia libhttp/6.72 (PlayStation 4)",
-];
-
 describe("isai", () => {
 	describe("features", () => {
 		test("pattern: pattern is a regex", () => {
@@ -42,20 +31,20 @@ describe("isai", () => {
 			expect(isai(AI_USER_AGENT_EXAMPLE)).toBe(true);
 		});
 		test("isaiMatch: find pattern in bot user agent string", () => {
-			expect(isaiMatch(AI_USER_AGENT_EXAMPLE)).toBe("https://openai.com/searchbot");
+			expect(isaiMatch(AI_USER_AGENT_EXAMPLE)).toBe("openai.com/searchbot");
 		});
 		test("isaiMatches: find all patterns in bot user agent string", () => {
-			expect(isaiMatches(AI_USER_AGENT_EXAMPLE)).toContain("https://openai.com/searchbot");
+			expect(isaiMatches(AI_USER_AGENT_EXAMPLE)).toContain("openai.com/searchbot");
 			expect(isaiMatches(AI_USER_AGENT_EXAMPLE)).toHaveLength(1);
 		});
 		test("isaiPattern: find first pattern in bot user agent string", () => {
 			expect(isaiPattern(AI_USER_AGENT_EXAMPLE)).toBe(
-				"https://openai.com/searchbot",
+				"openai.*bot",
 			);
 		});
 		test("isaiPatterns: find all patterns in bot user agent string", () => {
 			expect(isaiPatterns(AI_USER_AGENT_EXAMPLE)).toContain(
-				"https://openai.com/searchbot",
+				"openai.*bot",
 			);
 			expect(isaiPatterns(AI_USER_AGENT_EXAMPLE)).toHaveLength(1);
 		});
@@ -64,19 +53,19 @@ describe("isai", () => {
 			expect(customisai(AI_USER_AGENT_EXAMPLE)).toBe(true);
 		});
 		test("createisaiFromList: create custom isai function with custom pattern", () => {
-			const ChromeLighthouseUserAgentStrings: string[] = [
-				"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot",
-				"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot",
+			const ToRemoveStrings: string[] = [
+				"openai.*bot"
 			];
 			const patternsToRemove: Set<string> = new Set(
-				ChromeLighthouseUserAgentStrings.map(isaiMatches).flat(),
+				ToRemoveStrings.map(isaiMatches).flat(),
 			);
-			const isai2 = createisaiFromList(
-				list.filter(
-					(record: string): boolean => patternsToRemove.has(record) === false,
-				),
+			expect(patternsToRemove.size).toBeGreaterThan(0);
+			const list2 = list.filter(
+				(record: string): boolean => patternsToRemove.has(record) === false,
 			);
-			const [ua] = ChromeLighthouseUserAgentStrings;
+			expect(list2.length).toBeLessThan(list.length);
+			const isai2 = createisaiFromList(list2);
+			const ua = "https://openai.com/gptbot"
 			expect(isai(ua)).toBe(true);
 			expect(isai2(ua)).toBe(false);
 		});
@@ -92,27 +81,6 @@ describe("isai", () => {
 		);
 	});
 
-	describe("isaiNaive", () => {
-		test.each([75])(
-			"a large number of user agent strings can be detected (>%s%)",
-			(percent) => {
-				const ratio =
-					crawlers.filter((ua) => isaiNaive(ua)).length / crawlers.length;
-				expect(ratio).toBeLessThanOrEqual(1);
-				expect(ratio).toBeGreaterThan(percent / 100);
-			},
-		);
-		test.each([1])(
-			"a small number of browsers is falsly detected as bots (<%s%)",
-			(percent) => {
-				const ratio =
-					browsers.filter((ua) => isaiNaive(ua)).length / browsers.length;
-				expect(ratio).toBeGreaterThan(0);
-				expect(ratio).toBeLessThan(percent / 100);
-			},
-		);
-	});
-
 	describe("regex fallback", () => {
 		beforeAll(async () => {
 			jest
@@ -132,20 +100,6 @@ describe("isai", () => {
 		afterAll(() => {
 			jest.restoreAllMocks();
 		});
-		test("fallback regex detects commong crawlers", () => {
-			USER_AGENT_COMMON.forEach((ua) => {
-				if (!isaiInstance(ua)) {
-					throw new Error(`Failed to detect ${ua} as bot`);
-				}
-			});
-		});
-		test("fallback detects gotchas as bots", () => {
-			USER_AGENT_GOTCHAS.forEach((ua) => {
-				if (!isaiInstance(ua)) {
-					throw new Error(`Failed to detect ${ua} as bot (gotcha)`);
-				}
-			});
-		});
 		test("fallback does not detect browser as bot", () => {
 			expect(isaiInstance(BROWSER_USER_AGENT_EXAMPLE)).toBe(false);
 		});
@@ -189,7 +143,7 @@ describe("isai", () => {
 		});
 		test("regular expressions exports are as expected", () => {
 			expect(new RegExp(fullPattern, "i").toString()).toBe(
-				getPattern().toString(),
+				getPattern()?.toString(),
 			);
 		});
 	});