From d6f82632c862b800142a86b2f740b075e484f4b8 Mon Sep 17 00:00:00 2001 From: Ted Benson Date: Tue, 1 Oct 2024 16:15:32 -0400 Subject: [PATCH] Fixed stray tests and added a few more --- fixtures/crawlers.yml | 13 ++++- src/index.ts | 20 ++------ src/patterns.json | 48 ++++++++++++++--- tests/spec/__snapshots__/test.ts.snap | 4 -- tests/spec/test.ts | 74 +++++---------------------- 5 files changed, 72 insertions(+), 87 deletions(-) diff --git a/fixtures/crawlers.yml b/fixtures/crawlers.yml index ba62474..679699f 100644 --- a/fixtures/crawlers.yml +++ b/fixtures/crawlers.yml @@ -1,8 +1,19 @@ Google Notebook LLM: - Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)GoogleOther +Google Extended: + - Google-Extended OpenAI SearchBot: - Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot OpenAI ChatGPT User: - Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot OpenAI GPTBot: - - Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot \ No newline at end of file + - Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot +ClaudBot: + - Claude-Web/1.0 (web crawler; +https://www.anthropic.com/; bots@anthropic.com) + - ClaudeBot + - anthropic-ai + - Claude-Web +PerplexityBot: + - PerplexityBot +Cohere: + - cohere-ai diff --git a/src/index.ts b/src/index.ts index 8a1c073..a6cd06d 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,23 +1,17 @@ import patternsList from "./patterns.json"; import { fullPattern } from "./pattern"; -/** - * Naive bot pattern. - */ -const naivePattern = /bot|crawl|http|lighthouse|scan|search|spider/i; - let pattern: RegExp; -export function getPattern(): RegExp { +export function getPattern(): RegExp | null { if (pattern instanceof RegExp) { return pattern; } try { // Build this RegExp dynamically to avoid syntax errors in older engines. - pattern = new RegExp(fullPattern, "i"); + return new RegExp(fullPattern, "i"); } catch (error) { - pattern = naivePattern; + return null; } - return pattern; } /** @@ -25,17 +19,11 @@ export function getPattern(): RegExp { */ export const list: string[] = patternsList.map((pattern) => pattern.pattern); -/** - * Check if the given user agent includes a bot pattern. Naive implementation (less accurate). - */ -export const isaiNaive = (userAgent?: string | null): boolean => - Boolean(userAgent) && naivePattern.test(userAgent); - /** * Check if the given user agent includes a bot pattern. */ export function isai(userAgent?: string | null): boolean { - return Boolean(userAgent) && getPattern().test(userAgent); + return Boolean(userAgent) && getPattern()?.test(userAgent) || false; } /** diff --git a/src/patterns.json b/src/patterns.json index 2ac2353..5ccd88c 100644 --- a/src/patterns.json +++ b/src/patterns.json @@ -10,7 +10,7 @@ ] }, { - "pattern": "https://openai.com/gptbot", + "pattern": "openai.*bot", "confidence": 1.0, "reasonsFor": [ "Published user agent substring by OpenAI" @@ -19,21 +19,57 @@ ] }, { - "pattern": "https://openai.com/searchbot", + "pattern": "^ClaudeBot", "confidence": 1.0, "reasonsFor": [ - "Published user agent substring by OpenAI" + "Reported Anthropic Claude user agent string" ], "reasonsAgainst": [ ] }, { - "pattern": "https://openai.com/bot", + "pattern": "^anthropic", "confidence": 1.0, "reasonsFor": [ - "Published user agent substring by OpenAI" + "Reported Anthropic Claude user agent string" + ], + "reasonsAgainst": [ + ] + }, + { + "pattern": "^Claude-Web", + "confidence": 1.0, + "reasonsFor": [ + "Reported Anthropic Claude user agent string" + ], + "reasonsAgainst": [ + ] + }, + { + "pattern": "^PerplexityBot", + "confidence": 1.0, + "reasonsFor": [ + "Reported Perplexity bot string" + ], + "reasonsAgainst": [ + ] + }, + { + "pattern": "^cohere-ai", + "confidence": 1.0, + "reasonsFor": [ + "Reported Cohere bot string" + ], + "reasonsAgainst": [ + ] + }, + { + "pattern": "^Google-Extended", + "confidence": 1.0, + "reasonsFor": [ + "Reported Google Bard bot string" ], "reasonsAgainst": [ ] - } + } ] diff --git a/tests/spec/__snapshots__/test.ts.snap b/tests/spec/__snapshots__/test.ts.snap index fdceb4e..acef6ed 100644 --- a/tests/spec/__snapshots__/test.ts.snap +++ b/tests/spec/__snapshots__/test.ts.snap @@ -6,10 +6,6 @@ exports[`isai module interface interface is as expected 1`] = ` "list", "Array", ], - [ - "isaiNaive", - "Function", - ], [ "createisai", "Function", diff --git a/tests/spec/test.ts b/tests/spec/test.ts index 0523057..de9e243 100644 --- a/tests/spec/test.ts +++ b/tests/spec/test.ts @@ -2,7 +2,6 @@ import { getPattern, list, isai, - isaiNaive, isaiMatch, isaiMatches, isaiPattern, @@ -19,16 +18,6 @@ const AI_USER_AGENT_EXAMPLE = const BROWSER_USER_AGENT_EXAMPLE = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91 Safari/537.36"; -const USER_AGENT_COMMON = [ - "Ada Chat Bot/1.0 Request Block", - "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4590.2 Safari/537.36 Chrome-Lighthouse", -]; -const USER_AGENT_GOTCHAS = [ - "Mozilla/5.0 (Linux; Android 10; CUBOT_X30) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.85 Mobile Safari/537.36", - "PS4Application libhttp/1.000 (PS4) CoreMedia libhttp/6.72 (PlayStation 4)", -]; - describe("isai", () => { describe("features", () => { test("pattern: pattern is a regex", () => { @@ -42,20 +31,20 @@ describe("isai", () => { expect(isai(AI_USER_AGENT_EXAMPLE)).toBe(true); }); test("isaiMatch: find pattern in bot user agent string", () => { - expect(isaiMatch(AI_USER_AGENT_EXAMPLE)).toBe("https://openai.com/searchbot"); + expect(isaiMatch(AI_USER_AGENT_EXAMPLE)).toBe("openai.com/searchbot"); }); test("isaiMatches: find all patterns in bot user agent string", () => { - expect(isaiMatches(AI_USER_AGENT_EXAMPLE)).toContain("https://openai.com/searchbot"); + expect(isaiMatches(AI_USER_AGENT_EXAMPLE)).toContain("openai.com/searchbot"); expect(isaiMatches(AI_USER_AGENT_EXAMPLE)).toHaveLength(1); }); test("isaiPattern: find first pattern in bot user agent string", () => { expect(isaiPattern(AI_USER_AGENT_EXAMPLE)).toBe( - "https://openai.com/searchbot", + "openai.*bot", ); }); test("isaiPatterns: find all patterns in bot user agent string", () => { expect(isaiPatterns(AI_USER_AGENT_EXAMPLE)).toContain( - "https://openai.com/searchbot", + "openai.*bot", ); expect(isaiPatterns(AI_USER_AGENT_EXAMPLE)).toHaveLength(1); }); @@ -64,19 +53,19 @@ describe("isai", () => { expect(customisai(AI_USER_AGENT_EXAMPLE)).toBe(true); }); test("createisaiFromList: create custom isai function with custom pattern", () => { - const ChromeLighthouseUserAgentStrings: string[] = [ - "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot", - "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot", + const ToRemoveStrings: string[] = [ + "openai.*bot" ]; const patternsToRemove: Set = new Set( - ChromeLighthouseUserAgentStrings.map(isaiMatches).flat(), + ToRemoveStrings.map(isaiMatches).flat(), ); - const isai2 = createisaiFromList( - list.filter( - (record: string): boolean => patternsToRemove.has(record) === false, - ), + expect(patternsToRemove.size).toBeGreaterThan(0); + const list2 = list.filter( + (record: string): boolean => patternsToRemove.has(record) === false, ); - const [ua] = ChromeLighthouseUserAgentStrings; + expect(list2.length).toBeLessThan(list.length); + const isai2 = createisaiFromList(list2); + const ua = "https://openai.com/gptbot" expect(isai(ua)).toBe(true); expect(isai2(ua)).toBe(false); }); @@ -92,27 +81,6 @@ describe("isai", () => { ); }); - describe("isaiNaive", () => { - test.each([75])( - "a large number of user agent strings can be detected (>%s%)", - (percent) => { - const ratio = - crawlers.filter((ua) => isaiNaive(ua)).length / crawlers.length; - expect(ratio).toBeLessThanOrEqual(1); - expect(ratio).toBeGreaterThan(percent / 100); - }, - ); - test.each([1])( - "a small number of browsers is falsly detected as bots (<%s%)", - (percent) => { - const ratio = - browsers.filter((ua) => isaiNaive(ua)).length / browsers.length; - expect(ratio).toBeGreaterThan(0); - expect(ratio).toBeLessThan(percent / 100); - }, - ); - }); - describe("regex fallback", () => { beforeAll(async () => { jest @@ -132,20 +100,6 @@ describe("isai", () => { afterAll(() => { jest.restoreAllMocks(); }); - test("fallback regex detects commong crawlers", () => { - USER_AGENT_COMMON.forEach((ua) => { - if (!isaiInstance(ua)) { - throw new Error(`Failed to detect ${ua} as bot`); - } - }); - }); - test("fallback detects gotchas as bots", () => { - USER_AGENT_GOTCHAS.forEach((ua) => { - if (!isaiInstance(ua)) { - throw new Error(`Failed to detect ${ua} as bot (gotcha)`); - } - }); - }); test("fallback does not detect browser as bot", () => { expect(isaiInstance(BROWSER_USER_AGENT_EXAMPLE)).toBe(false); }); @@ -189,7 +143,7 @@ describe("isai", () => { }); test("regular expressions exports are as expected", () => { expect(new RegExp(fullPattern, "i").toString()).toBe( - getPattern().toString(), + getPattern()?.toString(), ); }); });