add openai

eob · Oct 1, 2024 · 54cd315 · 54cd315
1 parent 65650af
commit 54cd315
Show file tree

Hide file tree

Showing 6 changed files with 59 additions and 1,107 deletions.
diff --git a/fixtures/crawlers.yml b/fixtures/crawlers.yml
diff --git a/scripts/build/pattern.js b/scripts/build/pattern.js
@@ -11,7 +11,7 @@ const patterns = JSON.parse(
 
 const pattern = new RegExp(
 	patterns
-		.map((pattern) => pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
+		.map((pattern) => pattern.pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
 		.join("|"),
 ).source;
 

diff --git a/src/index.ts b/src/index.ts
@@ -23,7 +23,7 @@ export function getPattern(): RegExp {
 /**
  * A list of bot identifiers to be used in a regular expression against user agent strings.
  */
-export const list: string[] = patternsList;
+export const list: string[] = patternsList.map((pattern) => pattern.pattern);
 
 /**
  * Check if the given user agent includes a bot pattern. Naive implementation (less accurate).

diff --git a/src/patterns.json b/src/patterns.json
@@ -1,175 +1,39 @@
 [
-  " daum[ /]",
-  " deusu/",
-  " yadirectfetcher",
-  "(?:^|[^g])news(?!sapphire)",
-  "(?<! (?:channel/|google/))google(?!(app|/google| pixel))",
-  "(?<! cu)bots?(?:\\b|_)",
-  "(?<!(?:lib))http",
-  "(?<![hg]m)score",
-  "@[a-z][\\w-]+\\.",
-  "\\(\\)",
-  "\\.com\\b",
-  "\\btime/",
-  "^<",
-  "^[\\w \\.\\-\\(?:\\):]+(?:/v?\\d+(?:\\.\\d+)?(?:\\.\\d{1,10})*?)?(?:,|$)",
-  "^[^ ]{50,}$",
-  "^\\d+\\b",
-  "^\\w*search\\b",
-  "^\\w+/[\\w\\(\\)]*$",
-  "^active",
-  "^ad muncher",
-  "^amaya",
-  "^avsdevicesdk/",
-  "^biglotron",
-  "^bot",
-  "^bw/",
-  "^clamav[ /]",
-  "^client/",
-  "^cobweb/",
-  "^custom",
-  "^ddg[_-]android",
-  "^discourse",
-  "^dispatch/\\d",
-  "^downcast/",
-  "^duckduckgo",
-  "^facebook",
-  "^getright/",
-  "^gozilla/",
-  "^hobbit",
-  "^hotzonu",
-  "^hwcdn/",
-  "^jeode/",
-  "^jetty/",
-  "^jigsaw",
-  "^microsoft bits",
-  "^movabletype",
-  "^mozilla/5\\.0\\s[a-z\\.-]+$",
-  "^mozilla/\\d\\.\\d \\(compatible;?\\)$",
-  "^mozilla/\\d\\.\\d \\w*$",
-  "^navermailapp",
-  "^netsurf",
-  "^offline",
-  "^owler",
-  "^php",
-  "^postman",
-  "^python",
-  "^rank",
-  "^read",
-  "^reed",
-  "^rest",
-  "^rss",
-  "^snapchat",
-  "^space bison",
-  "^svn",
-  "^swcd ",
-  "^taringa",
-  "^thumbor/",
-  "^track",
-  "^valid",
-  "^w3c",
-  "^webbandit/",
-  "^webcopier",
-  "^wget",
-  "^whatsapp",
-  "^wordpress",
-  "^xenu link sleuth",
-  "^yahoo",
-  "^yandex",
-  "^zdm/\\d",
-  "^zoom marketplace/",
-  "^{{.*}}$",
-  "adscanner/",
-  "analyzer",
-  "archive",
-  "ask jeeves/teoma",
-  "bit\\.ly/",
-  "bluecoat drtr",
-  "browsex",
-  "burpcollaborator",
-  "capture",
-  "catch",
-  "check\\b",
-  "checker",
-  "chrome-lighthouse",
-  "chromeframe",
-  "classifier",
-  "cloudflare",
-  "convertify",
-  "crawl",
-  "cypress/",
-  "dareboost",
-  "datanyze",
-  "dejaclick",
-  "detect",
-  "dmbrowser",
-  "download",
-  "evc-batch/",
-  "exaleadcloudview",
-  "feed",
-  "firephp",
-  "functionize",
-  "gomezagent",
-  "headless",
-  "httrack",
-  "hubspot marketing grader",
-  "hydra",
-  "ibisbrowser",
-  "images",
-  "infrawatch",
-  "insight",
-  "inspect",
-  "iplabel",
-  "ips-agent",
-  "java(?!;)",
-  "jsjcw_scanner",
-  "library",
-  "linkcheck",
-  "mail\\.ru/",
-  "manager",
-  "measure",
-  "neustar wpm",
-  "node",
-  "nutch",
-  "offbyone",
-  "optimize",
-  "pageburst",
-  "pagespeed",
-  "parser",
-  "perl",
-  "phantomjs",
-  "pingdom",
-  "powermarks",
-  "preview",
-  "proxy",
-  "ptst[ /]\\d",
-  "reputation",
-  "resolver",
-  "retriever",
-  "rexx;",
-  "rigor",
-  "rss\\b",
-  "scanner\\.",
-  "scrape",
-  "server",
-  "sogou",
-  "sparkler/",
-  "speedcurve",
-  "spider",
-  "splash",
-  "statuscake",
-  "supercleaner",
-  "synapse",
-  "synthetic",
-  "tools",
-  "torrent",
-  "trace",
-  "transcoder",
-  "url",
-  "virtuoso",
-  "wappalyzer",
-  "webglance",
-  "webkit2png",
-  "whatcms/",
-  "zgrab"
+  {
+    "pattern": "GoogleOther", 
+    "confidence": 0.5,
+    "reasonsFor": [
+      "Google NotebookLM (https://notebooklm.google/) uses this user agent string."
+    ],
+    "reasonsAgainst": [
+      "GoogleOther is known to be a general-purpose agent string for internal Google projects."
+    ]
+  },
+  {
+    "pattern": "https://openai.com/gptbot", 
+    "confidence": 1.0,
+    "reasonsFor": [
+      "Published user agent substring by OpenAI"
+    ],
+    "reasonsAgainst": [
+    ]
+  },
+  {
+    "pattern": "https://openai.com/searchbot", 
+    "confidence": 1.0,
+    "reasonsFor": [
+      "Published user agent substring by OpenAI"
+    ],
+    "reasonsAgainst": [
+    ]
+  },
+  {
+    "pattern": "https://openai.com/bot", 
+    "confidence": 1.0,
+    "reasonsFor": [
+      "Published user agent substring by OpenAI"
+    ],
+    "reasonsAgainst": [
+    ]
+  }
 ]
diff --git a/tests/efficiency/test.ts b/tests/efficiency/test.ts
@@ -10,7 +10,7 @@ const TIMEOUT = 60000;
 const { update, end } = stdline;
 Object.freeze(list);
 
-const clone = (): string[] => list.slice();
+const clone = (): string[] => list.map((p) => p.pattern).slice();
 
 describe("efficiency", () => {
 	describe(`Redundant rules: no rule can be removed. Check each one against ${crawlers.length} user agent strings`, () => {

diff --git a/tests/spec/test.ts b/tests/spec/test.ts
@@ -14,8 +14,8 @@ import { fullPattern } from "../../src/pattern";
 import { crawlers, browsers } from "../../fixtures";
 let isaiInstance: any;
 
-const BOT_USER_AGENT_EXAMPLE =
-	"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
+const AI_USER_AGENT_EXAMPLE =
+	"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot";
 const BROWSER_USER_AGENT_EXAMPLE =
 	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91 Safari/537.36";
 
@@ -38,30 +38,30 @@ describe("isai", () => {
 			expect(list).toBeInstanceOf(Array);
 			expect(list.every((item) => typeof item === "string")).toBe(true);
 		});
-		test("isai: bot user agect string is recognised as bot", () => {
-			expect(isai(BOT_USER_AGENT_EXAMPLE)).toBe(true);
+		test("isai: bot user agent string is recognised as an AI", () => {
+			expect(isai(AI_USER_AGENT_EXAMPLE)).toBe(true);
 		});
 		test("isaiMatch: find pattern in bot user agent string", () => {
-			expect(isaiMatch(BOT_USER_AGENT_EXAMPLE)).toBe("Google");
+			expect(isaiMatch(AI_USER_AGENT_EXAMPLE)).toBe("Google");
 		});
 		test("isaiMatches: find all patterns in bot user agent string", () => {
-			expect(isaiMatches(BOT_USER_AGENT_EXAMPLE)).toContain("Google");
-			expect(isaiMatches(BOT_USER_AGENT_EXAMPLE)).toHaveLength(4);
+			expect(isaiMatches(AI_USER_AGENT_EXAMPLE)).toContain("Google");
+			expect(isaiMatches(AI_USER_AGENT_EXAMPLE)).toHaveLength(4);
 		});
 		test("isaiPattern: find first pattern in bot user agent string", () => {
-			expect(isaiPattern(BOT_USER_AGENT_EXAMPLE)).toBe(
+			expect(isaiPattern(AI_USER_AGENT_EXAMPLE)).toBe(
 				"(?<! (?:channel/|google/))google(?!(app|/google| pixel))",
 			);
 		});
 		test("isaiPatterns: find all patterns in bot user agent string", () => {
-			expect(isaiPatterns(BOT_USER_AGENT_EXAMPLE)).toContain(
+			expect(isaiPatterns(AI_USER_AGENT_EXAMPLE)).toContain(
 				"(?<! (?:channel/|google/))google(?!(app|/google| pixel))",
 			);
-			expect(isaiPatterns(BOT_USER_AGENT_EXAMPLE)).toHaveLength(4);
+			expect(isaiPatterns(AI_USER_AGENT_EXAMPLE)).toHaveLength(4);
 		});
 		test("createisai: create custom isai function with custom pattern", () => {
 			const customisai = createisai(/bot/i);
-			expect(customisai(BOT_USER_AGENT_EXAMPLE)).toBe(true);
+			expect(customisai(AI_USER_AGENT_EXAMPLE)).toBe(true);
 		});
 		test("createisaiFromList: create custom isai function with custom pattern", () => {
 			const ChromeLighthouseUserAgentStrings: string[] = [
@@ -165,7 +165,7 @@ describe("isai", () => {
 			expect(misidentifiedStrings).toEqual([]);
 			expect(successCount).toBe(crawlers.length);
 		});
-		test(`✘ ${browsers.length} user agent string should not be recognised as crawler`, () => {
+		test(`✘ ${browsers.length} user agent string should not be recognised as an AI`, () => {
 			let successCount = 0;
 			let misidentifiedStrings: string[] = [];
 			browsers.forEach((browser) => {