Skip to content

Commit

Permalink
add openai
Browse files (browse the repository at this point in the history)
  • Loading branch information
eob committed Oct 1, 2024
1 parent 65650af commit 54cd315
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 1,107 deletions.
926 changes: 7 additions & 919 deletions fixtures/crawlers.yml

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion scripts/build/pattern.js
Original file line number | Diff line number | Diff line change
Expand Up @@ -11,7 +11,7 @@ const patterns = JSON.parse(

const pattern = new RegExp(
patterns
.map((pattern) => pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
.map((pattern) => pattern.pattern.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"))
.join("|"),
).source;

Expand Down
2 changes: 1 addition & 1 deletion src/index.ts
Original file line number | Diff line number | Diff line change
Expand Up @@ -23,7 +23,7 @@ export function getPattern(): RegExp {
/**
* A list of bot identifiers to be used in a regular expression against user agent strings.
*/
export const list: string[] = patternsList;
export const list: string[] = patternsList.map((pattern) => pattern.pattern);

/**
* Check if the given user agent includes a bot pattern. Naive implementation (less accurate).
Expand Down
210 changes: 37 additions & 173 deletions src/patterns.json
Original file line number | Diff line number | Diff line change
@@ -1,175 +1,39 @@
[
" daum[ /]",
" deusu/",
" yadirectfetcher",
"(?:^|[^g])news(?!sapphire)",
"(?<! (?:channel/|google/))google(?!(app|/google| pixel))",
"(?<! cu)bots?(?:\\b|_)",
"(?<!(?:lib))http",
"(?<![hg]m)score",
"@[a-z][\\w-]+\\.",
"\\(\\)",
"\\.com\\b",
"\\btime/",
"^<",
"^[\\w \\.\\-\\(?:\\):]+(?:/v?\\d+(?:\\.\\d+)?(?:\\.\\d{1,10})*?)?(?:,|$)",
"^[^ ]{50,}$",
"^\\d+\\b",
"^\\w*search\\b",
"^\\w+/[\\w\\(\\)]*$",
"^active",
"^ad muncher",
"^amaya",
"^avsdevicesdk/",
"^biglotron",
"^bot",
"^bw/",
"^clamav[ /]",
"^client/",
"^cobweb/",
"^custom",
"^ddg[_-]android",
"^discourse",
"^dispatch/\\d",
"^downcast/",
"^duckduckgo",
"^facebook",
"^getright/",
"^gozilla/",
"^hobbit",
"^hotzonu",
"^hwcdn/",
"^jeode/",
"^jetty/",
"^jigsaw",
"^microsoft bits",
"^movabletype",
"^mozilla/5\\.0\\s[a-z\\.-]+$",
"^mozilla/\\d\\.\\d \\(compatible;?\\)$",
"^mozilla/\\d\\.\\d \\w*$",
"^navermailapp",
"^netsurf",
"^offline",
"^owler",
"^php",
"^postman",
"^python",
"^rank",
"^read",
"^reed",
"^rest",
"^rss",
"^snapchat",
"^space bison",
"^svn",
"^swcd ",
"^taringa",
"^thumbor/",
"^track",
"^valid",
"^w3c",
"^webbandit/",
"^webcopier",
"^wget",
"^whatsapp",
"^wordpress",
"^xenu link sleuth",
"^yahoo",
"^yandex",
"^zdm/\\d",
"^zoom marketplace/",
"^{{.*}}$",
"adscanner/",
"analyzer",
"archive",
"ask jeeves/teoma",
"bit\\.ly/",
"bluecoat drtr",
"browsex",
"burpcollaborator",
"capture",
"catch",
"check\\b",
"checker",
"chrome-lighthouse",
"chromeframe",
"classifier",
"cloudflare",
"convertify",
"crawl",
"cypress/",
"dareboost",
"datanyze",
"dejaclick",
"detect",
"dmbrowser",
"download",
"evc-batch/",
"exaleadcloudview",
"feed",
"firephp",
"functionize",
"gomezagent",
"headless",
"httrack",
"hubspot marketing grader",
"hydra",
"ibisbrowser",
"images",
"infrawatch",
"insight",
"inspect",
"iplabel",
"ips-agent",
"java(?!;)",
"jsjcw_scanner",
"library",
"linkcheck",
"mail\\.ru/",
"manager",
"measure",
"neustar wpm",
"node",
"nutch",
"offbyone",
"optimize",
"pageburst",
"pagespeed",
"parser",
"perl",
"phantomjs",
"pingdom",
"powermarks",
"preview",
"proxy",
"ptst[ /]\\d",
"reputation",
"resolver",
"retriever",
"rexx;",
"rigor",
"rss\\b",
"scanner\\.",
"scrape",
"server",
"sogou",
"sparkler/",
"speedcurve",
"spider",
"splash",
"statuscake",
"supercleaner",
"synapse",
"synthetic",
"tools",
"torrent",
"trace",
"transcoder",
"url",
"virtuoso",
"wappalyzer",
"webglance",
"webkit2png",
"whatcms/",
"zgrab"
{
"pattern": "GoogleOther",
"confidence": 0.5,
"reasonsFor": [
"Google Notebook LLM (https://notebooklm.google/) uses this user agent string."
],
"reasonsAgainst": [
"GoogleOther is known to be a general-purpose agent string for internal Google projects."
]
},
{
"pattern": "https://openai.com/gptbot",
"confidence": 1.0,
"reasonsFor": [
"Published user agent substring by OpenAI"
],
"reasonsAgainst": [
]
},
{
"pattern": "https://openai.com/searchbot",
"confidence": 1.0,
"reasonsFor": [
"Published user agent substring by OpenAI"
],
"reasonsAgainst": [
]
},
{
"pattern": "https://openai.com/bot",
"confidence": 1.0,
"reasonsFor": [
"Published user agent substring by OpenAI"
],
"reasonsAgainst": [
]
}
]
2 changes: 1 addition & 1 deletion tests/efficiency/test.ts
Original file line number | Diff line number | Diff line change
Expand Up @@ -10,7 +10,7 @@ const TIMEOUT = 60000;
const { update, end } = stdline;
Object.freeze(list);

const clone = (): string[] => list.slice();
const clone = (): string[] => list.map((p) => p.pattern).slice();

describe("efficiency", () => {
describe(`Redundant rules: no rule can be removed. Check each one against ${crawlers.length} user agent strings`, () => {
Expand Down
24 changes: 12 additions & 12 deletions tests/spec/test.ts
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,8 +14,8 @@ import { fullPattern } from "../../src/pattern";
import { crawlers, browsers } from "../../fixtures";
let isaiInstance: any;

const BOT_USER_AGENT_EXAMPLE =
"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
const AI_USER_AGENT_EXAMPLE =
"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; OAI-SearchBot/1.0; +https://openai.com/searchbot";
const BROWSER_USER_AGENT_EXAMPLE =
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91 Safari/537.36";

Expand All @@ -38,30 +38,30 @@ describe("isai", () => {
expect(list).toBeInstanceOf(Array);
expect(list.every((item) => typeof item === "string")).toBe(true);
});
test("isai: bot user agect string is recognised as bot", () => {
expect(isai(BOT_USER_AGENT_EXAMPLE)).toBe(true);
test("isai: bot user agent string is recognised as an AI", () => {
expect(isai(AI_USER_AGENT_EXAMPLE)).toBe(true);
});
test("isaiMatch: find pattern in bot user agent string", () => {
expect(isaiMatch(BOT_USER_AGENT_EXAMPLE)).toBe("Google");
expect(isaiMatch(AI_USER_AGENT_EXAMPLE)).toBe("Google");
});
test("isaiMatches: find all patterns in bot user agent string", () => {
expect(isaiMatches(BOT_USER_AGENT_EXAMPLE)).toContain("Google");
expect(isaiMatches(BOT_USER_AGENT_EXAMPLE)).toHaveLength(4);
expect(isaiMatches(AI_USER_AGENT_EXAMPLE)).toContain("Google");
expect(isaiMatches(AI_USER_AGENT_EXAMPLE)).toHaveLength(4);
});
test("isaiPattern: find first pattern in bot user agent string", () => {
expect(isaiPattern(BOT_USER_AGENT_EXAMPLE)).toBe(
expect(isaiPattern(AI_USER_AGENT_EXAMPLE)).toBe(
"(?<! (?:channel/|google/))google(?!(app|/google| pixel))",
);
});
test("isaiPatterns: find all patterns in bot user agent string", () => {
expect(isaiPatterns(BOT_USER_AGENT_EXAMPLE)).toContain(
expect(isaiPatterns(AI_USER_AGENT_EXAMPLE)).toContain(
"(?<! (?:channel/|google/))google(?!(app|/google| pixel))",
);
expect(isaiPatterns(BOT_USER_AGENT_EXAMPLE)).toHaveLength(4);
expect(isaiPatterns(AI_USER_AGENT_EXAMPLE)).toHaveLength(4);
});
test("createisai: create custom isai function with custom pattern", () => {
const customisai = createisai(/bot/i);
expect(customisai(BOT_USER_AGENT_EXAMPLE)).toBe(true);
expect(customisai(AI_USER_AGENT_EXAMPLE)).toBe(true);
});
test("createisaiFromList: create custom isai function with custom pattern", () => {
const ChromeLighthouseUserAgentStrings: string[] = [
Expand Down Expand Up @@ -165,7 +165,7 @@ describe("isai", () => {
expect(misidentifiedStrings).toEqual([]);
expect(successCount).toBe(crawlers.length);
});
test(`✘ ${browsers.length} user agent string should not be recognised as crawler`, () => {
test(`✘ ${browsers.length} user agent string should not be recognised as an AI`, () => {
let successCount = 0;
let misidentifiedStrings: string[] = [];
browsers.forEach((browser) => {
Expand Down

0 comments on commit 54cd315

Please sign in to comment.