From 4949f6cc20df7910edac99499f33e1a27d37d18c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 22 Nov 2024 23:22:19 -0800 Subject: [PATCH 1/7] ui: prepend https:// to URLs that are missing it but considered valid (except for URL list) part of fix for #2167 --- .../crawl-workflows/workflow-editor.ts | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts index 114f20e27..9f0907be4 100644 --- a/frontend/src/features/crawl-workflows/workflow-editor.ts +++ b/frontend/src/features/crawl-workflows/workflow-editor.ts @@ -813,6 +813,17 @@ export class WorkflowEditor extends BtrixElement { const text = msg("Please enter a valid URL."); inputEl.helpText = text; inputEl.setCustomValidity(text); + } else if ( + inputEl.value && + !inputEl.value.startsWith("https://") && + !inputEl.value.startsWith("http://") + ) { + this.updateFormState( + { + urlList: "https://" + inputEl.value, + }, + true, + ); } }} > @@ -996,6 +1007,18 @@ https://archiveweb.page/guide`} const text = msg("Please enter a valid URL."); inputEl.helpText = text; inputEl.setCustomValidity(text); + inputEl.setCustomValidity(text); + } else if ( + inputEl.value && + !inputEl.value.startsWith("https://") && + !inputEl.value.startsWith("http://") + ) { + this.updateFormState( + { + primarySeedUrl: "https://" + inputEl.value, + }, + true, + ); } }} > From d48395a8e52b4ea8a8144eb37309e25096325d43 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 22 Nov 2024 23:37:07 -0800 Subject: [PATCH 2/7] better regex for URL validation --- frontend/src/features/crawl-workflows/workflow-editor.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts index 9f0907be4..2a55e5791 100644 --- a/frontend/src/features/crawl-workflows/workflow-editor.ts +++ b/frontend/src/features/crawl-workflows/workflow-editor.ts @@ -162,7 +162,8 @@ function getLocalizedWeekDays() { } function validURL(url: string) { - return /((((https?):(?:\/\/)?)(?:[-;:&=+$,\w]+@)?[A-Za-z0-9.-]+|(?:www\.|[-;:&=+$,\w]+@)[A-Za-z0-9.-]+)((?:\/[+~%/.\w\-_]*)?\??(?:[-+=&;%@.\w_]*)#?(?:[.!/\\\w]*))?)/.test( + // adapted from: https://gist.github.com/dperini/729294 + return /^(?:https?:\/\/)?(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?[a-z0-9\u00a1-\uffff]\.)+(?:[a-z\u00a1-\uffff]{2,}\.?))(?::\d{2,5})?(?:[/?#]\S*)?$/i.test( url, ); } From 55eb2fb8663c9dce61f6f0449f4086fa7322f6c6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 23 Nov 2024 09:52:23 -0800 Subject: [PATCH 3/7] Update frontend/src/features/crawl-workflows/workflow-editor.ts --- frontend/src/features/crawl-workflows/workflow-editor.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts index 2a55e5791..4b480908e 100644 --- a/frontend/src/features/crawl-workflows/workflow-editor.ts +++ b/frontend/src/features/crawl-workflows/workflow-editor.ts @@ -1008,7 +1008,6 @@ https://archiveweb.page/guide`} const text = msg("Please enter a valid URL."); inputEl.helpText = text; inputEl.setCustomValidity(text); - inputEl.setCustomValidity(text); } else if ( inputEl.value && !inputEl.value.startsWith("https://") && From b2cd5522c07b07392c36492714ad4ba5369ed331 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 23 Nov 2024 10:25:17 -0800 Subject: [PATCH 4/7] add auto-prepend https to url list as well add validation onblur for url list --- .../crawl-workflows/workflow-editor.ts | 94 ++++++++----------- 1 file changed, 41 insertions(+), 53 deletions(-) diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts index 4b480908e..7ff20944b 100644 --- a/frontend/src/features/crawl-workflows/workflow-editor.ts +++ b/frontend/src/features/crawl-workflows/workflow-editor.ts @@ -112,7 +112,6 @@ const DEFAULT_BEHAVIORS = [ "autofetch", "siteSpecific", ]; -const MAX_ADDITIONAL_URLS = 100; const getDefaultProgressState = (hasConfigId = false): ProgressState => { let activeTab: StepName = "crawlSetup"; @@ -846,19 +845,7 @@ https://archiveweb.page/guide`} required @keyup=${async (e: KeyboardEvent) => { if (e.key === "Enter") { - const inputEl = e.target as SlInput; - await inputEl.updateComplete; - if (!inputEl.value) return; - const { isValid, helpText } = this.validateUrlList( - inputEl.value, - MAX_ADDITIONAL_URLS, - ); - inputEl.helpText = helpText; - if (isValid) { - inputEl.setCustomValidity(""); - } else { - inputEl.setCustomValidity(helpText); - } + this.doValidateTextArea(e.target); } }} @sl-input=${(e: CustomEvent) => { @@ -868,24 +855,16 @@ https://archiveweb.page/guide`} } }} @sl-change=${async (e: CustomEvent) => { - const inputEl = e.target as SlInput; - if (!inputEl.value) return; - const { isValid, helpText } = this.validateUrlList( - inputEl.value, - MAX_ADDITIONAL_URLS, - ); - inputEl.helpText = helpText; - if (isValid) { - inputEl.setCustomValidity(""); - } else { - inputEl.setCustomValidity(helpText); - } + this.doValidateTextArea(e.target); + }} + @sl-blur=${async (e: CustomEvent) => { + this.doValidateTextArea(e.target); }} > `)} ${this.renderHelpTextCol( msg( - str`The crawler will visit and record each URL listed here. You can enter up to ${MAX_ADDITIONAL_URLS.toLocaleString()} URLs.`, + str`The crawler will visit and record each URL listed here. You can enter up to ${URL_LIST_MAX_URLS.toLocaleString()} URLs.`, ), )} `} @@ -1121,19 +1100,7 @@ https://example.net`} https://archiveweb.page/images/${"logo.svg"}`} @keyup=${async (e: KeyboardEvent) => { if (e.key === "Enter") { - const inputEl = e.target as SlInput; - await inputEl.updateComplete; - if (!inputEl.value) return; - const { isValid, helpText } = this.validateUrlList( - inputEl.value, - MAX_ADDITIONAL_URLS, - ); - inputEl.helpText = helpText; - if (isValid) { - inputEl.setCustomValidity(""); - } else { - inputEl.setCustomValidity(helpText); - } + this.doValidateTextArea(e.target); } }} @sl-input=${(e: CustomEvent) => { @@ -1143,24 +1110,16 @@ https://archiveweb.page/images/${"logo.svg"}`} } }} @sl-change=${async (e: CustomEvent) => { - const inputEl = e.target as SlInput; - if (!inputEl.value) return; - const { isValid, helpText } = this.validateUrlList( - inputEl.value, - MAX_ADDITIONAL_URLS, - ); - inputEl.helpText = helpText; - if (isValid) { - inputEl.setCustomValidity(""); - } else { - inputEl.setCustomValidity(helpText); - } + this.doValidateTextArea(e.target); + }} + @sl-blur=${async (e: CustomEvent) => { + this.doValidateTextArea(e.target); }} > `)} ${this.renderHelpTextCol( msg( - str`The crawler will visit and record each URL listed here. You can enter up to ${MAX_ADDITIONAL_URLS.toLocaleString()} URLs.`, + str`The crawler will visit and record each URL listed here. You can enter up to ${URL_LIST_MAX_URLS.toLocaleString()} URLs.`, ), )} @@ -1169,6 +1128,21 @@ https://archiveweb.page/images/${"logo.svg"}`} `; }; + private doValidateTextArea(target: EventTarget | null) { + const inputEl = target as SlInput; + if (!inputEl.value) return; + const { isValid, helpText } = this.validateUrlList( + inputEl.value, + URL_LIST_MAX_URLS, + ); + inputEl.helpText = helpText; + if (isValid) { + inputEl.setCustomValidity(""); + } else { + inputEl.setCustomValidity(helpText); + } + } + private renderCrawlLimits() { // Max Pages minimum value cannot be lower than seed count const minPages = Math.max( @@ -2098,6 +2072,20 @@ https://archiveweb.page/images/${"logo.svg"}`} str`Please remove or fix the following invalid URL: ${invalidUrl}`, ); } + if (isValid) { + // auto-add https:// prefix if otherwise a valid URL + let updated = false; + for (let i = 0; i < urlList.length; i++) { + const url = urlList[i]; + if (!url.startsWith("http://") && !url.startsWith("https://")) { + urlList[i] = "https://" + url; + updated = true; + } + } + if (updated) { + this.updateFormState({ urlList: urlList.join("\n") }); + } + } } return { isValid, helpText }; } From c6d0ad11ba6722f747fb08e82a1e45a0e0232fd5 Mon Sep 17 00:00:00 2001 From: ikreymer Date: Sat, 23 Nov 2024 18:27:07 +0000 Subject: [PATCH 5/7] Apply `localize:extract` changes --- frontend/xliff/es.xlf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/xliff/es.xlf b/frontend/xliff/es.xlf index d11806c40..69f38a804 100644 --- a/frontend/xliff/es.xlf +++ b/frontend/xliff/es.xlf @@ -3688,7 +3688,7 @@ The URL of the page to crawl. - The crawler will visit and record each URL listed here. You can enter up to URLs. + The crawler will visit and record each URL listed here. You can enter up to URLs. If checked, the crawler will visit pages one link away. From ed30812e5e92cca31028f159f5e71de7674bfd2f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sat, 23 Nov 2024 10:31:24 -0800 Subject: [PATCH 6/7] set URL_LIST_MAX_URLS to 100, removed redundant MAX_ADDITIONAL_URLS, just using single limit todo: make customizable in the future --- frontend/src/features/crawl-workflows/workflow-editor.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts index 7ff20944b..d4c21e56e 100644 --- a/frontend/src/features/crawl-workflows/workflow-editor.ts +++ b/frontend/src/features/crawl-workflows/workflow-editor.ts @@ -173,7 +173,8 @@ const urlListToArray = flow( trimArray, ); -const URL_LIST_MAX_URLS = 1000; +//todo: make this customizable, perhaps at deploy time +const URL_LIST_MAX_URLS = 100; type CrawlConfigResponse = { run_now_job?: boolean; From e0394ab44fa61ea5012fa5516ffd634da6a55136 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 24 Nov 2024 11:31:54 -0800 Subject: [PATCH 7/7] readd await updateComplete for consistency --- frontend/src/features/crawl-workflows/workflow-editor.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts index d4c21e56e..c564af81e 100644 --- a/frontend/src/features/crawl-workflows/workflow-editor.ts +++ b/frontend/src/features/crawl-workflows/workflow-editor.ts @@ -846,6 +846,7 @@ https://archiveweb.page/guide`} required @keyup=${async (e: KeyboardEvent) => { if (e.key === "Enter") { + await (e.target as SlInput).updateComplete; this.doValidateTextArea(e.target); } }} @@ -1101,6 +1102,7 @@ https://example.net`} https://archiveweb.page/images/${"logo.svg"}`} @keyup=${async (e: KeyboardEvent) => { if (e.key === "Enter") { + await (e.target as SlInput).updateComplete; this.doValidateTextArea(e.target); } }}