Prepend HTTPS to Crawl Start URL #2177

Merged · 7 commits · Nov 24, 2024
124 changes: 69 additions & 55 deletions frontend/src/features/crawl-workflows/workflow-editor.ts
@@ -112,7 +112,6 @@ const DEFAULT_BEHAVIORS = [
"autofetch",
"siteSpecific",
];
const MAX_ADDITIONAL_URLS = 100;

const getDefaultProgressState = (hasConfigId = false): ProgressState => {
let activeTab: StepName = "crawlSetup";
@@ -162,7 +161,8 @@ function getLocalizedWeekDays()
}

function validURL(url: string) {
return /((((https?):(?:\/\/)?)(?:[-;:&=+$,\w]+@)?[A-Za-z0-9.-]+|(?:www\.|[-;:&=+$,\w]+@)[A-Za-z0-9.-]+)((?:\/[+~%/.\w\-_]*)?\??(?:[-+=&;%@.\w_]*)#?(?:[.!/\\\w]*))?)/.test(
// adapted from: https://gist.github.com/dperini/729294
return /^(?:https?:\/\/)?(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?[a-z0-9\u00a1-\uffff]\.)+(?:[a-z\u00a1-\uffff]{2,}\.?))(?::\d{2,5})?(?:[/?#]\S*)?$/i.test(
url,
);
}
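
Note on the new pattern: the replacement regex (adapted from the dperini gist linked above) is anchored, makes the scheme optional, and rejects reserved/private IPv4 ranges and bare hostnames without a public suffix. A quick illustrative sketch of the expected behavior, assuming `validURL` were exported for testing (in the diff it is module-private):

```ts
// Illustrative only (not part of the PR).
import { validURL } from "./workflow-editor"; // hypothetical export

const cases: [string, boolean][] = [
  ["https://example.com", true],
  ["example.com/path?q=1", true], // scheme is now optional
  ["http://192.168.1.1", false], // private/reserved IPv4 ranges are rejected
  ["localhost:8080", false], // bare hostnames without a public suffix are rejected
  ["not a url", false],
];

for (const [url, expected] of cases) {
  console.assert(validURL(url) === expected, `unexpected result for: ${url}`);
}
```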
@@ -173,7 +173,8 @@ const urlListToArray = flow(
trimArray,
);

const URL_LIST_MAX_URLS = 1000;
//todo: make this customizable, perhaps at deploy time
const URL_LIST_MAX_URLS = 100;

type CrawlConfigResponse = {
run_now_job?: boolean;
@@ -813,6 +814,17 @@ export class WorkflowEditor extends BtrixElement {
const text = msg("Please enter a valid URL.");
inputEl.helpText = text;
inputEl.setCustomValidity(text);
} else if (
inputEl.value &&
!inputEl.value.startsWith("https://") &&
!inputEl.value.startsWith("http://")
) {
this.updateFormState(
{
urlList: "https://" + inputEl.value,
},
true,
);
}
}}
>
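
The new branch above only runs when the basic validity check has not already flagged the value, so a scheme-less but otherwise valid URL is silently upgraded to `https://`. The same treatment is applied to the primary seed URL field further down. A minimal sketch of the normalization in isolation (simplified names, not the component's actual API):

```ts
// Sketch of the scheme-prepend pattern used in the input handlers above.
function normalizeScheme(value: string): string {
  if (value && !value.startsWith("https://") && !value.startsWith("http://")) {
    return "https://" + value;
  }
  return value;
}

// e.g. normalizeScheme("webrecorder.net") === "https://webrecorder.net"
// while already-qualified values pass through unchanged:
// normalizeScheme("http://example.com") === "http://example.com"
```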
@@ -834,19 +846,8 @@
https://archiveweb.page/guide`}
required
@keyup=${async (e: KeyboardEvent) => {
if (e.key === "Enter") {
const inputEl = e.target as SlInput;
await inputEl.updateComplete;
if (!inputEl.value) return;
const { isValid, helpText } = this.validateUrlList(
inputEl.value,
MAX_ADDITIONAL_URLS,
);
inputEl.helpText = helpText;
if (isValid) {
inputEl.setCustomValidity("");
} else {
inputEl.setCustomValidity(helpText);
}
await (e.target as SlInput).updateComplete;
this.doValidateTextArea(e.target);
}
}}
@sl-input=${(e: CustomEvent) => {
@@ -856,24 +857,16 @@
}
}}
@sl-change=${async (e: CustomEvent) => {
const inputEl = e.target as SlInput;
if (!inputEl.value) return;
const { isValid, helpText } = this.validateUrlList(
inputEl.value,
MAX_ADDITIONAL_URLS,
);
inputEl.helpText = helpText;
if (isValid) {
inputEl.setCustomValidity("");
} else {
inputEl.setCustomValidity(helpText);
}
this.doValidateTextArea(e.target);
}}
@sl-blur=${async (e: CustomEvent) => {
this.doValidateTextArea(e.target);
}}
></sl-textarea>
`)}
${this.renderHelpTextCol(
msg(
str`The crawler will visit and record each URL listed here. You can enter up to ${MAX_ADDITIONAL_URLS.toLocaleString()} URLs.`,
str`The crawler will visit and record each URL listed here. You can enter up to ${URL_LIST_MAX_URLS.toLocaleString()} URLs.`,
),
)}
`}
@@ -996,6 +989,17 @@ https://archiveweb.page/guide`}
const text = msg("Please enter a valid URL.");
inputEl.helpText = text;
inputEl.setCustomValidity(text);
} else if (
inputEl.value &&
!inputEl.value.startsWith("https://") &&
!inputEl.value.startsWith("http://")
) {
this.updateFormState(
{
primarySeedUrl: "https://" + inputEl.value,
},
true,
);
}
}}
>
@@ -1098,19 +1102,8 @@
https://example.net`}
https://archiveweb.page/images/${"logo.svg"}`}
@keyup=${async (e: KeyboardEvent) => {
if (e.key === "Enter") {
const inputEl = e.target as SlInput;
await inputEl.updateComplete;
if (!inputEl.value) return;
const { isValid, helpText } = this.validateUrlList(
inputEl.value,
MAX_ADDITIONAL_URLS,
);
inputEl.helpText = helpText;
if (isValid) {
inputEl.setCustomValidity("");
} else {
inputEl.setCustomValidity(helpText);
}
await (e.target as SlInput).updateComplete;
this.doValidateTextArea(e.target);
}
}}
@sl-input=${(e: CustomEvent) => {
@@ -1120,24 +1113,16 @@
}
}}
@sl-change=${async (e: CustomEvent) => {
const inputEl = e.target as SlInput;
if (!inputEl.value) return;
const { isValid, helpText } = this.validateUrlList(
inputEl.value,
MAX_ADDITIONAL_URLS,
);
inputEl.helpText = helpText;
if (isValid) {
inputEl.setCustomValidity("");
} else {
inputEl.setCustomValidity(helpText);
}
this.doValidateTextArea(e.target);
}}
@sl-blur=${async (e: CustomEvent) => {
this.doValidateTextArea(e.target);
}}
></sl-textarea>
`)}
${this.renderHelpTextCol(
msg(
str`The crawler will visit and record each URL listed here. You can enter up to ${MAX_ADDITIONAL_URLS.toLocaleString()} URLs.`,
str`The crawler will visit and record each URL listed here. You can enter up to ${URL_LIST_MAX_URLS.toLocaleString()} URLs.`,
),
)}
</div>
@@ -1146,6 +1131,21 @@ https://archiveweb.page/images/${"logo.svg"}`}
`;
};

private doValidateTextArea(target: EventTarget | null) {
const inputEl = target as SlInput;
if (!inputEl.value) return;
const { isValid, helpText } = this.validateUrlList(
inputEl.value,
URL_LIST_MAX_URLS,
);
inputEl.helpText = helpText;
if (isValid) {
inputEl.setCustomValidity("");
} else {
inputEl.setCustomValidity(helpText);
}
}
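
`doValidateTextArea` consolidates the validation previously duplicated across the `keyup`, `sl-change`, and `sl-blur` handlers. It follows the standard constraint-validation pattern; a standalone sketch against a plain `<textarea>`, without the Shoelace/Lit wiring (assumed simplification):

```ts
// Illustrative only: the same custom-validity pattern on a plain textarea.
function applyValidation(
  el: HTMLTextAreaElement,
  isValid: boolean,
  helpText: string,
) {
  if (isValid) {
    el.setCustomValidity(""); // clears any previous error so the form can submit
  } else {
    el.setCustomValidity(helpText); // marks the field invalid with a message
  }
  el.reportValidity();
}
```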

private renderCrawlLimits() {
// Max Pages minimum value cannot be lower than seed count
const minPages = Math.max(
@@ -2075,6 +2075,20 @@
str`Please remove or fix the following invalid URL: ${invalidUrl}`,
);
}
if (isValid) {
// auto-add https:// prefix if otherwise a valid URL
let updated = false;
for (let i = 0; i < urlList.length; i++) {
const url = urlList[i];
if (!url.startsWith("http://") && !url.startsWith("https://")) {
urlList[i] = "https://" + url;
updated = true;
}
}
if (updated) {
this.updateFormState({ urlList: urlList.join("\n") });
}
}
}
return { isValid, helpText };
}
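
In the list variant, the prefix is added only after `validateUrlList` has confirmed every entry is otherwise valid, and the form state is rewritten once if anything changed. The same step in isolation (illustrative helper, not the actual method, which writes back via `this.updateFormState`):

```ts
// Sketch of the list normalization performed inside validateUrlList.
function addHttpsPrefix(urls: string[]): { urls: string[]; updated: boolean } {
  let updated = false;
  const normalized = urls.map((url) => {
    if (!url.startsWith("http://") && !url.startsWith("https://")) {
      updated = true;
      return "https://" + url;
    }
    return url;
  });
  return { urls: normalized, updated };
}

// addHttpsPrefix(["example.com", "https://webrecorder.net"]).urls
//   -> ["https://example.com", "https://webrecorder.net"]
```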
2 changes: 1 addition & 1 deletion frontend/xliff/es.xlf
@@ -3688,7 +3688,7 @@
<source>The URL of the page to crawl.</source>
</trans-unit>
<trans-unit id="s41d2278219615589">
<source>The crawler will visit and record each URL listed here. You can enter up to <x equiv-text="${MAX_ADDITIONAL_URLS.toLocaleString()}" id="0"/> URLs.</source>
<source>The crawler will visit and record each URL listed here. You can enter up to <x equiv-text="${URL_LIST_MAX_URLS.toLocaleString()}" id="0"/> URLs.</source>
</trans-unit>
<trans-unit id="sfc5e402f8b21ef5f">
<source>If checked, the crawler will visit pages one link away.</source>