Skip to content
This repository has been archived by the owner on Sep 20, 2023. It is now read-only.

Commit

Permalink
Issue27 updated (#35)
Browse files Browse the repository at this point in the history
* fixed ts syntax error on stopNodeSelection helper function

* fixed issue27

* Migrate feature to StepOptions

Co-authored-by: Baptiste Arnaud <[email protected]>
  • Loading branch information
CakeCrusher and baptisteArno authored Mar 17, 2021
1 parent 004ed88 commit 597e7c1
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 44 deletions.
6 changes: 4 additions & 2 deletions public/content.js
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,9 @@ const onClick = (
break;
}
default: {
content = selectedNodes[0].textContent;
content = selectedNodes[0].textContent
.replace(/(\r\n|\n|\r)/gm, "")
.trim();
}
}
extensionIframe.contentWindow.postMessage(
Expand Down Expand Up @@ -482,7 +484,7 @@ const onClick = (
e.preventDefault();
e.stopImmediatePropagation();
tippyOnlyThisButton.destroy();
let content = clicked.textContent;
let content = clicked.textContent.replace(/(\r\n|\n|\r)/gm, "").trim();
switch (type) {
case "a": {
content = clicked.href;
Expand Down
2 changes: 1 addition & 1 deletion public/manifest.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "Tinking - Scrapping Tool",
"version": "0.0.0.3",
"version": "0.0.0.4",
"manifest_version": 2,
"description": "Extract data from any website without code, just clicks",
"icons": {
Expand Down
29 changes: 19 additions & 10 deletions src/StepItem/OptionItem.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import {
import React, { ChangeEvent, useEffect, useState } from "react";
import {
launchNodeSelection,
parseInputPlaceholderFromOption,
stopNodeSelection,
} from "../service/helperFunctions";
import {
Expand Down Expand Up @@ -44,6 +45,9 @@ export const OptionItem = ({
false
);
const [regexValid, setRegexValid] = useState<boolean | undefined>();
const [inputPlaceholder, setInputPlaceholder] = useState(
parseInputPlaceholderFromOption(option?.type)
);

// eslint-disable-next-line @typescript-eslint/no-explicit-any
const handleIncomingMessageFromPage = (event: any) => {
Expand All @@ -67,20 +71,26 @@ export const OptionItem = ({
};
});

const handleOptionTypeChange = (option: OptionType) => {
const handleOptionTypeChange = (optionType: OptionType) => {
let newOption: SimpleOption | OptionWithValue;
if (option === OptionType.PAGINATION || option === OptionType.REGEX) {
const optionNeedValue =
optionType === OptionType.PAGINATION ||
optionType === OptionType.REGEX ||
optionType === OptionType.CUSTOM_AMOUNT_TO_EXTRACT;

if (optionNeedValue) {
newOption = {
type: option,
type: optionType,
value: "",
};
setInputPlaceholder(parseInputPlaceholderFromOption(optionType));
} else {
newOption = {
type: option,
type: optionType,
};
}
onOptionChange(newOption);
if (option === OptionType.PAGINATION) {
if (optionType === OptionType.PAGINATION) {
setIsSelectingPaginateElement(true);
launchNodeSelection(stepIndex, "pagination", { optionIndex });
}
Expand Down Expand Up @@ -126,11 +136,7 @@ export const OptionItem = ({
<InputGroup size="sm">
<Input
ml={1}
placeholder={
option.type === OptionType.REGEX
? "Regex with group to match"
: "Selector"
}
placeholder={inputPlaceholder}
value={option.value}
onChange={(e) => handleOptionValueChange(e, option)}
/>
Expand Down Expand Up @@ -172,6 +178,9 @@ const SelectOption = ({
onChange={(e) => onOptionChange(e.target.value as OptionType)}
>
<option>Select an option</option>
<option value={OptionType.CUSTOM_AMOUNT_TO_EXTRACT}>
{OptionType.CUSTOM_AMOUNT_TO_EXTRACT}
</option>
<option value={OptionType.INFINITE_SCROLL}>
{OptionType.INFINITE_SCROLL}
</option>
Expand Down
98 changes: 69 additions & 29 deletions src/lib/scriptGenerator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,9 @@ const parseSingleCommandFromStep = (
const regexOption = step.options.find(
(option) => option?.type === OptionType.REGEX
) as OptionWithValue;

const amountToExtract = getAmountToExtract(step);

switch (step.action) {
case StepAction.NAVIGATE: {
command += `
Expand All @@ -191,16 +194,20 @@ const parseSingleCommandFromStep = (
break;
}
case StepAction.EXTRACT_TEXT: {
if (step.totalSelected && step.totalSelected > 1) {
if (
step.totalSelected &&
step.totalSelected > 1 &&
amountToExtract !== "1"
) {
command += `
const ${variableName} = await page.evaluate(() => {
const elements = document.querySelectorAll("${step.selector}")
return [...elements].map(element => element.textContent || null);
return [...elements].map(element => element.textContent.replace(/(\r\n|\n|\r)/gm, "").trim() || null).slice(0,${amountToExtract});
});`;
} else {
command += `const ${variableName} = await page.evaluate(() => {
const element = document.querySelector("${step.selector}")
return element.textContent;
return element.textContent.replace(/(\r\n|\n|\r)/gm, "").trim();
});
let formatted${
variableName.charAt(0).toUpperCase() + variableName.slice(1)
Expand All @@ -210,11 +217,15 @@ const parseSingleCommandFromStep = (
break;
}
case StepAction.EXTRACT_IMAGE_SRC: {
if (step.totalSelected && step.totalSelected > 1) {
if (
step.totalSelected &&
step.totalSelected > 1 &&
amountToExtract !== "1"
) {
command += `
const ${variableName} = await page.evaluate(() => {
const elements = document.querySelectorAll("${step.selector}")
return [...elements].map(element => element.src || null);
return [...elements].map(element => element.src || null).slice(0,${amountToExtract});
});`;
} else {
command += `
Expand All @@ -230,11 +241,15 @@ const parseSingleCommandFromStep = (
break;
}
case StepAction.EXTRACT_HREF: {
if (step.totalSelected && step.totalSelected > 1) {
if (
step.totalSelected &&
step.totalSelected > 1 &&
amountToExtract !== "1"
) {
command += `
const ${variableName} = await page.evaluate(() => {
const elements = document.querySelectorAll("${step.selector}")
return [...elements].map(element => element.href || null);
return [...elements].map(element => element.href || null).slice(0,${amountToExtract});
});`;
} else {
command += `
Expand Down Expand Up @@ -269,11 +284,29 @@ const parseSingleCommandFromStep = (
return command;
};

/**
 * Reads the value of a step's "Custom amount to extract" option.
 *
 * The returned string is spliced verbatim into the generated script (for
 * example as the end index of a `.slice(0, …)` call) and is compared against
 * the literal `"1"` to decide between single and multi element extraction, so
 * it is normalised here instead of being returned exactly as the user typed it.
 *
 * @param step - Step whose options are inspected. `options` may be missing or
 *   contain empty entries; both are tolerated.
 * @returns The amount as a canonical string of digits (surrounding whitespace
 *   and leading zeros removed), or `""` when the option is absent or its value
 *   is not a non-negative integer.
 */
const getAmountToExtract = (step: Step): string => {
  // `options` is read with `?.` elsewhere in this file, so a step without any
  // options must not make this helper throw.
  const amountOption = step.options?.find(
    (option) => option?.type === OptionType.CUSTOM_AMOUNT_TO_EXTRACT
  ) as OptionWithValue | undefined;

  const rawAmount = String(amountOption?.value ?? "").trim();

  // Anything other than plain digits would end up as arbitrary source text in
  // the generated script, so it is treated like "no custom amount".
  if (!/^\d+$/.test(rawAmount)) {
    return "";
  }

  // Drop leading zeros ("01" -> "1") so string comparisons made by callers
  // behave the same as a numeric comparison would.
  return rawAmount.replace(/^0+(?=\d)/, "");
};

const parseLoopFromStep = (step: Step) => {
const paginationOption = step.options?.find(
(option) => option?.type === OptionType.PAGINATION
) as OptionWithValue | undefined;

const amountToExtract = getAmountToExtract(step);

let urlsExtractionCommand;
if (paginationOption) {
urlsExtractionCommand = `
Expand All @@ -282,29 +315,36 @@ const parseLoopFromStep = (step: Step) => {
urls = await page.evaluate(() => {
return [...document.querySelectorAll("${step.selector}")].map((node) => node.href);
});
let i = 0
// 1000 pages max
console.log("Extracting URLs");
const paginationBar = new ProgressBar(" scrapping [:bar] :rate/bps :percent :etas", {
complete: "=",
incomplete: " ",
width: 20,
total: 1000
});
while(i <= 1000){
paginationBar.tick()
i += 1
const nodes = await page.$$("${paginationOption?.value}");
await nodes.pop().click();
await page.waitForTimeout(4000);
try{
await page.waitForSelector("${step.selector}")
}catch{
break;
if(urls.length >= ${amountToExtract}){
urls = urls.slice(0, ${amountToExtract})
} else {
let i = 0
console.log("Extracting URLs");
const paginationBar = new ProgressBar(" scrapping [:bar] :rate/bps :percent :etas", {
complete: "=",
incomplete: " ",
width: 20,
total: 1000
});
while(i <= 1000){
paginationBar.tick()
i += 1
const nodes = await page.$$("${paginationOption?.value}");
await nodes.pop().click();
await page.waitForTimeout(4000);
try{
await page.waitForSelector("${step.selector}")
}catch{
break;
}
urls = urls.concat(await page.evaluate(() => {
return [...document.querySelectorAll("${step.selector}")].map(node => node.href);
}))
if (urls.length >= ${amountToExtract}) {
urls = urls.slice(0, ${amountToExtract})
break;
}
}
urls = urls.concat(await page.evaluate(() => {
return [...document.querySelectorAll("${step.selector}")].map(node => node.href);
}))
}
`;
} else {
Expand Down
25 changes: 23 additions & 2 deletions src/service/helperFunctions.tsx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { TagType, StepAction, Step, ScrappedStep } from "../types";
import { TagType, StepAction, Step, ScrappedStep, OptionType } from "../types";

export const parseTagTypeFromAction = (action: StepAction): TagType => {
if (action === StepAction.EXTRACT_HREF || action === StepAction.NAVIGATE) {
Expand Down Expand Up @@ -42,7 +42,9 @@ export const getSelectorContent = (
}
switch (action) {
case StepAction.EXTRACT_TEXT: {
return element.textContent ?? undefined;
return (
element.textContent?.replace(/(\r\n|\n|\r)/gm, "").trim() ?? undefined
);
}
case StepAction.EXTRACT_IMAGE_SRC: {
return (element as HTMLImageElement).src;
Expand Down Expand Up @@ -159,3 +161,22 @@ export const isStepInActionProcess = (step: Step): boolean => {
}
return isSelectingButNoTagName;
};

/**
 * Maps an option type to the placeholder text shown in its value input.
 *
 * @param optionType - The option type to look up; may be omitted.
 * @returns The placeholder for pagination, regex and custom-amount options,
 *   or an empty string for any other (or missing) option type.
 */
export const parseInputPlaceholderFromOption = (
  optionType?: OptionType
): string => {
  if (optionType === OptionType.PAGINATION) {
    return "Node query selector";
  }
  if (optionType === OptionType.REGEX) {
    return "Regex with group to match";
  }
  if (optionType === OptionType.CUSTOM_AMOUNT_TO_EXTRACT) {
    return "Amount to extract";
  }
  // Every remaining option type takes no value, hence no placeholder.
  return "";
};
1 change: 1 addition & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ export enum OptionType {
INFINITE_SCROLL = "Infinite Scroll",
PAGINATION = "Pagination",
REGEX = "Regex",
CUSTOM_AMOUNT_TO_EXTRACT = "Custom amount to extract",
}

export type SimpleOption = {
Expand Down

0 comments on commit 597e7c1

Please sign in to comment.