Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: handle browser link navigation #208

Merged
merged 11 commits into from
Dec 4, 2024
2 changes: 1 addition & 1 deletion maxun-core/src/browserSide/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
}
},
(key) => key // Use the original key in the output
));
)) || [];
}

/**
Expand Down
87 changes: 73 additions & 14 deletions maxun-core/src/interpret.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,23 @@ import Concurrency from './utils/concurrency';
import Preprocessor from './preprocessor';
import log, { Level } from './utils/logger';

/**
 * Augments the global `Window` type with the custom scraping helpers so that
 * calls such as `window.scrapeSchema(...)` made inside `page.evaluate`
 * callbacks type-check.
 *
 * This is a type-only declaration; it does not define the functions.
 * NOTE(review): the implementations are presumably installed by the
 * browser-side scraper script before use (see `ensureScriptsLoaded`) — confirm.
 */
declare global {
interface Window {
// Takes a selector (or null) and returns an array of string-valued records.
scrape: (selector: string | null) => Record<string, string>[];
// Takes a map of output key -> { selector, tag, attribute } and returns a single record.
scrapeSchema: (
schema: Record<string, { selector: string; tag: string; attribute: string }>
) => Record<string, any>;
// Takes a list configuration (list selector, fields, optional limit, pagination) and returns an array of records.
scrapeList: (config: { listSelector: string; fields: any; limit?: number; pagination: any }) => Record<string, any>[];
// Takes a list selector and returns `{ selector, innerText }` entries.
scrapeListAuto: (listSelector: string) => { selector: string; innerText: string }[];
// Scroll helpers; `pages` is optional and nothing is returned.
scrollDown: (pages?: number) => void;
scrollUp: (pages?: number) => void;
}
}


/**
* Defines optional interpreter options (passed in constructor)
*/
Expand All @@ -31,7 +48,6 @@ interface InterpreterOptions {
}>
}


/**
* Class for running the Smart Workflows.
*/
Expand All @@ -50,14 +66,18 @@ export default class Interpreter extends EventEmitter {

private blocker: PlaywrightBlocker | null = null;

private cumulativeResults: Record<string, any>[] = [];

constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>) {
super();
this.workflow = workflow.workflow;
this.initializedWorkflow = null;
this.options = {
maxRepeats: 5,
maxConcurrency: 5,
serializableCallback: (data) => { log(JSON.stringify(data), Level.WARN); },
serializableCallback: (data) => {
log(JSON.stringify(data), Level.WARN);
},
binaryCallback: () => { log('Received binary data, thrashing them.', Level.WARN); },
debug: false,
debugChannel: {},
Expand Down Expand Up @@ -214,11 +234,11 @@ export default class Interpreter extends EventEmitter {
// every condition is treated as a single context

switch (key as keyof typeof operators) {
case '$and':
case '$and' as keyof typeof operators:
return array?.every((x) => this.applicable(x, context));
case '$or':
case '$or' as keyof typeof operators:
return array?.some((x) => this.applicable(x, context));
case '$not':
case '$not' as keyof typeof operators:
return !this.applicable(<Where>value, context); // $not should be a unary operator
default:
throw new Error('Undefined logic operator.');
Expand All @@ -233,9 +253,9 @@ export default class Interpreter extends EventEmitter {
};

switch (key as keyof typeof meta) {
case '$before':
case '$before' as keyof typeof meta:
return !usedActions.find(testRegexString);
case '$after':
case '$after' as keyof typeof meta:
return !!usedActions.find(testRegexString);
default:
throw new Error('Undefined meta operator.');
Expand Down Expand Up @@ -308,9 +328,43 @@ export default class Interpreter extends EventEmitter {

/**
 * Scrapes the current page against `schema`, folds the scraped fields into the
 * interpreter-wide `cumulativeResults`, and reports one merged record through
 * `serializableCallback`.
 *
 * A key is recorded only the first time it is seen with a defined value, so
 * fields scraped by earlier `scrapeSchema` actions are kept and later actions
 * only contribute keys that are still missing.
 *
 * @param schema Map of output key to field descriptor; forwarded unchanged to
 *               `window.scrapeSchema` inside the page.
 */
scrapeSchema: async (schema: Record<string, { selector: string; tag: string, attribute: string; }>) => {
  await this.ensureScriptsLoaded(page);

  const scrapeResult = await page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema);

  // The declared return type is a single record, but an array of records is tolerated.
  const newResults = Array.isArray(scrapeResult) ? scrapeResult : [scrapeResult];

  for (const result of newResults) {
    // A null or non-object result carries no fields; skipping it avoids a
    // TypeError from Object.entries.
    if (result === null || typeof result !== 'object') {
      continue;
    }

    for (const [key, value] of Object.entries(result)) {
      // Own-property check: `key in item` would also match inherited names
      // such as "constructor" or "toString" and silently drop those fields.
      const keyExists = this.cumulativeResults.some(
        (item) => Object.prototype.hasOwnProperty.call(item, key) && item[key] !== undefined
      );

      if (!keyExists) {
        this.cumulativeResults.push({ [key]: value });
      }
    }
  }

  // Flatten the single-key entries into one record, ignoring undefined values.
  const merged: Record<string, any> = {};
  for (const entry of this.cumulativeResults) {
    for (const [key, value] of Object.entries(entry)) {
      if (value !== undefined) {
        merged[key] = value;
      }
    }
  }
  const mergedResult: Record<string, string>[] = [merged];

  // Diagnostic dump only in debug mode, through the interpreter's logger.
  if (this.options.debug) {
    this.log(`CUMULATIVE results: ${JSON.stringify(this.cumulativeResults)}`, Level.LOG);
    this.log(`MERGED results: ${JSON.stringify(mergedResult)}`, Level.LOG);
  }

  // Report exactly once per action, with the merged view.
  await this.options.serializableCallback(mergedResult);
},

scrapeList: async (config: { listSelector: string, fields: any, limit?: number, pagination: any }) => {
Expand Down Expand Up @@ -357,15 +411,15 @@ export default class Interpreter extends EventEmitter {
};

for (const step of steps) {
this.log(`Launching ${step.action}`, Level.LOG);
this.log(`Launching ${String(step.action)}`, Level.LOG);

if (step.action in wawActions) {
// "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not)
const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
await wawActions[step.action as CustomFunctions](...(params ?? []));
} else {
// Implements the dot notation for the "method name" in the workflow
const levels = step.action.split('.');
const levels = String(step.action).split('.');
const methodName = levels[levels.length - 1];

let invokee: any = page;
Expand Down Expand Up @@ -534,9 +588,14 @@ export default class Interpreter extends EventEmitter {
if (this.options.debug) {
this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, Level.WARN);
}
const actionId = workflow.findIndex(
(step) => this.applicable(step.where, pageState, usedActions),
);

const actionId = workflow.findIndex((step) => {
const isApplicable = this.applicable(step.where, pageState, usedActions);
console.log(`Where:`, step.where);
console.log(`Page state:`, pageState);
console.log(`Match result: ${isApplicable}`);
return isApplicable;
});

const action = workflow[actionId];

Expand Down
96 changes: 79 additions & 17 deletions server/src/workflow-management/classes/Generator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import { getBestSelectorForAction } from "../utils";
import { browserPool } from "../../server";
import { uuid } from "uuidv4";
import { capture } from "../../utils/analytics"
import { encrypt } from "../../utils/auth";

interface PersistedGeneratedData {
lastUsedSelector: string;
Expand Down Expand Up @@ -159,6 +160,55 @@ export class WorkflowGenerator {
})
};

/**
 * Checks whether the list selector of a scrapeList configuration is currently
 * visible on the page.
 * @param page The current Playwright Page object.
 * @param config The scrapeList configuration object; only `listSelector` is read.
 * @returns {Promise<string[]>} `[listSelector]` when it is visible, otherwise an empty array.
 */
private async getSelectorsForScrapeList(page: Page, config: {
  listSelector: string;
  fields: any;
  limit?: number;
  pagination: any;
}): Promise<string[]> {
  const target = config.listSelector;

  // Nothing to verify without a list selector.
  if (!target) {
    return [];
  }

  // A failed visibility query is treated the same as "not visible".
  const visible = await page.isVisible(target).catch(() => false);
  if (!visible) {
    logger.log('warn', `List selector ${target} is not visible on the page.`);
    return [];
  }

  logger.log('debug', `List selector ${target} is actionable.`);
  return [target];
}

/**
 * Collects the selectors of a scrapeSchema configuration that are currently
 * visible on the page.
 * @param page The current Playwright Page object.
 * @param schema The scrapeSchema configuration object (output key -> field with a `selector`).
 * @returns {Promise<string[]>} The visible selectors, in schema order.
 */
private async getSelectorsForSchema(page: Page, schema: Record<string, { selector: string }>): Promise<string[]> {
  // Drop malformed fields up front so a null entry cannot throw and an empty
  // selector does not cost a browser round-trip.
  const selectors = Object.values(schema)
    .map((field) => field?.selector)
    .filter((selector): selector is string => typeof selector === 'string' && selector.length > 0);

  // The checks are independent, so run them concurrently; a failed query is
  // treated the same as "not visible".
  const visibility = await Promise.all(
    selectors.map((selector) => page.isVisible(selector).catch(() => false))
  );

  // Promise.all preserves input order, so the index pairs each flag with its selector.
  return selectors.filter((_, index) => visibility[index]);
}

/**
* Adds a newly generated pair to the workflow and notifies the client about it by
* sending the updated workflow through socket.
Expand All @@ -184,55 +234,67 @@ export class WorkflowGenerator {
*/
private addPairToWorkflowAndNotifyClient = async (pair: WhereWhatPair, page: Page) => {
let matched = false;
// validate if a pair with the same where conditions is already present in the workflow

// Check for scrapeSchema actions and enhance the where condition
if (pair.what[0].action === 'scrapeSchema') {
const schema = pair.what[0]?.args?.[0];
if (schema) {
const additionalSelectors = await this.getSelectorsForSchema(page, schema);
pair.where.selectors = [...(pair.where.selectors || []), ...additionalSelectors];
}
}

if (pair.what[0].action === 'scrapeList') {
const config = pair.what[0]?.args?.[0];
if (config) {
const actionableSelectors = await this.getSelectorsForScrapeList(page, config);
pair.where.selectors = [...(pair.where.selectors || []), ...actionableSelectors];
}
}

// Validate if the pair is already in the workflow
if (pair.where.selectors && pair.where.selectors[0]) {
const match = selectorAlreadyInWorkflow(pair.where.selectors[0], this.workflowRecord.workflow);
if (match) {
// if a match of where conditions is found, the new action is added into the matched rule
const matchedIndex = this.workflowRecord.workflow.indexOf(match);
if (pair.what[0].action !== 'waitForLoadState' && pair.what[0].action !== 'press') {
pair.what.push({
action: 'waitForLoadState',
args: ['networkidle'],
})
});
}
this.workflowRecord.workflow[matchedIndex].what = this.workflowRecord.workflow[matchedIndex].what.concat(pair.what);
logger.log('info', `Pushed ${JSON.stringify(this.workflowRecord.workflow[matchedIndex])} to workflow pair`);
matched = true;
}
}
// if the where conditions of the pair are not already in the workflow, we need to validate the where conditions
// for possible overshadowing of different rules and handle cases according to the recording logic

// Handle cases where the where condition isn't already present
if (!matched) {
const handled = await this.handleOverShadowing(pair, page, this.generatedData.lastIndex || 0);
if (!handled) {
//adding waitForLoadState with networkidle, for better success rate of automatically recorded workflows
if (pair.what[0].action !== 'waitForLoadState' && pair.what[0].action !== 'press') {
pair.what.push({
action: 'waitForLoadState',
args: ['networkidle'],
})
});
}
if (this.generatedData.lastIndex === 0) {
this.generatedData.lastIndex = null;
// we want to have the most specific selectors at the beginning of the workflow
this.workflowRecord.workflow.unshift(pair);
} else {
this.workflowRecord.workflow.splice(this.generatedData.lastIndex || 0, 0, pair);
if (this.generatedData.lastIndex) {
this.generatedData.lastIndex = this.generatedData.lastIndex - 1;
this.generatedData.lastIndex -= 1;
}
}
logger.log('info',
`${JSON.stringify(pair)}: Added to workflow file on index: ${this.generatedData.lastIndex || 0}`);
} else {
logger.log('debug',
` ${JSON.stringify(this.workflowRecord.workflow[this.generatedData.lastIndex || 0])} added action to workflow pair`);
}
}

// Emit the updated workflow to the client
this.socket.emit('workflow', this.workflowRecord);
logger.log('info', `Workflow emitted`);
};


/**
* Generates a pair for the click event.
Expand Down Expand Up @@ -300,7 +362,7 @@ export class WorkflowGenerator {
where,
what: [{
action: 'press',
args: [selector, key],
args: [selector, encrypt(key)],
}],
}
if (selector) {
Expand Down Expand Up @@ -797,7 +859,7 @@ export class WorkflowGenerator {
// when more than one press action is present, add a type action
pair.what.splice(index - input.actionCounter, input.actionCounter, {
action: 'type',
args: [input.selector, input.value],
args: [input.selector, encrypt(input.value)],
}, {
action: 'waitForLoadState',
args: ['networkidle'],
Expand Down
Loading