Skip to content

Commit

Permalink
Use puppeteer cluster application
Browse files Browse the repository at this point in the history
The first naive implementation did not really work for me,
I have noticed many hanging Chromes, and the overall performance was
a problem. It seems to look better with the pooling.
  • Loading branch information
oltarasenko committed Aug 23, 2023
1 parent 51c99d9 commit c9b28d3
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 5 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,4 @@ EXPOSE 3000
# Set environment variables
ENV CHROME_EXECUTABLE_PATH=/usr/bin/google-chrome
# Command to start your application
CMD ["node", "render.js"]
CMD ["node", "cluster.js"]
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ npm install
To start the Crawly Render Server, use the following command:

``` sh
node render.js
node cluster.js
```

The server will listen on port 3000 by default. You can access the rendering endpoint at http://localhost:3000/render.
Expand Down Expand Up @@ -74,7 +74,7 @@ docker run -p 3000:3000 crawly-render-server
The server will be accessible at http://localhost:3000.

## Configuration
You can customize the Crawly Render Server by modifying the code in render.js. For example, you can adjust the server port or Puppeteer launch options.
You can customize the Crawly Render Server by modifying the code in `cluster.js`. For example, you can adjust the server port or Puppeteer launch options.

## License
This project is licensed under the MIT License - see the LICENSE file for details.
107 changes: 107 additions & 0 deletions cluster.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
const { Cluster } = require('puppeteer-cluster');
const express = require('express');

// Running totals, mutated by the request-handling code and printed by the stats logger.
let servedRequests = 0;
let errorCount = 0;

const app = express();

// Port the HTTP server listens on. Defaults to 3000; a PORT environment
// variable holding a valid TCP port (1-65535) overrides the default, so the
// port can be changed without editing this file.
const DEFAULT_PORT = 3000;
const requestedPort = Number.parseInt(process.env.PORT, 10);
const port =
  Number.isInteger(requestedPort) && requestedPort > 0 && requestedPort <= 65535
    ? requestedPort
    : DEFAULT_PORT;

// Parse JSON request bodies so handlers can read fields from req.body.
app.use(express.json());
/**
 * Prints the current request and error counters to stdout, one per line.
 */
function logServerStats() {
  const statLines = [
    `Served Requests: ${servedRequests}`,
    `Error Count: ${errorCount}`,
  ];
  for (const statLine of statLines) {
    console.log(statLine);
  }
}

// Emit the counters once per minute.
const STATS_LOG_INTERVAL_MS = 60 * 1000;
setInterval(logServerStats, STATS_LOG_INTERVAL_MS);

// Command-line flags handed to the browser at launch.
// ('--single-process' is intentionally left out.)
const chromeArgs = [
  '--no-sandbox',
  '--disable-gpu',
  '--disable-dev-shm-usage',
  '--disable-setuid-sandbox',
  '--no-first-run',
  '--no-zygote',
  '--deterministic-fetch',
  '--disable-features=IsolateOrigins',
  '--disable-site-isolation-trials',
];

// Optional browser binary location, read from the environment.
const chromeExecutablePath = process.env.CHROME_EXECUTABLE_PATH;

// Launch options for the browser. `executablePath` is only present when
// CHROME_EXECUTABLE_PATH is set to a non-empty value.
const launchOptions = {
  headless: "new",
  args: chromeArgs,
  ...(chromeExecutablePath ? { executablePath: chromeExecutablePath } : {}),
};

/**
 * Converts a raw setting into a usable worker count.
 *
 * @param {string|undefined} rawValue - Text to parse (base 10), e.g. an environment variable.
 * @param {number} fallback - Value returned when rawValue is missing or is not a positive integer.
 * @returns {number} The parsed positive integer, or `fallback`.
 */
function parseMaxConcurrency(rawValue, fallback) {
  const parsed = Number.parseInt(rawValue, 10);
  // Rejects NaN (unset / non-numeric input) as well as zero and negative counts.
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}

// Number of parallel workers: 2 unless MAX_CONCURRENCY holds a positive integer.
const DEFAULT_MAX_CONCURRENCY = 2;
let max_concurrency = parseMaxConcurrency(process.env.MAX_CONCURRENCY, DEFAULT_MAX_CONCURRENCY);

/**
 * Builds the quoted URL fragment used in the fetch log line.
 *
 * @param {string} requestedUrl - URL the caller asked for.
 * @param {string} finalUrl - URL the page ended up on after navigation.
 * @returns {string} `'requested'`, or `'requested' -> 'final'` when the two differ.
 */
function describeNavigation(requestedUrl, finalUrl) {
  if (finalUrl !== requestedUrl) {
    return `'${requestedUrl}' -> '${finalUrl}'`;
  }
  return `'${requestedUrl}'`;
}

// Startup sequence: launch the browser cluster, register the render task,
// expose it over HTTP, and install shutdown handlers.
(async () => {
  // Create a cluster with up to `max_concurrency` workers.
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_CONTEXT,
    maxConcurrency: max_concurrency,
    puppeteerOptions: launchOptions,
  });

  // Task run for every queued job: load `url` (optionally with extra request
  // headers) and resolve with the rendered HTML, the HTTP status and the
  // response headers.
  cluster.task(async ({ page, data: { url, headers } }) => {
    const startTime = Date.now();

    if (headers && typeof headers === 'object') {
      // Pass the whole map in one call: each setExtraHTTPHeaders call replaces
      // the previously configured set, so setting headers one at a time would
      // leave only the last one in effect.
      await page.setExtraHTTPHeaders(headers);
    }

    const response = await page.goto(url, { timeout: 60000 });
    if (!response) {
      // goto may resolve without a response object; fail with a clear message
      // instead of a TypeError on response.status().
      throw new Error(`Navigation to '${url}' produced no response`);
    }

    const statusCode = response.status();
    const finalUrl = page.url();
    const pageBody = await page.content();
    const loadTime = Date.now() - startTime;

    console.log(
      `[DEBUG] Fetched ${describeNavigation(url, finalUrl)} status: ${statusCode} (${loadTime / 1000}s)`
    );
    servedRequests++;
    return { page: pageBody, status: statusCode, headers: response.headers() };
  });

  // POST /render — body: { url, headers? }. Responds 400 when `url` is missing,
  // 200 with the task result on success, 500 when rendering fails.
  app.post('/render', async (req, res) => {
    const { url, headers } = req.body;

    if (!url) {
      return res.status(400).json({ error: 'URL parameter is required.' });
    }

    try {
      const result = await cluster.execute({ url, headers });
      res.status(200).json(result);
    } catch (err) {
      errorCount++;
      console.debug("[DEBUG] Could not get '" + url + "' Error: " + err);
      res.status(500).json({ error: 'An error occurred while processing the URL.' + err });
    }
  });

  // Start the Express server; keep the handle so shutdown can close it.
  const server = app.listen(port, () => {
    console.log(`Server is running on port ${port}`);
  });

  // Graceful shutdown: stop accepting connections, let queued jobs finish,
  // close the browsers, then exit. Guarded so a repeated signal is a no-op.
  let shuttingDown = false;
  const shutdown = async () => {
    if (shuttingDown) {
      return;
    }
    shuttingDown = true;
    server.close();
    try {
      await cluster.idle();
      await cluster.close();
    } catch (err) {
      console.error('Error while shutting down the cluster:', err);
      process.exitCode = 1;
    } finally {
      process.exit();
    }
  };

  // SIGINT covers Ctrl+C; SIGTERM is what `docker stop` sends.
  process.on('SIGINT', shutdown);
  process.on('SIGTERM', shutdown);
})().catch((err) => {
  // Startup failed (e.g. the browser could not be launched): report and exit non-zero.
  console.error('Failed to start the render server:', err);
  process.exit(1);
});
22 changes: 21 additions & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"express": "^4.18.2",
"http-proxy": "^1.18.1",
"mocha": "^10.2.0",
"puppeteer": "^21.0.3"
"puppeteer": "^21.0.3",
"puppeteer-cluster": "^0.23.0"
}
}

0 comments on commit c9b28d3

Please sign in to comment.