From f0f4ce828b2decb2efe63d1876ae7896f4f17112 Mon Sep 17 00:00:00 2001
From: Tushar Mathur
Date: Sat, 16 Sep 2017 15:37:45 +0530
Subject: [PATCH 1/4] dream11 solution

---
 crawler.js   | 60 +++++++++++++++++++++++++++++++++++-----------
 package.json |  7 +++++-
 2 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/crawler.js b/crawler.js
index b248dbc..fd5e72a 100644
--- a/crawler.js
+++ b/crawler.js
@@ -4,15 +4,51 @@
 'use strict'
 
-/**
- * Crawls a website using a start {url}, and returns the lexicographically smallest string.
- * @param url
- * @return {Promise.}
- */
-module.exports = url =>
-  new Promise((resolve, reject) => {
-    /**
-     * TODO: Write your high performance code here.
-     */
-    reject(new Error('NotImplemented'))
-  })
+const O = require('observable-air')
+const promiseRetry = require('promise-retry')
+const axios = require('axios')
+const {JSDOM} = require('jsdom')
+const R = require('ramda')
+
+const requestRetry = url => promiseRetry(retry => axios.get(url).catch(retry))
+
+const request$ = url => O.fromPromise(() => requestRetry(url))
+const extractDOM = R.compose(
+  R.path(['window', 'document']),
+  R.construct(JSDOM),
+  R.prop('data')
+)
+const querySelectorAll = R.curry((selector, doc) =>
+  Array.from(doc.querySelectorAll(selector))
+)
+const extractCodes = R.compose(
+  O.fromArray,
+  R.pluck('innerHTML'),
+  querySelectorAll('h1')
+)
+const extractLinks = R.compose(
+  O.fromArray,
+  R.pluck('href'),
+  querySelectorAll('a')
+)
+
+const crawl = R.curry((base, unique, url) => {
+  const response$ = O.multicast(request$(url))
+  const document$ = O.map(extractDOM, response$)
+  const code$ = O.flatMap(extractCodes, document$)
+  const link$ = O.map(R.concat(base), O.flatMap(extractLinks, document$))
+  return O.merge(code$, O.flatMap(crawl(base, unique), unique(link$)))
+})
+
+const findMin = source =>
+  O.reduce(
+    (last, current) => (current < last ? current : last),
+    'zzzzzzzz',
+    source
+  )
+
+const main = url => findMin(crawl(url, O.uniqueWith(new Set()), url))
+
+module.exports = url => {
+  return new Promise(resolve => O.forEach(result => resolve(result), main(url)))
+}
 
diff --git a/package.json b/package.json
index 0631069..2c4b1cf 100644
--- a/package.json
+++ b/package.json
@@ -10,10 +10,15 @@
   "author": "",
   "license": "ISC",
   "dependencies": {
+    "axios": "^0.16.2",
     "express": "^4.15.4",
     "express-rate-limit": "^2.9.0",
+    "jsdom": "^11.2.0",
     "mocha": "^3.5.3",
     "nodemon": "^1.12.0",
-    "pug": "^2.0.0-rc.4"
+    "observable-air": "^7.4.0",
+    "promise-retry": "^1.1.1",
+    "pug": "^2.0.0-rc.4",
+    "ramda": "^0.24.1"
   }
 }

From 2cd148a0f327bb2457812c0a60dec1fcb0fa9664 Mon Sep 17 00:00:00 2001
From: Cholaraja
Date: Sat, 16 Sep 2017 15:43:09 +0530
Subject: [PATCH 2/4] Create crawler.py

---
 crawler.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 crawler.py

diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..45d258d
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,32 @@
+import time
+import requests
+from BeautifulSoup import BeautifulSoup
+
+word_list = []
+link_list = set()
+url = 'http://localhost:8080'
+
+# def get_links(html):
+start = time.time()
+def recursiveLinks(route):
+    nurl = url+route
+    page = requests.get(nurl)
+    html = BeautifulSoup(page.content)
+    words = [ h1.text for h1 in html.findAll('h1') if h1.text]
+    word_list.extend(words)
+    links = html.findAll('a')
+    if links > 0:
+        count = 0
+        maxi = len(links)
+        while count < maxi:
+            link = links[count]
+            if not link['href'] in link_list:
+                link_list.add(link['href'])
+                links.extend(recursiveLinks(link['href']))
+            count += 1
+    return links
+
+# print(start)
+recursiveLinks('')
+print(min(word_list))
+print(time.time() - start)

From 854b9065ea5c41e44e276268b8bd6672b34c68f9 Mon Sep 17 00:00:00 2001
From: Cholaraja
Date: Sat, 16 Sep 2017 15:53:29 +0530
Subject: [PATCH 3/4] Create requirements.txt

---
 requirements.txt | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..2aa9911
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+BeautifulSoup==3.2.1
+requests==2.18.4

From 80dbf3b206c7a1d02bb9c52ace891f1bce27b5e2 Mon Sep 17 00:00:00 2001
From: Cholaraja
Date: Sat, 16 Sep 2017 16:14:29 +0530
Subject: [PATCH 4/4] Update crawler.py

---
 crawler.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/crawler.py b/crawler.py
index 45d258d..f0e3e25 100644
--- a/crawler.py
+++ b/crawler.py
@@ -16,14 +16,10 @@ def recursiveLinks(route):
     word_list.extend(words)
     links = html.findAll('a')
     if links > 0:
-        count = 0
-        maxi = len(links)
-        while count < maxi:
-            link = links[count]
+        for link in links:
             if not link['href'] in link_list:
                 link_list.add(link['href'])
                 links.extend(recursiveLinks(link['href']))
-            count += 1
     return links
 
 # print(start)
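
For reference, a minimal sketch of how the module exported by crawler.js in PATCH 1/4 could be driven; the patches themselves never show an invocation, so the start URL below is an assumption borrowed from the hard-coded default in crawler.py, and the driver file name is hypothetical:

    // run.js (hypothetical driver, not part of the patches above)
    // The exported function crawls from the given start URL and resolves with the
    // lexicographically smallest <h1> text it finds (see extractCodes/findMin).
    const crawl = require('./crawler')

    // Start URL assumed from crawler.py's default of http://localhost:8080.
    crawl('http://localhost:8080')
      .then(smallest => console.log('smallest string:', smallest))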