From f0f4ce828b2decb2efe63d1876ae7896f4f17112 Mon Sep 17 00:00:00 2001
From: Tushar Mathur
Date: Sat, 16 Sep 2017 15:37:45 +0530
Subject: [PATCH 1/4] dream11 solution

---
 crawler.js   | 60 +++++++++++++++++++++++++++++++++++-----------
 package.json |  7 +++++-
 2 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/crawler.js b/crawler.js
index b248dbc..fd5e72a 100644
--- a/crawler.js
+++ b/crawler.js
@@ -4,15 +4,51 @@
 'use strict'
 
-/**
- * Crawls a website using a start {url}, and returns the lexicographically smallest string.
- * @param url
- * @return {Promise.}
- */
-module.exports = url =>
-  new Promise((resolve, reject) => {
-    /**
-     * TODO: Write your high performance code here.
-     */
-    reject(new Error('NotImplemented'))
-  })
+const O = require('observable-air')
+const promiseRetry = require('promise-retry')
+const axios = require('axios')
+const {JSDOM} = require('jsdom')
+const R = require('ramda')
+
+const requestRetry = url => promiseRetry(retry => axios.get(url).catch(retry))
+
+const request$ = url => O.fromPromise(() => requestRetry(url))
+const extractDOM = R.compose(
+  R.path(['window', 'document']),
+  R.construct(JSDOM),
+  R.prop('data')
+)
+const querySelectorAll = R.curry((selector, doc) =>
+  Array.from(doc.querySelectorAll(selector))
+)
+const extractCodes = R.compose(
+  O.fromArray,
+  R.pluck('innerHTML'),
+  querySelectorAll('h1')
+)
+const extractLinks = R.compose(
+  O.fromArray,
+  R.pluck('href'),
+  querySelectorAll('a')
+)
+
+const crawl = R.curry((base, unique, url) => {
+  const response$ = O.multicast(request$(url))
+  const document$ = O.map(extractDOM, response$)
+  const code$ = O.flatMap(extractCodes, document$)
+  const link$ = O.map(R.concat(base), O.flatMap(extractLinks, document$))
+  return O.merge(code$, O.flatMap(crawl(base, unique), unique(link$)))
+})
+
+const findMin = source =>
+  O.reduce(
+    (last, current) => (current < last ? current : last),
+    'zzzzzzzz',
+    source
+  )
+
+const main = url => findMin(crawl(url, O.uniqueWith(new Set()), url))
+
+module.exports = url => {
+  return new Promise(resolve => O.forEach(result => resolve(result), main(url)))
+}
 
diff --git a/package.json b/package.json
index 0631069..2c4b1cf 100644
--- a/package.json
+++ b/package.json
@@ -10,10 +10,15 @@
   "author": "",
   "license": "ISC",
   "dependencies": {
+    "axios": "^0.16.2",
     "express": "^4.15.4",
     "express-rate-limit": "^2.9.0",
+    "jsdom": "^11.2.0",
     "mocha": "^3.5.3",
     "nodemon": "^1.12.0",
-    "pug": "^2.0.0-rc.4"
+    "observable-air": "^7.4.0",
+    "promise-retry": "^1.1.1",
+    "pug": "^2.0.0-rc.4",
+    "ramda": "^0.24.1"
   }
 }

From 2cd148a0f327bb2457812c0a60dec1fcb0fa9664 Mon Sep 17 00:00:00 2001
From: Cholaraja
Date: Sat, 16 Sep 2017 15:43:09 +0530
Subject: [PATCH 2/4] Create crawler.py

---
 crawler.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 crawler.py

diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..45d258d
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,32 @@
+import time
+import requests
+from BeautifulSoup import BeautifulSoup
+
+word_list = []
+link_list = set()
+url = 'http://localhost:8080'
+
+# def get_links(html):
+start = time.time()
+def recursiveLinks(route):
+    nurl = url+route
+    page = requests.get(nurl)
+    html = BeautifulSoup(page.content)
+    words = [ h1.text for h1 in html.findAll('h1') if h1.text]
+    word_list.extend(words)
+    links = html.findAll('a')
+    if links > 0:
+        count = 0
+        maxi = len(links)
+        while count < maxi:
+            link = links[count]
+            if not link['href'] in link_list:
+                link_list.add(link['href'])
+                links.extend(recursiveLinks(link['href']))
+            count += 1
+    return links
+
+# print(start)
+recursiveLinks('')
+print(min(word_list))
+print(time.time() - start)

From 854b9065ea5c41e44e276268b8bd6672b34c68f9 Mon Sep 17 00:00:00 2001
From: Cholaraja
Date: Sat, 16 Sep 2017 15:53:29 +0530
Subject: [PATCH 3/4] Create requirements.txt

---
 requirements.txt | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..2aa9911
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+BeautifulSoup==3.2.1
+requests==2.18.4

From 80dbf3b206c7a1d02bb9c52ace891f1bce27b5e2 Mon Sep 17 00:00:00 2001
From: Cholaraja
Date: Sat, 16 Sep 2017 16:14:29 +0530
Subject: [PATCH 4/4] Update crawler.py

---
 crawler.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/crawler.py b/crawler.py
index 45d258d..f0e3e25 100644
--- a/crawler.py
+++ b/crawler.py
@@ -16,14 +16,10 @@ def recursiveLinks(route):
     word_list.extend(words)
     links = html.findAll('a')
     if links > 0:
-        count = 0
-        maxi = len(links)
-        while count < maxi:
-            link = links[count]
+        for link in links:
             if not link['href'] in link_list:
                 link_list.add(link['href'])
                 links.extend(recursiveLinks(link['href']))
-            count += 1
     return links
 
 # print(start)
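
For reference, a minimal sketch of how the module exported by crawler.js in PATCH 1/4 could be driven; the patches themselves never show an invocation, so the start URL below is an assumption borrowed from the hard-coded default in crawler.py, and the driver file name is hypothetical:

    // run.js (hypothetical driver, not part of the patches above)
    // The exported function crawls from the given start URL and resolves with the
    // lexicographically smallest <h1> text it finds (see extractCodes/findMin).
    const crawl = require('./crawler')

    // Start URL assumed from crawler.py's default of http://localhost:8080.
    crawl('http://localhost:8080')
      .then(smallest => console.log('smallest string:', smallest))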