
Working Update #1

Open
wants to merge 16 commits into master
Binary file modified __pycache__/getMatchIDs.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/getMatchIDs.cpython-37.pyc
Binary file not shown.
Binary file modified __pycache__/helper.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/helper.cpython-37.pyc
Binary file not shown.
Binary file added __pycache__/html.cpython-37.pyc
Binary file not shown.
Binary file added __pycache__/htmls.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/htmls.cpython-37.pyc
Binary file not shown.
Binary file modified __pycache__/scraper.cpython-36.pyc
Binary file not shown.
Binary file added __pycache__/scraper.cpython-37.pyc
Binary file not shown.
Binary file added __pycache__/start.cpython-37.pyc
Binary file not shown.
1,779 changes: 1 addition & 1,778 deletions csv/eventIDs.csv

Large diffs are not rendered by default.

1,783 changes: 1,783 additions & 0 deletions csv/eventIDs_1.csv

Large diffs are not rendered by default.

22,199 changes: 0 additions & 22,199 deletions csv/joinMatchEvent.csv

Large diffs are not rendered by default.

22,209 changes: 22,209 additions & 0 deletions csv/joinMatchEvent_1.csv

Large diffs are not rendered by default.

19,868 changes: 0 additions & 19,868 deletions csv/matchIDs.csv

Large diffs are not rendered by default.

19,879 changes: 19,879 additions & 0 deletions csv/matchIDs_1.csv

Large diffs are not rendered by default.

21,217 changes: 1 addition & 21,216 deletions csv/matchLineups.csv

Large diffs are not rendered by default.

21,225 changes: 21,225 additions & 0 deletions csv/matchLineups_1.csv

Large diffs are not rendered by default.

30,038 changes: 0 additions & 30,038 deletions csv/matchResults.csv

Large diffs are not rendered by default.

30,058 changes: 30,058 additions & 0 deletions csv/matchResults_1.csv

Large diffs are not rendered by default.

271,480 changes: 0 additions & 271,480 deletions csv/playerStats.csv

Large diffs are not rendered by default.

271,751 changes: 271,751 additions & 0 deletions csv/playerStats_1.csv

Large diffs are not rendered by default.

5,767 changes: 5,766 additions & 1 deletion csv/players.csv

Large diffs are not rendered by default.

2,722 changes: 2,720 additions & 2 deletions csv/teams.csv

Large diffs are not rendered by default.

3 changes: 1 addition & 2 deletions getMatchIDs.py
@@ -1,4 +1,4 @@
-from html import getHTML
+from htmls import getHTML
 import re
 
@@ -61,7 +61,6 @@ def endCheck(matchIDs, stop):
 def findMatchIDsAtURL(url):
     # Get the HTML using getHTML()
     html = getHTML(url)
-
     # Create an array of all of the Match URLs on the page
     matchIDs = re.findall('"(.*?000"><a href="/matches/.*?)"', html)
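The import change here tracks the module rename later in this PR (`html.py` → `htmls.py`): a local module named `html` shadows Python's standard-library `html` package, so the rename avoids that collision. A minimal sketch of how the renamed import and the match-ID regex fit together, assuming `htmls.py` is importable; `find_match_ids` is a hypothetical standalone wrapper, not a function in the repository:

```python
import re

from htmls import getHTML  # renamed module; "from html import getHTML" can resolve to the stdlib html package


def find_match_ids(url):
    # Hypothetical standalone version of findMatchIDsAtURL, for illustration only.
    page = getHTML(url)
    if page is None:
        return []
    # Same pattern the scraper uses to pull match links off an HLTV results page
    return re.findall('"(.*?000"><a href="/matches/.*?)"', page)


print(find_match_ids("https://www.hltv.org/results"))
```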
29 changes: 17 additions & 12 deletions helper.py
@@ -1,8 +1,8 @@
 from multiprocessing.dummy import Pool as ThreadPool
-from html import getHTML
+from htmls import getHTML
 import csv
 import sys
-
+import numpy
 
 def scrape(array, function, threads):
     # Define the number of threads
@@ -12,7 +12,10 @@ def scrape(array, function, threads):
     print("Scraping %s items using %s on %s threads." % (len(array), function, threads))
 
     # Calls get() and adds the filesize returned each call to an array called filesizes
-    result = pool.map(function, array)
+    result = list(map(function, array))
+    # print("start")
+    # print(list(result))
+    # print("end")
     pool.close()
     pool.join()
     return result
@@ -31,10 +34,10 @@ def addNewLine(file):
 def tabulate(csvFile, array):
     # Files must be in the csv directory inside the project folder
     # Opens the CSV file
-    with open("csv/%s.csv" % (csvFile), 'a', encoding='utf-8') as f:
+    with open("csv/%s.csv" % (csvFile), 'a', newline='', encoding='utf-8') as f:
         writer = csv.writer(f, delimiter=',')
         # Adds a new line if there is not one present
-        addNewLine("csv/%s.csv" % (csvFile))
+        # addNewLine("csv/%s.csv" % (csvFile))
         # Add the array passed in to the CSV file
         for i in range(0, len(array)):
             if len(array[i]) > 0:
@@ -85,14 +88,16 @@ def unDimension(array, item):
     return result
 
 
-def fixArray(array, value):
+def fixArray(array):
     # Used to clean match info results for matches with more than one map
-    for i in range(0, len(array)):
-        if len(array[i]) < value:
-            for b in range(0, len(array[i])):
-                array.append(array[i][b])
-            array.remove(array[i])
-    return array
+    newArray = []
+    for i in array:
+        if len(numpy.array(i).shape) == 2:
+            for temp in i:
+                newArray.append(temp)
+        else:
+            newArray.append(i)
+    return newArray
 
 
 def fixPlayerStats(array):
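The rewritten `fixArray` flattens any two-dimensional entries (matches that came back with one row per map) into single rows, using numpy's shape to distinguish nested rows from flat ones. A minimal sketch of its behavior, with hypothetical sample data:

```python
import numpy


def fixArray(array):
    # Flatten 2-D entries (multi-map matches) into individual rows,
    # leaving 1-D entries (single-map matches) untouched.
    newArray = []
    for i in array:
        if len(numpy.array(i).shape) == 2:
            for temp in i:
                newArray.append(temp)
        else:
            newArray.append(i)
    return newArray


mixed = [
    [["m1", "map_a", 16, 9], ["m1", "map_b", 16, 12]],  # two-map match (2-D)
    ["m2", "map_a", 16, 4],                              # single-map match (1-D)
]
print(fixArray(mixed))
# [['m1', 'map_a', 16, 9], ['m1', 'map_b', 16, 12], ['m2', 'map_a', 16, 4]]
```

Two side notes on the other hunks: `newline=''` in `tabulate` is the documented way to stop `csv.writer` from emitting blank rows on Windows, and the switch from `pool.map` to `list(map(...))` runs the scrape serially even though the thread pool is still created and joined.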
11 changes: 7 additions & 4 deletions html.py → htmls.py
@@ -1,21 +1,24 @@
 from urllib.request import Request, urlopen
 import urllib.request
 import re
+import http
 
 
 def getHTML(url):
     # Open the URL
     # Spoof the user agent
-    request = Request(url)
-    request.add_header('User-Agent', 'Mozilla/5.0')
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}
+    req = Request(url=url, headers=headers)
     # Read the response as HTML
     try:
-        urlopen(request).read()
-        html = urlopen(request).read().decode('ascii', 'ignore')
+        html = urlopen(req).read().decode('ascii', 'ignore')
         if len(re.findall('error-desc', html)) > 0:
             return None
         else:
             return html
     except urllib.error.HTTPError as err:
         print("%s for %s" % (err.code, url))
         return None
+    except:
+        print('END POINT ERROR')
+        return None
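This change sends a full browser-like User-Agent instead of the bare `'Mozilla/5.0'`, which some sites reject, drops a redundant second `urlopen` call, and adds a bare `except` for non-HTTP failures such as connection resets. A minimal sketch of the same header-spoofing pattern in isolation, assuming an HLTV results URL:

```python
from urllib.request import Request, urlopen

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}
req = Request(url="https://www.hltv.org/results", headers=headers)
# Decoding with errors ignored mirrors the scraper's tolerance for odd bytes
page = urlopen(req).read().decode('ascii', 'ignore')
print(page[:200])
```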
8 changes: 7 additions & 1 deletion readme.md
@@ -1,3 +1,5 @@
+## Working Update
+
 # HLTV Scraper
 
 This is a multi-threaded Python scraper designed to pull data from HLTV.org and tabulate it into a series of CSV files. It is written in pure Python, so it should run on any system that can run Python 3. It is not compatible with Python 2, so you may need to install the latest Python release from [here](https://www.python.org/downloads/).
@@ -46,4 +48,8 @@ Each match has player stats for each map. The script looks for these statistics
 
 ## Updating Players and Teams
 
-Each player and team on HLTV has a unique identification number that increases as new players are added to the database. To find new players and teams, we get the maximum identifier value from the respective `.csv` file and iterate over it using `getIterableItems`. From there the relevant pages are scraped and tabulated to `players.csv` and `teams.csv`.
+Each player and team on HLTV has a unique identification number that increases as new players are added to the database. To find new players and teams, we get the maximum identifier value from the respective `.csv` file and iterate over it using `getIterableItems`. From there the relevant pages are scraped and tabulated to `players.csv` and `teams.csv`.
+
+## Starting Over
+
+If you have already recorded a more recent event and want to scrape past it, clear `matchIDs.csv` to restart (do not remove the first row, which contains the ID and Title headers).
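For the "Updating Players and Teams" flow the readme describes, reading the maximum identifier back out of a CSV might look like the following sketch; `max_existing_id` is a hypothetical helper, not part of the repository:

```python
import csv


def max_existing_id(csv_path):
    # Hypothetical helper mirroring the readme: find the highest ID already
    # tabulated so the scraper can iterate over IDs above it.
    with open(csv_path, encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row the readme says to keep
        return max(int(row[0]) for row in reader if row)


# New players would then be the IDs above this maximum, e.g.
# for player_id in range(max_existing_id("csv/players.csv") + 1, upper_bound): ...
print(max_existing_id("csv/players.csv"))
```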
2 changes: 2 additions & 0 deletions requirements.txt
@@ -0,0 +1,2 @@
+bs4
+numpy