Initial Commit
Added the complete files of GMapsScraper.
Anonym0usWork1221 authored Aug 16, 2023
1 parent 209014d commit f358a95
Showing 14 changed files with 2,048 additions and 194 deletions.
388 changes: 194 additions & 194 deletions README.md

Large diffs are not rendered by default.

Binary file added chrome_driver_backup/chromedriver.exe
1 change: 1 addition & 0 deletions commandline.txt
@@ -0,0 +1 @@
python maps.py -q ./queries.txt -w 2 -l -1 -u "Not Available" -bw 15 -se contacts -se about -o ./CSV_FILES
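Read against the argument parser in maps.py below: -q points at the query file, -w runs two worker threads, -l -1 scrapes all results, -u sets the placeholder text for missing fields, -bw gives the browser 15 seconds to load, each -se appends a suggested URL extension to try (here contacts and about), and -o sets the CSV output folder.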
Binary file added extensions/finger_print_defender.crx
117 changes: 117 additions & 0 deletions maps.py
@@ -0,0 +1,117 @@
from webdriver_manager.chrome import ChromeDriverManager
from utils.threading_controller import FastSearchAlgo
from argparse import ArgumentParser
from os.path import isfile
import sys


class GMapsScraper:
def __init__(self):
self._args = None

def arg_parser(self):
parser = ArgumentParser(description='Command Line Google Map Scraper by Abdul Moez')

# Input options
parser.add_argument('-q', '--query-file', help='Path to query file (default: ./queries.txt)', type=str,
default="./queries.txt")
parser.add_argument('-w', '--threads', help='Number of threads to use (default: 1)', type=int, default=1)
parser.add_argument('-l', '--limit', help='Number of results to scrape (-1 for all results, default: -1)',
type=int, default=-1)
parser.add_argument('-u', '--unavailable-text',
help='Replacement text for unavailable information (default: "Not Available")', type=str,
default="Not Available")
parser.add_argument('-bw', '--browser-wait', help='Browser waiting time in seconds (default: 15)', type=int,
default=15)
parser.add_argument('-se', '--suggested-ext',
help='Suggested URL extensions to try (can be specified multiple times)', action='append',
default=[])
parser.add_argument('-wb', '--windowed-browser', help='Disable headless mode', action='store_false',
default=True)
parser.add_argument('-v', '--verbose', help='Enable verbose mode', action='store_true')
parser.add_argument('-o', '--output-folder', help='Output folder to store CSV details (default: ./CSV_FILES)',
type=str, default='./CSV_FILES')
parser.add_argument('-d', '--driver-path',
help='Path to Chrome driver (if not provided, it will be downloaded)', type=str,
default='')

# Custom commands for additional help
parser.add_argument('--help-query-file', action='store_true', help='Get help for specifying the query file')
parser.add_argument('--help-limit', action='store_true', help='Get help for specifying the result limit')
parser.add_argument('--help-driver-path', action='store_true', help='Get help for specifying the driver path')

self._args = parser.parse_args()

@staticmethod
def print_query_file_help():
print("The query file should contain a list of search queries, each query on a separate line.")
print("For example:")
print("Pizza restaurants")
print("Coffee shops")
print("...")
sys.exit(0)

@staticmethod
def print_limit_help():
print("Use this option to specify the maximum number of results to scrape.")
print("Use '-1' to scrape all results.")
sys.exit(0)

@staticmethod
def print_driver_path_help():
print("If you have a specific Chrome driver path, you can provide it using this option.")
print("If not provided, the script will attempt to download the driver automatically.")
print("You can download a compatible driver from https://chromedriver.chromium.org/downloads.")
sys.exit(0)

def check_args(self):
q = self._args.query_file
if not isfile(q):
print(f"[-] File not found at path: {q}")
sys.exit(1)

    def scrape_maps_data(self):
        # Handle the extended help flags before validating the query file,
        # so help is reachable even when no queries file exists yet.
        if self._args.help_query_file:
            self.print_query_file_help()

        if self._args.help_limit:
            self.print_limit_help()

        if self._args.help_driver_path:
            self.print_driver_path_help()

        self.check_args()

queries_list = FastSearchAlgo.load_query_file(file_name=self._args.query_file)
threads_limit = min(self._args.threads, len(queries_list))
limit_results = None if self._args.limit == -1 else self._args.limit

driver_path = self._args.driver_path
if not self._args.driver_path:
try:
driver_path = ChromeDriverManager().install()
            except ValueError:
                print("[-] Unable to download a ChromeDriver build compatible with your browser.")
                print("[INFO] Visit https://chromedriver.chromium.org/downloads, download the driver"
                      " matching your Chrome version, and pass its path with the -d argument.")
                sys.exit(1)

algo_obj = FastSearchAlgo(
unavailable_text=self._args.unavailable_text,
headless=self._args.windowed_browser,
wait_time=self._args.browser_wait,
suggested_ext=self._args.suggested_ext,
output_path=self._args.output_folder,
workers=threads_limit,
result_range=limit_results,
verbose=self._args.verbose,
driver_path=driver_path
)

algo_obj.fast_search_algorithm(queries_list)


if __name__ == '__main__':
App = GMapsScraper()
App.arg_parser()
App.scrape_maps_data()
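
A minimal sketch of driving the scraper programmatically instead of from a shell. It only re-uses what maps.py defines above; feeding the arguments through sys.argv (rather than a real command line) is the one assumption here.

import sys
from maps import GMapsScraper

# Simulate a CLI invocation; arg_parser() reads sys.argv via parse_args().
sys.argv = [
    "maps.py",
    "-q", "./queries.txt",   # one search query per line
    "-w", "2",               # two worker threads
    "-l", "10",              # stop after 10 results per query
    "-o", "./CSV_FILES",     # where the CSV output lands
]

app = GMapsScraper()
app.arg_parser()
app.scrape_maps_data()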
2 changes: 2 additions & 0 deletions queries.txt
@@ -0,0 +1,2 @@
best coffee shops for working in berlin de
best restaurants in berlin
34 changes: 34 additions & 0 deletions requirements.txt
@@ -0,0 +1,34 @@
async-generator==1.10
attrs==22.2.0
beautifulsoup4==4.12.0
certifi==2022.12.7
cffi==1.15.1
charset-normalizer==3.1.0
colorama==0.4.6
exceptiongroup==1.1.1
h11==0.14.0
idna==3.4
lxml==4.9.2
numpy==1.24.2
outcome==1.2.0
packaging==23.0
pandas==1.5.3
psutil==5.9.4
pycparser==2.21
PySocks==1.7.1
python-dateutil==2.8.2
python-dotenv==1.0.0
pytz==2023.3
requests==2.28.2
selenium==4.8.3
selenium-stealth==1.0.6
six==1.16.0
sniffio==1.3.0
sortedcontainers==2.4.0
soupsieve==2.4
tqdm==4.65.0
trio==0.22.0
trio-websocket==0.10.2
urllib3==1.26.15
webdriver-manager==3.8.5
wsproto==1.2.0
40 changes: 40 additions & 0 deletions utils/dict_cleaner_and_writer.py
@@ -0,0 +1,40 @@
from collections import OrderedDict


class DictCleaner:
def __init__(self, unavailable_data: str = "Not Available"):
self._unavailable_data = unavailable_data

    @staticmethod
    def _unique_repeating_sets(output_data_dict_list: list[dict]) -> tuple[set, set]:
        # Collect every key seen across the records, and separately the keys
        # that occur in more than one record.
        unique_keys = set()
        repeating_keys = set()

        for data_dict in output_data_dict_list:
            unique_keys.update(data_dict.keys())
            for key in data_dict.keys():
                if sum(1 for x in output_data_dict_list if key in x) > 1:
                    repeating_keys.add(key)

        return unique_keys, repeating_keys

    def _dict_cleaner(self, output_data_dict_list: list[dict], unique_keys: set, repeating_keys: set) -> list[dict]:
        # Normalise every record to the full key set: fill missing keys with
        # the placeholder text, and prefix values of keys that repeat across
        # records with the key name to keep them distinguishable.
        final_data = []
        for data_dict in output_data_dict_list:
            ordered_dict = OrderedDict()
            for key in unique_keys:
                if key not in data_dict:
                    ordered_dict[key] = self._unavailable_data
                elif key in repeating_keys:
                    ordered_dict[key] = f"{key}_{data_dict[key]}"
                else:
                    ordered_dict[key] = data_dict[key]
            final_data.append(dict(ordered_dict))
        return final_data

def start_cleaning_dict_data(self, dict_list: list[dict]) -> list[dict]:
unique_keys, repeating_keys = self._unique_repeating_sets(dict_list)
cleaned_data_list = self._dict_cleaner(dict_list, unique_keys, repeating_keys)
return cleaned_data_list
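
A minimal sketch of what DictCleaner produces for a small record list; the two sample records are hypothetical, everything else comes from the class above.

from utils.dict_cleaner_and_writer import DictCleaner

cleaner = DictCleaner(unavailable_data="Not Available")
records = [
    {"name": "Cafe One", "phone": "030-111"},
    {"name": "Cafe Two"},  # "phone" is missing here
]

# "name" occurs in both records, so its values come back prefixed with the
# key ("name_Cafe One"); the missing "phone" is filled with the placeholder.
print(cleaner.start_cleaning_dict_data(records))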
