bot.py
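"""Scrape Gmail addresses from Google search results with Selenium and BeautifulSoup.

For every (place, crypto-term) pair, the script searches Google for
'<place> <term> "@gmail.com"', walks through the result pages, and appends
every Gmail address it finds to a CSV file.
"""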
import random
import time
import re

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
# List of important financial, technological, or cryptocurrency centres
places = ['New York', 'San Francisco', 'London', 'Singapore', 'Toronto', 'Hong Kong', 'Zurich', 'Berlin', 'Tokyo', 'Seoul', 'Shanghai', 'Dubai', 'Bangalore', 'Tel Aviv', 'Amsterdam', 'Sydney', 'Sao Paulo', 'Moscow', 'Paris', 'Los Angeles', 'Chicago']

# List of key terms in the cryptocurrency and blockchain industry
hashtags = ['Bitcoin', 'Ethereum', 'Blockchain', 'Cryptocurrency', 'Crypto', 'BTC', 'ETH', 'DeFi', 'CryptoTrading', 'HODL', 'Altcoin', 'CryptoNews', 'Cryptowallet', 'BitcoinMining', 'BitcoinPrice', 'EthereumPrice', 'SmartContract', 'NFT', 'Stablecoin', 'DApp', 'ICO', 'TokenSale', 'Binance', 'Coinbase', 'Ripple', 'XRP', 'Litecoin', 'Dogecoin']

# Loop over places
for place in places:
    # Initialize the WebDriver (Selenium 4 style: the GeckoDriver path goes in a Service object)
    driver = webdriver.Firefox(service=Service(r''))  # your GeckoDriver path
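    # Optional sketch, if you want to run without a visible browser window: the
    # imported Options class supports headless Firefox, e.g.
    #   opts = Options()
    #   opts.add_argument('-headless')
    #   driver = webdriver.Firefox(service=Service(r''), options=opts)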
    # Navigate to Google
    driver.get("https://www.google.com")
    # Loop over hashtags
    for tag in hashtags:
        # List to store the results
        results = []
        # Find the search box on the page
        search = driver.find_element(by=By.CSS_SELECTOR, value="textarea.gLFyf")
        # Build the query: place, term, and a quoted "@gmail.com"
        query = place + ' ' + tag + ' "@gmail.com" '
        # Clear the search box and enter the query
        search.clear()
        search.send_keys(query)
        # Submit the search
        search.send_keys(Keys.RETURN)
        # Parse the page with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Check for a reCAPTCHA challenge
        div_element = soup.find("div", id="recaptcha")
        if div_element:
            # If a reCAPTCHA appears, wait 30 seconds (time to solve it manually)
            time.sleep(30)
        # Wait for the results page to load
        time.sleep(random.randint(3, 5))
        # Regular expression that matches Gmail addresses
        regex = r'[a-zA-Z0-9_.+-]+@gmail\.com'
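        # For example: re.findall(regex, 'write to jane.doe+biz@gmail.com')
        # returns ['jane.doe+biz@gmail.com']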
        try:
            # Loop over the first 100 pages of results
            for _ in range(100):
                # Parse the page with BeautifulSoup
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                # Check for a reCAPTCHA challenge
                div_element = soup.find("div", id="recaptcha")
                if div_element:
                    # If a reCAPTCHA appears, wait 30 seconds
                    time.sleep(30)
                # Find all span elements
                spans = soup.find_all('span')
                # For each span, collect the text that matches the regex
                for span in spans:
                    matches = re.findall(regex, span.text)
                    results.extend(matches)
                # Append this page's matches to the CSV file
                df = pd.DataFrame(results)
                df.to_csv('results-IG-01', index=False, mode="a", header=False)
                # Reset the buffer so already-written matches are not appended again
                results = []
                # Go to the next page ('Siguiente' is the next-page button in Google's Spanish UI)
                try:
                    next_page = driver.find_element(by=By.XPATH, value="//span[text()='Siguiente']")
                    next_page.click()
                except NoSuchElementException:
                    print("No more pages")
                    break
                # Wait for the next page to load
                time.sleep(random.randint(1, 3))
        except NoSuchElementException:
            # No results found for this search; continue with the next term
            print(f"No results found for the search: {query}")
            continue
    # Close the driver before moving on to the next place
    driver.quit()
    # Random delay before the next iteration
    time.sleep(random.randint(5, 8))
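
# Usage sketch (assumptions: Firefox plus a matching geckodriver are installed,
# and the Service(r'') path above is filled in with the geckodriver location):
#   pip install selenium beautifulsoup4 pandas
#   python bot.py
# Each run appends the scraped Gmail addresses to 'results-IG-01', a headerless
# single-column CSV with one address per row.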