diff --git a/build/lib/scrape_it/__init__.py b/build/lib/scrape_it/__init__.py
old mode 100755
new mode 100644
index 60fcb24..ff33d6b
--- a/build/lib/scrape_it/__init__.py
+++ b/build/lib/scrape_it/__init__.py
@@ -51,7 +51,7 @@
from .scrape_it import Scrape_it
-__version__ = '0.3.6'
+__version__ = '0.3.8'
if __name__ == '__main__':
import doctest
diff --git a/build/lib/scrape_it/regex.py b/build/lib/scrape_it/regex.py
old mode 100755
new mode 100644
index 88cf597..a87aa5c
--- a/build/lib/scrape_it/regex.py
+++ b/build/lib/scrape_it/regex.py
@@ -8,6 +8,135 @@
r'[(-,. ]*[0-9]{2}[)-,. ]+[0-9]{4}[()-,. ]?[0-9]{4}']
-email_keywords = ['help', 'faq', 'info', 'support']
+email_keywords = ['help', 'faq', 'info', 'support', 'hello']
-js_keywords = ['ngMeta']
\ No newline at end of file
+js_keywords = ['ngMeta']
+
+categories = {'Auto Locksmith': r'((auto)|(automotive)|(automobile)|(car)).locksmith',
+ 'Auto Repair Services': r'((auto)|(automotive)|(automobile)|(car)).((repair)|(mechanic)|(technic))',
+ 'Auto Upholstery Repair': r'((auto)|(automotive)|(automobile)|(car)).upholstery',
+ 'Bicycle Repair': r'((bicycle)|(bike)).((repair)|(mechanic)|(technic))',
+ 'Car Wash': r'((auto)|(automotive)|(automobile)|(car)).((wash)|(detailing))',
+ 'Driving School': r'driv.*((school)|(course))',
+ 'Hoverboard & Segway Repair': r'((hoverboard)|(segway)).((repair)|(mechanic)|(technic))',
+ 'Lawn Equipment Repair': r'((lawn)|(snow)|(leaf)*|(leav)*|(grass)).((equipment)|(mower)|(blower)).((repair)|(mechanic)|(technic))?',
+ 'Motorcycle Repair': r'((motorcycle)|(bike)).((repair)|(mechanic)|(technic))',
+ 'Towing Services': r'((car)|(truck)|(trailer)|(motorcycle)|(bike))?.((tow)|(towing)|(road assistance))(.service)?',
+ 'Air Conditioner Installation': r'air.condit*install*',
+ 'Air Duct Cleaning': r'air.duct*clean*',
+ 'Bathtub Installation & Replacement': r'bath*install*',
+ 'Carpenter': r'carpent*',
+ 'Carpet Installation': r'carpet.install*',
+ 'Deck Building & Repair': r'deck.((build)*|(repair)|(service))',
+ 'Door Services': r'door.service*',
+ 'Drain Cleaning': r'drain.clean*',
+ 'Drywall Installation': r'drywall.install*',
+ 'Electrician': 'electric*(service)?',
+ 'Exterior Door Installation': r'exterior.door.install*',
+ 'Flooring Contractors': r'flooring(.service)?',
+ 'Furniture Assembly & Repair': r'furniture.((assembl)*|(repair))',
+ 'Garage Door Installation': r'garage.door.install*',
+ 'Garage Door Repair': r'garage.door.((repair)|(service))',
+ 'General Contractors': r'general.contract*',
+ 'Handyman': r'handym(a|e)n',
+ 'Hardwood Flooring': r'hardwood.floor*',
+ 'Heating Installation': r'heat*install*',
+ 'HVAC Repair': r'hvac.((repair)|(technic))',
+ 'HVAC Services': r'hvac.service*',
+ 'Laminate Flooring': r'laminate.floor*',
+ 'Painting Contractors': r'paint*contract*',
+ 'Piano Tuning': r'piano.tun((er)|(ing))',
+ 'Plumber': r'plumb((er)|(ing))',
+ 'Remodeling Services': r'remodel*service*',
+ 'Spray Foam Insulation': r'((spray).)?foam.insul*',
+ 'Tile & Ceramic Flooring': r'tile.((ceramic).)?flooring',
+ 'Tile Contractors': r'tile.contract*',
+ 'TV Installation': r'((tv)|(television)).install*',
+ 'Vinyl & Linoleum Flooring': r'((vinyl)|(linoleum)).floor*',
+ 'Water Heater Installation': r'water.heater.install*',
+ 'Water Heater Repair': r'water.heater.((repair)|(service)|(technic))',
+ 'Welding Services': 'welding-services',
+ 'Window Installation': 'window-installation',
+ 'Window Repair': 'window-repair',
+ 'Bartenders': 'bartenders',
+ 'Boudoir Photographers': 'boudoir-photographers',
+ 'Bounce House Rentals': 'bounce-houses',
+ 'Catering': 'catering',
+ 'Commercial Photographers': 'commercial-photographers',
+ 'DJ & MC': 'dj-mc',
+ 'Event Photographers': 'event-photographers',
+ 'Event Planning': 'event-planner',
+ 'Face Painting': 'face-painting',
+ 'Family Photographers': 'family-photographers',
+ 'Flower Delivery': 'flower-delivery',
+ 'Limo Services': 'limo-services',
+ 'Party Entertainment': 'party-entertainment',
+ 'Party Equipment Rental': 'party-equipment-rental',
+ 'Personal Chef': 'personal-chef',
+ 'Photographers': 'photographers',
+ 'Portrait Photographers': 'portrait-photographers', 'Promotional Video Services': 'promotional-video-service',
+ 'Table & Chair Rentals': 'table-chair-rentals', 'Videographers': 'videographers', 'Wedding DJ': 'wedding-dj',
+ 'Wedding Officiants': 'wedding-officiants', 'Wedding Photography': 'wedding-photography',
+ 'Wedding Planner': 'wedding-planners', 'Wedding Videography': 'wedding-videography', 'Balayage': 'balayage-hair',
+ 'Barbers': 'barbershop', 'Box Braids': 'box-braids', 'Crochet Braids': 'corchet-braids',
+ 'Eyebrow Tinting': 'eyebrow-tinting', 'Eyelash Extension': 'eyelash-extension', 'Fashion Design': 'fashion-design',
+ 'Hair Extensions': 'hair-extensions', 'Hair Stylist': 'hair-stylist',
+ 'Henna Tattoos Artist': 'henna-tattoos-artist', 'Image Consultant': 'image-consultant',
+ 'MakeUp Artist': 'makeup-artist', 'Nail Services': 'nail-services', 'Nutritionist': 'nutritionist',
+ 'Permanent MakeUp': 'permanent-makeup', 'Personal Stylist': 'personal-stylist',
+ 'Personal Trainers': 'personal-trainers', 'Sew in': 'sew-in', 'Skin Care': 'skin-care', 'Tailors': 'tailors',
+ 'Tattoo Artist': 'tattoo-artist', 'Wedding Hair & Makeup Artist': 'wedding-makeup-artist',
+ 'Animal Control': 'animal-control', 'Ant Control': 'ant-exterminators',
+ 'Appliance Repair & Installation': 'appliance-repair-installation', 'Bathroom Design': 'bathroom-designers',
+ 'Bed Bug Control': 'bed-bug-control', 'Cell Phone Repair': 'cell-phone-repair',
+ 'Closet Organization': 'closet-organizers', 'Computer Repair': 'computer-repair',
+ 'Computer Services': 'computer-services', 'Decorating': 'home-decorators',
+ 'Dry-cleaning, Laundry & Alteration': 'dry-cleaning-laundry-alteration',
+ 'Grocery Shopping & Delivery': 'grocery-shopping-delivery', 'Interior Designer': 'interior-designer',
+ 'Kitchen Design & Planning': 'kitchen-designers', 'Landscape Designers': 'landscape-designers',
+ 'Lighting Design Services': 'lighting-designers', 'Locksmith': 'locksmith', 'Moving': 'moving',
+ 'Pest Control': 'pest-control-services', 'Piano Movers': 'piano-movers',
+ 'Pool Table Movers': 'pool-table-movers', 'Rat Control': 'rat-control',
+ 'Security Installation': 'security-installation', 'Self Storage': 'self-storage',
+ 'Termite Control': 'termite-control', 'Virus Removal': 'virus-removal',
+ 'Wasp & Bee Removal': 'wasp-bee-removal', 'Apartment Cleaning': 'apartment-cleaning',
+ 'Appliance Cleaning': 'appliance-cleaning', 'Carpet Cleaning': 'carpet-cleaning',
+ 'Commercial Cleaning': 'commercial-cleaning', 'House Cleaning': 'house-cleaning',
+ 'Housekeeping': 'housekeeping', 'Janitorial Services': 'janitorial-services', 'Maids': 'maids',
+ 'Mattress Cleaning': 'mattress-cleaning', 'Move Out Cleaning': 'move-out-cleaning',
+ 'Office Cleaning': 'office-cleaning', 'Upholstery Cleaning': 'upholstery-cleaning',
+ 'Window Cleaning': 'window-cleaning', 'Chimney Services': 'chimney-services',
+ 'Concrete Contractors': 'concrete-contractors', 'Demolition Services': 'demolition-services',
+ 'Fence Contractors': 'fence-contractors', 'Fence Repair': 'fence-repair', 'Firewood': 'firewood',
+ 'Garbage Removal': 'garbage-removal', 'Gardening': 'gardening', 'Gutter Cleaning': 'gutter-cleaning',
+ 'Gutter Installation & Repair': 'gutter-installation-and-repair', 'Hardscape Contractors': 'hardscapers',
+ 'Landscaping': 'landscaping', 'Lawn Care': 'lawn-care', 'Masonry Contractors': 'masonry-contractors',
+ 'Pool Buildings': 'pool-buildings', 'Pool Cleaners': 'pool-cleaners', 'Pool Maintenance': 'pool-maintenance',
+ 'Pressure Washing': 'pressure-washing', 'Roof Cleaning': 'roof-cleaning', 'Roofing Contractors': 'roofing',
+ 'Roofing Installation & Repair': 'roofing-installation-and-repair', 'Snow & Ice Removal': 'snow-ice-removal',
+ 'Sprinkler Repairs': 'sprinkler-repairs', 'Tree Services': 'tree-services', 'Yard Clean-Up': 'yard-clean-up',
+ 'Aquarium Services': 'aquarium-services', 'Dog Training': 'dog-training', 'Horse Boarding': 'horse-boarding',
+ 'Horse Training': 'horse-training', 'Pet Daycare & Boarding': 'pet-daycare-boarding',
+ 'Pet Groomers': 'pet-groomer', 'Pet Sitters & Walkers': 'pet-sitting-and-walking',
+ 'Veterinary Services': 'veterinary', 'Academic Writing': 'academic-writing',
+ 'Bankruptcy Lawyers': 'bankruptcy-lawyers', 'Business Lawyers': 'business-lawyers',
+ 'Civil Rights Lawyers': 'civil-rights-lawyers', 'Copywriting': 'copywriting',
+ 'Criminal Defense Attorneys': 'criminal-defense-attorneys', 'Divorce Lawyers': 'divorce-lawyers',
+ 'Essays Writing & Editing': 'essays-writing-and-editing', 'Family Lawyers': 'family-lawyers',
+ 'Immigration Lawyers': 'immigration-lawyers', 'Lawyers': 'lawyer', 'Notary Services': 'notary-services',
+ 'Personal Driver': 'personal-driver', 'Personal Injury Lawyers': 'personal-injury-lawyers',
+ 'Private Detective': 'private-detective', 'Resume Writing': 'resume-writing',
+ 'Tax Preparation': 'tax-preparation', 'Translation Services': 'translator',
+ 'Writing & Editing': 'writing-editing-services', 'Babysitting': 'babysitters',
+ 'Beauty Schools': 'beauty-schools', 'Caregiver': 'caregiver', 'Chinese Lessons': 'chinese-lessons',
+ 'Cooking Lessons': 'cooking-lessons', 'Dancing Lessons': 'dancing-lessons', 'Daycare': 'daycare',
+ 'Drawing Classes': 'drawing-lessons', 'English Lessons': 'english-lessons',
+ 'French Lessons': 'french-lessons', 'Horseback Riding': 'horseback-riding',
+ 'Italian Lessons': 'italian-lessons', 'Japanese Lessons': 'japanese-lessons', 'Language Classes': 'classes',
+ 'MakeUp Lessons': 'makeup-lessons', 'Martial Arts': 'martial-arts', 'Music Lessons': 'music-lessons',
+ 'Other Classes': 'other-classes', 'Photography Classes': 'photography-classes',
+ 'Piano Lessons': 'piano-lessons', 'Portuguese Lessons': 'portuguese-lessons',
+ 'Private Tutoring': 'private-tutor', 'Russian Lessons': 'russian-lessons',
+ 'Singing Lessons': 'singing-lessons', 'Spanish Lessons': 'spanish-lessons',
+ 'Sport Lessons': 'sport-lessons', 'Surfing lessons': 'surfing-lessons',
+ 'Swim lessons': 'swim-lessons', 'Tennis lessons': 'tenis-lessons'}
\ No newline at end of file
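
For context, the new categories mapping pairs a human-readable category name with either a regular expression (matched against page text by the set_category method added to scrape_it.py later in this diff) or a plain URL slug. Below is a minimal sketch of how the regex entries might drive classification, assuming the package is importable as scrape_it.regex; the classify_page helper and its threshold parameter are illustrative names, not part of the library:

import re
from scrape_it.regex import categories

def classify_page(page_text, threshold=2):
    # score each category by how many lines of text its pattern matches,
    # mirroring the scoring used in Scrape_it.set_category
    for category, pattern in categories.items():
        score = sum(1 for line in page_text.split('\n')
                    if re.search(pattern, line.lower()))
        if score > threshold:
            return category
    return None

# e.g. classify_page(soup.get_text()) after parsing a page with BeautifulSoup
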
diff --git a/build/lib/scrape_it/scrape_it.py b/build/lib/scrape_it/scrape_it.py
old mode 100755
new mode 100644
index f161ee7..ebc192d
--- a/build/lib/scrape_it/scrape_it.py
+++ b/build/lib/scrape_it/scrape_it.py
@@ -16,23 +16,23 @@
Note on additional .txt files in regex directory
name_stop_words.txt - Contains words to filter unneeded words
- during the search of entity's name
+ during the search of entity's name
email_keywords.txt - Specific file to filter emails based on
- keywords it might contain (this is
- as needed for current task)
+ keywords it might contain (this is
+ as needed for current task)
regex.txt - some regular expressions to search phone numbers;
- at the current time is not in use, phonenumbers
- package is used instead; one of improvements
- should be a workflow which would allow efficient
- and accurate phone matching with good filter
- pipeline from 'scraping trash'
+ at the current time is not in use, phonenumbers
+ package is used instead; one of improvements
+ should be a workflow which would allow efficient
+ and accurate phone matching with good filter
+ pipeline from 'scraping trash'
address.txt - some regular expressions to match addresses,
- not perfect expesially given the diversity
- of different address formats accross
- different countries
+                  not perfect especially given the diversity
+                  of different address formats across
+ different countries
"""
@@ -48,7 +48,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-#from models import Business
from .correct_base import process_phones, define_country
from .regex import *
from bs4 import BeautifulSoup
@@ -64,647 +63,725 @@
"""
with open('regex/name_stop_words.txt', 'r') as file:
- name_stop_words = [r.strip() for r in file.readlines()]
+ name_stop_words = [r.strip() for r in file.readlines()]
with open('regex/phones_regex.txt', 'r') as file:
- phone_regex = [r.strip() for r in file.readlines()]
+ phone_regex = [r.strip() for r in file.readlines()]
with open('regex/email_keywords.txt', 'r') as file:
- email_keywords = [w.strip() for w in file.readlines()]
+ email_keywords = [w.strip() for w in file.readlines()]
with open('regex/js_keywords.txt', 'r') as file:
- js_keywords = [w.strip() for w in file.readlines()]
+ js_keywords = [w.strip() for w in file.readlines()]
"""
internal_links = {'contact_link': r'contact.*',
- 'privacy_link': r'privacy.*policy',
- 'shipping_link': r'(deliver|shiping).*(policy)*',
- 'terms_link': r'term.*(condition|use|service)',
- 'faq_link': r'(faq)|(frequently.*asked.*question)',
- 'return_link': r'return.*',
- 'warranty_link': r'(warrant)|(guarant)'}
+ 'privacy_link': r'privacy.*policy',
+ 'shipping_link': r'(deliver|shiping).*(policy)*',
+ 'terms_link': r'term.*(condition|use|service)',
+ 'faq_link': r'(faq)|(frequently.*asked.*question)',
+ 'return_link': r'return.*',
+ 'warranty_link': r'(warrant)|(guarant)'}
external_links = {'twitter': 'twitter.com',
- 'facebook': 'facebook.com',
- 'instagram': 'instagram.com',
- 'pinterest':'pinterest.com',
- 'youtube': 'youtube.com',
- 'linkedin': 'linkedin.com'}
+ 'facebook': 'facebook.com',
+ 'instagram': 'instagram.com',
+ 'pinterest': 'pinterest.com',
+ 'youtube': 'youtube.com',
+ 'linkedin': 'linkedin.com'}
-class Scrape_it:
-
- def __init__(self, url, method='requests', country='us',
- company_name=None, category=None, geo_key=None,
- verbose=0, driver=None):
-
- self.url = url
- self.method = method
- self.model = {'url': self.url, 'country': country,
- 'category': category, 'company_name': company_name}
- self.soup = None
- self.geo_key = geo_key
- self.verbose = verbose
- self.driver = driver
-
-
- def init_model(self):
- """
- Current task of mine is reflected in the model; it is planned
- by me to export models to seperate file and use different onces
- as need or to simplify the process of defining the model and
- needed methods to execute
- """
-
- self.model['url'] = self.url
- self.model['company_name'] = self.model['company_name']
- self.model['country'] = self.model['country']
- self.model['category'] = self.model['category']
- self.model['contact_link'] = None
- self.model['phones'] = None
- self.model['phone_1'] = None
- self.model['phone_2'] = None
- self.model['phone_3'] = None
- self.model['phone_4'] = None
- self.model['phone_5'] = None
- self.model['phone_6'] = None
- self.model['address'] = None
- self.model['state'] = None
- self.model['county'] = None
- self.model['city'] = None
- self.model['street'] = None
- self.model['housenumber'] = None
- self.model['postalcode'] = None
- self.model['district'] = None
- self.model['email'] = None
- self.model['facebook'] = None
- self.model['instagram'] = None
- self.model['linkedin'] = None
- self.model['pinterest'] = None
- self.model['twitter'] = None
- self.model['youtube'] = None
- self.model['faq_link'] = None
- self.model['privacy_link'] = None
- self.model['return_link'] = None
- self.model['shipping_link'] = None
- self.model['terms_link'] = None
- self.model['warranty_link'] = None
- self.model['faq_text'] = None
- self.model['privacy_text'] = None
- self.model['return_text'] = None
- self.model['shipping_text'] = None
- self.model['terms_text'] = None
- self.model['warranty_text'] = None
-
- def logging(self):
- """
- Log some text while scraping if verbose is set to 1
- """
-
- if self.verbose == 1:
- print('Scraping', self.url, '...')
-
-
- def define_domain(self):
- """
- Define domain name of the link
- """
-
- def get_domain(url):
- domain = tldextract.extract(str(url)).domain+'.'+tldextract.extract(str(url)).suffix
- return domain
-
- self.model['url'] = get_domain(self.url)
-
-
- def get_soup(self, url):
- """
- Gets soup object depending on the method
- """
-
- if self.method == 'requests':
- import requests
- try:
- r = requests.get(url)
- soup = BeautifulSoup(r.text, 'lxml')
- except Exception as e:
- print(e, url)
-
- if self.method == 'webdriver':
- from selenium import webdriver
- options = webdriver.ChromeOptions()
- options.add_argument('headless')
- self.driver = webdriver.Chrome(executable_path='./chromedriver', options=options)
- try:
- self.driver.get(url)
- except Exception as e:
- print(e, url)
- soup = BeautifulSoup(self.driver.page_source, 'lxml')
- try:
- assert soup != None
- return soup
- except Exception:
- return
-
-
- def clean_name(self):
- """
- Since company name is scraped from code too it can
- be messy and needs to be cleaned from short descriptions
- """
-
- delims = ['|', '-', ':']
-
- if self.model['company_name']:
-
- for d in delims:
- if d in self.model['company_name']:
- for i, s in enumerate(self.model['company_name'].split(d)):
- if len(set(name_stop_words).intersection(s.split(' '))) > 0:
- break
- self.model['company_name'] = self.model['company_name'].split(d)[i]
- break
-
-
- return self.model['company_name']
-
-
- def get_name(self):
- """
- Get company name from the most likely places in html it could be found
- """
- for script in self.soup(["script", "style"]):
- script.extract()
-
- metas_og = ['og:site_name', 'og:title']
- metas = ['title', 'name']
- for meta in metas_og:
- if self.model['company_name'] == None or self.model['company_name'] == '':
- try:
- self.model['company_name'] = self.soup.find('meta', attrs={'property': meta}).get('content')
- except AttributeError:
- pass
-
- for meta in metas:
- if self.model['company_name'] == None or self.model['company_name'] == '':
- try:
- self.model['company_name'] = self.soup.find('meta', attrs={'name': meta}).get('content')
- except AttributeError:
- if self.soup.find('title'):
- if len(self.soup.find('title')) > 0:
- self.model['company_name'] = self.soup.find('title').text
- if self.model['company_name'] != None:
- if 'forbidden' in self.model['company_name'].lower() or\
- 'ngMeta' in self.model['company_name']:
- self.model['company_name'] = None
- if self.model['company_name']:
- self.model['company_name'] = self.clean_name().strip()
-
-
- def find_phones(self):
-
- def get_from_href(soup):
- """
- If phonenumbers package could not find any phone numbers
- there could be some embedded in links as in
-
- """
- phones = set()
- for script in soup(["script", "style"]):
- script.extract()
-
- try:
-
- for line in soup.find_all('a'):
- if line.get('href').startswith('tel:'):
- phones.add(line.get('href')[3:])
-
- return phones
-
- except AttributeError:
- return None
-
-
-
-
- def match_phones(soup):
- """
- Find phones using phonenumbers package, location is provided from
- model's country value
- """
-
- phones = set()
- for script in soup(["script", "style"]):
- script.extract()
- for line in soup.get_text().split('\n'):
- for match in phonenumbers.PhoneNumberMatcher(line, str(self.model["country"]).upper()):
- phones.add(phonenumbers.format_number(match.number, phonenumbers.PhoneNumberFormat.E164))
-
-
- return phones
-
-
- self.model['phones'] = get_from_href(self.soup)
- if self.model['phones']:
- self.model['phones'] = self.model['phones'].union(match_phones(self.soup))
- else:
- self.model['phones'] = match_phones(self.soup)
-
- #if len(self.model['phones']) == 0:
- #self.model['phones'] = get_from_href(self.soup)
-
-
- def find_address(self):
-
- def find_regex(soup):
- """
- Find address with regular expression(s) specified in regex/address.txt
- """
-
- #with open('regex/address.txt') as f:
- #address_regex = f.read()
- try:
- address_regex = r'^([0-9\-/]+) ?([A-Za-z](?= ))? (.*?) ([^ ]+?) ?((?<= )[A-Za-z])? ?((?<= )\d*)?$\
- ^([A-Za-z]+ ?)+[0-9]{3,6}$\
- ^([A-Za-z]+ ?)$'
- except Exception:
- pass
- for script in soup(["script", "style"]):
- script.extract()
- text = soup.get_text()
- address = re.search(address_regex, text)
- if address:
- address = address.group(0)
- else:
- address = None
-
- return address
-
-
-
- def find_base(soup, country='us'):
- """
- Find addresses using pyap package
- """
-
- for script in soup(["script", "style"]):
- script.extract()
- text = soup.get_text()
- address = ''
-
- adr = pyap.parse(text, country='us')
- if len(adr) > 0:
- for item in adr:
- address = address+' '+str(item)
-
- return address
-
-
-
- if self.model['address'] == None:
- self.model['address'] = find_regex(self.soup)
- if self.model['address'] == None:
- self.model['address'] = find_base(self.soup, self.model['country'])
-
- if len(self.model['address']) > 0:
- if define_country(self.model['country']) != None:
- self.model['country'] = define_country(self.model['country'])
-
-
- def find_email(self):
-
- def get_all_emails(soup):
- """
- Get set of emails using regular expression
- """
-
- emails = set()
-
- email_pattern = r'[A-Za-z0-9]*@{1}[A-Za-z0-9]*\.(com|org|de|edu|gov|uk|au|net|me){1}'
- for script in soup(["script", "style"]):
- script.extract()
-
- for each in soup.get_text().split('\n'):
- email_re = re.search(email_pattern, each)
- if email_re:
- if len(email_re.group(0)) > 5 and len(email_re.group(0)) < 75:
- emails.add(email_re.group(0))
-
- return emails
-
- def keep_with_keywords(emails, keywords):
- """
- Filter emails and keep one of the set found, as for my task
- either one with keywors specified in regex/email_keywors.txt
- or the first one if there are none which contain needed
- keywords
- """
+#cropped_emails = r'(info|support|contact|faq|help|hello)@[0-9A-Za-z]*\.[0-9A-Za-z]{2,3}'
- for word in keywords:
- if word in ''.join(list(emails)):
- for email in emails:
- if word in email:
- return email
-
- if len(list(emails)) > 0:
- return list(emails)[0]
- return None
-
- self.model['email'] = keep_with_keywords(get_all_emails(self.soup), keywords=email_keywords)
-
-
- def find_links(self):
-
- def find_raw_links(soup):
- """
- Find links:
- external: social media links
- internal: links to policies, faq, etc
- """
-
- links = {}
- for each in soup.find_all('a'):
- for ext_key, ext_val in external_links.items():
- if ext_val in str(each.get('href')):
- links[ext_key] = str(each.get('href'))
-
- for int_key, int_val in internal_links.items():
- try:
- url = re.findall(int_val, each.get('href'))
- if len(url) > 0:
- links[int_key] = str(each.get('href'))
- except Exception:
- pass
-
- return links
-
-
-
- def build_links(links):
- """
- Build links from raw scraped hfer attributes
- """
-
- for key, link in links.items():
- if link.startswith('http') or link.startswith('www'):
- links[key] = self.fix_link(link)
- continue
- if link.startswith('//'):
- links[key] = self.fix_link(link[2:])
- continue
- if key in external_links.keys():
- continue
- if link.startswith('/'):
- if self.url.endswith('/'):
- links[key] = self.url+link[1:]
- else:
- links[key] = self.url+link
-
-
- else:
- if link.startswith('http') == False and link.startswith('www') == False:
- if self.url.endswith('/'):
- links[key] = self.url+link
- else:
- links[key] = self.url+'/'+link
-
- links = clean_links(links)
-
- return links
-
- def clean_links(links):
- """
- Clean links which require login or sign up and containing
- some search/meta data parameters
- """
-
- stop_attrs = ['#', '?', 'login', 'signup', 'sign-up', 'sign_up']
-
- for key, link in links.items():
- for attr in stop_attrs:
- if attr in link:
- links[key] = link.split(attr)[0]
-
- return links
-
-
-
-
- links = build_links(find_raw_links(self.soup))
-
- for key, link in links.items():
- self.model[key] = link
-
-
- def validate_address(self):
-
- def check_address(adr, geo_key=None):
- """
- Validate address using geolocation API, first to make sure
- scraped address is a valid one, seccond to fix if there
- is any missing pieces and third to aid mt current task
- """
-
- if geo_key:
-
- r = requests.get(f'https://geocoder.ls.hereapi.com/6.2/geocode.json?apiKey={geo_key}&searchtext={adr}')
-
- try:
- return json.loads(r.text)['Response']['View'][0]['Result'][0]['Location']['Address']
-
- except Exception:
- return None
- else:
- return adr
-
- def extend_addresses(address, geo_key=None):
- """
- If address is a valid one break up address to corresponding
- pieces (i.e. house number, street number, etc)
- """
-
- adr_dict = {}
-
- address = check_address(address, geo_key)
- try:
- if len(address.keys()) > 0:
- for key in address.keys():
- if key == 'Label':
- adr_dict['address'] = address[key].split(',')[0]
- continue
- if key == 'AdditionalData':
- continue
- if key == 'Country' and address[key] != None:
- adr_dict['country'] = define_country(address['Country'])
- continue
-
-
- adr_dict[key] = address[key]
- except Exception:
- adr_dict = None
-
-
- return adr_dict
-
-
-
- if self.model['address'] != None and len(self.model['address']) > 0:
- if extend_addresses(self.model['address'], self.geo_key) != None:
- for key, val in extend_addresses(self.model['address'], self.geo_key).items():
- if key.lower() == 'country' and self.model['country']:
- continue
- self.model[key.lower()] = val
-
- else:
- self.model['address'] = None
-
- if self.model['address']:
- if len(self.model['address']) > 25:
- self.model['address'] = None
-
-
- def fix_link(self, link):
- """
- requests library does not handle well links in www.site.com format,
- hence needs to be fixed to be the format 'https://www.site.com'
- """
-
- if link.startswith('www.'):
- return 'https://'+link
- return link
-
-
- def split_phones(self):
- """
- Method to seperate found phones into individual ones
- """
-
- if self.split_phones_to_cols:
- for i in range(6):
- try:
- if self.model['phones'][i].startswith('+'):
- self.model[f'phone_{i+1}'] = self.model['phones'][i]
- continue
- self.model[f'phone_{i+1}'] = self.model['phones'][i]
- except IndexError:
- pass
-
-
- def scrape_text(self, method):
- """
- Scrape text of the page of interest;
- credit for the module scrape_policy_text
- goes to Olha Babich
- """
-
- for key, _ in internal_links.items():
- if self.model[key] != None:
- if 'contact' in key:
- continue
- text_key = key.split('_')[0]+'_text'
- self.model[text_key] = _get_text_list(self.model[key], method=method, web_driver=self.driver)
- try:
- if self.model[text_key] != None:
- if self.model['company_name'] != None:
- self.model[text_key] = ' '.join(text_generator(text_mas=self.model[text_key][0],
- company_name=self.model['company_name'],
- company_website=self.model['url']))
- else:
- self.model[text_key] = ' '.join(text_generator(text_mas=self.model[text_key][0],
- company_name='Company',
- company_website=self.model['url']))
- except TypeError:
- self.model[key] = None
-
- try:
- assert len(self.model[text_key]) > 1
- if self.model[text_key][0] == None and self.model[text_key][1] == None:
- self.model[text_key] = None
- except Exception:
- pass
-
-
-
- def remove_not_parsed(self):
- """
- Common issue is incapability to render the JavaScript whcih
- results in the text like 'Seems your browser is not using
- JavaScript...'
- """
-
- fields = ['faq_text', 'privacy_text', 'return_text',
- 'shipping_text', 'terms_text', 'warranty_text']
-
- for each in fields:
- if self.model[each] != None:
- if 'JavaScript' in self.model[each]:
- self.model[each] = None
-
-
-
- def scrape(self):
- """
- General pipeline of methods to scrape the website
- """
-
- self.soup = self.get_soup(self.url)
- if self.soup == None:
- return
- self.init_model()
- self.logging()
- self.define_domain()
- if self.model['company_name'] == None:
- self.get_name()
- self.find_address()
- self.find_phones()
- self.find_email()
- self.find_links()
-
- if self.model['address'] != None or len(self.model['address']) != 0:
- self.validate_address()
-
-
- if self.model['contact_link']:
- self.soup = self.get_soup(self.model['contact_link'])
- if self.soup == None:
- return
- if self.model['address'] == None or len(self.model['address']) == 0:
- self.find_address()
- self.validate_address()
- self.find_phones()
- if self.model['email'] == None or len(self.model['email']) == 0:
- self.find_email()
- if self.method == 'requests':
- self.scrape_text(method='requests')
- else:
- self.scrape_text(method='webdriver')
-
-
- self.remove_not_parsed()
-
- if self.model['phones']:
- fixed_phones = []
- for phone in list(self.model['phones']):
- fixed_phones.append(process_phones(phone, self.model['country']))
-
- self.model['phones'] = fixed_phones
- else:
- self.model['phones'] = None
-
- if self.model['phones']:
- self.split_phones_to_cols = True
- self.split_phones()
-
-
- if self.verbose == 1:
- for key, val in self.model.items():
- print(key, ':', val)
-
- if self.driver:
- self.driver.quit()
-
- #del self.model['phones']
-
- return self.model
+class Scrape_it:
+ def __init__(self, url, method='requests', country='us',
+ company_name=None, category=None, geo_key=None,
+ verbose=0, driver=None):
+
+ self.url = url
+ self.method = method
+ self.model = {'url': self.url, 'country': country,
+ 'category': category, 'company_name': company_name}
+ self.soup = None
+ self.geo_key = geo_key
+ self.verbose = verbose
+ self.driver = driver
+
+ def init_model(self):
+ """
+        The current task is reflected in the model; the plan is to export
+        models to a separate file and use different ones as needed, or to
+        simplify the process of defining the model and the methods needed
+        to execute it
+ """
+
+ self.model['url'] = self.url
+ self.model['company_name'] = self.model['company_name']
+ self.model['country'] = self.model['country']
+ self.model['category'] = self.model['category']
+ self.model['contact_link'] = None
+ self.model['description'] = None
+ self.model['phones'] = None
+ self.model['phone_1'] = None
+ self.model['phone_2'] = None
+ self.model['phone_3'] = None
+ self.model['phone_4'] = None
+ self.model['phone_5'] = None
+ self.model['phone_6'] = None
+ self.model['phone_7'] = None
+ self.model['phone_8'] = None
+ self.model['phone_9'] = None
+ self.model['phone_10'] = None
+ self.model['phone_11'] = None
+ self.model['phone_12'] = None
+ self.model['phone_13'] = None
+ self.model['phone_14'] = None
+ self.model['phone_15'] = None
+ self.model['phone_16'] = None
+ self.model['phone_17'] = None
+ self.model['phone_18'] = None
+ self.model['phone_19'] = None
+ self.model['phone_20'] = None
+ self.model['address'] = None
+ self.model['state'] = None
+ self.model['county'] = None
+ self.model['city'] = None
+ self.model['street'] = None
+ self.model['housenumber'] = None
+ self.model['postalcode'] = None
+ self.model['district'] = None
+ self.model['email'] = None
+ self.model['facebook'] = None
+ self.model['instagram'] = None
+ self.model['linkedin'] = None
+ self.model['pinterest'] = None
+ self.model['twitter'] = None
+ self.model['youtube'] = None
+ self.model['faq_link'] = None
+ self.model['privacy_link'] = None
+ self.model['return_link'] = None
+ self.model['shipping_link'] = None
+ self.model['terms_link'] = None
+ self.model['warranty_link'] = None
+ self.model['faq_text'] = None
+ self.model['privacy_text'] = None
+ self.model['return_text'] = None
+ self.model['shipping_text'] = None
+ self.model['terms_text'] = None
+ self.model['warranty_text'] = None
+
+ def logging(self):
+ """
+ Log some text while scraping if verbose is set to 1
+ """
+
+ if self.verbose == 1:
+ print('Scraping', self.url, '...')
+
+ def define_domain(self):
+ """
+ Define domain name of the link
+ """
+
+ def get_domain(url):
+ domain = tldextract.extract(str(url)).domain+'.'+tldextract.extract(str(url)).suffix
+ return domain
+
+ self.model['url'] = get_domain(self.url)
+
+ def get_soup(self, url):
+ """
+ Gets soup object depending on the method
+ """
+ if self.method == 'requests':
+ import requests
+ try:
+ r = requests.get(url)
+ soup = BeautifulSoup(r.text, 'lxml')
+ except Exception as e:
+ print(e, url)
+
+ if self.method == 'webdriver':
+ from selenium import webdriver
+ options = webdriver.ChromeOptions()
+ options.add_argument('headless')
+ self.driver = webdriver.Chrome(executable_path='./chromedriver',
+ options=options)
+ try:
+ self.driver.get(url)
+ except Exception as e:
+ print(e, url)
+ soup = BeautifulSoup(self.driver.page_source, 'lxml')
+ try:
+ assert soup is not None
+ return soup
+ except Exception:
+ return
+
+ def clean_name(self):
+ """
+        Since the company name is scraped from markup it can
+        be messy and needs to be cleaned of short descriptions
+ """
+
+ delims = ['|', '-', ':']
+
+ if self.model['company_name']:
+
+ for d in delims:
+ if d in self.model['company_name']:
+ for i, s in enumerate(self.model['company_name'].split(d)):
+ same_words = set(name_stop_words).intersection(s.split(' '))
+ if len(same_words) > 0:
+ break
+ self.model['company_name'] = self.model['company_name'].split(d)[i]
+ break
+ return self.model['company_name']
+
+ def get_name(self):
+ """
+        Get the company name from the most likely places in the HTML where it could be found
+ """
+ for script in self.soup(["script", "style"]):
+ script.extract()
+
+ metas_og = ['og:site_name', 'og:title']
+ metas = ['title', 'name']
+ for meta in metas_og:
+ if self.model['company_name'] is None \
+ or self.model['company_name'] == '':
+ try:
+ meta_name = self.soup.find('meta', attrs={'property': meta})
+ self.model['company_name'] = meta_name.get('content')
+ except AttributeError:
+ pass
+
+ for meta in metas:
+ if self.model['company_name'] is None \
+ or self.model['company_name'] == '':
+ try:
+ meta_name = self.soup.find('meta', attrs={'name': meta})
+ self.model['company_name'] = meta_name.get('content')
+ except AttributeError:
+ if self.soup.find('title'):
+ if len(self.soup.find('title')) > 0:
+ title = self.soup.find('title')
+ self.model['company_name'] = title.text
+ if self.model['company_name'] is not None:
+ if 'forbidden' in self.model['company_name'].lower() or\
+ 'ngMeta' in self.model['company_name']:
+ self.model['company_name'] = None
+ if self.model['company_name']:
+ self.model['company_name'] = self.clean_name().strip()
+
+ def find_description(self):
+ """
+        Get the company description from the most likely places in the HTML where it could be found
+ """
+ for script in self.soup(["script", "style"]):
+ script.extract()
+
+ metas_og = ['og:description']
+ metas = ['description']
+ for meta in metas_og:
+ if self.model['description'] is None \
+ or self.model['description'] == '':
+ try:
+ meta_name = self.soup.find('meta', attrs={'property': meta})
+ self.model['description'] = meta_name.get('content')
+ except AttributeError:
+ pass
+
+ for meta in metas:
+
+ try:
+ meta_name = self.soup.find('meta', attrs={'name': meta})
+ self.model['description'] = meta_name.get('content')
+ except AttributeError:
+ if self.soup.find('title'):
+ if len(self.soup.find('title')) > 0:
+ title = self.soup.find('title')
+ self.model['description'] = title.text
+ if self.model['description'] is not None:
+ if 'forbidden' in self.model['description'].lower() or\
+ 'ngMeta' in self.model['description']:
+ self.model['description'] = None
+
+ def set_category(self):
+ """
+        Infer the company category by matching the category regexes against the page text
+ """
+ for script in self.soup(["script", "style"]):
+ script.extract()
+
+ for cat, reg in categories.items():
+ score = 0
+ for text in self.soup.get_text().split('\n'):
+                if re.search(reg, text.lower()):
+                    score += 1
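+            # a category is assigned once its pattern has matched on more than two lines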
+ if score > 2:
+ self.model['category'] = cat
+ break
+
+ def find_phones(self):
+
+ def get_from_href(soup):
+ """
+            If the phonenumbers package could not find any phone numbers,
+            there could be some embedded in links such as
+            <a href="tel:+15551234567">
+ """
+ phones = set()
+ for script in soup(["script", "style"]):
+ script.extract()
+
+ try:
+
+ for line in soup.find_all('a'):
+ if line.get('href').startswith('tel:'):
+ phones.add(line.get('href')[3:])
+
+ return phones
+
+ except AttributeError:
+ return None
+
+ def match_phones(soup):
+ """
+ Find phones using phonenumbers package, location is provided from
+ model's country value
+ """
+ phones = set()
+ for script in soup(["script", "style"]):
+ script.extract()
+ for line in soup.get_text().split('\n'):
+ matches = phonenumbers.PhoneNumberMatcher(line, str(self.model["country"]).upper())
+ for match in matches:
+ phones.add(phonenumbers.format_number(match.number, phonenumbers.PhoneNumberFormat.E164))
+
+ return phones
+
+        def remove_duplicates(phones):
+
+            def __get_digits__(phone):
+                # keep digits only and prepend '+' so equal numbers compare equal
+                return '+'+re.sub(r"[^0-9]", "", phone)
+
+ temp_set = set()
+
+ for phone in phones:
+ temp_set.add(__get_digits__(phone))
+
+ return list(temp_set)
+
+ self.model['phones'] = get_from_href(self.soup)
+ if self.model['phones']:
+ self.model['phones'] = self.model['phones'].union(match_phones(self.soup))
+ else:
+ self.model['phones'] = match_phones(self.soup)
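+        # cap the number of phones so they fit the phone_1..phone_20 columns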
+ if len(self.model['phones']) > 20:
+ self.model['phones'] = list(self.model['phones'])[:19]
+        self.model['phones'] = remove_duplicates(self.model['phones'])
+
+ def find_address(self):
+
+ def find_regex(soup):
+ """
+            Find address with hard-coded regular expression(s) (originally kept in regex/address.txt)
+ """
+ try:
+ address_regex = r'^([0-9\-/]+) ?([A-Za-z](?= ))? (.*?) ([^ ]+?) ?((?<= )[A-Za-z])? ?((?<= )\d*)?$\
+ ^([A-Za-z]+ ?)+[0-9]{3,6}$\
+ ^([A-Za-z]+ ?)$'
+ except Exception:
+ pass
+ for script in soup(["script", "style"]):
+ script.extract()
+ text = soup.get_text()
+ address = re.search(address_regex, text)
+ if address:
+ address = address.group(0)
+ else:
+ address = None
+
+ return address
+
+ def find_base(soup, country='us'):
+ """
+ Find addresses using pyap package
+ """
+ for script in soup(["script", "style"]):
+ script.extract()
+ text = soup.get_text()
+ address = ''
+
+ adr = pyap.parse(text, country='us')
+ if len(adr) > 0:
+ for item in adr:
+ address = address+' '+str(item)
+
+ return address
+
+ if self.model['address'] is None:
+ self.model['address'] = find_regex(self.soup)
+ if self.model['address'] is None:
+ base = find_base(self.soup, self.model['country'])
+ self.model['address'] = base
+
+ if len(self.model['address']) > 0:
+ if define_country(self.model['country']) is not None:
+ self.model['country'] = define_country(self.model['country'])
+
+ def find_email(self):
+
+ def get_all_emails(soup):
+ """
+ Get set of emails using regular expression
+ """
+
+ emails = set()
+
+ email_pattern = r'[A-Za-z0-9]*@{1}[A-Za-z0-9]*\.(com|org|de|edu|gov|uk|au|net|me){1}'
+ for script in soup(["script", "style"]):
+ script.extract()
+
+ for each in soup.get_text().split('\n'):
+ email_re = re.search(email_pattern, each)
+ if email_re:
+ if len(email_re.group(0)) > 5 \
+ and len(email_re.group(0)) < 75:
+ emails.add(email_re.group(0))
+
+ return emails
+
+ def keep_with_keywords(emails, keywords):
+ """
+            Filter the emails found and keep a single one: either one
+            containing a keyword specified in regex/email_keywords.txt,
+            or the first one if none contain the needed keywords
+ """
+
+ for word in keywords:
+ if word in ''.join(list(emails)):
+ for email in emails:
+ if word in email:
+ return email
+
+ if len(list(emails)) > 0:
+ return list(emails)[0]
+ return None
+
+        def remove_junk_nums(email):
+            # strip digits accidentally glued to the front of the address
+            while email and email[0].isdigit():
+                email = email[1:]
+
+            return email
+
+ mails = get_all_emails(self.soup)
+ self.model['email'] = keep_with_keywords(mails, email_keywords)
+ if self.model['email']:
+            self.model['email'] = remove_junk_nums(self.model['email'])
+
+ def find_links(self):
+
+ def find_raw_links(soup):
+ """
+ Find links:
+ external: social media links
+ internal: links to policies, faq, etc
+ """
+
+ links = {}
+ for each in soup.find_all('a'):
+ for ext_key, ext_val in external_links.items():
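+                    # keep only links that point somewhere past the bare social-media domain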
+ if ext_val in str(each.get('href')) \
+ and str(each.get('href')).endswith(str(ext_val)) is False \
+ and str(each.get('href')).endswith(str(ext_val)+'/') is False:
+ #print('Link', str(each.get('href')), 'does not end with', ext_val, 'and does not end with', ext_val+'/')
+ #print(str(each.get('href')).endswith(str(ext_val)))
+ #print(str(each.get('href')).endswith(str(ext_val)+'/'))
+ links[ext_key] = str(each.get('href'))
+
+ for int_key, int_val in internal_links.items():
+ try:
+ url = re.findall(int_val, each.get('href'))
+ if len(url) > 0:
+ links[int_key] = str(each.get('href'))
+ except Exception:
+ pass
+
+ return links
+
+ def build_links(links):
+ """
+            Build links from raw scraped href attributes
+ """
+
+ for key, link in links.items():
+ if link.startswith('http') or link.startswith('www'):
+ links[key] = self.fix_link(link)
+ continue
+ if link.startswith('//'):
+ links[key] = self.fix_link(link[2:])
+ continue
+ if key in external_links.keys():
+ continue
+ if link.startswith('/'):
+ if self.url.endswith('/'):
+ links[key] = self.url+link[1:]
+ else:
+ links[key] = self.url+link
+ else:
+ if link.startswith('http') is False \
+ and link.startswith('www') is False:
+ if self.url.endswith('/'):
+ links[key] = self.url+link
+ else:
+ links[key] = self.url+'/'+link
+
+ links = clean_links(links)
+
+ return links
+
+ def clean_links(links):
+ """
+            Clean links that require login or sign-up and strip
+            search/metadata parameters
+ """
+ stop_attrs = ['#', '?', 'login', 'signup', 'sign-up', 'sign_up', 'sharer']
+
+ for key, link in links.items():
+ for attr in stop_attrs:
+ if attr in link:
+ links[key] = link.split(attr)[0]
+
+ return links
+
+ links = build_links(find_raw_links(self.soup))
+
+ for key, link in links.items():
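+            # skip links that only point to a bare '<network>.com' page or back to the site's own domain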
+ if link.endswith(key+'.com') or link.endswith(key+'.com/') \
+ or link.endswith(self.model['url']) or link.endswith(self.model['url']+'/'):
+ self.model[key] = None
+ continue
+ self.model[key] = link
+ #print(key, link)
+
+ def validate_address(self):
+
+ def check_address(adr, geo_key=None):
+ """
+            Validate the address using a geolocation API: first to make sure
+            the scraped address is a valid one, second to fill in any
+            missing pieces, and third to aid the current task
+ """
+ if geo_key:
+
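+                # forward-geocode the raw address string with the HERE Geocoder API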
+ r = requests.get(f'https://geocoder.ls.hereapi.com/6.2/geocode.json?apiKey={geo_key}&searchtext={adr}')
+
+ try:
+ location = json.loads(r.text)['Response']['View'][0]['Result']
+ return location[0]['Location']['Address']
+
+ except Exception:
+ return None
+ else:
+ return adr
+
+ def extend_addresses(address, geo_key=None):
+ """
+ If address is a valid one break up address to corresponding
+ pieces (i.e. house number, street number, etc)
+ """
+
+ adr_dict = {}
+
+ address = check_address(address, geo_key)
+ try:
+ if len(address.keys()) > 0:
+ for key in address.keys():
+ if key == 'Label':
+ adr_dict['address'] = address[key].split(',')[0]
+ continue
+ if key == 'AdditionalData':
+ continue
+ if key == 'Country' and address[key] is not None:
+ c = define_country(address['Country'])
+ adr_dict['country'] = c
+ continue
+ adr_dict[key] = address[key]
+ except Exception:
+ adr_dict = None
+
+ return adr_dict
+
+ if self.model['address'] is not None and len(self.model['address']) > 0:
+ if extend_addresses(self.model['address'], self.geo_key) is not None:
+ extended = extend_addresses(self.model['address'], self.geo_key)
+ for key, val in extended.items():
+ if key.lower() == 'country' and self.model['country']:
+ continue
+ self.model[key.lower()] = val
+
+ else:
+ self.model['address'] = None
+
+ if self.model['address']:
+ if len(self.model['address']) > 25:
+ self.model['address'] = None
+
+ def fix_link(self, link):
+ """
+        The requests library does not handle links in the www.site.com format
+        well, so they need to be rewritten as 'https://www.site.com'
+ """
+        if not link.startswith('http') and not link.startswith('www'):
+ return 'https://www.'+link
+ if link.startswith('www.'):
+ return 'https://'+link
+ return link
+
+ def split_phones(self):
+ """
+        Method to separate found phones into individual columns
+        """
+        if self.split_phones_to_cols:
+            # the model exposes phone_1 .. phone_20 columns
+            for i in range(20):
+                try:
+                    self.model[f'phone_{i+1}'] = self.model['phones'][i]
+                except IndexError:
+                    pass
+
+    def phones_to_string(self):
+        """
+        Join the found phones into a single '; '-separated string
+        """
+        self.model['phones'] = '; '.join(str(phone) for phone in self.model['phones'])
+
+ def scrape_text(self, method):
+ """
+ Scrape text of the page of interest;
+ credit for the module scrape_policy_text
+ goes to Olha Babich
+ """
+
+ for key, _ in internal_links.items():
+ if self.model[key] is not None:
+ if 'contact' in key:
+ continue
+ text_key = key.split('_')[0]+'_text'
+ text_list = _get_text_list(self.model[key], method=method, web_driver=self.driver)
+ self.model[text_key] = text_list
+ try:
+ if self.model[text_key] is not None:
+ if self.model['company_name'] is not None:
+ text = ' '.join(text_generator(text_mas=self.model[text_key][0],
+ company_name=self.model['company_name'],
+ company_website='Website'))
+ self.model[text_key] = text
+ else:
+ text = ' '.join(text_generator(text_mas=self.model[text_key][0],
+ company_name='Company',
+ company_website='Website'))
+ self.model[text_key] = text
+ except TypeError:
+ self.model[key] = None
+
+ try:
+ assert len(self.model[text_key]) > 1
+ if self.model[text_key][0] is None \
+ and self.model[text_key][1] is None:
+ self.model[text_key] = None
+ except Exception:
+ pass
+
+ def remove_not_parsed(self):
+ """
+        A common issue is the inability to render JavaScript, which
+        results in text like 'Seems your browser is not using
+        JavaScript...'
+ """
+ fields = ['faq_text', 'privacy_text', 'return_text',
+ 'shipping_text', 'terms_text', 'warranty_text']
+
+ for each in fields:
+ if self.model[each] is not None:
+ if 'JavaScript' in self.model[each]:
+ self.model[each] = None
+
+ def scrape(self):
+ """
+ General pipeline of methods to scrape the website
+ """
+ self.soup = self.get_soup(self.url)
+ if self.soup is None:
+ return
+ self.init_model()
+ self.logging()
+ self.define_domain()
+ if not self.model['category']:
+ self.set_category()
+ if self.model['company_name'] is None:
+ self.get_name()
+ self.find_description()
+ self.find_address()
+ self.find_phones()
+ self.find_email()
+ self.find_links()
+
+        if self.model['address'] is not None \
+                and len(self.model['address']) != 0:
+ self.validate_address()
+
+ if self.model['contact_link']:
+ self.soup = self.get_soup(self.model['contact_link'])
+ if self.soup is None:
+ return
+ if self.model['address'] is None or len(self.model['address']) == 0:
+ self.find_address()
+ self.validate_address()
+ self.find_phones()
+ if self.model['email'] is None or len(self.model['email']) == 0:
+ self.find_email()
+ if self.method == 'requests':
+ self.scrape_text(method='requests')
+ else:
+ self.scrape_text(method='webdriver')
+
+ self.remove_not_parsed()
+
+ if self.model['phones']:
+ fixed_phones = []
+ for phone in list(self.model['phones']):
+ ph = process_phones(phone, self.model['country'])
+ fixed_phones.append(ph)
+
+ self.model['phones'] = fixed_phones
+ else:
+ self.model['phones'] = None
+
+ if self.model['phones']:
+ self.split_phones_to_cols = True
+ self.split_phones()
+ self.phones_to_string()
+
+ if self.verbose == 1:
+ for key, val in self.model.items():
+ print(key, ':', val)
+
+ if self.driver:
+ self.driver.quit()
+
+ return self.model
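
For reference, the reworked pipeline above can be driven end to end roughly as follows. This is a minimal sketch against the constructor and scrape() method shown in this diff; example.com is a placeholder URL and geo_key is left unset, so address validation is effectively skipped:

from scrape_it import Scrape_it

scraper = Scrape_it('https://example.com', method='requests',
                    country='us', verbose=1)
model = scraper.scrape()  # returns the populated model dict, or None if the page could not be fetched
if model:
    print(model['company_name'], model['email'], model['phones'])
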
diff --git a/build/lib/scrape_it/scrape_it_experimental.py b/build/lib/scrape_it/scrape_it_experimental.py
new file mode 100644
index 0000000..d111986
--- /dev/null
+++ b/build/lib/scrape_it/scrape_it_experimental.py
@@ -0,0 +1,698 @@
+"""
+Scrape_it
+
+Author: Valentyna Fihurska
+
+License: Apache-2.0
+
+Scrape_it is a tool for extracting valuable information
+from the website of interest. Save your time on reading
+and crawling through the website and leave it for Scrape_it!
+
+Find an example how to run program in the run.py
+or refer to README
+
+
+Note on additional .txt files in regex directory
+
+name_stop_words.txt - Contains words to filter unneeded words
+ during the search of entity's name
+
+email_keywords.txt - Specific file to filter emails based on
+ keywords it might contain (this is
+ as needed for current task)
+
+regex.txt - some regular expressions to search phone numbers;
+ at the current time is not in use, phonenumbers
+ package is used instead; one of improvements
+ should be a workflow which would allow efficient
+ and accurate phone matching with good filter
+ pipeline from 'scraping trash'
+
+address.txt - some regular expressions to match addresses,
+                not perfect especially given the diversity
+                of different address formats across
+ different countries
+
+"""
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .correct_base import process_phones, define_country
+from .regex import *
+from bs4 import BeautifulSoup
+import re
+import phonenumbers
+import pyap
+import requests
+import json
+import os
+from selenium import webdriver
+from .scrape_policy_text import _get_text_list, text_generator
+import tldextract
+
+"""
+with open('regex/name_stop_words.txt', 'r') as file:
+ name_stop_words = [r.strip() for r in file.readlines()]
+
+with open('regex/phones_regex.txt', 'r') as file:
+ phone_regex = [r.strip() for r in file.readlines()]
+
+with open('regex/email_keywords.txt', 'r') as file:
+ email_keywords = [w.strip() for w in file.readlines()]
+
+with open('regex/js_keywords.txt', 'r') as file:
+ js_keywords = [w.strip() for w in file.readlines()]
+"""
+
+
+internal_links = {'contact_link': r'contact.*',
+ 'privacy_link': r'privacy.*policy',
+ 'shipping_link': r'(deliver|shiping).*(policy)*',
+ 'terms_link': r'term.*(condition|use|service)',
+ 'faq_link': r'(faq)|(frequently.*asked.*question)',
+ 'return_link': r'return.*',
+ 'warranty_link': r'(warrant)|(guarant)'}
+
+external_links = {'twitter': 'twitter.com',
+ 'facebook': 'facebook.com',
+ 'instagram': 'instagram.com',
+ 'pinterest': 'pinterest.com',
+ 'youtube': 'youtube.com',
+ 'linkedin': 'linkedin.com'}
+
+
+class Scrape_it:
+
+ def __init__(self, url, method='requests', country='us',
+ company_name=None, category=None, geo_key=None,
+ verbose=0, driver=None):
+
+ self.url = url
+ self.method = method
+ #self.model = {'url': self.url, 'country': country,
+ # 'category': category, 'company_name': company_name}
+ self.soup = None
+ self.geo_key = geo_key
+ self.verbose = verbose
+ self.driver = driver
+        self.soup = self.get_soup()
+
+
+ def logging(self):
+ """
+ Log some text while scraping if verbose is set to 1
+ """
+
+ if self.verbose == 1:
+ print('Scraping', self.url, '...')
+
+ def define_domain(self):
+ """
+ Define domain name of the link
+ """
+
+ def get_domain(url):
+ domain = tldextract.extract(str(url)).domain+'.'+tldextract.extract(str(url)).suffix
+ return ['https://'+domain, 'http://'+domain, 'https://www.'+domain]
+
+ return get_domain(self.url)
+
+ #self.model['url'] = get_domain(self.url)
+
+ def get_soup(self):
+ """
+ Gets soup object depending on the method
+ """
+        url = self.define_domain()
+ if self.method == 'requests':
+ import requests
+ try:
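+                # try the https://, http:// and https://www. variants until one answers with HTTP 200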
+ for link in url:
+ r = requests.get(link)
+ if r.status_code == 200:
+ soup = BeautifulSoup(r.text, 'lxml')
+ break
+
+ except Exception as e:
+ print(e, link)
+
+ if self.method == 'webdriver':
+ from selenium import webdriver
+ options = webdriver.ChromeOptions()
+ options.add_argument('headless')
+ self.driver = webdriver.Chrome(executable_path='./chromedriver',
+ options=options)
+
+ for link in url:
+ try:
+ self.driver.get(link)
+ soup = BeautifulSoup(self.driver.page_source, 'lxml')
+ break
+ except Exception as e:
+ print(e, link)
+
+ try:
+ assert soup is not None
+ return soup
+ except Exception:
+ return
+
+ #def scrape(self):
+
+ #self.soup = self.get_soup(self.url)
+
+
+
+
+
+class CustomScrape(Scrape_it):
+
+    def __init__(self, url, method='requests', country='us',
+                 company_name=None, category=None, geo_key=None,
+                 verbose=0, driver=None):
+        super().__init__(url, method=method, country=country,
+                         company_name=company_name, category=category,
+                         geo_key=geo_key, verbose=verbose, driver=driver)
+
+        self.model = {'url': self.url, 'country': country,
+                      'category': category, 'company_name': company_name}
+
+
+ def init_model(self):
+ """
+        The current task is reflected in the model; the plan is to export
+        models to a separate file and use different ones as needed, or to
+        simplify the process of defining the model and the methods needed
+        to execute it
+ """
+
+ self.model['url'] = self.url
+ self.model['company_name'] = self.model['company_name']
+ self.model['country'] = self.model['country']
+ self.model['category'] = self.model['category']
+ self.model['contact_link'] = None
+ self.model['phones'] = None
+ self.model['phone_1'] = None
+ self.model['phone_2'] = None
+ self.model['phone_3'] = None
+ self.model['phone_4'] = None
+ self.model['phone_5'] = None
+ self.model['phone_6'] = None
+ self.model['address'] = None
+ self.model['state'] = None
+ self.model['county'] = None
+ self.model['city'] = None
+ self.model['street'] = None
+ self.model['housenumber'] = None
+ self.model['postalcode'] = None
+ self.model['district'] = None
+ self.model['email'] = None
+ self.model['facebook'] = None
+ self.model['instagram'] = None
+ self.model['linkedin'] = None
+ self.model['pinterest'] = None
+ self.model['twitter'] = None
+ self.model['youtube'] = None
+ self.model['faq_link'] = None
+ self.model['privacy_link'] = None
+ self.model['return_link'] = None
+ self.model['shipping_link'] = None
+ self.model['terms_link'] = None
+ self.model['warranty_link'] = None
+ self.model['faq_text'] = None
+ self.model['privacy_text'] = None
+ self.model['return_text'] = None
+ self.model['shipping_text'] = None
+ self.model['terms_text'] = None
+ self.model['warranty_text'] = None
+
+ def clean_name(self):
+ """
+        Since the company name is scraped from markup it can
+        be messy and needs to be cleaned of short descriptions
+ """
+
+ delims = ['|', '-', ':']
+
+ if self.model['company_name']:
+
+ for d in delims:
+ if d in self.model['company_name']:
+ for i, s in enumerate(self.model['company_name'].split(d)):
+ same_words = set(name_stop_words).intersection(s.split(' '))
+ if len(same_words) > 0:
+ break
+ self.model['company_name'] = self.model['company_name'].split(d)[i]
+ break
+ return self.model['company_name']
+
+ def get_name(self):
+ """
+        Get the company name from the most likely places in the HTML where it could be found
+ """
+ for script in self.soup(["script", "style"]):
+ script.extract()
+
+ metas_og = ['og:site_name', 'og:title']
+ metas = ['title', 'name']
+ for meta in metas_og:
+ if self.model['company_name'] is None \
+ or self.model['company_name'] == '':
+ try:
+ meta_name = self.soup.find('meta', attrs={'property': meta})
+ self.model['company_name'] = meta_name.get('content')
+ except AttributeError:
+ pass
+
+ for meta in metas:
+ if self.model['company_name'] is None \
+ or self.model['company_name'] == '':
+ try:
+ meta_name = self.soup.find('meta', attrs={'name': meta})
+ self.model['company_name'] = meta_name.get('content')
+ except AttributeError:
+ if self.soup.find('title'):
+ if len(self.soup.find('title')) > 0:
+ title = self.soup.find('title')
+ self.model['company_name'] = title.text
+ if self.model['company_name'] is not None:
+ if 'forbidden' in self.model['company_name'].lower() or\
+ 'ngMeta' in self.model['company_name']:
+ self.model['company_name'] = None
+ if self.model['company_name']:
+ self.model['company_name'] = self.clean_name().strip()
+
+ def find_phones(self):
+
+ def get_from_href(soup):
+ """
+            If the phonenumbers package could not find any phone numbers,
+            there could be some embedded in links such as
+            <a href="tel:+15551234567">
+ """
+ phones = set()
+ for script in soup(["script", "style"]):
+ script.extract()
+
+ try:
+
+ for line in soup.find_all('a'):
+ if line.get('href').startswith('tel:'):
+ phones.add(line.get('href')[3:])
+
+ return phones
+
+ except AttributeError:
+ return None
+
+ def match_phones(soup):
+ """
+ Find phones using phonenumbers package, location is provided from
+ model's country value
+ """
+ phones = set()
+ for script in soup(["script", "style"]):
+ script.extract()
+ for line in soup.get_text().split('\n'):
+ matches = phonenumbers.PhoneNumberMatcher(line, str(self.model["country"]).upper())
+ for match in matches:
+ phones.add(phonenumbers.format_number(match.number, phonenumbers.PhoneNumberFormat.E164))
+
+ return phones
+
+ self.model['phones'] = get_from_href(self.soup)
+ if self.model['phones']:
+ self.model['phones'] = self.model['phones'].union(match_phones(self.soup))
+ else:
+ self.model['phones'] = match_phones(self.soup)
+
+ def find_address(self):
+
+ def find_regex(soup):
+ """
+ Find an address with the regular expression(s) defined below
+ """
+ # the three patterns are alternatives; the anchors apply per line (MULTILINE)
+ address_regex = (r'^([0-9\-/]+) ?([A-Za-z](?= ))? (.*?) ([^ ]+?) ?((?<= )[A-Za-z])? ?((?<= )\d*)?$'
+ r'|^([A-Za-z]+ ?)+[0-9]{3,6}$'
+ r'|^([A-Za-z]+ ?)$')
+ for script in soup(["script", "style"]):
+ script.extract()
+ text = soup.get_text()
+ address = re.search(address_regex, text, re.MULTILINE)
+ if address:
+ address = address.group(0)
+ else:
+ address = None
+
+ return address
+
+ def find_base(soup, country='us'):
+ """
+ Find addresses using pyap package
+ """
+ for script in soup(["script", "style"]):
+ script.extract()
+ text = soup.get_text()
+ address = ''
+
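+ # NOTE: pyap ships parsers for only a few regions ('US', 'CA', 'GB'),
+ # which is presumably why 'us' is hard-coded below rather than the
+ # country argument. Illustrative (assumed) usage:
+ # pyap.parse('Visit us at 225 E 95th St New York NY 10128', country='US')
+ # returns a list of Address objects whose str() is the matched address.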
+ adr = pyap.parse(text, country='us')
+ if len(adr) > 0:
+ for item in adr:
+ address = address+' '+str(item)
+
+ return address
+
+ if self.model['address'] is None:
+ self.model['address'] = find_regex(self.soup)
+ if self.model['address'] is None:
+ base = find_base(self.soup, self.model['country'])
+ self.model['address'] = base
+
+ if len(self.model['address']) > 0:
+ if define_country(self.model['country']) is not None:
+ self.model['country'] = define_country(self.model['country'])
+
+ def find_email(self):
+
+ def get_all_emails(soup):
+ """
+ Get set of emails using regular expression
+ """
+
+ emails = set()
+
+ # allow dots/hyphens/plus in the local part and dots/hyphens in the domain
+ email_pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.(com|org|de|edu|gov|uk|au|net|me)'
+ for script in soup(["script", "style"]):
+ script.extract()
+
+ for each in soup.get_text().split('\n'):
+ email_re = re.search(email_pattern, each)
+ if email_re:
+ if len(email_re.group(0)) > 5 \
+ and len(email_re.group(0)) < 75:
+ emails.add(email_re.group(0))
+
+ return emails
+
+ def keep_with_keywords(emails, keywords):
+ """
+ Filter emails and keep one from the set found: for my task,
+ either one containing a keyword specified in email_keywords
+ or simply the first one if none of them contain the needed
+ keywords
+ """
+
+ for word in keywords:
+ if word in ''.join(list(emails)):
+ for email in emails:
+ if word in email:
+ return email
+
+ if len(list(emails)) > 0:
+ return list(emails)[0]
+ return None
+ mails = get_all_emails(self.soup)
+ self.model['email'] = keep_with_keywords(mails, email_keywords)
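+ # Sketch of the selection logic (assuming 'support' is one of the
+ # imported email_keywords): from {'sales@example.com',
+ # 'support@example.com'} the filter keeps 'support@example.com';
+ # if nothing matches a keyword, an arbitrary element of the set is kept.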
+
+ def find_links(self):
+
+ def find_raw_links(soup):
+ """
+ Find links:
+ external: social media links
+ internal: links to policies, faq, etc
+ """
+
+ links = {}
+ for each in soup.find_all('a'):
+ for ext_key, ext_val in external_links.items():
+ if ext_val in str(each.get('href')):
+ links[ext_key] = str(each.get('href'))
+
+ for int_key, int_val in internal_links.items():
+ try:
+ url = re.findall(int_val, each.get('href'))
+ if len(url) > 0:
+ links[int_key] = str(each.get('href'))
+ except Exception:
+ pass
+
+ return links
+
+ def build_links(links):
+ """
+ Build full links from the raw scraped href attributes
+ """
+
+ for key, link in links.items():
+ if link.startswith('http') or link.startswith('www'):
+ links[key] = self.fix_link(link)
+ continue
+ if link.startswith('//'):
+ links[key] = self.fix_link(link[2:])
+ continue
+ if key in external_links.keys():
+ continue
+ if link.startswith('/'):
+ if self.url.endswith('/'):
+ links[key] = self.url+link[1:]
+ else:
+ links[key] = self.url+link
+ else:
+ if link.startswith('http') is False \
+ and link.startswith('www') is False:
+ if self.url.endswith('/'):
+ links[key] = self.url+link
+ else:
+ links[key] = self.url+'/'+link
+
+ links = clean_links(links)
+
+ return links
+
+ def clean_links(links):
+ """
+ Clean links that require a login or sign-up and strip search/metadata
+ parameters (everything from the stop token onward is dropped)
+ """
+ stop_attrs = ['#', '?', 'login', 'signup', 'sign-up', 'sign_up']
+
+ for key, link in links.items():
+ for attr in stop_attrs:
+ if attr in link:
+ links[key] = link.split(attr)[0]
+
+ return links
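+ # e.g. 'https://example.com/faq?ref=nav' is truncated at the first
+ # stop token ('?') to 'https://example.com/faq'.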
+
+ links = build_links(find_raw_links(self.soup))
+
+ for key, link in links.items():
+ self.model[key] = link
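+ # Rough example of the pipeline above (assuming internal_links has a
+ # pattern matching 'shipping'): on a page at https://example.com the
+ # anchor <a href="/shipping-policy"> ends up in the model as
+ # shipping_link = 'https://example.com/shipping-policy'.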
+
+ def validate_address(self):
+
+ def check_address(adr, geo_key=None):
+ """
+ Validate the address using a geocoding API: first to make sure
+ the scraped address is a valid one, second to fix any missing
+ pieces and third to aid my current task
+ """
+ if geo_key:
+
+ # pass searchtext via params so the address gets URL-encoded properly
+ r = requests.get('https://geocoder.ls.hereapi.com/6.2/geocode.json',
+ params={'apiKey': geo_key, 'searchtext': adr})
+
+ try:
+ location = json.loads(r.text)['Response']['View'][0]['Result']
+ return location[0]['Location']['Address']
+
+ except Exception:
+ return None
+ else:
+ return adr
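+ # The nesting above follows the legacy HERE Geocoder 6.2 JSON layout:
+ # Response -> View -> Result -> Location -> Address, a dict with keys
+ # such as 'Label' and 'Country' that extend_addresses unpacks below.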
+
+ def extend_addresses(address, geo_key=None):
+ """
+ If the address is a valid one, break it up into its corresponding
+ pieces (e.g. house number, street name, etc.)
+ """
+
+ adr_dict = {}
+
+ address = check_address(address, geo_key)
+ try:
+ if len(address.keys()) > 0:
+ for key in address.keys():
+ if key == 'Label':
+ adr_dict['address'] = address[key].split(',')[0]
+ continue
+ if key == 'AdditionalData':
+ continue
+ if key == 'Country' and address[key] is not None:
+ c = define_country(address['Country'])
+ adr_dict['country'] = c
+ continue
+ adr_dict[key] = address[key]
+ except Exception:
+ adr_dict = None
+
+ return adr_dict
+
+ if self.model['address'] is not None and len(self.model['address']) > 0:
+ # call extend_addresses once to avoid a duplicate geocoding request
+ extended = extend_addresses(self.model['address'], self.geo_key)
+ if extended is not None:
+ for key, val in extended.items():
+ if key.lower() == 'country' and self.model['country']:
+ continue
+ self.model[key.lower()] = val
+
+ else:
+ self.model['address'] = None
+
+ if self.model['address']:
+ if len(self.model['address']) > 25:
+ self.model['address'] = None
+
+ def fix_link(self, link):
+ """
+ The requests library does not handle links in the 'www.site.com'
+ format well, hence they need to be fixed to 'https://www.site.com'
+ """
+ if link.startswith('www.'):
+ return 'https://'+link
+ return link
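+ # e.g. fix_link('www.example.com') -> 'https://www.example.com',
+ # while 'https://example.com' is returned unchanged.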
+
+ def split_phones(self):
+ """
+ Method to separate the found phones into individual columns
+ """
+ if self.split_phones_to_cols:
+ for i in range(6):
+ try:
+ self.model[f'phone_{i+1}'] = self.model['phones'][i]
+ except IndexError:
+ pass
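+ # e.g. with phones ['+14155550100', '+14155550101'] this fills
+ # phone_1 and phone_2 and leaves phone_3..phone_6 unset.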
+
+ def scrape_text(self, method):
+ """
+ Scrape text of the page of interest;
+ credit for the module scrape_policy_text
+ goes to Olha Babich
+ """
+
+ for key, _ in internal_links.items():
+ if self.model[key] is not None:
+ if 'contact' in key:
+ continue
+ text_key = key.split('_')[0]+'_text'
+ text_list = _get_text_list(self.model[key], method=method, web_driver=self.driver)
+ self.model[text_key] = text_list
+ try:
+ if self.model[text_key] is not None:
+ if self.model['company_name'] is not None:
+ text = ' '.join(text_generator(text_mas=self.model[text_key][0],
+ company_name=self.model['company_name'],
+ company_website=self.model['url']))
+ self.model[text_key] = text
+ else:
+ text = ' '.join(text_generator(text_mas=self.model[text_key][0],
+ company_name='Company',
+ company_website=self.model['url']))
+ self.model[text_key] = text
+ except TypeError:
+ self.model[key] = None
+
+ try:
+ assert len(self.model[text_key]) > 1
+ if self.model[text_key][0] is None \
+ and self.model[text_key][1] is None:
+ self.model[text_key] = None
+ except Exception:
+ pass
+
+ def remove_not_parsed(self):
+ """
+ A common issue is the inability to render JavaScript, which
+ results in text like 'Seems your browser is not using
+ JavaScript...'
+ """
+ fields = ['faq_text', 'privacy_text', 'return_text',
+ 'shipping_text', 'terms_text', 'warranty_text']
+
+ for each in fields:
+ if self.model[each] is not None:
+ if 'JavaScript' in self.model[each]:
+ self.model[each] = None
+
+ def scrape(self):
+ """
+ General pipeline of methods to scrape the website
+ """
+ self.soup = self.get_soup(self.url)
+ if self.soup is None:
+ return
+ self.init_model()
+ self.logging()
+ self.define_domain()
+ if self.model['company_name'] is None:
+ self.get_name()
+ self.find_address()
+ self.find_phones()
+ self.find_email()
+ self.find_links()
+
+ if self.model['address'] is not None \
+ and len(self.model['address']) != 0:
+ self.validate_address()
+
+ if self.model['contact_link']:
+ self.soup = self.get_soup(self.model['contact_link'])
+ if self.soup is None:
+ return
+ if self.model['address'] is None or len(self.model['address']) == 0:
+ self.find_address()
+ self.validate_address()
+ self.find_phones()
+ if self.model['email'] is None or len(self.model['email']) == 0:
+ self.find_email()
+ if self.method == 'requests':
+ self.scrape_text(method='requests')
+ else:
+ self.scrape_text(method='webdriver')
+
+ self.remove_not_parsed()
+
+ if self.model['phones']:
+ fixed_phones = []
+ for phone in list(self.model['phones']):
+ ph = process_phones(phone, self.model['country'])
+ fixed_phones.append(ph)
+
+ self.model['phones'] = fixed_phones
+ else:
+ self.model['phones'] = None
+
+ if self.model['phones']:
+ self.split_phones_to_cols = True
+ self.split_phones()
+
+ if self.verbose == 1:
+ for key, val in self.model.items():
+ print(key, ':', val)
+
+ if self.driver:
+ self.driver.quit()
+
+ return self.model
\ No newline at end of file
diff --git a/dist/scrape_it-0.3.7-py2.py3-none-any.whl b/dist/scrape_it-0.3.7-py2.py3-none-any.whl
new file mode 100644
index 0000000..fd82804
Binary files /dev/null and b/dist/scrape_it-0.3.7-py2.py3-none-any.whl differ
diff --git a/dist/scrape_it-0.3.7.tar.gz b/dist/scrape_it-0.3.7.tar.gz
new file mode 100644
index 0000000..b312598
Binary files /dev/null and b/dist/scrape_it-0.3.7.tar.gz differ
diff --git a/scrape_it.egg-info/PKG-INFO b/scrape_it.egg-info/PKG-INFO
index b516894..b7f2071 100755
--- a/scrape_it.egg-info/PKG-INFO
+++ b/scrape_it.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: scrape-it
-Version: 0.3.6
+Version: 0.3.7
Summary: Systemitized tool for scraping
Home-page: https://github.com/erelin6613/Scrape_it
Author: Valentyna Fihurska
diff --git a/scrape_it.egg-info/SOURCES.txt b/scrape_it.egg-info/SOURCES.txt
index 949fe10..2e407bf 100755
--- a/scrape_it.egg-info/SOURCES.txt
+++ b/scrape_it.egg-info/SOURCES.txt
@@ -8,6 +8,7 @@ scrape_it/models.py
scrape_it/regex.py
scrape_it/run.py
scrape_it/scrape_it.py
+scrape_it/scrape_it_experimental.py
scrape_it/scrape_policy_text.py
scrape_it.egg-info/PKG-INFO
scrape_it.egg-info/SOURCES.txt
diff --git a/scrape_it/__init__.py b/scrape_it/__init__.py
index 3cf7186..ff33d6b 100755
--- a/scrape_it/__init__.py
+++ b/scrape_it/__init__.py
@@ -51,7 +51,7 @@
from .scrape_it import Scrape_it
-__version__ = '0.3.7'
+__version__ = '0.3.8'
if __name__ == '__main__':
import doctest