diff --git a/build/lib/scrape_it/__init__.py b/build/lib/scrape_it/__init__.py old mode 100755 new mode 100644 index 60fcb24..ff33d6b --- a/build/lib/scrape_it/__init__.py +++ b/build/lib/scrape_it/__init__.py @@ -51,7 +51,7 @@ from .scrape_it import Scrape_it -__version__ = '0.3.6' +__version__ = '0.3.8' if __name__ == '__main__': import doctest diff --git a/build/lib/scrape_it/regex.py b/build/lib/scrape_it/regex.py old mode 100755 new mode 100644 index 88cf597..a87aa5c --- a/build/lib/scrape_it/regex.py +++ b/build/lib/scrape_it/regex.py @@ -8,6 +8,135 @@ r'[(-,. ]*[0-9]{2}[)-,. ]+[0-9]{4}[()-,. ]?[0-9]{4}'] -email_keywords = ['help', 'faq', 'info', 'support'] +email_keywords = ['help', 'faq', 'info', 'support', 'hello'] -js_keywords = ['ngMeta'] \ No newline at end of file +js_keywords = ['ngMeta'] + +categories = {'Auto Locksmith': r'((auto)|(automotive)|(automobile)|(car)).locksmith', + 'Auto Repair Services': r'((auto)|(automotive)|(automobile)|(car)).((repair)|(mechanic)|(technic))', + 'Auto Upholstery Repair': r'((auto)|(automotive)|(automobile)|(car)).upholstery', + 'Bicycle Repair': r'((bicycle)|(bike)).((repair)|(mechanic)|(technic))', + 'Car Wash': r'((auto)|(automotive)|(automobile)|(car)).((wash)|(detailing))', + 'Driving School': r'driv.*((school)|(course))', + 'Hoverboard & Segway Repair': r'((hoverboard)|(segway)).((repair)|(mechanic)|(technic))', + 'Lawn Equipment Repair': r'((lawn)|(snow)|(leaf)*|(leav)*|(grass)).((equipment)|(mower)|(blower)).((repair)|(mechanic)|(technic))?', + 'Motorcycle Repair': r'((motorcycle)|(bike)).((repair)|(mechanic)|(technic))', + 'Towing Services': r'((car)|(truck)|(trailer)|(motorcycle)|(bike))?.((tow)|(towing)|(road assistance))(.service)?', + 'Air Conditioner Installation': r'air.condit*install*', + 'Air Duct Cleaning': r'air.duct*clean*', + 'Bathtub Installation & Replacement': r'bath*install*', + 'Carpenter': r'carpent*', + 'Carpet Installation': r'carpet.install*', + 'Deck Building & Repair': r'deck.((build)*|(repair)|(service))', + 'Door Services': r'door.service*', + 'Drain Cleaning': r'drain.clean*', + 'Drywall Installation': r'drywall.install*', + 'Electrician': 'electric*(service)?', + 'Exterior Door Installation': r'exterior.door.install*', + 'Flooring Contractors': r'flooring(.service)?', + 'Furniture Assembly & Repair': r'furniture.((assembl)*|(repair))', + 'Garage Door Installation': r'garage.door.install*', + 'Garage Door Repair': r'garage.door.((repair)|(service))', + 'General Contractors': r'general.contract*', + 'Handyman': r'handym(a|e)n', + 'Hardwood Flooring': r'hardwood.floor*', + 'Heating Installation': r'heat*install*', + 'HVAC Repair': r'hvac.((repair)|(technic))', + 'HVAC Services': r'hvac.service*', + 'Laminate Flooring': r'laminate.floor*', + 'Painting Contractors': r'paint*contract*', + 'Piano Tuning': r'piano.tun((er)|(ing))', + 'Plumber': r'plumb((er)|(ing))', + 'Remodeling Services': r'remodel*service*', + 'Spray Foam Insulation': r'((spray).)?foam.insul*', + 'Tile & Ceramic Flooring': r'tile.((ceramic).)?flooring', + 'Tile Contractors': r'tile.contract*', + 'TV Installation': r'((tv)|(television)).install*', + 'Vinyl & Linoleum Flooring': r'((vinyl)|(linoleum)).floor*', + 'Water Heater Installation': r'water.heater.install*', + 'Water Heater Repair': r'water.heater.((repair)|(service)|(technic))', + 'Welding Services': 'welding-services', + 'Window Installation': 'window-installation', + 'Window Repair': 'window-repair', + 'Bartenders': 'bartenders', + 'Boudoir Photographers': 'boudoir-photographers', + 
'Bounce House Rentals': 'bounce-houses', + 'Catering': 'catering', + 'Commercial Photographers': 'commercial-photographers', + 'DJ & MC': 'dj-mc', + 'Event Photographers': 'event-photographers', + 'Event Planning': 'event-planner', + 'Face Painting': 'face-painting', + 'Family Photographers': 'family-photographers', + 'Flower Delivery': 'flower-delivery', + 'Limo Services': 'limo-services', + 'Party Entertainment': 'party-entertainment', + 'Party Equipment Rental': 'party-equipment-rental', + 'Personal Chef': 'personal-chef', + 'Photographers': 'photographers', + 'Portrait Photographers': 'portrait-photographers', 'Promotional Video Services': 'promotional-video-service', + 'Table & Chair Rentals': 'table-chair-rentals', 'Videographers': 'videographers', 'Wedding DJ': 'wedding-dj', + 'Wedding Officiants': 'wedding-officiants', 'Wedding Photography': 'wedding-photography', + 'Wedding Planner': 'wedding-planners', 'Wedding Videography': 'wedding-videography', 'Balayage': 'balayage-hair', + 'Barbers': 'barbershop', 'Box Braids': 'box-braids', 'Crochet Braids': 'corchet-braids', + 'Eyebrow Tinting': 'eyebrow-tinting', 'Eyelash Extension': 'eyelash-extension', 'Fashion Design': 'fashion-design', + 'Hair Extensions': 'hair-extensions', 'Hair Stylist': 'hair-stylist', + 'Henna Tattoos Artist': 'henna-tattoos-artist', 'Image Consultant': 'image-consultant', + 'MakeUp Artist': 'makeup-artist', 'Nail Services': 'nail-services', 'Nutritionist': 'nutritionist', + 'Permanent MakeUp': 'permanent-makeup', 'Personal Stylist': 'personal-stylist', + 'Personal Trainers': 'personal-trainers', 'Sew in': 'sew-in', 'Skin Care': 'skin-care', 'Tailors': 'tailors', + 'Tattoo Artist': 'tattoo-artist', 'Wedding Hair & Makeup Artist': 'wedding-makeup-artist', + 'Animal Control': 'animal-control', 'Ant Control': 'ant-exterminators', + 'Appliance Repair & Installation': 'appliance-repair-installation', 'Bathroom Design': 'bathroom-designers', + 'Bed Bug Control': 'bed-bug-control', 'Cell Phone Repair': 'cell-phone-repair', + 'Closet Organization': 'closet-organizers', 'Computer Repair': 'computer-repair', + 'Computer Services': 'computer-services', 'Decorating': 'home-decorators', + 'Dry-cleaning, Laundry & Alteration': 'dry-cleaning-laundry-alteration', + 'Grocery Shopping & Delivery': 'grocery-shopping-delivery', 'Interior Designer': 'interior-designer', + 'Kitchen Design & Planning': 'kitchen-designers', 'Landscape Designers': 'landscape-designers', + 'Lighting Design Services': 'lighting-designers', 'Locksmith': 'locksmith', 'Moving': 'moving', + 'Pest Control': 'pest-control-services', 'Piano Movers': 'piano-movers', + 'Pool Table Movers': 'pool-table-movers', 'Rat Control': 'rat-control', + 'Security Installation': 'security-installation', 'Self Storage': 'self-storage', + 'Termite Control': 'termite-control', 'Virus Removal': 'virus-removal', + 'Wasp & Bee Removal': 'wasp-bee-removal', 'Apartment Cleaning': 'apartment-cleaning', + 'Appliance Cleaning': 'appliance-cleaning', 'Carpet Cleaning': 'carpet-cleaning', + 'Commercial Cleaning': 'commercial-cleaning', 'House Cleaning': 'house-cleaning', + 'Housekeeping': 'housekeeping', 'Janitorial Services': 'janitorial-services', 'Maids': 'maids', + 'Mattress Cleaning': 'mattress-cleaning', 'Move Out Cleaning': 'move-out-cleaning', + 'Office Cleaning': 'office-cleaning', 'Upholstery Cleaning': 'upholstery-cleaning', + 'Window Cleaning': 'window-cleaning', 'Chimney Services': 'chimney-services', + 'Concrete Contractors': 'concrete-contractors', 'Demolition Services': 
'demolition-services', + 'Fence Contractors': 'fence-contractors', 'Fence Repair': 'fence-repair', 'Firewood': 'firewood', + 'Garbage Removal': 'garbage-removal', 'Gardening': 'gardening', 'Gutter Cleaning': 'gutter-cleaning', + 'Gutter Installation & Repair': 'gutter-installation-and-repair', 'Hardscape Contractors': 'hardscapers', + 'Landscaping': 'landscaping', 'Lawn Care': 'lawn-care', 'Masonry Contractors': 'masonry-contractors', + 'Pool Buildings': 'pool-buildings', 'Pool Cleaners': 'pool-cleaners', 'Pool Maintenance': 'pool-maintenance', + 'Pressure Washing': 'pressure-washing', 'Roof Cleaning': 'roof-cleaning', 'Roofing Contractors': 'roofing', + 'Roofing Installation & Repair': 'roofing-installation-and-repair', 'Snow & Ice Removal': 'snow-ice-removal', + 'Sprinkler Repairs': 'sprinkler-repairs', 'Tree Services': 'tree-services', 'Yard Clean-Up': 'yard-clean-up', + 'Aquarium Services': 'aquarium-services', 'Dog Training': 'dog-training', 'Horse Boarding': 'horse-boarding', + 'Horse Training': 'horse-training', 'Pet Daycare & Boarding': 'pet-daycare-boarding', + 'Pet Groomers': 'pet-groomer', 'Pet Sitters & Walkers': 'pet-sitting-and-walking', + 'Veterinary Services': 'veterinary', 'Academic Writing': 'academic-writing', + 'Bankruptcy Lawyers': 'bankruptcy-lawyers', 'Business Lawyers': 'business-lawyers', + 'Civil Rights Lawyers': 'civil-rights-lawyers', 'Copywriting': 'copywriting', + 'Criminal Defense Attorneys': 'criminal-defense-attorneys', 'Divorce Lawyers': 'divorce-lawyers', + 'Essays Writing & Editing': 'essays-writing-and-editing', 'Family Lawyers': 'family-lawyers', + 'Immigration Lawyers': 'immigration-lawyers', 'Lawyers': 'lawyer', 'Notary Services': 'notary-services', + 'Personal Driver': 'personal-driver', 'Personal Injury Lawyers': 'personal-injury-lawyers', + 'Private Detective': 'private-detective', 'Resume Writing': 'resume-writing', + 'Tax Preparation': 'tax-preparation', 'Translation Services': 'translator', + 'Writing & Editing': 'writing-editing-services', 'Babysitting': 'babysitters', + 'Beauty Schools': 'beauty-schools', 'Caregiver': 'caregiver', 'Chinese Lessons': 'chinese-lessons', + 'Cooking Lessons': 'cooking-lessons', 'Dancing Lessons': 'dancing-lessons', 'Daycare': 'daycare', + 'Drawing Classes': 'drawing-lessons', 'English Lessons': 'english-lessons', + 'French Lessons': 'french-lessons', 'Horseback Riding': 'horseback-riding', + 'Italian Lessons': 'italian-lessons', 'Japanese Lessons': 'japanese-lessons', 'Language Classes': 'classes', + 'MakeUp Lessons': 'makeup-lessons', 'Martial Arts': 'martial-arts', 'Music Lessons': 'music-lessons', + 'Other Classes': 'other-classes', 'Photography Classes': 'photography-classes', + 'Piano Lessons': 'piano-lessons', 'Portuguese Lessons': 'portuguese-lessons', + 'Private Tutoring': 'private-tutor', 'Russian Lessons': 'russian-lessons', + 'Singing Lessons': 'singing-lessons', 'Spanish Lessons': 'spanish-lessons', + 'Sport Lessons': 'sport-lessons', 'Surfing lessons': 'surfing-lessons', + 'Swim lessons': 'swim-lessons', 'Tennis lessons': 'tenis-lessons'} \ No newline at end of file diff --git a/build/lib/scrape_it/scrape_it.py b/build/lib/scrape_it/scrape_it.py old mode 100755 new mode 100644 index f161ee7..ebc192d --- a/build/lib/scrape_it/scrape_it.py +++ b/build/lib/scrape_it/scrape_it.py @@ -16,23 +16,23 @@ Note on additional .txt files in regex directory name_stop_words.txt - Contains words to filter unneeded words - during the search of entity's name + during the search of entity's name email_keywords.txt - 
Specific file to filter emails based on - keywords it might contain (this is - as needed for current task) + keywords it might contain (this is + as needed for current task) regex.txt - some regular expressions to search phone numbers; - at the current time is not in use, phonenumbers - package is used instead; one of improvements - should be a workflow which would allow efficient - and accurate phone matching with good filter - pipeline from 'scraping trash' + at the current time is not in use, phonenumbers + package is used instead; one of improvements + should be a workflow which would allow efficient + and accurate phone matching with good filter + pipeline from 'scraping trash' address.txt - some regular expressions to match addresses, - not perfect expesially given the diversity - of different address formats accross - different countries + not perfect expesially given the diversity + of different address formats accross + different countries """ @@ -48,7 +48,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -#from models import Business from .correct_base import process_phones, define_country from .regex import * from bs4 import BeautifulSoup @@ -64,647 +63,725 @@ """ with open('regex/name_stop_words.txt', 'r') as file: - name_stop_words = [r.strip() for r in file.readlines()] + name_stop_words = [r.strip() for r in file.readlines()] with open('regex/phones_regex.txt', 'r') as file: - phone_regex = [r.strip() for r in file.readlines()] + phone_regex = [r.strip() for r in file.readlines()] with open('regex/email_keywords.txt', 'r') as file: - email_keywords = [w.strip() for w in file.readlines()] + email_keywords = [w.strip() for w in file.readlines()] with open('regex/js_keywords.txt', 'r') as file: - js_keywords = [w.strip() for w in file.readlines()] + js_keywords = [w.strip() for w in file.readlines()] """ internal_links = {'contact_link': r'contact.*', - 'privacy_link': r'privacy.*policy', - 'shipping_link': r'(deliver|shiping).*(policy)*', - 'terms_link': r'term.*(condition|use|service)', - 'faq_link': r'(faq)|(frequently.*asked.*question)', - 'return_link': r'return.*', - 'warranty_link': r'(warrant)|(guarant)'} + 'privacy_link': r'privacy.*policy', + 'shipping_link': r'(deliver|shiping).*(policy)*', + 'terms_link': r'term.*(condition|use|service)', + 'faq_link': r'(faq)|(frequently.*asked.*question)', + 'return_link': r'return.*', + 'warranty_link': r'(warrant)|(guarant)'} external_links = {'twitter': 'twitter.com', - 'facebook': 'facebook.com', - 'instagram': 'instagram.com', - 'pinterest':'pinterest.com', - 'youtube': 'youtube.com', - 'linkedin': 'linkedin.com'} + 'facebook': 'facebook.com', + 'instagram': 'instagram.com', + 'pinterest': 'pinterest.com', + 'youtube': 'youtube.com', + 'linkedin': 'linkedin.com'} -class Scrape_it: - - def __init__(self, url, method='requests', country='us', - company_name=None, category=None, geo_key=None, - verbose=0, driver=None): - - self.url = url - self.method = method - self.model = {'url': self.url, 'country': country, - 'category': category, 'company_name': company_name} - self.soup = None - self.geo_key = geo_key - self.verbose = verbose - self.driver = driver - - - def init_model(self): - """ - Current task of mine is reflected in the model; it is planned - by me to export models to seperate file and use different onces - as need or to simplify the process of defining the model and - needed methods to execute - """ - - self.model['url'] = self.url - self.model['company_name'] 
= self.model['company_name'] - self.model['country'] = self.model['country'] - self.model['category'] = self.model['category'] - self.model['contact_link'] = None - self.model['phones'] = None - self.model['phone_1'] = None - self.model['phone_2'] = None - self.model['phone_3'] = None - self.model['phone_4'] = None - self.model['phone_5'] = None - self.model['phone_6'] = None - self.model['address'] = None - self.model['state'] = None - self.model['county'] = None - self.model['city'] = None - self.model['street'] = None - self.model['housenumber'] = None - self.model['postalcode'] = None - self.model['district'] = None - self.model['email'] = None - self.model['facebook'] = None - self.model['instagram'] = None - self.model['linkedin'] = None - self.model['pinterest'] = None - self.model['twitter'] = None - self.model['youtube'] = None - self.model['faq_link'] = None - self.model['privacy_link'] = None - self.model['return_link'] = None - self.model['shipping_link'] = None - self.model['terms_link'] = None - self.model['warranty_link'] = None - self.model['faq_text'] = None - self.model['privacy_text'] = None - self.model['return_text'] = None - self.model['shipping_text'] = None - self.model['terms_text'] = None - self.model['warranty_text'] = None - - def logging(self): - """ - Log some text while scraping if verbose is set to 1 - """ - - if self.verbose == 1: - print('Scraping', self.url, '...') - - - def define_domain(self): - """ - Define domain name of the link - """ - - def get_domain(url): - domain = tldextract.extract(str(url)).domain+'.'+tldextract.extract(str(url)).suffix - return domain - - self.model['url'] = get_domain(self.url) - - - def get_soup(self, url): - """ - Gets soup object depending on the method - """ - - if self.method == 'requests': - import requests - try: - r = requests.get(url) - soup = BeautifulSoup(r.text, 'lxml') - except Exception as e: - print(e, url) - - if self.method == 'webdriver': - from selenium import webdriver - options = webdriver.ChromeOptions() - options.add_argument('headless') - self.driver = webdriver.Chrome(executable_path='./chromedriver', options=options) - try: - self.driver.get(url) - except Exception as e: - print(e, url) - soup = BeautifulSoup(self.driver.page_source, 'lxml') - try: - assert soup != None - return soup - except Exception: - return - - - def clean_name(self): - """ - Since company name is scraped from code too it can - be messy and needs to be cleaned from short descriptions - """ - - delims = ['|', '-', ':'] - - if self.model['company_name']: - - for d in delims: - if d in self.model['company_name']: - for i, s in enumerate(self.model['company_name'].split(d)): - if len(set(name_stop_words).intersection(s.split(' '))) > 0: - break - self.model['company_name'] = self.model['company_name'].split(d)[i] - break - - - return self.model['company_name'] - - - def get_name(self): - """ - Get company name from the most likely places in html it could be found - """ - for script in self.soup(["script", "style"]): - script.extract() - - metas_og = ['og:site_name', 'og:title'] - metas = ['title', 'name'] - for meta in metas_og: - if self.model['company_name'] == None or self.model['company_name'] == '': - try: - self.model['company_name'] = self.soup.find('meta', attrs={'property': meta}).get('content') - except AttributeError: - pass - - for meta in metas: - if self.model['company_name'] == None or self.model['company_name'] == '': - try: - self.model['company_name'] = self.soup.find('meta', attrs={'name': meta}).get('content') 
- except AttributeError: - if self.soup.find('title'): - if len(self.soup.find('title')) > 0: - self.model['company_name'] = self.soup.find('title').text - if self.model['company_name'] != None: - if 'forbidden' in self.model['company_name'].lower() or\ - 'ngMeta' in self.model['company_name']: - self.model['company_name'] = None - if self.model['company_name']: - self.model['company_name'] = self.clean_name().strip() - - - def find_phones(self): - - def get_from_href(soup): - """ - If phonenumbers package could not find any phone numbers - there could be some embedded in links as in - - """ - phones = set() - for script in soup(["script", "style"]): - script.extract() - - try: - - for line in soup.find_all('a'): - if line.get('href').startswith('tel:'): - phones.add(line.get('href')[3:]) - - return phones - - except AttributeError: - return None - - - - - def match_phones(soup): - """ - Find phones using phonenumbers package, location is provided from - model's country value - """ - - phones = set() - for script in soup(["script", "style"]): - script.extract() - for line in soup.get_text().split('\n'): - for match in phonenumbers.PhoneNumberMatcher(line, str(self.model["country"]).upper()): - phones.add(phonenumbers.format_number(match.number, phonenumbers.PhoneNumberFormat.E164)) - - - return phones - - - self.model['phones'] = get_from_href(self.soup) - if self.model['phones']: - self.model['phones'] = self.model['phones'].union(match_phones(self.soup)) - else: - self.model['phones'] = match_phones(self.soup) - - #if len(self.model['phones']) == 0: - #self.model['phones'] = get_from_href(self.soup) - - - def find_address(self): - - def find_regex(soup): - """ - Find address with regular expression(s) specified in regex/address.txt - """ - - #with open('regex/address.txt') as f: - #address_regex = f.read() - try: - address_regex = r'^([0-9\-/]+) ?([A-Za-z](?= ))? (.*?) ([^ ]+?) ?((?<= )[A-Za-z])? 
?((?<= )\d*)?$\ - ^([A-Za-z]+ ?)+[0-9]{3,6}$\ - ^([A-Za-z]+ ?)$' - except Exception: - pass - for script in soup(["script", "style"]): - script.extract() - text = soup.get_text() - address = re.search(address_regex, text) - if address: - address = address.group(0) - else: - address = None - - return address - - - - def find_base(soup, country='us'): - """ - Find addresses using pyap package - """ - - for script in soup(["script", "style"]): - script.extract() - text = soup.get_text() - address = '' - - adr = pyap.parse(text, country='us') - if len(adr) > 0: - for item in adr: - address = address+' '+str(item) - - return address - - - - if self.model['address'] == None: - self.model['address'] = find_regex(self.soup) - if self.model['address'] == None: - self.model['address'] = find_base(self.soup, self.model['country']) - - if len(self.model['address']) > 0: - if define_country(self.model['country']) != None: - self.model['country'] = define_country(self.model['country']) - - - def find_email(self): - - def get_all_emails(soup): - """ - Get set of emails using regular expression - """ - - emails = set() - - email_pattern = r'[A-Za-z0-9]*@{1}[A-Za-z0-9]*\.(com|org|de|edu|gov|uk|au|net|me){1}' - for script in soup(["script", "style"]): - script.extract() - - for each in soup.get_text().split('\n'): - email_re = re.search(email_pattern, each) - if email_re: - if len(email_re.group(0)) > 5 and len(email_re.group(0)) < 75: - emails.add(email_re.group(0)) - - return emails - - def keep_with_keywords(emails, keywords): - """ - Filter emails and keep one of the set found, as for my task - either one with keywors specified in regex/email_keywors.txt - or the first one if there are none which contain needed - keywords - """ +#cropped_emails = r'[info|support|contact|faq|help|hello]@[0-9A-Za-z]*\.[0-9A-Za-z]{2,3}' - for word in keywords: - if word in ''.join(list(emails)): - for email in emails: - if word in email: - return email - - if len(list(emails)) > 0: - return list(emails)[0] - return None - - self.model['email'] = keep_with_keywords(get_all_emails(self.soup), keywords=email_keywords) - - - def find_links(self): - - def find_raw_links(soup): - """ - Find links: - external: social media links - internal: links to policies, faq, etc - """ - - links = {} - for each in soup.find_all('a'): - for ext_key, ext_val in external_links.items(): - if ext_val in str(each.get('href')): - links[ext_key] = str(each.get('href')) - - for int_key, int_val in internal_links.items(): - try: - url = re.findall(int_val, each.get('href')) - if len(url) > 0: - links[int_key] = str(each.get('href')) - except Exception: - pass - - return links - - - - def build_links(links): - """ - Build links from raw scraped hfer attributes - """ - - for key, link in links.items(): - if link.startswith('http') or link.startswith('www'): - links[key] = self.fix_link(link) - continue - if link.startswith('//'): - links[key] = self.fix_link(link[2:]) - continue - if key in external_links.keys(): - continue - if link.startswith('/'): - if self.url.endswith('/'): - links[key] = self.url+link[1:] - else: - links[key] = self.url+link - - - else: - if link.startswith('http') == False and link.startswith('www') == False: - if self.url.endswith('/'): - links[key] = self.url+link - else: - links[key] = self.url+'/'+link - - links = clean_links(links) - - return links - - def clean_links(links): - """ - Clean links which require login or sign up and containing - some search/meta data parameters - """ - - stop_attrs = ['#', '?', 'login', 
'signup', 'sign-up', 'sign_up'] - - for key, link in links.items(): - for attr in stop_attrs: - if attr in link: - links[key] = link.split(attr)[0] - - return links - - - - - links = build_links(find_raw_links(self.soup)) - - for key, link in links.items(): - self.model[key] = link - - - def validate_address(self): - - def check_address(adr, geo_key=None): - """ - Validate address using geolocation API, first to make sure - scraped address is a valid one, seccond to fix if there - is any missing pieces and third to aid mt current task - """ - - if geo_key: - - r = requests.get(f'https://geocoder.ls.hereapi.com/6.2/geocode.json?apiKey={geo_key}&searchtext={adr}') - - try: - return json.loads(r.text)['Response']['View'][0]['Result'][0]['Location']['Address'] - - except Exception: - return None - else: - return adr - - def extend_addresses(address, geo_key=None): - """ - If address is a valid one break up address to corresponding - pieces (i.e. house number, street number, etc) - """ - - adr_dict = {} - - address = check_address(address, geo_key) - try: - if len(address.keys()) > 0: - for key in address.keys(): - if key == 'Label': - adr_dict['address'] = address[key].split(',')[0] - continue - if key == 'AdditionalData': - continue - if key == 'Country' and address[key] != None: - adr_dict['country'] = define_country(address['Country']) - continue - - - adr_dict[key] = address[key] - except Exception: - adr_dict = None - - - return adr_dict - - - - if self.model['address'] != None and len(self.model['address']) > 0: - if extend_addresses(self.model['address'], self.geo_key) != None: - for key, val in extend_addresses(self.model['address'], self.geo_key).items(): - if key.lower() == 'country' and self.model['country']: - continue - self.model[key.lower()] = val - - else: - self.model['address'] = None - - if self.model['address']: - if len(self.model['address']) > 25: - self.model['address'] = None - - - def fix_link(self, link): - """ - requests library does not handle well links in www.site.com format, - hence needs to be fixed to be the format 'https://www.site.com' - """ - - if link.startswith('www.'): - return 'https://'+link - return link - - - def split_phones(self): - """ - Method to seperate found phones into individual ones - """ - - if self.split_phones_to_cols: - for i in range(6): - try: - if self.model['phones'][i].startswith('+'): - self.model[f'phone_{i+1}'] = self.model['phones'][i] - continue - self.model[f'phone_{i+1}'] = self.model['phones'][i] - except IndexError: - pass - - - def scrape_text(self, method): - """ - Scrape text of the page of interest; - credit for the module scrape_policy_text - goes to Olha Babich - """ - - for key, _ in internal_links.items(): - if self.model[key] != None: - if 'contact' in key: - continue - text_key = key.split('_')[0]+'_text' - self.model[text_key] = _get_text_list(self.model[key], method=method, web_driver=self.driver) - try: - if self.model[text_key] != None: - if self.model['company_name'] != None: - self.model[text_key] = ' '.join(text_generator(text_mas=self.model[text_key][0], - company_name=self.model['company_name'], - company_website=self.model['url'])) - else: - self.model[text_key] = ' '.join(text_generator(text_mas=self.model[text_key][0], - company_name='Company', - company_website=self.model['url'])) - except TypeError: - self.model[key] = None - - try: - assert len(self.model[text_key]) > 1 - if self.model[text_key][0] == None and self.model[text_key][1] == None: - self.model[text_key] = None - except Exception: - pass 
- - - - def remove_not_parsed(self): - """ - Common issue is incapability to render the JavaScript whcih - results in the text like 'Seems your browser is not using - JavaScript...' - """ - - fields = ['faq_text', 'privacy_text', 'return_text', - 'shipping_text', 'terms_text', 'warranty_text'] - - for each in fields: - if self.model[each] != None: - if 'JavaScript' in self.model[each]: - self.model[each] = None - - - - def scrape(self): - """ - General pipeline of methods to scrape the website - """ - - self.soup = self.get_soup(self.url) - if self.soup == None: - return - self.init_model() - self.logging() - self.define_domain() - if self.model['company_name'] == None: - self.get_name() - self.find_address() - self.find_phones() - self.find_email() - self.find_links() - - if self.model['address'] != None or len(self.model['address']) != 0: - self.validate_address() - - - if self.model['contact_link']: - self.soup = self.get_soup(self.model['contact_link']) - if self.soup == None: - return - if self.model['address'] == None or len(self.model['address']) == 0: - self.find_address() - self.validate_address() - self.find_phones() - if self.model['email'] == None or len(self.model['email']) == 0: - self.find_email() - if self.method == 'requests': - self.scrape_text(method='requests') - else: - self.scrape_text(method='webdriver') - - - self.remove_not_parsed() - - if self.model['phones']: - fixed_phones = [] - for phone in list(self.model['phones']): - fixed_phones.append(process_phones(phone, self.model['country'])) - - self.model['phones'] = fixed_phones - else: - self.model['phones'] = None - - if self.model['phones']: - self.split_phones_to_cols = True - self.split_phones() - - - if self.verbose == 1: - for key, val in self.model.items(): - print(key, ':', val) - - if self.driver: - self.driver.quit() - - #del self.model['phones'] - - return self.model +class Scrape_it: + def __init__(self, url, method='requests', country='us', + company_name=None, category=None, geo_key=None, + verbose=0, driver=None): + + self.url = url + self.method = method + self.model = {'url': self.url, 'country': country, + 'category': category, 'company_name': company_name} + self.soup = None + self.geo_key = geo_key + self.verbose = verbose + self.driver = driver + + def init_model(self): + """ + Current task of mine is reflected in the model; it is planned + by me to export models to seperate file and use different onces + as need or to simplify the process of defining the model and + needed methods to execute + """ + + self.model['url'] = self.url + self.model['company_name'] = self.model['company_name'] + self.model['country'] = self.model['country'] + self.model['category'] = self.model['category'] + self.model['contact_link'] = None + self.model['description'] = None + self.model['phones'] = None + self.model['phone_1'] = None + self.model['phone_2'] = None + self.model['phone_3'] = None + self.model['phone_4'] = None + self.model['phone_5'] = None + self.model['phone_6'] = None + self.model['phone_7'] = None + self.model['phone_8'] = None + self.model['phone_9'] = None + self.model['phone_10'] = None + self.model['phone_11'] = None + self.model['phone_12'] = None + self.model['phone_13'] = None + self.model['phone_14'] = None + self.model['phone_15'] = None + self.model['phone_16'] = None + self.model['phone_17'] = None + self.model['phone_18'] = None + self.model['phone_19'] = None + self.model['phone_20'] = None + self.model['address'] = None + self.model['state'] = None + self.model['county'] = None + 
self.model['city'] = None + self.model['street'] = None + self.model['housenumber'] = None + self.model['postalcode'] = None + self.model['district'] = None + self.model['email'] = None + self.model['facebook'] = None + self.model['instagram'] = None + self.model['linkedin'] = None + self.model['pinterest'] = None + self.model['twitter'] = None + self.model['youtube'] = None + self.model['faq_link'] = None + self.model['privacy_link'] = None + self.model['return_link'] = None + self.model['shipping_link'] = None + self.model['terms_link'] = None + self.model['warranty_link'] = None + self.model['faq_text'] = None + self.model['privacy_text'] = None + self.model['return_text'] = None + self.model['shipping_text'] = None + self.model['terms_text'] = None + self.model['warranty_text'] = None + + def logging(self): + """ + Log some text while scraping if verbose is set to 1 + """ + + if self.verbose == 1: + print('Scraping', self.url, '...') + + def define_domain(self): + """ + Define domain name of the link + """ + + def get_domain(url): + domain = tldextract.extract(str(url)).domain+'.'+tldextract.extract(str(url)).suffix + return domain + + self.model['url'] = get_domain(self.url) + + def get_soup(self, url): + """ + Gets soup object depending on the method + """ + if self.method == 'requests': + import requests + try: + r = requests.get(url) + soup = BeautifulSoup(r.text, 'lxml') + except Exception as e: + print(e, url) + + if self.method == 'webdriver': + from selenium import webdriver + options = webdriver.ChromeOptions() + options.add_argument('headless') + self.driver = webdriver.Chrome(executable_path='./chromedriver', + options=options) + try: + self.driver.get(url) + except Exception as e: + print(e, url) + soup = BeautifulSoup(self.driver.page_source, 'lxml') + try: + assert soup is not None + return soup + except Exception: + return + + def clean_name(self): + """ + Since company name is scraped from code too it can + be messy and needs to be cleaned from short descriptions + """ + + delims = ['|', '-', ':'] + + if self.model['company_name']: + + for d in delims: + if d in self.model['company_name']: + for i, s in enumerate(self.model['company_name'].split(d)): + same_words = set(name_stop_words).intersection(s.split(' ')) + if len(same_words) > 0: + break + self.model['company_name'] = self.model['company_name'].split(d)[i] + break + return self.model['company_name'] + + def get_name(self): + """ + Get company name from the most likely places in html it could be found + """ + for script in self.soup(["script", "style"]): + script.extract() + + metas_og = ['og:site_name', 'og:title'] + metas = ['title', 'name'] + for meta in metas_og: + if self.model['company_name'] is None \ + or self.model['company_name'] == '': + try: + meta_name = self.soup.find('meta', attrs={'property': meta}) + self.model['company_name'] = meta_name.get('content') + except AttributeError: + pass + + for meta in metas: + if self.model['company_name'] is None \ + or self.model['company_name'] == '': + try: + meta_name = self.soup.find('meta', attrs={'name': meta}) + self.model['company_name'] = meta_name.get('content') + except AttributeError: + if self.soup.find('title'): + if len(self.soup.find('title')) > 0: + title = self.soup.find('title') + self.model['company_name'] = title.text + if self.model['company_name'] is not None: + if 'forbidden' in self.model['company_name'].lower() or\ + 'ngMeta' in self.model['company_name']: + self.model['company_name'] = None + if self.model['company_name']: + 
self.model['company_name'] = self.clean_name().strip() + + def find_description(self): + """ + Get company description from the most likely places in html it could be found + """ + for script in self.soup(["script", "style"]): + script.extract() + + metas_og = ['og:description'] + metas = ['description'] + for meta in metas_og: + if self.model['description'] is None \ + or self.model['description'] == '': + try: + meta_name = self.soup.find('meta', attrs={'property': meta}) + self.model['description'] = meta_name.get('content') + except AttributeError: + pass + + for meta in metas: + + try: + meta_name = self.soup.find('meta', attrs={'name': meta}) + self.model['description'] = meta_name.get('content') + except AttributeError: + if self.soup.find('title'): + if len(self.soup.find('title')) > 0: + title = self.soup.find('title') + self.model['description'] = title.text + if self.model['description'] is not None: + if 'forbidden' in self.model['description'].lower() or\ + 'ngMeta' in self.model['description']: + self.model['description'] = None + + def set_category(self): + """ + Get company description from the most likely places in html it could be found + """ + for script in self.soup(["script", "style"]): + script.extract() + + for cat, reg in categories.items(): + score = 0 + for text in self.soup.get_text().split('\n'): + #print(text.lower()) + if re.search(reg, text.lower()): + print(reg, text) + score += 1 + if score > 2: + self.model['category'] = cat + break + + def find_phones(self): + + def get_from_href(soup): + """ + If phonenumbers package could not find any phone numbers + there could be some embedded in links as in + + """ + phones = set() + for script in soup(["script", "style"]): + script.extract() + + try: + + for line in soup.find_all('a'): + if line.get('href').startswith('tel:'): + phones.add(line.get('href')[3:]) + + return phones + + except AttributeError: + return None + + def match_phones(soup): + """ + Find phones using phonenumbers package, location is provided from + model's country value + """ + phones = set() + for script in soup(["script", "style"]): + script.extract() + for line in soup.get_text().split('\n'): + matches = phonenumbers.PhoneNumberMatcher(line, str(self.model["country"]).upper()) + for match in matches: + phones.add(phonenumbers.format_number(match.number, phonenumbers.PhoneNumberFormat.E164)) + + return phones + + def remove_dublicates(phones): + + def __get_digits__(phone): + phone = '+'+re.sub(r"[^0-9]", "", phone).strip() + return phone.replace(' ', '').replace('-', '').replace('(', '').replace(')', '') + + temp_set = set() + + for phone in phones: + temp_set.add(__get_digits__(phone)) + + return list(temp_set) + + self.model['phones'] = get_from_href(self.soup) + if self.model['phones']: + self.model['phones'] = self.model['phones'].union(match_phones(self.soup)) + else: + self.model['phones'] = match_phones(self.soup) + if len(self.model['phones']) > 20: + self.model['phones'] = list(self.model['phones'])[:19] + self.model['phones'] = remove_dublicates(self.model['phones']) + + def find_address(self): + + def find_regex(soup): + """ + Find address with regular expression(s) specified in regex/address.txt + """ + try: + address_regex = r'^([0-9\-/]+) ?([A-Za-z](?= ))? (.*?) ([^ ]+?) ?((?<= )[A-Za-z])? 
?((?<= )\d*)?$\ + ^([A-Za-z]+ ?)+[0-9]{3,6}$\ + ^([A-Za-z]+ ?)$' + except Exception: + pass + for script in soup(["script", "style"]): + script.extract() + text = soup.get_text() + address = re.search(address_regex, text) + if address: + address = address.group(0) + else: + address = None + + return address + + def find_base(soup, country='us'): + """ + Find addresses using pyap package + """ + for script in soup(["script", "style"]): + script.extract() + text = soup.get_text() + address = '' + + adr = pyap.parse(text, country='us') + if len(adr) > 0: + for item in adr: + address = address+' '+str(item) + + return address + + if self.model['address'] is None: + self.model['address'] = find_regex(self.soup) + if self.model['address'] is None: + base = find_base(self.soup, self.model['country']) + self.model['address'] = base + + if len(self.model['address']) > 0: + if define_country(self.model['country']) is not None: + self.model['country'] = define_country(self.model['country']) + + def find_email(self): + + def get_all_emails(soup): + """ + Get set of emails using regular expression + """ + + emails = set() + + email_pattern = r'[A-Za-z0-9]*@{1}[A-Za-z0-9]*\.(com|org|de|edu|gov|uk|au|net|me){1}' + for script in soup(["script", "style"]): + script.extract() + + for each in soup.get_text().split('\n'): + email_re = re.search(email_pattern, each) + if email_re: + if len(email_re.group(0)) > 5 \ + and len(email_re.group(0)) < 75: + emails.add(email_re.group(0)) + + return emails + + def keep_with_keywords(emails, keywords): + """ + Filter emails and keep one of the set found, as for my task + either one with keywors specified in regex/email_keywors.txt + or the first one if there are none which contain needed + keywords + """ + + for word in keywords: + if word in ''.join(list(emails)): + for email in emails: + if word in email: + return email + + if len(list(emails)) > 0: + return list(emails)[0] + return None + + def remove_junk_numns(email): + + while email[0].isdigit(): + email = email[1:] + + return email + + mails = get_all_emails(self.soup) + self.model['email'] = keep_with_keywords(mails, email_keywords) + if self.model['email']: + self.model['email'] = remove_junk_numns(self.model['email']) + + def find_links(self): + + def find_raw_links(soup): + """ + Find links: + external: social media links + internal: links to policies, faq, etc + """ + + links = {} + for each in soup.find_all('a'): + for ext_key, ext_val in external_links.items(): + if ext_val in str(each.get('href')) \ + and str(each.get('href')).endswith(str(ext_val)) is False \ + and str(each.get('href')).endswith(str(ext_val)+'/') is False: + #print('Link', str(each.get('href')), 'does not end with', ext_val, 'and does not end with', ext_val+'/') + #print(str(each.get('href')).endswith(str(ext_val))) + #print(str(each.get('href')).endswith(str(ext_val)+'/')) + links[ext_key] = str(each.get('href')) + + for int_key, int_val in internal_links.items(): + try: + url = re.findall(int_val, each.get('href')) + if len(url) > 0: + links[int_key] = str(each.get('href')) + except Exception: + pass + + return links + + def build_links(links): + """ + Build links from raw scraped hfer attributes + """ + + for key, link in links.items(): + if link.startswith('http') or link.startswith('www'): + links[key] = self.fix_link(link) + continue + if link.startswith('//'): + links[key] = self.fix_link(link[2:]) + continue + if key in external_links.keys(): + continue + if link.startswith('/'): + if self.url.endswith('/'): + links[key] = 
self.url+link[1:] + else: + links[key] = self.url+link + else: + if link.startswith('http') is False \ + and link.startswith('www') is False: + if self.url.endswith('/'): + links[key] = self.url+link + else: + links[key] = self.url+'/'+link + + links = clean_links(links) + + return links + + def clean_links(links): + """ + Clean links which require login or sign up and containing + some search/meta data parameters + """ + stop_attrs = ['#', '?', 'login', 'signup', 'sign-up', 'sign_up', 'sharer'] + + for key, link in links.items(): + for attr in stop_attrs: + if attr in link: + links[key] = link.split(attr)[0] + + return links + + links = build_links(find_raw_links(self.soup)) + + for key, link in links.items(): + if link.endswith(key+'.com') or link.endswith(key+'.com/') \ + or link.endswith(self.model['url']) or link.endswith(self.model['url']+'/'): + self.model[key] = None + continue + self.model[key] = link + #print(key, link) + + def validate_address(self): + + def check_address(adr, geo_key=None): + """ + Validate address using geolocation API, first to make sure + scraped address is a valid one, seccond to fix if there + is any missing pieces and third to aid mt current task + """ + if geo_key: + + r = requests.get(f'https://geocoder.ls.hereapi.com/6.2/geocode.json?apiKey={geo_key}&searchtext={adr}') + + try: + location = json.loads(r.text)['Response']['View'][0]['Result'] + return location[0]['Location']['Address'] + + except Exception: + return None + else: + return adr + + def extend_addresses(address, geo_key=None): + """ + If address is a valid one break up address to corresponding + pieces (i.e. house number, street number, etc) + """ + + adr_dict = {} + + address = check_address(address, geo_key) + try: + if len(address.keys()) > 0: + for key in address.keys(): + if key == 'Label': + adr_dict['address'] = address[key].split(',')[0] + continue + if key == 'AdditionalData': + continue + if key == 'Country' and address[key] is not None: + c = define_country(address['Country']) + adr_dict['country'] = c + continue + adr_dict[key] = address[key] + except Exception: + adr_dict = None + + return adr_dict + + if self.model['address'] is not None and len(self.model['address']) > 0: + if extend_addresses(self.model['address'], self.geo_key) is not None: + extended = extend_addresses(self.model['address'], self.geo_key) + for key, val in extended.items(): + if key.lower() == 'country' and self.model['country']: + continue + self.model[key.lower()] = val + + else: + self.model['address'] = None + + if self.model['address']: + if len(self.model['address']) > 25: + self.model['address'] = None + + def fix_link(self, link): + """ + requests library does not handle well links in www.site.com format, + hence needs to be fixed to be the format 'https://www.site.com' + """ + if link.startswith('http') == False and link.startswith('www') == False: + return 'https://www.'+link + if link.startswith('www.'): + return 'https://'+link + return link + + def split_phones(self): + """ + Method to seperate found phones into individual ones + """ + if self.split_phones_to_cols: + for i in range(6): + try: + if self.model['phones'][i].startswith('+'): + self.model[f'phone_{i+1}'] = self.model['phones'][i] + continue + self.model[f'phone_{i+1}'] = self.model['phones'][i] + except IndexError: + pass + + def phones_to_string(self): + + string = '' + + for phone in self.model['phones']: + string = string+str(phone)+'; ' + + self.model['phones'] = string[:-2] + + def scrape_text(self, method): + """ + Scrape 
text of the page of interest; + credit for the module scrape_policy_text + goes to Olha Babich + """ + + for key, _ in internal_links.items(): + if self.model[key] is not None: + if 'contact' in key: + continue + text_key = key.split('_')[0]+'_text' + text_list = _get_text_list(self.model[key], method=method, web_driver=self.driver) + self.model[text_key] = text_list + try: + if self.model[text_key] is not None: + if self.model['company_name'] is not None: + text = ' '.join(text_generator(text_mas=self.model[text_key][0], + company_name=self.model['company_name'], + company_website='Website')) + self.model[text_key] = text + else: + text = ' '.join(text_generator(text_mas=self.model[text_key][0], + company_name='Company', + company_website='Website')) + self.model[text_key] = text + except TypeError: + self.model[key] = None + + try: + assert len(self.model[text_key]) > 1 + if self.model[text_key][0] is None \ + and self.model[text_key][1] is None: + self.model[text_key] = None + except Exception: + pass + + def remove_not_parsed(self): + """ + Common issue is incapability to render the JavaScript whcih + results in the text like 'Seems your browser is not using + JavaScript...' + """ + fields = ['faq_text', 'privacy_text', 'return_text', + 'shipping_text', 'terms_text', 'warranty_text'] + + for each in fields: + if self.model[each] is not None: + if 'JavaScript' in self.model[each]: + self.model[each] = None + + def scrape(self): + """ + General pipeline of methods to scrape the website + """ + self.soup = self.get_soup(self.url) + if self.soup is None: + return + self.init_model() + self.logging() + self.define_domain() + if not self.model['category']: + self.set_category() + if self.model['company_name'] is None: + self.get_name() + self.find_description() + self.find_address() + self.find_phones() + self.find_email() + self.find_links() + + if self.model['address'] is not None \ + or len(self.model['address']) != 0: + self.validate_address() + + if self.model['contact_link']: + self.soup = self.get_soup(self.model['contact_link']) + if self.soup is None: + return + if self.model['address'] is None or len(self.model['address']) == 0: + self.find_address() + self.validate_address() + self.find_phones() + if self.model['email'] is None or len(self.model['email']) == 0: + self.find_email() + if self.method == 'requests': + self.scrape_text(method='requests') + else: + self.scrape_text(method='webdriver') + + self.remove_not_parsed() + + if self.model['phones']: + fixed_phones = [] + for phone in list(self.model['phones']): + ph = process_phones(phone, self.model['country']) + fixed_phones.append(ph) + + self.model['phones'] = fixed_phones + else: + self.model['phones'] = None + + if self.model['phones']: + self.split_phones_to_cols = True + self.split_phones() + self.phones_to_string() + + if self.verbose == 1: + for key, val in self.model.items(): + print(key, ':', val) + + if self.driver: + self.driver.quit() + + return self.model diff --git a/build/lib/scrape_it/scrape_it_experimental.py b/build/lib/scrape_it/scrape_it_experimental.py new file mode 100644 index 0000000..d111986 --- /dev/null +++ b/build/lib/scrape_it/scrape_it_experimental.py @@ -0,0 +1,698 @@ +""" +Scrape_it + +Author: Valentyna Fihurska + +Lisence: Apache-2.0 + +Scrape_it is a tool for extracting valueble information +from the website of interest. Save your time on reading +and crawling through the website and leave it for Scrape_it! 
+ +Find an example how to run program in the run.py +or refer to README + + +Note on additional .txt files in regex directory + +name_stop_words.txt - Contains words to filter unneeded words + during the search of entity's name + +email_keywords.txt - Specific file to filter emails based on + keywords it might contain (this is + as needed for current task) + +regex.txt - some regular expressions to search phone numbers; + at the current time is not in use, phonenumbers + package is used instead; one of improvements + should be a workflow which would allow efficient + and accurate phone matching with good filter + pipeline from 'scraping trash' + +address.txt - some regular expressions to match addresses, + not perfect expesially given the diversity + of different address formats accross + different countries + +""" + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .correct_base import process_phones, define_country +from .regex import * +from bs4 import BeautifulSoup +import re +import phonenumbers +import pyap +import requests +import json +import os +from selenium import webdriver +from .scrape_policy_text import _get_text_list, text_generator +import tldextract + +""" +with open('regex/name_stop_words.txt', 'r') as file: + name_stop_words = [r.strip() for r in file.readlines()] + +with open('regex/phones_regex.txt', 'r') as file: + phone_regex = [r.strip() for r in file.readlines()] + +with open('regex/email_keywords.txt', 'r') as file: + email_keywords = [w.strip() for w in file.readlines()] + +with open('regex/js_keywords.txt', 'r') as file: + js_keywords = [w.strip() for w in file.readlines()] +""" + + +internal_links = {'contact_link': r'contact.*', + 'privacy_link': r'privacy.*policy', + 'shipping_link': r'(deliver|shiping).*(policy)*', + 'terms_link': r'term.*(condition|use|service)', + 'faq_link': r'(faq)|(frequently.*asked.*question)', + 'return_link': r'return.*', + 'warranty_link': r'(warrant)|(guarant)'} + +external_links = {'twitter': 'twitter.com', + 'facebook': 'facebook.com', + 'instagram': 'instagram.com', + 'pinterest': 'pinterest.com', + 'youtube': 'youtube.com', + 'linkedin': 'linkedin.com'} + + +class Scrape_it: + + def __init__(self, url, method='requests', country='us', + company_name=None, category=None, geo_key=None, + verbose=0, driver=None): + + self.url = url + self.method = method + #self.model = {'url': self.url, 'country': country, + # 'category': category, 'company_name': company_name} + self.soup = None + self.geo_key = geo_key + self.verbose = verbose + self.driver = driver + self.get_soup() + + + def logging(self): + """ + Log some text while scraping if verbose is set to 1 + """ + + if self.verbose == 1: + print('Scraping', self.url, '...') + + def define_domain(self): + """ + Define domain name of the link + """ + + def get_domain(url): + domain = tldextract.extract(str(url)).domain+'.'+tldextract.extract(str(url)).suffix + return ['https://'+domain, 'http://'+domain, 'https://www.'+domain] + + return get_domain(self.url) + + #self.model['url'] = 
get_domain(self.url) + + def get_soup(self): + """ + Gets soup object depending on the method + """ + url = self.define_domain(self.url) + if self.method == 'requests': + import requests + try: + for link in url: + r = requests.get(link) + if r.status_code == 200: + soup = BeautifulSoup(r.text, 'lxml') + break + + except Exception as e: + print(e, link) + + if self.method == 'webdriver': + from selenium import webdriver + options = webdriver.ChromeOptions() + options.add_argument('headless') + self.driver = webdriver.Chrome(executable_path='./chromedriver', + options=options) + + for link in url: + try: + self.driver.get(link) + soup = BeautifulSoup(self.driver.page_source, 'lxml') + break + except Exception as e: + print(e, link) + + try: + assert soup is not None + return soup + except Exception: + return + + #def scrape(self): + + #self.soup = self.get_soup(self.url) + + + + + +class CustomScrape(Scrape_it): + + def __init__(self): + super.__init__() + + self.model = {'url': self.url, 'country': country, + 'category': category, 'company_name': company_name} + + + def init_model(self): + """ + Current task of mine is reflected in the model; it is planned + by me to export models to seperate file and use different onces + as need or to simplify the process of defining the model and + needed methods to execute + """ + + self.model['url'] = self.url + self.model['company_name'] = self.model['company_name'] + self.model['country'] = self.model['country'] + self.model['category'] = self.model['category'] + self.model['contact_link'] = None + self.model['phones'] = None + self.model['phone_1'] = None + self.model['phone_2'] = None + self.model['phone_3'] = None + self.model['phone_4'] = None + self.model['phone_5'] = None + self.model['phone_6'] = None + self.model['address'] = None + self.model['state'] = None + self.model['county'] = None + self.model['city'] = None + self.model['street'] = None + self.model['housenumber'] = None + self.model['postalcode'] = None + self.model['district'] = None + self.model['email'] = None + self.model['facebook'] = None + self.model['instagram'] = None + self.model['linkedin'] = None + self.model['pinterest'] = None + self.model['twitter'] = None + self.model['youtube'] = None + self.model['faq_link'] = None + self.model['privacy_link'] = None + self.model['return_link'] = None + self.model['shipping_link'] = None + self.model['terms_link'] = None + self.model['warranty_link'] = None + self.model['faq_text'] = None + self.model['privacy_text'] = None + self.model['return_text'] = None + self.model['shipping_text'] = None + self.model['terms_text'] = None + self.model['warranty_text'] = None + + def clean_name(self): + """ + Since company name is scraped from code too it can + be messy and needs to be cleaned from short descriptions + """ + + delims = ['|', '-', ':'] + + if self.model['company_name']: + + for d in delims: + if d in self.model['company_name']: + for i, s in enumerate(self.model['company_name'].split(d)): + same_words = set(name_stop_words).intersection(s.split(' ')) + if len(same_words) > 0: + break + self.model['company_name'] = self.model['company_name'].split(d)[i] + break + return self.model['company_name'] + + def get_name(self): + """ + Get company name from the most likely places in html it could be found + """ + for script in self.soup(["script", "style"]): + script.extract() + + metas_og = ['og:site_name', 'og:title'] + metas = ['title', 'name'] + for meta in metas_og: + if self.model['company_name'] is None \ + or 
self.model['company_name'] == '': + try: + meta_name = self.soup.find('meta', attrs={'property': meta}) + self.model['company_name'] = meta_name.get('content') + except AttributeError: + pass + + for meta in metas: + if self.model['company_name'] is None \ + or self.model['company_name'] == '': + try: + meta_name = self.soup.find('meta', attrs={'name': meta}) + self.model['company_name'] = meta_name.get('content') + except AttributeError: + if self.soup.find('title'): + if len(self.soup.find('title')) > 0: + title = self.soup.find('title') + self.model['company_name'] = title.text + if self.model['company_name'] is not None: + if 'forbidden' in self.model['company_name'].lower() or\ + 'ngMeta' in self.model['company_name']: + self.model['company_name'] = None + if self.model['company_name']: + self.model['company_name'] = self.clean_name().strip() + + def find_phones(self): + + def get_from_href(soup): + """ + If phonenumbers package could not find any phone numbers + there could be some embedded in links as in + + """ + phones = set() + for script in soup(["script", "style"]): + script.extract() + + try: + + for line in soup.find_all('a'): + if line.get('href').startswith('tel:'): + phones.add(line.get('href')[3:]) + + return phones + + except AttributeError: + return None + + def match_phones(soup): + """ + Find phones using phonenumbers package, location is provided from + model's country value + """ + phones = set() + for script in soup(["script", "style"]): + script.extract() + for line in soup.get_text().split('\n'): + matches = phonenumbers.PhoneNumberMatcher(line, str(self.model["country"]).upper()) + for match in matches: + phones.add(phonenumbers.format_number(match.number, phonenumbers.PhoneNumberFormat.E164)) + + return phones + + self.model['phones'] = get_from_href(self.soup) + if self.model['phones']: + self.model['phones'] = self.model['phones'].union(match_phones(self.soup)) + else: + self.model['phones'] = match_phones(self.soup) + + def find_address(self): + + def find_regex(soup): + """ + Find address with regular expression(s) specified in regex/address.txt + """ + try: + address_regex = r'^([0-9\-/]+) ?([A-Za-z](?= ))? (.*?) ([^ ]+?) ?((?<= )[A-Za-z])? 
+    def validate_address(self):
+
+        def check_address(adr, geo_key=None):
+            """
+            Validate the address using a geolocation API: first to make sure
+            the scraped address is a valid one, second to fill in any missing
+            pieces and third to aid my current task
+            """
+            if geo_key:
+
+                r = requests.get(f'https://geocoder.ls.hereapi.com/6.2/geocode.json?apiKey={geo_key}&searchtext={adr}')
+
+                try:
+                    location = json.loads(r.text)['Response']['View'][0]['Result']
+                    return location[0]['Location']['Address']
+
+                except Exception:
+                    return None
+            else:
+                return adr
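+        # For reference, the Address dict returned by the HERE geocoder
+        # typically looks roughly like the sketch below (field availability
+        # varies by country and by how complete the query was):
+        #
+        #     {'Label': '123 Main St, Springfield, ...', 'Country': 'USA',
+        #      'State': 'IL', 'City': 'Springfield', 'Street': 'Main St',
+        #      'HouseNumber': '123', 'PostalCode': '62701',
+        #      'AdditionalData': [...]}
+        #
+        # extend_addresses() below maps these fields into the model, where
+        # they end up under lowercased keys.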
+        def extend_addresses(address, geo_key=None):
+            """
+            If the address is a valid one, break it up into its corresponding
+            pieces (i.e. house number, street, etc.)
+            """
+
+            adr_dict = {}
+
+            address = check_address(address, geo_key)
+            try:
+                if len(address.keys()) > 0:
+                    for key in address.keys():
+                        if key == 'Label':
+                            adr_dict['address'] = address[key].split(',')[0]
+                            continue
+                        if key == 'AdditionalData':
+                            continue
+                        if key == 'Country' and address[key] is not None:
+                            c = define_country(address['Country'])
+                            adr_dict['country'] = c
+                            continue
+                        adr_dict[key] = address[key]
+            except Exception:
+                adr_dict = None
+
+            return adr_dict
+
+        if self.model['address'] is not None and len(self.model['address']) > 0:
+            # call the geocoder once and reuse the result
+            extended = extend_addresses(self.model['address'], self.geo_key)
+            if extended is not None:
+                for key, val in extended.items():
+                    if key.lower() == 'country' and self.model['country']:
+                        continue
+                    self.model[key.lower()] = val
+
+            else:
+                self.model['address'] = None
+
+        if self.model['address']:
+            if len(self.model['address']) > 25:
+                self.model['address'] = None
+
+    def fix_link(self, link):
+        """
+        The requests library does not handle links in the www.site.com format
+        well, hence they need to be fixed to the 'https://www.site.com' format
+        """
+        if link.startswith('www.'):
+            return 'https://'+link
+        return link
+
+    def split_phones(self):
+        """
+        Split the found phones into the individual phone_1..phone_6 fields
+        """
+        if self.split_phones_to_cols:
+            for i in range(6):
+                try:
+                    if self.model['phones'][i].startswith('+'):
+                        self.model[f'phone_{i+1}'] = self.model['phones'][i]
+                        continue
+                    self.model[f'phone_{i+1}'] = self.model['phones'][i]
+                except IndexError:
+                    pass
+
+    def scrape_text(self, method):
+        """
+        Scrape the text of the pages of interest;
+        credit for the module scrape_policy_text
+        goes to Olha Babich
+        """
+
+        for key, _ in internal_links.items():
+            if self.model[key] is not None:
+                if 'contact' in key:
+                    continue
+                text_key = key.split('_')[0]+'_text'
+                text_list = _get_text_list(self.model[key], method=method, web_driver=self.driver)
+                self.model[text_key] = text_list
+                try:
+                    if self.model[text_key] is not None:
+                        if self.model['company_name'] is not None:
+                            text = ' '.join(text_generator(text_mas=self.model[text_key][0],
+                                                           company_name=self.model['company_name'],
+                                                           company_website=self.model['url']))
+                            self.model[text_key] = text
+                        else:
+                            text = ' '.join(text_generator(text_mas=self.model[text_key][0],
+                                                           company_name='Company',
+                                                           company_website=self.model['url']))
+                            self.model[text_key] = text
+                except TypeError:
+                    self.model[key] = None
+
+                try:
+                    assert len(self.model[text_key]) > 1
+                    if self.model[text_key][0] is None \
+                            and self.model[text_key][1] is None:
+                        self.model[text_key] = None
+                except Exception:
+                    pass
+
+    def remove_not_parsed(self):
+        """
+        A common issue is a page that needs JavaScript rendering, which
+        results in placeholder text like 'Seems your browser is not using
+        JavaScript...'
+        """
+        fields = ['faq_text', 'privacy_text', 'return_text',
+                  'shipping_text', 'terms_text', 'warranty_text']
+
+        for each in fields:
+            if self.model[each] is not None:
+                if 'JavaScript' in self.model[each]:
+                    self.model[each] = None
+
+    def scrape(self):
+        """
+        General pipeline of methods to scrape the website
+        """
+        self.soup = self.get_soup(self.url)
+        if self.soup is None:
+            return
+        self.init_model()
+        self.logging()
+        self.define_domain()
+        if self.model['company_name'] is None:
+            self.get_name()
+        self.find_address()
+        self.find_phones()
+        self.find_email()
+        self.find_links()
+
+        if self.model['address'] is not None \
+                and len(self.model['address']) != 0:
+            self.validate_address()
+
+        if self.model['contact_link']:
+            self.soup = self.get_soup(self.model['contact_link'])
+            if self.soup is None:
+                return
+            if self.model['address'] is None or len(self.model['address']) == 0:
+                self.find_address()
+                self.validate_address()
+            self.find_phones()
+            if self.model['email'] is None or len(self.model['email']) == 0:
+                self.find_email()
+
+        if self.method == 'requests':
+            self.scrape_text(method='requests')
+        else:
+            self.scrape_text(method='webdriver')
+
+        self.remove_not_parsed()
+
+        if self.model['phones']:
+            fixed_phones = []
+            for phone in list(self.model['phones']):
+                ph = process_phones(phone, self.model['country'])
+                fixed_phones.append(ph)
+
+            self.model['phones'] = fixed_phones
+        else:
+            self.model['phones'] = None
+
+        if self.model['phones']:
+            self.split_phones_to_cols = True
+            self.split_phones()
+
+        if self.verbose == 1:
+            for key, val in self.model.items():
+                print(key, ':', val)
+
+        if self.driver:
+            self.driver.quit()
+
+        return self.model
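+
+# Usage sketch (assumes the Scrape_it base class takes the target url first
+# and accepts the scraping method as a keyword; adjust to the real signature):
+#
+#     scraper = CustomScrape('https://example.com', method='requests',
+#                            country='us', category='Plumber')
+#     record = scraper.scrape()
+#     if record:
+#         print(record['company_name'], record['email'], record['phones'])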
\ No newline at end of file
diff --git a/dist/scrape_it-0.3.7-py2.py3-none-any.whl b/dist/scrape_it-0.3.7-py2.py3-none-any.whl
new file mode 100644
index 0000000..fd82804
Binary files /dev/null and b/dist/scrape_it-0.3.7-py2.py3-none-any.whl differ
diff --git a/dist/scrape_it-0.3.7.tar.gz b/dist/scrape_it-0.3.7.tar.gz
new file mode 100644
index 0000000..b312598
Binary files /dev/null and b/dist/scrape_it-0.3.7.tar.gz differ
diff --git a/scrape_it.egg-info/PKG-INFO b/scrape_it.egg-info/PKG-INFO
index b516894..b7f2071 100755
--- a/scrape_it.egg-info/PKG-INFO
+++ b/scrape_it.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrape-it
-Version: 0.3.6
+Version: 0.3.7
 Summary: Systemitized tool for scraping
 Home-page: https://github.com/erelin6613/Scrape_it
 Author: Valentyna Fihurska
diff --git a/scrape_it.egg-info/SOURCES.txt b/scrape_it.egg-info/SOURCES.txt
index 949fe10..2e407bf 100755
--- a/scrape_it.egg-info/SOURCES.txt
+++ b/scrape_it.egg-info/SOURCES.txt
@@ -8,6 +8,7 @@ scrape_it/models.py
 scrape_it/regex.py
 scrape_it/run.py
 scrape_it/scrape_it.py
+scrape_it/scrape_it_experimental.py
 scrape_it/scrape_policy_text.py
 scrape_it.egg-info/PKG-INFO
 scrape_it.egg-info/SOURCES.txt
diff --git a/scrape_it/__init__.py b/scrape_it/__init__.py
index 3cf7186..ff33d6b 100755
--- a/scrape_it/__init__.py
+++ b/scrape_it/__init__.py
@@ -51,7 +51,7 @@ from .scrape_it import Scrape_it
 
-__version__ = '0.3.7'
+__version__ = '0.3.8'
 
 if __name__ == '__main__':
     import doctest