class DataExtractor:
    """
    Use to extract, cleanse, sum and amend incorrect website data collection.

    Records are dictionaries. The methods read the keys "value", "domain", "url" and "secure" and tolerate
    records in which any of them is missing. No method modifies the records passed to the constructor:
    the amending methods return shallow copies.
    """

    def __init__(self, data):
        """
        :param data: list(dict), list of web records to work on.
        """
        self.data = data

    def find_items(self, value=4):
        """
        Find and return a new list of items where key "value" is greater than or equal to parameter value.
        Records without a "value" key, or with "value" set to None, never match.
        :param value: int, value to find items for.
        :return: list(dict), list of dictionaries matching the above filtering rule.
        """
        # Compare against None explicitly: a plain truthiness test would also reject a legitimate value of 0.
        return [
            item for item in self.data
            if item.get('value') is not None and item['value'] >= value
        ]

    def amend_domain_values(self, prefix='www.'):
        """
        Fixes missing parts of the domain names. By default we add missing 'www.'.
        Records with a missing or empty "domain" are returned without a prefix added.
        :param prefix: str, prefix to add to the domain name.
        :return: amended: list(dict), amended list of web records (copies, the input records are untouched).
        """
        amended = []
        for item in self.data:
            # Work on a copy so the caller's records are not changed behind its back.
            record = item.copy()
            domain = record.get('domain')
            if domain and not domain.startswith(prefix):
                record['domain'] = f"{prefix}{domain}"
            amended.append(record)
        return amended

    def cleanse_data(self):
        """
        Fix errors in "secure" key values. All urls starting with https are set to "secure": True, those starting
        with http to "secure": False. The scheme is matched case-insensitively. Records with a missing or empty
        "url", or with any other scheme, keep whatever "secure" value they had.
        :return: amended: list(dict), amended list of web records (copies, the input records are untouched).
        """
        amended = []
        for item in self.data:
            # Work on a copy so the caller's records are not changed behind its back.
            record = item.copy()
            url = record.get('url')
            if url:
                # Only the first six characters are needed to tell 'https:' from 'http:'.
                scheme = url[:6].lower()
                if scheme.startswith('https:'):
                    record['secure'] = True
                elif scheme.startswith('http:'):
                    record['secure'] = False
            amended.append(record)
        return amended

    def get_value_sum(self):
        """
        Returns sum of all value keys in the data set. Records with a missing "value", or with "value" set to
        None, count as 0.
        :return: int, sum of all value keys in the data set.
        """
        # `or 0` covers both a missing key and an explicit None, which sum() could not add.
        return sum(item.get('value') or 0 for item in self.data)
import copy

from data_extractor import DataExtractor
from websites.resources.data import WEBSITES


class TestDataExtractor:
    """
    Tests for DataExtractor, run against the shared WEBSITES records and against small inline data sets.
    """

    def setup_method(self):
        """
        Build a fresh extractor over a private deep copy of WEBSITES before every test, so that no test can
        leak changes to the shared records into another test and the outcome does not depend on test order.
        """
        self.data_extractor = DataExtractor(copy.deepcopy(WEBSITES))

    def test_find_items(self):
        """The default threshold of 4 returns only the records whose value is 4 or more."""
        expected = [
            {
                'name': 'Google',
                'url': 'https://www.google.co.uk',
                'domain': 'google.co.uk',
                'secure': True,
                'value': 5},
            {
                'name': 'Facebook',
                'url': 'https://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/',
                'domain': 'facebook.com',
                'secure': True,
                'value': 4},
            {
                'name': 'YouTube',
                'url': 'https://www.youtube.com/watch?v=09Cd7NKKvDc',
                'domain': 'youtube.com',
                'secure': True,
                'value': 5
            }
        ]
        assert self.data_extractor.find_items() == expected

    def test_find_items_none_found(self):
        """A threshold above every value yields an empty list."""
        assert self.data_extractor.find_items(100) == []

    def test_find_items_all_matching(self):
        """A threshold of 1 is expected to return every record of WEBSITES."""
        assert self.data_extractor.find_items(1) == WEBSITES

    def test_amend_domain_values(self):
        """Every domain comes back with the default 'www.' prefix; all other keys are unchanged."""
        expected = [
            {
                'name': 'Google',
                'url': 'https://www.google.co.uk',
                'domain': 'www.google.co.uk',
                'secure': True,
                'value': 5},
            {
                'name': 'Facebook',
                'url': 'https://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/',
                'domain': 'www.facebook.com',
                'secure': True, 'value': 4},
            {
                'name': 'Bing',
                'url': 'https://www.bing.com/search?q=athlete&qs=n&form=QBLH&sp=-1&pq=athlete&sc=8-7&sk=&cvid=53830DD7FB2E47B7A5D9CF27F106BC9A',
                'domain': 'www.bing.com',
                'secure': False,
                'value': 3
            },
            {
                'name': 'Ask',
                'url': 'https://uk.ask.com/web?o=0&l=dir&qo=serpSearchTopBox&q=jupiter',
                'domain': 'www.ask.com',
                'secure': False,
                'value': 1},
            {
                'name': 'Duck Duck Go',
                'url': 'http://duckduckgo.com/?q=plane&t=h_&ia=web',
                'domain': 'www.duckduckgo.com',
                'secure': True,
                'value': 2
            },
            {
                'name': 'Vimeo',
                'url': 'https://vimeo.com/53812885',
                'domain': 'www.vimeo.com',
                'secure': False,
                'value': 2
            },
            {
                'name': 'YouTube',
                'url': 'https://www.youtube.com/watch?v=09Cd7NKKvDc',
                'domain': 'www.youtube.com',
                'secure': True,
                'value': 5
            },
            {
                'name': 'Daily Motion',
                'url': 'http://www.dailymotion.com/search/football',
                'domain': 'www.dailymotion.com',
                'secure': True,
                'value': 1
            }
        ]
        assert self.data_extractor.amend_domain_values() == expected

    def test_amend_domain_values_retains_original_if_prefix_matching(self):
        """A domain that already starts with the prefix is returned as it was."""
        test_data = [
            {
                'name': 'Google',
                'url': 'https://www.google.co.uk',
                'domain': 'www.google.co.uk',
                'secure': True,
                'value': 5
            }
        ]
        _data_extractor = DataExtractor(test_data)
        assert _data_extractor.amend_domain_values() == test_data

    def test_cleanse_data(self):
        """Wrong "secure" flags are corrected in both directions; correct flags are left alone."""
        test_data = [
            {
                'name': 'Google',
                'url': 'https://www.google.co.uk',
                'domain': 'google.co.uk',
                'secure': False,
                'value': 5
            },
            {
                'name': 'Facebook',
                'url': 'http://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/',
                'domain': 'facebook.com',
                'secure': True,
                'value': 4
            },
            {
                'name': 'Bing',
                'url': 'http://www.bing.com/search?q=athlete&qs=n&form=QBLH&sp=-1&pq=athlete&sc=8-7&sk=&cvid=53830DD7FB2E47B7A5D9CF27F106BC9A',
                'domain': 'bing.com',
                'secure': False,
                'value': 3
            },
            {
                'name': 'Duck Duck Go',
                'url': 'https://duckduckgo.com/?q=plane&t=h_&ia=web',
                'domain': 'duckduckgo.com',
                'secure': True,
                'value': 2
            },
        ]

        expected = [
            {
                'name': 'Google',
                'url': 'https://www.google.co.uk',
                'domain': 'google.co.uk',
                'secure': True,
                'value': 5
            },
            {
                'name': 'Facebook',
                'url': 'http://developers.facebook.com/blog/post/2018/10/02/facebook-login-update/',
                'domain': 'facebook.com',
                'secure': False,
                'value': 4
            },
            {
                'name': 'Bing',
                'url': 'http://www.bing.com/search?q=athlete&qs=n&form=QBLH&sp=-1&pq=athlete&sc=8-7&sk=&cvid=53830DD7FB2E47B7A5D9CF27F106BC9A',
                'domain': 'bing.com',
                'secure': False,
                'value': 3
            },
            {
                'name': 'Duck Duck Go',
                'url': 'https://duckduckgo.com/?q=plane&t=h_&ia=web',
                'domain': 'duckduckgo.com',
                'secure': True,
                'value': 2
            },
        ]
        _data_extractor = DataExtractor(test_data)
        assert _data_extractor.cleanse_data() == expected

    def test_get_value_sum(self):
        """The values of the WEBSITES records are expected to add up to 23."""
        assert self.data_extractor.get_value_sum() == 23

    def test_get_value_sum_empty_data_set(self):
        """An empty data set sums to 0."""
        _data_extractor = DataExtractor([])
        assert _data_extractor.get_value_sum() == 0