From 55eb391f01097a1f59af4df4340287c88cf7b229 Mon Sep 17 00:00:00 2001 From: scrambldchannel <41974784+scrambldchannel@users.noreply.github.com> Date: Mon, 11 Nov 2019 02:29:45 +0100 Subject: [PATCH 1/6] basic attempt to parse rss feed --- espncricinfo/summary.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/espncricinfo/summary.py b/espncricinfo/summary.py index a8fcee4..0ae773b 100644 --- a/espncricinfo/summary.py +++ b/espncricinfo/summary.py @@ -1,18 +1,22 @@ import requests +from bs4 import BeautifulSoup class Summary(object): def __init__(self): - self.url = "http://www.espncricinfo.com/netstorage/summary.json" - self.json = self.get_json() - self.match_ids = self.json['matches'].keys() - self.all_matches = self.json['matches'].values() + self.url = "http://static.cricinfo.com/rss/livescores.xml" + self.xml = self.get_xml() + + self.match_ids = [] + self.match_urls = [] - def get_json(self): - r = requests.get(self.url) - return r.json() + for g in self.xml.findAll('guid'): + self.match_ids.append(g.text.strip().split('/')[-1].split('.')[0]) + self.match_urls.append(g.text.strip()) - def match(self, id): - m = self.json['matches'][id] - m['url'] = "http://www.espncricinfo.com"+m['url'] - return m + def get_xml(self): + r = requests.get(self.url) + if r.ok: + return BeautifulSoup(r.text) + else: + return None From 87d4c769ab24e02fb669155b26549c2287eeb6ac Mon Sep 17 00:00:00 2001 From: scrambldchannel <41974784+scrambldchannel@users.noreply.github.com> Date: Mon, 11 Nov 2019 13:00:54 +0100 Subject: [PATCH 2/6] used ET instead of bs for xml parsing --- espncricinfo/summary.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/espncricinfo/summary.py b/espncricinfo/summary.py index 0ae773b..b4148ac 100644 --- a/espncricinfo/summary.py +++ b/espncricinfo/summary.py @@ -1,22 +1,29 @@ import requests -from bs4 import BeautifulSoup +from xml.etree import ElementTree as ET class 
Summary(object): def __init__(self): self.url = "http://static.cricinfo.com/rss/livescores.xml" - self.xml = self.get_xml() + self.rss = self.get_rss() + self.matches = {} self.match_ids = [] - self.match_urls = [] - for g in self.xml.findAll('guid'): - self.match_ids.append(g.text.strip().split('/')[-1].split('.')[0]) - self.match_urls.append(g.text.strip()) + if len(self.rss) > 0: - def get_xml(self): + for i in self.rss[0].findall('item'): + desc = i.find('description').text + guid = i.find('guid').text + match_id = guid.split('/')[-1].split('.')[0] + self.matches[match_id] = {'description' : desc, 'url' : guid} + + self.match_ids = list(self.matches.keys()) + + def get_rss(self): + r = requests.get(self.url) if r.ok: - return BeautifulSoup(r.text) + return ET.fromstring(r.content) else: return None From cd2e9c798eecd90ffe1139502d664dbceaad6af4 Mon Sep 17 00:00:00 2001 From: scrambldchannel <41974784+scrambldchannel@users.noreply.github.com> Date: Tue, 12 Nov 2019 15:11:36 +0100 Subject: [PATCH 3/6] added ground not found exception --- espncricinfo/exceptions.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/espncricinfo/exceptions.py b/espncricinfo/exceptions.py index 0ee4e37..08d2db8 100644 --- a/espncricinfo/exceptions.py +++ b/espncricinfo/exceptions.py @@ -12,6 +12,15 @@ class MatchNotFoundError(TypeError): pass class PlayerNotFoundError(TypeError): + """ + Exception raised if a player_id is not valid or does not exist. + """ + pass + +class GroundNotFoundError(TypeError): + """ + Exception raised if a ground_id is not valid or does not exist. 
+ """ pass class NoScorecardError(TypeError): From 2c98d7f24613e0739ec711526b875000073b2e3b Mon Sep 17 00:00:00 2001 From: scrambldchannel <41974784+scrambldchannel@users.noreply.github.com> Date: Tue, 12 Nov 2019 15:11:46 +0100 Subject: [PATCH 4/6] added ground json parsing --- espncricinfo/ground.py | 72 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 espncricinfo/ground.py diff --git a/espncricinfo/ground.py b/espncricinfo/ground.py new file mode 100644 index 0000000..12011b2 --- /dev/null +++ b/espncricinfo/ground.py @@ -0,0 +1,72 @@ +import requests +from bs4 import BeautifulSoup +from espncricinfo.exceptions import GroundNotFoundError + +class Ground(object): + + def __init__(self, ground_id): + self.cricinfo_id = ground_id + self.url = "http://www.espncricinfo.com/ci/content/ground/{0}.html".format(str(self.cricinfo_id)) + self.json_url = "http://core.espnuk.org/v2/sports/cricket/venues/{0}".format(str(self.cricinfo_id)) + + self.json = self.get_json() + if self.json: + self.__unicode__ = self._full_name() + self.short_name = self._short_name() + self.capacity = self._capacity() + self.grass = self._grass() + self.indoor = self._indoor() + self.address = self._address() + self.city = self._city() + self.state = self._state() + self.zipcode = self._zipcode() + self.country = self._country() + self.summary = self._summary() + + def get_html(self): + r = requests.get(self.url) + if not r.ok: + raise GroundNotFoundError + else: + soup = BeautifulSoup(r.text, 'html.parser') + return soup.find("div", class_="pnl650T") + + def get_json(self): + r = requests.get(self.json_url) + if not r.ok: + raise GroundNotFoundError + else: + return r.json() + + def _full_name(self): + return self.json.get('fullName') + + def _short_name(self): + return self.json.get('shortName') + + def _capacity(self): + return self.json.get('capacity') + + def _grass(self): + return self.json.get('grass', False) + + def _indoor(self): + return 
self.json.get('indoor', False) + + def _address(self): + return self.json.get('address', {}) + + def _city(self): + return self.address.get('city') + + def _state(self): + return self.address.get('state') + + def _zipcode(self): + return self.address.get('zipCode') + + def _country(self): + return self.address.get('country') + + def _summary(self): + return self.address.get('summary') From a01942fe589d3cfc3418e67ed71ee842ad2fb26a Mon Sep 17 00:00:00 2001 From: scrambldchannel <41974784+scrambldchannel@users.noreply.github.com> Date: Wed, 13 Nov 2019 16:54:58 +0100 Subject: [PATCH 5/6] ground attributes html parsing added --- espncricinfo/ground.py | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/espncricinfo/ground.py b/espncricinfo/ground.py index 12011b2..8304673 100644 --- a/espncricinfo/ground.py +++ b/espncricinfo/ground.py @@ -9,7 +9,9 @@ def __init__(self, ground_id): self.url = "http://www.espncricinfo.com/ci/content/ground/{0}.html".format(str(self.cricinfo_id)) self.json_url = "http://core.espnuk.org/v2/sports/cricket/venues/{0}".format(str(self.cricinfo_id)) + self.parsed_html = self.get_html() self.json = self.get_json() + if self.json: self.__unicode__ = self._full_name() self.short_name = self._short_name() @@ -23,13 +25,12 @@ def __init__(self, ground_id): self.country = self._country() self.summary = self._summary() - def get_html(self): - r = requests.get(self.url) - if not r.ok: - raise GroundNotFoundError - else: - soup = BeautifulSoup(r.text, 'html.parser') - return soup.find("div", class_="pnl650T") + if self.parsed_html: + self.established = self._established() + self.floodlights_added = self._floodlights_added() + self.end_names = self._end_names() + self.home_team = self._home_team() + self.other_sports = self._other_sports() def get_json(self): r = requests.get(self.json_url) @@ -70,3 +71,26 @@ def _country(self): def _summary(self): return self.address.get('summary') + + def
get_html(self): + r = requests.get(self.url) + if not r.ok: + raise GroundNotFoundError + else: + soup = BeautifulSoup(r.text, 'html.parser').find('div', id= 'ciHomeContentlhs') + return soup + + def _established(self): + return self.parsed_html.find('div', id = 'stats').find('label', text = 'Established ').next_sibling + + def _floodlights_added(self): + return self.parsed_html.find('div', id = 'stats').find('label', text = 'Floodlights ').next_sibling + + def _end_names(self): + return self.parsed_html.find('div', id = 'stats').find('label', text = 'End names ').next_sibling + + def _home_team(self): + return self.parsed_html.find('div', id = 'stats').find('label', text = 'Home team ').next_sibling + + def _other_sports(self): + return self.parsed_html.find('div', id = 'stats').find('label', text = 'Other sports ').next_sibling From cedb34c53bead7993750568dd94e4f153b3abc48 Mon Sep 17 00:00:00 2001 From: scrambldchannel <41974784+scrambldchannel@users.noreply.github.com> Date: Wed, 13 Nov 2019 19:18:41 +0100 Subject: [PATCH 6/6] Added methods to parse first/lasts --- espncricinfo/ground.py | 80 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 5 deletions(-) diff --git a/espncricinfo/ground.py b/espncricinfo/ground.py index 8304673..04f81d8 100644 --- a/espncricinfo/ground.py +++ b/espncricinfo/ground.py @@ -31,6 +31,12 @@ def __init__(self, ground_id): self.end_names = self._end_names() self.home_team = self._home_team() self.other_sports = self._other_sports() + self.first_test = self._first_test() + self.last_test = self._last_test() + self.first_odi = self._first_odi() + self.last_odi = self._last_odi() + self.first_t20i = self._first_t20i() + self.last_t20i = self._last_t20i() def get_json(self): r = requests.get(self.json_url) @@ -80,17 +86,81 @@ def get_html(self): soup = BeautifulSoup(r.text, 'html.parser').find('div', id= 'ciHomeContentlhs') return soup + def _parse_ground_stats(self): + return self.parsed_html.find('div', id = 
'stats') + + def _parse_ground_records(self): + return self.parsed_html.find('div', id = 'recs') + def _established(self): - return self.parsed_html.find('div', id = 'stats').find('label', text = 'Established ').next_sibling + t = self._parse_ground_stats().find('label', text = 'Established ') + if t: + return t.next_sibling def _floodlights_added(self): - return self.parsed_html.find('div', id = 'stats').find('label', text = 'Floodlights ').next_sibling + t = self._parse_ground_stats().find('label', text = 'Floodlights ') + if t: + return t.next_sibling def _end_names(self): - return self.parsed_html.find('div', id = 'stats').find('label', text = 'End names ').next_sibling + t = self._parse_ground_stats().find('label', text = 'End names ') + if t: + return t.next_sibling def _home_team(self): - return self.parsed_html.find('div', id = 'stats').find('label', text = 'Home team ').next_sibling + t = self._parse_ground_stats().find('label', text = 'Home team ') + if t: + return t.next_sibling def _other_sports(self): - return self.parsed_html.find('div', id = 'stats').find('label', text = 'Other sports ').next_sibling + t = self._parse_ground_stats().find('label', text = 'Other sports ') + if t: + return t.next_sibling + + def _first_test(self): + for tr in self._parse_ground_records().find_all('tr'): + if tr.find('label', text = 'First Test'): + url = 'http://www.espncricinfo.com' + tr.find('a')['href'] + title = tr.find_all('td')[1].text + match_id = int(url.split('/')[-1].split('.')[0]) + return {'url': url, 'match_id': match_id, 'title': title} + + def _last_test(self): + for tr in self._parse_ground_records().find_all('tr'): + if tr.find('label', text = 'Last Test'): + url = 'http://www.espncricinfo.com' + tr.find('a')['href'] + title = tr.find_all('td')[1].text + match_id = int(url.split('/')[-1].split('.')[0]) + return {'url': url, 'match_id': match_id, 'title': title} + + def _first_odi(self): + for tr in self._parse_ground_records().find_all('tr'): + if 
tr.find('label', text = 'First ODI'): + url = 'http://www.espncricinfo.com' + tr.find('a')['href'] + title = tr.find_all('td')[1].text + match_id = int(url.split('/')[-1].split('.')[0]) + return {'url': url, 'match_id': match_id, 'title': title} + + def _last_odi(self): + for tr in self._parse_ground_records().find_all('tr'): + if tr.find('label', text = 'Last ODI'): + url = 'http://www.espncricinfo.com' + tr.find('a')['href'] + title = tr.find_all('td')[1].text + match_id = int(url.split('/')[-1].split('.')[0]) + return {'url': url, 'match_id': match_id, 'title': title} + + def _first_t20i(self): + for tr in self._parse_ground_records().find_all('tr'): + if tr.find('label', text = 'First T20I'): + url = 'http://www.espncricinfo.com' + tr.find('a')['href'] + title = tr.find_all('td')[1].text + match_id = int(url.split('/')[-1].split('.')[0]) + return {'url': url, 'match_id': match_id, 'title': title} + + def _last_t20i(self): + for tr in self._parse_ground_records().find_all('tr'): + if tr.find('label', text = 'Last T20I'): + url = 'http://www.espncricinfo.com' + tr.find('a')['href'] + title = tr.find_all('td')[1].text + match_id = int(url.split('/')[-1].split('.')[0]) + return {'url': url, 'match_id': match_id, 'title': title}