From 55eb391f01097a1f59af4df4340287c88cf7b229 Mon Sep 17 00:00:00 2001 From: scrambldchannel <41974784+scrambldchannel@users.noreply.github.com> Date: Mon, 11 Nov 2019 02:29:45 +0100 Subject: [PATCH 1/2] basic attempt to parse rss feed --- espncricinfo/summary.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/espncricinfo/summary.py b/espncricinfo/summary.py index a8fcee4..0ae773b 100644 --- a/espncricinfo/summary.py +++ b/espncricinfo/summary.py @@ -1,18 +1,22 @@ import requests +from bs4 import BeautifulSoup class Summary(object): def __init__(self): - self.url = "http://www.espncricinfo.com/netstorage/summary.json" - self.json = self.get_json() - self.match_ids = self.json['matches'].keys() - self.all_matches = self.json['matches'].values() + self.url = "http://static.cricinfo.com/rss/livescores.xml" + self.xml = self.get_xml() + + self.match_ids = [] + self.match_urls = [] - def get_json(self): - r = requests.get(self.url) - return r.json() + for g in self.xml.findAll('guid'): + self.match_ids.append(g.text.strip().split('/')[-1].split('.')[0]) + self.match_urls.append(g.text.strip()) - def match(self, id): - m = self.json['matches'][id] - m['url'] = "http://www.espncricinfo.com"+m['url'] - return m + def get_xml(self): + r = requests.get(self.url) + if r.ok: + return BeautifulSoup(r.text) + else: + return None From 87d4c769ab24e02fb669155b26549c2287eeb6ac Mon Sep 17 00:00:00 2001 From: scrambldchannel <41974784+scrambldchannel@users.noreply.github.com> Date: Mon, 11 Nov 2019 13:00:54 +0100 Subject: [PATCH 2/2] used ET instead of bs for xml parsing --- espncricinfo/summary.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/espncricinfo/summary.py b/espncricinfo/summary.py index 0ae773b..b4148ac 100644 --- a/espncricinfo/summary.py +++ b/espncricinfo/summary.py @@ -1,22 +1,29 @@ import requests -from bs4 import BeautifulSoup +from xml.etree import ElementTree as ET class Summary(object): def __init__(self): self.url = "http://static.cricinfo.com/rss/livescores.xml" - self.xml = self.get_xml() + self.rss = self.get_rss() + self.matches = {} self.match_ids = [] - self.match_urls = [] - for g in self.xml.findAll('guid'): - self.match_ids.append(g.text.strip().split('/')[-1].split('.')[0]) - self.match_urls.append(g.text.strip()) + if len(self.rss) > 0: - def get_xml(self): + for i in self.rss[0].findall('item'): + desc = i.find('description').text + guid = i.find('guid').text + match_id = guid.split('/')[-1].split('.')[0] + self.matches[match_id] = {'description' : desc, 'url' : guid} + + self.match_ids = list(self.matches.keys()) + + def get_rss(self): + r = requests.get(self.url) if r.ok: - return BeautifulSoup(r.text) + return ET.fromstring(r.content) else: return None