From 6dd0b22c3752b64e09a4bbc36ca4d3988e45361c Mon Sep 17 00:00:00 2001 From: Alexandre Magno Date: Fri, 10 Jul 2015 04:57:59 -0300 Subject: [PATCH] Option -s/--sync --- paicemana/scripts/cli.py | 17 ++++++++++------- paicemana/textdownload.py | 12 +++++++++--- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/paicemana/scripts/cli.py b/paicemana/scripts/cli.py index c5dcd21..2227d58 100644 --- a/paicemana/scripts/cli.py +++ b/paicemana/scripts/cli.py @@ -14,17 +14,20 @@ @click.command(context_settings=CONTEXT_SETTINGS) @click.option('-g', '--archive', type=int, help='Number in permalink like www.weeklyosm.eu/archives/4205') -def cli(archive): +@click.option('-s', '--sync', is_flag=True, + help='Downloads the brazilian version already published') +def cli(archive, sync): """A helper script for works at OSMBrasil/semanario""" if not archive: raise click.UsageError('try the -h/--help option') try: - download = MarkdownDownload(archive) - analyzer = MarkdownAnalyzer(download.filename) - organizer = analyzer.getOrganizer() - translators = ['alexandre-mbm', 'jgpacker', 'vgeorge'] - organizer.distribute_for(translators) - print('\n%s\n\n%s\n' % (organizer, organizer.scores())) + download = MarkdownDownload(archive, sync) + if not sync: + analyzer = MarkdownAnalyzer(download.filename) + organizer = analyzer.getOrganizer() + translators = ['alexandre-mbm', 'jgpacker', 'vgeorge'] + organizer.distribute_for(translators) + print('\n%s\n\n%s\n' % (organizer, organizer.scores())) except HTTPError as e: click.echo(e) diff --git a/paicemana/textdownload.py b/paicemana/textdownload.py index b5c778d..acf0cc8 100644 --- a/paicemana/textdownload.py +++ b/paicemana/textdownload.py @@ -8,18 +8,20 @@ class MarkdownDownload(object): """Class to download text weeklyosm.eu""" - def __init__(self, archive): + def __init__(self, archive, sync=False): """ @params archive - number in permalink like www.weeklyosm.eu/archives/4205 + sync - True for downloading the brazilian version already published """ - self.url = 'http://www.weeklyosm.eu/archives/%s' % archive + lang = 'en' if not sync else 'pt' + self.url = 'http://www.weeklyosm.eu/%s/archives/%s' % (lang, archive) self.page = html.fromstring(urllib.request.urlopen(self.url).read()) root = self.page.xpath('//article')[0] etree.strip_tags(root,'div','span') - root_html = etree.tostring(root, pretty_print=True) + root_html = etree.tostring(root, encoding='utf-8', pretty_print=True, method='html') markdown = html2text.html2text(root_html.decode('utf-8')) @@ -29,7 +31,11 @@ def __init__(self, archive): s = re.sub(r'\n', '\n\n', s) s = re.sub(r'\n\n\n\n?', '\n\n', s) s = re.sub(r'…', '...', s) + s = re.sub(r'“', '"', s) + s = re.sub(r'”', '"', s) + s = re.sub(r' \[\]\(.*\/OSMBrasil\/semanario.*\n\n.*\)', '', s) s = s.split('### Share this:')[0] + s = s.split('### Compartilhe isso:')[0] markdown = s #print(markdown)