Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse_wikidump: programmatic access #28

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 81 additions & 58 deletions semanticizest/parse_wikidump/__main__.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,22 @@
"""parse_wikidump

Usage:
parse_wikidump [options] <dump> <model-filename>
parse_wikidump --download=<wikiname> <model-filename>

Options:
--download=wikiname Download dump from dumps.wikimedia.org first
--ngram=<order>, -N <order>
Maximum order of ngrams, set to None to disable
[default: 7]
--help, -h This help
"""
Parse Wikidump

Reads in a Wikipedia snapshot file, or downloads it if it doesn't exist locally.
Then it attempts to parse it and store it in an SQLite3 database, which it first
initializes.
"""
from __future__ import print_function

import logging
import re
import sqlite3
import sys
import errno

from six.moves.urllib.error import HTTPError
from six.moves.urllib.request import urlretrieve

import argparse
from docopt import docopt

from . import parse_dump
Expand All @@ -43,6 +39,41 @@ def __call__(self, n_blocks, blocksize, totalsize):
self.threshold += .05


class Db(object):
    """Thin wrapper around the SQLite connection that stores the model.

    Intended call order is ``connect()``, ``setup()``, then use of the raw
    connection via the ``db`` attribute, and finally ``disconnect()``.

    Attributes:
        db_fname: Path of the SQLite database file, as given to the
            constructor.
        db: The open ``sqlite3`` connection, or ``None`` while not connected.
    """

    def __init__(self, fname):
        self.db_fname = fname
        # None (rather than an empty string) marks "not connected", so that
        # accidental use before connect() cannot be mistaken for a string.
        self.db = None

    def connect(self):
        """Open the database file and keep the connection in ``self.db``.

        Exits through ``die()`` when SQLite reports that the file cannot be
        opened; any other ``sqlite3.OperationalError`` propagates unchanged.
        """
        try:
            self.db = sqlite3.connect(self.db_fname)
        except sqlite3.OperationalError as e:
            if 'unable to open' not in str(e):
                raise
            # This exception doesn't store the path.
            die("%s: %r" % (e, self.db_fname))

    def disconnect(self):
        """Close the connection if one is open; otherwise do nothing."""
        if self.db is not None:
            self.db.close()
            # Drop the closed handle so later calls see "not connected".
            self.db = None

    def setup(self):
        """Create the database schema by running the table-creation script.

        Raises:
            RuntimeError: if called before ``connect()``.

        Exits through ``die()`` when the tables already exist; any other
        ``sqlite3.OperationalError`` propagates unchanged.
        """
        if self.db is None:
            raise RuntimeError(
                "Db.setup() called before Db.connect() for %r"
                % self.db_fname)

        logger.info("Creating database at %r", self.db_fname)
        with open(createtables_path()) as f:
            create = f.read()

        c = self.db.cursor()
        try:
            c.executescript(create)
        except sqlite3.OperationalError as e:
            if not re.search(r'table .* already exists', str(e)):
                raise
            die("database %r already populated" % self.db_fname)


DUMP_TEMPLATE = (
"https://dumps.wikimedia.org/{0}/latest/{0}-latest-pages-articles.xml.bz2")

Expand All @@ -52,53 +83,45 @@ def die(msg):
sys.exit(1)


def main(args):
args = docopt(__doc__, args)

if args["--download"]:
wikidump = args["--download"] + ".xml.bz2"
else:
wikidump = args['<dump>']

model_fname = args['<model-filename>']
ngram = args['--ngram']
if ngram == "None":
ngram = None
else:
ngram = int(ngram)

logger.info("Creating database at %r" % model_fname)
try:
db = sqlite3.connect(model_fname)
except sqlite3.OperationalError as e:
if 'unable to open' in str(e):
# This exception doesn't store the path.
die("%s: %r" % (e, model_fname))
else:
raise
with open(createtables_path()) as f:
create = f.read()
if __name__ == '__main__':
parser = argparse.ArgumentParser(prog="semanticizer.parse_wikidump", description="Semanticizest Wiki parser")
parser.add_argument('snapshot',
help='Local Wikipedia snapshot to use.')
parser.add_argument('model',
help='File to store the model.')
parser.add_argument('--download', dest='download', action="store_true",
help='Download snapshot if it does not exist as snapshot.xml.bz2. The corpus file name should match that of snapshot.')
parser.add_argument('-N', '--ngram', dest='ngram', default=7, type=int,
help='Maximum order of ngrams, set to None to disable [default: 7].')
args = parser.parse_args()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think @IsaacHaze will be very unhappy when he sees his darling docopt replaced by ArgumentParser...

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I didn't know. I was inspired by how things are done in xtas.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That was written before I knew about docopt :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

:'(

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, so docopt is the current default for parsing args?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alright. I'll revert the arg parsing logic to use docopt, then.


c = db.cursor()
try:
c.executescript(create)
except sqlite3.OperationalError as e:
if re.search(r'table .* already exists', str(e)):
die("database %r already populated" % model_fname)
fh = open(args.snapshot, 'r')
except (IOError, OSError) as e:
if e.errno == errno.ENOENT and args.download:
m = re.match(r"(.+?)\.xml")
if m:
args.snapshot = m.group(1)
url = DUMP_TEMPLATE.format(args.snapshot)
print(url)
args.snapshot = args.snapshot + ".xml.bz2"
try:
urlretrieve(url, args.snapshot, Progress())
except HTTPError as e:
die("Cannot download {0!r}: {1}".format(url, e))
else:
raise

if args["--download"]:
url = DUMP_TEMPLATE.format(args["--download"])
logger.info("Saving wikidump to %r", wikidump)
try:
urlretrieve(url, wikidump, Progress())
except HTTPError as e:
die("Cannot download {0!r}: {1}".format(url, e))

parse_dump(wikidump, db, N=ngram)
db.close()


if __name__ == '__main__':
main(sys.argv[1:])
else:
fh.close()

# Init, connect to DB and setup db schema
db = Db(args.model)
db.connect()
db.setup()

# Parse wiki snapshot and store it to DB
parse_dump(args.snapshot, db.db, N=args.ngram)

# Close connection to DB and exit
db.disconnect()