From 4e661de8b7da6b9231848047587297c935f6620c Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Tue, 13 Oct 2020 22:23:16 -0400 Subject: [PATCH 01/41] Allowing binding to 0.0.0.0 instead of 127.0.0.1 --- bookwormDB/manager.py | 8 +++++--- bookwormDB/wsgi.py | 33 +++++++++++++++++++-------------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/bookwormDB/manager.py b/bookwormDB/manager.py index 11f7350..c66ccbd 100644 --- a/bookwormDB/manager.py +++ b/bookwormDB/manager.py @@ -140,14 +140,14 @@ def query(self, args): caller = SQLAPIcall(query) print(caller.execute()) - def serve(self,args): + def serve(self, args): """ Serve the api. """ from bookwormDB.wsgi import run - run(args.bind, args.workers) + run(args.port, args.bind, args.workers) import http.server from http.server import HTTPServer @@ -579,7 +579,9 @@ def run_arguments(): serve_parser.add_argument("--full-site", action = "store_true", help="Serve a webpage as well as a query endpoint? Not active.") - serve_parser.add_argument("--bind", "-b", default="10012", help="The port over which to serve the bookworm",type=int) + serve_parser.add_argument("--port", "-p", default="10012", help="The port over which to serve the bookworm", type=int) + + serve_parser.add_argument("--bind", "-b", default="127.0.0.1", help="The IP address to bind the server to.", type=str) serve_parser.add_argument("--workers", "-w", default="0", help="How many gunicorn worker threads to launch for the API. Reduce if you're seeing memory issues.",type=int) diff --git a/bookwormDB/wsgi.py b/bookwormDB/wsgi.py index 9cf4b68..fdf5de8 100644 --- a/bookwormDB/wsgi.py +++ b/bookwormDB/wsgi.py @@ -11,16 +11,16 @@ def content_type(query): format = query['format'] except: return 'text/plain' - + if format == "json": return "application/json" - + if format == "feather": return "application/octet-stream" - + if format == "html": return "text/html" - + return 'text/plain' def application(environ, start_response, logfile = "bookworm_queries.log"): @@ -43,7 +43,7 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): if ip is None: ip = environ.get('REMOTE_ADDR') query = unquote(q) - + headers = { 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Methods': 'GET, POST, PUT, OPTIONS', @@ -53,7 +53,7 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): } - + logging.debug("Received query {}".format(query)) start = datetime.now() @@ -61,7 +61,7 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): # a named argument. query = query.strip("query=") query = query.strip("queryTerms=") - + try: query = json.loads(query) query['ip'] = ip @@ -76,10 +76,10 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): # It might be binary already. headers['Content-type'] = content_type(query) - + if headers['Content-type'] != 'application/octet-stream': response_body = bytes(response_body, 'utf-8') - + headers['Content-Length'] = str(len(response_body)) status = '200 OK' start_response(status, list(headers.items())) @@ -100,9 +100,11 @@ def number_of_workers(): return (multiprocessing.cpu_count() * 2) + 1 class StandaloneApplication(gunicorn.app.base.BaseApplication): + """ Superclassed to allow bookworm to do the running. 
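# A minimal usage sketch (not part of the patch): with the new run() signature
# introduced here, serving on all interfaces is roughly equivalent to running
# gunicorn by hand with "gunicorn -w 4 -b 0.0.0.0:10012 <module>:application";
# the module path is deliberately left unspecified.
from bookwormDB.wsgi import run

run(port=10012, bind="0.0.0.0", workers=4)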
""" + def __init__(self, app, options=None): self.options = options or {} self.application = app @@ -117,14 +119,17 @@ def load_config(self): def load(self): return self.application -def run(port = 10012, workers = number_of_workers()): +def run(port = 10012, bind="0.0.0.0", workers = number_of_workers()): + """ + port: the service port + bind: the host to bind to. + """ if workers==0: workers = number_of_workers() - + options = { - 'bind': '{}:{}'.format('127.0.0.1', port), + 'bind': f'{bind}:{port}', 'workers': workers, } - + StandaloneApplication(application, options).run() - From 5134324fed142a70370ab17ca0bf30b960246043 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Wed, 14 Oct 2020 11:04:43 -0400 Subject: [PATCH 02/41] change user to allow docker --- bookwormDB/CreateDatabase.py | 60 +++++++++++++++++++----------------- bookwormDB/configuration.py | 5 ++- 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/bookwormDB/CreateDatabase.py b/bookwormDB/CreateDatabase.py index f2ee016..0f8b305 100755 --- a/bookwormDB/CreateDatabase.py +++ b/bookwormDB/CreateDatabase.py @@ -33,7 +33,7 @@ def __init__(self, dbname = None): if not re.match("^[A-Za-z0-9_]+$", self.dbname): raise NameError("Database names must not include any spaces or special characters") self.conn = None - + def connect(self, setengine=True): #These scripts run as the Bookworm _Administrator_ on this machine; defined by the location of this my.cnf file. conf = Configfile("admin") @@ -61,7 +61,7 @@ def connect(self, setengine=True): self.conn = MySQLdb.connect(**connect_args) else: raise - + cursor = self.conn.cursor() cursor.execute("CREATE DATABASE IF NOT EXISTS %s default character set utf8" % self.dbname) # Don't use native query attribute here to avoid infinite loops @@ -87,14 +87,14 @@ def query(self, sql, params = None, many_params=None): provided. 
""" logging.debug(" -- Preparing to execute SQL code -- " + sql) - logging.debug(" -- with params {}".format(params)) - + logging.debug(" -- with params {}".format(params)) + try: cursor = self.conn.cursor() if many_params is not None: cursor.executemany(sql, many_params) else: - + cursor.execute(sql) except: try: @@ -133,9 +133,9 @@ def __init__(self, dbname=None, """ self.config_manager = Configfile("admin") config = self.config_manager.config - + self.dbname = dbname - + self.conn = None if self.dbname is not None: @@ -144,7 +144,7 @@ def __init__(self, dbname=None, self.db = DB(dbname=self.dbname) else: self.db = None - + if variableFile is not None: try: self.setVariables(originFile=variableFile) @@ -163,12 +163,14 @@ def grantPrivileges(self): username=globalfile.config.get("client","user") password=globalfile.config.get("client","password") + clienthostname=globalfile.config.get("client","clienthostname") + try: - self.db.query("GRANT SELECT ON %s.* TO '%s'@'localhost' IDENTIFIED BY '%s'" % (self.dbname,username,password)) + self.db.query("GRANT SELECT ON %s.* TO '%s'@'%s' IDENTIFIED BY '%s'" % (self.dbname,username,clienthostname, password)) except MySQLdb._exceptions.OperationalError: - self.db.query("CREATE USER '%s'@'localhost' IDENTIFIED BY '%s'" % (username,password)) - self.db.query("GRANT SELECT ON %s.* TO '%s'@'localhost' IDENTIFIED BY '%s'" % (self.dbname,username,password)) - + self.db.query("CREATE USER '%s'@'%s' IDENTIFIED BY '%s'" % (username,clienthostname,password)) + self.db.query("GRANT SELECT ON %s.* TO '%s'@'%s' IDENTIFIED BY '%s'" % (self.dbname,username,clienthostname,password)) + def setVariables(self, originFile, anchorField="bookid", jsonDefinition=".bookworm/metadata/field_descriptions_derived.json"): self.variableSet = variableSet(originFile=originFile, anchorField=anchorField, jsonDefinition=jsonDefinition,db=self.db) @@ -185,7 +187,7 @@ def importNewFile(self,originFile,anchorField,jsonDefinition): """ self.setVariables(originFile,anchorField=anchorField,jsonDefinition=jsonDefinition) self.variableSet.writeMetadata() - self.variableSet.loadMetadata() + self.variableSet.loadMetadata() self.variableSet.updateMasterVariableTable() for variable in self.variableSet.variables: variable.clear_associated_memory_tables() @@ -195,9 +197,9 @@ def create_database(self): dbname = self.dbname dbuser = self.dbuser dbpassword = self.dbpassword - + db = self.db - + #This must be run as a MySQL user with create_table privileges try: db.query("CREATE DATABASE " + dbname) @@ -206,7 +208,7 @@ def create_database(self): "Setting up permissions for web user..." db.query("GRANT SELECT ON " + dbname + ".*" + " TO '" + dbuser + "'@'localhost' IDENTIFIED BY '" + dbpassword + "'") - db.query("GRANT SELECT ON {}.* TO 'bookworm'@'localhost'".format(dbname)) + db.query("GRANT SELECT ON {}.* TO 'bookworm'@'localhost'".format(dbname)) db.query("FLUSH PRIVILEGES") #a field to store stuff we might need later. db.query("CREATE TABLE IF NOT EXISTS bookworm_information (entry VARCHAR(255), PRIMARY KEY (entry), value VARCHAR(50000))") @@ -235,7 +237,7 @@ def load_word_list(self): def load_book_list(self): """ - Slated for deletion. + Slated for deletion. 
Loads in the tables that have already been created by a previous call to `Bookworm.variableSet.writeMetadata()` @@ -250,7 +252,7 @@ def create_unigram_book_counts(self, newtable=True, ingest=True, index=True, rev ngramname = "unigrams" tablenameroot = "master_bookcounts" # If you are splitting the input into multiple tables - # to be joined as a merge table, come up with multiple + # to be joined as a merge table, come up with multiple # table names and we'll cycle through. if table_count == 1: tablenames = [tablenameroot] @@ -266,12 +268,12 @@ def create_unigram_book_counts(self, newtable=True, ingest=True, index=True, rev if (len(grampath) == 0) or (grampath == "/"): logging.error("Woah! Don't set the ngram path to your system root!") raise - + if newtable: if os.path.exists(tmpdir): import shutil shutil.rmtree(tmpdir) - + logging.info("Dropping older %s table, if it exists" % ngramname) for tablename in tablenames: db.query("DROP TABLE IF EXISTS " + tablename) @@ -290,7 +292,7 @@ def create_unigram_book_counts(self, newtable=True, ingest=True, index=True, rev db.query("set NAMES utf8;") db.query("set CHARACTER SET utf8;") logging.info("loading data using LOAD DATA LOCAL INFILE") - + files = os.listdir(grampath) for i, filename in enumerate(files): if filename.endswith('.txt'): @@ -429,15 +431,15 @@ def loadVariableDescriptionsIntoDatabase(self): self.variableSet.updateMasterVariableTable() def reloadMemoryTables(self, force=False, names = None): - + """ Checks to see if memory tables need to be repopulated (by seeing if they are empty) and then does so if necessary. - If an array is passed to 'names', only the specified tables will be + If an array is passed to 'names', only the specified tables will be loaded into memory; otherwise, all will. """ - + q = "SELECT tablename,memoryCode FROM masterTableTable" existingCreateCodes = self.db.query(q).fetchall() @@ -471,14 +473,14 @@ def fastcat_creation_SQL(self, engine="MEMORY"): tbname = "fastcat" if engine=="MYISAM": tbname = "fastcat_" - + fastFieldsCreateList = [ "bookid MEDIUMINT UNSIGNED NOT NULL, PRIMARY KEY (bookid)", "nwords MEDIUMINT UNSIGNED NOT NULL" ] - + fastFieldsCreateList += [variable.fastSQL() for variable in self.variableSet.uniques("fast")] - + create_command = """DROP TABLE IF EXISTS tmp;""" create_command += "CREATE TABLE tmp ({}) ENGINE={};""".format( ", ".join(fastFieldsCreateList), engine) @@ -535,7 +537,7 @@ def addWordsToMasterVariableTable(self, max_word_length = 30, max_words = 150000 query += "VALUES ('wordsheap','wordsheap','{}'); ".format(wordCommand) logging.info("Creating wordsheap") self.db.query(query) - + def jsonify_data(self): variables = self.variableSet.variables dbname = self.dbname @@ -573,7 +575,7 @@ def jsonify_data(self): except: logging.warning("No default search created because of insufficient data.") output['ui_components'] = ui_components - + with open('.bookworm/%s.json' % dbname, 'w') as outfile: outfile.write(json.dumps(output)) diff --git a/bookwormDB/configuration.py b/bookwormDB/configuration.py index 5e8a77e..44ec3c2 100644 --- a/bookwormDB/configuration.py +++ b/bookwormDB/configuration.py @@ -117,12 +117,16 @@ def __init__(self, usertype, possible_locations=None, default=None, ask_about_de self.config.set("client", "host", "localhost") self.config.set("client", "user", "root") self.config.set("client", "password", "") + self.config.set("clienthostname", "localhost", "") else: self.ensure_section("client") self.config.set("client", "host", "localhost") self.config.set("client", 
"user", "bookworm") self.config.set("client", "password", "") + # A different section here can change the name of the host + # allowed to log in for select queries. + self.config.set("client", "clienthostname", "localhost") self.read_config_files(possible_locations) @@ -148,7 +152,6 @@ def read_config_files(self, used_files): successes = self.config.read(used_files) - def default_locations_from_type(self,usertype): """ The default locations for each usertype. From 27551d39510cf7af7bf7ad43930fcec212d84dd4 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Wed, 14 Oct 2020 12:51:43 -0400 Subject: [PATCH 03/41] config file tweak --- bookwormDB/configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bookwormDB/configuration.py b/bookwormDB/configuration.py index 44ec3c2..e32cc27 100644 --- a/bookwormDB/configuration.py +++ b/bookwormDB/configuration.py @@ -117,7 +117,7 @@ def __init__(self, usertype, possible_locations=None, default=None, ask_about_de self.config.set("client", "host", "localhost") self.config.set("client", "user", "root") self.config.set("client", "password", "") - self.config.set("clienthostname", "localhost", "") + self.config.set("client", "clienthostname", "localhost") else: self.ensure_section("client") From 70cfbc72de2cae64441fd3ce1eb556119aa28f78 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Wed, 14 Oct 2020 13:09:14 -0400 Subject: [PATCH 04/41] switch to re from regex --- bookwormDB/tokenizer.py | 50 +++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/bookwormDB/tokenizer.py b/bookwormDB/tokenizer.py index b043550..dd338d9 100644 --- a/bookwormDB/tokenizer.py +++ b/bookwormDB/tokenizer.py @@ -18,7 +18,7 @@ # import regex as re --now done only when the function is actually called. # Set at a global to avoid multiple imports. -re = None +import re # Likewise, store a thread-wise count on whether we've thrown a unicode encoding error. haveWarnedUnicode = False @@ -32,9 +32,6 @@ def wordRegex(): Note that this uses *unicode*: among other things, that means that it needs to be passed a unicode-decoded string: and that we have to use the "regex" module instead of the "re" module. Python3 will make this, perhaps, easier. """ - global re - if re is None: - import regex as re MasterExpression = r"\w+" possessive = MasterExpression + r"'s" numbers = r"(?:[\$])?\d+" @@ -43,7 +40,7 @@ def wordRegex(): sharps = r"[a-gjxA-GJX]#" punctuators = r"[^\w\p{Z}]" """ - Note: this compiles looking for the most complicated words first, and as it goes on finds simpler and simpler forms + Note: this compiles looking for the most complicated words first, and as it goes on finds simpler and simpler forms """ bigregex = re.compile("|".join([decimals,possessive,numbers,abbreviation,sharps,punctuators,MasterExpression]),re.UNICODE|re.IGNORECASE) return bigregex @@ -64,21 +61,21 @@ def readIDfile(prefix=""): class tokenBatches(object): """ - A tokenBatches is a manager for tokenizers. Each one corresponds to + A tokenBatches is a manager for tokenizers. Each one corresponds to a reasonable number of texts to read in to memory on a single processor: during the initial loads, there will probably be one per core. It doesn't store the original text, just the unigram and bigram tokenizations in its attached self.counts arrays. - - It writes out its dat to a single file: + + It writes out its dat to a single file: in this way, a batch of up to several hundred thousand individual files is grouped into a single file. 
It also has a method that encodes and writes its wordcounts into a tsv file appropriate for reading with mysql, with 3-byte integer encoding for wordid and bookid. """ - + def __init__(self, levels=["unigrams","bigrams"]): """ - + mode: 'encode' (write files out) """ self.id = '%030x' % random.randrange(16**30) @@ -86,13 +83,13 @@ def __init__(self, levels=["unigrams","bigrams"]): # placeholder to alert that createOutputFiles must be run. self.completedFile = None - + def createOutputFiles(self): self.completedFile = open(".bookworm/texts/encoded/completed/" + self.id,"w") self.outputFiles = dict() for level in self.levels: self.outputFiles[level] = open(".bookworm/texts/encoded/{}/{}.txt".format(level, self.id),"w") - + def attachDictionaryAndID(self): self.dictionary = readDictionaryFile() self.IDfile = readIDfile() @@ -100,14 +97,14 @@ def attachDictionaryAndID(self): def close(self): """ - This test allows the creation of bookworms with fewer document than requested + This test allows the creation of bookworms with fewer document than requested threads, which happens to be the case in the tests. """ if self.completedFile is not None: self.completedFile.close() for v in self.outputFiles.values(): v.close() - + def encodeRow(self, filename, tokenizer, @@ -121,7 +118,7 @@ def encodeRow(self, if self.completedFile is None: self.createOutputFiles() self.attachDictionaryAndID() - + #The dictionary and ID lookup tables should be pre-attached. dictionary = self.dictionary IDfile = self.IDfile @@ -146,7 +143,7 @@ def encodeRow(self, raise tokens = preTokenized(token, count, self.levels[0]) """ - + try: textid = IDfile[filename] except KeyError: @@ -170,7 +167,7 @@ def encodeRow(self, if any of the words to be included is not in the dictionary, we don't include the whole n-gram in the counts. """ - skip = True + skip = True if not skip: wordids = "\t".join(wordList) output.append("{}\t{}\t{}".format(int(textid), wordids, count)) @@ -179,7 +176,7 @@ def encodeRow(self, if len(output) > 0: # The test is necessary because otherwise this prints a blank line. outputFile.write("\n".join(output) + "\n") - + except IOError as e: logging.exception(e) @@ -198,9 +195,9 @@ class Tokenizer(object): the general way to call it is to initialize, and then for each desired set of counts call "tokenizer.counts("bigrams")" (or whatever). That returns a dictionary, whose keys are tuples of length 1 for unigrams, 2 for bigrams, etc., and whose values are counts for that ngram. The tuple form should allow faster parsing down the road. - + """ - + def __init__(self, string, tokenization_regex=None): global haveWarnedUnicode self.string = string @@ -234,7 +231,7 @@ def ngrams(self, n, collapse = False): All the ngrams in the text can be created as a tuple by zipping an arbitrary number of copies of the text to itself. """ - + self.tokenize() l = list(zip(*[self.tokens[i:] for i in range(n)])) if collapse: @@ -262,9 +259,9 @@ def words(self): """ self.tokenize() return self.tokens - + def counts(self, whichType): - + count = dict() for gram in getattr(self,whichType)(): try: @@ -293,13 +290,13 @@ def __init__(self, csv_string, level): self.output = dict(zip(f.word, f.counts)) else: self.output = dict(zip([tuple(w.split(" ")) for w in f.word], f.counts)) - + def counts(self,level): if level != self.level: raise return self.output - + def getAlreadySeenList(folder): #Load in a list of what's already been translated for that level. #Returns a set. 
@@ -319,9 +316,8 @@ def encode_text_stream(): line = line.rstrip("\n") if filename not in seen: tokenBatch.encodeRow(line) - + # And printout again at the end if __name__=="__main__": encode_text_stream() - From 1bc13471a8b2c0fb9381e376bfd256881f2a7673 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Wed, 14 Oct 2020 14:46:39 -0400 Subject: [PATCH 05/41] handle remote hosts in API better --- bookwormDB/CreateDatabase.py | 5 +- bookwormDB/SQLAPI.py | 78 ++++++------- bookwormDB/general_API.py | 108 +++++++++--------- bookwormDB/mariaDB.py | 208 ++++++++++++++++------------------- bookwormDB/tokenizer.py | 2 +- setup.py | 2 +- 6 files changed, 188 insertions(+), 215 deletions(-) diff --git a/bookwormDB/CreateDatabase.py b/bookwormDB/CreateDatabase.py index 0f8b305..794b9bc 100755 --- a/bookwormDB/CreateDatabase.py +++ b/bookwormDB/CreateDatabase.py @@ -40,7 +40,7 @@ def connect(self, setengine=True): try: host = conf.config.get("mysqld", "host") except NoOptionError: - host = "localhost" + host = conf.config.get("client", "host") connect_args = { "user": conf.config.get("client", "user"), "passwd": conf.config.get("client", "password"), @@ -164,7 +164,8 @@ def grantPrivileges(self): username=globalfile.config.get("client","user") password=globalfile.config.get("client","password") clienthostname=globalfile.config.get("client","clienthostname") - + if clienthostname == '': + clienthostname = "%" try: self.db.query("GRANT SELECT ON %s.* TO '%s'@'%s' IDENTIFIED BY '%s'" % (self.dbname,username,clienthostname, password)) except MySQLdb._exceptions.OperationalError: diff --git a/bookwormDB/SQLAPI.py b/bookwormDB/SQLAPI.py index 33bdf70..15dd84b 100644 --- a/bookwormDB/SQLAPI.py +++ b/bookwormDB/SQLAPI.py @@ -9,27 +9,27 @@ import hashlib import logging from .bwExceptions import BookwormException +import bookwormDB.configuration # If you have bookworms stored on a different host, you can create more lines # like this. # A different host and read_default_file will let you import things onto a # different server. general_prefs = dict() -general_prefs["default"] = {"fastcat": "fastcat", - "fastword": "wordsheap", - "fullcat": "catalog", - "fullword": "words", - "read_default_file": "/etc/mysql/my.cnf" +general_prefs["default"] = { + "fastcat": "fastcat", + "fastword": "wordsheap", + "fullcat": "catalog", + "fullword": "words", + "read_default_file": "/etc/mysql/my.cnf" } class DbConnect(object): # This is a read-only account - def __init__(self, prefs=general_prefs['default'], database=None, - host=None): - + def __init__(self, prefs=general_prefs['default'], database=None): + self.dbname = database - - import bookwormDB.configuration + conf = bookwormDB.configuration.Configfile("read_only").config if database is None: @@ -40,28 +40,12 @@ def __init__(self, prefs=general_prefs['default'], database=None, "use_unicode": 'True', "charset": 'utf8', "user": conf.get("client", "user"), - "password": conf.get("client", "password") + "password": conf.get("client", "password"), + "host": conf.get("client", "host") } - if host: - connargs['host'] = host - # For back-compatibility: - elif "HOST" in prefs: - connargs['host'] = prefs['HOST'] - else: - host = "localhost" - - try: - self.db = MySQLdb.connect(**connargs) - except: - try: - # Sometimes mysql wants to connect over this rather than a socket: - # falling back to it for backward-compatibility. 
- connargs["host"] = "127.0.0.1" - self.db = MySQLdb.connect(**connargs) - except: - raise - + logging.info("Preparing to connect with args", connargs) + self.db = MySQLdb.connect(**connargs) self.cursor = self.db.cursor() def fail_if_nonword_characters_in_columns(input): @@ -114,7 +98,7 @@ def __init__(self, outside_dictionary = {}, db = None, databaseScheme = None): self.prefs = general_prefs['default'] self.prefs['database'] = outside_dictionary['database'] self.outside_dictionary = outside_dictionary - # self.prefs = general_prefs[outside_dictionary.setdefault('database', 'presidio')] + self.db = db if db is None: self.db = DbConnect(self.prefs) @@ -124,7 +108,7 @@ def __init__(self, outside_dictionary = {}, db = None, databaseScheme = None): self.cursor = self.db.cursor self.wordsheap = self.fallback_table(self.prefs['fastword']) - + self.words = self.prefs['fullword'] """ I'm now allowing 'search_limits' to either be a dictionary or an array of dictionaries: @@ -356,10 +340,10 @@ def pull_keys(entry): else: return [] return [re.sub(" .*","",key) for key in val] - + return pull_keys(self.limits) + [re.sub(" .*","",g) for g in self.groups] - - + + def create_catalog_table(self): self.catalog = self.prefs['fastcat'] # 'catalog' # Can be replaced with a more complicated query in the event of longer joins. @@ -403,7 +387,7 @@ def create_catalog_table(self): self.catalog = self.fallback_table("fastcat") if self.catalog == "fastcat_": self.prefs['fastcat'] = "fastcat_" - + for table in self.relevantTables: if table!="fastcat" and table!="words" and table!="wordsheap" and table!="master_bookcounts" and table!="master_bigrams" and table != "fastcat_" and table != "wordsheap_": self.catalog = self.catalog + """ NATURAL JOIN """ + table + " " @@ -422,29 +406,29 @@ def fallback_table(self,tabname): if not hasattr(self,"fallbacks_cache"): self.fallbacks_cache = {} - + if tabname in self.fallbacks_cache: return self.fallbacks_cache[tabname] - + q = "SELECT COUNT(*) FROM {}".format(tab) try: self.db.cursor.execute(q) length = self.db.cursor.fetchall()[0][0] if length==0: - tab += "_" + tab += "_" except MySQLdb.ProgrammingError: tab += "_" - + self.fallbacks_cache[tabname] = tab - + return tab - + def make_catwhere(self): # Where terms that don't include the words table join. Kept separate so that we can have subqueries only working on one half of the stack. catlimits = dict() for key in list(self.limits.keys()): # !!Warning--none of these phrases can be used in a bookworm as a custom table names. - + if key not in ('word', 'word1', 'word2', 'hasword') and not re.search("words\d", key): catlimits[key] = self.limits[key] if len(list(catlimits.keys())) > 0: @@ -887,7 +871,7 @@ def search_results(self): # This is an alias that is handled slightly differently in # APIimplementation (no "RESULTS" bit in front). Once # that legacy code is cleared out, they can be one and the same. 
- + return json.loads(self.return_books()) def getActualSearchedWords(self): @@ -1119,7 +1103,7 @@ def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_ whereterm.append(" ( " + " OR ".join(local_set) + " )") elif key == '$and' or key == "$AND": for comparison in values: - whereterm.append(where_from_hash(comparison, joiner=" AND ", comp=comp)) + whereterm.append(where_from_hash(comparison, joiner=" AND ", comp=comp)) elif isinstance(values, dict): if joiner is None: joiner = " AND " @@ -1128,7 +1112,7 @@ def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_ operations = {"$gt":">", "$ne":"!=", "$lt":"<", "$grep":" REGEXP ", "$gte":">=", "$lte":"<=", "$eq":"="} - + for operation in list(values.keys()): if operation == "$ne": # If you pass a lot of ne values, they must *all* be false. @@ -1156,7 +1140,7 @@ def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_ quotesep = "" def escape(value): - # NOTE: stringifying the escape from MySQL; hopefully doesn't break too much. + # NOTE: stringifying the escape from MySQL; hopefully doesn't break too much. return str(MySQLdb.escape_string(to_unicode(value)), 'utf-8') else: def escape(value): diff --git a/bookwormDB/general_API.py b/bookwormDB/general_API.py index 5ce5675..2b4031b 100644 --- a/bookwormDB/general_API.py +++ b/bookwormDB/general_API.py @@ -7,7 +7,7 @@ from pandas import set_option from copy import deepcopy from collections import defaultdict -from .SQLAPI import DbConnect +from .mariaDB import DbConnect from .SQLAPI import userquery from .mariaDB import Query from .bwExceptions import BookwormException @@ -23,7 +23,7 @@ The general API is some functions for working with pandas to calculate bag-of-words summary statistics according to the API description. -It is not bound to any particular backend: instead, a subset of +It is not bound to any particular backend: instead, a subset of methods in the API must be supported by subclassing APICall(). The only existing example of this is "SQLAPICall." @@ -99,49 +99,49 @@ class Aggregator(object): but there are a multitude of things you can do with those: basic things like frequency, all the way up to TF-IDF. - """ + """ def __init__(self, df, groups = None): self.df = df self.groups = groups def _aggregate(self, parameters): "Run the aggregation. Prefixed with an underscore so it doesn't show up in the dict." 
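# A self-contained sketch (not part of the patch) of the calculation that a
# "WordsPerMillion" request triggers: _aggregate() looks the name up with
# getattr() and the matching method adds a column to the merged frame.
# The numbers below are invented.
import pandas as pd

merged = pd.DataFrame({
    "WordCount_x": [120, 45],            # hits under the search limits
    "WordCount_y": [1000000, 500000],    # totals under the compare limits
})
merged["WordsPerMillion"] = merged["WordCount_x"].multiply(1000000) / merged["WordCount_y"]
# -> 120.0 and 90.0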
- + parameters = set(map(str, parameters)) for parameter in parameters: getattr(self, parameter)() return self.df - + def WordCount(self): self.df["WordCount"] = self.df["WordCount_x"] - + def TextCount(self): self.df["TextCount"] = self.df["TextCount_x"] - + def WordsPerMillion(self): self.df["WordsPerMillion"] = (self.df["WordCount_x"].multiply(1000000)/ self.df["WordCount_y"]) def TotalWords(self): self.df["TotalWords"] = self.df["WordCount_y"] - + def SumWords(self): self.df["SumWords"] = self.df["WordCount_y"] + self.df["WordCount_x"] - + def WordsRatio(self): self.df["WordsRatio"] = self.df["WordCount_x"]/self.df["WordCount_y"] - + def TextPercent(self): self.df["TextPercent"] = 100*self.df["TextCount_x"].divide(self.df["TextCount_y"]) - + def TextRatio(self): - self.df["TextRatio"] = self.df["TextCount_x"]/self.df["TextCount_y"] + self.df["TextRatio"] = self.df["TextCount_x"]/self.df["TextCount_y"] def TotalTexts(self): self.df["TotalTexts"] = self.df["TextCount_y"] - + def SumTexts(self): self.df["SumTexts"] = self.df["TextCount_y"] + self.df["TextCount_x"] - + def HitsPerText(self): self.df["HitsPerText"] = self.df["WordCount_x"]/self.df["TextCount_x"] @@ -152,13 +152,13 @@ def PMI_words(self): self.df["PMI_words"] = PMI(self.df, "WordCount_x", self.groups) def PMI_texts(self): - self.df["PMI_texts"] = PMI(self.df, "TextCount_x", self.groups) - + self.df["PMI_texts"] = PMI(self.df, "TextCount_x", self.groups) + def TFIDF(self): from numpy import log as log self.df["TF"] = self.df["WordCount_x"]/self.df["WordCount_y"] self.df["TFIDF"] = self.df["TF"] * np.log(self.df["TextCount_y"]/self.df['TextCount_x']) - + def Dunning(self): self.df["Dunning"] = DunningLog(self.df, "WordCount_x", "WordCount_y") @@ -167,7 +167,7 @@ def DunningTexts(self): self.df["DunningTexts"] = DunningLog(self.df, "TextCount_x", "TextCount_y") def rename(df, newkey): - + # Add "x" and "y" suffixed to the dataframes even when not explicitly needed. renamer = {} @@ -209,7 +209,7 @@ def base_count_types(list_of_final_count_types): subq = set() superq = set() - + for count_name in list_of_final_count_types: if count_name in ["WordCount", "WordsPerMillion", "WordsRatio", "TotalWords", "SumWords", "Dunning", "PMI_words", "TextLength", "HitsPerMatch", "TFIDF"]: @@ -247,6 +247,7 @@ def __init__(self, APIcall): self.idiot_proof_arrays() self.set_defaults() + def set_defaults(self): query = self.query if "search_limits" not in query: @@ -310,13 +311,13 @@ def data(self): def validate_query(self): self.ensure_query_has_required_fields() - + def ensure_query_has_required_fields(self): required_fields = ['counttype', 'groups', 'database'] if self.query['method'] in ['schema', 'search']: required_fields = ['database'] - + for field in required_fields: if field not in self.query: logging.error("Missing field: %s" % field) @@ -326,17 +327,17 @@ def ensure_query_has_required_fields(self): def prepare_search_and_compare_queries(self): - + call1 = deepcopy(self.query) call2 = deepcopy(call1) call2['search_limits'] = self.get_compare_limits() - + # The individual calls need only the base counts: not "Percentage of # Words," but just "WordCount" twice, and so forth call1['counttype'], call2['counttype'] = base_count_types(self.query['counttype']) - + # Drop out asterisks for that syntactic sugar. for limit in list(call1['search_limits'].keys()): if re.search(r'^\*', limit): @@ -363,14 +364,14 @@ def get_data_from_source(self): instance or something else, just by changing the bits in the middle where it handles storage_format. 
""" - + self.validate_query() if self.query['method'] in ['schema', 'search']: return self.generate_pandas_frame() - + self.prepare_search_and_compare_queries() - + """ This could use any method other than pandas_SQL: You'd just need to redefine "generate_pandas_frame" @@ -387,7 +388,7 @@ def get_data_from_source(self): logging.debug(self.call2) df2 = self.generate_pandas_frame(self.call2) rename(df2, "y") - + except Exception as error: logging.exception("Database error") # One common error is putting in an inappropriate column @@ -401,14 +402,14 @@ def get_data_from_source(self): except: return Series({"status": "error", "message": "Unknown error. ", - "code":str(error)}) - + "code":str(error)}) + intersections = intersectingNames(df1, df2) """ Would this merge be faster with indexes? """ - + if len(intersections) > 0: merged = merge(df1, df2, on=intersections, how='outer') else: @@ -420,7 +421,7 @@ def get_data_from_source(self): gator = Aggregator(merged, self.query['groups']) calcced = gator._aggregate(calculations) # calcced = calculateAggregates(merged, calculations, self.query['groups']) - + calcced = calcced.fillna(int(0)) final_DataFrame = (calcced[self.query['groups'] + @@ -451,9 +452,9 @@ def execute(self): else: # Only return first search limit if not return in json self.query['search_limits'] = self.query['search_limits'][0] - + form = method[7:] if method[:6] == 'return' else method - + logging.warning("method == \"%s\" is deprecated. Use method=\"data\" " "with format=\"%s\" instead." % (method, form)) @@ -464,14 +465,14 @@ def execute(self): elif method == "return_csv" or method == "csv": self.query['method'] = 'data' - self.query['format'] = 'json' + self.query['format'] = 'json' frame = self.data() return frame.to_csv(path = None, sep="\t", encoding="utf8", index=False, quoting=csv.QUOTE_NONE, escapechar="\\") elif version >= 2: try: # What to do with multiple search_limits - + if isinstance(self.query['search_limits'], list): if fmt == "json" or version >= 3: frame = self.multi_execute(version = version) @@ -480,13 +481,13 @@ def execute(self): self.query['search_limits'] = self.query['search_limits'][0] else: frame = self.data() - + if fmt == "json": return self.return_json(version=2) - + if fmt == "csv": return frame.to_csv(encoding="utf8", index=False) - + if fmt == "tsv": return frame.to_csv(sep="\t", encoding="utf8", index=False) @@ -505,7 +506,7 @@ def execute(self): if fmt == 'html': return self.html(frame) - + else: err = dict(status="error", code=200, message="Only formats in ['csv', 'tsv', 'json', 'feather']" @@ -528,6 +529,8 @@ def execute(self): if method in ["returnPossibleFields", "search_results", "return_books", "schema"]: try: + logging.warn("Using deprecated API call.") + query = userquery(self.query) if method == "return_books": return query.execute() @@ -539,12 +542,12 @@ def execute(self): return "General error" def multi_execute(self, version=1): - + """ Queries may define several search limits in an array if they use the return_json method. 
""" - + if version <= 2: returnable = [] for limits in self.query['search_limits']: @@ -554,7 +557,7 @@ def multi_execute(self, version=1): version=version) returnable.append(q) return self._prepare_response(returnable, version) - + if version == 3: for i, limits in enumerate(self.query['search_limits']): child = deepcopy(self.query) @@ -567,7 +570,7 @@ def multi_execute(self, version=1): frame = frame.append(f, ignore_index = True) return frame - + def html(self, data): """ Return data in column-oriented format with run-length encoding @@ -577,7 +580,7 @@ def html(self, data): if isinstance(data, Series) and 'status' in data: # If data has a status, Bookworm is trying to send us an error return data.to_json() - + set_option('display.max_colwidth', -1) return data.to_html(escape = False, index = False) @@ -587,20 +590,20 @@ def return_rle_json(self, data): Return data in column-oriented format with run-length encoding on duplicate values. """ - + if isinstance(data, Series) and 'status' in data: # If data has a status, Bookworm is trying to send us an error return data.to_json() - + output = {'status':'success', 'data':{}} - + for k in data: series = data[k] output['data'][k] = rle(data[k].tolist()) - + return json.dumps(output) - - + + def return_json(self, raw_python_object=False, version=1): ''' Get JSON data for a single search_limit. @@ -734,8 +737,7 @@ def generate_pandas_frame(self, call = None): call = self.query con = DbConnect(prefs, self.query['database']) q = Query(call).query() - logging.debug("Preparing to execute {}".format(q)) + logging.debug("Preparing to execute {}".format(q)) df = read_sql(q, con.db) logging.debug("Query retrieved") return df - diff --git a/bookwormDB/mariaDB.py b/bookwormDB/mariaDB.py index c9ebc9c..9742ca9 100644 --- a/bookwormDB/mariaDB.py +++ b/bookwormDB/mariaDB.py @@ -22,9 +22,9 @@ class DbConnect(object): # This is a read-only account def __init__(self, database=None, host=None): - + self.dbname = database - + import bookwormDB.configuration conf = bookwormDB.configuration.Configfile("read_only").config @@ -36,26 +36,12 @@ def __init__(self, database=None, "use_unicode": 'True', "charset": 'utf8', "user": conf.get("client", "user"), - "password": conf.get("client", "password") + "password": conf.get("client", "password"), + "host": conf.get("client", "host") } - if host: - connargs['host'] = host - # For back-compatibility: - else: - connargs['host'] = "localhost" - - try: - self.db = MySQLdb.connect(**connargs) - except: - try: - # Sometimes mysql wants to connect over this rather than a socket: - # falling back to it for backward-compatibility. 
- connargs["host"] = "127.0.0.1" - self.db = MySQLdb.connect(**connargs) - except: - raise - + logging.info("Preparing to connect with args", connargs) + self.db = MySQLdb.connect(**connargs) self.cursor = self.db.cursor() def fail_if_nonword_characters_in_columns(input): @@ -96,7 +82,7 @@ def all_keys(input): def check_query(query): - + fail_if_nonword_characters_in_columns(query) for key in ['database']: @@ -107,12 +93,12 @@ def check_query(query): if query['method'] in ["schema", "search"]: # Queries below this only apply to "data" return - + for v in query['counttype']: if not v in ['WordCount', 'TextCount']: raise BookwormException({"code": 400, "message": 'Only "WordCount" and "TextCount"' ' counts are supported by the SQL api, but passed {}'.format(v)}) - + class Query(object): """ @@ -122,15 +108,15 @@ def __init__(self, query_object = {}, db = None, databaseScheme = None): # Certain constructions require a DB connection already available, so we just start it here, or use the one passed to it. check_query(query_object) - + self.prefs = {'database': query_object['database']} - + self.query_object = query_object - + self.db = db if db is None: self.db = DbConnect(query_object['database']) - + self.databaseScheme = databaseScheme if databaseScheme is None: self.databaseScheme = databaseSchema(self.db) @@ -138,14 +124,14 @@ def __init__(self, query_object = {}, db = None, databaseScheme = None): self.cursor = self.db.cursor # Some tablenames. - + self.wordsheap = self.databaseScheme.fallback_table('wordsheap') self.fastcat = self.databaseScheme.fallback_table("fastcat") logging.info("Catalog set to {}".format(self.fastcat)) self.words = "words" self.defaults(query_object) # Take some defaults - + self.derive_variables() # Derive some useful variables that the query will use. def defaults(self, query_object): @@ -157,8 +143,8 @@ def defaults(self, query_object): self.wordsTables = None - - + + # Set up a dictionary for the denominator of any fraction if it doesn't already exist: self.search_limits = query_object.setdefault('search_limits', [{"word":["polka dot"]}]) self.words_collation = query_object.setdefault('words_collation', "Case_Insensitive") @@ -177,7 +163,7 @@ def defaults(self, query_object): groups = query_object['groups'] except: groups = None - + if groups == [] or groups == ["unigram"]: # Set an arbitrary column name that will always be true if nothing else is set. pass @@ -187,7 +173,7 @@ def defaults(self, query_object): # A user query can't demand ungrouped results, # but internally it's represented as None. groups = [] - + for group in groups: # There's a special set of rules for how to handle unigram and bigrams @@ -287,7 +273,7 @@ def determineOutsideDictionary(self): def derive_variables(self): # These are locally useful, and depend on the search limits put in. self.limits = self.search_limits - + # Treat empty constraints as nothing at all, not as restricting to the set of nothing. 
for key in list(self.limits.keys()): if self.limits[key] == []: @@ -297,13 +283,13 @@ def derive_variables(self): self.word_limits = True else: self.word_limits = False - + self.set_operations() - + self.create_catalog_table() - + self.make_catwhere() - + self.make_wordwheres() def tablesNeededForQuery(self, fieldNames=[]): @@ -314,7 +300,7 @@ def tablesNeededForQuery(self, fieldNames=[]): neededTables = set() tablenames = dict() tableDepends = dict() - + q = "SELECT dbname,alias,tablename,dependsOn FROM masterVariableTable JOIN masterTableTable USING (tablename);" logging.debug(q) db.cursor.execute(q) @@ -350,7 +336,7 @@ def needed_columns(self): Needs a recursive function so it will find keys deeply nested inside "$or" searches. """ cols = [] - + def pull_keys(entry): val = [] if isinstance(entry,list) and not isinstance(entry,(str, bytes)): @@ -364,9 +350,9 @@ def pull_keys(entry): val += pull_keys(v) else: return [] - + return [re.sub(" .*","",key) for key in val] - + return pull_keys(self.limits) def wordid_query(self): @@ -374,11 +360,11 @@ def wordid_query(self): if self.wordswhere != " TRUE ": f = "SELECT wordid FROM {words} as words1 WHERE {wordswhere}".format(**self.__dict__) - logging.debug("`" + self.wordswhere + "`") + logging.debug("`" + self.wordswhere + "`") return " wordid IN ({})".format(f) else: return " TRUE " - + def make_group_query(self): aliases = [self.databaseScheme.aliases[g] for g in self.query_object["groups"]] if len(aliases) > 0: @@ -392,13 +378,13 @@ def main_table(self): return 'master_bookcounts as main' if self.gram_size() == 2: return 'master_bigrams as main' - + def full_query_tables(self): # Joins are needed to provide groups, but *not* to provide # provide evidence for wheres. # But if there's a group, there may also need to be an associated where. - + if self.word_limits == False: tables = [self.fastcat] else: @@ -413,7 +399,7 @@ def full_query_tables(self): tables.append(t) return tables - + def make_join_query(self): tables = self.full_query_tables() return " NATURAL JOIN ".join(tables) @@ -424,34 +410,34 @@ def base_query(self): dicto['finalGroups'] = ', '.join(self.query_object['groups']) if dicto['finalGroups'] != '': dicto['finalGroups'] = ", " + dicto['finalGroups'] - + dicto['group_query'] = self.make_group_query() dicto['op'] = ', '.join(self.set_operations()) dicto['bookid_where'] = self.bookid_query() dicto['wordid_where'] = self.wordid_query() dicto['tables'] = self.make_join_query() logging.info("'{}'".format(dicto['tables'])) - + dicto['catwhere'] = self.make_catwhere("main") - + basic_query = """ SELECT {op} {finalGroups} FROM {tables} WHERE {bookid_where} - AND + AND {wordid_where} - AND {catwhere} + AND {catwhere} {group_query} """.format(**dicto) - + return basic_query - + def create_catalog_table(self): # self.catalog = self.prefs['fastcat'] # 'catalog' # Can be replaced with a more complicated query in the event of longer joins. """ - + This should check query constraints against a list of tables, and join to them. 
So if you query with a limit on LCSH, and LCSH is listed as being in a separate table, it joins the table @@ -464,15 +450,15 @@ def create_catalog_table(self): self.relevantTables = set() databaseScheme = self.databaseScheme - + cols = self.needed_columns() cols = [c for c in cols if not c in ["word", "word1", "word2"]] - + self.relevantTables = self.databaseScheme.tables_for_variables(cols) - + # moreTables = self.tablesNeededForQuery(columns) - + self.catalog = " NATURAL JOIN ".join(self.relevantTables) return self.catalog # for table in self.relevantTables: @@ -481,26 +467,26 @@ def create_catalog_table(self): # # return self.catalog - + def make_catwhere(self, query = "sub"): # Where terms that don't include the words table join. Kept separate so that we can have subqueries only working on one half of the stack. catlimits = dict() - + for key in list(self.limits.keys()): # !!Warning--none of these phrases can be used in a bookworm as a custom table names. - + if key not in ('word', 'word1', 'word2', 'hasword') and not re.search("words\d", key): catlimits[key] = self.limits[key] if query == "main": - ts = set(self.full_query_tables()) + ts = set(self.full_query_tables()) for key in list(catlimits.keys()): logging.debug(key) logging.debug(ts) if not (key in ts or key + "__id" in ts): logging.info("removing {}".format(key)) del catlimits[key] - + if len(list(catlimits.keys())) > 0: catwhere = where_from_hash(catlimits) else: @@ -508,7 +494,7 @@ def make_catwhere(self, query = "sub"): if query == "sub": self.catwhere = catwhere return catwhere - + def gram_size(self): try: ls = [phrase.split() for phrase in self.limits['word']] @@ -519,14 +505,14 @@ def gram_size(self): raise BookwormException('400', 'Must pass all unigrams or all bigrams') else: return lengths[0] - - - + + + def make_wordwheres(self): self.wordswhere = " TRUE " - + limits = [] - + if self.word_limits: """ @@ -540,7 +526,7 @@ def make_wordwheres(self): """ - + for phrase in self.limits['word']: locallimits = dict() array = phrase.split() @@ -554,7 +540,7 @@ def make_wordwheres(self): # That's a little joke. Get it? searchingFor = searchingFor.lower() - + selectString = "SELECT wordid FROM %s WHERE %s = %%s" % (self.wordsheap, self.word_field) logging.debug(selectString) cursor = self.db.cursor @@ -565,7 +551,7 @@ def make_wordwheres(self): if self.gram_size() > 1: # 1-indexed entries in the bigram tables. 
search_key = "word{}".format(n + 1) - + for row in cursor.fetchall(): wordid = row[0] try: @@ -575,7 +561,7 @@ def make_wordwheres(self): if len(locallimits) > 0: limits.append(where_from_hash(locallimits, comp = " = ", escapeStrings=False)) - + self.wordswhere = "(" + ' OR '.join(limits) + ")" if limits == []: @@ -607,9 +593,9 @@ def build_wordstables(self): if self.wordsTables is not None: return - + needsBigrams = (self.max_word_length == 2 or re.search("words2", self.selections)) - + needsUnigrams = self.max_word_length == 1; if self.max_word_length > 2: @@ -663,17 +649,17 @@ def build_wordstables(self): def set_operations(self): with_words = self.word_limits - + output = [] # experimental if self.query_object['counttype'] == 'bookid': return ['bookid'] - + if self.query_object['counttype'] == 'wordid': - return ['wordid'] + return ['wordid'] + - if with_words: if "TextCount" in self.query_object['counttype']: output.append("count(DISTINCT main.bookid) as TextCount") @@ -688,28 +674,28 @@ def set_operations(self): return output def bookid_query(self): - + q = "SELECT bookid FROM {catalog} WHERE {catwhere}""".format(**self.__dict__) logging.debug("'{}'".format(self.catwhere)) - + if self.catwhere == "TRUE": self.bookid_where = " TRUE " - + else: self.bookid_where = " bookid IN ({}) ".format(q) - + return self.bookid_where - + def query(self): - + """ Return the SQL query that fills the API request. There must be a search method filled out. """ - + if (self.query_object['method'] == 'schema'): return "SELECT name,type,description,tablename,dbname,anchor FROM masterVariableTable WHERE status='public'" elif (self.query_object['method'] == 'search'): @@ -756,10 +742,10 @@ def bibliography_query(self, limit = "100"): 'catwhere': self.make_catwhere("main"), 'limit': limit } - + dicto['bookid_where'] = self.bookid_query() dicto['wordid_where'] = self.wordid_query() - + bibQuery = """ SELECT searchstring FROM catalog RIGHT JOIN ( @@ -777,11 +763,11 @@ def search_results(self): # This is an alias that is handled slightly differently in # APIimplementation (no "RESULTS" bit in front). Once # that legacy code is cleared out, they can be one and the same. - + return json.loads(self.return_books()) def getActualSearchedWords(self): - # + # if len(self.wordswhere) > 7: words = self.query_object['search_limits']['word'] # Break bigrams into single words. @@ -828,7 +814,7 @@ def custom_SearchString_additions(self, returnarray): else: newarray = returnarray return newarray - + def execute(self): # This performs the query using the method specified in the passed parameters. if self.method == "Nothing": @@ -841,14 +827,14 @@ class databaseSchema(object): """ This class stores information about the database setup that is used to optimize query creation query and so that queries know what tables to include. - It's broken off like this because it might be usefully wrapped around some of + It's broken off like this because it might be usefully wrapped around some of the backend features, because it shouldn't be run multiple times in a single query (that spawns two instances of itself), as was happening before. It's closely related to some of the classes around variables and variableSets in the Bookworm Creation scripts, - but is kept separate for now: that allows a bit more flexibility, + but is kept separate for now: that allows a bit more flexibility, but is probaby a Bad Thing in the long run. 
""" @@ -857,7 +843,7 @@ def __init__(self, db): self.cursor=db.cursor # has of what table each variable is in self.tableToLookIn = {} - + # hash of what the root variable for each search term is (eg, # 'author_birth' might be crosswalked to 'authorid' in the # main catalog.) @@ -878,33 +864,33 @@ def __init__(self, db): def newStyle(self, db): - + self.tableToLookIn['bookid'] = self.fallback_table('fastcat') self.tableToLookIn['filename'] = self.fallback_table('fastcat') ff = self.fallback_table('fastcat') self.anchorFields[ff] = ff - + self.tableToLookIn['wordid'] = self.fallback_table('wordsheap') self.tableToLookIn['word'] = self.fallback_table('wordsheap') ww = self.fallback_table('wordsheap') self.anchorFields[ww] = ww - + tablenames = dict() tableDepends = dict() q = "SELECT dbname,alias,tablename,dependsOn FROM masterVariableTable JOIN masterTableTable USING (tablename);" logging.debug(q) db.cursor.execute(q) - + for row in db.cursor.fetchall(): (dbname, alias, tablename, dependsOn) = row tablename = self.fallback_table(tablename) dependsOn = self.fallback_table(dependsOn) - + self.tableToLookIn[dbname] = tablename self.anchorFields[tablename] = dependsOn - + self.aliases[dbname] = alias def fallback_table(self,tabname): @@ -921,22 +907,22 @@ def fallback_table(self,tabname): if not hasattr(self,"fallbacks_cache"): self.fallbacks_cache = {} - + if tabname in self.fallbacks_cache: return self.fallbacks_cache[tabname] - + q = "SELECT COUNT(*) FROM {}".format(tab) logging.debug(q) try: self.db.cursor.execute(q) length = self.db.cursor.fetchall()[0][0] if length==0: - tab += "_" + tab += "_" except MySQLdb.ProgrammingError: tab += "_" - + self.fallbacks_cache[tabname] = tab - + return tab def tables_for_variables(self, variables, tables = []): @@ -954,10 +940,10 @@ def tables_for_variables(self, variables, tables = []): else: tables.append(anchor) lookup_table = anchor - + return tables - + def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_joiner = " OR "): whereterm = [] @@ -980,7 +966,7 @@ def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_ whereterm.append(" ( " + " OR ".join(local_set) + " )") elif key == '$and' or key == "$AND": for comparison in values: - whereterm.append(where_from_hash(comparison, joiner=" AND ", comp=comp)) + whereterm.append(where_from_hash(comparison, joiner=" AND ", comp=comp)) elif isinstance(values, dict): if joiner is None: joiner = " AND " @@ -989,7 +975,7 @@ def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_ operations = {"$gt":">", "$ne":"!=", "$lt":"<", "$grep":" REGEXP ", "$gte":">=", "$lte":"<=", "$eq":"="} - + for operation in list(values.keys()): if operation == "$ne": # If you pass a lot of ne values, they must *all* be false. @@ -1017,7 +1003,7 @@ def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_ quotesep = "" def escape(value): - # NOTE: stringifying the escape from MySQL; hopefully doesn't break too much. + # NOTE: stringifying the escape from MySQL; hopefully doesn't break too much. return str(MySQLdb.escape_string(to_unicode(value)), 'utf-8') else: def escape(value): diff --git a/bookwormDB/tokenizer.py b/bookwormDB/tokenizer.py index dd338d9..fba0bfc 100644 --- a/bookwormDB/tokenizer.py +++ b/bookwormDB/tokenizer.py @@ -18,7 +18,7 @@ # import regex as re --now done only when the function is actually called. # Set at a global to avoid multiple imports. 
-import re +import regex as re # Likewise, store a thread-wise count on whether we've thrown a unicode encoding error. haveWarnedUnicode = False diff --git a/setup.py b/setup.py index 1aa0aa5..31468fa 100644 --- a/setup.py +++ b/setup.py @@ -36,6 +36,6 @@ ], install_requires=["numpy","pandas","mysqlclient", "python-dateutil", "psutil", "bounter", - "gunicorn" + "gunicorn", "regex" ] ) From 2769c2c9555e5e6f63adde63c41d66b880e9a933 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Wed, 14 Oct 2020 15:14:13 -0400 Subject: [PATCH 06/41] handling of remove mysql servers --- bookwormDB/SQLAPI.py | 4 +++- bookwormDB/general_API.py | 2 +- bookwormDB/mariaDB.py | 6 +++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/bookwormDB/SQLAPI.py b/bookwormDB/SQLAPI.py index 15dd84b..f856759 100644 --- a/bookwormDB/SQLAPI.py +++ b/bookwormDB/SQLAPI.py @@ -44,7 +44,9 @@ def __init__(self, prefs=general_prefs['default'], database=None): "host": conf.get("client", "host") } - logging.info("Preparing to connect with args", connargs) + logging.info("Preparing to connect with args") + logging.info(connargs) + self.db = MySQLdb.connect(**connargs) self.cursor = self.db.cursor() diff --git a/bookwormDB/general_API.py b/bookwormDB/general_API.py index 2b4031b..26e018b 100644 --- a/bookwormDB/general_API.py +++ b/bookwormDB/general_API.py @@ -735,7 +735,7 @@ def generate_pandas_frame(self, call = None): if call is None: call = self.query - con = DbConnect(prefs, self.query['database']) + con = DbConnect(self.query['database']) q = Query(call).query() logging.debug("Preparing to execute {}".format(q)) df = read_sql(q, con.db) diff --git a/bookwormDB/mariaDB.py b/bookwormDB/mariaDB.py index 9742ca9..635af99 100644 --- a/bookwormDB/mariaDB.py +++ b/bookwormDB/mariaDB.py @@ -20,8 +20,7 @@ class DbConnect(object): # This is a read-only account - def __init__(self, database=None, - host=None): + def __init__(self, database=None, host=None): self.dbname = database @@ -40,7 +39,8 @@ def __init__(self, database=None, "host": conf.get("client", "host") } - logging.info("Preparing to connect with args", connargs) + logging.warning("Preparing to connect with args") + logging.warning(connargs) self.db = MySQLdb.connect(**connargs) self.cursor = self.db.cursor() From 6165481f9b0f28b5c7211992926a4e713e1cb23b Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Mon, 19 Oct 2020 13:27:31 -0400 Subject: [PATCH 07/41] Allow password files --- bookwormDB/configuration.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/bookwormDB/configuration.py b/bookwormDB/configuration.py index e32cc27..aafe61a 100644 --- a/bookwormDB/configuration.py +++ b/bookwormDB/configuration.py @@ -136,20 +136,25 @@ def __init__(self, usertype, possible_locations=None, default=None, ask_about_de def read_config_files(self, used_files): - - try: - self.config.read(used_files) - except configparser.MissingSectionHeaderError: - """ - Some files throw this error if you have an empty - my.cnf. This throws those out of the list, and tries again. 
- """ - for file in used_files: + for file in used_files: + try: + self.config.read(file) try: - self.config.read(file) - except configparser.MissingSectionHeaderError: - used_files.remove(file) - successes = self.config.read(used_files) + password_file = self.config.get("client", "password_file") + except configparser.NoOptionError: + password_file = None + if password_file: + try: + with open(password_file) as fin: + password = fin.read().rstrip("\n").rstrip("\r") + self.config.set("client", "password", password) + except: + logging.error(f"Error reading passworm from {password_file}") + raise + self.config.remove_option("client", "password_file") + except configparser.MissingSectionHeaderError: + # Not every file needs every section. + pass def default_locations_from_type(self,usertype): From 89ab79a27908ee8628aa72970167584fddc9b4c0 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Tue, 2 Feb 2021 21:58:53 -0500 Subject: [PATCH 08/41] Better field name validation --- bookwormDB/MetaParser.py | 94 +++++++++++++++++++++------- bookwormDB/variableSet.py | 128 ++++++++++++-------------------------- 2 files changed, 113 insertions(+), 109 deletions(-) diff --git a/bookwormDB/MetaParser.py b/bookwormDB/MetaParser.py index d3d20dc..464e8ff 100644 --- a/bookwormDB/MetaParser.py +++ b/bookwormDB/MetaParser.py @@ -18,22 +18,73 @@ def DaysSinceZero(dateobj): #Zero isn't a date, which python knows but MySQL and javascript don't. return (dateobj - date(1,1,1)).days + 366 + +mySQLreservedWords = set(["ACCESSIBLE", "ADD", +"ALL", "ALTER", "ANALYZE", "AND", "AS", "ASC", "ASENSITIVE", "BEFORE", +"BETWEEN", "BIGINT", "BINARY", "BLOB", "BOTH", "BY", "CALL", +"CASCADE", "CASE", "CHANGE", "CHAR", "CHARACTER", "CHECK", "COLLATE", +"COLUMN", "CONDITION", "CONSTRAINT", "CONTINUE", "CONVERT", "CREATE", +"CROSS", "CURRENT_DATE", "CURRENT_TIME", "CURRENT_TIMESTAMP", +"CURRENT_USER", "CURSOR", "DATABASE", "DATABASES", "DAY_HOUR", +"DAY_MICROSECOND", "DAY_MINUTE", "DAY_SECOND", "DEC", "DECIMAL", +"DECLARE", "DEFAULT", "DELAYED", "DELETE", "DESC", "DESCRIBE", +"DETERMINISTIC", "DISTINCT", "DISTINCTROW", "DIV", "DOUBLE", "DROP", +"DUAL", "EACH", "ELSE", "ELSEIF", "ENCLOSED", "ESCAPED", "EXISTS", +"EXIT", "EXPLAIN", "FALSE", "FETCH", "FLOAT", "FLOAT4", "FLOAT8", +"FOR", "FORCE", "FOREIGN", "FROM", "FULLTEXT", "GENERAL", "GRANT", +"GROUP", "HAVING", "HIGH_PRIORITY", "HOUR_MICROSECOND", "HOUR_MINUTE", +"HOUR_SECOND", "IF", "IGNORE", "IGNORE_SERVER_IDS", "IN", "INDEX", +"INFILE", "INNER", "INOUT", "INSENSITIVE", "INSERT", "INT", "INT1", +"INT2", "INT3", "INT4", "INT8", "INTEGER", "INTERVAL", "INTO", "IS", +"ITERATE", "JOIN", "KEY", "KEYS", "KILL", "LEADING", "LEAVE", "LEFT", +"LIKE", "LIMIT", "LINEAR", "LINES", "LOAD", "LOCALTIME", +"LOCALTIMESTAMP", "LOCK", "LONG", "LONGBLOB", "LONGTEXT", "LOOP", +"LOW_PRIORITY", "MASTER_HEARTBEAT_PERIOD[c]", +"MASTER_SSL_VERIFY_SERVER_CERT", "MATCH", "MAXVALUE", "MEDIUMBLOB", +"MEDIUMINT", "MEDIUMTEXT", "MIDDLEINT", "MINUTE_MICROSECOND", +"MINUTE_SECOND", "MOD", "MODIFIES", "NATURAL", "NOT", +"NO_WRITE_TO_BINLOG", "NULL", "NUMERIC", "ON", "OPTIMIZE", "OPTION", +"OPTIONALLY", "OR", "ORDER", "OUT", "OUTER", "OUTFILE", "PRECISION", +"PRIMARY", "PROCEDURE", "PURGE", "RANGE", "READ", "READS", +"READ_WRITE", "REAL", "REFERENCES", "REGEXP", "RELEASE", "RENAME", +"REPEAT", "REPLACE", "REQUIRE", "RESIGNAL", "RESTRICT", "RETURN", +"REVOKE", "RIGHT", "RLIKE", "SCHEMA", "SCHEMAS", "SECOND_MICROSECOND", +"SELECT", "SENSITIVE", "SEPARATOR", "SET", "SHOW", "SIGNAL", +"SLOW[d]", "SMALLINT", "SPATIAL", 
"SPECIFIC", "SQL", "SQLEXCEPTION", +"SQLSTATE", "SQLWARNING", "SQL_BIG_RESULT", "SQL_CALC_FOUND_ROWS", +"SQL_SMALL_RESULT", "SSL", "STARTING", "STRAIGHT_JOIN", "TABLE", +"TERMINATED", "THEN", "TINYBLOB", "TINYINT", "TINYTEXT", "TO", +"TRAILING", "TRIGGER", "TRUE", "UNDO", "UNION", "UNIQUE", "UNLOCK", +"UNSIGNED", "UPDATE", "USAGE", "USE", "USING", "UTC_DATE", "UTC_TIME", +"UTC_TIMESTAMP", "VALUES", "VARBINARY", "VARCHAR", "VARCHARACTER", +"VARYING", "WHEN", "WHERE", "WHILE", "WITH", "WRITE", "XOR", +"YEAR_MONTH", "ZEROFILL", "WORDS", "NWORDS", "WORD", "UNIGRAM"]) + + def ParseFieldDescs(write = False): f = open('field_descriptions.json', 'r') try: fields = json.loads(f.read()) except ValueError: - raise ValueError("Error parsing JSON: Check to make sure that your field_descriptions.json file is valid?") + raise ValueError("Error parsing JSON: Check to make sure that your field_descriptions.json file is valid.") if write: derivedFile = open('.bookworm/metadata/field_descriptions_derived.json', 'w') output = [] - + fields_to_derive = [] - + for field in fields: + if field["field"] in mySQLreservedWords: + raise NameError(f"{field['field']} is a reserved word but appears" + "in field_description.json. Please choose a different name for" + "the column.") + for character in [" ","-", "&","+","."]: + if character in field['field']: + raise NameError(f"{field['field']} contains a special character, please rename") + if field["datatype"] == "time": if "derived" in field: fields_to_derive.append(field) @@ -56,25 +107,26 @@ def ParseFieldDescs(write = False): if write: derivedFile.write(json.dumps(output)) derivedFile.close() - + return (fields_to_derive, fields) + def parse_json_catalog(line_queue, processes, modulo): fields_to_derive, fields = ParseFieldDescs(write = False) - + if os.path.exists("jsoncatalog.txt"): mode = "json" fin = open("jsoncatalog.txt") - + if os.path.exists("catalog.csv"): mode = "csv" import csv - fin = csv.DictReader("catalog.csv") - + fin = csv.DictReader("catalog.csv") + for i, line in enumerate(fin): if i % processes != modulo: continue - + for char in ['\t', '\n']: line = line.replace(char, '') @@ -84,7 +136,7 @@ def parse_json_catalog(line_queue, processes, modulo): except: logging.warn("Couldn't parse catalog line {}".format(line)) continue - + for field in fields: # Smash together misidentified lists try: @@ -92,19 +144,19 @@ def parse_json_catalog(line_queue, processes, modulo): line[field["field"]] = "--".join(line[field["field"]]) except KeyError: pass - + for field in fields_to_derive: - + """ - Using fields_to_derive as a shorthand for dates--this may break + Using fields_to_derive as a shorthand for dates--this may break if we get more ambitious about derived fields, but this whole metadata-parsing code needs to be refactored anyway. - Note: this code is inefficient--it parses the same date multiple times. - We should be parsing the date once and pulling + Note: this code is inefficient--it parses the same date multiple times. + We should be parsing the date once and pulling derived fields out of that one parsing. 
""" - + try: if line[field["field"]]=="": # Use blankness as a proxy for unknown @@ -113,7 +165,7 @@ def parse_json_catalog(line_queue, processes, modulo): time = dateutil.parser.parse(line[field["field"]],default = defaultDate) intent = [time.year,time.month,time.day] content = [str(item) for item in intent] - + pass except: """ @@ -234,7 +286,7 @@ def parse_catalog_multicore(): cpus, _ = mp_stats() encoded_queue = Queue(10000) workers = [] - + for i in range(cpus): p = Process(target = parse_json_catalog, args = (encoded_queue, cpus, i)) p.start() @@ -243,7 +295,7 @@ def parse_catalog_multicore(): bookids = KV(".bookworm/metadata/textids.sqlite") import sqlite3 - + while True: try: filename, n = encoded_queue.get_nowait() @@ -255,7 +307,7 @@ def parse_catalog_multicore(): if filename in ids: logging.warning("Duplicate key insertion {}".format(filename)) ids.add(filename) - + except Empty: if running_processes(workers): # Give it a sec to fill back up to avoid this thread taking up @@ -264,6 +316,6 @@ def parse_catalog_multicore(): else: # We're done! break - + bookids.close() output.close() diff --git a/bookwormDB/variableSet.py b/bookwormDB/variableSet.py index 024704f..b6cf0d4 100644 --- a/bookwormDB/variableSet.py +++ b/bookwormDB/variableSet.py @@ -18,7 +18,7 @@ def to_unicode(obj): return obj def splitMySQLcode(string): - + """ MySQL code can only be executed one command at a time, and fails if it has any empty slots So as a convenience wrapper, I'm just splitting it and returning an array. @@ -122,16 +122,16 @@ def __init__(self, definition, dbToPutIn, anchorType="MEDIUMINT UNSIGNED", ancho def __repr__(self): val = "Data Field '{}'".format(self.field) val += "\n\tdatatype: {}".format(self.datatype) - val += "\n\ttype: {}".format(self.type) - val += "\n\tuniqueness: {}".format(self.unique) + val += "\n\ttype: {}".format(self.type) + val += "\n\tuniqueness: {}".format(self.unique) return val - + def slowSQL(self, withIndex=False): """ This returns something like "author VARCHAR(255)", a small definition string with an index, potentially. """ - + mysqltypes = { "character": "VARCHAR(255)", "integer": "INT", @@ -257,7 +257,7 @@ def fastSQLTable(self,engine="MEMORY"): elif engine=="MEMORY": queries += "INSERT INTO tmp SELECT * FROM {}_; ".format(tname) queries += "DROP TABLE IF EXISTS {}; RENAME TABLE tmp TO {}; ".format(tname,tname) - + if self.datatype == 'categorical' and self.unique: pass @@ -338,7 +338,7 @@ def buildIdTable(self, minimum_occurrence_rate = 1/100000): a 12-byte VARCHAR field takes 5.5 seconds, but a GROUP BY with a 3-byte MEDIUMINT field corresponding exactly to that takes 2.2 seconds on the exact same data. That sort of query is included in every single bookworm - search multiple times, so it's necessary to optimize. + search multiple times, so it's necessary to optimize. Plus, it means we can save space on memory storage in important ways as well. """ @@ -352,10 +352,10 @@ def buildIdTable(self, minimum_occurrence_rate = 1/100000): # XXXX to fix # Hardcoding this for now at one per 100K in the method definition. Could be user-set. 
- n_documents = self.dbToPutIn.query("SELECT COUNT(*) FROM catalog").fetchall()[0][0] + n_documents = self.dbToPutIn.query("SELECT COUNT(*) FROM catalog").fetchall()[0][0] self.minimum_count = round(n_documents*minimum_occurrence_rate) - # XXXX - + # XXXX + returnt +="DELETE FROM tmp WHERE count < %(minimum_count)s;" % self.__dict__ returnt += "DROP TABLE IF EXISTS %(field)s__id;\n\n" % self.__dict__ @@ -390,7 +390,7 @@ def exists(tablename): if self.datatype=="categorical": if exists(self.field+"Lookup"): self.dbToPutIn.query("DELETE FROM " + self.field+"Lookup") - + def updateVariableDescriptionTable(self): self.memoryCode = self.fastLookupTableIfNecessary() code = """DELETE FROM masterVariableTable WHERE dbname="%(field)s"; @@ -412,7 +412,7 @@ def updateVariableDescriptionTable(self): self.dbToPutIn.query(q, (self.field + "heap", parentTab, code)) if self.datatype=="categorical": #Variable Info - + code = """ DELETE FROM masterVariableTable WHERE dbname='%(field)s__id'; INSERT IGNORE INTO masterVariableTable @@ -436,53 +436,9 @@ def updateVariableDescriptionTable(self): # self.dbToPutIn.query(q) q = "INSERT INTO masterTableTable VALUES (%s, %s, %s)" - + self.dbToPutIn.query(q, (self.field+"Lookup", self.fasttab, code)) - - -# Ugh! This could probably be solved just by putting a lot of -# backticks in the code! - -mySQLreservedWords = set(["ACCESSIBLE", "ADD", -"ALL", "ALTER", "ANALYZE", "AND", "AS", "ASC", "ASENSITIVE", "BEFORE", -"BETWEEN", "BIGINT", "BINARY", "BLOB", "BOTH", "BY", "CALL", -"CASCADE", "CASE", "CHANGE", "CHAR", "CHARACTER", "CHECK", "COLLATE", -"COLUMN", "CONDITION", "CONSTRAINT", "CONTINUE", "CONVERT", "CREATE", -"CROSS", "CURRENT_DATE", "CURRENT_TIME", "CURRENT_TIMESTAMP", -"CURRENT_USER", "CURSOR", "DATABASE", "DATABASES", "DAY_HOUR", -"DAY_MICROSECOND", "DAY_MINUTE", "DAY_SECOND", "DEC", "DECIMAL", -"DECLARE", "DEFAULT", "DELAYED", "DELETE", "DESC", "DESCRIBE", -"DETERMINISTIC", "DISTINCT", "DISTINCTROW", "DIV", "DOUBLE", "DROP", -"DUAL", "EACH", "ELSE", "ELSEIF", "ENCLOSED", "ESCAPED", "EXISTS", -"EXIT", "EXPLAIN", "FALSE", "FETCH", "FLOAT", "FLOAT4", "FLOAT8", -"FOR", "FORCE", "FOREIGN", "FROM", "FULLTEXT", "GENERAL", "GRANT", -"GROUP", "HAVING", "HIGH_PRIORITY", "HOUR_MICROSECOND", "HOUR_MINUTE", -"HOUR_SECOND", "IF", "IGNORE", "IGNORE_SERVER_IDS", "IN", "INDEX", -"INFILE", "INNER", "INOUT", "INSENSITIVE", "INSERT", "INT", "INT1", -"INT2", "INT3", "INT4", "INT8", "INTEGER", "INTERVAL", "INTO", "IS", -"ITERATE", "JOIN", "KEY", "KEYS", "KILL", "LEADING", "LEAVE", "LEFT", -"LIKE", "LIMIT", "LINEAR", "LINES", "LOAD", "LOCALTIME", -"LOCALTIMESTAMP", "LOCK", "LONG", "LONGBLOB", "LONGTEXT", "LOOP", -"LOW_PRIORITY", "MASTER_HEARTBEAT_PERIOD[c]", -"MASTER_SSL_VERIFY_SERVER_CERT", "MATCH", "MAXVALUE", "MEDIUMBLOB", -"MEDIUMINT", "MEDIUMTEXT", "MIDDLEINT", "MINUTE_MICROSECOND", -"MINUTE_SECOND", "MOD", "MODIFIES", "NATURAL", "NOT", -"NO_WRITE_TO_BINLOG", "NULL", "NUMERIC", "ON", "OPTIMIZE", "OPTION", -"OPTIONALLY", "OR", "ORDER", "OUT", "OUTER", "OUTFILE", "PRECISION", -"PRIMARY", "PROCEDURE", "PURGE", "RANGE", "READ", "READS", -"READ_WRITE", "REAL", "REFERENCES", "REGEXP", "RELEASE", "RENAME", -"REPEAT", "REPLACE", "REQUIRE", "RESIGNAL", "RESTRICT", "RETURN", -"REVOKE", "RIGHT", "RLIKE", "SCHEMA", "SCHEMAS", "SECOND_MICROSECOND", -"SELECT", "SENSITIVE", "SEPARATOR", "SET", "SHOW", "SIGNAL", -"SLOW[d]", "SMALLINT", "SPATIAL", "SPECIFIC", "SQL", "SQLEXCEPTION", -"SQLSTATE", "SQLWARNING", "SQL_BIG_RESULT", "SQL_CALC_FOUND_ROWS", -"SQL_SMALL_RESULT", "SSL", "STARTING", 
"STRAIGHT_JOIN", "TABLE", -"TERMINATED", "THEN", "TINYBLOB", "TINYINT", "TINYTEXT", "TO", -"TRAILING", "TRIGGER", "TRUE", "UNDO", "UNION", "UNIQUE", "UNLOCK", -"UNSIGNED", "UPDATE", "USAGE", "USE", "USING", "UTC_DATE", "UTC_TIME", -"UTC_TIMESTAMP", "VALUES", "VARBINARY", "VARCHAR", "VARCHARACTER", -"VARYING", "WHEN", "WHERE", "WHILE", "WITH", "WRITE", "XOR", -"YEAR_MONTH", "ZEROFILL", "WORDS", "NWORDS", "WORD", "UNIGRAM"]) + class variableSet(object): def __init__(self, @@ -495,7 +451,7 @@ def __init__(self, self.originFile=originFile self.jsonDefinition=jsonDefinition logging.debug(jsonDefinition) - + if jsonDefinition==None: logging.warning("No field_descriptions.json file provided, so guessing based " "on variable names.") @@ -512,18 +468,14 @@ def __init__(self, for item in self.jsonDefinition: #The anchor field has special methods hard coded in. - + if item['field'] == self.anchorField: continue - if item['field'].upper() in mySQLreservedWords: - logging.warning(item['field'] + """ is a reserved word in MySQL, so can't be used as a Bookworm field name: skipping it for now, but you probably want to rename it to something different""") - item['field'] = item['field'] + "___" - continue self.variables.append(dataField(item,self.db,anchor=anchorField,table=self.tableName,fasttab=self.fastName)) def __repr__(self): return "A variable set of {} objects".format(len(self.variables)) - + def setTableNames(self): """ For the base case, they're catalog and fastcat: otherwise, it's just they key @@ -532,7 +484,7 @@ def setTableNames(self): if os.path.split(self.originFile)[-1] == 'jsoncatalog_derived.txt': self.tableName = "catalog" self.fastName = "fastcat" - + else: try: self.tableName = self.jsonDefinition[0]['field'] + "_" + self.jsonDefinition[1]['field'] @@ -546,14 +498,14 @@ def setTableNames(self): def guessAtFieldDescriptions(self,stopAfter=30000): allMyKeys = dict() unique = True - + for i, line in enumerate(open(self.originFile)): try: entry = json.loads(line.rstrip("\n")) except: logging.warning("Error in line {} of {}".format(i, self.originFile)) logging.warning(line) - + for key in entry: if type(entry[key])==list: unique=False @@ -575,11 +527,11 @@ def guessAtFieldDescriptions(self,stopAfter=30000): myOutput = [] for metadata in allMyKeys: - + bestGuess = guessBasedOnNameAndContents(metadata,allMyKeys[metadata]) if unique==False: bestGuess['unique'] = False - + myOutput.append(bestGuess) myOutput = [output for output in myOutput if output["field"] != "filename"] @@ -597,7 +549,7 @@ def uniques(self,type="base"): return [variable for variable in self.variables if (variable.unique and variable.fastSQL() is not None)] if type=="categorical": return [variable for variable in self.variables if (variable.unique and variable.fastSQL() is not None and variable.datatype=="categorical")] - + def notUniques(self): return [variable for variable in self.variables if not variable.unique] @@ -605,11 +557,11 @@ def anchorLookupDictionary(self): db = self.db anchor = self.anchorField self.fastAnchor = self.anchorField - + if anchor == "bookid" and self.tableName != "catalog": self.fastAnchor="bookid" bookids = DummyDict() - + elif anchor=="filename" or anchor=="bookid": self.fastAnchor = "bookid" bookids = dict() @@ -681,13 +633,13 @@ def writeMetadata(self,limit=float("Inf")): variable.output = open(variable.outputloc, 'w') for entry in metadatafile: - + try: entry = json.loads(entry) except: logging.warning("""WARNING: json parsing failed for this JSON line: 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n""" + entry) - + continue #We always lead with the bookid and the filename. @@ -708,13 +660,13 @@ def writeMetadata(self,limit=float("Inf")): #If the key isn't in the name table, we have no use for this entry. continue mainfields = [str(bookid),to_unicode(entry[self.anchorField])] - + if self.tableName != "catalog": #It can get problematic to have them both, so we're just writing over the #anchorField here. mainfields = [str(bookid)] # First, pull the unique variables and write them to the 'catalog' table - + for var in [variable for variable in variables if variable.unique]: if var.field not in [self.anchorField,self.fastAnchor]: myfield = entry.get(var.field, "") @@ -726,7 +678,7 @@ def writeMetadata(self,limit=float("Inf")): catalog.write(catalogtext) except TypeError: catalog.write(catalogtext) - + for variable in [variable for variable in variables if not variable.unique]: # Each of these has a different file it must write to... outfile = variable.output @@ -771,7 +723,7 @@ def loadMetadata(self): for variable in self.uniques(): createstring = variable.slowSQL(withIndex=True) mysqlfields.append(createstring) - + if len(mysqlfields) > 1: #This creates the main (slow) catalog table db.query("""DROP TABLE IF EXISTS %s """ % self.tableName) @@ -787,10 +739,10 @@ def loadMetadata(self): db.query("ALTER TABLE %s DISABLE KEYS" % self.tableName) logging.info("loading data into %s using LOAD DATA LOCAL INFILE..." % self.tableName) anchorFields = self.fastAnchor - + if self.tableName=="catalog": anchorFields = "bookid,filename" - + loadEntries = { "catLoc": self.catalogLocation, "tabName": self.tableName, @@ -803,7 +755,7 @@ def loadMetadata(self): loadcode = """LOAD DATA LOCAL INFILE '%(catLoc)s' INTO TABLE %(tabName)s FIELDS ESCAPED BY '' (%(loadingFields)s)""" % loadEntries - + db.query(loadcode) logging.info("enabling keys on %s" %self.tableName) db.query("ALTER TABLE %s ENABLE KEYS" % self.tableName) @@ -820,7 +772,7 @@ def loadMetadata(self): for variable in self.variables: if variable.datatype=="categorical": variable.build_ID_and_lookup_tables() - + if len(self.uniques()) > 0 and self.tableName!="catalog": #catalog has separate rules handled in CreateDatabase.py. 
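            # Build the disk-backed MYISAM copy at load time; the MEMORY-engine
            # copies are only rebuilt on demand via `bookworm reload_memory`.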
fileCommand = self.uniqueVariableFastSetup("MYISAM") @@ -834,9 +786,9 @@ def uniqueVariableFastSetup(self,engine="MEMORY"): ) fileCommand += ",\n".join([variable.fastSQL() for variable in self.variables if (variable.unique and variable.fastSQL() is not None)]) fileCommand += ") ENGINE=%s;\n" % engine - + fast_fields = self.fastAnchor + ", " + ",".join([variable.fastField for variable in self.variables if variable.unique and variable.fastSQL() is not None]) - + fileCommand += "INSERT INTO tmp SELECT " + fast_fields fileCommand += " FROM %s " % self.tableName fileCommand += " ".join([" JOIN %(field)s__id USING (%(field)s ) " % variable.__dict__ for variable in self.variables if variable.unique and variable.fastSQL() is not None and variable.datatype=="categorical"])+ ";\n" @@ -848,7 +800,7 @@ def uniqueVariableFastSetup(self,engine="MEMORY"): fileCommand += "RENAME TABLE tmp TO %s;\n" % name return fileCommand - + def updateMasterVariableTable(self): """ All the categorical variables get a lookup table; @@ -878,7 +830,7 @@ def updateMasterVariableTable(self): raise self.db.query('DELETE FROM masterTableTable WHERE masterTableTable.tablename="%s";' %self.fastName) self.db.query("INSERT INTO masterTableTable VALUES (%s, %s, %s)", (self.fastName,parentTab,escape_string(fileCommand))) - + def createNwordsFile(self): """ A necessary supplement to the `catalog` table. @@ -900,4 +852,4 @@ class DummyDict(dict): """ # we need to have it there. def __missing__(self,key): - return key + return key From 372a39251de81217c8b291d560e5abee9b81e555 Mon Sep 17 00:00:00 2001 From: Benjamin Schmidt Date: Wed, 3 Feb 2021 11:20:53 -0500 Subject: [PATCH 09/41] Aesthetic changes, document pre-tokenized data --- README.md | 18 ++++++++++++++++++ bookwormDB/configuration.py | 2 +- bookwormDB/countManager.py | 16 ++++++++++++---- bookwormDB/general_API.py | 23 +++++++++++++++++++++++ bookwormDB/tokenizer.py | 5 +++-- tests/test_mysql.py | 2 +- 6 files changed, 58 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index b573b02..4fa14b5 100755 --- a/README.md +++ b/README.md @@ -238,6 +238,24 @@ Once this works, you can use various libraries to query the endpoint, or create an HTML page that builds off the endpoint. See the (currently underdeveloped) Bookworm-Vega repository for some examples. +## Pre-tokenized data. + +If you're using data that's already been tokenized, it can be ingested +by using a different file than 'input.txt' or 'input.txt.gz'. + +``` +bookworm --feature-counts unigrams.txt --feature-counts bigrams.txt build all +``` + +The format for `unigrams.txt` is a little wonky. It should consist of one row +per document. The first element is the identifier, followed by a tab. The next element +should be a CSV file that uses the formfeed character (`\f`) instead of the newline +to separate records. 
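+
+As a rough sketch (this is not a bundled helper script, and the exact CSV
+quoting rules should be checked against the tokenizer), a file in that layout
+could be written from a hypothetical `doc_counts` dictionary of per-document
+word counts like this:
+
+```
+# Hypothetical already-tokenized corpus: {document_id: {word: count}}.
+# The ids must match the identifiers used in your catalog.
+doc_counts = {
+    "doc1": {"whale": 12, "sea": 7},
+    "doc2": {"sea": 3, "ship": 5},
+}
+
+with open("unigrams.txt", "w") as fout:
+    for doc_id, counts in doc_counts.items():
+        # One document per line: the id, a tab, then word,count records
+        # separated by formfeed characters instead of newlines.
+        records = "\f".join(f"{word},{n}" for word, n in counts.items())
+        fout.write(f"{doc_id}\t{records}\n")
+```
+
+A bigrams file presumably follows the same layout, with the two words of each
+bigram separated by a single space inside the word field. The schematic form
+of each line is shown below.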
+ +``` +id\t{word,count csv} + +``` ## Production servers diff --git a/bookwormDB/configuration.py b/bookwormDB/configuration.py index aafe61a..bb54a36 100644 --- a/bookwormDB/configuration.py +++ b/bookwormDB/configuration.py @@ -98,7 +98,7 @@ def __init__(self, usertype, possible_locations=None, default=None, ask_about_de self.ask_about_defaults = ask_about_defaults - logging.info("Creating configuration as " + usertype) + logging.debug("Creating configuration as " + usertype) self.usertype = usertype diff --git a/bookwormDB/countManager.py b/bookwormDB/countManager.py index 0e87b4f..890fc4b 100644 --- a/bookwormDB/countManager.py +++ b/bookwormDB/countManager.py @@ -61,18 +61,18 @@ def counter(qout, i, fin, mode = "count"): datatype = "raw" count_signals = [".unigrams", ".bigrams", ".trigrams", ".quadgrams"] + logging.info(f"fin is {fin}") for signal in count_signals: if signal in fin: datatype = signal.strip(".") if mode == "encode": - encoder = tokenBatches([datatype]) - + encoder = tokenBatches([datatype]) + logging.info(f"Worker counting with type {datatype}") if (fin.endswith(".gz")): fin = gzip.open(fin, 'rt') else: fin = open(fin) - - + for ii, row in enumerate(fin): if ii % cpus != i: # Don't do anything on most lines. @@ -112,6 +112,12 @@ def counter(qout, i, fin, mode = "count"): encoder.close() def create_counts(input): + + """ + The first step of wordcounting is done on a worker--then those + counts are shipped here to a bounter object that counts approximately. + """ + qout = Queue(cpus * 2) workers = [] logging.info("Spawning {} count processes on {}".format(cpus, input)) @@ -152,6 +158,8 @@ def create_wordlist(n, input, output): counter = create_counts(input) counter = sorted(list(counter.iteritems()), key = lambda x: -1 * x[1]) output = open(output, "w") + logging.info(f"Created wordlist from {input}") + logging.info(f"top 10 words are {[c for c in counter[:10]]}") for i, (k, v) in enumerate(counter): output.write("{}\t{}\t{}\n".format(i, k, v)) if i >= n: diff --git a/bookwormDB/general_API.py b/bookwormDB/general_API.py index 26e018b..550f252 100644 --- a/bookwormDB/general_API.py +++ b/bookwormDB/general_API.py @@ -707,6 +707,29 @@ def generate_pandas_frame(self, call = None): df = read_sql(q, con.db) return df +class MetaAPIcall(APIcall): + def __init__(self, endpoints): + self.endpoints = endpoints + + def connect(self, endpoint): + # return some type of a connection. 
+ pass + + def generate_pandas_frame(self, call): + if call is None: + call = deepcopy(self.query) + call['format'] = 'feather' + for endpoint in self.endpoints: + connection = self.connect(endpoint) + d = connection.query(call) + count_fields = [] + + for field in ['WordCount', 'TextCount']: + if field in call["counttype"]: + count_fields.push(field) + together = pd.concat(d) + together[count_fields].sum() + class SQLAPIcall(APIcall): """ To make a new backend for the API, you just need to extend the base API diff --git a/bookwormDB/tokenizer.py b/bookwormDB/tokenizer.py index fba0bfc..51910fa 100644 --- a/bookwormDB/tokenizer.py +++ b/bookwormDB/tokenizer.py @@ -124,6 +124,7 @@ def encodeRow(self, IDfile = self.IDfile levels = None + """ if source=="raw_text": parts = row.split("\t", 1) @@ -143,7 +144,7 @@ def encodeRow(self, raise tokens = preTokenized(token, count, self.levels[0]) """ - + try: textid = IDfile[filename] except KeyError: @@ -291,7 +292,7 @@ def __init__(self, csv_string, level): else: self.output = dict(zip([tuple(w.split(" ")) for w in f.word], f.counts)) - def counts(self,level): + def counts(self, level): if level != self.level: raise return self.output diff --git a/tests/test_mysql.py b/tests/test_mysql.py index ffd0d62..89f49a1 100644 --- a/tests/test_mysql.py +++ b/tests/test_mysql.py @@ -36,7 +36,7 @@ def test_config_files(self): def test_config_file(conf): user = conf.config.get("client","user") pw = conf.config.get("client","password") - return (user,pw) + return (user, pw) global_configuration_file = Configfile("read_only") admin_configuration_file = Configfile("admin") From e69e9bafe18c70e5716c8fccb351d764940c72f6 Mon Sep 17 00:00:00 2001 From: Benjamin Schmidt Date: Wed, 3 Feb 2021 11:31:33 -0500 Subject: [PATCH 10/41] More informative error message --- bookwormDB/variableSet.py | 58 ++++----------------------------------- 1 file changed, 6 insertions(+), 52 deletions(-) diff --git a/bookwormDB/variableSet.py b/bookwormDB/variableSet.py index 024704f..e9191c2 100644 --- a/bookwormDB/variableSet.py +++ b/bookwormDB/variableSet.py @@ -226,7 +226,12 @@ def fastLookupTableIfNecessary(self, engine="MEMORY"): self.setIntType() self.maxlength = self.dbToPutIn.query("SELECT MAX(CHAR_LENGTH(%(field)s)) FROM %(field)s__id" % self.__dict__) self.maxlength = self.maxlength.fetchall()[0][0] - self.maxlength = max([self.maxlength,1]) + try: + self.maxlength = max([self.maxlength,1]) + except TypeError: + logging.error(f"Unable to calculate length for {field}" + "perhaps there are no entries in the catalog?") + raise code = """DROP TABLE IF EXISTS tmp; CREATE TABLE tmp (%(field)s__id %(intType)s ,PRIMARY KEY (%(field)s__id), %(field)s VARCHAR (%(maxlength)s) ) ENGINE=%(engine)s @@ -263,57 +268,6 @@ def fastSQLTable(self,engine="MEMORY"): return queries - def jsonDict(self): - """ - DEPRECATED!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - #This builds a JSON dictionary that can be loaded into outside - bookworm in the "options.json" file. - It's a bad design decision; newer version - just load this directly from the database. - """ - mydict = dict() - #It gets confusingly named: "type" is the key for real name ("time", "categorical" in the json), but also the mysql key ('character','integer') here. That would require renaming code in a couple places. 
- mydict['type'] = self.datatype - mydict['dbfield'] = self.field - try: - mydict['name'] = self.name - except: - mydict['name'] = self.field - if self.datatype == "etc" or self.type == "text": - return dict() #(Some things don't go into the fast settings because they'd take too long) - if self.datatype == "time": - mydict['unit'] = self.field - #default to the full min and max date ranges - #times may not be zero or negative - cursor = self.dbToPutIn.query("SELECT MIN(" + self.field + "), MAX(" + self.field + ") FROM catalog WHERE " + self.field + " > 0 ") - results = cursor.fetchall()[0] - mydict['range'] = [results[0], results[1]] - mydict['initial'] = [results[0], results[1]] - - if self.datatype == "categorical": - mydict['dbfield'] = self.field + "__id" - #Find all the variables used more than 20 times from the database, and build them into something json-usable. - cursor = self.dbToPutIn.query("SELECT %(field)s, %(field)s__id FROM %(field)s__id WHERE %(field)s__count > 20 ORDER BY %(field)s__id ASC LIMIT 500;" % self.__dict__) - sort_order = [] - descriptions = dict() - for row in cursor.fetchall(): - code = row[1] - name = row[0] - code = to_unicode(code) - sort_order.append(code) - descriptions[code] = dict() - """ - These three things all have slightly different meanings: - the english name, the database code for that name, and the short display name to show. - It would be worth allowing lookup files for these: for now, they are what they are and can be further improved by hand. - """ - descriptions[code]["dbcode"] = code - descriptions[code]["name"] = name - descriptions[code]["shortname"] = name - mydict["categorical"] = {"descriptions": descriptions, "sort_order": sort_order} - - return mydict - def setIntType(self): try: alreadyExists = self.intType From f81108ebf9dbd2f05a3d79e2bed9fd8d04829e94 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Wed, 3 Feb 2021 11:36:19 -0500 Subject: [PATCH 11/41] Restore folder ingest of multiple txt files; Housekeeping --- bookwormDB/CreateDatabase.py | 68 ++------------------- bookwormDB/MetaParser.py | 3 +- bookwormDB/countManager.py | 111 +++++++++++++++++++++++------------ bookwormDB/manager.py | 78 +++++++++--------------- bookwormDB/tokenizer.py | 7 ++- 5 files changed, 115 insertions(+), 152 deletions(-) diff --git a/bookwormDB/CreateDatabase.py b/bookwormDB/CreateDatabase.py index 794b9bc..704700c 100755 --- a/bookwormDB/CreateDatabase.py +++ b/bookwormDB/CreateDatabase.py @@ -13,10 +13,6 @@ import warnings from .sqliteKV import KV -#if logging.getLogger().isEnabledFor(logging.DEBUG): - # Catch MYSQL warnings as errors if logging is set to debug. 
-# warnings.filterwarnings('error', category=MySQLdb.Warning) # For testing - warnings.filterwarnings('ignore', 'Table .* already exists') warnings.filterwarnings("ignore", ".*Can't create database.*; database exists.*") warnings.filterwarnings("ignore", ".*Unknown table.*") @@ -216,6 +212,8 @@ def create_database(self): def load_word_list(self): db = self.db + if db is None: + raise AttributeError("No database connection defined--are you running Bookworm without a configuration file or naming the bookworm like `bookworm -d my_bookworm build all`?") logging.info("Making a SQL table to hold the words") db.query("""DROP TABLE IF EXISTS words""") db.query("""CREATE TABLE IF NOT EXISTS words ( @@ -487,11 +485,12 @@ def fastcat_creation_SQL(self, engine="MEMORY"): ", ".join(fastFieldsCreateList), engine) if engine == "MYISAM": - fastFields = ["bookid","nwords"] + [variable.fastField for variable in self.variableSet.uniques("fast")] + fastFields = ["bookid", "nwords"] + [variable.fastField for variable in self.variableSet.uniques("fast")] load_command = "INSERT INTO tmp SELECT " load_command += ",".join(fastFields) + " FROM catalog USE INDEX () " # LEFT JOIN fixes a bug where fields were being dropped - load_command += " ".join(["LEFT JOIN %(field)s__id USING (%(field)s ) " % variable.__dict__ for variable in self.variableSet.uniques("categorical")]) + ";" + load_command += " ".join(["LEFT JOIN %(field)s__id USING (%(field)s )" % variable.__dict__ for variable in self.variableSet.uniques("categorical")]) + load_command += " WHERE nwords IS NOT NULL;" elif engine == "MEMORY": load_command = "INSERT INTO tmp SELECT * FROM fastcat_;" @@ -539,63 +538,6 @@ def addWordsToMasterVariableTable(self, max_word_length = 30, max_words = 150000 logging.info("Creating wordsheap") self.db.query(query) - def jsonify_data(self): - variables = self.variableSet.variables - dbname = self.dbname - #This creates a JSON file compliant with the Bookworm web site. - #Deprecated. - output = dict() - output['settings'] = { - "dbname": self.dbname, - "itemName":" text", - "sourceName": self.dbname, - "sourceURL": self.dbname - } - ui_components = [ - { - "type":"text", - "dbfield":"word", - "name":"Word(s)" - } - ] - for variable in variables: - newdict = variable.jsonDict() - if newdict: #(It can be empty, in which case we don't want it for the json) - ui_components.append(newdict) - try: - mytime = [variable.field for variable in variables if variable.datatype=='time'][0] - output['default_search'] = [ - { - "search_limits": [{"word":["test"]}], - "time_measure": mytime, - "words_collation": "Case_Sensitive", - "counttype": "Occurrences_per_Million_Words", - "smoothingSpan": 0 - } - ] - except: - logging.warning("No default search created because of insufficient data.") - output['ui_components'] = ui_components - - with open('.bookworm/%s.json' % dbname, 'w') as outfile: - outfile.write(json.dumps(output)) - - def create_API_settings(self): - db = self.db - try: - db.query("DROP TABLE IF EXISTS API_settings") - db.query("CREATE TABLE API_settings (settings VARCHAR(8192));") - except: - pass - api_info = { - "HOST": "10.102.15.45", - "database": self.dbname, - "read_default_file": "/etc/mysql/my.cnf", - } - addCode = json.dumps(api_info) - logging.info(addCode) - db.query("INSERT INTO API_settings VALUES ('%s');" % addCode) - def update_Porter_stemming(self): #We use stems occasionally. """ Still not executed. 
diff --git a/bookwormDB/MetaParser.py b/bookwormDB/MetaParser.py index 464e8ff..0c56014 100644 --- a/bookwormDB/MetaParser.py +++ b/bookwormDB/MetaParser.py @@ -68,7 +68,6 @@ def ParseFieldDescs(write = False): except ValueError: raise ValueError("Error parsing JSON: Check to make sure that your field_descriptions.json file is valid.") - if write: derivedFile = open('.bookworm/metadata/field_descriptions_derived.json', 'w') @@ -77,7 +76,7 @@ def ParseFieldDescs(write = False): fields_to_derive = [] for field in fields: - if field["field"] in mySQLreservedWords: + if field["field"].upper() in mySQLreservedWords: raise NameError(f"{field['field']} is a reserved word but appears" "in field_description.json. Please choose a different name for" "the column.") diff --git a/bookwormDB/countManager.py b/bookwormDB/countManager.py index 0e87b4f..857739d 100644 --- a/bookwormDB/countManager.py +++ b/bookwormDB/countManager.py @@ -12,6 +12,9 @@ import fileinput import time import csv +from pathlib import Path +import gzip +import hashlib cpus, memory = mp_stats() @@ -39,7 +42,7 @@ def flush_counter(counter, qout): except KeyError: continue qout.put(counter) - + def counter(qout, i, fin, mode = "count"): """ # Counts words exactly in a separate process. @@ -50,40 +53,27 @@ def counter(qout, i, fin, mode = "count"): totals = 0 errors = 0 - + if mode == "count": counter = Counter() encoder = tokenBatches(['words']) - + if mode == "encode": encoder = tokenBatches(['unigrams', 'bigrams']) - + datatype = "raw" - + count_signals = [".unigrams", ".bigrams", ".trigrams", ".quadgrams"] for signal in count_signals: if signal in fin: datatype = signal.strip(".") if mode == "encode": - encoder = tokenBatches([datatype]) - - if (fin.endswith(".gz")): - fin = gzip.open(fin, 'rt') - else: - fin = open(fin) + encoder = tokenBatches([datatype]) + + + + for id, text in yield_texts(fin, i): - - for ii, row in enumerate(fin): - if ii % cpus != i: - # Don't do anything on most lines. - continue - totals += 1 - try: - (filename, text) = row.rstrip().split("\t",1) - except ValueError: - errors += 1 - continue - if datatype == "raw": tokenizer = Tokenizer(text) else: @@ -91,12 +81,12 @@ def counter(qout, i, fin, mode = "count"): # When encoding if mode == "encode": - encoder.encodeRow(filename, tokenizer, write_completed = True) + encoder.encodeRow(id, tokenizer, write_completed = True) continue - + # When building counts counter.update(tokenizer.counts("words")) - + # When the counter is long, post it to the master and clear it. if len(counter) > QUEUE_POST_THRESH: flush_counter(counter=counter, qout = qout) @@ -106,11 +96,58 @@ def counter(qout, i, fin, mode = "count"): if mode == "count": logging.debug("Flushing leftover counts from thread {}".format(i)) flush_counter(counter=counter, qout = qout) - if totals > 0 and errors/totals > 0.01: - logging.warning("Skipped {} rows without tabs".format(errors)) if mode == "encode": encoder.close() +def yield_texts(fname, i): + p = Path(fname) + if p.is_dir(): + for id, text in yield_texts_from_directory(p, i): + yield (id, text) + else: + for id, text in yield_lines_from_single_file(p, i): + yield (id, text) + + +def yield_texts_from_directory(dir, i): + for file in dir.glob('**/*.txt*'): + # Strips _djvu for Internet Archive. 
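+        # (str.rstrip removes a trailing *set of characters*, not a literal suffix,
+        # so ids ending in characters from ".gz", ".txt" or "_djvu" can be over-trimmed.)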
+ basename = file.name.rstrip(".gz").rstrip(".txt").rstrip("_djvu") + # Use sha256 + key = int(hashlib.md5(basename.encode('utf-8')).hexdigest(), 16) + if key % cpus != i: + continue + if file.name.endswith(".txt.gz"): + fin = gzip.open(file) + elif file.name.endswith(".txt"): + fin = open(file) + else: + logging.error(f"Can't handle file {file}") + yield (basename, fin.read().replace("\t", "\f").replace("\n", "\f")) + +def yield_lines_from_single_file(fname, i, cpus): + if (fname.endswith(".gz")): + fin = gzip.open(fin, 'rt') + else: + fin = open(fin) + totals = 0 + errors = 0 + for ii, row in enumerate(fin): + if ii % cpus != i: + # Don't do anything on most lines. + continue + + totals += 1 + try: + (filename, text) = row.rstrip().split("\t",1) + except ValueError: + errors += 1 + continue + yield (filename, text) + if totals > 0 and errors/totals > 0.01: + logging.warning("Skipped {} rows without tabs".format(errors)) + + def create_counts(input): qout = Queue(cpus * 2) workers = [] @@ -121,14 +158,14 @@ def create_counts(input): workers.append(p) wordcounter = bounter.bounter(memory) - + while True: - + try: input_dict = qout.get_nowait() logging.debug("inputting queue of length {} from worker".format(len(input_dict))) wordcounter.update(input_dict) - + except queue.Empty: if running_processes(workers): time.sleep(1/100) @@ -136,19 +173,19 @@ def create_counts(input): break except ValueError: for k, v in input_dict.items(): - print("'{}'\t'{}'".format(k, v)) + print("'{}'\t'{}'".format(k, v)) wordcounter.update({k: v}) raise except TypeError: for k, v in input_dict.items(): - print("'{}'\t'{}'".format(k, v)) + print("'{}'\t'{}'".format(k, v)) wordcounter.update({k: v}) raise - + return wordcounter def create_wordlist(n, input, output): - + counter = create_counts(input) counter = sorted(list(counter.iteritems()), key = lambda x: -1 * x[1]) output = open(output, "w") @@ -156,8 +193,8 @@ def create_wordlist(n, input, output): output.write("{}\t{}\t{}\n".format(i, k, v)) if i >= n: break - -def encode_words(wordlist, input = "input.txt"): + +def encode_words(wordlist, input): qout = Queue(cpus * 2) workers = [] diff --git a/bookwormDB/manager.py b/bookwormDB/manager.py index c66ccbd..2a4d9d8 100644 --- a/bookwormDB/manager.py +++ b/bookwormDB/manager.py @@ -80,26 +80,6 @@ def config(self,args): from bookwormDB.configuration import apache apache() - def ftokenize(self, args): - - import bookwormDB.tokenizer - - """ - Handle functions related to tokenization and encoding. - - Should eventually be able to accept arguments like "token-regex" - and already-tokenized documents. - """ - - if args.process == "encode": - self.encoded(args) - - if args.process == "text_stream" or args.process == "token_stream": - raise NotImplementedError("This feature has been removed") - - if args.process == "word_db": - self.wordlist(args) - def init(self, args): """ Initialize the current directory as a bookworm directory. 
@@ -227,7 +207,7 @@ def wordlist(self, args): except FileExistsError: pass - input = "input.txt" + input = args.input if args.feature_counts: logging.info(args.feature_counts) input = [a for a in args.feature_counts if 'unigrams' in a][0] @@ -261,7 +241,7 @@ def encoded(self, args): for feature in args.feature_counts: encode_words(".bookworm/texts/wordlist/wordlist.txt", feature) else: - encode_words(".bookworm/texts/wordlist/wordlist.txt", "input.txt") + encode_words(".bookworm/texts/wordlist/wordlist.txt", args.input) def all(self, args): self.preDatabaseMetadata(args) @@ -272,7 +252,10 @@ def all(self, args): def preDatabaseMetadata(self, args=None, **kwargs): import os if not os.path.exists("field_descriptions.json"): - self.guessAtFieldDescriptions() + if os.path.exists("field_descriptions.csv"): + self.field_descriptions_from_csv() + else: + self.guess_field_descriptions() self.derived_catalog(args) import bookwormDB.CreateDatabase # Doesn't need a created database yet, just needs access @@ -297,7 +280,14 @@ def derived_catalog(self, args): logging.debug("Preparing to write catalog") parse_catalog_multicore() - def guessAtFieldDescriptions(self, args = None, **kwargs): + def field_descriptions_from_csv(self): + import pandas as pd + import json + jsonified = pd.read_csv("field_descriptions.csv").to_json(orient="records") + with open("field_descriptions.json", "w") as fout: + fout.write(jsonified) + + def guess_field_descriptions(self, args = None, **kwargs): """ Use a number of rules of thumb to automatically generate a field_descriptions.json file. @@ -307,20 +297,17 @@ def guessAtFieldDescriptions(self, args = None, **kwargs): import bookwormDB.CreateDatabase import json + import os + import pandas as pd Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname, variableFile=None) Bookworm.setVariables("jsoncatalog.txt", jsonDefinition=None) - import os - if not os.path.exists("field_descriptions.json"): - output = open("field_descriptions.json","w") - guess = json.dumps(Bookworm.variableSet.guessAtFieldDescriptions(), indent = 2) - logging.warning("Creating guess for field descriptions at: {}".format(guess)) - output.write(guess) - else: - logging.error(""" - You already have a file at field_descriptions.json - Dying rather than overwrite it. - """) - sys.exit() + guess = Bookworm.variableSet.guessAtFieldDescriptions() + guess = pd.DataFrame(guess) + guess.to_csv("field_descriptions.csv", index = False) + raise FileNotFoundError("No field descriptions file found." + "Creating guess for field descriptions at: field_descriptions.csv." + "You should probably inspect and edit this file before you build." + "But if you suspect it's right, you can rebuild again immediately.") def reload_memory(self,args): import bookwormDB.CreateDatabase @@ -353,19 +340,8 @@ def database_metadata(self, args): Bookworm.loadVariableDescriptionsIntoDatabase() - Bookworm.create_fastcat_and_wordsheap_disk_tables() - # The temporary memory tables are no longer automatically created on a build. - # To create them, use `bookworm reload_memory`. - # Bookworm.reloadMemoryTables() - - #print "adding cron job to automatically reload memory tables on launch" - #print "(this assumes this machine is the MySQL server, which need not be the case)" - #call(["sh","scripts/scheduleCronJob.sh"]) - Bookworm.jsonify_data() # Create the self.dbname.json file in the root directory. 
- Bookworm.create_API_settings() - Bookworm.grantPrivileges() def add_metadata(self, args): @@ -469,6 +445,12 @@ def run_arguments(): parser.add_argument("--log-level","-l", help="The logging detail to use for errors. Default is 'warning', only significant problems; info gives a fuller record, and 'debug' dumps many MySQL queries, etc.",choices=["warning","info","debug"],type=str.lower,default="warning") + parser.add_argument("--input", "-i", + help = "The location of texts for an initial build." + "Either a text file ('input.txt' or 'input.txt.gz')" + "or a folder containing txt or txt.gz files, which may be nested" + "inside other directories", default = "input.txt") + parser.add_argument("--feature-counts", action='append', help="Use pre-calculated feature counts rather than tokenizing complete text on the fly. Supply any number of single files per count level like 'input.unigrams', 'input.bigrams', etc.") @@ -479,8 +461,6 @@ def run_arguments(): # Use subparsers to have an action syntax, like git. subparsers = parser.add_subparsers(title="action", help='The commands to run with Bookworm', dest="action") - - ############# build ################# build_parser = subparsers.add_parser("build",description = "Create files",help="""Build up the component parts of a Bookworm.\ diff --git a/bookwormDB/tokenizer.py b/bookwormDB/tokenizer.py index fba0bfc..ff8b126 100644 --- a/bookwormDB/tokenizer.py +++ b/bookwormDB/tokenizer.py @@ -50,7 +50,12 @@ def readDictionaryFile(prefix=""): look = dict() for line in open(prefix + ".bookworm/texts/wordlist/wordlist.txt"): line = line.rstrip("\n") - v, k, _ = line.split("\t") + try: + v, k, _ = line.split("\t") + except ValueError: + print(line) + print([look.keys()][:10]) + raise look[k] = v return look From 75d18b697ebbee60ef2c4fff359a30c331a4eec0 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Wed, 3 Feb 2021 11:54:26 -0500 Subject: [PATCH 12/41] Make remove file tree --- bookwormDB/manager.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bookwormDB/manager.py b/bookwormDB/manager.py index 2a4d9d8..914b7dd 100644 --- a/bookwormDB/manager.py +++ b/bookwormDB/manager.py @@ -215,13 +215,18 @@ def wordlist(self, args): input = input, output = ".bookworm/texts/wordlist/wordlist.txt") - def pristine(self, args): + def destroy(self, args): + self.pristine(args) + def pristine(self, args): + # Old name still works. 
import bookwormDB.CreateDatabase bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname, variableFile=None) if self.dbname == "mysql": raise NameError("Don't try to delete the mysql database") bookworm.db.query("DROP DATABASE IF EXISTS {}".format(self.dbname)) + import shutil + shutil.rmtree('.bookworm') def encoded(self, args): """ From d1adf0df8dd0d3dda4893428fde0e593ebd43766 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Wed, 3 Feb 2021 11:58:06 -0500 Subject: [PATCH 13/41] Fix input.txt parsing --- bookwormDB/countManager.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bookwormDB/countManager.py b/bookwormDB/countManager.py index 770a0d9..d3e48e1 100644 --- a/bookwormDB/countManager.py +++ b/bookwormDB/countManager.py @@ -126,11 +126,11 @@ def yield_texts_from_directory(dir, i): logging.error(f"Can't handle file {file}") yield (basename, fin.read().replace("\t", "\f").replace("\n", "\f")) -def yield_lines_from_single_file(fname, i, cpus): - if (fname.endswith(".gz")): - fin = gzip.open(fin, 'rt') +def yield_lines_from_single_file(fname, i): + if (str(fname).endswith(".gz")): + fin = gzip.open(fname, 'rt') else: - fin = open(fin) + fin = open(fname) totals = 0 errors = 0 for ii, row in enumerate(fin): From e2117b274dcda6334d0264cf9f94525aef555f25 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Wed, 3 Feb 2021 12:06:18 -0500 Subject: [PATCH 14/41] Raise error on invalid catalog entries --- bookwormDB/MetaParser.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bookwormDB/MetaParser.py b/bookwormDB/MetaParser.py index 0c56014..3f55ad2 100644 --- a/bookwormDB/MetaParser.py +++ b/bookwormDB/MetaParser.py @@ -133,8 +133,9 @@ def parse_json_catalog(line_queue, processes, modulo): try: line = json.loads(line) except: - logging.warn("Couldn't parse catalog line {}".format(line)) - continue + logging.error(f"Invalid json in line {i}\n:{line}" + "The input file must be in ndjson format (http://ndjson.org/)") + raise for field in fields: # Smash together misidentified lists From b00e6a9a7e72d132a9dd556b22cd57347c2b8f0f Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Fri, 5 Feb 2021 14:38:18 -0500 Subject: [PATCH 15/41] changes to tokenization pipeline internals --- bookwormDB/MetaParser.py | 1 - bookwormDB/countManager.py | 42 +++++++------ bookwormDB/manager.py | 6 +- bookwormDB/tokenizer.py | 118 +++++++++++++++---------------------- bookwormDB/variableSet.py | 4 +- 5 files changed, 81 insertions(+), 90 deletions(-) diff --git a/bookwormDB/MetaParser.py b/bookwormDB/MetaParser.py index 3f55ad2..b2074d6 100644 --- a/bookwormDB/MetaParser.py +++ b/bookwormDB/MetaParser.py @@ -18,7 +18,6 @@ def DaysSinceZero(dateobj): #Zero isn't a date, which python knows but MySQL and javascript don't. 
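    # (+366 lines the result up with MySQL's TO_DAYS(), which counts from the
    # imaginary year 0, so TO_DAYS('0001-01-01') = 366 while Python's
    # date(1,1,1).toordinal() = 1.)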
return (dateobj - date(1,1,1)).days + 366 - mySQLreservedWords = set(["ACCESSIBLE", "ADD", "ALL", "ALTER", "ANALYZE", "AND", "AS", "ASC", "ASENSITIVE", "BEFORE", "BETWEEN", "BIGINT", "BINARY", "BLOB", "BOTH", "BY", "CALL", diff --git a/bookwormDB/countManager.py b/bookwormDB/countManager.py index d3e48e1..65e0469 100644 --- a/bookwormDB/countManager.py +++ b/bookwormDB/countManager.py @@ -32,9 +32,6 @@ logging.info("Filling dicts to size {}".format(QUEUE_POST_THRESH)) -import random -import gzip - def flush_counter(counter, qout): for k in ['', '\x00']: try: @@ -65,16 +62,14 @@ def counter(qout, i, fin, mode = "count"): count_signals = [".unigrams", ".bigrams", ".trigrams", ".quadgrams"] logging.info(f"fin is {fin}") + for signal in count_signals: if signal in fin: datatype = signal.strip(".") if mode == "encode": encoder = tokenBatches([datatype]) - - - for id, text in yield_texts(fin, i): - + for id, text in yield_texts(fin, i, encoder.IDfile): if datatype == "raw": tokenizer = Tokenizer(text) else: @@ -100,35 +95,42 @@ def counter(qout, i, fin, mode = "count"): if mode == "encode": encoder.close() -def yield_texts(fname, i): +def yield_texts(fname, i, IDfile): p = Path(fname) if p.is_dir(): - for id, text in yield_texts_from_directory(p, i): + for id, text in yield_texts_from_directory(p, i, IDfile): yield (id, text) else: - for id, text in yield_lines_from_single_file(p, i): + for id, text in yield_lines_from_single_file(p, i, IDfile): yield (id, text) - -def yield_texts_from_directory(dir, i): +def yield_texts_from_directory(dir, i, IDfile): for file in dir.glob('**/*.txt*'): - # Strips _djvu for Internet Archive. - basename = file.name.rstrip(".gz").rstrip(".txt").rstrip("_djvu") + # Strips _djvu just for Internet Archive. + basename = file.name.rsplit(".txt", 1)[0] + # print(basename, file.name) + try: + id = IDfile[basename] + except KeyError: + logging.info(f"No catalog entry for {basename} at {file.name}, skipping") + continue # Use sha256 key = int(hashlib.md5(basename.encode('utf-8')).hexdigest(), 16) + logging.info(basename, key) if key % cpus != i: continue if file.name.endswith(".txt.gz"): - fin = gzip.open(file) + fin = gzip.open(file, mode="rt") elif file.name.endswith(".txt"): fin = open(file) else: logging.error(f"Can't handle file {file}") yield (basename, fin.read().replace("\t", "\f").replace("\n", "\f")) -def yield_lines_from_single_file(fname, i): +def yield_lines_from_single_file(fname, i, IDfile): + if (str(fname).endswith(".gz")): - fin = gzip.open(fname, 'rt') + fin = gzip.open(fname, mode = 'rt') else: fin = open(fname) totals = 0 @@ -144,6 +146,12 @@ def yield_lines_from_single_file(fname, i): except ValueError: errors += 1 continue + try: + id = IDfile[filename] + except KeyError: + logging.warning(f"No catalog entry for {id} though it appears in {filename}, skipping") + continue + yield (filename, text) if totals > 0 and errors/totals > 0.01: logging.warning("Skipped {} rows without tabs".format(errors)) diff --git a/bookwormDB/manager.py b/bookwormDB/manager.py index 914b7dd..fe4fb53 100644 --- a/bookwormDB/manager.py +++ b/bookwormDB/manager.py @@ -226,7 +226,11 @@ def pristine(self, args): raise NameError("Don't try to delete the mysql database") bookworm.db.query("DROP DATABASE IF EXISTS {}".format(self.dbname)) import shutil - shutil.rmtree('.bookworm') + try: + shutil.rmtree('.bookworm') + except FileNotFoundError: + pass + def encoded(self, args): """ diff --git a/bookwormDB/tokenizer.py b/bookwormDB/tokenizer.py index 4e76d3b..341c3c4 100644 --- 
a/bookwormDB/tokenizer.py +++ b/bookwormDB/tokenizer.py @@ -43,6 +43,7 @@ def wordRegex(): Note: this compiles looking for the most complicated words first, and as it goes on finds simpler and simpler forms """ bigregex = re.compile("|".join([decimals,possessive,numbers,abbreviation,sharps,punctuators,MasterExpression]),re.UNICODE|re.IGNORECASE) + bigregex = re.compile(u"\w+|\p{P}|\p{S}") return bigregex @@ -78,7 +79,7 @@ class tokenBatches(object): with 3-byte integer encoding for wordid and bookid. """ - def __init__(self, levels=["unigrams","bigrams"]): + def __init__(self, levels=["unigrams", "bigrams"]): """ mode: 'encode' (write files out) @@ -87,26 +88,36 @@ def __init__(self, levels=["unigrams","bigrams"]): self.levels=levels # placeholder to alert that createOutputFiles must be run. - self.completedFile = None - - def createOutputFiles(self): - self.completedFile = open(".bookworm/texts/encoded/completed/" + self.id,"w") - self.outputFiles = dict() - for level in self.levels: - self.outputFiles[level] = open(".bookworm/texts/encoded/{}/{}.txt".format(level, self.id),"w") - - def attachDictionaryAndID(self): - self.dictionary = readDictionaryFile() - self.IDfile = readIDfile() - + self._IDfile = None + self._dictionary = None + + def output_files(self, level): + if not hasattr(self, "outputFiles"): + self.outputFiles = dict() + if not level in self.outputFiles: + self.outputFiles[level] = open(".bookworm/texts/encoded/{}/{}.txt".format(level, self.id), "w") + return self.outputFiles[level] + + @property + def IDfile(self): + if self._IDfile: + return self._IDfile + self._IDfile = readIDfile() + return self._IDfile + + @property + def dictionary(self): + if self._dictionary: + return self._dictionary + self._dictionary = readDictionaryFile() + return self._dictionary def close(self): """ This test allows the creation of bookworms with fewer document than requested threads, which happens to be the case in the tests. """ - if self.completedFile is not None: - self.completedFile.close() + if hasattr(self, "outputFiles"): for v in self.outputFiles.values(): v.close() @@ -120,16 +131,13 @@ def encodeRow(self, 'tokenizer': a tokenizer object """ - if self.completedFile is None: - self.createOutputFiles() - self.attachDictionaryAndID() #The dictionary and ID lookup tables should be pre-attached. dictionary = self.dictionary IDfile = self.IDfile levels = None - + """ if source=="raw_text": parts = row.split("\t", 1) @@ -149,7 +157,7 @@ def encodeRow(self, raise tokens = preTokenized(token, count, self.levels[0]) """ - + try: textid = IDfile[filename] except KeyError: @@ -157,7 +165,7 @@ def encodeRow(self, return for level in self.levels: - outputFile = self.outputFiles[level] + outputFile = self.output_files(level) output = [] counts = tokenizer.counts(level) @@ -186,9 +194,6 @@ def encodeRow(self, except IOError as e: logging.exception(e) - if write_completed: - self.completedFile.write(filename + "\n") - class Tokenizer(object): """ A tokenizer is initialized with a single text string. @@ -208,17 +213,17 @@ def __init__(self, string, tokenization_regex=None): global haveWarnedUnicode self.string = string self.tokenization_regex = tokenization_regex - self.tokens = None + self._tokens = None + + @property + def tokens(self): + if self._tokens: + return self._tokens + self._tokens = self.tokenize() + return self._tokens + def tokenize(self): - """ - This tries to return the pre-made tokenization: - if that doesn't exist, it creates it. 
- """ - if self.tokens is not None: - return self.tokens - """ - For speed, don't import until here. - """ + tokenization_regex=self.tokenization_regex global re if re is None: @@ -229,20 +234,22 @@ def tokenize(self): if bigregex==None: bigregex = wordRegex() tokenization_regex = bigregex - self.tokens = re.findall(tokenization_regex, self.string) - return self.tokens + + + components = self.string.split("\f") + return [re.findall(tokenization_regex, component) for component in components] def ngrams(self, n, collapse = False): """ All the ngrams in the text can be created as a tuple by zipping an arbitrary number of copies of the text to itself. """ - - self.tokenize() - l = list(zip(*[self.tokens[i:] for i in range(n)])) + values = [] + for tokenset in self.tokens: + values.extend(zip(*[tokenset[i:] for i in range(n)])) if collapse: - l = [" ".join(tupled) for tupled in l] - return l + values = [" ".join(tupled) for tupled in values] + return values def unigrams(self): return self.ngrams(1) @@ -263,8 +270,7 @@ def words(self): """ 1-grams have tuple keys, but words have index keys. """ - self.tokenize() - return self.tokens + return [item for sublist in self.tokens for item in sublist] def counts(self, whichType): @@ -301,29 +307,3 @@ def counts(self, level): if level != self.level: raise return self.output - - -def getAlreadySeenList(folder): - #Load in a list of what's already been translated for that level. - #Returns a set. - files = os.listdir(folder) - seen = set([]) - for file in files: - for line in open(folder + "/" + file): - seen.add(line.rstrip("\n")) - return seen - -def encode_text_stream(): - seen = getAlreadySeenList(".bookworm/texts/encoded/completed") - tokenBatch = tokenBatches() - tokenBatch.attachDictionaryAndID() - for line in sys.stdin: - filename = line.split("\t",1)[0] - line = line.rstrip("\n") - if filename not in seen: - tokenBatch.encodeRow(line) - - # And printout again at the end - -if __name__=="__main__": - encode_text_stream() diff --git a/bookwormDB/variableSet.py b/bookwormDB/variableSet.py index 23bdbc5..cdc2e5a 100644 --- a/bookwormDB/variableSet.py +++ b/bookwormDB/variableSet.py @@ -229,9 +229,9 @@ def fastLookupTableIfNecessary(self, engine="MEMORY"): try: self.maxlength = max([self.maxlength,1]) except TypeError: - logging.error(f"Unable to calculate length for {field}" + logging.error(f"Unable to calculate length for {self.field}" "perhaps there are no entries in the catalog?") - raise + self.maxlength = 1; code = """DROP TABLE IF EXISTS tmp; CREATE TABLE tmp (%(field)s__id %(intType)s ,PRIMARY KEY (%(field)s__id), %(field)s VARCHAR (%(maxlength)s) ) ENGINE=%(engine)s From f0bd78ec1ac8efccda40d8075708ce1cf3a0a199 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Mon, 8 Feb 2021 09:35:25 -0500 Subject: [PATCH 16/41] Add default parsing of ISO date fields where not specified at yearly and daily resolutions --- bookwormDB/MetaParser.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/bookwormDB/MetaParser.py b/bookwormDB/MetaParser.py index b2074d6..6bc9629 100644 --- a/bookwormDB/MetaParser.py +++ b/bookwormDB/MetaParser.py @@ -87,7 +87,16 @@ def ParseFieldDescs(write = False): if "derived" in field: fields_to_derive.append(field) else: - output.append(field) + if field['type'] == "character": + field['derived'] = [ + {"resolution": "year"}, + {"resolution": "day"} + ] + fields_to_derive.append(field) + elif field['type'] == "integer": + output.append(field) + else: + raise TypeError("Unable to parse temporal field " + 
field['field']) else: output.append(field) From c086b5ceec00e6fd80e8bfb577243eaa6d08878e Mon Sep 17 00:00:00 2001 From: Benjamin Schmidt <> Date: Sat, 6 Mar 2021 20:22:02 -0500 Subject: [PATCH 17/41] better testing --- bookwormDB/general_API.py | 126 +++++++++++++++++++++++++++++++++++--- bookwormDB/query_cache.py | 78 +++++++++++++++++++++++ bookwormDB/variableSet.py | 2 +- bookwormDB/wsgi.py | 2 + setup.py | 3 +- tests/test_formats.py | 67 ++++++++++++++++++++ 6 files changed, 266 insertions(+), 12 deletions(-) create mode 100644 bookwormDB/query_cache.py create mode 100644 tests/test_formats.py diff --git a/bookwormDB/general_API.py b/bookwormDB/general_API.py index 550f252..9221788 100644 --- a/bookwormDB/general_API.py +++ b/bookwormDB/general_API.py @@ -1,16 +1,15 @@ #!/usr/bin/python -from pandas import merge -from pandas import Series +from pandas import merge, Series, set_option, DataFrame from pandas.io.sql import read_sql -from pandas import merge -from pandas import set_option +from pyarrow import feather from copy import deepcopy from collections import defaultdict from .mariaDB import DbConnect from .SQLAPI import userquery from .mariaDB import Query from .bwExceptions import BookwormException +from .query_cache import Query_Cache import re import json import logging @@ -18,6 +17,8 @@ import csv import io import numpy as np +from urllib import request +from urllib import parse """ The general API is some functions for working with pandas to calculate @@ -258,6 +259,8 @@ def set_defaults(self): query["search_limits"]["word"] = query["search_limits"]["unigram"] del query["search_limits"]["unigram"] + + def idiot_proof_arrays(self): for element in ['counttype', 'groups']: try: @@ -309,6 +312,12 @@ def data(self): self.pandas_frame = self.get_data_from_source() return self.pandas_frame + #@attr + #data_frame(self): + # if self._pandas_frame is not None: + # return self.return_pandas_frame + + def validate_query(self): self.ensure_query_has_required_fields() @@ -327,8 +336,6 @@ def ensure_query_has_required_fields(self): def prepare_search_and_compare_queries(self): - - call1 = deepcopy(self.query) call2 = deepcopy(call1) call2['search_limits'] = self.get_compare_limits() @@ -491,10 +498,13 @@ def execute(self): if fmt == "tsv": return frame.to_csv(sep="\t", encoding="utf8", index=False) - if fmt == "feather": + if fmt == "feather" or fmt == "feather_js": + compression = "zstd" + if fmt == "feather_js": + compression = "uncompressed" fout = io.BytesIO(b'') try: - frame.to_feather(fout) + feather.write_table(frame, fout, compression = compression) except: logging.warning("You need the pyarrow package installed to export as feather.") raise @@ -710,6 +720,7 @@ def generate_pandas_frame(self, call = None): class MetaAPIcall(APIcall): def __init__(self, endpoints): self.endpoints = endpoints + super().__init__(self) def connect(self, endpoint): # return some type of a connection. @@ -741,8 +752,9 @@ class SQLAPIcall(APIcall): But the point is, you need to define a function "generate_pandas_frame" that accepts an API call and returns a pandas frame. - But that API call is more limited than the general API; you only need to + But that API call is more limited than the general API; it need only support "WordCount" and "TextCount" methods. 
+ """ def generate_pandas_frame(self, call = None): @@ -764,3 +776,99 @@ def generate_pandas_frame(self, call = None): df = read_sql(q, con.db) logging.debug("Query retrieved") return df + + + +def my_sort(something): + if type(something) == list: + return sorted(something) + if type(something) == dict: + ks = sorted([*dict.keys()]) + output = {} + for k in ks: + output[k] = something[k] + return output + return something + +def standardized_query(query: dict) -> dict: + trimmed_call = {} + needed_keys = [ + 'search_limits', + 'compare_limits', + 'words_collation', + 'database', + 'method', + 'groups', + 'counttypes', + ] + needed_keys.sort() + for k in needed_keys: + try: + trimmed_call[k] = my_sort(query[k]) + except KeyError: + continue + return trimmed_call + + + +class ProxyAPI(APIcall): + + """ + Forward a request to a remote url. + + Can be useful if you want a proxy server with caching on one server which + reaches out to a different server for uncached requests, or perhaps + if you want a single gateway for multiple different bookworms. + + """ + + def __init__(self, endpoint): + """ + Endpoint: A URL, like `http://localhost:10013`. + """ + self.endpoint = endpoint + super().__init__(self) + + def generate_pandas_frame(self, call = None) -> DataFrame: + """ + Note--requires that the endpoint expose the new feather method. + """ + + if call is None: + call = self.query + call = deepcopy(call) + call['format'] = 'feather' + qstring = parse.urlencode(json.dumps(query)) + connection = request.urlopen(f"{self.endpoint}/?qstring") + return feather.read_table(connection) + +class Caching_API(APIcall): + def __init__(self, query: dict, cache: Query_Cache, fallback_api: APIcall, **kwargs): + """ + cache: an existing Query_Cache method. These are expensive to create, + so you don't get one generated by default. + + fallback_api: Must be initialized with a parent API class that also + inherits from APICall. + + kwargs: are passed to the fallback API. + """ + self.cache = cache + self.Fallback = fallback_api + self.kwargs = kwargs + super().__init__(query) + + def generate_pandas_frame(self, call = None) -> DataFrame: + if call is None: + call = self.query + + trimmed_call = standardized_query(call) + try: + return self.cache[trimmed_call] + except FileNotFoundError: + resolution = Fallback(query, **self.kwargs).generate_pandas_frame() + self.cache[trimmed_call] = resolution + if random.random() < .1: + # Don't bother doing this every time. + self.cache.trim_cache() + return resolution diff --git a/bookwormDB/query_cache.py b/bookwormDB/query_cache.py new file mode 100644 index 0000000..3608183 --- /dev/null +++ b/bookwormDB/query_cache.py @@ -0,0 +1,78 @@ +import pyarrow as pa +from pyarrow import feather +import pandas as pd +from pathlib import Path + +import logging +import json +import hashlib +import random + +def hashcode(query: dict) -> str: + return hashlib.sha1(json.dumps(query).encode("utf-8")).hexdigest() + +class Query_Cache: + + def __init__(self, location = "/tmp", + max_entries = 256, + max_length = 2**8, + cold_storage = None): + """ + location: where to keep some cached queries as parquet. + max_entries: the max size of the cache. + max_length: row length above which a query is never cached. + cold_storage: Optional location of a second, read-only cache. + Feather files in this can be nested at any depth. 
+ """ + self.location = location + self.max_entries = max_entries + self.max_length = max_length + self.precache = {} + + if not Path(location).exists(): + Path(location).mkdir(parents = True) + assert Path(location).is_dir() + for path in Path(cold_storage).glob("**/*.feather"): + code = str(path.with_suffix("").name) + self.precache[code] = path + + def filepath(self, query: dict) -> Path: + code = hashcode(query) + if code in precache: + return precache[code] + return (Path(self.location) / code).with_suffix("feather") + + def __getitem__(self, query: dict) -> pd.DataFrame: + if hashcode(query) in self.precache: + # First check any manual queries. + return feather.read_feather[self.precache[hashcode(query)]] + + p = self.filepath(query) + table = feather.read_feather(p) + p.touch() # Note access for LRU cache flushing. + return table + + def __setitem__(self, query: dict, table: pd.DataFrame): + if table.shape[0] > self.max_length: + return + if not self.max_length: + # 0 or None are both reasonable here. + return + path = self.filepath(query).open(mode="w") + feather.write_feather(table, path, compression = "zstd") + + def trim_cache(self): + """ + Remove all cached feather files except the first + few (defined by the max_entries parameter of the class.) + """ + files = Path(self.location).glob("*.feather") + all_of_em = [] + for file in files: + all_of_em = [-1 * file.stat().st_mtime, file] + all_of_em.sort() + for extra in all_of_em[self.max_entries:]: + try: + extra.unlink() + except: + logging.error(f"Unable to unlink file {extra}; assuming another thread got it first, although that's pretty unlikely!") diff --git a/bookwormDB/variableSet.py b/bookwormDB/variableSet.py index cdc2e5a..7a36692 100644 --- a/bookwormDB/variableSet.py +++ b/bookwormDB/variableSet.py @@ -216,10 +216,10 @@ def build_ID_and_lookup_tables(self): self.dbToPutIn.query(query) def fastLookupTableIfNecessary(self, engine="MEMORY"): - """ This uses the already-created ID table to create a memory lookup. 
""" + self.engine = engine if self.datatype == 'categorical': logging.debug("Creating a memory lookup table for " + self.field) diff --git a/bookwormDB/wsgi.py b/bookwormDB/wsgi.py index fdf5de8..2f835b7 100644 --- a/bookwormDB/wsgi.py +++ b/bookwormDB/wsgi.py @@ -4,6 +4,8 @@ import logging import multiprocessing import gunicorn.app.base +from bookwormDB.query_cache import cache, cacheAPI + from datetime import datetime def content_type(query): diff --git a/setup.py b/setup.py index 31468fa..8374e0d 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,6 @@ author_email="bmschmidt@gmail.com", license="MIT", classifiers=[ - 'Development Status :: 4 - Beta', 'Intended Audience :: Developers', 'Intended Audience :: Education', "Natural Language :: English", @@ -36,6 +35,6 @@ ], install_requires=["numpy","pandas","mysqlclient", "python-dateutil", "psutil", "bounter", - "gunicorn", "regex" + "gunicorn", "regex", "pyarrow" ] ) diff --git a/tests/test_formats.py b/tests/test_formats.py new file mode 100644 index 0000000..fe928cc --- /dev/null +++ b/tests/test_formats.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- + +from builtins import range +from builtins import object +import unittest +import bookwormDB +import bookwormDB.CreateDatabase +from bookwormDB.general_API import SQLAPIcall as SQLAPIcall +import logging +import os +from subprocess import call as call +import sys +import json +from setup import setup_bookworm, setup_bookworm_unicode +from pyarrow import feather + +class Bookworm_Return_Formats(unittest.TestCase): + + def test_feather(self): + from bookwormDB.general_API import SQLAPIcall as SQLAPIcall + query = { + "database":"federalist_bookworm", + "search_limits":{}, + "counttype":"TextPercent", + "groups":["author"], + "method":"data", + "format":"feather" + } + + feather_file = SQLAPIcall(query).execute() + f = io.BytesIO(feather_file) + f.seek(0) + m = feather.read_feather(f) + self.assertEqual(m.shape[0],5) + self.assertEqual(m.shape[1],2) + + + def test_proxy_API(self): + from bookwormDB.general_API import SQLAPIcall as SQLAPIcall + + import json + + query = { + "database":"federalist_bookworm", + "search_limits":{}, + "counttype":"TextPercent", + "groups":["author"], + "method":"data", + "format":"json" + } + + m = json.loads(SQLAPIcall(query).execute())['data'] + self.assertEqual(len(m),5) + +if __name__=="__main__": + # The setup is done without verbose logging; any failure + # causes it to try again. + logging.basicConfig(level=40) + try: + setup_bookworm() + setup_bookworm_unicode() + except: + logging.basicConfig(level=10) + setup_bookworm() + setup_bookworm_unicode() + logging.basicConfig(level=10) + unittest.main() From 0256a204e95c99e7ae045ea6438876662cf06193 Mon Sep 17 00:00:00 2001 From: Benjamin Schmidt <> Date: Sun, 7 Mar 2021 12:33:04 -0500 Subject: [PATCH 18/41] Add caching and API extensions to support it. 
--- bookwormDB/general_API.py | 35 ++++++++++++++++++++++++----------- bookwormDB/manager.py | 17 ++++++++++++----- bookwormDB/query_cache.py | 23 ++++++++++++++--------- bookwormDB/store.py | 9 +++++++++ bookwormDB/wsgi.py | 38 ++++++++++++++++++++++++++++++++++---- tests/test_formats.py | 1 + 6 files changed, 94 insertions(+), 29 deletions(-) create mode 100644 bookwormDB/store.py diff --git a/bookwormDB/general_API.py b/bookwormDB/general_API.py index 9221788..1f6012f 100644 --- a/bookwormDB/general_API.py +++ b/bookwormDB/general_API.py @@ -19,6 +19,7 @@ import numpy as np from urllib import request from urllib import parse +import random """ The general API is some functions for working with pandas to calculate @@ -504,7 +505,7 @@ def execute(self): compression = "uncompressed" fout = io.BytesIO(b'') try: - feather.write_table(frame, fout, compression = compression) + feather.write_feather(frame, fout, compression = compression) except: logging.warning("You need the pyarrow package installed to export as feather.") raise @@ -783,9 +784,10 @@ def my_sort(something): if type(something) == list: return sorted(something) if type(something) == dict: - ks = sorted([*dict.keys()]) + keys = list(something.keys()) + keys.sort() output = {} - for k in ks: + for k in keys: output[k] = something[k] return output return something @@ -822,12 +824,12 @@ class ProxyAPI(APIcall): """ - def __init__(self, endpoint): + def __init__(self, query, endpoint): """ - Endpoint: A URL, like `http://localhost:10013`. + Endpoint: A URL, like `http://localhost:10012`. """ self.endpoint = endpoint - super().__init__(self) + super().__init__(query) def generate_pandas_frame(self, call = None) -> DataFrame: """ @@ -838,10 +840,21 @@ def generate_pandas_frame(self, call = None) -> DataFrame: call = self.query call = deepcopy(call) call['format'] = 'feather' - qstring = parse.urlencode(json.dumps(query)) - connection = request.urlopen(f"{self.endpoint}/?qstring") - return feather.read_table(connection) - + print(call) + query_string = json.dumps(call) + print(query_string) + qstring = parse.quote(query_string) + print(qstring) + remote_url = f"{self.endpoint}/?{qstring}" + buffer = io.BytesIO() + connection = request.urlopen(remote_url) + buffer.write(connection.read()) + try: + return feather.read_feather(buffer) + except: + # TODO: re-throw bookworm errors with additional context. + + raise class Caching_API(APIcall): def __init__(self, query: dict, cache: Query_Cache, fallback_api: APIcall, **kwargs): """ @@ -866,7 +879,7 @@ def generate_pandas_frame(self, call = None) -> DataFrame: try: return self.cache[trimmed_call] except FileNotFoundError: - resolution = Fallback(query, **self.kwargs).generate_pandas_frame() + resolution = self.Fallback(call, **self.kwargs).generate_pandas_frame() self.cache[trimmed_call] = resolution if random.random() < .1: # Don't bother doing this every time. diff --git a/bookwormDB/manager.py b/bookwormDB/manager.py index fe4fb53..f68a530 100644 --- a/bookwormDB/manager.py +++ b/bookwormDB/manager.py @@ -7,6 +7,7 @@ import os import bookwormDB import argparse +from .store import store """ This is the code that actually gets run from the command-line executable. @@ -567,15 +568,19 @@ def run_arguments(): "the gunicorn endpoint behind a more powerful webserver like apache or nginx.") serve_parser.add_argument("--full-site", action = "store_true", help="Serve a webpage as well as a query endpoint? 
Not active.") - serve_parser.add_argument("--port", "-p", default="10012", help="The port over which to serve the bookworm", type=int) - serve_parser.add_argument("--bind", "-b", default="127.0.0.1", help="The IP address to bind the server to.", type=str) - serve_parser.add_argument("--workers", "-w", default="0", help="How many gunicorn worker threads to launch for the API. Reduce if you're seeing memory issues.",type=int) - serve_parser.add_argument("--dir","-d",default="http_server",help="A filepath for a directory to serve from. Will be created if it does not exist.") - +# serve_parser.add_argument("--API", "-a", default="MySQL", +# help="The type of API endpoint to run. 'MySQL' will" +# "will run MySQL") + serve_parser.add_argument("--cache", default = "none", + help="cache locations?") + serve_parser.add_argument("--cold-storage", default = "none", + help="A folder with cached query results. Allows long-term cold-storage.") + serve_parser.add_argument("--remote-host", default = None, + help="Hosts to pass queries through to. If enabled.") # Configure the global server. @@ -586,6 +591,8 @@ def run_arguments(): # Call the function args = parser.parse_args() + # stash those away. + store()['args'] = args # Set the logging level based on the input. numeric_level = getattr(logging, args.log_level.upper(), None) if not isinstance(numeric_level, int): diff --git a/bookwormDB/query_cache.py b/bookwormDB/query_cache.py index 3608183..ce1d00f 100644 --- a/bookwormDB/query_cache.py +++ b/bookwormDB/query_cache.py @@ -12,8 +12,11 @@ def hashcode(query: dict) -> str: return hashlib.sha1(json.dumps(query).encode("utf-8")).hexdigest() class Query_Cache: + # By default, use locally stored feather files. If that's bad, it Would + # be pretty easy to split the class out into anything using an API + # that maps from cache[query_dictionary] -> pandas_frame. - def __init__(self, location = "/tmp", + def __init__(self, location, max_entries = 256, max_length = 2**8, cold_storage = None): @@ -32,20 +35,22 @@ def __init__(self, location = "/tmp", if not Path(location).exists(): Path(location).mkdir(parents = True) assert Path(location).is_dir() - for path in Path(cold_storage).glob("**/*.feather"): - code = str(path.with_suffix("").name) - self.precache[code] = path + if cold_storage is not None: + for path in Path(cold_storage).glob("**/*.feather"): + code = str(path.with_suffix("").name) + self.precache[code] = path def filepath(self, query: dict) -> Path: code = hashcode(query) - if code in precache: - return precache[code] - return (Path(self.location) / code).with_suffix("feather") + if code in self.precache: + return self.precache[code] + return (Path(self.location) / code).with_suffix(".feather") def __getitem__(self, query: dict) -> pd.DataFrame: if hashcode(query) in self.precache: # First check any manual queries. - return feather.read_feather[self.precache[hashcode(query)]] +# print(self.precache[hashcode(query)]) + return feather.read_feather(self.precache[hashcode(query)]) p = self.filepath(query) table = feather.read_feather(p) @@ -58,7 +63,7 @@ def __setitem__(self, query: dict, table: pd.DataFrame): if not self.max_length: # 0 or None are both reasonable here. 
return - path = self.filepath(query).open(mode="w") + path = self.filepath(query).open(mode="wb") feather.write_feather(table, path, compression = "zstd") def trim_cache(self): diff --git a/bookwormDB/store.py b/bookwormDB/store.py new file mode 100644 index 0000000..897f359 --- /dev/null +++ b/bookwormDB/store.py @@ -0,0 +1,9 @@ +# Just a place to store configurations rather than pass through a +# nest of functions. Bad idea? + + +store_dict = {} + +def store(): + global store_dict + return store_dict diff --git a/bookwormDB/wsgi.py b/bookwormDB/wsgi.py index 2f835b7..e986a6e 100644 --- a/bookwormDB/wsgi.py +++ b/bookwormDB/wsgi.py @@ -1,10 +1,12 @@ -from bookwormDB.general_API import SQLAPIcall as SQLAPIcall +from bookwormDB.general_API import SQLAPIcall as SQLAPIcall, Caching_API, ProxyAPI import json from urllib.parse import unquote import logging import multiprocessing import gunicorn.app.base -from bookwormDB.query_cache import cache, cacheAPI +from bookwormDB.store import store +from .store import store +from .query_cache import Query_Cache from datetime import datetime @@ -25,6 +27,27 @@ def content_type(query): return 'text/plain' + +args = store()['args'] +API_kwargs = {} +if args.cache != "none": + query_cache = Query_Cache( + args.cache, + max_entries = 256, + max_length = 2**8, + cold_storage = args.cold_storage) + + +if args.remote_host is None: + logging.info("Using SQL API") + API = SQLAPIcall +else: + logging.info("Using proxy API") + API = ProxyAPI + API_kwargs = { + "endpoint": args.remote_host + } + def application(environ, start_response, logfile = "bookworm_queries.log"): # Starting with code from http://wsgi.tutorial.codepoint.net/parsing-the-request-post try: @@ -73,7 +96,12 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): start_response(status, list(headers.items())) return [b'{"status":"error", "message": "You have passed invalid JSON to the Bookworm API"}'] - process = SQLAPIcall(query) + args = store()['args'] + if args.cache == "none": + process = API(query, **API_kwargs) + else: + process = Caching_API(query, query_cache, API, **API_kwargs) + response_body = process.execute() # It might be binary already. @@ -124,7 +152,9 @@ def load(self): def run(port = 10012, bind="0.0.0.0", workers = number_of_workers()): """ port: the service port - bind: the host to bind to. + bind: the host to bind to. Requests that don't match this address + will be ignored. The default accepts all connections: 127.0.0.1 listens + only to localhost. 
""" if workers==0: workers = number_of_workers() diff --git a/tests/test_formats.py b/tests/test_formats.py index fe928cc..8f8e5a0 100644 --- a/tests/test_formats.py +++ b/tests/test_formats.py @@ -13,6 +13,7 @@ import json from setup import setup_bookworm, setup_bookworm_unicode from pyarrow import feather +import io class Bookworm_Return_Formats(unittest.TestCase): From f9d6b1719b868df919d2b5841edc396979010a66 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Sun, 7 Mar 2021 15:09:45 -0500 Subject: [PATCH 19/41] register json type for json_c --- bookwormDB/wsgi.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bookwormDB/wsgi.py b/bookwormDB/wsgi.py index fdf5de8..14cd578 100644 --- a/bookwormDB/wsgi.py +++ b/bookwormDB/wsgi.py @@ -15,6 +15,9 @@ def content_type(query): if format == "json": return "application/json" + if format == "json_c": + return "application/json" + if format == "feather": return "application/octet-stream" From 088eeaee161bcf50e8c474e1e242d07b8eb7df3b Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Sun, 7 Mar 2021 15:10:26 -0500 Subject: [PATCH 20/41] More error handling on read stage --- bookwormDB/countManager.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/bookwormDB/countManager.py b/bookwormDB/countManager.py index 65e0469..770b874 100644 --- a/bookwormDB/countManager.py +++ b/bookwormDB/countManager.py @@ -120,13 +120,26 @@ def yield_texts_from_directory(dir, i, IDfile): if key % cpus != i: continue if file.name.endswith(".txt.gz"): - fin = gzip.open(file, mode="rt") + try: + fin = gzip.open(file, mode="rt") + except UnicodeDecodeError: + logging.error(f"Unable to read {file}: unicode error") + continue + except gzip.BadGzipFile: + logging.error(f"Unable to read {file}: Bad gzip file") + continue elif file.name.endswith(".txt"): fin = open(file) else: logging.error(f"Can't handle file {file}") - yield (basename, fin.read().replace("\t", "\f").replace("\n", "\f")) - + try: + yield (basename, fin.read().replace("\t", "\f").replace("\n", "\f")) + except UnicodeDecodeError: + logging.error(f"Unable to read {file}") + except gzip.BadGzipFile: + logging.error(f"Unable to read {file}: Bad gzip file") + continue + def yield_lines_from_single_file(fname, i, IDfile): if (str(fname).endswith(".gz")): From 1d52bfc0851a0fe6d141d9554ae94fae398ed34e Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Sun, 7 Mar 2021 15:11:49 -0500 Subject: [PATCH 21/41] More explicit in proofing user queries. --- bookwormDB/general_API.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bookwormDB/general_API.py b/bookwormDB/general_API.py index 550f252..76d1a82 100644 --- a/bookwormDB/general_API.py +++ b/bookwormDB/general_API.py @@ -261,7 +261,7 @@ def set_defaults(self): def idiot_proof_arrays(self): for element in ['counttype', 'groups']: try: - if not isinstance(self.query[element], list): + if isinstance(self.query[element], str): self.query[element] = [self.query[element]] except KeyError: # It's OK if it's not there. @@ -628,6 +628,7 @@ def fixNumpyType(input): # Define a recursive structure to hold the stuff. def tree(): return defaultdict(tree) + returnt = tree() for row in data.itertuples(index=False): @@ -710,11 +711,11 @@ def generate_pandas_frame(self, call = None): class MetaAPIcall(APIcall): def __init__(self, endpoints): self.endpoints = endpoints - + def connect(self, endpoint): # return some type of a connection. 
pass - + def generate_pandas_frame(self, call): if call is None: call = deepcopy(self.query) @@ -723,10 +724,10 @@ def generate_pandas_frame(self, call): connection = self.connect(endpoint) d = connection.query(call) count_fields = [] - + for field in ['WordCount', 'TextCount']: if field in call["counttype"]: - count_fields.push(field) + count_fields.push(field) together = pd.concat(d) together[count_fields].sum() From 79282dc73a161062988f11ac0a4107630a0e248e Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Sun, 7 Mar 2021 15:12:24 -0500 Subject: [PATCH 22/41] throw more errors --- bookwormDB/mariaDB.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bookwormDB/mariaDB.py b/bookwormDB/mariaDB.py index 635af99..7c10492 100644 --- a/bookwormDB/mariaDB.py +++ b/bookwormDB/mariaDB.py @@ -778,7 +778,6 @@ def getActualSearchedWords(self): self.actualWords = [item[0] for item in self.cursor.fetchall()] else: raise TypeError("Suspiciously low word count") - self.actualWords = ["tasty", "mistake", "happened", "here"] def custom_SearchString_additions(self, returnarray): """ From 4924755d01539745a249e1875ba3024b49124c6f Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Sun, 7 Mar 2021 16:00:18 -0500 Subject: [PATCH 23/41] Fix cache trimming --- bookwormDB/query_cache.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/bookwormDB/query_cache.py b/bookwormDB/query_cache.py index ce1d00f..2005608 100644 --- a/bookwormDB/query_cache.py +++ b/bookwormDB/query_cache.py @@ -6,16 +6,16 @@ import logging import json import hashlib -import random +import random def hashcode(query: dict) -> str: return hashlib.sha1(json.dumps(query).encode("utf-8")).hexdigest() class Query_Cache: # By default, use locally stored feather files. If that's bad, it Would - # be pretty easy to split the class out into anything using an API + # be pretty easy to split the class out into anything using an API # that maps from cache[query_dictionary] -> pandas_frame. - + def __init__(self, location, max_entries = 256, max_length = 2**8, @@ -31,7 +31,7 @@ def __init__(self, location, self.max_entries = max_entries self.max_length = max_length self.precache = {} - + if not Path(location).exists(): Path(location).mkdir(parents = True) assert Path(location).is_dir() @@ -39,33 +39,33 @@ def __init__(self, location, for path in Path(cold_storage).glob("**/*.feather"): code = str(path.with_suffix("").name) self.precache[code] = path - - def filepath(self, query: dict) -> Path: + + def filepath(self, query: dict) -> Path: code = hashcode(query) if code in self.precache: return self.precache[code] return (Path(self.location) / code).with_suffix(".feather") - + def __getitem__(self, query: dict) -> pd.DataFrame: if hashcode(query) in self.precache: # First check any manual queries. # print(self.precache[hashcode(query)]) return feather.read_feather(self.precache[hashcode(query)]) - + p = self.filepath(query) table = feather.read_feather(p) p.touch() # Note access for LRU cache flushing. return table - + def __setitem__(self, query: dict, table: pd.DataFrame): if table.shape[0] > self.max_length: return if not self.max_length: # 0 or None are both reasonable here. 
- return + return path = self.filepath(query).open(mode="wb") feather.write_feather(table, path, compression = "zstd") - + def trim_cache(self): """ Remove all cached feather files except the first @@ -74,10 +74,10 @@ def trim_cache(self): files = Path(self.location).glob("*.feather") all_of_em = [] for file in files: - all_of_em = [-1 * file.stat().st_mtime, file] + all_of_em.push((-1 * file.stat().st_mtime, file)) all_of_em.sort() for extra in all_of_em[self.max_entries:]: try: - extra.unlink() + extra[1].unlink() except: logging.error(f"Unable to unlink file {extra}; assuming another thread got it first, although that's pretty unlikely!") From 0ea587128eae178112e0e8effca2bc7795b61c60 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Sun, 7 Mar 2021 16:09:33 -0500 Subject: [PATCH 24/41] 'push' is js, not python. Ugh. --- bookwormDB/query_cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bookwormDB/query_cache.py b/bookwormDB/query_cache.py index 2005608..1f78347 100644 --- a/bookwormDB/query_cache.py +++ b/bookwormDB/query_cache.py @@ -74,7 +74,7 @@ def trim_cache(self): files = Path(self.location).glob("*.feather") all_of_em = [] for file in files: - all_of_em.push((-1 * file.stat().st_mtime, file)) + all_of_em.append((-1 * file.stat().st_mtime, file)) all_of_em.sort() for extra in all_of_em[self.max_entries:]: try: From dc1ec673fb69aebf55fc8f93c9c702890801725f Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Sun, 25 Apr 2021 17:32:47 -0400 Subject: [PATCH 25/41] explain wsgi changes --- bookwormDB/wsgi.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/bookwormDB/wsgi.py b/bookwormDB/wsgi.py index aa1920d..8758adc 100644 --- a/bookwormDB/wsgi.py +++ b/bookwormDB/wsgi.py @@ -6,10 +6,12 @@ import gunicorn.app.base from bookwormDB.store import store from .store import store -from .query_cache import Query_Cache +from .query_cache import Query_Cache from datetime import datetime + + def content_type(query): try: format = query['format'] @@ -40,7 +42,7 @@ def content_type(query): max_length = 2**8, cold_storage = args.cold_storage) - + if args.remote_host is None: logging.info("Using SQL API") API = SQLAPIcall @@ -48,9 +50,9 @@ def content_type(query): logging.info("Using proxy API") API = ProxyAPI API_kwargs = { - "endpoint": args.remote_host + "endpoint": args.remote_host } - + def application(environ, start_response, logfile = "bookworm_queries.log"): # Starting with code from http://wsgi.tutorial.codepoint.net/parsing-the-request-post try: @@ -70,6 +72,12 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): ip = environ.get('REMOTE_ADDR') if ip is None: ip = environ.get('REMOTE_ADDR') + + # Caching IPs directly is probably in violation of GPDR. + # It's nice to have session browsing data, so we'll grab just the + # last byte which should be enough to get something out of. + ip = ip.split(".")[-1] + query = unquote(q) headers = { @@ -104,7 +112,7 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): process = API(query, **API_kwargs) else: process = Caching_API(query, query_cache, API, **API_kwargs) - + response_body = process.execute() # It might be binary already. From 29e060f143a7a84bcecba187119afff1f5e06da5 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Thu, 29 Apr 2021 16:57:22 -0400 Subject: [PATCH 26/41] Preliminarily working duckdb fetches. No build yet. 
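
The shape of the change, hedged because nothing here is buildable yet:
generate_pandas_frame should be able to hand its compiled SQL straight to
an embedded duckdb file and get a pandas frame back, which is what lets
SQLAPI.py and the MySQLdb connection plumbing go away. A rough sketch of
that fetch path using the duckdb Python package (the database file name
and table are placeholders, not anything created by this patch):

    import duckdb

    # Placeholder .duckdb file standing in for a built bookworm.
    con = duckdb.connect("federalist.duckdb", read_only = True)

    # duckdb returns the result of the last execute() as a DataFrame.
    df = con.execute(
        "SELECT author, count(*) AS TextCount FROM catalog GROUP BY author"
    ).df()

Because duckdb runs in-process, a read-only connection like the one above
needs no server credentials, unlike the MySQLdb connections that DbConnect
assembled from my.cnf and the Configfile accounts.
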
--- bookwormDB/SQLAPI.py | 1159 -------------------------- bookwormDB/configuration.py | 217 ----- bookwormDB/{mariaDB.py => duckdb.py} | 145 ++-- bookwormDB/general_API.py | 45 +- bookwormDB/store.py | 23 +- bookwormDB/wsgi.py | 28 +- 6 files changed, 112 insertions(+), 1505 deletions(-) delete mode 100644 bookwormDB/SQLAPI.py rename bookwormDB/{mariaDB.py => duckdb.py} (90%) diff --git a/bookwormDB/SQLAPI.py b/bookwormDB/SQLAPI.py deleted file mode 100644 index f856759..0000000 --- a/bookwormDB/SQLAPI.py +++ /dev/null @@ -1,1159 +0,0 @@ -#!/usr/local/bin/python - - -from .variableSet import to_unicode -import json -import re -import copy -import MySQLdb -import hashlib -import logging -from .bwExceptions import BookwormException -import bookwormDB.configuration - -# If you have bookworms stored on a different host, you can create more lines -# like this. -# A different host and read_default_file will let you import things onto a -# different server. -general_prefs = dict() -general_prefs["default"] = { - "fastcat": "fastcat", - "fastword": "wordsheap", - "fullcat": "catalog", - "fullword": "words", - "read_default_file": "/etc/mysql/my.cnf" -} - -class DbConnect(object): - # This is a read-only account - def __init__(self, prefs=general_prefs['default'], database=None): - - self.dbname = database - - conf = bookwormDB.configuration.Configfile("read_only").config - - if database is None: - database = prefs['database'] - - connargs = { - "db": database, - "use_unicode": 'True', - "charset": 'utf8', - "user": conf.get("client", "user"), - "password": conf.get("client", "password"), - "host": conf.get("client", "host") - } - - logging.info("Preparing to connect with args") - logging.info(connargs) - - self.db = MySQLdb.connect(**connargs) - self.cursor = self.db.cursor() - -def fail_if_nonword_characters_in_columns(input): - keys = all_keys(input) - for key in keys: - if re.search(r"[^A-Za-z_$*0-9]", key): - logging.error("{} has nonword character".format(key)) - raise - - -def all_keys(input): - """ - Recursive function. Get every keyname in every descendant of a dictionary. - Iterates down on list and dict structures to search for more dicts with - keys. - """ - values = [] - if isinstance(input, dict): - values = list(input.keys()) - for key in list(input.keys()): - values = values + all_keys(input[key]) - if isinstance(input, list): - for value in input: - valleys = all_keys(value) - for val in valleys: - values.append(val) - return values - -# The basic object here is a 'userquery:' it takes dictionary as input, -# as defined in the API, and returns a value -# via the 'execute' function whose behavior -# depends on the mode that is passed to it. -# Given the dictionary, it can return a number of objects. -# The "Search_limits" array in the passed dictionary determines how many -# elements it returns; this lets multiple queries be bundled together. -# Most functions describe a subquery that might be combined into one big query -# in various ways. - -class userquery(object): - """ - The base class for a bookworm search. - """ - def __init__(self, outside_dictionary = {}, db = None, databaseScheme = None): - # Certain constructions require a DB connection already available, so we just start it here, or use the one passed to it. - fail_if_nonword_characters_in_columns(outside_dictionary) - try: - self.prefs = general_prefs[outside_dictionary['database']] - except KeyError: - # If it's not in the option, use some default preferences and search on localhost. 
This will work in most cases here on out. - self.prefs = general_prefs['default'] - self.prefs['database'] = outside_dictionary['database'] - self.outside_dictionary = outside_dictionary - - self.db = db - if db is None: - self.db = DbConnect(self.prefs) - self.databaseScheme = databaseScheme - if databaseScheme is None: - self.databaseScheme = databaseSchema(self.db) - - self.cursor = self.db.cursor - self.wordsheap = self.fallback_table(self.prefs['fastword']) - - self.words = self.prefs['fullword'] - """ - I'm now allowing 'search_limits' to either be a dictionary or an array of dictionaries: - this makes the syntax cleaner on most queries, - while still allowing some long ones from the Bookworm website. - """ - try: - if isinstance(outside_dictionary['search_limits'], list): - outside_dictionary['search_limits'] = outside_dictionary['search_limits'][0] - except: - outside_dictionary['search_limits'] = dict() - # outside_dictionary = self.limitCategoricalQueries(outside_dictionary) - self.defaults(outside_dictionary) # Take some defaults - self.derive_variables() # Derive some useful variables that the query will use. - - def defaults(self, outside_dictionary): - # these are default values;these are the only values that can be set in the query - # search_limits is an array of dictionaries; - # each one contains a set of limits that are mutually independent - # The other limitations are universal for all the search limits being set. - - # Set up a dictionary for the denominator of any fraction if it doesn't already exist: - self.search_limits = outside_dictionary.setdefault('search_limits', [{"word":["polka dot"]}]) - self.words_collation = outside_dictionary.setdefault('words_collation', "Case_Insensitive") - - lookups = {"Case_Insensitive":'word', 'lowercase':'lowercase', 'casesens':'casesens', "case_insensitive":"word", "Case_Sensitive":"casesens", "All_Words_with_Same_Stem":"stem", 'stem':'stem'} - self.word_field = lookups[self.words_collation] - - self.time_limits = outside_dictionary.setdefault('time_limits', [0, 10000000]) - self.time_measure = outside_dictionary.setdefault('time_measure', 'year') - - self.groups = set() - self.outerGroups = [] # [] # Only used on the final join; directionality matters, unlike for the other ones. - self.finalMergeTables=set() - try: - groups = outside_dictionary['groups'] - except: - groups = [outside_dictionary['time_measure']] - - if groups == [] or groups == ["unigram"]: - # Set an arbitrary column name that will always be true if nothing else is set. 
- groups.insert(0, "1 as In_Library") - - if (len(groups) > 1): - pass - # self.groups = credentialCheckandClean(self.groups) - # Define some sort of limitations here, if not done in dbbindings.py - - for group in groups: - - # There's a special set of rules for how to handle unigram and bigrams - multigramSearch = re.match("(unigram|bigram|trigram)(\d)?", group) - - if multigramSearch: - if group == "unigram": - gramPos = "1" - gramType = "unigram" - - else: - gramType = multigramSearch.groups()[0] - try: - gramPos = multigramSearch.groups()[1] - except: - print("currently you must specify which bigram element you want (eg, 'bigram1')") - raise - - lookupTableName = "%sLookup%s" %(gramType, gramPos) - self.outerGroups.append("%s.%s as %s" %(lookupTableName, self.word_field, group)) - self.finalMergeTables.add(" JOIN %s as %s ON %s.wordid=w%s" %(self.wordsheap, lookupTableName, lookupTableName, gramPos)) - self.groups.add("words%s.wordid as w%s" %(gramPos, gramPos)) - - else: - self.outerGroups.append(group) - try: - if self.databaseScheme.aliases[group] != group: - # Search on the ID field, not the basic field. - # debug(self.databaseScheme.aliases.keys()) - self.groups.add(self.databaseScheme.aliases[group]) - table = self.databaseScheme.tableToLookIn[group] - - joinfield = self.databaseScheme.aliases[group] - self.finalMergeTables.add(" JOIN " + table + " USING (" + joinfield + ") ") - else: - self.groups.add(group) - except KeyError: - self.groups.add(group) - - """ - There are the selections which can include table refs, and the groupings, which may not: - and the final suffix to enable fast lookup - """ - - self.selections = ",".join(self.groups) - self.groupings = ",".join([re.sub(".* as", "", group) for group in self.groups]) - - self.joinSuffix = "" + " ".join(self.finalMergeTables) - - """ - Define the comparison set if a comparison is being done. - """ - # Deprecated--tagged for deletion - # self.determineOutsideDictionary() - - # This is a little tricky behavior here--hopefully it works in all cases. It drops out word groupings. - - self.counttype = outside_dictionary.setdefault('counttype', ["WordCount"]) - - if isinstance(self.counttype, (str, bytes)): - self.counttype = [self.counttype] - - # index is deprecated, but the old version uses it. - self.index = outside_dictionary.setdefault('index', 0) - """ - # Ordinarily, the input should be an an array of groups that will both select and group by. - # The joins may be screwed up by certain names that exist in multiple tables, so there's an option to do something like - # SELECT catalog.bookid as myid, because WHERE clauses on myid will work but GROUP BY clauses on catalog.bookid may not - # after a sufficiently large number of subqueries. - # This smoothing code really ought to go somewhere else, since it doesn't quite fit into the whole API mentality and is - # more about the webpage. It is only included here as a stopgap: NO FURTHER APPLICATIONS USING IT SHOULD BE BUILT. - """ - - self.smoothingType = outside_dictionary.setdefault('smoothingType', "triangle") - self.smoothingSpan = outside_dictionary.setdefault('smoothingSpan', 3) - self.method = outside_dictionary.setdefault('method', "Nothing") - - def determineOutsideDictionary(self): - """ - deprecated--tagged for deletion. 
- """ - self.compare_dictionary = copy.deepcopy(self.outside_dictionary) - if 'compare_limits' in list(self.outside_dictionary.keys()): - self.compare_dictionary['search_limits'] = self.outside_dictionary['compare_limits'] - del self.outside_dictionary['compare_limits'] - elif sum([bool(re.search(r'\*', string)) for string in list(self.outside_dictionary['search_limits'].keys())]) > 0: - # If any keys have stars at the end, drop them from the compare set - # This is often a _very_ helpful definition for succinct comparison queries of many types. - # The cost is that an asterisk doesn't allow you - - for key in list(self.outside_dictionary['search_limits'].keys()): - if re.search(r'\*', key): - # rename the main one to not have a star - self.outside_dictionary['search_limits'][re.sub(r'\*', '', key)] = self.outside_dictionary['search_limits'][key] - # drop it from the compare_limits and delete the version in the search_limits with a star - del self.outside_dictionary['search_limits'][key] - del self.compare_dictionary['search_limits'][key] - else: # if nothing specified, we compare the word to the corpus. - deleted = False - for key in list(self.outside_dictionary['search_limits'].keys()): - if re.search('words?\d', key) or re.search('gram$', key) or re.match(r'word', key): - del self.compare_dictionary['search_limits'][key] - deleted = True - if not deleted: - # If there are no words keys, just delete the first key of any type. - # Sort order can't be assumed, but this is a useful failure mechanism of last resort. Maybe. - try: - del self.compare_dictionary['search_limits'][list(self.outside_dictionary['search_limits'].keys())[0]] - except: - pass - """ - The grouping behavior here is not desirable, but I'm not quite sure how yet. - Aha--one way is that it accidentally drops out a bunch of options. I'm just disabling it: let's see what goes wrong now. - """ - try: - pass# self.compare_dictionary['groups'] = [group for group in self.compare_dictionary['groups'] if not re.match('word', group) and not re.match("[u]?[bn]igram", group)]# topicfix? and not re.match("topic", group)] - except: - self.compare_dictionary['groups'] = [self.compare_dictionary['time_measure']] - - def derive_variables(self): - # These are locally useful, and depend on the search limits put in. - self.limits = self.search_limits - # Treat empty constraints as nothing at all, not as full restrictions. 
- for key in list(self.limits.keys()): - if self.limits[key] == []: - del self.limits[key] - self.set_operations() - self.create_catalog_table() - self.make_catwhere() - self.make_wordwheres() - - def tablesNeededForQuery(self, fieldNames=[]): - db = self.db - neededTables = set() - tablenames = dict() - tableDepends = dict() - db.cursor.execute("SELECT dbname,alias,tablename,dependsOn FROM masterVariableTable JOIN masterTableTable USING (tablename);") - for row in db.cursor.fetchall(): - tablenames[row[0]] = row[2] - tableDepends[row[2]] = row[3] - - for fieldname in fieldNames: - parent = "" - try: - current = tablenames[fieldname] - neededTables.add(current) - n = 1 - while parent not in ['fastcat', 'wordsheap']: - parent = tableDepends[current] - neededTables.add(parent) - current = parent - n+=1 - if n > 100: - raise TypeError("Unable to handle this; seems like a recursion loop in the table definitions.") - # This will add 'fastcat' or 'wordsheap' exactly once per entry - except KeyError: - pass - - return neededTables - - def needed_columns(self): - """ - Given a query, what are the columns that the compiled search will need materialized? - - Important for joining appropriate tables to the search. - - Needs a recursive function so it will find keys deeply nested inside "$or" searches. - """ - cols = [] - def pull_keys(entry): - val = [] - if isinstance(entry,list) and not isinstance(entry,(str, bytes)): - for element in entry: - val += pull_keys(element) - elif isinstance(entry,dict): - for k,v in entry.items(): - if k[0] != "$": - val.append(k) - else: - val += pull_keys(v) - else: - return [] - return [re.sub(" .*","",key) for key in val] - - return pull_keys(self.limits) + [re.sub(" .*","",g) for g in self.groups] - - - def create_catalog_table(self): - self.catalog = self.prefs['fastcat'] # 'catalog' # Can be replaced with a more complicated query in the event of longer joins. - - """ - This should check query constraints against a list of tables, and join to them. - So if you query with a limit on LCSH, and LCSH is listed as being in a separate table, - it joins the table "LCSH" to catalog; and then that table has one column, ALSO - called "LCSH", which is matched against. This allows a bookid to be a member of multiple catalogs. - """ - - self.relevantTables = set() - - databaseScheme = self.databaseScheme - columns = [] - for columnInQuery in self.needed_columns(): - columns.append(columnInQuery) - try: - self.relevantTables.add(databaseScheme.tableToLookIn[columnInQuery]) - try: - self.relevantTables.add(databaseScheme.tableToLookIn[databaseScheme.anchorFields[columnInQuery]]) - try: - self.relevantTables.add(databaseScheme.tableToLookIn[databaseScheme.anchorFields[databaseScheme.anchorFields[columnInQuery]]]) - except KeyError: - pass - except KeyError: - pass - except KeyError: - pass - # Could raise as well--shouldn't be errors--but this helps back-compatability. - - try: - moreTables = self.tablesNeededForQuery(columns) - except MySQLdb.ProgrammingError: - # What happens on old-style Bookworm constructions. 
- moreTables = set() - self.relevantTables = list(self.relevantTables.union(moreTables)) - - - self.relevantTables = [self.fallback_table(t) for t in self.relevantTables] - - self.catalog = self.fallback_table("fastcat") - if self.catalog == "fastcat_": - self.prefs['fastcat'] = "fastcat_" - - for table in self.relevantTables: - if table!="fastcat" and table!="words" and table!="wordsheap" and table!="master_bookcounts" and table!="master_bigrams" and table != "fastcat_" and table != "wordsheap_": - self.catalog = self.catalog + """ NATURAL JOIN """ + table + " " - - def fallback_table(self,tabname): - """ - Fall back to the saved versions if the memory tables are unpopulated. - - Use a cache first to avoid unnecessary queries, though the overhead shouldn't be much. - """ - tab = tabname - if tab.endswith("_"): - return tab - if tab in ["words","master_bookcounts","master_bigrams","catalog"]: - return tab - - if not hasattr(self,"fallbacks_cache"): - self.fallbacks_cache = {} - - if tabname in self.fallbacks_cache: - return self.fallbacks_cache[tabname] - - q = "SELECT COUNT(*) FROM {}".format(tab) - try: - self.db.cursor.execute(q) - length = self.db.cursor.fetchall()[0][0] - if length==0: - tab += "_" - except MySQLdb.ProgrammingError: - tab += "_" - - self.fallbacks_cache[tabname] = tab - - return tab - - def make_catwhere(self): - # Where terms that don't include the words table join. Kept separate so that we can have subqueries only working on one half of the stack. - catlimits = dict() - for key in list(self.limits.keys()): - # !!Warning--none of these phrases can be used in a bookworm as a custom table names. - - if key not in ('word', 'word1', 'word2', 'hasword') and not re.search("words\d", key): - catlimits[key] = self.limits[key] - if len(list(catlimits.keys())) > 0: - self.catwhere = where_from_hash(catlimits) - else: - self.catwhere = "TRUE" - if 'hasword' in list(self.limits.keys()): - """ - Because derived tables don't carry indexes, we're just making the new tables - with indexes on the fly to be stored in a temporary database, "bookworm_scratch" - Each time a hasword query is performed, the results of that query are permanently cached; - they're stored as a table that can be used in the future. - - This will create problems if database contents are changed; there needs to be some mechanism for - clearing out the cache periodically. - """ - - if self.limits['hasword'] == []: - del self.limits['hasword'] - return - - # deepcopy lets us get a real copy of the dictionary - # that can be changed without affecting the old one. - mydict = copy.deepcopy(self.outside_dictionary) - # This may make it take longer than it should; we might want the list to - # just be every bookid with the given word rather than - # filtering by the limits as well. - # It's not obvious to me which will be faster. - mydict['search_limits'] = copy.deepcopy(self.limits) - if isinstance(mydict['search_limits']['hasword'], (str, bytes)): - # Make sure it's an array - mydict['search_limits']['hasword'] = [mydict['search_limits']['hasword']] - """ - # Ideally, this would shuffle into an order ensuring that the - rarest words were nested deepest. - # That would speed up query execution by ensuring there - wasn't some massive search for 'the' being - # done at the end. - - Instead, it just pops off the last element and sets up a - recursive nested join. for every element in the - array. 
- """ - mydict['search_limits']['word'] = [mydict['search_limits']['hasword'].pop()] - if len(mydict['search_limits']['hasword']) == 0: - del mydict['search_limits']['hasword'] - tempquery = userquery(mydict, databaseScheme=self.databaseScheme) - listofBookids = tempquery.bookid_query() - - # Unique identifier for the query that persists across the - # various subqueries. - queryID = hashlib.sha1(listofBookids).hexdigest()[:20] - - tmpcatalog = "bookworm_scratch.tmp" + re.sub("-", "", queryID) - - try: - self.cursor.execute("CREATE TABLE %s (bookid MEDIUMINT, PRIMARY KEY (bookid)) ENGINE=MYISAM;" %tmpcatalog) - self.cursor.execute("INSERT IGNORE INTO %s %s;" %(tmpcatalog, listofBookids)) - - except MySQLdb.OperationalError as e: - # Usually the error will be 1050, which is a good thing: it means we don't need to - # create the table. - # If it's not, something bad is happening. - if not re.search("1050.*already exists", str(e)): - raise - self.catalog += " NATURAL JOIN %s "%(tmpcatalog) - - def make_wordwheres(self): - self.wordswhere = " TRUE " - self.max_word_length = 0 - limits = [] - """ - "unigram" or "bigram" can be used as an alias for "word" in the search_limits field. - """ - - for gramterm in ['unigram', 'bigram']: - if gramterm in list(self.limits.keys()) and "word" not in list(self.limits.keys()): - self.limits['word'] = self.limits[gramterm] - del self.limits[gramterm] - - if 'word' in list(self.limits.keys()): - """ - This doesn't currently allow mixing of one and two word searches together in a logical way. - It might be possible to just join on both the tables in MySQL--I'm not completely sure what would happen. - But the philosophy has been to keep users from doing those searches as far as possible in any case. - """ - for phrase in self.limits['word']: - locallimits = dict() - array = phrase.split() - n = 0 - for word in array: - n += 1 - searchingFor = word - if self.word_field == "stem": - from nltk import PorterStemmer - searchingFor = PorterStemmer().stem_word(searchingFor) - if self.word_field == "case_insensitive" or self.word_field == "Case_Insensitive": - # That's a little joke. Get it? - searchingFor = searchingFor.lower() - selectString = "SELECT wordid FROM %s WHERE %s = %%s" % (self.wordsheap, self.word_field) - - logging.debug(selectString) - cursor = self.db.cursor - cursor.execute(selectString,(searchingFor,)) - for row in cursor.fetchall(): - wordid = row[0] - try: - locallimits['words'+str(n) + ".wordid"] += [wordid] - except KeyError: - locallimits['words'+str(n) + ".wordid"] = [wordid] - self.max_word_length = max(self.max_word_length, n) - - # Strings have already been escaped, so don't need to be escaped again. - if len(list(locallimits.keys())) > 0: - limits.append(where_from_hash(locallimits, comp = " = ", escapeStrings=False)) - # XXX for backward compatability - self.words_searched = phrase - # XXX end deprecated block - self.wordswhere = "(" + ' OR '.join(limits) + ")" - if limits == []: - # In the case that nothing has been found, tell it explicitly to search for - # a condition when nothing will be found. 
- self.wordswhere = "words1.wordid=-1" - - wordlimits = dict() - - limitlist = copy.deepcopy(list(self.limits.keys())) - - for key in limitlist: - if re.search("words\d", key): - wordlimits[key] = self.limits[key] - self.max_word_length = max(self.max_word_length, 2) - del self.limits[key] - - if len(list(wordlimits.keys())) > 0: - self.wordswhere = where_from_hash(wordlimits) - - return self.wordswhere - - def build_wordstables(self): - # Deduce the words tables we're joining against. The iterating on this can be made more general to get 3 or four grams in pretty easily. - # This relies on a determination already having been made about whether this is a unigram or bigram search; that's reflected in the self.selections - # variable. - - """ - We also now check for whether it needs the topic assignments: this could be generalized, with difficulty, for any other kind of plugin. - """ - - needsBigrams = (self.max_word_length == 2 or re.search("words2", self.selections)) - needsUnigrams = self.max_word_length == 1 or re.search("[^h][^a][^s]word", self.selections) - - if self.max_word_length > 2: - err = dict(code=400, message="Phrase is longer than what Bookworm supports") - raise BookwormException(err) - - needsTopics = bool(re.search("topic", self.selections)) or ("topic" in list(self.limits.keys())) - - if needsBigrams: - - self.maintable = 'master_bigrams' - - self.main = ''' - JOIN - master_bigrams as main - ON ('''+ self.prefs['fastcat'] +'''.bookid=main.bookid) - ''' - - self.wordstables = """ - JOIN %(wordsheap)s as words1 ON (main.word1 = words1.wordid) - JOIN %(wordsheap)s as words2 ON (main.word2 = words2.wordid) """ % self.__dict__ - - # I use a regex here to do a blanket search for any sort of word limitations. That has some messy sideffects (make sure the 'hasword' - # key has already been eliminated, for example!) but generally works. - - elif needsTopics and needsUnigrams: - self.maintable = 'master_topicWords' - self.main = ''' - NATURAL JOIN - master_topicWords as main - ''' - self.wordstables = """ - JOIN ( %(wordsheap)s as words1) ON (main.wordid = words1.wordid) - """ % self.__dict__ - - elif needsUnigrams: - self.maintable = 'master_bookcounts' - self.main = ''' - NATURAL JOIN - master_bookcounts as main - ''' - - self.wordstables = """ - JOIN ( %(wordsheap)s as words1) ON (main.wordid = words1.wordid) - """ % self.__dict__ - - elif needsTopics: - self.maintable = 'master_topicCounts' - self.main = ''' - NATURAL JOIN - master_topicCounts as main ''' - self.wordstables = " " - self.wordswhere = " TRUE " - - else: - """ - Have _no_ words table if no words searched for or grouped by; - instead just use nwords. This - means that we can use the same basic functions both to build the - counts for word searches and - for metadata searches, which is valuable because there is a - metadata-only search built in to every single ratio - query. (To get the denominator values). - - Call this OLAP, if you like. - """ - self.main = " " - self.operation = ','.join(self.catoperations) - """ - This, above is super important: the operation used is relative to the counttype, and changes to use 'catoperation' instead of 'bookoperation' - That's the place that the denominator queries avoid having to do a table scan on full bookcounts that would take hours, and instead takes - milliseconds. - """ - self.wordstables = " " - self.wordswhere = " TRUE " - # Just a dummy thing to make the SQL writing easier. Shouldn't take any time. Will usually be extended with actual conditions. 
- - def set_operations(self): - """ - This is the code that allows multiple values to be selected. - - All can be removed when we kill back compatibility ! It's all handled now by the general_API, not the SQL_API. - """ - - backCompatability = {"Occurrences_per_Million_Words":"WordsPerMillion", "Raw_Counts":"WordCount", "Percentage_of_Books":"TextPercent", "Number_of_Books":"TextCount"} - - for oldKey in list(backCompatability.keys()): - self.counttype = [re.sub(oldKey, backCompatability[oldKey], entry) for entry in self.counttype] - - self.bookoperation = {} - self.catoperation = {} - self.finaloperation = {} - - # Text statistics - self.bookoperation['TextPercent'] = "count(DISTINCT " + self.prefs['fastcat'] + ".bookid) as TextCount" - self.bookoperation['TextRatio'] = "count(DISTINCT " + self.prefs['fastcat'] + ".bookid) as TextCount" - self.bookoperation['TextCount'] = "count(DISTINCT " + self.prefs['fastcat'] + ".bookid) as TextCount" - - # Word Statistics - self.bookoperation['WordCount'] = "sum(main.count) as WordCount" - self.bookoperation['WordsPerMillion'] = "sum(main.count) as WordCount" - self.bookoperation['WordsRatio'] = "sum(main.count) as WordCount" - - """ - +Total Numbers for comparisons/significance assessments - This is a little tricky. The total words is EITHER the denominator (as in a query against words per Million) or the numerator+denominator (if you're comparing - Pittsburg and Pittsburgh, say, and want to know the total number of uses of the lemma. For now, "TotalWords" means the former and "SumWords" the latter, - On the theory that 'TotalWords' is more intuitive and only I (Ben) will be using SumWords all that much. - """ - self.bookoperation['TotalWords'] = self.bookoperation['WordsPerMillion'] - self.bookoperation['SumWords'] = self.bookoperation['WordsPerMillion'] - self.bookoperation['TotalTexts'] = self.bookoperation['TextCount'] - self.bookoperation['SumTexts'] = self.bookoperation['TextCount'] - - for stattype in list(self.bookoperation.keys()): - if re.search("Word", stattype): - self.catoperation[stattype] = "sum(nwords) as WordCount" - if re.search("Text", stattype): - self.catoperation[stattype] = "count(nwords) as TextCount" - - self.finaloperation['TextPercent'] = "IFNULL(numerator.TextCount,0)/IFNULL(denominator.TextCount,0)*100 as TextPercent" - self.finaloperation['TextRatio'] = "IFNULL(numerator.TextCount,0)/IFNULL(denominator.TextCount,0) as TextRatio" - self.finaloperation['TextCount'] = "IFNULL(numerator.TextCount,0) as TextCount" - - self.finaloperation['WordsPerMillion'] = "IFNULL(numerator.WordCount,0)*100000000/IFNULL(denominator.WordCount,0)/100 as WordsPerMillion" - self.finaloperation['WordsRatio'] = "IFNULL(numerator.WordCount,0)/IFNULL(denominator.WordCount,0) as WordsRatio" - self.finaloperation['WordCount'] = "IFNULL(numerator.WordCount,0) as WordCount" - - self.finaloperation['TotalWords'] = "IFNULL(denominator.WordCount,0) as TotalWords" - self.finaloperation['SumWords'] = "IFNULL(denominator.WordCount,0) + IFNULL(numerator.WordCount,0) as SumWords" - self.finaloperation['TotalTexts'] = "IFNULL(denominator.TextCount,0) as TotalTexts" - self.finaloperation['SumTexts'] = "IFNULL(denominator.TextCount,0) + IFNULL(numerator.TextCount,0) as SumTexts" - - """ - The values here will be chosen in build_wordstables; that's what decides if it uses the 'bookoperation' or 'catoperation' dictionary to build out. 
- """ - - self.finaloperations = list() - self.bookoperations = set() - self.catoperations = set() - - for summaryStat in self.counttype: - self.catoperations.add(self.catoperation[summaryStat]) - self.bookoperations.add(self.bookoperation[summaryStat]) - self.finaloperations.append(self.finaloperation[summaryStat]) - - def counts_query(self): - - self.operation = ','.join(self.bookoperations) - self.build_wordstables() - - countsQuery = """ - SELECT - %(selections)s, - %(operation)s - FROM - %(catalog)s - %(main)s - %(wordstables)s - WHERE - %(catwhere)s AND %(wordswhere)s - GROUP BY - %(groupings)s - """ % self.__dict__ - return countsQuery - - def bookid_query(self): - # A temporary method to setup the hasword query. - self.operation = ','.join(self.bookoperations) - self.build_wordstables() - - countsQuery = """ - SELECT - main.bookid as bookid - FROM - %(catalog)s - %(main)s - %(wordstables)s - WHERE - %(catwhere)s AND %(wordswhere)s - """ % self.__dict__ - return countsQuery - - def debug_query(self): - query = self.ratio_query(materialize = False) - return json.dumps(self.denominator.groupings.split(",")) + query - - def query(self, materialize=False): - """ - We launch a whole new userquery instance here to build the denominator, based on the 'compare_dictionary' option (which in most - cases is the search_limits without the keys, see above; it can also be specially defined using asterisks as a shorthand to identify other fields to drop. - We then get the counts_query results out of that result. - """ - - """ - self.denominator = userquery(outside_dictionary = self.compare_dictionary,db=self.db,databaseScheme=self.databaseScheme) - self.supersetquery = self.denominator.counts_query() - supersetIndices = self.denominator.groupings.split(",") - if materialize: - self.supersetquery = derived_table(self.supersetquery,self.db,indices=supersetIndices).materialize() - """ - self.mainquery = self.counts_query() - self.countcommand = ','.join(self.finaloperations) - self.totalselections = ",".join([group for group in self.outerGroups if group!="1 as In_Library" and group != ""]) - if self.totalselections != "": - self.totalselections += ", " - - query = """ - SELECT - %(totalselections)s - %(countcommand)s - FROM - (%(mainquery)s) as numerator - %(joinSuffix)s - GROUP BY %(groupings)s;""" % self.__dict__ - - logging.debug("Query: %s" % query) - return query - - def returnPossibleFields(self): - try: - self.cursor.execute("SELECT name,type,description,tablename,dbname,anchor FROM masterVariableTable WHERE status='public'") - colnames = [line[0] for line in self.cursor.description] - returnset = [] - for line in self.cursor.fetchall(): - thisEntry = {} - for i in range(len(line)): - thisEntry[colnames[i]] = line[i] - returnset.append(thisEntry) - except: - returnset=[] - return returnset - - def bibliography_query(self, limit = "100"): - # I'd like to redo this at some point so it could work as an API call more naturally. - self.limit = limit - self.ordertype = "sum(main.count*10000/nwords)" - try: - if self.outside_dictionary['ordertype'] == "random": - if self.counttype == ["Raw_Counts"] or self.counttype == ["Number_of_Books"] or self.counttype == ['WordCount'] or self.counttype == ['BookCount'] or self.counttype == ['TextCount']: - self.ordertype = "RAND()" - else: - # This is a based on an attempt to match various different distributions I found on the web somewhere to give - # weighted results based on the counts. It's not perfect, but might be good enough. 
Actually doing a weighted random search is not easy without - # massive memory usage inside sql. - self.ordertype = "LOG(1-RAND())/sum(main.count)" - except KeyError: - pass - - # If IDF searching is enabled, we could add a term like '*IDF' here to overweight better selecting words - # in the event of a multiple search. - self.idfterm = "" - prep = self.counts_query() - - if self.main == " ": - self.ordertype="RAND()" - - bibQuery = """ - SELECT searchstring - FROM """ % self.__dict__ + self.prefs['fullcat'] + """ RIGHT JOIN ( - SELECT - """+ self.prefs['fastcat'] + """.bookid, %(ordertype)s as ordering - FROM - %(catalog)s - %(main)s - %(wordstables)s - WHERE - %(catwhere)s AND %(wordswhere)s - GROUP BY bookid ORDER BY %(ordertype)s DESC LIMIT %(limit)s - ) as tmp USING(bookid) ORDER BY ordering DESC; - """ % self.__dict__ - return bibQuery - - def disk_query(self, limit="100"): - pass - - def return_books(self): - # This preps up the display elements for a search: it returns an array with a single string for each book, sorted in the best possible way - silent = self.cursor.execute(self.bibliography_query()) - returnarray = [] - for line in self.cursor.fetchall(): - returnarray.append(line[0]) - if not returnarray: - # why would someone request a search with no locations? - # Turns out (usually) because the smoothing tricked them. - returnarray.append("") - newerarray = self.custom_SearchString_additions(returnarray) - return json.dumps(newerarray) - - def search_results(self): - # This is an alias that is handled slightly differently in - # APIimplementation (no "RESULTS" bit in front). Once - # that legacy code is cleared out, they can be one and the same. - - return json.loads(self.return_books()) - - def getActualSearchedWords(self): - if len(self.wordswhere) > 7: - words = self.outside_dictionary['search_limits']['word'] - # Break bigrams into single words. - words = ' '.join(words).split(' ') - self.cursor.execute("SELECT word FROM {} WHERE {}".format(self.wordsheap, where_from_hash({self.word_field:words}))) - self.actualWords = [item[0] for item in self.cursor.fetchall()] - else: - raise TypeError("Suspiciously low word count") - self.actualWords = ["tasty", "mistake", "happened", "here"] - - def custom_SearchString_additions(self, returnarray): - """ - It's nice to highlight the words searched for. 
This will be on partner web sites, so requires custom code for different databases - """ - db = self.outside_dictionary['database'] - if db in ('jstor', 'presidio', 'ChronAm', 'LOC', 'OL'): - self.getActualSearchedWords() - if db == 'jstor': - joiner = "&searchText=" - preface = "?Search=yes&searchText=" - urlRegEx = "http://www.jstor.org/stable/\d+" - if db == 'presidio' or db == 'OL': - joiner = "+" - preface = "# page/1/mode/2up/search/" - urlRegEx = 'http://archive.org/stream/[^"# ><]*' - if db in ('ChronAm', 'LOC'): - preface = "/;words=" - joiner = "+" - urlRegEx = 'http://chroniclingamerica.loc.gov[^\"><]*/seq-\d+' - newarray = [] - for string in returnarray: - try: - base = re.findall(urlRegEx, string)[0] - newcore = ' search inside ' - string = re.sub("^", "", string) - string = re.sub("$", "", string) - string = string+newcore - except IndexError: - pass - newarray.append(string) - # Arxiv is messier, requiring a whole different URL interface: http://search.arxiv.org:8081/paper.jsp?r=1204.3352&qs=netwokr - else: - newarray = returnarray - return newarray - - def return_tsv(self, query = "ratio_query"): - if self.outside_dictionary['counttype'] == "Raw_Counts" or self.outside_dictionary['counttype'] == ["Raw_Counts"]: - query="counts_query" - # This allows much speedier access to counts data if you're - # willing not to know about all the zeroes. - # Will not work as well once the id_fields are in use. - querytext = getattr(self, query)() - silent = self.cursor.execute(querytext) - results = ["\t".join([to_unicode(item[0]) for item in self.cursor.description])] - lines = self.cursor.fetchall() - for line in lines: - items = [] - for item in line: - item = to_unicode(item) - item = re.sub("\t", "", item) - items.append(item) - results.append("\t".join(items)) - return "\n".join(results) - - def execute(self): - # This performs the query using the method specified in the passed parameters. - if self.method == "Nothing": - pass - else: - value = getattr(self, self.method)() - return value - -class derived_table(object): - """ - MySQL/MariaDB doesn't have good subquery materialization, - so I'm implementing it by hand. - """ - def __init__(self, SQLstring, db, indices = [], dbToPutIn = "bookworm_scratch"): - """ - initialize with the code to create the table; the database it will be in - (to prevent conflicts with other identical queries in other dbs); - and the list of all tables to be indexed - (optional, but which can really speed up joins) - """ - self.query = SQLstring - self.db = db - # Each query is identified by a unique key hashed - # from the query and the dbname. - self.queryID = dbToPutIn + "." + "derived" + hashlib.sha1(self.query + db.dbname).hexdigest() - self.indices = "(" + ",".join(["INDEX(%s)" % index for index in indices]) + ")" if indices != [] else "" - - def setStorageEngines(self, temp): - """ - Chooses where and how to store tables. - """ - self.tempString = "TEMPORARY" if temp else "" - self.engine = "MEMORY" if temp else "MYISAM" - - def checkCache(self): - """ - Checks what's already been calculated. 
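        A minimal sketch of the naming scheme set up in __init__ above (one caveat:
        on Python 3 the hash input has to be bytes, so the sketch encodes the string first):

            import hashlib

            def derived_table_name(sql, dbname, scratch_db="bookworm_scratch"):
                # One deterministic table name per (query, database) pair.
                digest = hashlib.sha1((sql + dbname).encode("utf-8")).hexdigest()
                return f"{scratch_db}.derived{digest}"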
- """ - try: - (self.count, self.created, self.modified, self.createCode, self.data) = self.db.cursor.execute("SELECT count,created,modified,createCode,data FROM bookworm_scratch.cache WHERE fieldname='%s'" %self.queryID)[0] - return True - except: - (self.count, self.created, self.modified, self.createCode, self.data) = [None]*5 - return False - - def fillTableWithData(self, data): - dataCode = "INSERT INTO %s values ("%self.queryID + ", ".join(["%s"]*len(data[0])) + ")" - self.db.cursor.executemany(dataCode, data) - self.db.db.commit() - - -class databaseSchema(object): - """ - This class stores information about the database setup that is used to optimize query creation query - and so that queries know what tables to include. - It's broken off like this because it might be usefully wrapped around some of the backend features, - because it shouldn't be run multiple times in a single query (that spawns two instances of itself), as was happening before. - - It's closely related to some of the classes around variables and variableSets in the Bookworm Creation scripts, - but is kept separate for now: that allows a bit more flexibility, but is probaby a Bad Thing in the long run. - """ - - def __init__(self, db): - self.db = db - self.cursor=db.cursor - # has of what table each variable is in - self.tableToLookIn = {} - # hash of what the root variable for each search term is (eg, 'author_birth' might be crosswalked to 'authorid' in the main catalog.) - self.anchorFields = {} - # aliases: a hash showing internal identifications codes that dramatically speed up query time, but which shouldn't be exposed. - # So you can run a search for "state," say, and the database will group on a 50-element integer code instead of a VARCHAR that - # has to be long enough to support "Massachusetts" and "North Carolina." - # A couple are hard-coded in, but most are derived by looking for fields that end in the suffix "__id" later. - - if self.db.dbname == "presidio": - self.aliases = {"classification":"lc1", "lat":"pointid", "lng":"pointid"} - else: - self.aliases = dict() - - try: - # First build using the new streamlined tables; if that fails, - # build using the old version that hits the INFORMATION_SCHEMA, - # which is bad practice. - self.newStyle(db) - except: - # The new style will fail on old bookworms: a failure is an easy way to test - # for oldness, though of course something else might be causing the failure. - self.oldStyle(db) - - def newStyle(self, db): - self.tableToLookIn['bookid'] = self.fallback_table('fastcat') - self.anchorFields['bookid'] = self.fallback_table('fastcat') - self.anchorFields['wordid'] = 'wordid' - self.tableToLookIn['wordid'] = self.wordsheap - - tablenames = dict() - tableDepends = dict() - db.cursor.execute("SELECT dbname,alias,tablename,dependsOn FROM masterVariableTable JOIN masterTableTable USING (tablename);") - for row in db.cursor.fetchall(): - (dbname, alias, tablename, dependsOn) = row - self.tableToLookIn[dbname] = tablename - self.anchorFields[tablename] = dependsOn - self.aliases[dbname] = alias - - def oldStyle(self, db): - - # This is sorted by engine DESC so that memory table locations will overwrite disk table in the hash. 
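        # (A sketch of what these lookups typically end up holding; the field and
        #  table names here are invented for illustration.)
        #
        #      tableToLookIn = {"bookid": "fastcat", "year": "fastcat", "author_birth": "authors"}
        #      anchorFields  = {"author_birth": "authorid", "bookid": "fastcat", "wordid": "wordid"}
        #      aliases       = {"state": "state__id"}
        #
        #  A column named "state__id" becomes the alias for "state", so grouping happens
        #  on the small integer code rather than on the full VARCHAR value.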
- - self.cursor.execute("SELECT ENGINE,TABLE_NAME,COLUMN_NAME,COLUMN_KEY,TABLE_NAME='fastcat' OR TABLE_NAME='wordsheap' AS privileged FROM information_schema.COLUMNS JOIN INFORMATION_SCHEMA.TABLES USING (TABLE_NAME,TABLE_SCHEMA) WHERE TABLE_SCHEMA='%(dbname)s' ORDER BY privileged,ENGINE DESC,TABLE_NAME,COLUMN_KEY DESC;" % self.db.__dict__) - columnNames = self.cursor.fetchall() - - parent = 'bookid' - previous = None - for databaseColumn in columnNames: - if previous != databaseColumn[1]: - if databaseColumn[3] == 'PRI' or databaseColumn[3] == 'MUL': - parent = databaseColumn[2] - previous = databaseColumn[1] - else: - parent = 'bookid' - else: - self.anchorFields[databaseColumn[2]] = parent - if databaseColumn[3]!='PRI' and databaseColumn[3]!="MUL": # if it's a primary key, this isn't the right place to find it. - self.tableToLookIn[databaseColumn[2]] = databaseColumn[1] - if re.search('__id\*?$', databaseColumn[2]): - self.aliases[re.sub('__id', '', databaseColumn[2])]=databaseColumn[2] - - try: - cursor = self.cursor.execute("SELECT dbname,tablename,anchor,alias FROM masterVariableTables") - for row in cursor.fetchall(): - if row[0] != row[3]: - self.aliases[row[0]] = row[3] - if row[0] != row[2]: - self.anchorFields[row[0]] = row[2] - # Should be uncommented, but some temporary issues with the building script - # self.tableToLookIn[row[0]] = row[1] - except: - pass - self.tableToLookIn['bookid'] = 'fastcat' - self.anchorFields['bookid'] = 'fastcat' - self.anchorFields['wordid'] = 'wordid' - self.tableToLookIn['wordid'] = 'wordsheap' - -def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_joiner = " OR "): - whereterm = [] - # The general idea here is that we try to break everything in search_limits down to a list, and then create a whereterm on that joined by whatever the 'joiner' is ("AND" or "OR"), with the comparison as whatever comp is ("=",">=",etc.). - # For more complicated bits, it gets all recursive until the bits are all in terms of list. - if joiner is None: - joiner = " AND " - for key in list(myhash.keys()): - values = myhash[key] - if isinstance(values, (str, bytes)) or isinstance(values, int) or isinstance(values, float): - # This is just human-being handling. You can pass a single value instead of a list if you like, and it will just convert it - # to a list for you. - values = [values] - # Or queries are special, since the default is "AND". This toggles that around for a subportion. - - if key == "$or" or key == "$OR": - local_set = [] - for comparison in values: - local_set.append(where_from_hash(comparison, comp=comp)) - whereterm.append(" ( " + " OR ".join(local_set) + " )") - elif key == '$and' or key == "$AND": - for comparison in values: - whereterm.append(where_from_hash(comparison, joiner=" AND ", comp=comp)) - elif isinstance(values, dict): - if joiner is None: - joiner = " AND " - # Certain function operators can use MySQL terms. - # These are the only cases that a dict can be passed as a limitations - operations = {"$gt":">", "$ne":"!=", "$lt":"<", - "$grep":" REGEXP ", "$gte":">=", - "$lte":"<=", "$eq":"="} - - for operation in list(values.keys()): - if operation == "$ne": - # If you pass a lot of ne values, they must *all* be false. 
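                    # (A sketch of the generated SQL, with whitespace and nesting simplified:
                    #      where_from_hash({"year": [1900, 1901]})
                    #          ->  ( (year = 1900) OR (year = 1901) )
                    #      where_from_hash({"author": {"$ne": ["Smith", "Jones"]}})
                    #          ->  ( (author != 'Smith') AND (author != 'Jones') )
                    #  A plain list is OR-joined; every $ne exclusion must hold at once.)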
- subjoiner = " AND " - else: - subjoiner = " OR " - whereterm.append(where_from_hash({key:values[operation]}, comp=operations[operation], list_joiner=subjoiner)) - elif isinstance(values, list): - # and this is where the magic actually happens: - # the cases where the key is a string, and the target is a list. - if isinstance(values[0], dict): - # If it's a list of dicts, then there's one thing that happens. - # Currently all types are assumed to be the same: - # you couldn't pass in, say {"year":[{"$gte":1900}, 1898]} to - # catch post-1898 years except for 1899. Not that you - # should need to. - for entry in values: - whereterm.append(where_from_hash(entry)) - else: - # Note that about a third of the code is spent on escaping strings. - if escapeStrings: - if isinstance(values[0], (str, bytes)): - quotesep = "'" - else: - quotesep = "" - - def escape(value): - # NOTE: stringifying the escape from MySQL; hopefully doesn't break too much. - return str(MySQLdb.escape_string(to_unicode(value)), 'utf-8') - else: - def escape(value): - return to_unicode(value) - quotesep = "" - - joined = list_joiner.join([" ({}{}{}{}{}) ".format(key, comp, quotesep, escape(value), quotesep) for value in values]) - whereterm.append(" ( {} ) ".format(joined)) - - if len(whereterm) > 1: - return "(" + joiner.join(whereterm) + ")" - else: - return whereterm[0] - # This works pretty well, except that it requires very specific sorts of terms going in, I think. diff --git a/bookwormDB/configuration.py b/bookwormDB/configuration.py index bb54a36..146400c 100644 --- a/bookwormDB/configuration.py +++ b/bookwormDB/configuration.py @@ -1,221 +1,4 @@ -#!/usr/bin/python -from __future__ import print_function -import configparser -import os -import sys -import re -import MySQLdb -import argparse -import getpass -import subprocess -import logging -import uuid - -def update(): - ## Assemble list of all bookworms on the system. - - bookworms = [] ### ... - - ## Create on-disk versions of memory tables if 'fastcat_' does not exists. - - pass - - ## Allow "'bookworm'@'localhost' IDENTIFIED BY ''" to have select access on each bookworm. - - pass - - ## Print a message about enabling access. - - pass - - -def create(ask_about_defaults=True, database=None): - """ - Through interactive prompts at the command line, builds up a file at - bookworm.cnf that can be used to set preferences for the installation. - """ - - if ask_about_defaults: - print(""" - Welcome to Bookworm. - ~~~~~~~~~~~~~~~~~~~~ - First off, let's build a configuration file. This will live - at bookworm.cnf in the current directory: if you mistype anything, - or want to change settings, edit it directly in that location. - - For each of the following entries, type the value you want, or hit - enter to accept the default: - - """) - else: - logging.info("Auto-generating config file.") - - """ - First, we go to great efforts to find some sensible defaults - Usually the user can just hit enter. 
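    For example, answering "my bookworm" at the prompt leaves behind a file that
    reads back like this (a sketch):

        import configparser

        cnf = configparser.ConfigParser()
        cnf.read("bookworm.cnf")
        cnf.get("client", "database")    # -> 'my_bookworm'; spaces are replaced with underscores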
- """ - - systemConfigFile = configparser.SafeConfigParser(allow_no_value=True) - - defaults = dict() - # The default bookwormname is just the current location - - if database is None: - defaults['database'] = os.path.relpath(".", "..") - else: - defaults['database'] = database - - defaults["user"] = "bookworm" - defaults["password"] = "" - - config = configparser.ConfigParser() - - for section in ["client"]: - config.add_section(section) - - if ask_about_defaults: - database = input("What is the name of the bookworm [" + defaults['database'] + "]: ") - else: - database = defaults['database'] - - config.set("client", "database", re.sub(" ","_",database)) - config.write(open("bookworm.cnf", "w")) - -class Configfile(object): - def __init__(self, usertype, possible_locations=None, default=None, ask_about_defaults=True): - """ - Initialize with the type of the user. The last encountered file on - the list is the one that will be used. - If default is set, a file will be created at that location if none - of the files in possible_locations exist. - - If ask_about_defaults is false, it will do a force installation. - """ - - if not usertype in ['read_only', 'admin']: - raise NotImplementedError("Only read_only and admin supported") - - self.ask_about_defaults = ask_about_defaults - - logging.debug("Creating configuration as " + usertype) - - self.usertype = usertype - - if possible_locations is None: - possible_locations = self.default_locations_from_type(usertype) - - self.location = None - - self.config = configparser.ConfigParser(allow_no_value=True) - - if usertype=="admin": - - self.ensure_section("client") - self.ensure_section("mysqld") - - self.config.set("client", "host", "localhost") - self.config.set("client", "user", "root") - self.config.set("client", "password", "") - self.config.set("client", "clienthostname", "localhost") - - else: - self.ensure_section("client") - self.config.set("client", "host", "localhost") - self.config.set("client", "user", "bookworm") - self.config.set("client", "password", "") - # A different section here can change the name of the host - # allowed to log in for select queries. - self.config.set("client", "clienthostname", "localhost") - - self.read_config_files(possible_locations) - - for string in possible_locations: - if os.path.exists(string): - self.location = string - - - def read_config_files(self, used_files): - for file in used_files: - try: - self.config.read(file) - try: - password_file = self.config.get("client", "password_file") - except configparser.NoOptionError: - password_file = None - if password_file: - try: - with open(password_file) as fin: - password = fin.read().rstrip("\n").rstrip("\r") - self.config.set("client", "password", password) - except: - logging.error(f"Error reading passworm from {password_file}") - raise - self.config.remove_option("client", "password_file") - except configparser.MissingSectionHeaderError: - # Not every file needs every section. - pass - - - def default_locations_from_type(self,usertype): - """ - The default locations for each usertype. 
- Note that these are in ascending order of importance: - so the preferred location for admin and read_only configuration - is in /etc/bookworm/admin.cnf - and /etc/bookworm/client.cnf - """ - - if usertype=="admin": - return [os.path.abspath(os.path.expanduser("~/.my.cnf")), - os.path.abspath(os.path.expanduser("~/my.cnf")), - "/etc/bookworm/admin.cnf"] - if usertype == "read_only": - return ["~/.bookworm-sql.cnf", "/etc/bookworm/client.cnf"] - else: - return [] - - def ensure_section(self,section): - if not self.config.has_section(section): - self.config.add_section(section) - - def set_bookworm_options(self): - """ - A number of specific MySQL changes to ensure fast queries on Bookworm. - """ - self.ensure_section("mysqld") - - mysqldoptions = {"### = =": "THIS FILE SHOULD GENERALLY BE PLACED AT /etc/mysql/my.cnf = = = ###", "max_allowed_packet":"512M","sort_buffer_size":"8M","read_buffer_size":"8M","read_rnd_buffer_size":"8M","bulk_insert_buffer_size":"512M","myisam_sort_buffer_size":"5512M","myisam_max_sort_file_size":"5500G","key_buffer_size":"2500M","query_cache_size":"32M","tmp_table_size":"1024M","max_heap_table_size":"2048M","character_set_server":"utf8","query_cache_type":"1","query_cache_limit":"8M"} - - for option in list(mysqldoptions.keys()): - if not self.config.has_option("mysqld",option): - self.config.set("mysqld", option, mysqldoptions[option]) - else: - if mysqldoptions[option] != self.config.get("mysqld",option): - choice = input("Do you want to change the value for " + option + " from " + self.config.get("mysqld",option) + " to the bookworm-recommended " + mysqldoptions[option] + "? (y/N): ") - if choice=="y": - self.config.set("mysqld",option,mysqldoptions[option]) - - self.write_out() - - def write_out(self): - """ - Write out a new version of the configfile to stdout. - The user is responsible for putting this somewhere it will - affect the MySQL preferences - """ - self.config.write(sys.stdout) - -def recommend_my_cnf(known_loc = None): - if known_loc is None: - for loc in ["/usr/etc/my.cnf","/etc/mysql/my.cnf","/etc/my.cnf"]: - if os.path.exists(loc): - known_loc = loc - if known_loc is None: - raise FileNotFoundError("Could not find MySQL folder: pass one.") - cnf = Configfile(usertype = 'admin', possible_locations = [known_loc]) - cnf.set_bookworm_options() - cnf.write_out() diff --git a/bookwormDB/mariaDB.py b/bookwormDB/duckdb.py similarity index 90% rename from bookwormDB/mariaDB.py rename to bookwormDB/duckdb.py index 7c10492..8462eee 100644 --- a/bookwormDB/mariaDB.py +++ b/bookwormDB/duckdb.py @@ -7,42 +7,10 @@ import json import re import copy -import MySQLdb import hashlib import logging - - - -# If you have bookworms stored on a different host, you can create more lines -# like this. -# A different host and read_default_file will let you import things onto a -# different server. 
- -class DbConnect(object): - # This is a read-only account - def __init__(self, database=None, host=None): - - self.dbname = database - - import bookwormDB.configuration - conf = bookwormDB.configuration.Configfile("read_only").config - - if database is None: - raise BookwormException("You must specify a database") - - connargs = { - "db": database, - "use_unicode": 'True', - "charset": 'utf8', - "user": conf.get("client", "user"), - "password": conf.get("client", "password"), - "host": conf.get("client", "host") - } - - logging.warning("Preparing to connect with args") - logging.warning(connargs) - self.db = MySQLdb.connect(**connargs) - self.cursor = self.db.cursor() +from MySQLdb import escape_string +#import duckdb def fail_if_nonword_characters_in_columns(input): keys = all_keys(input) @@ -51,7 +19,6 @@ def fail_if_nonword_characters_in_columns(input): logging.error("{} has nonword character".format(key)) raise - def all_keys(input): """ Recursive function. Get every keyname in every descendant of a dictionary. @@ -100,7 +67,7 @@ def check_query(query): ' counts are supported by the SQL api, but passed {}'.format(v)}) -class Query(object): +class DuckQuery(object): """ The base class for a bookworm search. """ @@ -115,14 +82,12 @@ def __init__(self, query_object = {}, db = None, databaseScheme = None): self.db = db if db is None: - self.db = DbConnect(query_object['database']) + raise TypeError("Must supply database.") self.databaseScheme = databaseScheme if databaseScheme is None: self.databaseScheme = databaseSchema(self.db) - self.cursor = self.db.cursor - # Some tablenames. self.wordsheap = self.databaseScheme.fallback_table('wordsheap') @@ -149,7 +114,12 @@ def defaults(self, query_object): self.search_limits = query_object.setdefault('search_limits', [{"word":["polka dot"]}]) self.words_collation = query_object.setdefault('words_collation', "Case_Insensitive") - lookups = {"Case_Insensitive":'word', 'lowercase':'lowercase', 'casesens':'casesens', "case_insensitive":"word", "Case_Sensitive":"casesens", "All_Words_with_Same_Stem":"stem", 'stem':'stem'} + lookups = { + "Case_Insensitive":'word', + 'lowercase':'lowercase', + 'casesens':'casesens', "case_insensitive":"word", + "Case_Sensitive":"casesens", "All_Words_with_Same_Stem":"stem", + 'stem':'stem'} self.word_field = lookups[self.words_collation] self.time_limits = query_object.setdefault('time_limits', [0, 10000000]) @@ -193,7 +163,7 @@ def defaults(self, query_object): raise lookupTableName = "%sLookup%s" %(gramType, gramPos) - self.outerGroups.append("%s.%s as %s" %(lookupTableName, self.word_field, group)) + self.outerGroups.append(f"`{lookupTableName}`.`{self.word_field}` as {group}") self.finalMergeTables.add(" JOIN %s as %s ON %s.wordid=w%s" %(self.wordsheap, lookupTableName, lookupTableName, gramPos)) self.groups.add("words%s.wordid as w%s" %(gramPos, gramPos)) @@ -265,10 +235,7 @@ def determineOutsideDictionary(self): del self.compare_dictionary['search_limits'][list(self.query_object['search_limits'].keys())[0]] except: pass - """ - The grouping behavior here is not desirable, but I'm not quite sure how yet. - Aha--one way is that it accidentally drops out a bunch of options. I'm just disabling it: let's see what goes wrong now. - """ + def derive_variables(self): # These are locally useful, and depend on the search limits put in. 
@@ -303,8 +270,8 @@ def tablesNeededForQuery(self, fieldNames=[]): q = "SELECT dbname,alias,tablename,dependsOn FROM masterVariableTable JOIN masterTableTable USING (tablename);" logging.debug(q) - db.cursor.execute(q) - for row in db.cursor.fetchall(): + db.execute(q) + for row in db.fetchall(): tablenames[row[0]] = row[2] tableDepends[row[2]] = row[3] @@ -367,17 +334,17 @@ def wordid_query(self): def make_group_query(self): aliases = [self.databaseScheme.aliases[g] for g in self.query_object["groups"]] - if len(aliases) > 0: - return "GROUP BY {}".format(", ".join(aliases)) + if len(self.query_object["groups"]) > 0: + return "GROUP BY {}".format(", ".join(self.query_object["groups"])) else: return " " def main_table(self): if self.gram_size() == 1: - return 'master_bookcounts as main' + return 'unigrams_wordid as main' if self.gram_size() == 2: - return 'master_bigrams as main' + return 'bigrams_word1_word2 as main' def full_query_tables(self): # Joins are needed to provide groups, but *not* to provide @@ -452,21 +419,15 @@ def create_catalog_table(self): databaseScheme = self.databaseScheme cols = self.needed_columns() - cols = [c for c in cols if not c in ["word", "word1", "word2"]] + + cols = [c for c in cols if not c in {"word", "word1", "word2", "word3", "word4"}] self.relevantTables = self.databaseScheme.tables_for_variables(cols) # moreTables = self.tablesNeededForQuery(columns) - self.catalog = " NATURAL JOIN ".join(self.relevantTables) return self.catalog -# for table in self.relevantTables: -# if table!="fastcat" and table!="words" and table!="wordsheap" and table!="master_bookcounts" and table!="master_bigrams" and table != "fastcat_" and table != "wordsheap_": -# self.catalog = self.catalog + """ NATURAL JOIN """ + table + " "# -# -# return self.catalog - def make_catwhere(self, query = "sub"): # Where terms that don't include the words table join. Kept separate so that we can have subqueries only working on one half of the stack. @@ -541,18 +502,17 @@ def make_wordwheres(self): searchingFor = searchingFor.lower() - selectString = "SELECT wordid FROM %s WHERE %s = %%s" % (self.wordsheap, self.word_field) - logging.debug(selectString) - cursor = self.db.cursor - cursor.execute(selectString,(searchingFor,)) + selectString = f"SELECT wordid FROM wordsheap_ WHERE word = '{searchingFor}'" + logging.warning(selectString) + self.db.execute(selectString) # Set the search key being used. search_key = "wordid" if self.gram_size() > 1: # 1-indexed entries in the bigram tables. 
- search_key = "word{}".format(n + 1) + search_key = f"word{n + 1}" - for row in cursor.fetchall(): + for row in self.db.fetchall(): wordid = row[0] try: locallimits[search_key] += [wordid] @@ -604,7 +564,7 @@ def build_wordstables(self): if needsBigrams: self.main = ''' - master_bigrams as main + bigrams_word1_word2 as main ''' self.wordstables = """ @@ -616,7 +576,7 @@ def build_wordstables(self): elif needsUnigrams: self.main = ''' - master_bookcounts as main + unigrams_wordid as main ''' self.wordstables = """ @@ -662,20 +622,20 @@ def set_operations(self): if with_words: if "TextCount" in self.query_object['counttype']: - output.append("count(DISTINCT main.bookid) as TextCount") + output.append("count(DISTINCT main.bookid) as 'TextCount'") if "WordCount" in self.query_object['counttype']: - output.append("sum(main.count) as WordCount") + output.append("sum(main.count) as 'WordCount'") else: if "WordCount" in self.query_object['counttype']: - output.append("sum(nwords) as WordCount") + output.append("sum(nwords) as 'WordCount'") if "TextCount" in self.query_object['counttype']: - output.append("count(nwords) as TextCount") + output.append("count(nwords) as 'TextCount'") return output def bookid_query(self): - q = "SELECT bookid FROM {catalog} WHERE {catwhere}""".format(**self.__dict__) + q = f""" {self.catwhere} """ logging.debug("'{}'".format(self.catwhere)) @@ -683,7 +643,7 @@ def bookid_query(self): self.bookid_where = " TRUE " else: - self.bookid_where = " bookid IN ({}) ".format(q) + self.bookid_where = q return self.bookid_where @@ -774,8 +734,8 @@ def getActualSearchedWords(self): words = ' '.join(words).split(' ') q = "SELECT word FROM {} WHERE {}".format(self.wordsheap, where_from_hash({self.word_field:words})) logging.debug(q) - self.cursor.execute(q) - self.actualWords = [item[0] for item in self.cursor.fetchall()] + self.db.execute(q) + self.actualWords = [item[0] for item in self.db.fetchall()] else: raise TypeError("Suspiciously low word count") @@ -838,8 +798,8 @@ class databaseSchema(object): """ def __init__(self, db): + # XXXX self.db = db - self.cursor=db.cursor # has of what table each variable is in self.tableToLookIn = {} @@ -880,9 +840,9 @@ def newStyle(self, db): tableDepends = dict() q = "SELECT dbname,alias,tablename,dependsOn FROM masterVariableTable JOIN masterTableTable USING (tablename);" logging.debug(q) - db.cursor.execute(q) + db.execute(q) - for row in db.cursor.fetchall(): + for row in db.fetchall(): (dbname, alias, tablename, dependsOn) = row tablename = self.fallback_table(tablename) dependsOn = self.fallback_table(dependsOn) @@ -901,7 +861,7 @@ def fallback_table(self,tabname): tab = tabname if tab.endswith("_"): return tab - if tab in ["words","master_bookcounts","master_bigrams","catalog"]: + if tab in ["words","unigrams_wordid","bigrams_word1_word2","catalog"]: return tab if not hasattr(self,"fallbacks_cache"): @@ -909,15 +869,14 @@ def fallback_table(self,tabname): if tabname in self.fallbacks_cache: return self.fallbacks_cache[tabname] - q = "SELECT COUNT(*) FROM {}".format(tab) logging.debug(q) try: - self.db.cursor.execute(q) - length = self.db.cursor.fetchall()[0][0] + self.db.execute(q) + length = self.db.fetchall()[0][0] if length==0: tab += "_" - except MySQLdb.ProgrammingError: + except RuntimeError: tab += "_" self.fallbacks_cache[tabname] = tab @@ -925,22 +884,26 @@ def fallback_table(self,tabname): return tab def tables_for_variables(self, variables, tables = []): - tables = [] - + lookups = [] for variable in variables: + stack_here 
= [] lookup_table = self.tableToLookIn[variable] if lookup_table in tables: continue - tables.append(lookup_table) + stack_here.append(lookup_table) while True: anchor = self.fallback_table(self.anchorFields[lookup_table]) - if anchor in tables: + if anchor in stack_here or anchor in lookups: break else: - tables.append(anchor) + # Must go first in duck. + stack_here.append(anchor) lookup_table = anchor - - return tables + stack_here.reverse() + for variable in stack_here: + if not variable in lookups: + lookups.append(variable) + return list(lookups) @@ -1003,7 +966,7 @@ def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_ def escape(value): # NOTE: stringifying the escape from MySQL; hopefully doesn't break too much. - return str(MySQLdb.escape_string(to_unicode(value)), 'utf-8') + return str(escape_string(to_unicode(value)), 'utf-8') else: def escape(value): return to_unicode(value) diff --git a/bookwormDB/general_API.py b/bookwormDB/general_API.py index 801f4c1..1f7bc45 100644 --- a/bookwormDB/general_API.py +++ b/bookwormDB/general_API.py @@ -5,9 +5,7 @@ from pyarrow import feather from copy import deepcopy from collections import defaultdict -from .mariaDB import DbConnect -from .SQLAPI import userquery -from .mariaDB import Query +from .duckdb import DuckQuery from .bwExceptions import BookwormException from .query_cache import Query_Cache import re @@ -239,13 +237,13 @@ class APIcall(object): Without a "return_pandas_frame" method, it won't run. """ - def __init__(self, APIcall): + def __init__(self, query): """ Initialized with a dictionary unJSONed from the API defintion. """ - self.query = APIcall + self.query = query self.idiot_proof_arrays() self.set_defaults() @@ -743,21 +741,17 @@ def generate_pandas_frame(self, call): together = pd.concat(d) together[count_fields].sum() -class SQLAPIcall(APIcall): +class DuckDBCall(APIcall): + """ + Fetches from DuckDB. Must create a connection before passing, + to discourage on-the-fly creation which is slow. """ - To make a new backend for the API, you just need to extend the base API - call class like this. - - This one is comically short because all the real work is done in the - userquery object. - But the point is, you need to define a function "generate_pandas_frame" - that accepts an API call and returns a pandas frame. + def __init__(self, db, **kwargs): - But that API call is more limited than the general API; it need only - support "WordCount" and "TextCount" methods. - - """ + self.db = db + + super().__init__(**kwargs) def generate_pandas_frame(self, call = None): """ @@ -769,18 +763,16 @@ def generate_pandas_frame(self, call = None): more legacy code. 
""" - if call is None: call = self.query - con = DbConnect(self.query['database']) - q = Query(call).query() - logging.debug("Preparing to execute {}".format(q)) - df = read_sql(q, con.db) + + q = DuckQuery(call, db = self.db).query() + logging.warning("Preparing to execute {}".format(q)) + df = self.db.execute(q).df() logging.debug("Query retrieved") return df - def my_sort(something): if type(something) == list: return sorted(something) @@ -813,7 +805,6 @@ def standardized_query(query: dict) -> dict: return trimmed_call - class ProxyAPI(APIcall): """ @@ -841,11 +832,8 @@ def generate_pandas_frame(self, call = None) -> DataFrame: call = self.query call = deepcopy(call) call['format'] = 'feather' - print(call) query_string = json.dumps(call) - print(query_string) qstring = parse.quote(query_string) - print(qstring) remote_url = f"{self.endpoint}/?{qstring}" buffer = io.BytesIO() connection = request.urlopen(remote_url) @@ -854,8 +842,8 @@ def generate_pandas_frame(self, call = None) -> DataFrame: return feather.read_feather(buffer) except: # TODO: re-throw bookworm errors with additional context. - raise + class Caching_API(APIcall): def __init__(self, query: dict, cache: Query_Cache, fallback_api: APIcall, **kwargs): """ @@ -886,3 +874,4 @@ def generate_pandas_frame(self, call = None) -> DataFrame: # Don't bother doing this every time. self.cache.trim_cache() return resolution + diff --git a/bookwormDB/store.py b/bookwormDB/store.py index 897f359..7aadd8a 100644 --- a/bookwormDB/store.py +++ b/bookwormDB/store.py @@ -1,9 +1,26 @@ -# Just a place to store configurations rather than pass through a -# nest of functions. Bad idea? +# Just a place to store per-process configurations rather than pass through a +# nest of functions. Bad idea? Probably--I got it from too much Javascript. +# Only one location should ever have write +# access, certainly. But this should be easier to disentangle than endless passed 'args.whatever' +import yaml +from pathlib import Path -store_dict = {} +store_dict = { + 'duckdb_directory': Path(".") +} + +directories = [Path("."), Path("/var/lib/bookworm/"), *Path(".").parents, Path("~").expanduser()] +directories.reverse() # because we want the immediate parent first. + +for dir in directories: + for file in [".bookworm.yaml", ".bookworm.yml", "bookworm.yaml"]: + p = dir / file + if p.exists(): + print("Loading", dir) + store_dict = yaml.safe_load(p.open()) def store(): global store_dict return store_dict + diff --git a/bookwormDB/wsgi.py b/bookwormDB/wsgi.py index 8758adc..ee9d9de 100644 --- a/bookwormDB/wsgi.py +++ b/bookwormDB/wsgi.py @@ -1,4 +1,4 @@ -from bookwormDB.general_API import SQLAPIcall as SQLAPIcall, Caching_API, ProxyAPI +from bookwormDB.general_API import DuckDBCall, Caching_API, ProxyAPI import json from urllib.parse import unquote import logging @@ -7,11 +7,12 @@ from bookwormDB.store import store from .store import store from .query_cache import Query_Cache +from pathlib import Path +import duckdb from datetime import datetime - def content_type(query): try: format = query['format'] @@ -34,7 +35,6 @@ def content_type(query): args = store()['args'] -API_kwargs = {} if args.cache != "none": query_cache = Query_Cache( args.cache, @@ -43,9 +43,22 @@ def content_type(query): cold_storage = args.cold_storage) +class DuckPool(dict): + def __missing__(self, key): + # Mother duck said 'quack quack quack quack' + # and all of her five little duckies came back. 
+ duck_dir = store()['duckdb_directory'] + self[key] = duckdb.connect(str(Path(duck_dir) / key), read_only = True) + return self[key] + +duck_connections = DuckPool() + if args.remote_host is None: logging.info("Using SQL API") - API = SQLAPIcall + API = DuckDBCall + API_kwargs = { + } + else: logging.info("Using proxy API") API = ProxyAPI @@ -53,6 +66,8 @@ def content_type(query): "endpoint": args.remote_host } + + def application(environ, start_response, logfile = "bookworm_queries.log"): # Starting with code from http://wsgi.tutorial.codepoint.net/parsing-the-request-post try: @@ -67,7 +82,6 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): q = environ.get('QUERY_STRING') try: ip = environ.get('HTTP_X_FORWARDED_FOR') - # logging.debug("Request from {}".format(ip)) except: ip = environ.get('REMOTE_ADDR') if ip is None: @@ -89,7 +103,6 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): } - logging.debug("Received query {}".format(query)) start = datetime.now() @@ -108,8 +121,9 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): return [b'{"status":"error", "message": "You have passed invalid JSON to the Bookworm API"}'] args = store()['args'] + if args.cache == "none": - process = API(query, **API_kwargs) + process = API(query=query, db=duck_connections[query['database']], **API_kwargs) else: process = Caching_API(query, query_cache, API, **API_kwargs) From fc041002be1a275dcc3421dd0f939559ebde0814 Mon Sep 17 00:00:00 2001 From: Benjamin Schmidt <> Date: Fri, 14 May 2021 16:06:41 -0400 Subject: [PATCH 27/41] Closer to DuckDB, interim patch --- LICENSE.md | 2 + bookwormDB/CreateDatabase.py | 558 ---------------------------- bookwormDB/convertTSVtoJSONarray.py | 28 -- bookwormDB/duckdb.py | 26 +- bookwormDB/wsgi.py | 3 +- setup.py | 3 +- 6 files changed, 7 insertions(+), 613 deletions(-) delete mode 100755 bookwormDB/CreateDatabase.py delete mode 100644 bookwormDB/convertTSVtoJSONarray.py diff --git a/LICENSE.md b/LICENSE.md index cd4ca3a..9b5660b 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -18,3 +18,5 @@ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +Further revisions to the Python3 2020 Benjamin Schmidt diff --git a/bookwormDB/CreateDatabase.py b/bookwormDB/CreateDatabase.py deleted file mode 100755 index 704700c..0000000 --- a/bookwormDB/CreateDatabase.py +++ /dev/null @@ -1,558 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -import MySQLdb -import re -import json -import os -from .variableSet import variableSet -from .variableSet import splitMySQLcode -from bookwormDB.configuration import Configfile -from configparser import NoOptionError -import logging -import warnings -from .sqliteKV import KV - -warnings.filterwarnings('ignore', 'Table .* already exists') -warnings.filterwarnings("ignore", ".*Can't create database.*; database exists.*") -warnings.filterwarnings("ignore", ".*Unknown table.*") -warnings.filterwarnings("ignore", "Table 'mysql.table_stats' doesn't exist") -warnings.filterwarnings("ignore", "Data truncated for column .*") -warnings.filterwarnings("ignore", "Incorrect integer value.*") - -class DB(object): - def __init__(self, dbname = None): - if dbname == None: - self.dbname = config.get("client","database") - else: - self.dbname = dbname - if not re.match("^[A-Za-z0-9_]+$", self.dbname): - raise NameError("Database names must not include any spaces or special characters") - self.conn = None - - def connect(self, setengine=True): - #These scripts run as the Bookworm _Administrator_ on this machine; defined by the location of this my.cnf file. - conf = Configfile("admin") - try: - host = conf.config.get("mysqld", "host") - except NoOptionError: - host = conf.config.get("client", "host") - connect_args = { - "user": conf.config.get("client", "user"), - "passwd": conf.config.get("client", "password"), - "host": host, - "use_unicode": 'True', - "charset": 'utf8', - "db": '', - "local_infile": 1} - try: - logging.info(connect_args) - self.conn = MySQLdb.connect(**connect_args) - except MySQLdb.OperationalError: - # Sometimes mysql wants to connect over this rather than a socket: - # falling back to it for backward-compatibility. - logging.debug("Connection failed: attempting fallback over a different port") - if connect_args["host"] == "localhost": - connect_args["host"] = "127.0.0.1" - self.conn = MySQLdb.connect(**connect_args) - else: - raise - - cursor = self.conn.cursor() - cursor.execute("CREATE DATABASE IF NOT EXISTS %s default character set utf8" % self.dbname) - # Don't use native query attribute here to avoid infinite loops - cursor.execute("SET NAMES 'utf8'") - cursor.execute("SET CHARACTER SET 'utf8'") - if setengine: - try: - cursor.execute("SET default_storage_engine=MYISAM") - except: - logging.error("Forcing default engine failed. On some versions of Mysql,\ - you may need to add \"default-storage-engine=MYISAM\" manually\ - to the [mysqld] user in /etc/my.cnf. Trying again to connect...") - self.connect(setengine=False) - logging.debug("Connecting to %s" % self.dbname) - cursor.execute("USE %s" % self.dbname) - - def query(self, sql, params = None, many_params=None): - """ - If a connection times out, reboot - the connection and starts up nicely again. - - many_params: If included, assume that executemany() is expected, with the sequence of parameter - provided. 
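        A short usage sketch of the two calling modes (table and values are
        illustrative; the bulk form is what the unigram ingest below falls back on):

            db = DB(dbname="my_bookworm")

            # Single statement:
            db.query("SELECT count(*) FROM fastcat")

            # Bulk insert: many_params routes the call through cursor.executemany().
            rows = [(1, 17, 4), (1, 203, 1), (2, 17, 9)]    # (bookid, wordid, count)
            db.query("INSERT INTO master_bookcounts (bookid, wordid, count) VALUES (%s, %s, %s)",
                     many_params=rows)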
- """ - logging.debug(" -- Preparing to execute SQL code -- " + sql) - logging.debug(" -- with params {}".format(params)) - - try: - cursor = self.conn.cursor() - if many_params is not None: - cursor.executemany(sql, many_params) - else: - - cursor.execute(sql) - except: - try: - self.connect() - cursor = self.conn.cursor() - if many_params is not None: - cursor.executemany(sql, many_params) - else: - if params is None: - cursor.execute(sql) - else: - cursor.execute(sql, params) - except: - logging.error("Query failed: \n" + sql + "\n") - raise - - return cursor - -class BookwormSQLDatabase(object): - - """ - This class gives interactions methods to a MySQL database storing Bookworm - data. Although the primary methods are about loading data already created - into the SQL database, it has a few other operations - that write out text files needed by the API and the web front end: - I take it as logical to do those here, since that how - it fits chronologically in the bookworm-creation sequence. - """ - - def __init__(self, dbname=None, - variableFile=".bookworm/metadata/jsoncatalog_derived.txt"): - """ - You can initialize it with a database name; - otherwise it defaults to finding a - Bookworm configuration file. - """ - self.config_manager = Configfile("admin") - config = self.config_manager.config - - self.dbname = dbname - - self.conn = None - - if self.dbname is not None: - # Sometimes this may be called just to access the - # variables elements. - self.db = DB(dbname=self.dbname) - else: - self.db = None - - if variableFile is not None: - try: - self.setVariables(originFile=variableFile) - except FileNotFoundError: - pass - def grantPrivileges(self): - """ - Grants select-only privileges to a non-admin mysql user for the API to - query with without risking exposing write access to the Internet. - - The username for these privileges is usually just 'bookworm' without a password, - but if you place a file at '/etc/bookworm.cnf', it will be read from there. - """ - - globalfile = Configfile("read_only") - - username=globalfile.config.get("client","user") - password=globalfile.config.get("client","password") - clienthostname=globalfile.config.get("client","clienthostname") - if clienthostname == '': - clienthostname = "%" - try: - self.db.query("GRANT SELECT ON %s.* TO '%s'@'%s' IDENTIFIED BY '%s'" % (self.dbname,username,clienthostname, password)) - except MySQLdb._exceptions.OperationalError: - self.db.query("CREATE USER '%s'@'%s' IDENTIFIED BY '%s'" % (username,clienthostname,password)) - self.db.query("GRANT SELECT ON %s.* TO '%s'@'%s' IDENTIFIED BY '%s'" % (self.dbname,username,clienthostname,password)) - - def setVariables(self, originFile, anchorField="bookid", - jsonDefinition=".bookworm/metadata/field_descriptions_derived.json"): - self.variableSet = variableSet(originFile=originFile, anchorField=anchorField, jsonDefinition=jsonDefinition,db=self.db) - - def importNewFile(self,originFile,anchorField,jsonDefinition): - """ - Add additional metadata from a source collection of json-formatted rows. - originFile is the filename of the new metadata, in the same input format - as the original jsoncatalog.txt - anchorField is the field in the existing dataset it should be anchored onto; - jsonDefinition is a filename pointing to a file - of the format of field_descriptions.json describing the new data to ingest. - If it is of type None, then one will be guessed at. 
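        A hedged usage sketch (the file name is hypothetical; it should be ndjson
        in the same shape as the original jsoncatalog.txt):

            bw = BookwormSQLDatabase(dbname="my_bookworm", variableFile=None)
            # Anchor the new metadata onto the existing catalog through its bookid field;
            # jsonDefinition=None lets the field types be guessed.
            bw.importNewFile("supplementary_metadata.ndjson",
                             anchorField="bookid",
                             jsonDefinition=None)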
- """ - self.setVariables(originFile,anchorField=anchorField,jsonDefinition=jsonDefinition) - self.variableSet.writeMetadata() - self.variableSet.loadMetadata() - self.variableSet.updateMasterVariableTable() - for variable in self.variableSet.variables: - variable.clear_associated_memory_tables() - #self.reloadMemoryTables() - - def create_database(self): - dbname = self.dbname - dbuser = self.dbuser - dbpassword = self.dbpassword - - db = self.db - - #This must be run as a MySQL user with create_table privileges - try: - db.query("CREATE DATABASE " + dbname) - except: - logging.info("Database %s already exists: that might be intentional, so not dying" % dbname) - - "Setting up permissions for web user..." - db.query("GRANT SELECT ON " + dbname + ".*" + " TO '" + dbuser + "'@'localhost' IDENTIFIED BY '" + dbpassword + "'") - db.query("GRANT SELECT ON {}.* TO 'bookworm'@'localhost'".format(dbname)) - db.query("FLUSH PRIVILEGES") - #a field to store stuff we might need later. - db.query("CREATE TABLE IF NOT EXISTS bookworm_information (entry VARCHAR(255), PRIMARY KEY (entry), value VARCHAR(50000))") - - def load_word_list(self): - db = self.db - if db is None: - raise AttributeError("No database connection defined--are you running Bookworm without a configuration file or naming the bookworm like `bookworm -d my_bookworm build all`?") - logging.info("Making a SQL table to hold the words") - db.query("""DROP TABLE IF EXISTS words""") - db.query("""CREATE TABLE IF NOT EXISTS words ( - wordid MEDIUMINT UNSIGNED NOT NULL, - word VARCHAR(255), INDEX (word), - count BIGINT UNSIGNED, - casesens VARBINARY(255), - stem VARCHAR(255) - );""") - - db.query("ALTER TABLE words DISABLE KEYS") - logging.info("loading data using LOAD DATA LOCAL INFILE") - db.query("""LOAD DATA LOCAL INFILE '.bookworm/texts/wordlist/wordlist.txt' - INTO TABLE words - CHARACTER SET binary - (wordid,word,count) """) - logging.info("creating indexes on words table") - db.query("ALTER TABLE words ENABLE KEYS") - db.query("UPDATE words SET casesens=word") - - def load_book_list(self): - """ - Slated for deletion. - - Loads in the tables that have already been created by a previous - call to `Bookworm.variableSet.writeMetadata()` - """ - self.variableSet.loadMetadata() - - def create_unigram_book_counts(self, newtable=True, ingest=True, index=True, reverse_index=True, table_count=1): - import time - t0 = time.time() - - db = self.db - ngramname = "unigrams" - tablenameroot = "master_bookcounts" - # If you are splitting the input into multiple tables - # to be joined as a merge table, come up with multiple - # table names and we'll cycle through. - if table_count == 1: - tablenames = [tablenameroot] - elif table_count > 1: - tablenames = ["%s_p%d" % (tablenameroot, i) for i in range(1, table_count+1)] - else: - logging.error("You need a positive integer for table_count") - raise - - grampath = ".bookworm/texts/encoded/%s" % ngramname - tmpdir = "%s/tmp" % grampath - - if (len(grampath) == 0) or (grampath == "/"): - logging.error("Woah! 
Don't set the ngram path to your system root!") - raise - - if newtable: - if os.path.exists(tmpdir): - import shutil - shutil.rmtree(tmpdir) - - logging.info("Dropping older %s table, if it exists" % ngramname) - for tablename in tablenames: - db.query("DROP TABLE IF EXISTS " + tablename) - - logging.info("Making a SQL table to hold the %s" % ngramname) - reverse_index_sql = "INDEX(bookid,wordid,count), " if reverse_index else "" - for tablename in tablenames: - db.query("CREATE TABLE IF NOT EXISTS " + tablename + " (" - "bookid MEDIUMINT UNSIGNED NOT NULL, " + reverse_index_sql + - "wordid MEDIUMINT UNSIGNED NOT NULL, INDEX(wordid,bookid,count), " - "count MEDIUMINT UNSIGNED NOT NULL);") - - if ingest: - for tablename in tablenames: - db.query("ALTER TABLE " + tablename + " DISABLE KEYS") - db.query("set NAMES utf8;") - db.query("set CHARACTER SET utf8;") - logging.info("loading data using LOAD DATA LOCAL INFILE") - - files = os.listdir(grampath) - for i, filename in enumerate(files): - if filename.endswith('.txt'): - # With each input file, cycle through each table in tablenames - tablename = tablenames[i % len(tablenames)] - logging.debug("Importing txt file, %s (%d/%d)" % (filename, i, len(files))) - try: - db.query("LOAD DATA LOCAL INFILE '" + grampath + "/" + filename + "' INTO TABLE " + tablename +" CHARACTER SET utf8 (bookid,wordid,count);") - except KeyboardInterrupt: - raise - except: - logging.debug("Falling back on insert without LOCAL DATA INFILE. Slower.") - try: - import pandas as pd - df = pd.read_csv(grampath + "/" + filename, sep='\t', header=None) - to_insert = df.apply(tuple, axis=1).tolist() - db.query( - "INSERT INTO " + tablename + " (bookid,wordid,count) " - "VALUES (%s, %s, %s);""", - many_params=to_insert - ) - except KeyboardInterrupt: - raise - except: - logging.exception("Error inserting %s from %s" % (ngramname, filename)) - continue - - elif filename.endswith('.h5'): - logging.info("Importing h5 file, %s (%d/%d)" % (filename, i, len(files))) - try: - # When encountering an .h5 file, this looks for ngram information - # in a /#{ngramnames} table (e.g. /unigrams) and writes it out to - # temporary TSV files. - # Dask is used here simply because it's a dead simple way to multithread - # the TSV writing and lower the overhead versus having a TSV already staged. - import csv - import pandas as pd - try: - import dask.dataframe as dd - except: - logging.exception("Ingesting h5 files requires dask") - try: - os.makedirs(tmpdir) - except OSError: - if not os.path.isdir(tmpdir): - raise - # Dask will use #{n_cores-1} threads when saving CSVs. - # Ingest and key reload times are identical to txt import, so the only - # additional overhead is reading the file (small effect) and writing the csv. - ddf = dd.read_hdf(grampath + "/" + filename, - ngramname, mode='r', chunksize=2000000) - ddf.reset_index().to_csv(tmpdir + '/tmp.*.tsv', - index=False, sep='\t', header=False, - quoting=csv.QUOTE_NONNUMERIC) - logging.info("CSV written from H5. Time passed: %.2f s" % (time.time() - t0)) - for j, tmpfile in enumerate(os.listdir(tmpdir)): - # With each input file, cycle through each table in tablenames - tablename = tablenames[j % len(tablenames)] - path = "%s/%s" % (tmpdir, tmpfile) - db.query("LOAD DATA LOCAL INFILE '" + path + "' " - "INTO TABLE " + tablename + " " - "CHARACTER SET utf8 (bookid,wordid,count);") - try: - os.remove(path) - except: - pass - logging.info("CSVs input. 
Time passed: %.2f s" % (time.time() - t0)) - except KeyboardInterrupt: - raise - except: - logging.exception("Error inserting %s from %s" % (ngramname, filename)) - continue - else: - continue - if index: - logging.info("Creating Unigram Indexes. Time passed: %.2f s" % (time.time() - t0)) - for tablename in tablenames: - db.query("ALTER TABLE " + tablename + " ENABLE KEYS") - - if table_count > 1: - logging.info("Creating a merge table for " + ",".join(tablenames)) - db.query("CREATE TABLE IF NOT EXISTS " + tablenameroot + " (" - "bookid MEDIUMINT UNSIGNED NOT NULL, " + reverse_index_sql + - "wordid MEDIUMINT UNSIGNED NOT NULL, INDEX(wordid,bookid,count), " - "count MEDIUMINT UNSIGNED NOT NULL) " - "ENGINE=MERGE UNION=(" + ",".join(tablenames) + ") INSERT_METHOD=LAST;") - - logging.info("Unigram index created in: %.2f s" % ((time.time() - t0))) - - def create_bigram_book_counts(self): - db = self.db - logging.info("Making a SQL table to hold the bigram counts") - db.query("""DROP TABLE IF EXISTS master_bigrams""") - db.query("""CREATE TABLE master_bigrams ( - bookid MEDIUMINT UNSIGNED NOT NULL, - word1 MEDIUMINT UNSIGNED NOT NULL, INDEX (word1,word2,bookid,count), - word2 MEDIUMINT UNSIGNED NOT NULL, - count MEDIUMINT UNSIGNED NOT NULL);""") - db.query("ALTER TABLE master_bigrams DISABLE KEYS") - logging.info("loading data using LOAD DATA LOCAL INFILE") - for filename in os.listdir(".bookworm/texts/encoded/bigrams"): - db.query("LOAD DATA LOCAL INFILE '.bookworm/texts/encoded/bigrams/"+filename+"' INTO TABLE master_bigrams CHARACTER SET utf8 (bookid,word1,word2,count);") - - logging.info("Creating bigram indexes") - db.query("ALTER TABLE master_bigrams ENABLE KEYS") - - def loadVariableDescriptionsIntoDatabase(self): - """ - This adds a description of files to the master variable table: - also, crucially, it puts code specifying their fast creation there, - where it will be executed on startup for all eternity. - """ - logging.debug("Building masterVariableTable") - db = self.db - db.query("DROP TABLE IF EXISTS masterVariableTable") - m = db.query(""" - CREATE TABLE IF NOT EXISTS masterVariableTable - (dbname VARCHAR(255), PRIMARY KEY (dbname), - name VARCHAR(255), - type VARCHAR(255), - tablename VARCHAR(255), - anchor VARCHAR(255), - alias VARCHAR(255), - status VARCHAR(255), - description VARCHAR(5000) - ) ENGINE=MYISAM; - """) - tableTable = db.query(""" - CREATE TABLE IF NOT EXISTS masterTableTable - (tablename VARCHAR(255), PRIMARY KEY (tablename), - dependsOn VARCHAR(255), - memoryCode VARCHAR(20000)) ENGINE=MYISAM; - """) - self.addFilesToMasterVariableTable() - self.addWordsToMasterVariableTable() - self.variableSet.updateMasterVariableTable() - - def reloadMemoryTables(self, force=False, names = None): - - """ - Checks to see if memory tables need to be repopulated (by seeing if they are empty) - and then does so if necessary. - - If an array is passed to 'names', only the specified tables will be - loaded into memory; otherwise, all will. - """ - - q = "SELECT tablename,memoryCode FROM masterTableTable" - existingCreateCodes = self.db.query(q).fetchall() - - if names is not None: - existingCreateCodes = [e for e in existingCreateCodes if e[0] in names] - - for row in existingCreateCodes: - """ - For each table, it checks to see if the table is currently populated; if not, - it runs the stored code to repopulate the table. (It checks length because - memory tables are emptied on a restart). 
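            To make that concrete, the row stored in masterTableTable for fastcat
            amounts to something like the following (per-bookworm fast metadata
            columns omitted; see fastcat_creation_SQL below):

                row = ("fastcat", "fastcat",
                       "DROP TABLE IF EXISTS tmp;"
                       "CREATE TABLE tmp (bookid MEDIUMINT UNSIGNED NOT NULL, PRIMARY KEY (bookid), "
                       "nwords MEDIUMINT UNSIGNED NOT NULL) ENGINE=MEMORY;"
                       "INSERT INTO tmp SELECT * FROM fastcat_;"
                       "DROP TABLE IF EXISTS fastcat;RENAME TABLE tmp TO fastcat;")
                # (tablename, dependsOn, memoryCode) as consumed by the loop above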
- """ - tablename = row[0] - try: - cursor = self.db.query("SELECT count(*) FROM %s" %(tablename)) - currentLength = cursor.fetchall()[0][0] - logging.debug("Current Length is %d" %currentLength) - except: - currentLength = 0 - if currentLength==0 or force: - for query in splitMySQLcode(row[1]): - self.db.query("SET optimizer_search_depth=0") - self.db.query(query) - - - def fastcat_creation_SQL(self, engine="MEMORY"): - """ - Generate SQL to create the fastcat (memory) and fastcat_ (on-disk) tables. - """ - - tbname = "fastcat" - if engine=="MYISAM": - tbname = "fastcat_" - - fastFieldsCreateList = [ - "bookid MEDIUMINT UNSIGNED NOT NULL, PRIMARY KEY (bookid)", - "nwords MEDIUMINT UNSIGNED NOT NULL" - ] - - fastFieldsCreateList += [variable.fastSQL() for variable in self.variableSet.uniques("fast")] - - create_command = """DROP TABLE IF EXISTS tmp;""" - create_command += "CREATE TABLE tmp ({}) ENGINE={};""".format( - ", ".join(fastFieldsCreateList), engine) - - if engine == "MYISAM": - fastFields = ["bookid", "nwords"] + [variable.fastField for variable in self.variableSet.uniques("fast")] - load_command = "INSERT INTO tmp SELECT " - load_command += ",".join(fastFields) + " FROM catalog USE INDEX () " - # LEFT JOIN fixes a bug where fields were being dropped - load_command += " ".join(["LEFT JOIN %(field)s__id USING (%(field)s )" % variable.__dict__ for variable in self.variableSet.uniques("categorical")]) - load_command += " WHERE nwords IS NOT NULL;" - elif engine == "MEMORY": - load_command = "INSERT INTO tmp SELECT * FROM fastcat_;" - - cleanup_command = "DROP TABLE IF EXISTS {};".format(tbname) - cleanup_command += "RENAME TABLE tmp TO {};".format(tbname) - return create_command + load_command + cleanup_command; - - def create_fastcat_and_wordsheap_disk_tables(self): - for q in self.fastcat_creation_SQL("MYISAM").split(";"): - if q != "": - self.db.query(q) - for q in self.wordsheap_creation_SQL("MYISAM").split(";"): - if q != "": - self.db.query(q) - - def addFilesToMasterVariableTable(self): - #Also update the wordcounts for each text. 
- code = self.fastcat_creation_SQL("MEMORY") - self.db.query('DELETE FROM masterTableTable WHERE masterTableTable.tablename="fastcat";') - self.db.query("""INSERT IGNORE INTO masterTableTable VALUES - ('fastcat','fastcat','{}')""".format(code)) - - - def wordsheap_creation_SQL(self,engine="MEMORY",max_word_length=30,max_words = 1500000): - tbname = "wordsheap" - if engine=="MYISAM": - tbname = "wordsheap_" - wordCommand = "DROP TABLE IF EXISTS tmp;" - wordCommand += "CREATE TABLE tmp (wordid MEDIUMINT UNSIGNED NOT NULL, PRIMARY KEY (wordid), word VARCHAR(30), INDEX (word), casesens VARBINARY(30),UNIQUE INDEX(casesens), lowercase CHAR(30), INDEX (lowercase) ) ENGINE={};".format(engine) - if engine=="MYISAM": - wordCommand += "INSERT IGNORE INTO tmp SELECT wordid as wordid,word,casesens,LOWER(word) FROM words WHERE CHAR_LENGTH(word) <= {} AND wordid <= {} ORDER BY wordid;".format(max_word_length,max_words) - else: - wordCommand += "INSERT IGNORE INTO tmp SELECT * FROM wordsheap_;" - wordCommand += "DROP TABLE IF EXISTS {};".format(tbname) - wordCommand += "RENAME TABLE tmp TO {};".format(tbname) - return wordCommand - - def addWordsToMasterVariableTable(self, max_word_length = 30, max_words = 1500000): - """ - - """ - wordCommand = self.wordsheap_creation_SQL("MEMORY",max_word_length,max_words) - query = "INSERT IGNORE INTO masterTableTable " - query += "VALUES ('wordsheap','wordsheap','{}'); ".format(wordCommand) - logging.info("Creating wordsheap") - self.db.query(query) - - def update_Porter_stemming(self): #We use stems occasionally. - """ - Still not executed. - """ - logging.info("Updating stems from Porter algorithm...") - from nltk import PorterStemmer - db = self.db - - stemmer = PorterStemmer() - cursor = db.query("""SELECT word FROM words""") - words = cursor.fetchall() - for local in words: - word = ''.join(local) # Could probably take the first element of the tuple as well? - # Apostrophes have the save stem as the word, if they're included - word = word.replace("'s","") - if re.match("^[A-Za-z]+$",word): - query = """UPDATE words SET stem='""" + stemmer.stem(''.join(local)) + """' WHERE word='""" + ''.join(local) + """';""" - z = cursor.execute(query) diff --git a/bookwormDB/convertTSVtoJSONarray.py b/bookwormDB/convertTSVtoJSONarray.py deleted file mode 100644 index 3573f44..0000000 --- a/bookwormDB/convertTSVtoJSONarray.py +++ /dev/null @@ -1,28 +0,0 @@ -import json - -def convertToJSON(filename, location): - """ - given a filename of a tsv, converts that into an ndjson - file for Bookworm. - """ - input = open(filename) - output = open(location, "w") - headers = input.readline() - headers = headers.rstrip("\n") - headers = headers.rstrip("\r") - headers = headers.rstrip("\n") - headers = headers.rstrip("\r") - headers = headers.split("\t") - for line in input: - line = line.rstrip("\n") - line = line.rstrip("\r") - line = line.rstrip("\n") - line = line.rstrip("\r") - values = line.split("\t") - myobject = dict(list(zip(headers,values))) - output.write(json.dumps(myobject) + "\n") - output.close() - - - - diff --git a/bookwormDB/duckdb.py b/bookwormDB/duckdb.py index 8462eee..d97e423 100644 --- a/bookwormDB/duckdb.py +++ b/bookwormDB/duckdb.py @@ -48,25 +48,18 @@ def all_keys(input): # in various ways. 
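The convertTSVtoJSONarray.py module deleted above had one job: turn a TSV with a header row into the newline-delimited JSON that the ingest step expects. If that behaviour is ever needed again, a shorter equivalent built on the standard csv module looks roughly like this (the path arguments are illustrative):

import csv
import json

def tsv_to_ndjson(tsv_path, ndjson_path):
    # one JSON object per data row, keyed by the header line
    with open(tsv_path, newline="") as fin, open(ndjson_path, "w") as fout:
        for row in csv.DictReader(fin, delimiter="\t"):
            fout.write(json.dumps(row) + "\n")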
def check_query(query): - - fail_if_nonword_characters_in_columns(query) - for key in ['database']: if not key in query: raise BookwormException({"code": 400, "message": "You must specify a value for {}".format(key)}) - - if query['method'] in ["schema", "search"]: # Queries below this only apply to "data" return - for v in query['counttype']: if not v in ['WordCount', 'TextCount']: raise BookwormException({"code": 400, "message": 'Only "WordCount" and "TextCount"' ' counts are supported by the SQL api, but passed {}'.format(v)}) - class DuckQuery(object): """ The base class for a bookworm search. @@ -75,9 +68,7 @@ def __init__(self, query_object = {}, db = None, databaseScheme = None): # Certain constructions require a DB connection already available, so we just start it here, or use the one passed to it. check_query(query_object) - self.prefs = {'database': query_object['database']} - self.query_object = query_object self.db = db @@ -104,12 +95,7 @@ def defaults(self, query_object): # search_limits is an array of dictionaries; # each one contains a set of limits that are mutually independent # The other limitations are universal for all the search limits being set. - - - self.wordsTables = None - - # Set up a dictionary for the denominator of any fraction if it doesn't already exist: self.search_limits = query_object.setdefault('search_limits', [{"word":["polka dot"]}]) self.words_collation = query_object.setdefault('words_collation', "Case_Insensitive") @@ -122,9 +108,6 @@ def defaults(self, query_object): 'stem':'stem'} self.word_field = lookups[self.words_collation] - self.time_limits = query_object.setdefault('time_limits', [0, 10000000]) - self.time_measure = query_object.setdefault('time_measure', 'year') - self.groups = set() self.outerGroups = [] self.finalMergeTables = set() @@ -326,7 +309,7 @@ def wordid_query(self): return self.wordswhere if self.wordswhere != " TRUE ": - f = "SELECT wordid FROM {words} as words1 WHERE {wordswhere}".format(**self.__dict__) + f = "SELECT wordid FROM { words } as words1 WHERE { wordswhere }".format(**self.__dict__) logging.debug("`" + self.wordswhere + "`") return " wordid IN ({})".format(f) else: @@ -636,16 +619,11 @@ def set_operations(self): def bookid_query(self): q = f""" {self.catwhere} """ - logging.debug("'{}'".format(self.catwhere)) - if self.catwhere == "TRUE": self.bookid_where = " TRUE " - else: self.bookid_where = q - - return self.bookid_where def query(self): @@ -935,7 +913,7 @@ def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_ # Certain function operators can use MySQL terms. 
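The operator table completed just below maps Mongo-style keys such as $gt and $lte onto SQL comparison operators. As a toy illustration of how such a map turns one search_limits entry into a WHERE fragment (the real where_from_hash also handles $grep, lists, and string escaping):

OPERATORS = {"$gt": ">", "$gte": ">=", "$lt": "<", "$lte": "<=", "$ne": "!=", "$eq": "="}

def simple_where(field, limits):
    # {"year": {"$gte": 1800, "$lt": 1900}} -> "(year >= 1800) AND (year < 1900)"
    clauses = [f"({field} {OPERATORS[op]} {value})" for op, value in limits.items()]
    return " AND ".join(clauses)

print(simple_where("year", {"$gte": 1800, "$lt": 1900}))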
# These are the only cases that a dict can be passed as a limitations operations = {"$gt":">", "$ne":"!=", "$lt":"<", - "$grep":" REGEXP ", "$gte":">=", + "$grep":" SIMILAR TO ", "$gte":">=", "$lte":"<=", "$eq":"="} for operation in list(values.keys()): diff --git a/bookwormDB/wsgi.py b/bookwormDB/wsgi.py index ee9d9de..eba557f 100644 --- a/bookwormDB/wsgi.py +++ b/bookwormDB/wsgi.py @@ -56,8 +56,7 @@ def __missing__(self, key): if args.remote_host is None: logging.info("Using SQL API") API = DuckDBCall - API_kwargs = { - } + API_kwargs = {} else: logging.info("Using proxy API") diff --git a/setup.py b/setup.py index 8374e0d..588f06d 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,8 @@ "Topic :: Text Processing :: Indexing", "Topic :: Text Processing :: Linguistic" ], - install_requires=["numpy","pandas","mysqlclient", + install_requires=["pandas","mysqlclient", + "duckdb", "python-dateutil", "psutil", "bounter", "gunicorn", "regex", "pyarrow" ] From 52b8c6608e17c8b2919d496539556510a3a1cb33 Mon Sep 17 00:00:00 2001 From: Benjamin Schmidt <> Date: Mon, 17 May 2021 15:59:53 -0400 Subject: [PATCH 28/41] Integrate nonconsumptive, ~50% passage of the old query API tests. --- bookwormDB/DuckSchema.py | 90 +++++ bookwormDB/builder.py | 91 +++++ bookwormDB/duckdb.py | 302 ++------------ bookwormDB/general_API.py | 139 +++---- bookwormDB/manager.py | 5 +- bookwormDB/sqliteKV.py | 65 --- bookwormDB/tokenizer.py | 309 --------------- bookwormDB/variableSet.py | 809 -------------------------------------- tests/test_API.py | 200 ++++------ tests/test_config.py | 18 - tests/test_creation.py | 20 + tests/test_mysql.py | 63 --- 12 files changed, 377 insertions(+), 1734 deletions(-) create mode 100644 bookwormDB/DuckSchema.py create mode 100644 bookwormDB/builder.py delete mode 100644 bookwormDB/sqliteKV.py delete mode 100644 bookwormDB/tokenizer.py delete mode 100644 bookwormDB/variableSet.py delete mode 100644 tests/test_config.py create mode 100644 tests/test_creation.py delete mode 100644 tests/test_mysql.py diff --git a/bookwormDB/DuckSchema.py b/bookwormDB/DuckSchema.py new file mode 100644 index 0000000..71a38ee --- /dev/null +++ b/bookwormDB/DuckSchema.py @@ -0,0 +1,90 @@ +import pyarrow as pa +from base64 import b64decode +class DuckSchema(object): + """ + This class stores information about the database setup that is used to + optimize query creation query + and so that queries know what tables to include. + It's broken off like this because it might be usefully wrapped around some of + the backend features, + because it shouldn't be run multiple times in a single query + (that spawns two instances of itself), + as was happening before. + """ + + def __init__(self, db): + # XXXX + self.db = db + + # hash of what table each variable is in + self.tableToLookIn = { + 'bookid': 'fastcat', + 'filename': "catalog", + 'wordid': "wordsheap"} + + # hash of what the root variable for each search term is (eg, + # 'author_birth' might be crosswalked to 'authorid' in the + # main catalog.) + self.anchorFields = { + 'bookid': 'bookid', + 'filename': "bookid", + 'wordid': "wordid", + 'word': "wordid" + } + + # aliases: a hash showing internal identifications codes that + # dramatically speed up query time, but which shouldn't be + # exposed. So you can run a search for "state," say, and the + # database will group on a 50-element integer code instead of + # a VARCHAR that has to be long enough to support + # "Massachusetts" and "North Carolina." 
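The aliasing described in this comment is easy to see in miniature: grouping on a small integer surrogate plus a join to a lookup table stands in for grouping on the VARCHAR itself. The table and column names below (fastcat, stateLookup, state__id) just follow the codebase's naming conventions and are invented for the demo:

import duckdb

con = duckdb.connect()  # in-memory database
con.execute("CREATE TABLE fastcat (bookid INTEGER, state__id INTEGER)")
con.execute("CREATE TABLE stateLookup (state__id INTEGER, state VARCHAR)")
con.execute("INSERT INTO fastcat VALUES (1, 1), (2, 1), (3, 2)")
con.execute("INSERT INTO stateLookup VALUES (1, 'Massachusetts'), (2, 'North Carolina')")
print(con.execute("""
    SELECT state, count(*) AS books
    FROM fastcat JOIN stateLookup USING (state__id)
    GROUP BY state
""").fetchall())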
A couple are + # hard-coded in, but most are derived by looking for fields + # that end in the suffix "__id" later. + + # The aliases starts with a dummy alias for fully grouped queries. + self.aliases = {} + + tables = db.execute("SELECT name, schema FROM arrow_schemas WHERE type='table'").fetchall() + schema = dict(tables) + + current_anchor = None + for tablename, tab in schema.items(): + sch = pa.ipc.read_schema(pa.py_buffer(b64decode(tab))) + if tablename in ["catalog"]: + continue + for i, field in enumerate(sch): + if i == 0: + current_anchor = field.name + else: + self.tableToLookIn[field.name] = tablename + self.anchorFields[field.name] = current_anchor + if current_anchor.endswith("__id"): + self.aliases[field.name] = current_anchor + + def tables_for_variables(self, variables, tables = []): + lookups = [] + for variable in variables: + stack_here = [] + lookup_table = self.tableToLookIn[variable] + if lookup_table in tables: + continue + stack_here.append(lookup_table) + while True: + anchor = self.anchorFields[variable] + parent_tab = self.tableToLookIn[anchor] + if anchor in stack_here or anchor in lookups: + break + else: + # Must go first in duck or other postgres parsers. + stack_here.append(parent_tab) + lookup_table = anchor + # Look for the parent of the parent. + if variable == 'bookid' or anchor == 'wordid': + break + variable = anchor + + stack_here.reverse() + for variable in stack_here: + if not variable in lookups: + lookups.append(variable) + return list(lookups) diff --git a/bookwormDB/builder.py b/bookwormDB/builder.py new file mode 100644 index 0000000..9fff243 --- /dev/null +++ b/bookwormDB/builder.py @@ -0,0 +1,91 @@ +from ducksauce import quacksort +import duckdb +import numpy as np +from base64 import b64encode, b64decode +import pyarrow as pa +from nonconsumptive import Corpus +from nonconsumptive.metadata import Catalog +from pathlib import Path +class BookwormCorpus(Corpus): + """ + Create a Bookworm corpus. Uses write db locations, so should + not be used to managed existing ones or in a multi-threaded context. + """ + + def __init__(self, db_location, *args, **kwargs): + self.db_location = Path(db_location) + self._connection = None + super().__init__(*args, **kwargs) + + def encoded_batches(self): + for batch in self.encoded_wordcounts: + yield batch + + def bookworm_name(self): + return self.db_location.with_suffix("").name + + def prepare_parquet_ingest_file(self): + quacksort(self.encoded_batches(), ['wordid', 'bookid'], self.root / 'unigram_bookid.parquet', block_size = 1_000_000_000) + + def prepare_metadata(self): + Catalog(self.metadata.tb).to_flat_catalog(self.metadata) + + def flat_tabs(self): + """ + Level-3 normalized database tables with integer keys for faster grouping and selection. 
+ """ + return (self.root / "metadata" / "flat_catalog").glob("*.parquet") + + @property + def con(self): + if self._connection is not None: + return self._connection + self._connection = duckdb.connect(str(self.db_location)) + return self._connection + + def ingest_unigrams(self): + con = self.con + wordids = self.root / 'unigram_bookid.parquet' + con.execute(f"CREATE TABLE IF NOT EXISTS unigram_bookid AS SELECT * FROM parquet_scan('{wordids}')") + con.execute(f"CREATE TABLE words AS SELECT * FROM parquet_scan('{self.root / 'wordids.parquet'}')") + con.execute(f"CREATE TABLE wordsheap AS SELECT wordid, token as word, lower(token) as lowercase FROM words") + + def ingest_metadata(self): + for tabpath in self.flat_tabs(): + name = tabpath.with_suffix("").name + self.con.execute(f"CREATE TABLE {name} AS SELECT * FROM parquet_scan('{tabpath}')") + + def create_table_schemas(self): + con = self.con + con.execute('DROP TABLE IF EXISTS arrow_schemas') + con.execute('CREATE TABLE arrow_schemas (name VARCHAR, schema VARCHAR, type VARCHAR)') + insertion = 'INSERT INTO arrow_schemas VALUES (?, ?, ?)' + rich = Catalog(self.metadata.tb).feather_ld() + con.execute(insertion, ("catalog_ld", b64encode(rich.schema.serialize().to_pybytes()), "resource")) + ## Insert schemas into the database for later retrieval to understand the db structure + # Stash as base64. + + # DuckDB can't yet handle blob inserts from python. + # https://github.com/duckdb/duckdb/issues/1703 + for tab in [*self.flat_tabs()] + [self.root / Path("unigram_bookid.parquet"), self.root / 'wordids.parquet']: + tabname = tab.with_suffix("").name + if tabname in ["sorted", "wordids"]: + continue + con.execute(insertion, + (tabname, b64encode(pa.parquet.ParquetFile(tab).schema_arrow.serialize().to_pybytes()), + "table")) + + def update_wordcounts(self): + bookid_wordcounts = self.bookid_wordcounts + rel = self.con.register_arrow("my_nwords", bookid_wordcounts) + self.con.execute("ALTER TABLE fastcat ADD nwords INT32") + rel.execute("UPDATE fastcat SET nwords = s.nwords FROM my_nwords as s WHERE s.bookid = fastcat.bookid ") + rel.unregister_arrow("my_nwords") + + def build(self): + self.prepare_parquet_ingest_file() + self.prepare_metadata() + self.ingest_unigrams() + self.ingest_metadata() + self.create_table_schemas() + self.update_wordcounts() diff --git a/bookwormDB/duckdb.py b/bookwormDB/duckdb.py index d97e423..777dc11 100644 --- a/bookwormDB/duckdb.py +++ b/bookwormDB/duckdb.py @@ -1,9 +1,8 @@ #!/usr/local/bin/python -from .variableSet import to_unicode from .search_limits import Search_limits from .bwExceptions import BookwormException - +from .DuckSchema import DuckSchema import json import re import copy @@ -48,10 +47,6 @@ def all_keys(input): # in various ways. def check_query(query): - fail_if_nonword_characters_in_columns(query) - for key in ['database']: - if not key in query: - raise BookwormException({"code": 400, "message": "You must specify a value for {}".format(key)}) if query['method'] in ["schema", "search"]: # Queries below this only apply to "data" return @@ -68,7 +63,7 @@ def __init__(self, query_object = {}, db = None, databaseScheme = None): # Certain constructions require a DB connection already available, so we just start it here, or use the one passed to it. 
check_query(query_object) - self.prefs = {'database': query_object['database']} + self.prefs = {} self.query_object = query_object self.db = db @@ -77,17 +72,10 @@ def __init__(self, query_object = {}, db = None, databaseScheme = None): self.databaseScheme = databaseScheme if databaseScheme is None: - self.databaseScheme = databaseSchema(self.db) - - # Some tablenames. + self.databaseScheme = DuckSchema(self.db) - self.wordsheap = self.databaseScheme.fallback_table('wordsheap') - self.fastcat = self.databaseScheme.fallback_table("fastcat") - logging.info("Catalog set to {}".format(self.fastcat)) self.words = "words" - self.defaults(query_object) # Take some defaults - self.derive_variables() # Derive some useful variables that the query will use. def defaults(self, query_object): @@ -101,11 +89,13 @@ def defaults(self, query_object): self.words_collation = query_object.setdefault('words_collation', "Case_Insensitive") lookups = { - "Case_Insensitive":'word', + "Case_Insensitive":'lowercase', 'lowercase':'lowercase', - 'casesens':'casesens', "case_insensitive":"word", - "Case_Sensitive":"casesens", "All_Words_with_Same_Stem":"stem", + 'casesens':'word', + "case_insensitive":"lowercase", + "Case_Sensitive":"word", 'stem':'stem'} + self.word_field = lookups[self.words_collation] self.groups = set() @@ -117,11 +107,6 @@ def defaults(self, query_object): except: groups = None - if groups == [] or groups == ["unigram"]: - # Set an arbitrary column name that will always be true if nothing else is set. - pass - # groups.insert(0, "1 as In_Library") - if groups is None: # A user query can't demand ungrouped results, # but internally it's represented as None. @@ -160,7 +145,7 @@ def defaults(self, query_object): table = self.databaseScheme.tableToLookIn[group] joinfield = self.databaseScheme.aliases[group] - self.finalMergeTables.add(" JOIN " + table + " USING (" + joinfield + ") ") + self.finalMergeTables.add(f' JOIN "{table}" USING ("{joinfield}")') else: self.groups.add(group) except KeyError: @@ -185,41 +170,6 @@ def defaults(self, query_object): if isinstance(self.counttype, (str, bytes)): self.counttype = [self.counttype] - def determineOutsideDictionary(self): - """ - deprecated--tagged for deletion. - """ - self.compare_dictionary = copy.deepcopy(self.query_object) - if 'compare_limits' in list(self.query_object.keys()): - self.compare_dictionary['search_limits'] = self.query_object['compare_limits'] - del self.query_object['compare_limits'] - elif sum([bool(re.search(r'\*', string)) for string in list(self.query_object['search_limits'].keys())]) > 0: - # If any keys have stars at the end, drop them from the compare set - # This is often a _very_ helpful definition for succinct comparison queries of many types. - # The cost is that an asterisk doesn't allow you - - for key in list(self.query_object['search_limits'].keys()): - if re.search(r'\*', key): - # rename the main one to not have a star - self.query_object['search_limits'][re.sub(r'\*', '', key)] = self.query_object['search_limits'][key] - # drop it from the compare_limits and delete the version in the search_limits with a star - del self.query_object['search_limits'][key] - del self.compare_dictionary['search_limits'][key] - else: # if nothing specified, we compare the word to the corpus. 
- deleted = False - for key in list(self.query_object['search_limits'].keys()): - if re.search('words?\d', key) or re.search('gram$', key) or re.match(r'word', key): - del self.compare_dictionary['search_limits'][key] - deleted = True - if not deleted: - # If there are no words keys, just delete the first key of any type. - # Sort order can't be assumed, but this is a useful failure mechanism of last resort. Maybe. - try: - del self.compare_dictionary['search_limits'][list(self.query_object['search_limits'].keys())[0]] - except: - pass - - def derive_variables(self): # These are locally useful, and depend on the search limits put in. self.limits = self.search_limits @@ -235,48 +185,10 @@ def derive_variables(self): self.word_limits = False self.set_operations() - self.create_catalog_table() - self.make_catwhere() - self.make_wordwheres() - def tablesNeededForQuery(self, fieldNames=[]): - """ - Deprecated. - """ - db = self.db - neededTables = set() - tablenames = dict() - tableDepends = dict() - - q = "SELECT dbname,alias,tablename,dependsOn FROM masterVariableTable JOIN masterTableTable USING (tablename);" - logging.debug(q) - db.execute(q) - for row in db.fetchall(): - tablenames[row[0]] = row[2] - tableDepends[row[2]] = row[3] - - for fieldname in fieldNames: - parent = "" - try: - current = tablenames[fieldname] - neededTables.add(current) - n = 1 - while parent not in ['fastcat', 'wordsheap']: - parent = tableDepends[current] - neededTables.add(parent) - current = parent - n+=1 - if n > 100: - raise TypeError("Unable to handle this; seems like a recursion loop in the table definitions.") - # This will add 'fastcat' or 'wordsheap' exactly once per entry - except KeyError: - pass - - return neededTables - def needed_columns(self): """ Given a query, what are the columns that the compiled search will need materialized? @@ -289,7 +201,7 @@ def needed_columns(self): def pull_keys(entry): val = [] - if isinstance(entry,list) and not isinstance(entry,(str, bytes)): + if isinstance(entry, list) and not isinstance(entry, (str, bytes)): for element in entry: val += pull_keys(element) elif isinstance(entry,dict): @@ -316,7 +228,13 @@ def wordid_query(self): return " TRUE " def make_group_query(self): - aliases = [self.databaseScheme.aliases[g] for g in self.query_object["groups"]] + aliases = [] + for g in self.query_object["groups"]: + try: + aliases.append(self.databaseScheme.aliases[g]) + except KeyError: + aliases.append(g) + if len(self.query_object["groups"]) > 0: return "GROUP BY {}".format(", ".join(self.query_object["groups"])) else: @@ -325,9 +243,9 @@ def make_group_query(self): def main_table(self): if self.gram_size() == 1: - return 'unigrams_wordid as main' + return 'unigram_bookid as main' if self.gram_size() == 2: - return 'bigrams_word1_word2 as main' + return 'word1_word2_bookid as main' def full_query_tables(self): # Joins are needed to provide groups, but *not* to provide @@ -336,7 +254,7 @@ def full_query_tables(self): # But if there's a group, there may also need to be an associated where. 
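needed_columns walks the query object with pull_keys to decide which columns the compiled SQL must materialize. A stripped-down version of that recursion, for intuition only; the real pull_keys differs in its handling of operators and word fields:

def harvest_keys(entry):
    # collect dict keys from arbitrarily nested lists and dicts
    keys = []
    if isinstance(entry, dict):
        for k, v in entry.items():
            if k.startswith("$"):  # an operator such as $gte, not a column
                keys += harvest_keys(v)
            else:
                keys.append(k)
    elif isinstance(entry, list) and not isinstance(entry, (str, bytes)):
        for element in entry:
            keys += harvest_keys(element)
    return keys

print(harvest_keys([{"date_year": {"$gte": 1800, "$lt": 1900}}, {"language": "en"}]))
# ['date_year', 'language']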
if self.word_limits == False: - tables = [self.fastcat] + tables = ["fastcat"] else: tables = [self.main_table()] @@ -356,32 +274,22 @@ def make_join_query(self): def base_query(self): - dicto = {} - dicto['finalGroups'] = ', '.join(self.query_object['groups']) - if dicto['finalGroups'] != '': - dicto['finalGroups'] = ", " + dicto['finalGroups'] - - dicto['group_query'] = self.make_group_query() - dicto['op'] = ', '.join(self.set_operations()) - dicto['bookid_where'] = self.bookid_query() - dicto['wordid_where'] = self.wordid_query() - dicto['tables'] = self.make_join_query() - logging.info("'{}'".format(dicto['tables'])) - - dicto['catwhere'] = self.make_catwhere("main") - - basic_query = """ - SELECT {op} {finalGroups} - FROM {tables} + try: + finalGroups = self.query_object['groups'] + except: + print(self.query_object) + raise + return f""" + SELECT {', '.join(self.set_operations() + finalGroups)} + FROM {self.make_join_query()} WHERE - {bookid_where} + {self.bookid_query()} AND - {wordid_where} - AND {catwhere} - {group_query} - """.format(**dicto) - - return basic_query + {self.wordid_query()} + AND + {self.make_catwhere("main")} + {self.make_group_query()} + """ def create_catalog_table(self): # self.catalog = self.prefs['fastcat'] # 'catalog' # Can be replaced with a more complicated query in the event of longer joins. @@ -407,21 +315,17 @@ def create_catalog_table(self): self.relevantTables = self.databaseScheme.tables_for_variables(cols) - # moreTables = self.tablesNeededForQuery(columns) - self.catalog = " NATURAL JOIN ".join(self.relevantTables) return self.catalog def make_catwhere(self, query = "sub"): - # Where terms that don't include the words table join. Kept separate so that we can have subqueries only working on one half of the stack. + # Where terms that don't include the words table join. catlimits = dict() for key in list(self.limits.keys()): - # !!Warning--none of these phrases can be used in a bookworm as a custom table names. - if key not in ('word', 'word1', 'word2', 'hasword') and not re.search("words\d", key): catlimits[key] = self.limits[key] - + if query == "main": ts = set(self.full_query_tables()) for key in list(catlimits.keys()): @@ -485,7 +389,7 @@ def make_wordwheres(self): searchingFor = searchingFor.lower() - selectString = f"SELECT wordid FROM wordsheap_ WHERE word = '{searchingFor}'" + selectString = f"SELECT wordid FROM wordsheap WHERE word = '{searchingFor}'" logging.warning(selectString) self.db.execute(selectString) @@ -559,7 +463,7 @@ def build_wordstables(self): elif needsUnigrams: self.main = ''' - unigrams_wordid as main + unigram_bookid as main ''' self.wordstables = """ @@ -674,7 +578,6 @@ def bibliography_query(self, limit = "100"): # self.ordertype = "RAND()" dicto = { - 'fastcat': self.fastcat, 'tables': self.make_join_query(), 'ordertype': self.ordertype, 'catwhere': self.make_catwhere("main"), @@ -760,131 +663,6 @@ def execute(self): value = getattr(self, self.method)() return value -class databaseSchema(object): - """ - This class stores information about the database setup that is used to optimize query creation query - and so that queries know what tables to include. - It's broken off like this because it might be usefully wrapped around some of - the backend features, - because it shouldn't be run multiple times in a single query (that spawns two instances of itself), - as was happening before. 
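Assembled, the SQL that base_query aims for has roughly the shape below. The toy schema only mirrors the table names used elsewhere in this patch (unigram_bookid, fastcat, wordsheap); it is an illustration of the query shape, not the literal output of the builder:

import duckdb

con = duckdb.connect()
con.execute('CREATE TABLE fastcat (bookid INTEGER, date_year INTEGER, nwords INTEGER)')
con.execute('CREATE TABLE wordsheap (wordid INTEGER, word VARCHAR, lowercase VARCHAR)')
con.execute('CREATE TABLE unigram_bookid (bookid INTEGER, wordid INTEGER, "count" INTEGER)')
con.execute("INSERT INTO fastcat VALUES (1, 1851, 1000), (2, 1852, 500)")
con.execute("INSERT INTO wordsheap VALUES (1, 'Whale', 'whale'), (2, 'Ahab', 'ahab')")
con.execute("INSERT INTO unigram_bookid VALUES (1, 1, 7), (1, 2, 3), (2, 1, 2)")
print(con.execute("""
    SELECT date_year, sum("count") AS WordCount
    FROM unigram_bookid AS main NATURAL JOIN fastcat
    WHERE wordid IN (SELECT wordid FROM wordsheap WHERE lowercase = 'whale')
    GROUP BY date_year
""").fetchall())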
- - It's closely related to some of the classes around variables and - variableSets in the Bookworm Creation scripts, - but is kept separate for now: that allows a bit more flexibility, - but is probaby a Bad Thing in the long run. - """ - - def __init__(self, db): - # XXXX - self.db = db - # has of what table each variable is in - self.tableToLookIn = {} - - # hash of what the root variable for each search term is (eg, - # 'author_birth' might be crosswalked to 'authorid' in the - # main catalog.) - self.anchorFields = {} - - # aliases: a hash showing internal identifications codes that - # dramatically speed up query time, but which shouldn't be - # exposed. So you can run a search for "state," say, and the - # database will group on a 50-element integer code instead of - # a VARCHAR that has to be long enough to support - # "Massachusetts" and "North Carolina." A couple are - # hard-coded in, but most are derived by looking for fields - # that end in the suffix "__id" later. - - # The aliases starts with a dummy alias for fully grouped queries. - self.aliases = {} - self.newStyle(db) - - - def newStyle(self, db): - - self.tableToLookIn['bookid'] = self.fallback_table('fastcat') - self.tableToLookIn['filename'] = self.fallback_table('fastcat') - ff = self.fallback_table('fastcat') - self.anchorFields[ff] = ff - - self.tableToLookIn['wordid'] = self.fallback_table('wordsheap') - self.tableToLookIn['word'] = self.fallback_table('wordsheap') - - ww = self.fallback_table('wordsheap') - self.anchorFields[ww] = ww - - - tablenames = dict() - tableDepends = dict() - q = "SELECT dbname,alias,tablename,dependsOn FROM masterVariableTable JOIN masterTableTable USING (tablename);" - logging.debug(q) - db.execute(q) - - for row in db.fetchall(): - (dbname, alias, tablename, dependsOn) = row - tablename = self.fallback_table(tablename) - dependsOn = self.fallback_table(dependsOn) - - self.tableToLookIn[dbname] = tablename - self.anchorFields[tablename] = dependsOn - - self.aliases[dbname] = alias - - def fallback_table(self,tabname): - """ - Fall back to the saved versions if the memory tables are unpopulated. - - Use a cache first to avoid unnecessary queries, though the overhead shouldn't be much. - """ - tab = tabname - if tab.endswith("_"): - return tab - if tab in ["words","unigrams_wordid","bigrams_word1_word2","catalog"]: - return tab - - if not hasattr(self,"fallbacks_cache"): - self.fallbacks_cache = {} - - if tabname in self.fallbacks_cache: - return self.fallbacks_cache[tabname] - q = "SELECT COUNT(*) FROM {}".format(tab) - logging.debug(q) - try: - self.db.execute(q) - length = self.db.fetchall()[0][0] - if length==0: - tab += "_" - except RuntimeError: - tab += "_" - - self.fallbacks_cache[tabname] = tab - - return tab - - def tables_for_variables(self, variables, tables = []): - lookups = [] - for variable in variables: - stack_here = [] - lookup_table = self.tableToLookIn[variable] - if lookup_table in tables: - continue - stack_here.append(lookup_table) - while True: - anchor = self.fallback_table(self.anchorFields[lookup_table]) - if anchor in stack_here or anchor in lookups: - break - else: - # Must go first in duck. 
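For context on the deletions here: the old fallback_table mechanism existed because MySQL MEMORY tables come back empty after a restart, so queries silently fell back to an on-disk twin whose name carries a trailing underscore. Stripped of its caching and error handling, the check looked roughly like this:

def fallback_table(db, tablename):
    if tablename.endswith("_"):  # already the on-disk variant
        return tablename
    count = db.query(f"SELECT COUNT(*) FROM {tablename}").fetchall()[0][0]
    return tablename if count > 0 else tablename + "_"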
- stack_here.append(anchor) - lookup_table = anchor - stack_here.reverse() - for variable in stack_here: - if not variable in lookups: - lookups.append(variable) - return list(lookups) - - - def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_joiner = " OR "): whereterm = [] # The general idea here is that we try to break everything in search_limits down to a list, and then create a whereterm on that joined by whatever the 'joiner' is ("AND" or "OR"), with the comparison as whatever comp is ("=",">=",etc.). @@ -944,10 +722,10 @@ def where_from_hash(myhash, joiner=None, comp = " = ", escapeStrings=True, list_ def escape(value): # NOTE: stringifying the escape from MySQL; hopefully doesn't break too much. - return str(escape_string(to_unicode(value)), 'utf-8') + return str(escape_string(value), 'utf-8') else: def escape(value): - return to_unicode(value) + return value quotesep = "" joined = list_joiner.join([" ({}{}{}{}{}) ".format(key, comp, quotesep, escape(value), quotesep) for value in values]) diff --git a/bookwormDB/general_API.py b/bookwormDB/general_API.py index 1f7bc45..482db41 100644 --- a/bookwormDB/general_API.py +++ b/bookwormDB/general_API.py @@ -440,116 +440,71 @@ def execute(self): method = self.query['method'] logging.debug("Preparing to execute with method '{}'".format(method)) fmt = self.query['format'] if 'format' in self.query else False - - if method == 'data' or method == 'schema' or method == 'search': - version = 2 - if fmt in ['json_c', 'search', 'html', 'csv', 'tsv']: - version = 3 - else: - version = 1 - - if version == 1: + version = 3 + try: # What to do with multiple search_limits + if isinstance(self.query['search_limits'], list): - if method in ["json", "return_json"]: - self.query['method'] = 'data' - self.query['format'] = 'json' - return self.multi_execute(version=version) + if fmt == "json" or version >= 3: + frame = self.multi_execute(version = version) else: # Only return first search limit if not return in json self.query['search_limits'] = self.query['search_limits'][0] - - form = method[7:] if method[:6] == 'return' else method - - logging.warning("method == \"%s\" is deprecated. Use method=\"data\" " - "with format=\"%s\" instead." 
% (method, form)) - - if method == "return_json" or method == "json": - self.query['method'] = 'data' - self.query['format'] = 'json' - return self.return_json(version=1) - - elif method == "return_csv" or method == "csv": - self.query['method'] = 'data' - self.query['format'] = 'json' + else: frame = self.data() - return frame.to_csv(path = None, sep="\t", encoding="utf8", index=False, - quoting=csv.QUOTE_NONE, escapechar="\\") - elif version >= 2: - try: - # What to do with multiple search_limits - - if isinstance(self.query['search_limits'], list): - if fmt == "json" or version >= 3: - frame = self.multi_execute(version = version) - else: - # Only return first search limit if not return in json - self.query['search_limits'] = self.query['search_limits'][0] - else: - frame = self.data() - if fmt == "json": - return self.return_json(version=2) + if fmt == "json": + return self.return_json(version=2) - if fmt == "csv": - return frame.to_csv(encoding="utf8", index=False) + if fmt == "csv": + return frame.to_csv(encoding="utf8", index=False) - if fmt == "tsv": - return frame.to_csv(sep="\t", encoding="utf8", index=False) + if fmt == "tsv": + return frame.to_csv(sep="\t", encoding="utf8", index=False) - if fmt == "feather" or fmt == "feather_js": - compression = "zstd" - if fmt == "feather_js": - compression = "uncompressed" - fout = io.BytesIO(b'') - try: - feather.write_feather(frame, fout, compression = compression) - except: - logging.warning("You need the pyarrow package installed to export as feather.") - raise - fout.seek(0) - return fout.read() + if fmt == "feather" or fmt == "feather_js": + compression = "zstd" + if fmt == "feather_js": + compression = "uncompressed" + fout = io.BytesIO(b'') + try: + feather.write_feather(frame, fout, compression = compression) + except: + logging.warning("You need the pyarrow package installed to export as feather.") + raise + fout.seek(0) + return fout.read() - if fmt == 'json_c': - return self.return_rle_json(frame) + if fmt == 'json_c': + return self.return_rle_json(frame) - if fmt == 'html': - return self.html(frame) + if fmt == 'html': + return self.html(frame) - else: - err = dict(status="error", code=200, - message="Only formats in ['csv', 'tsv', 'json', 'feather']" - " currently supported") - return json.dumps(err) - except BookwormException as e: - # Error status codes are HTTP codes - # http://www.restapitutorial.com/httpstatuscodes.html - err = e.args[0] - err['status'] = "error" + else: + err = dict(status="error", code=200, + message="Only formats in ['csv', 'tsv', 'json', 'feather']" + " currently supported") return json.dumps(err) - except Exception as ex: - # General Uncaught error. - logging.exception("{}".format(ex)) - logging.exception("Database error") - return json.dumps({"status": "error", "message": "Database error. " - "Try checking field names."}) + except BookwormException as e: + # Error status codes are HTTP codes + # http://www.restapitutorial.com/httpstatuscodes.html + err = e.args[0] + err['status'] = "error" + return json.dumps(err) + except Exception as ex: + # General Uncaught error. + logging.exception("{}".format(ex)) + logging.exception("Database error") + return json.dumps({"status": "error", "message": "Database error. 
" + "Try checking field names."}) # Temporary catch-all pushes to the old methods: if method in ["returnPossibleFields", "search_results", "return_books", "schema"]: - try: - logging.warn("Using deprecated API call.") - - query = userquery(self.query) - if method == "return_books": - return query.execute() - return json.dumps(query.execute()) - except Exception as e: - if len(str(e)) > 1 and e[1].startswith("Unknown database"): - return "No such bookworm {}".format(e[1].replace("Unknown database","")) - except: - return "General error" - + return json.dumps({"status": "error", "message": "Database error. " + "Try checking field names."}) + raise "No return requested" def multi_execute(self, version=1): """ diff --git a/bookwormDB/manager.py b/bookwormDB/manager.py index f68a530..155fccf 100644 --- a/bookwormDB/manager.py +++ b/bookwormDB/manager.py @@ -1,12 +1,11 @@ from __future__ import print_function import re -from subprocess import call -from subprocess import Popen import logging import sys import os import bookwormDB import argparse +import nonconsumptive as nc from .store import store """ @@ -114,7 +113,7 @@ def query(self, args): Run a query against the API from the command line. """ - from bookwormDB.general_API import SQLAPIcall + from bookwormDB.general_API import DuckDBCall import json query = json.loads(args.APIcall) diff --git a/bookwormDB/sqliteKV.py b/bookwormDB/sqliteKV.py deleted file mode 100644 index c3dcb6d..0000000 --- a/bookwormDB/sqliteKV.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright © 2018 Sylvain PULICANI -# Super heavily changed by Ben Schmidt; the old version was a true -# kv store, this one just autoincrements a lookup table. - -# This should generally be thread safe for reads, but not for writes. -# If multip - -# This work is free. You can redistribute it and/or modify it under the -# terms of the Do What The Fuck You Want To Public License, Version 2, -# as published by Sam Hocevar. See the COPYING file for more details. - -# sqlite_kv.py -# -# Python implementation of the SQLiteKV store. - -import sqlite3 - - -class KV: - """ - Python implementation of the SQLiteKV store, with additionnal methods - to make it more pythonic. - ..Warning:: - * The `close` method has to be called after use. - * The `delete` method is not yet implemented. - """ - def __init__(self, dbfile): - """ - Open a connection to the SQLite file. If it doesn't exists, create it - and add the needed tables. - """ - self.conn = None - self.conn = sqlite3.connect(dbfile, detect_types=sqlite3.PARSE_DECLTYPES) - self.conn.row_factory = sqlite3.Row - - tables = [dict(r)['name'] for r in self.conn.execute( - "SELECT name FROM sqlite_master WHERE type='table'")] - - if 'keys' not in tables: - self.conn.execute("""CREATE TABLE keys( - ID INTEGER PRIMARY KEY ASC, - key TEXT UNIQUE NOT NULL)""") - - self.conn.execute("CREATE UNIQUE INDEX idx_keys ON keys(key)") - - - def close(self): - """ - Properly close the database. - """ - self.conn.commit() - self.conn.close() - - def __getitem__(self, key): - rows = self.conn.execute("""SELECT ID FROM keys - WHERE keys.key=(?)""", (key, )) - row = rows.fetchone() - if row is None: - raise KeyError(key) - return row['ID'] - - def register(self, key): - self.conn.execute("INSERT INTO keys(key) VALUES (?)", - (key, )) - diff --git a/bookwormDB/tokenizer.py b/bookwormDB/tokenizer.py deleted file mode 100644 index 341c3c4..0000000 --- a/bookwormDB/tokenizer.py +++ /dev/null @@ -1,309 +0,0 @@ -#! 
/usr/bin/python - -from __future__ import print_function -import random -import sys -import os -from .sqliteKV import KV -import time -import logging -import numpy as np -from pandas import read_csv -from io import StringIO - -""" -This section does a lot of work on tokenizing and aggregating wordcounts. -""" - -# import regex as re --now done only when the function is actually called. -# Set at a global to avoid multiple imports. - -import regex as re - -# Likewise, store a thread-wise count on whether we've thrown a unicode encoding error. -haveWarnedUnicode = False -# And the default regex is generated by a function on demand. -bigregex = None - - -def wordRegex(): - """ - #I'm including the code to create the regex, which makes it more readable. - Note that this uses *unicode*: among other things, that means that it needs to be passed - a unicode-decoded string: and that we have to use the "regex" module instead of the "re" module. Python3 will make this, perhaps, easier. - """ - MasterExpression = r"\w+" - possessive = MasterExpression + r"'s" - numbers = r"(?:[\$])?\d+" - decimals = numbers + r"\.\d+" - abbreviation = r"(?:mr|ms|mrs|dr|prof|rev|rep|sen|st|sr|jr|ft|gen|adm|lt|col|etc)\." - sharps = r"[a-gjxA-GJX]#" - punctuators = r"[^\w\p{Z}]" - """ - Note: this compiles looking for the most complicated words first, and as it goes on finds simpler and simpler forms - """ - bigregex = re.compile("|".join([decimals,possessive,numbers,abbreviation,sharps,punctuators,MasterExpression]),re.UNICODE|re.IGNORECASE) - bigregex = re.compile(u"\w+|\p{P}|\p{S}") - return bigregex - - -def readDictionaryFile(prefix=""): - look = dict() - for line in open(prefix + ".bookworm/texts/wordlist/wordlist.txt"): - line = line.rstrip("\n") - try: - v, k, _ = line.split("\t") - except ValueError: - print(line) - print([look.keys()][:10]) - raise - look[k] = v - return look - -def readIDfile(prefix=""): - if not os.path.exists(".bookworm/metadata/textids.sqlite"): - raise FileNotFoundError("No textids DB: run `bookworm build textids`") - return KV(prefix + ".bookworm/metadata/textids.sqlite") - -class tokenBatches(object): - """ - A tokenBatches is a manager for tokenizers. Each one corresponds to - a reasonable number of texts to read in to memory on a single processor: - during the initial loads, there will probably be one per core. - It doesn't store the original text, just the unigram and bigram tokenizations in its attached self.counts arrays. - - It writes out its dat to a single file: - in this way, a batch of up to several hundred thousand individual files is grouped into a single file. - - It also has a method that encodes and writes its wordcounts into a tsv file appropriate for reading with mysql, - with 3-byte integer encoding for wordid and bookid. - """ - - def __init__(self, levels=["unigrams", "bigrams"]): - """ - - mode: 'encode' (write files out) - """ - self.id = '%030x' % random.randrange(16**30) - self.levels=levels - - # placeholder to alert that createOutputFiles must be run. 
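Most of this deleted tokenizer module funnels into encodeRow below, which maps each n-gram's tokens through the word-to-id dictionary and writes tab-separated (textid, wordid..., count) rows, skipping any n-gram that contains an out-of-dictionary word. That inner loop in miniature, with invented ids:

dictionary = {"whale": "17", "white": "4"}  # word -> wordid, as strings
counts = {("white", "whale"): 3, ("sperm", "whale"): 1}
textid = 42
rows = []
for gram, count in counts.items():
    try:
        wordids = "\t".join(dictionary[w] for w in gram)
    except KeyError:
        continue  # a word missing from the dictionary drops the whole n-gram
    rows.append(f"{textid}\t{wordids}\t{count}")
print(rows)  # ['42\t4\t17\t3']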
- self._IDfile = None - self._dictionary = None - - def output_files(self, level): - if not hasattr(self, "outputFiles"): - self.outputFiles = dict() - if not level in self.outputFiles: - self.outputFiles[level] = open(".bookworm/texts/encoded/{}/{}.txt".format(level, self.id), "w") - return self.outputFiles[level] - - @property - def IDfile(self): - if self._IDfile: - return self._IDfile - self._IDfile = readIDfile() - return self._IDfile - - @property - def dictionary(self): - if self._dictionary: - return self._dictionary - self._dictionary = readDictionaryFile() - return self._dictionary - - def close(self): - """ - This test allows the creation of bookworms with fewer document than requested - threads, which happens to be the case in the tests. - """ - if hasattr(self, "outputFiles"): - for v in self.outputFiles.values(): - v.close() - - def encodeRow(self, - filename, - tokenizer, - write_completed=True - ): - """ - 'id': the filename - 'tokenizer': a tokenizer object - - """ - - #The dictionary and ID lookup tables should be pre-attached. - dictionary = self.dictionary - IDfile = self.IDfile - - levels = None - - """ - if source=="raw_text": - parts = row.split("\t", 1) - filename = parts[0] - try: - tokens = tokenizer(parts[1]) - except IndexError: - logging.warn("\nFound no tab in the input for '" + filename + "'...skipping row\n") - levels = self.levels - - if source == "countfile": - try: - (filename, token, count) = row.split("\t") - except: - logging.error("Can't find tab\n***************") - logging.error(row) - raise - tokens = preTokenized(token, count, self.levels[0]) - """ - - try: - textid = IDfile[filename] - except KeyError: - logging.warn("Warning: file " + filename + " not found in jsoncatalog.txt, not encoding") - return - - for level in self.levels: - outputFile = self.output_files(level) - output = [] - - counts = tokenizer.counts(level) - - for wordset, count in counts.items(): - skip = False - wordList = [] - for word in wordset: - try: - wordList.append(dictionary[word]) - except KeyError: - """ - if any of the words to be included is not in the dictionary, - we don't include the whole n-gram in the counts. - """ - skip = True - if not skip: - wordids = "\t".join(wordList) - output.append("{}\t{}\t{}".format(int(textid), wordids, count)) - - try: - if len(output) > 0: - # The test is necessary because otherwise this prints a blank line. - outputFile.write("\n".join(output) + "\n") - - except IOError as e: - logging.exception(e) - -class Tokenizer(object): - """ - A tokenizer is initialized with a single text string. - - It assumes that you have in namespace an object called "bigregex" which - identifies words. - - (I'd define it here, but it's a performance optimization to avoid compiling the large regex millions of times.) - - the general way to call it is to initialize, and then for each desired set of counts call "tokenizer.counts("bigrams")" (or whatever). - - That returns a dictionary, whose keys are tuples of length 1 for unigrams, 2 for bigrams, etc., and whose values are counts for that ngram. The tuple form should allow faster parsing down the road. 
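The ngrams() method defined a little further down builds its n-grams by zipping n shifted copies of the token list against each other, which is why the counts dictionary is keyed by tuples. The trick in two lines:

tokens = ["call", "me", "ishmael"]
bigrams = list(zip(*[tokens[i:] for i in range(2)]))
print(bigrams)  # [('call', 'me'), ('me', 'ishmael')]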
- - """ - - def __init__(self, string, tokenization_regex=None): - global haveWarnedUnicode - self.string = string - self.tokenization_regex = tokenization_regex - self._tokens = None - - @property - def tokens(self): - if self._tokens: - return self._tokens - self._tokens = self.tokenize() - return self._tokens - - def tokenize(self): - - tokenization_regex=self.tokenization_regex - global re - if re is None: - import regex as re - if tokenization_regex is None: - # by default, use the big regex. - global bigregex - if bigregex==None: - bigregex = wordRegex() - tokenization_regex = bigregex - - - components = self.string.split("\f") - return [re.findall(tokenization_regex, component) for component in components] - - def ngrams(self, n, collapse = False): - """ - All the ngrams in the text can be created as a tuple by zipping an arbitrary number of - copies of the text to itself. - """ - values = [] - for tokenset in self.tokens: - values.extend(zip(*[tokenset[i:] for i in range(n)])) - if collapse: - values = [" ".join(tupled) for tupled in values] - return values - - def unigrams(self): - return self.ngrams(1) - - def bigrams(self): - return self.ngrams(2) - - def trigrams(self): - return self.ngrams(3) - - def allgrams(self, max = 6): - output = [] - for i in range(1, max + 1): - output.extend(self.ngrams(i, collapse = True)) - return output - - def words(self): - """ - 1-grams have tuple keys, but words have index keys. - """ - return [item for sublist in self.tokens for item in sublist] - - def counts(self, whichType): - - count = dict() - for gram in getattr(self,whichType)(): - try: - count[gram] += 1 - except KeyError: - count[gram] = 1 - return count - - -class PreTokenized(object): - """ - This class is a little goofy: it mimics the behavior of a tokenizer - one data that's already been tokenized by something like - Google Ngrams or JStor Data for Research. - """ - - def __init__(self, csv_string, level): - f = read_csv(StringIO(csv_string), - lineterminator = "\f", - # Ugh--want 'NA' to be a word. - dtype = {'word': str, 'counts': np.int}, - keep_default_na=False, - names = ["word", "counts"]) - self.level = level - if level == 'words': - self.output = dict(zip(f.word, f.counts)) - else: - self.output = dict(zip([tuple(w.split(" ")) for w in f.word], f.counts)) - - def counts(self, level): - if level != self.level: - raise - return self.output diff --git a/bookwormDB/variableSet.py b/bookwormDB/variableSet.py deleted file mode 100644 index 7a36692..0000000 --- a/bookwormDB/variableSet.py +++ /dev/null @@ -1,809 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -import json -import os -import decimal -import re -from MySQLdb import escape_string -import logging -import subprocess -from .sqliteKV import KV - -def to_unicode(obj): - if isinstance(obj, bytes): - obj = str(obj) - if isinstance(obj, int) or isinstance(obj, float) or isinstance(obj, decimal.Decimal): - obj = str(obj) - return obj - -def splitMySQLcode(string): - - """ - MySQL code can only be executed one command at a time, and fails if it has any empty slots - So as a convenience wrapper, I'm just splitting it and returning an array. 
- """ - logging.debug(f"Splitting: '{string}'") - try: - output = ['%s;\n' % query for query in string.split(';') if re.search(r"\w", query)] - output = [o.strip("\\n") for o in output if o != "\\n"] - except AttributeError: - # Occurs when the field is completely empty - output = [] - return output - - -def guessBasedOnNameAndContents(metadataname,dictionary): - """ - This makes a guess based on the data field's name and type. - CUrrently it assumes everything is categorical; that can really chunk out on some text fields, but works much better for importing csvs. Probably we want to take some other things into account as well. - """ - description = {"field":metadataname,"datatype":"categorical","type":"character","unique":True} - - example = list(dictionary.keys())[0] - - if type(example) == int: - description["type"] = "integer" - - if type(example) == list: - description["unique"] = False - - if metadataname == "searchstring": - return {"datatype": "searchstring", "field": "searchstring", "unique": True, "type": "text"} - - if re.search("date",metadataname) or re.search("time",metadataname): - description["datatype"] = "time" - - values = [dictionary[key] for key in dictionary] - averageNumberOfEntries = sum(values)/ len(values) - - if averageNumberOfEntries > 2: - description["datatype"] = "categorical" - - return description - - -class dataField(object): - """ - This define a class that supports a data field from a json definition. - We'll use this to spit out appropriate sql code and JSON where needed. - The 'definition' here means the user-generated array (submitted in json but - parsed out before this) described in the Bookworm interface. - This knows whether it's unique, whether it should treat itself as a date, etc. - - The complicated bits are about allowing fast lookups for arbitrary-length - character lookups: for a variable like "country," it will also create - the new field "country__id" and the table "countryLookup" to allow - faster joins on the main database - """ - - def __init__(self, definition, dbToPutIn, anchorType="MEDIUMINT UNSIGNED", anchor="bookid",table="catalog",fasttab="fastcat"): - #anchorType should be derived from somewhere. - self.anchorType = anchorType - self.anchor = anchor - - for key in definition.keys(): - vars(self)[key] = definition[key] - self.dbToPutIn = dbToPutIn - - #ordinarily, a column has no alias other than itself. - self.alias = self.field - self.status = "hidden" - - #The table it's stored in will be either 'catalog', or a new - #table named after the variable. For now, at least. (later the anchor should get used). - - self.fastField = self.field - self.finalTable = fasttab - if self.datatype == "categorical": - self.type = "character" - #This will catch a common sort of mistake (calling it text), - #but also coerce any categorical data to have fewer than 255 characters. - #This is worth it b/c a more than 255-character field will take *forever* to build. - self.fastField = "%s__id" % self.field - self.alias = self.fastField - #If it's a categorical variable, it will be found in a lookup table. 
- self.finalTable = self.field + "Lookup" - self.status = "public" - - if self.datatype == "time": - self.status = "public" - - if self.unique: - self.table = table - self.fasttab = fasttab - - else: - self.table = self.field + "Disk" - self.fasttab = self.field + "heap" - self.outputloc = ".bookworm/metadata/%s.txt" % self.field - - - def __repr__(self): - val = "Data Field '{}'".format(self.field) - val += "\n\tdatatype: {}".format(self.datatype) - val += "\n\ttype: {}".format(self.type) - val += "\n\tuniqueness: {}".format(self.unique) - return val - - def slowSQL(self, withIndex=False): - """ - This returns something like "author VARCHAR(255)", - a small definition string with an index, potentially. - """ - - mysqltypes = { - "character": "VARCHAR(255)", - "integer": "INT", - "text": "VARCHAR(5000)", - "decimal": "DECIMAL (9,4)", - "float": "FLOAT" - } - - # Indexing both the field and against the anchor for fast memory table creation. - indexstring = ", INDEX (%(field)s), INDEX (%(anchor)s, %(field)s " % self.__dict__ - #need to specify fixed prefix length on text strings: (http://dev.mysql.com/doc/refman/5.0/en/create-index.html) - # If it's a text field, we need to curtail the index at 255 characters - # or else indexes start timing out or eating up all the memory. - indextypes = { - "character": "%s)" % indexstring, - "integer": "%s)" % indexstring, - "text": "%s (255) )" % indexstring, - "decimal": "%s)" % indexstring - } - createstring = " %s %s" % (self.field, mysqltypes[self.type]) - - if withIndex and self.type != 'text' and self.type != "float": - return '%s%s' % (createstring, indextypes[self.type]) - - return createstring - - def fastSQL(self): - """ - This creates code to go in a memory table: it assumes that the disk - tables are already there, and that a connection cursor is active. - Memory tables in MySQL don't suppor the VARCHAR (they just take up all - 255 characters or whatever); thus, it has to be stored this other way. - """ - if self.datatype != 'etc': - if self.type == "character": - self.setIntType() - return " %(field)s__id %(intType)s" % self.__dict__ - if self.type == "integer": - return " %s INT" % self.field - if self.type == "decimal": - return " %s DECIMAL (9,4) " % self.field - if self.type == "float": - return " %s FLOAT " % self.field - else: - return None - else: - return None - - def buildDiskTable(self,fileLocation="default"): - """ - Builds a disk table for a nonunique variable. 
- """ - db = self.dbToPutIn - dfield = self - - if fileLocation == "default": - fileLocation = ".bookworm/metadata/" + dfield.field + ".txt" - - logging.info("Making a SQL table to hold the data for " + dfield.field) - - q1 = """DROP TABLE IF EXISTS """ + dfield.field + "Disk" - db.query(q1) - db.query("""CREATE TABLE IF NOT EXISTS """ + dfield.field + """Disk ( - """ + self.anchor + " " + self.anchorType + """, - """ + dfield.slowSQL(withIndex=True) + """ - );""") - db.query("ALTER TABLE " + dfield.field + "Disk DISABLE KEYS;") - loadcode = """LOAD DATA LOCAL INFILE '""" + fileLocation + """' - INTO TABLE """ + dfield.field + """Disk - FIELDS ESCAPED BY '';""" - db.query(loadcode) - # cursor = db.query("""SELECT count(*) FROM """ + dfield.field + """Disk""") - db.query("ALTER TABLE " + dfield.field + "Disk ENABLE KEYS") - - def build_ID_and_lookup_tables(self): - IDcode = self.buildIdTable() - for query in splitMySQLcode(IDcode): - self.dbToPutIn.query(query) - for query in splitMySQLcode(self.fastLookupTableIfNecessary("MYISAM")): - self.dbToPutIn.query(query) - for query in splitMySQLcode(self.fastSQLTable("MYISAM")): - self.dbToPutIn.query(query) - - def fastLookupTableIfNecessary(self, engine="MEMORY"): - """ - This uses the already-created ID table to create a memory lookup. - """ - - self.engine = engine - if self.datatype == 'categorical': - logging.debug("Creating a memory lookup table for " + self.field) - self.setIntType() - self.maxlength = self.dbToPutIn.query("SELECT MAX(CHAR_LENGTH(%(field)s)) FROM %(field)s__id" % self.__dict__) - self.maxlength = self.maxlength.fetchall()[0][0] - try: - self.maxlength = max([self.maxlength,1]) - except TypeError: - logging.error(f"Unable to calculate length for {self.field}" - "perhaps there are no entries in the catalog?") - self.maxlength = 1; - code = """DROP TABLE IF EXISTS tmp; - CREATE TABLE tmp (%(field)s__id %(intType)s ,PRIMARY KEY (%(field)s__id), - %(field)s VARCHAR (%(maxlength)s) ) ENGINE=%(engine)s - SELECT %(field)s__id,%(field)s FROM %(field)s__id;""" % self.__dict__ - tname = self.field+"Lookup" - if engine=="MYISAM": - tname += "_" - - code += "DROP TABLE IF EXISTS {}; RENAME TABLE tmp to {}".format(tname,tname) - return code - return "" - - def fastSQLTable(self,engine="MEMORY"): - #setting engine to another value will create these tables on disk. 
- queries = "" - self.engine = engine - tname = self.field + "heap" - if engine=="MYISAM": - tname += "_" - if self.unique and self.anchor=="bookid": - pass #when it has to be part of a larger set - if not self.unique and self.datatype == 'categorical': - self.setIntType() - queries += """DROP TABLE IF EXISTS tmp;""" - queries += """CREATE TABLE tmp (%(anchor)s %(anchorType)s , INDEX (%(anchor)s),%(field)s__id %(intType)s ) ENGINE=%(engine)s; """ % self.__dict__ - if engine=="MYISAM": - queries += "INSERT INTO tmp SELECT %(anchor)s ,%(field)s__id FROM %(field)s__id JOIN %(field)sDisk USING (%(field)s); " % self.__dict__ - elif engine=="MEMORY": - queries += "INSERT INTO tmp SELECT * FROM {}_; ".format(tname) - queries += "DROP TABLE IF EXISTS {}; RENAME TABLE tmp TO {}; ".format(tname,tname) - - if self.datatype == 'categorical' and self.unique: - pass - - return queries - - def setIntType(self): - try: - alreadyExists = self.intType - except AttributeError: - cursor = self.dbToPutIn.query("SELECT count(DISTINCT "+ self.field + ") FROM " + self.table) - self.nCategories = cursor.fetchall()[0][0] - self.intType = "INT UNSIGNED" - if self.nCategories <= 16777215: - self.intType = "MEDIUMINT UNSIGNED" - if self.nCategories <= 65535: - self.intType = "SMALLINT UNSIGNED" - if self.nCategories <= 255: - self.intType = "TINYINT UNSIGNED" - - def buildIdTable(self, minimum_occurrence_rate = 1/100000): - - """ - This builds an integer crosswalk ID table with a field that stores categorical - information in the fewest number of bytes. This is important because it can take - significant amounts of time to group across categories if they are large: - for example, with 4 million newspaper articles, on one server a GROUP BY with - a 12-byte VARCHAR field takes 5.5 seconds, but a GROUP BY with a 3-byte MEDIUMINT - field corresponding exactly to that takes 2.2 seconds on the exact same data. - That sort of query is included in every single bookworm - search multiple times, so it's necessary to optimize. - Plus, it means we can save space on memory storage - in important ways as well. - """ - #First, figure out how long the ID table has to be and make that into a datatype. - #Joins and groups are slower the larger the field grouping on, so this is worth optimizing. - self.setIntType() - - returnt = "DROP TABLE IF EXISTS tmp;\n\n" - - returnt += "CREATE TABLE tmp ENGINE=MYISAM SELECT %(field)s,count(*) as count FROM %(table)s GROUP BY %(field)s;\n\n" % self.__dict__ - - # XXXX to fix - # Hardcoding this for now at one per 100K in the method definition. Could be user-set. 
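buildIdTable sizes its surrogate-key column with the same heuristic setIntType applies above: count the distinct categories and pick the narrowest unsigned MySQL integer type that can hold them. The thresholds, pulled out as a standalone function:

def int_type(n_categories):
    if n_categories <= 255:
        return "TINYINT UNSIGNED"
    if n_categories <= 65535:
        return "SMALLINT UNSIGNED"
    if n_categories <= 16777215:
        return "MEDIUMINT UNSIGNED"
    return "INT UNSIGNED"

print(int_type(48))  # TINYINT UNSIGNED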
- n_documents = self.dbToPutIn.query("SELECT COUNT(*) FROM catalog").fetchall()[0][0] - self.minimum_count = round(n_documents*minimum_occurrence_rate) - # XXXX - - returnt +="DELETE FROM tmp WHERE count < %(minimum_count)s;" % self.__dict__ - - returnt += "DROP TABLE IF EXISTS %(field)s__id;\n\n" % self.__dict__ - - returnt += """CREATE TABLE IF NOT EXISTS %(field)s__id ( - %(field)s__id %(intType)s PRIMARY KEY AUTO_INCREMENT, - %(field)s VARCHAR (255), INDEX (%(field)s, %(field)s__id), %(field)s__count MEDIUMINT UNSIGNED);\n\n""" % self.__dict__ - - returnt += """INSERT INTO %(field)s__id (%(field)s,%(field)s__count) - SELECT %(field)s,count FROM tmp LEFT JOIN %(field)s__id USING (%(field)s) WHERE %(field)s__id.%(field)s__id IS NULL - ORDER BY count DESC;\n\n""" % self.__dict__ - - returnt += """DROP TABLE tmp;\n\n""" - - self.idCode = "%s__id" % self.field - return returnt - - def clear_associated_memory_tables(self): - """ - Remove all data from memory tables associated with this variable. - Useful when refreshing the database. - """ - db = self.dbToPutIn - def exists(tablename): - return len(db.query("SHOW TABLES LIKE '" + tablename + "'").fetchall())>0 - if exists(self.fasttab): - logging.debug("DELETING FROM " + self.fasttab) - self.dbToPutIn.query("DELETE FROM " + self.fasttab) - if not self.unique: - if exists(self.field+"heap"): - self.dbToPutIn.query("DELETE FROM " + self.field + "heap") - if self.datatype=="categorical": - if exists(self.field+"Lookup"): - self.dbToPutIn.query("DELETE FROM " + self.field+"Lookup") - - def updateVariableDescriptionTable(self): - self.memoryCode = self.fastLookupTableIfNecessary() - code = """DELETE FROM masterVariableTable WHERE dbname="%(field)s"; - INSERT INTO masterVariableTable - (dbname, name, type, tablename, anchor, alias, status,description) - VALUES - ('%(field)s','%(field)s','%(type)s','%(finalTable)s','%(anchor)s','%(alias)s','%(status)s','') """ % self.__dict__ - self.dbToPutIn.query(code) - if not self.unique: - code = self.fastSQLTable() - try: - parentTab = self.dbToPutIn.query(""" - SELECT tablename FROM masterVariableTable - WHERE dbname='%s'""" % self.fastAnchor).fetchall()[0][0] - except: - parentTab="fastcat" - self.dbToPutIn.query('DELETE FROM masterTableTable WHERE masterTableTable.tablename="%s";' % (self.field + "heap")) - q = "INSERT INTO masterTableTable VALUES (%s,%s,%s)" - self.dbToPutIn.query(q, (self.field + "heap", parentTab, code)) - if self.datatype=="categorical": - #Variable Info - - code = """ - DELETE FROM masterVariableTable WHERE dbname='%(field)s__id'; - INSERT IGNORE INTO masterVariableTable - (dbname, name, type, tablename, - anchor, alias, status,description) - VALUES - ('%(field)s__id','%(field)s','lookup','%(fasttab)s', - '%(anchor)s','%(alias)s','hidden','') """ % self.__dict__ - self.dbToPutIn.query(code) - #Separate Table Info - code = self.fastLookupTableIfNecessary() - self.dbToPutIn.query('DELETE FROM masterTableTable WHERE masterTableTable.tablename="%s";' %(self.field + "Lookup")) - -# code = escape_string(code) -# if isinstance(code, bytes): -# code = str(code, 'utf-8') -# if (code.startswith(b'b')): -# print("\n\n") -# print(code) - -# self.dbToPutIn.query(q) - - q = "INSERT INTO masterTableTable VALUES (%s, %s, %s)" - - self.dbToPutIn.query(q, (self.field+"Lookup", self.fasttab, code)) - - -class variableSet(object): - def __init__(self, - originFile=".bookworm/metadata/jsoncatalog_derived.txt", - anchorField="bookid", - jsonDefinition=None, - db=None): - self.db = db - self.anchorField = 
anchorField - self.originFile=originFile - self.jsonDefinition=jsonDefinition - logging.debug(jsonDefinition) - - if jsonDefinition==None: - logging.warning("No field_descriptions.json file provided, so guessing based " - "on variable names.") - self.jsonDefinition=self.guessAtFieldDescriptions() - else: - with open(jsonDefinition,"r") as fin: - self.jsonDefinition = json.loads(fin.read()) - - self.setTableNames() - self.catalogLocation = ".bookworm/metadata/" + self.tableName + ".txt" - - - self.variables = [] - - for item in self.jsonDefinition: - #The anchor field has special methods hard coded in. - - if item['field'] == self.anchorField: - continue - self.variables.append(dataField(item,self.db,anchor=anchorField,table=self.tableName,fasttab=self.fastName)) - - def __repr__(self): - return "A variable set of {} objects".format(len(self.variables)) - - def setTableNames(self): - """ - For the base case, they're catalog and fastcat: otherwise, it's just they key - and the first variable associated with it. - """ - if os.path.split(self.originFile)[-1] == 'jsoncatalog_derived.txt': - self.tableName = "catalog" - self.fastName = "fastcat" - - else: - try: - self.tableName = self.jsonDefinition[0]['field'] + "_" + self.jsonDefinition[1]['field'] - except IndexError: - #if it's only one element long, just name it after the variable itself. - #Plus the string 'unique', to prevent problems of dual-named tables; - self.tableName = "unick_" + self.jsonDefinition[0]['field'] - - self.fastName = self.tableName + "heap" - - def guessAtFieldDescriptions(self,stopAfter=30000): - allMyKeys = dict() - unique = True - - for i, line in enumerate(open(self.originFile)): - try: - entry = json.loads(line.rstrip("\n")) - except: - logging.warning("Error in line {} of {}".format(i, self.originFile)) - logging.warning(line) - - for key in entry: - if type(entry[key])==list: - unique=False - else: - #Treat it for counting sake as a single element list. - entry[key] = [entry[key]] - for value in entry[key]: - try: - allMyKeys[key][value] += 1 - except KeyError: - try: - allMyKeys[key][value] = 1 - except KeyError: - allMyKeys[key] = dict() - allMyKeys[key][value] = 1 - if i > stopAfter: - break - - myOutput = [] - - for metadata in allMyKeys: - - bestGuess = guessBasedOnNameAndContents(metadata,allMyKeys[metadata]) - if unique==False: - bestGuess['unique'] = False - - myOutput.append(bestGuess) - - myOutput = [output for output in myOutput if output["field"] != "filename"] - - return myOutput - - def uniques(self,type="base"): - """ - Some frequent patterns that tend to need to be iterated through. 
- """ - - if type=="base": - return [variable for variable in self.variables if variable.unique] - if type=="fast": - return [variable for variable in self.variables if (variable.unique and variable.fastSQL() is not None)] - if type=="categorical": - return [variable for variable in self.variables if (variable.unique and variable.fastSQL() is not None and variable.datatype=="categorical")] - - def notUniques(self): - return [variable for variable in self.variables if not variable.unique] - - def anchorLookupDictionary(self): - db = self.db - anchor = self.anchorField - self.fastAnchor = self.anchorField - - if anchor == "bookid" and self.tableName != "catalog": - self.fastAnchor="bookid" - bookids = DummyDict() - - elif anchor=="filename" or anchor=="bookid": - self.fastAnchor = "bookid" - bookids = dict() - try: - """ - It is faster, better, and (on the first run only) sometimes necessary - to pull the textids from the original files, not the database. - """ - bookids = KV(".bookworm/metadata/textids.sqlite") - for variable in self.variables: - variable.anchor=self.fastAnchor - except IOError: - logging.info("Pulling bookids from catalog...") - results = db.query("SELECT bookid,filename FROM catalog;") - logging.info("... bookids have been retrieved.") - for row in results.fetchall(): - bookids[row[1]] = row[0] - logging.info("... and are loaded into a dictionary.") - for variable in self.variables: - variable.anchor=self.fastAnchor - else: - query = """SELECT alias FROM masterVariableTable WHERE dbname='%s'""" % (anchor) - bookids = dict() - cursor = db.query("SELECT alias FROM masterVariableTable WHERE dbname = '%s'" % anchor) - try: - fastAnchor = cursor.fetchall()[0][0] - except: - if anchor in ["bookid","filename"]: - fastAnchor="bookid" - logging.warning("Unable find an alias in the DB for anchor" + anchor + "\n\n") - self.fastAnchor=fastAnchor - if fastAnchor != anchor: - results = db.query("SELECT * FROM %sLookup_;" % (anchor)) - for row in results.fetchall(): - bookids[row[1]] = row[0] - self.anchor=fastAnchor - for variable in self.variables: - variable.anchor = fastAnchor - else: - #construct a phony dictionary that just returns what you gave - bookids = DummyDict() - - return bookids - - def writeMetadata(self,limit=float("Inf")): - #Write out all the metadata into files that MySQL is able to read in. - """ - This is a general purpose, with a few special cases for the primary use case that this is the - "catalog" table that hold the primary lookup information. - """ - linenum = 1 - variables = self.variables - bookids = self.anchorLookupDictionary() - - metadatafile = open(self.originFile) - - - #Open files for writing to - path = os.path.dirname(self.catalogLocation) - try: - os.makedirs(path) - except OSError: - if not os.path.isdir(path): - raise - - catalog = open(self.catalogLocation, 'w') - - for variable in [variable for variable in variables if not variable.unique]: - variable.output = open(variable.outputloc, 'w') - - for entry in metadatafile: - - try: - entry = json.loads(entry) - except: - logging.warning("""WARNING: json parsing failed for this JSON line: - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n""" + entry) - - continue - - #We always lead with the bookid and the filename. - #Unicode characters in filenames may cause problems? 
- if self.anchorField=="bookid" and self.tableName=="catalog": - self.anchorField="filename" - - filename = to_unicode(entry[self.anchorField]) - - try: - bookid = bookids[entry[self.anchorField]] - except KeyError: - if self.tableName=="catalog": - logging.warning("No entry for {}".format(entry[self.anchorField])) - continue - # bookid = bookids.bump(entry[self.anchorField]) - else: - #If the key isn't in the name table, we have no use for this entry. - continue - mainfields = [str(bookid),to_unicode(entry[self.anchorField])] - - if self.tableName != "catalog": - #It can get problematic to have them both, so we're just writing over the - #anchorField here. - mainfields = [str(bookid)] - # First, pull the unique variables and write them to the 'catalog' table - - for var in [variable for variable in variables if variable.unique]: - if var.field not in [self.anchorField,self.fastAnchor]: - myfield = entry.get(var.field, "") - if myfield is None: - myfield = '' - mainfields.append(to_unicode(myfield)) - catalogtext = '%s\n' % '\t'.join(mainfields) - try: - catalog.write(catalogtext) - except TypeError: - catalog.write(catalogtext) - - for variable in [variable for variable in variables if not variable.unique]: - # Each of these has a different file it must write to... - outfile = variable.output - lines = entry.get(variable.field, []) - if isinstance(lines, (str, bytes, int)): - """ - Allow a single element to be represented as a string - """ - lines = [lines] - if lines==None: - lines = [] - for line in lines: - try: - writing = '%s\t%s\n' % (str(bookid), to_unicode(line)) - outfile.write(writing) - except: - logging.warning("some sort of error with bookid no. " +str(bookid) + ": " + json.dumps(lines)) - pass - if linenum > limit: - break - linenum=linenum+1 - for variable in [variable for variable in variables if not variable.unique]: - variable.output.close() - catalog.close() - metadatafile.close() - - def loadMetadata(self): - """ - Load in the metadata files which have already been created elsewhere. - """ - - #This function is called for the sideffect of assigning a `fastAnchor` field - bookwormcodes = self.anchorLookupDictionary() - db = self.db - logging.info("Making a SQL table to hold the catalog data") - - if self.tableName=="catalog": - """A few necessary basic fields""" - mysqlfields = ["bookid MEDIUMINT UNSIGNED, PRIMARY KEY(bookid)", "filename VARCHAR(255)", "nwords INT"] - else: - mysqlfields = ["%s MEDIUMINT UNSIGNED, PRIMARY KEY (%s)" % (self.fastAnchor,self.fastAnchor)] - for variable in self.uniques(): - createstring = variable.slowSQL(withIndex=True) - mysqlfields.append(createstring) - - if len(mysqlfields) > 1: - #This creates the main (slow) catalog table - db.query("""DROP TABLE IF EXISTS %s """ % self.tableName) - createcode = """CREATE TABLE IF NOT EXISTS %s ( - """ % self.tableName + ",\n".join(mysqlfields) + ") ENGINE=MYISAM;" - try: - db.query(createcode) - except: - logging.error("Unable to create table for metadata: SQL Code follows") - logging.error(createcode) - raise - #Never have keys before a LOAD DATA INFILE - db.query("ALTER TABLE %s DISABLE KEYS" % self.tableName) - logging.info("loading data into %s using LOAD DATA LOCAL INFILE..." 
% self.tableName) - anchorFields = self.fastAnchor - - if self.tableName=="catalog": - anchorFields = "bookid,filename" - - loadEntries = { - "catLoc": self.catalogLocation, - "tabName": self.tableName, - "anchorFields": anchorFields, - "loadingFields": anchorFields + "," + ','.join([field.field for field in self.variables if field.unique]) - } - - loadEntries['loadingFields'] = loadEntries['loadingFields'].rstrip(',') - logging.debug("loading in data from " + self.catalogLocation) - loadcode = """LOAD DATA LOCAL INFILE '%(catLoc)s' - INTO TABLE %(tabName)s FIELDS ESCAPED BY '' - (%(loadingFields)s)""" % loadEntries - - db.query(loadcode) - logging.info("enabling keys on %s" %self.tableName) - db.query("ALTER TABLE %s ENABLE KEYS" % self.tableName) - - #If there isn't a 'searchstring' field, it may need to be coerced in somewhere hereabouts - - #This here stores the number of words in between catalog updates, so that the full word counts only have to be done once since they're time consuming. - if self.tableName=="catalog": - self.createNwordsFile() - - for variable in self.notUniques(): - variable.buildDiskTable() - - for variable in self.variables: - if variable.datatype=="categorical": - variable.build_ID_and_lookup_tables() - - if len(self.uniques()) > 0 and self.tableName!="catalog": - #catalog has separate rules handled in CreateDatabase.py. - fileCommand = self.uniqueVariableFastSetup("MYISAM") - for query in splitMySQLcode(fileCommand): - db.query(query) - - def uniqueVariableFastSetup(self,engine="MEMORY"): - fileCommand = "DROP TABLE IF EXISTS tmp;" - fileCommand += "CREATE TABLE tmp ({} MEDIUMINT UNSIGNED, PRIMARY KEY ({}), ".format( - self.fastAnchor,self.fastAnchor - ) - fileCommand += ",\n".join([variable.fastSQL() for variable in self.variables if (variable.unique and variable.fastSQL() is not None)]) - fileCommand += ") ENGINE=%s;\n" % engine - - fast_fields = self.fastAnchor + ", " + ",".join([variable.fastField for variable in self.variables if variable.unique and variable.fastSQL() is not None]) - - fileCommand += "INSERT INTO tmp SELECT " + fast_fields - fileCommand += " FROM %s " % self.tableName - fileCommand += " ".join([" JOIN %(field)s__id USING (%(field)s ) " % variable.__dict__ for variable in self.variables if variable.unique and variable.fastSQL() is not None and variable.datatype=="categorical"])+ ";\n" - - name = self.fastName - if engine=="MYISAM": - name += "_" - fileCommand += "DROP TABLE IF EXISTS %s;\n" % name - fileCommand += "RENAME TABLE tmp TO %s;\n" % name - - return fileCommand - - def updateMasterVariableTable(self): - """ - All the categorical variables get a lookup table; - we store the create code in the databse; - """ - for variable in self.variables: - # Make sure the variables know who their parent is - variable.fastAnchor = self.fastAnchor - # Update the referents for everything - variable.updateVariableDescriptionTable() - - inCatalog = self.uniques() - if len(inCatalog) > 0 and self.tableName!="catalog": - #catalog has separate rules handled in CreateDatabase.py; so this builds - #the big rectangular table otherwise. - #It will fail if masterTableTable doesn't exister. 
- fileCommand = self.uniqueVariableFastSetup() - try: - parentTab = self.db.query(""" - SELECT tablename FROM masterVariableTable - WHERE dbname='%s'""" % self.fastAnchor).fetchall()[0][0] - except: - if self.fastAnchor=="bookid": - parentTab="fastcat" - else: - logging.error("Unable to find a table to join the anchor (%s) against" % self.fastAnchor) - raise - self.db.query('DELETE FROM masterTableTable WHERE masterTableTable.tablename="%s";' %self.fastName) - self.db.query("INSERT INTO masterTableTable VALUES (%s, %s, %s)", (self.fastName,parentTab,escape_string(fileCommand))) - - def createNwordsFile(self): - """ - A necessary supplement to the `catalog` table. - """ - db = self.db - - db.query("CREATE TABLE IF NOT EXISTS nwords (bookid MEDIUMINT UNSIGNED, PRIMARY KEY (bookid), nwords INT);") - db.query("UPDATE catalog JOIN nwords USING (bookid) SET catalog.nwords = nwords.nwords") - db.query("INSERT INTO nwords (bookid,nwords) SELECT catalog.bookid,sum(count) FROM catalog LEFT JOIN nwords USING (bookid) JOIN master_bookcounts USING (bookid) WHERE nwords.bookid IS NULL GROUP BY catalog.bookid") - db.query("UPDATE catalog JOIN nwords USING (bookid) SET catalog.nwords = nwords.nwords") - - - -class DummyDict(dict): - """ - Stupid little hack. - Looks like a dictionary, but just returns itself. - Used in cases where we don't actually need the dictionary. - """ - # we need to have it there. - def __missing__(self,key): - return key diff --git a/tests/test_API.py b/tests/test_API.py index 4fab13a..eb6bcf3 100644 --- a/tests/test_API.py +++ b/tests/test_API.py @@ -1,34 +1,60 @@ # -*- coding: utf-8 -*- -from builtins import range -from builtins import object -import unittest +import pytest import bookwormDB -import bookwormDB.CreateDatabase -from bookwormDB.general_API import SQLAPIcall as SQLAPIcall +from bookwormDB.general_API import DuckDBCall as DuckDBCall +from bookwormDB.builder import BookwormCorpus +from pathlib import Path import logging import os +import duckdb from subprocess import call as call import sys import json -from setup import setup_bookworm, setup_bookworm_unicode -class Bookworm_SQL_Creation(unittest.TestCase): - - def test_bookworm_files_exist(self): - bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase("federalist_bookworm") - db = bookworm.db - db.query("USE federalist_bookworm") - wordCount = db.query("SELECT SUM(nwords) FROM fastcat_").fetchall()[0][0] +import pytest + + +@pytest.fixture(scope="session") +def federalist_bookworm(tmpdir_factory): + path = tmpdir_factory.mktemp("ascii").join("federalist.duckdb") + tmpdir = tmpdir_factory.mktemp("tmpdir") + corp = BookwormCorpus( + f"{path}", + texts = Path('tests/test_bookworm_files/input.txt'), + metadata = "tests/test_bookworm_files/jsoncatalog.txt", + dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) + corp.build() + con = duckdb.connect(str(path), read_only = True) + return con + + + +@pytest.fixture(scope="session") +def unicode_bookworm(tmpdir_factory): + path = tmpdir_factory.mktemp("unicode").join("unicode.duckdb") + tmpdir = tmpdir_factory.mktemp("tmpdir") + + corp = BookwormCorpus( + f"{path}", + texts = Path('tests/test_bookworm_files_unicode/input.txt'), + metadata = "tests/test_bookworm_files_unicode/jsoncatalog.txt", + dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) + corp.build() + con = duckdb.connect(str(path), read_only = True) + return con + +class Test_Bookworm_SQL_Creation(): + def test_bookworm_files_exist(self, federalist_bookworm): + wordCount = 
federalist_bookworm.query("SELECT SUM(nwords) FROM fastcat").fetchall()[0][0] # This should be 212,081, but I don't want the tests to start failing when # we change the tokenization rules or miscellaneous things about encoding. - self.assertTrue(wordCount>100000) + assert wordCount > 200000 """ Then we test whether the API can make queries on that bookworm. """ - def test_API(self): - from bookwormDB.general_API import SQLAPIcall as SQLAPIcall + def test_groups(self, federalist_bookworm): import json query = { @@ -39,12 +65,11 @@ def test_API(self): "method":"data", "format":"json" } - m = json.loads(SQLAPIcall(query).execute())['data'] - self.assertEqual(len(m),5) + m = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + assert len(m) == 5 - def test_multiword_search(self): - from bookwormDB.general_API import SQLAPIcall as SQLAPIcall + def test_multiword_search(self, federalist_bookworm): import json query = { @@ -55,11 +80,10 @@ def test_multiword_search(self): "groups": [] } - m = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(m[0] > 33) + m = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + assert m[0] > 33 - def test_ne_with_one_entry(self): - from bookwormDB.general_API import SQLAPIcall as SQLAPIcall + def test_ne_with_one_entry(self, federalist_bookworm): import json query = { @@ -72,11 +96,10 @@ def test_ne_with_one_entry(self): "method":"data", "format":"json" } - m = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(len(m)==4) + m = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + assert len(m)==4 - def test_ne_with_two_entries(self): - from bookwormDB.general_API import SQLAPIcall as SQLAPIcall + def test_ne_with_two_entries(self, federalist_bookworm): import json query = { @@ -89,12 +112,11 @@ def test_ne_with_two_entries(self): "method":"data", "format":"json" } - m = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(len(m)==3) + m = json.loads(DuckDBCall(federalist_bookworm, query= query).execute())['data'] + assert len(m)==3 - def test_ne_with_two_entries(self): - from bookwormDB.general_API import SQLAPIcall as SQLAPIcall + def test_ne_with_two_entries(self, federalist_bookworm): import json query = { @@ -107,12 +129,11 @@ def test_ne_with_two_entries(self): "method":"data", "format":"json" } - m = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(len(m)==3) + m = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + assert len(m)==3 - def test_or_with_two_entries(self): - from bookwormDB.general_API import SQLAPIcall as SQLAPIcall + def test_or_with_two_entries(self, federalist_bookworm): import json query = { @@ -128,11 +149,10 @@ def test_or_with_two_entries(self): "method":"data", "format":"json" } - m = json.loads(SQLAPIcall(query).execute())['data'] - self.assertEqual(len(m),2) + m = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + assert len(m) == 2 - def test_lte_and_gte(self): - from bookwormDB.general_API import SQLAPIcall as SQLAPIcall + def test_lte_and_gte(self, federalist_bookworm): import json query = { @@ -145,11 +165,10 @@ def test_lte_and_gte(self): "method":"data", "format":"json" } - m = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(len(m)==6) + m = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + assert len(m)==6 - def test_and_with_two_entries(self): - from bookwormDB.general_API import 
SQLAPIcall as SQLAPIcall + def test_and_with_two_entries(self, federalist_bookworm): import json query = { @@ -165,10 +184,10 @@ def test_and_with_two_entries(self): "method":"data", "format":"json" } - m = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(len(m)==0) + m = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + assert len(m)==0 - def test_adding_metadata_to_bookworm(self): + def ftest_adding_metadata_to_bookworm(self): """ Build out some dummy metadata: label the difference between even and odd paragrahs. @@ -217,8 +236,7 @@ class Dummy(object): "method":"data", "format":"json" } - SQLAPIcall(query) - m = json.loads(SQLAPIcall(query).execute())['data'] +# m = json.loads(SQLAPIcall(query).execute())['data'] # Even or odd is one of two things. self.assertTrue(len(m)==2) @@ -227,7 +245,7 @@ class Dummy(object): self.assertTrue(m['odd'][0]>=m['even'][0]) - def test_case_sensitivity(self): + def test_case_sensitivity(self, federalist_bookworm): query = { "database":"federalist_bookworm", "search_limits":{"word":["the"]}, @@ -237,19 +255,17 @@ def test_case_sensitivity(self): "method":"data", "format":"json" } - SQLAPIcall(query) - val1 = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(val1[0] > 0) + val1 = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + assert(val1[0] > 0) query["words_collation"] = "Case_Insensitive" - SQLAPIcall(query) - val2 = json.loads(SQLAPIcall(query).execute())['data'] + val2= json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] # The words ('The','the') appear more often than ('the') alone. - self.assertTrue(val2[0] > val1[0]) + assert (val2[0] > val1[0]) - def test_case_insensitivity_works_without_search_term(self): + def test_case_insensitivity_works_without_search_term(self, federalist_bookworm): query = { "database":"federalist_bookworm", "search_limits":{"word":["hOwEvEr"]}, @@ -258,24 +274,22 @@ def test_case_insensitivity_works_without_search_term(self): "words_collation":"Case_Insensitive", "method":"data", "format":"json" } - SQLAPIcall(query) - val1 = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(val1[0] > 0) + val = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + assert (val[0] > 0) - def test_unicode_search_term(self): + def test_unicode_search_term(self, unicode_bookworm): query = { "database":"unicode_test_bookworm", - "search_limits":{"word":[u"ᎾᏍᎩ"]}, + "search_limits":{"word":["ᎾᏍᎩ"]}, "counttype":"WordCount", "groups":[], "words_collation":"Case_Insensitive", "method":"data", "format":"json" } - SQLAPIcall(query) - val1 = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(val1[0] > 0) + val = json.loads(DuckDBCall(unicode_bookworm, query = query).execute())['data'] + assert (val[0] > 0) - def test_various_unicode_cases(self): + def test_various_unicode_cases(self, federalist_bookworm): # There's a 'description_' for each individual item. 
catalog_location = sys.path[0] + "/test_bookworm_files_unicode/jsoncatalog.txt" cases = [json.loads(line)["description_"] for line in open(catalog_location)] @@ -288,11 +302,10 @@ def test_various_unicode_cases(self): "words_collation":"Case_Insensitive", "method":"data", "format":"json" } - SQLAPIcall(query) - val1 = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(val1[0] > 0) + val = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + assert(val[0] > 0) - def test_asterisks_in_search_limits(self): + def test_asterisks_in_search_limits(self, federalist_bookworm): """ The following two queries should, by definition, produce the same result. """ @@ -304,7 +317,7 @@ def test_asterisks_in_search_limits(self): "groups":[], "method":"data", "format":"json" } - val1 = json.loads(SQLAPIcall(query).execute())['data'] + val1 = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] query = { "database":"federalist_bookworm", @@ -313,46 +326,7 @@ def test_asterisks_in_search_limits(self): "groups":[], "method":"data", "format":"json" } - val2 = json.loads(SQLAPIcall(query).execute())['data'] - self.assertTrue(val1[0] == val2[0]) - - -""" -class SQLConnections(unittest.TestCase): - - - - def test_dunning(self): - query = { - "database":"federalist", - "search_limits":{"author":"Hamilton"}, - "compare_limits":{"author":"Madison"}, - "counttype":"Dunning", - "groups":["unigram"], - "method":"data", "format":"json" - } - + val2 = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + assert(val1[0] == val2[0]) - try: - #dbbindings.main(query) - worked = True - except: - worked = False - - self.assertTrue(worked) -""" - - -if __name__=="__main__": - # The setup is done without verbose logging; any failure - # causes it to try again. - logging.basicConfig(level=40) - try: - setup_bookworm() - setup_bookworm_unicode() - except: - logging.basicConfig(level=10) - setup_bookworm() - setup_bookworm_unicode() - logging.basicConfig(level=10) - unittest.main() + \ No newline at end of file diff --git a/tests/test_config.py b/tests/test_config.py deleted file mode 100644 index 69d853d..0000000 --- a/tests/test_config.py +++ /dev/null @@ -1,18 +0,0 @@ -# -*- coding: utf-8 -*- - -from bookwormDB.manager import BookwormManager -import unittest -import logging -import os -import sys - -class Bookworm_Configuration(unittest.TestCase): - - def test_config(self): - bookworm = BookwormManager(None, "federalist_bookworm") - - -if __name__=="__main__": - # The setup is done without verbose logging; any failure - # causes it to try again. 
- unittest.main() diff --git a/tests/test_creation.py b/tests/test_creation.py new file mode 100644 index 0000000..1e332b0 --- /dev/null +++ b/tests/test_creation.py @@ -0,0 +1,20 @@ +import pytest +from pathlib import Path +from bookwormDB.builder import BookwormCorpus + +class TestCreation(): + def test_ascii_creation(self, tmpdir): + corp = BookwormCorpus( + f"{tmpdir}/federalist.duckdb", + texts = Path('tests/test_bookworm_files/input.txt'), + metadata = "tests/test_bookworm_files/jsoncatalog.txt", + dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) + corp.build() + + def test_unicode_creation(self, tmpdir): + corp = BookwormCorpus( + f"{tmpdir}/unicode.duckdb", + texts = Path('tests/test_bookworm_files_unicode/input.txt'), + metadata = "tests/test_bookworm_files_unicode/jsoncatalog.txt", + dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) + corp.build() \ No newline at end of file diff --git a/tests/test_mysql.py b/tests/test_mysql.py deleted file mode 100644 index 89f49a1..0000000 --- a/tests/test_mysql.py +++ /dev/null @@ -1,63 +0,0 @@ -from builtins import hex -import unittest -import bookwormDB -from bookwormDB.configuration import Configfile -import bookwormDB.CreateDatabase -import logging -import MySQLdb -import random - -logging.basicConfig(level=10) - - -""" -Tests of the MySQL configuration. -""" - -class Bookworm_MySQL_Configuration(unittest.TestCase): - def test_server_connection(self): - logging.info("\n\nTESTING SERVER CONNECTION\n\n") - """ - Connect to MySQL and run a simple query. - """ - import bookwormDB.CreateDatabase - db = bookwormDB.CreateDatabase.DB(dbname="mysql") - sampleQuery=db.query("SELECT 1+1").fetchall() - self.assertTrue(sampleQuery[0][0]==2) - - """ - To properly test things, we actually build some bookworms. - This assumes that the directory '/tmp' is writeable, - which isn't strictly necessary for a bookworm to be built. 
- """ - - def test_config_files(self): - logging.info("\n\nTESTING CONFIG FILE ACCESS\n\n") - def test_config_file(conf): - user = conf.config.get("client","user") - pw = conf.config.get("client","password") - return (user, pw) - - global_configuration_file = Configfile("read_only") - admin_configuration_file = Configfile("admin") - - (admin_user,admin_pw) = test_config_file(global_configuration_file) - (client_user,client_pw) = test_config_file(admin_configuration_file) - logging.info("admin user is {} and password is {}".format(admin_user,admin_pw)) - logging.info("client user is {} and password is {}".format(client_user,client_pw)) - logging.info("Checking that admin and client users are distinct") - self.assertTrue(admin_user != client_user) - - def test_createDB_permission(self): - logging.info("\nTESTING ABILITY TO CREATE DATABASES\n\n") - import bookwormDB.configuration - dbname = "A" + hex(random.getrandbits(128))[2:-1] - import bookwormDB.CreateDatabase - db = bookwormDB.CreateDatabase.DB(dbname="mysql") - cursor = db.query("CREATE DATABASE {}".format(dbname)) - cursor.execute("DROP DATABASE {}".format(dbname)) - cursor.close() - - -if __name__=="__main__": - unittest.main() From a11204af5cb708e620b2446713d56ce3f80eea21 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Sun, 23 May 2021 22:26:54 -0400 Subject: [PATCH 29/41] Refactorings --- bookwormDB/duckdb.py | 3 +- bookwormDB/general_API.py | 150 +++++++++++++------------------------- bookwormDB/manager.py | 8 +- 3 files changed, 58 insertions(+), 103 deletions(-) diff --git a/bookwormDB/duckdb.py b/bookwormDB/duckdb.py index 8462eee..748bb17 100644 --- a/bookwormDB/duckdb.py +++ b/bookwormDB/duckdb.py @@ -798,8 +798,6 @@ class databaseSchema(object): """ def __init__(self, db): - # XXXX - self.db = db # has of what table each variable is in self.tableToLookIn = {} @@ -819,6 +817,7 @@ def __init__(self, db): # The aliases starts with a dummy alias for fully grouped queries. self.aliases = {} + self.db = db self.newStyle(db) diff --git a/bookwormDB/general_API.py b/bookwormDB/general_API.py index 1f7bc45..828be47 100644 --- a/bookwormDB/general_API.py +++ b/bookwormDB/general_API.py @@ -323,7 +323,7 @@ def validate_query(self): def ensure_query_has_required_fields(self): required_fields = ['counttype', 'groups', 'database'] - if self.query['method'] in ['schema', 'search']: + if self.query['method'] in ['schema', 'search', 'returnPossibleFields']: required_fields = ['database'] for field in required_fields: @@ -438,117 +438,72 @@ def get_data_from_source(self): def execute(self): method = self.query['method'] - logging.debug("Preparing to execute with method '{}'".format(method)) fmt = self.query['format'] if 'format' in self.query else False + if not 'method' in self.query: + return "You must pass a method to the query." 
+ if method=="returnPossibleFields": + method = "json_c" + self.query['method'] = "schema" + method = "schema" - if method == 'data' or method == 'schema' or method == 'search': - version = 2 - if fmt in ['json_c', 'search', 'html', 'csv', 'tsv']: - version = 3 - else: - version = 1 - - if version == 1: + try: # What to do with multiple search_limits + if isinstance(self.query['search_limits'], list): - if method in ["json", "return_json"]: - self.query['method'] = 'data' - self.query['format'] = 'json' - return self.multi_execute(version=version) + if fmt == "json" or version >= 3: + frame = self.multi_execute(version = version) else: # Only return first search limit if not return in json self.query['search_limits'] = self.query['search_limits'][0] - - form = method[7:] if method[:6] == 'return' else method - - logging.warning("method == \"%s\" is deprecated. Use method=\"data\" " - "with format=\"%s\" instead." % (method, form)) - - if method == "return_json" or method == "json": - self.query['method'] = 'data' - self.query['format'] = 'json' - return self.return_json(version=1) - - elif method == "return_csv" or method == "csv": - self.query['method'] = 'data' - self.query['format'] = 'json' + else: frame = self.data() - return frame.to_csv(path = None, sep="\t", encoding="utf8", index=False, - quoting=csv.QUOTE_NONE, escapechar="\\") - elif version >= 2: - try: - # What to do with multiple search_limits - - if isinstance(self.query['search_limits'], list): - if fmt == "json" or version >= 3: - frame = self.multi_execute(version = version) - else: - # Only return first search limit if not return in json - self.query['search_limits'] = self.query['search_limits'][0] - else: - frame = self.data() - if fmt == "json": - return self.return_json(version=2) + if fmt == "json": + return self.return_json(version=2) - if fmt == "csv": - return frame.to_csv(encoding="utf8", index=False) + if fmt == "csv": + return frame.to_csv(encoding="utf8", index=False) - if fmt == "tsv": - return frame.to_csv(sep="\t", encoding="utf8", index=False) + if fmt == "tsv": + return frame.to_csv(sep="\t", encoding="utf8", index=False) - if fmt == "feather" or fmt == "feather_js": - compression = "zstd" - if fmt == "feather_js": - compression = "uncompressed" - fout = io.BytesIO(b'') - try: - feather.write_feather(frame, fout, compression = compression) - except: - logging.warning("You need the pyarrow package installed to export as feather.") - raise - fout.seek(0) - return fout.read() + if fmt == "feather" or fmt == "feather_js": + compression = "zstd" + if fmt == "feather_js": + compression = "uncompressed" + fout = io.BytesIO(b'') + try: + feather.write_feather(frame, fout, compression = compression) + except: + logging.warning("You need the pyarrow package installed to export as feather.") + raise + fout.seek(0) + return fout.read() - if fmt == 'json_c': - return self.return_rle_json(frame) + if fmt == 'json_c': + return self.return_rle_json(frame) - if fmt == 'html': - return self.html(frame) + if fmt == 'html': + return self.html(frame) - else: - err = dict(status="error", code=200, - message="Only formats in ['csv', 'tsv', 'json', 'feather']" - " currently supported") - return json.dumps(err) - except BookwormException as e: - # Error status codes are HTTP codes - # http://www.restapitutorial.com/httpstatuscodes.html - err = e.args[0] - err['status'] = "error" + else: + err = dict(status="error", code=200, + message="Only formats in ['csv', 'tsv', 'json', 'feather']" + " currently supported") return 
json.dumps(err) - except Exception as ex: - # General Uncaught error. - logging.exception("{}".format(ex)) - logging.exception("Database error") - return json.dumps({"status": "error", "message": "Database error. " - "Try checking field names."}) - - # Temporary catch-all pushes to the old methods: - if method in ["returnPossibleFields", "search_results", - "return_books", "schema"]: - try: - logging.warn("Using deprecated API call.") - - query = userquery(self.query) - if method == "return_books": - return query.execute() - return json.dumps(query.execute()) - except Exception as e: - if len(str(e)) > 1 and e[1].startswith("Unknown database"): - return "No such bookworm {}".format(e[1].replace("Unknown database","")) - except: - return "General error" + except BookwormException as e: + # Error status codes are HTTP codes + # http://www.restapitutorial.com/httpstatuscodes.html + err = e.args[0] + err['status'] = "error" + return json.dumps(err) + except Exception as ex: + # General Uncaught error. + logging.exception("{}".format(ex)) + logging.exception("Database error") + return json.dumps({"status": "error", "message": "Database error. " + "Try checking field names."}) + def multi_execute(self, version=1): @@ -873,5 +828,4 @@ def generate_pandas_frame(self, call = None) -> DataFrame: if random.random() < .1: # Don't bother doing this every time. self.cache.trim_cache() - return resolution - + return resolution \ No newline at end of file diff --git a/bookwormDB/manager.py b/bookwormDB/manager.py index f68a530..42d4d1c 100644 --- a/bookwormDB/manager.py +++ b/bookwormDB/manager.py @@ -114,11 +114,13 @@ def query(self, args): Run a query against the API from the command line. """ - from bookwormDB.general_API import SQLAPIcall + from bookwormDB.general_API import DuckDBCall import json - + import duckdb query = json.loads(args.APIcall) - caller = SQLAPIcall(query) + print(query) + con = duckdb.connect("/drobo/bookworm_dbs/" + query['database'], read_only = True) + caller = DuckDBCall(con, query = query) print(caller.execute()) def serve(self, args): From a5522292cfe48a9946cdcdc2e4a9a1bf50f3e0d8 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Fri, 28 May 2021 23:05:54 -0400 Subject: [PATCH 30/41] Logging refactor, shift nc input expectations --- bookwormDB/DuckSchema.py | 3 + bookwormDB/MetaParser.py | 329 ------------------------------ bookwormDB/builder.py | 45 +++- bookwormDB/countManager.py | 36 ++-- bookwormDB/duckdb.py | 11 +- bookwormDB/general_API.py | 14 +- bookwormDB/manager.py | 44 ++-- bookwormDB/multiprocessingHelp.py | 3 +- bookwormDB/query_cache.py | 4 +- bookwormDB/wsgi.py | 10 +- tests/setup.py | 9 +- 11 files changed, 110 insertions(+), 398 deletions(-) delete mode 100644 bookwormDB/MetaParser.py diff --git a/bookwormDB/DuckSchema.py b/bookwormDB/DuckSchema.py index a9e64a7..9124203 100644 --- a/bookwormDB/DuckSchema.py +++ b/bookwormDB/DuckSchema.py @@ -1,5 +1,8 @@ import pyarrow as pa from base64 import b64decode +import logging +logger = logging.getLogger("bookworm") + class DuckSchema(object): """ This class stores information about the database setup that is used to diff --git a/bookwormDB/MetaParser.py b/bookwormDB/MetaParser.py deleted file mode 100644 index 6bc9629..0000000 --- a/bookwormDB/MetaParser.py +++ /dev/null @@ -1,329 +0,0 @@ -from __future__ import division -from datetime import date -import datetime -import dateutil.parser -import json -import sys -import os -import logging -from multiprocessing import Queue, Process -from queue import Empty -from 
.multiprocessingHelp import mp_stats, running_processes -import time - - -defaultDate = datetime.datetime(datetime.MINYEAR, 1, 1) - -def DaysSinceZero(dateobj): - #Zero isn't a date, which python knows but MySQL and javascript don't. - return (dateobj - date(1,1,1)).days + 366 - -mySQLreservedWords = set(["ACCESSIBLE", "ADD", -"ALL", "ALTER", "ANALYZE", "AND", "AS", "ASC", "ASENSITIVE", "BEFORE", -"BETWEEN", "BIGINT", "BINARY", "BLOB", "BOTH", "BY", "CALL", -"CASCADE", "CASE", "CHANGE", "CHAR", "CHARACTER", "CHECK", "COLLATE", -"COLUMN", "CONDITION", "CONSTRAINT", "CONTINUE", "CONVERT", "CREATE", -"CROSS", "CURRENT_DATE", "CURRENT_TIME", "CURRENT_TIMESTAMP", -"CURRENT_USER", "CURSOR", "DATABASE", "DATABASES", "DAY_HOUR", -"DAY_MICROSECOND", "DAY_MINUTE", "DAY_SECOND", "DEC", "DECIMAL", -"DECLARE", "DEFAULT", "DELAYED", "DELETE", "DESC", "DESCRIBE", -"DETERMINISTIC", "DISTINCT", "DISTINCTROW", "DIV", "DOUBLE", "DROP", -"DUAL", "EACH", "ELSE", "ELSEIF", "ENCLOSED", "ESCAPED", "EXISTS", -"EXIT", "EXPLAIN", "FALSE", "FETCH", "FLOAT", "FLOAT4", "FLOAT8", -"FOR", "FORCE", "FOREIGN", "FROM", "FULLTEXT", "GENERAL", "GRANT", -"GROUP", "HAVING", "HIGH_PRIORITY", "HOUR_MICROSECOND", "HOUR_MINUTE", -"HOUR_SECOND", "IF", "IGNORE", "IGNORE_SERVER_IDS", "IN", "INDEX", -"INFILE", "INNER", "INOUT", "INSENSITIVE", "INSERT", "INT", "INT1", -"INT2", "INT3", "INT4", "INT8", "INTEGER", "INTERVAL", "INTO", "IS", -"ITERATE", "JOIN", "KEY", "KEYS", "KILL", "LEADING", "LEAVE", "LEFT", -"LIKE", "LIMIT", "LINEAR", "LINES", "LOAD", "LOCALTIME", -"LOCALTIMESTAMP", "LOCK", "LONG", "LONGBLOB", "LONGTEXT", "LOOP", -"LOW_PRIORITY", "MASTER_HEARTBEAT_PERIOD[c]", -"MASTER_SSL_VERIFY_SERVER_CERT", "MATCH", "MAXVALUE", "MEDIUMBLOB", -"MEDIUMINT", "MEDIUMTEXT", "MIDDLEINT", "MINUTE_MICROSECOND", -"MINUTE_SECOND", "MOD", "MODIFIES", "NATURAL", "NOT", -"NO_WRITE_TO_BINLOG", "NULL", "NUMERIC", "ON", "OPTIMIZE", "OPTION", -"OPTIONALLY", "OR", "ORDER", "OUT", "OUTER", "OUTFILE", "PRECISION", -"PRIMARY", "PROCEDURE", "PURGE", "RANGE", "READ", "READS", -"READ_WRITE", "REAL", "REFERENCES", "REGEXP", "RELEASE", "RENAME", -"REPEAT", "REPLACE", "REQUIRE", "RESIGNAL", "RESTRICT", "RETURN", -"REVOKE", "RIGHT", "RLIKE", "SCHEMA", "SCHEMAS", "SECOND_MICROSECOND", -"SELECT", "SENSITIVE", "SEPARATOR", "SET", "SHOW", "SIGNAL", -"SLOW[d]", "SMALLINT", "SPATIAL", "SPECIFIC", "SQL", "SQLEXCEPTION", -"SQLSTATE", "SQLWARNING", "SQL_BIG_RESULT", "SQL_CALC_FOUND_ROWS", -"SQL_SMALL_RESULT", "SSL", "STARTING", "STRAIGHT_JOIN", "TABLE", -"TERMINATED", "THEN", "TINYBLOB", "TINYINT", "TINYTEXT", "TO", -"TRAILING", "TRIGGER", "TRUE", "UNDO", "UNION", "UNIQUE", "UNLOCK", -"UNSIGNED", "UPDATE", "USAGE", "USE", "USING", "UTC_DATE", "UTC_TIME", -"UTC_TIMESTAMP", "VALUES", "VARBINARY", "VARCHAR", "VARCHARACTER", -"VARYING", "WHEN", "WHERE", "WHILE", "WITH", "WRITE", "XOR", -"YEAR_MONTH", "ZEROFILL", "WORDS", "NWORDS", "WORD", "UNIGRAM"]) - - -def ParseFieldDescs(write = False): - f = open('field_descriptions.json', 'r') - try: - fields = json.loads(f.read()) - except ValueError: - raise ValueError("Error parsing JSON: Check to make sure that your field_descriptions.json file is valid.") - - if write: - derivedFile = open('.bookworm/metadata/field_descriptions_derived.json', 'w') - - output = [] - - fields_to_derive = [] - - for field in fields: - if field["field"].upper() in mySQLreservedWords: - raise NameError(f"{field['field']} is a reserved word but appears" - "in field_description.json. 
Please choose a different name for" - "the column.") - for character in [" ","-", "&","+","."]: - if character in field['field']: - raise NameError(f"{field['field']} contains a special character, please rename") - - if field["datatype"] == "time": - if "derived" in field: - fields_to_derive.append(field) - else: - if field['type'] == "character": - field['derived'] = [ - {"resolution": "year"}, - {"resolution": "day"} - ] - fields_to_derive.append(field) - elif field['type'] == "integer": - output.append(field) - else: - raise TypeError("Unable to parse temporal field " + field['field']) - else: - output.append(field) - - for field in fields_to_derive: - for derive in field["derived"]: - if "aggregate" in derive: - tmp = dict(datatype="time", type="integer", unique=True) - tmp["field"] = '_'.join([field["field"], derive["resolution"], - derive["aggregate"]]) - output.append(tmp) - else: - tmp = dict(datatype="time", type="integer", unique=True) - tmp["field"] = '_'.join([field["field"], derive["resolution"]]) - output.append(tmp) - if write: - derivedFile.write(json.dumps(output)) - derivedFile.close() - - return (fields_to_derive, fields) - - -def parse_json_catalog(line_queue, processes, modulo): - fields_to_derive, fields = ParseFieldDescs(write = False) - - if os.path.exists("jsoncatalog.txt"): - mode = "json" - fin = open("jsoncatalog.txt") - - if os.path.exists("catalog.csv"): - mode = "csv" - import csv - fin = csv.DictReader("catalog.csv") - - for i, line in enumerate(fin): - if i % processes != modulo: - continue - - for char in ['\t', '\n']: - line = line.replace(char, '') - - if mode == "json": - try: - line = json.loads(line) - except: - logging.error(f"Invalid json in line {i}\n:{line}" - "The input file must be in ndjson format (http://ndjson.org/)") - raise - - for field in fields: - # Smash together misidentified lists - try: - if field['unique'] and isinstance(line[field["field"]],list): - line[field["field"]] = "--".join(line[field["field"]]) - except KeyError: - pass - - for field in fields_to_derive: - - """ - Using fields_to_derive as a shorthand for dates--this may break - if we get more ambitious about derived fields, - but this whole metadata-parsing code needs to be refactored anyway. - - Note: this code is inefficient--it parses the same date multiple times. - We should be parsing the date once and pulling - derived fields out of that one parsing. 
- """ - - try: - if line[field["field"]]=="": - # Use blankness as a proxy for unknown - continue - - time = dateutil.parser.parse(line[field["field"]],default = defaultDate) - intent = [time.year,time.month,time.day] - content = [str(item) for item in intent] - - pass - except: - """ - Fall back to parsing as strings - """ - try: - datem = line[field["field"]].split("T")[0] - content = datem.split('-') - intent = [int(item) for item in content] - except KeyError: - #It's OK not to have an entry for a time field - continue - except ValueError: - # Thrown if fields are empty on taking the int value: treat as junk - continue - except AttributeError: - """ - Happens if it's an integer, which is a forgiveable way - to enter a year: - """ - content = [str(line[field['field']])] - intent = [line[field['field']]] - else: - for derive in field["derived"]: - try: - if "aggregate" in derive: - if derive["resolution"] == 'day' and \ - derive["aggregate"] == "year": - k = "%s_day_year" % field["field"] - dt = date(intent[0], intent[1], intent[2]) - line[k] = dt.timetuple().tm_yday - elif derive["resolution"] == 'day' and \ - derive["aggregate"] == "month": - k = "%s_day_month" % field["field"] - line[k] = intent[2] - elif derive["resolution"] == 'day' and \ - derive["aggregate"] == "week": - k = "%s_day_month" % field["field"] - dt = date(intent[0], intent[1], intent[2]) - # Python and javascript handle weekdays differently: - # Like JS, we want to begin on Sunday with zero - line[k] = dt.weekday() + 1 - if (line[k]) == 7: - line[k] = 0 - elif derive["resolution"] == 'month' and \ - derive["aggregate"] == "year": - k = "%s_month_year" % field["field"] - dt = date(1,intent[1],1) - line[k] = dt.timetuple().tm_yday - elif derive["resolution"] == 'week' and \ - derive["aggregate"] == "year": - dt = date(intent[0], intent[1], intent[2]) - k = "%s_week_year" % field["field"] - line[k] = int(dt.timetuple().tm_yday/7)*7 - elif derive["resolution"] == 'hour' and \ - derive["aggregate"] == "day": - k = "%s_hour_day" % field["field"] - line[k] = time.hour - elif derive["resolution"] == 'minute' and \ - derive["aggregate"] == "day": - k = "%s_hour_day" % field["field"] - line[k] = time.hour*60 + time.minute - else: - logging.warning('Problem with aggregate resolution.') - continue - else: - if derive["resolution"] == 'year': - line["%s_year" % field["field"]] = intent[0] - elif derive["resolution"] == 'month': - try: - k = "%s_month" % field["field"] - dt = date(intent[0], intent[1], 1) - line[k] = DaysSinceZero(dt) - except: - logging.warning("Problem with date fields\n") - pass - elif derive['resolution'] == 'week': - k = "%s_week" % field['field'] - dt = date(intent[0], intent[1], intent[2]) - inttime = DaysSinceZero(dt) - time = int(inttime/7)*7 - #Not starting on Sunday or anything funky like that. Actually, I don't know what we're starting on. Adding an integer here would fix that. - line[k] = time - elif derive['resolution'] == 'day': - k = "%s_day" % field['field'] - dt = date(intent[0], intent[1], intent[2]) - inttime = DaysSinceZero(dt) - line[k] = inttime - else: - logging.warning('Resolution %s currently not supported.' % (derive['resolution'])) - continue - except ValueError: - # One of out a million Times articles threw this with - # a year of like 111,203. It's not clear how best to - # handle this. - logging.warning("ERROR: %s " % line[field["field"]] + - "did not convert to proper date. 
Moving on...") - # raise - pass - except Exception as e: - logging.warning('*'*50) - logging.warning('ERROR: %s\nINFO: %s\n' % (str(e), e.__doc__)) - logging.warning('*'*50) - line.pop(field["field"]) - try: - el = json.dumps(line) - line_queue.put((line["filename"], el)) - except KeyError: - logging.warning("No filename key in {}".format(line)) - except: - logging.warning("Error on {}".format(line)) - raise - logging.debug("Metadata thread done after {} lines".format(i)) - - -def parse_catalog_multicore(): - from .sqliteKV import KV - cpus, _ = mp_stats() - encoded_queue = Queue(10000) - workers = [] - - for i in range(cpus): - p = Process(target = parse_json_catalog, args = (encoded_queue, cpus, i)) - p.start() - workers.append(p) - output = open(".bookworm/metadata/jsoncatalog_derived.txt", "w") - - bookids = KV(".bookworm/metadata/textids.sqlite") - import sqlite3 - - while True: - try: - filename, n = encoded_queue.get_nowait() - output.write(n + "\n") - ids = set() - try: - bookids.register(filename) - except sqlite3.IntegrityError: - if filename in ids: - logging.warning("Duplicate key insertion {}".format(filename)) - ids.add(filename) - - except Empty: - if running_processes(workers): - # Give it a sec to fill back up to avoid this thread taking up - # a full processor. - time.sleep(0.01) - else: - # We're done! - break - - bookids.close() - output.close() diff --git a/bookwormDB/builder.py b/bookwormDB/builder.py index 94eecad..1b206ae 100644 --- a/bookwormDB/builder.py +++ b/bookwormDB/builder.py @@ -6,6 +6,9 @@ from nonconsumptive import Corpus from nonconsumptive.metadata import Catalog from pathlib import Path +import logging +from pyarrow import feather +logger = logging.getLogger("bookworm") class BookwormCorpus(Corpus): """ @@ -25,8 +28,12 @@ def encoded_batches(self): def bookworm_name(self): return self.db_location.with_suffix("").name - def prepare_parquet_ingest_file(self): - quacksort(self.encoded_batches(), ['wordid', 'bookid'], self.root / 'unigram_bookid.parquet', block_size = 1_000_000_000) + def sort_parquet_unigrams(self): + dest = self.root / 'unigram_bookid.parquet' + if dest.exists(): + logger.warning(f"Using existed sorted unigrams at {dest} without checking if they're out of date.") + return + quacksort(self.encoded_batches(), ['wordid', 'bookid'], self.root / 'unigram_bookid.parquet', block_size = 5_000_000_000) def prepare_metadata(self): self.metadata.to_flat_catalog() @@ -78,12 +85,6 @@ def create_table_schemas(self): (tabname, b64encode(pa.parquet.ParquetFile(tab).schema_arrow.serialize().to_pybytes()), "table")) - def update_wordcounts(self): - rel = self.con.register_arrow("my_nwords", self.document_wordcounts(key='bookid')) - self.con.execute("ALTER TABLE fastcat ADD nwords INT32") - rel.execute("UPDATE fastcat SET nwords = s.nwords FROM my_nwords as s WHERE s.bookid = fastcat.bookid") - rel.unregister_arrow("my_nwords") - def create_slow_catalog(self): con = self.con catcols = set(con.execute("DESCRIBE TABLE catalog").df()['Field']) @@ -95,13 +96,37 @@ def create_slow_catalog(self): unique.append(f'"{col}"') con.execute(f"CREATE VIEW slowcat AS SELECT {','.join(unique)} FROM catalog") + def ingest_wordcounts(self): + self.con.execute('CREATE TABLE nwords ("@id" VARCHAR, "nwords" INTEGER)') + + for p in (self.root / "document_lengths").glob("*.feather"): + tb = feather.read_table(p) + rel = self.con.register_arrow("t", tb) + self.con.execute("INSERT INTO nwords SELECT * FROM t") + self.con.unregister("t") + + self.con.execute("ALTER TABLE catalog 
ADD nwords INTEGER") + self.con.execute('UPDATE catalog SET nwords = nwords.nwords FROM nwords WHERE "catalog"."@id" = "nwords"."@id"') + self.con.execute("ALTER TABLE fastcat ADD nwords INTEGER") + self.con.execute('UPDATE fastcat SET nwords = catalog.nwords FROM catalog WHERE catalog.bookid = fastcat.bookid') + def build(self): - self.prepare_parquet_ingest_file() + logger.info("Preparing metadata") self.prepare_metadata() + logger.info("Sorting unigrams for duck ingest") + self.sort_parquet_unigrams() + logger.info("Ingesting unigrams") self.ingest_unigrams() +# logger.warning("Ingesting bigrams") + logger.info("Ingesting metadata") + self.ingest_metadata() + logger.info("Creating schemas for load") + + self.ingest_wordcounts() self.create_table_schemas() - self.update_wordcounts() + + logger.info("Building slow catalog view") self.create_slow_catalog() diff --git a/bookwormDB/countManager.py b/bookwormDB/countManager.py index 770b874..1012659 100644 --- a/bookwormDB/countManager.py +++ b/bookwormDB/countManager.py @@ -9,6 +9,8 @@ import psutil import queue import logging +logger = logging.getLogger("bookworm") + import fileinput import time import csv @@ -27,10 +29,10 @@ # Assume 200 bytes per entry in python dict. QUEUE_POST_THRESH = int(memory / 3 * 1024 * 1024 / 200 / cpus) -logging.debug("Ideal queue size is {}".format(QUEUE_POST_THRESH)) +logger.debug("Ideal queue size is {}".format(QUEUE_POST_THRESH)) QUEUE_POST_THRESH = max([100000, QUEUE_POST_THRESH]) -logging.info("Filling dicts to size {}".format(QUEUE_POST_THRESH)) +logger.info("Filling dicts to size {}".format(QUEUE_POST_THRESH)) def flush_counter(counter, qout): for k in ['', '\x00']: @@ -61,7 +63,7 @@ def counter(qout, i, fin, mode = "count"): datatype = "raw" count_signals = [".unigrams", ".bigrams", ".trigrams", ".quadgrams"] - logging.info(f"fin is {fin}") + logger.info(f"fin is {fin}") for signal in count_signals: if signal in fin: @@ -90,7 +92,7 @@ def counter(qout, i, fin, mode = "count"): # Cleanup. 
if mode == "count": - logging.debug("Flushing leftover counts from thread {}".format(i)) + logger.debug("Flushing leftover counts from thread {}".format(i)) flush_counter(counter=counter, qout = qout) if mode == "encode": encoder.close() @@ -112,32 +114,32 @@ def yield_texts_from_directory(dir, i, IDfile): try: id = IDfile[basename] except KeyError: - logging.info(f"No catalog entry for {basename} at {file.name}, skipping") + logger.info(f"No catalog entry for {basename} at {file.name}, skipping") continue # Use sha256 key = int(hashlib.md5(basename.encode('utf-8')).hexdigest(), 16) - logging.info(basename, key) + logger.info(basename, key) if key % cpus != i: continue if file.name.endswith(".txt.gz"): try: fin = gzip.open(file, mode="rt") except UnicodeDecodeError: - logging.error(f"Unable to read {file}: unicode error") + logger.error(f"Unable to read {file}: unicode error") continue except gzip.BadGzipFile: - logging.error(f"Unable to read {file}: Bad gzip file") + logger.error(f"Unable to read {file}: Bad gzip file") continue elif file.name.endswith(".txt"): fin = open(file) else: - logging.error(f"Can't handle file {file}") + logger.error(f"Can't handle file {file}") try: yield (basename, fin.read().replace("\t", "\f").replace("\n", "\f")) except UnicodeDecodeError: - logging.error(f"Unable to read {file}") + logger.error(f"Unable to read {file}") except gzip.BadGzipFile: - logging.error(f"Unable to read {file}: Bad gzip file") + logger.error(f"Unable to read {file}: Bad gzip file") continue def yield_lines_from_single_file(fname, i, IDfile): @@ -162,12 +164,12 @@ def yield_lines_from_single_file(fname, i, IDfile): try: id = IDfile[filename] except KeyError: - logging.warning(f"No catalog entry for {id} though it appears in {filename}, skipping") + logger.warn(f"No catalog entry for {id} though it appears in {filename}, skipping") continue yield (filename, text) if totals > 0 and errors/totals > 0.01: - logging.warning("Skipped {} rows without tabs".format(errors)) + logger.warn("Skipped {} rows without tabs".format(errors)) def create_counts(input): @@ -179,7 +181,7 @@ def create_counts(input): qout = Queue(cpus * 2) workers = [] - logging.info("Spawning {} count processes on {}".format(cpus, input)) + logger.info("Spawning {} count processes on {}".format(cpus, input)) for i in range(cpus): p = Process(target = counter, args = (qout, i, input, "count")) p.start() @@ -191,7 +193,7 @@ def create_counts(input): try: input_dict = qout.get_nowait() - logging.debug("inputting queue of length {} from worker".format(len(input_dict))) + logger.debug("inputting queue of length {} from worker".format(len(input_dict))) wordcounter.update(input_dict) except queue.Empty: @@ -217,8 +219,8 @@ def create_wordlist(n, input, output): counter = create_counts(input) counter = sorted(list(counter.iteritems()), key = lambda x: -1 * x[1]) output = open(output, "w") - logging.info(f"Created wordlist from {input}") - logging.info(f"top 10 words are {[c for c in counter[:10]]}") + logger.info(f"Created wordlist from {input}") + logger.info(f"top 10 words are {[c for c in counter[:10]]}") for i, (k, v) in enumerate(counter): output.write("{}\t{}\t{}\n".format(i, k, v)) if i >= n: diff --git a/bookwormDB/duckdb.py b/bookwormDB/duckdb.py index 1c934e0..0d7dc1f 100644 --- a/bookwormDB/duckdb.py +++ b/bookwormDB/duckdb.py @@ -8,12 +8,13 @@ import copy import hashlib import logging +logger = logging.getLogger("bookworm") def fail_if_nonword_characters_in_columns(input): keys = all_keys(input) for key in keys: if 
re.search(r"[^A-Za-z_$*0-9]", key): - logging.error("{} has nonword character".format(key)) + logger.error("{} has nonword character".format(key)) raise def all_keys(input): @@ -200,7 +201,7 @@ def wordid_query(self): if self.wordswhere != " TRUE ": f = "SELECT wordid FROM { words } as words1 WHERE { wordswhere }".format(**self.__dict__) - logging.debug("`" + self.wordswhere + "`") + logger.debug("`" + self.wordswhere + "`") return " wordid IN ({})".format(f) else: return " TRUE " @@ -355,7 +356,7 @@ def wordswhere(self): selectString = f"SELECT wordid FROM wordsheap WHERE \"{word_field}\" = '{searchingFor}'" - logging.warning(selectString) + logger.warning(selectString) self.db.execute(selectString) # Set the search key being used. @@ -480,7 +481,7 @@ def set_operations(self): def bookid_query(self): q = f""" {self.catwhere} """ - logging.debug("'{}'".format(self.catwhere)) + logger.debug("'{}'".format(self.catwhere)) if self.catwhere == "TRUE": self.bookid_where = " TRUE " else: @@ -568,7 +569,7 @@ def getActualSearchedWords(self): # Break bigrams into single words. words = ' '.join(words).split(' ') q = "SELECT word FROM {} WHERE {}".format(self.wordsheap, where_from_hash({self.word_field:words})) - logging.debug(q) + logger.debug(q) self.db.execute(q) self.actualWords = [item[0] for item in self.db.fetchall()] else: diff --git a/bookwormDB/general_API.py b/bookwormDB/general_API.py index 2c23257..b3f568c 100644 --- a/bookwormDB/general_API.py +++ b/bookwormDB/general_API.py @@ -11,6 +11,8 @@ import re import json import logging +logger = logging.getLogger("bookworm") + import numpy as np import csv import io @@ -328,7 +330,7 @@ def ensure_query_has_required_fields(self): for field in required_fields: if field not in self.query: - logging.error("Missing field: %s" % field) + logger.error("Missing field: %s" % field) err = dict(message="Bad query. Missing \"%s\" field" % field, code=400) raise BookwormException(err) @@ -391,7 +393,7 @@ def get_data_from_source(self): try: df1 = self.generate_pandas_frame(self.call1) rename(df1, "x") - logging.debug(self.call2) + logger.debug(self.call2) df2 = self.generate_pandas_frame(self.call2) rename(df2, "y") @@ -476,7 +478,7 @@ def execute(self): try: feather.write_feather(frame, fout, compression = compression) except: - logging.warning("You need the pyarrow package installed to export as feather.") + logger.warning("You need the pyarrow package installed to export as feather.") raise fout.seek(0) return fout.read() @@ -611,7 +613,7 @@ def tree(): for r in row ] except: - logging.warning(row) + logger.warning(row) pass destination[key] = row break @@ -721,9 +723,9 @@ def generate_pandas_frame(self, call = None): call = self.query q = DuckQuery(call, db = self.db).query() - logging.warning("Preparing to execute {}".format(q)) + logger.warning("Preparing to execute {}".format(q)) df = self.db.execute(q).df() - logging.debug("Query retrieved") + logger.debug("Query retrieved") return df diff --git a/bookwormDB/manager.py b/bookwormDB/manager.py index 703e4d0..18f1176 100644 --- a/bookwormDB/manager.py +++ b/bookwormDB/manager.py @@ -1,12 +1,14 @@ from __future__ import print_function import re -import logging + import sys import os import bookwormDB import argparse import nonconsumptive as nc from .store import store +import logging +logger = logging.getLogger("bookworm") """ This is the code that actually gets run from the command-line executable. 
@@ -45,7 +47,7 @@ def __init__(self, cnf_file=None, database=None): self.basedir = basedir break if self.basedir==None: - logging.debug("No bookworm directory found; hopefully this isn't a build call.") + logger.debug("No bookworm directory found; hopefully this isn't a build call.") if cnf_file is not None: config = configparser.ConfigParser(allow_no_value=True) @@ -87,7 +89,7 @@ def init(self, args): # Create a configuration file if not args.force: if os.path.exists(".bookworm"): - logging.error(""" + logger.error(""" You already have a folder named '.bookworm'. Probably you've already initialized a Bookworm here. """) @@ -117,10 +119,10 @@ def query(self, args): import json import duckdb query = json.loads(args.APIcall) - logging.info(query) + logger.info(query) con = duckdb.connect("/drobo/bookworm_dbs/" + query['database'], read_only = True) caller = DuckDBCall(con, query = query) - logging.info(caller.execute()) + logger.info(caller.execute()) def serve(self, args): @@ -193,7 +195,7 @@ def prep(self, args): That's a little groaty, I know. """ - logging.debug(args) + logger.debug(args) getattr(self, args.goal)(args) @@ -211,7 +213,7 @@ def wordlist(self, args): input = args.input if args.feature_counts: - logging.info(args.feature_counts) + logger.info(args.feature_counts) input = [a for a in args.feature_counts if 'unigrams' in a][0] create_wordlist(n = 1.5e06, input = input, @@ -272,7 +274,7 @@ def preDatabaseMetadata(self, args=None, **kwargs): # Doesn't need a created database yet, just needs access # to some pieces. Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase() - logging.info("Writing metadata to new catalog file...") + logger.info("Writing metadata to new catalog file...") Bookworm.variableSet.writeMetadata() # This creates helper files in the /metadata/ folder. 
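The query() hunk above parses a JSON API call and hands it, together with a read-only DuckDB connection, to DuckDBCall. A hedged sketch of that flow, reusing the query shape exercised by the tests later in this series (the database path, search term, and grouping field are illustrative):

import json
import duckdb
from bookwormDB.general_API import DuckDBCall

# Open the per-bookworm DuckDB file read-only, as query() does.
con = duckdb.connect("/drobo/bookworm_dbs/federalist_bookworm", read_only=True)

query = {
    "database": "federalist_bookworm",
    "search_limits": {"word": ["congress"]},
    "counttype": "WordCount",
    "groups": ["author"],
    "method": "data",
    "format": "json",
}

# execute() returns a JSON string; the counts live under the 'data' key.
rows = json.loads(DuckDBCall(con, query=query).execute())["data"]
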
@@ -286,9 +288,9 @@ def derived_catalog(self, args): from bookwormDB.MetaParser import parse_catalog_multicore, ParseFieldDescs - logging.debug("Preparing to write field descriptions") + logger.debug("Preparing to write field descriptions") ParseFieldDescs(write = True) - logging.debug("Preparing to write catalog") + logger.debug("Preparing to write catalog") parse_catalog_multicore() def field_descriptions_from_csv(self): @@ -329,22 +331,22 @@ def reload_memory(self,args): cursor = datahandler.db.query("SELECT TABLE_SCHEMA FROM information_schema.tables WHERE TABLE_NAME='masterTableTable'") for row in cursor.fetchall(): dbnames.append(row[0]) - logging.info("The following databases are bookworms to be reloaded:") + logger.info("The following databases are bookworms to be reloaded:") for name in dbnames: - logging.info("\t" + name) + logger.info("\t" + name) for database in dbnames: - logging.info("Reloading memory tables for %s" %database) + logger.info("Reloading memory tables for %s" %database) Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(database,variableFile=None) Bookworm.reloadMemoryTables(force=args.force) def database_metadata(self, args): import bookwormDB.CreateDatabase - logging.debug("creating metadata db") + logger.debug("creating metadata db") Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname) Bookworm.variableSet.loadMetadata() - logging.debug("creating metadata variable tables") + logger.debug("creating metadata variable tables") # This creates a table in the database that makes the results of # field_descriptions accessible through the API, and updates the @@ -418,15 +420,15 @@ def __init__(self,args,basedir="./"): def clone_or_pull(self): if not os.path.exists(self.dir): - logging.info("cloning git repo from " + self.args.url) + logger.info("cloning git repo from " + self.args.url) call(["git","clone",self.args.url,self.dir]) else: - logging.info("updating pre-existing git repo at " + self.dir) + logger.info("updating pre-existing git repo at " + self.dir) Popen(["git","pull"],cwd=self.dir) def make(self): - logging.debug("~~~~~~~~~~~~~~~~~~~~~~~~~~~~") - logging.debug("Running make in " + self.dir) + logger.debug("~~~~~~~~~~~~~~~~~~~~~~~~~~~~") + logger.debug("Running make in " + self.dir) Popen(["make"], cwd=self.dir) # Initiate MySQL connection. 
@@ -601,8 +603,8 @@ def run_arguments(): # While we're at it, log with line numbers FORMAT = "[%(filename)s:%(lineno)s-%(funcName)s() %(asctime)s.%(msecs)03d] %(message)s" logging.basicConfig(format=FORMAT, level=numeric_level, datefmt="%I:%M:%S") - logging.info("Info logging enabled.") - logging.info("Debug logging enabled.") + logger.info("Info logging enabled.") + logger.info("Debug logging enabled.") # Create the bookworm my_bookworm = BookwormManager(args.configuration, args.database) diff --git a/bookwormDB/multiprocessingHelp.py b/bookwormDB/multiprocessingHelp.py index f1b02bc..c0d0cc7 100644 --- a/bookwormDB/multiprocessingHelp.py +++ b/bookwormDB/multiprocessingHelp.py @@ -1,6 +1,7 @@ import os import psutil import logging +logger = logging.getLogger("bookworm") def mp_stats(): try: @@ -13,7 +14,7 @@ def mp_stats(): memory = int(psutil.virtual_memory()[4]) if memory < 1024: - logging.warning("Not much memory to work with--vocab may be inexact") + logger.warning("Not much memory to work with--vocab may be inexact") return (cpus, memory) diff --git a/bookwormDB/query_cache.py b/bookwormDB/query_cache.py index 1f78347..cc8367e 100644 --- a/bookwormDB/query_cache.py +++ b/bookwormDB/query_cache.py @@ -4,6 +4,8 @@ from pathlib import Path import logging +logger = logging.getLogger("bookworm") + import json import hashlib import random @@ -80,4 +82,4 @@ def trim_cache(self): try: extra[1].unlink() except: - logging.error(f"Unable to unlink file {extra}; assuming another thread got it first, although that's pretty unlikely!") + logger.error(f"Unable to unlink file {extra}; assuming another thread got it first, although that's pretty unlikely!") diff --git a/bookwormDB/wsgi.py b/bookwormDB/wsgi.py index eba557f..a4ff588 100644 --- a/bookwormDB/wsgi.py +++ b/bookwormDB/wsgi.py @@ -2,6 +2,8 @@ import json from urllib.parse import unquote import logging +logger = logging.getLogger("bookworm") + import multiprocessing import gunicorn.app.base from bookwormDB.store import store @@ -54,12 +56,12 @@ def __missing__(self, key): duck_connections = DuckPool() if args.remote_host is None: - logging.info("Using SQL API") + logger.info("Using SQL API") API = DuckDBCall API_kwargs = {} else: - logging.info("Using proxy API") + logger.info("Using proxy API") API = ProxyAPI API_kwargs = { "endpoint": args.remote_host @@ -102,7 +104,7 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): } - logging.debug("Received query {}".format(query)) + logger.debug("Received query {}".format(query)) start = datetime.now() # Backward-compatability: we used to force query to be @@ -144,7 +146,7 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): with open(logfile, 'a') as fout: json.dump(query, fout) fout.write("\n") - logging.debug("Writing to log: \n{}\n".format(json.dumps(query))) + logger.debug("Writing to log: \n{}\n".format(json.dumps(query))) return [response_body] # Copied from the gunicorn docs. diff --git a/tests/setup.py b/tests/setup.py index 03f9fd7..9d022e0 100644 --- a/tests/setup.py +++ b/tests/setup.py @@ -3,6 +3,7 @@ import bookwormDB.CreateDatabase from bookwormDB.general_API import SQLAPIcall as SQLAPIcall import logging +logger = logging.getLogger("bookworm") import os from subprocess import call as call import sys @@ -13,7 +14,7 @@ def setup_bookworm(): """ Creates a test bookworm. 
Removes any existing databases called "federalist_bookworm" """ - logging.info("\n\nTESTING BOOKWORM CREATION\n\n") + logger.info("\n\nTESTING BOOKWORM CREATION\n\n") import MySQLdb from warnings import filterwarnings filterwarnings('ignore', category = MySQLdb.Warning) @@ -42,7 +43,7 @@ def setup_bookworm(): pass else: print(e) - logging.warning("Some mysterious error in attempting to drop previous iterations: just try running it again?") + logger.warning("Some mysterious error in attempting to drop previous iterations: just try running it again?") call(["bookworm --log-level warning build all"],shell=True,cwd=sys.path[0] + "/test_bookworm_files") @@ -51,7 +52,7 @@ def setup_bookworm_unicode(): """ Creates a test bookworm. Removes any existing databases called "unicode_test_bookworm" """ - logging.info("\n\nTESTING BOOKWORM CREATION\n\n") + logger.info("\n\nTESTING BOOKWORM CREATION\n\n") import MySQLdb from warnings import filterwarnings filterwarnings('ignore', category = MySQLdb.Warning) @@ -79,7 +80,7 @@ def setup_bookworm_unicode(): if e[0]=="Cannot load from mysql.proc. The table is probably corrupted": pass else: - logging.warning("Some mysterious error in attempting to drop previous iterations: just try running it again?") + logger.warning("Some mysterious error in attempting to drop previous iterations: just try running it again?") call(["bookworm --log-level warning build all"], shell=True, From 81fc5d9af321aa5789a9b7039e11d77fe5c03f1a Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Thu, 10 Jun 2021 12:33:09 -0400 Subject: [PATCH 31/41] Passes all existing tests! --- bookwormDB/DuckSchema.py | 12 +-- bookwormDB/builder.py | 28 +++--- bookwormDB/duckdb.py | 72 ++++---------- bookwormDB/manager.py | 2 +- tests/__init__.py | 0 tests/setup.py | 93 ------------------- tests/test_API.py | 33 ++++--- tests/test_creation.py | 9 +- tests/test_sql_construction.py | 62 +++++++++++++ ...ormats.py => unimplemented_est_formats.py} | 23 +---- 10 files changed, 136 insertions(+), 198 deletions(-) create mode 100644 tests/__init__.py delete mode 100644 tests/setup.py create mode 100644 tests/test_sql_construction.py rename tests/{test_formats.py => unimplemented_est_formats.py} (71%) diff --git a/bookwormDB/DuckSchema.py b/bookwormDB/DuckSchema.py index 9124203..e816a67 100644 --- a/bookwormDB/DuckSchema.py +++ b/bookwormDB/DuckSchema.py @@ -21,7 +21,7 @@ def __init__(self, db): # hash of what table each variable is in self.tableToLookIn = { - 'bookid': 'fastcat', + '_ncid': 'fastcat', '@id': "slowcat", 'wordid': "wordsheap", 'nwords': 'fastcat'} @@ -30,11 +30,11 @@ def __init__(self, db): # 'author_birth' might be crosswalked to 'authorid' in the # main catalog.) self.anchorFields = { - 'bookid': 'bookid', + '_ncid': '_ncid', '@id': "slowcat", 'wordid': "wordid", 'word': "wordid", - 'nwords': 'bookid' + 'nwords': '_ncid' } # aliases: a hash showing internal identifications codes that @@ -68,17 +68,17 @@ def __init__(self, db): tables = db.execute("SELECT name, schema FROM arrow_schemas WHERE type='table'").fetchall() # A few columns are kept in the 'slowcat' view for historical reasons. 
slowcols = set(db.execute("DESCRIBE TABLE slowcat").df()['Field']) - current_anchor = "bookid" + current_anchor = "_ncid" for i, field in enumerate(slowcols): if i > 0: self.tableToLookIn[field] = "slowcat" - self.anchorFields[field] = "bookid" + self.anchorFields[field] = "_ncid" def tables_for_variable(self, variable, depth = 0): """ Returns the tables needed to look up a variable, back up to 'fastcat' or 'wordsheap' """ - if variable == 'bookid' or variable == 'wordid': + if variable == '_ncid' or variable == 'wordid': return [] vals = [] try: diff --git a/bookwormDB/builder.py b/bookwormDB/builder.py index 1b206ae..6b0e6fb 100644 --- a/bookwormDB/builder.py +++ b/bookwormDB/builder.py @@ -29,11 +29,11 @@ def bookworm_name(self): return self.db_location.with_suffix("").name def sort_parquet_unigrams(self): - dest = self.root / 'unigram_bookid.parquet' + dest = self.root / 'unigram__ncid.parquet' if dest.exists(): logger.warning(f"Using existed sorted unigrams at {dest} without checking if they're out of date.") return - quacksort(self.encoded_batches(), ['wordid', 'bookid'], self.root / 'unigram_bookid.parquet', block_size = 5_000_000_000) + quacksort(self.encoded_batches(), ['wordid', '_ncid'], self.root / 'unigram__ncid.parquet', block_size = 5_000_000_000) def prepare_metadata(self): self.metadata.to_flat_catalog() @@ -53,8 +53,8 @@ def con(self): def ingest_unigrams(self): con = self.con - wordids = self.root / 'unigram_bookid.parquet' - con.execute(f"CREATE TABLE IF NOT EXISTS unigram_bookid AS SELECT * FROM parquet_scan('{wordids}')") + wordids = self.root / 'unigram__ncid.parquet' + con.execute(f"CREATE TABLE IF NOT EXISTS unigram__ncid AS SELECT * FROM parquet_scan('{wordids}')") con.execute(f"CREATE TABLE words AS SELECT * FROM parquet_scan('{self.root / 'wordids.parquet'}')") con.execute(f"CREATE TABLE wordsheap AS SELECT wordid, token as word, lower(token) as lowercase FROM words") @@ -77,7 +77,7 @@ def create_table_schemas(self): # DuckDB can't yet handle blob inserts from python. 
# https://github.com/duckdb/duckdb/issues/1703 - for tab in [*self.flat_tabs()] + [self.root / Path("unigram_bookid.parquet"), self.root / 'wordids.parquet']: + for tab in [*self.flat_tabs()] + [self.root / Path("unigram__ncid.parquet"), self.root / 'wordids.parquet']: tabname = tab.with_suffix("").name if tabname in ["sorted", "wordids"]: continue @@ -89,7 +89,7 @@ def create_slow_catalog(self): con = self.con catcols = set(con.execute("DESCRIBE TABLE catalog").df()['Field']) fastcols = set(con.execute("DESCRIBE TABLE fastcat").df()['Field']) - unique = ["bookid"] + unique = ["_ncid"] for col in catcols: if col in fastcols or f"{col}__id" in fastcols: continue @@ -97,18 +97,20 @@ def create_slow_catalog(self): con.execute(f"CREATE VIEW slowcat AS SELECT {','.join(unique)} FROM catalog") def ingest_wordcounts(self): - self.con.execute('CREATE TABLE nwords ("@id" VARCHAR, "nwords" INTEGER)') + self.con.execute('CREATE TABLE nwords ("_ncid" INTEGER, "nwords" INTEGER)') for p in (self.root / "document_lengths").glob("*.feather"): tb = feather.read_table(p) - rel = self.con.register_arrow("t", tb) - self.con.execute("INSERT INTO nwords SELECT * FROM t") + indices = feather.read_table(self.root / "build/batch_indices" / p.name, columns = ["_ncid"]) + zipped = pa.table([indices['_ncid'], tb['count']], ["_ncid", "nwords"]) + self.con.register_arrow("t", zipped) + self.con.execute("INSERT INTO nwords (_ncid, nwords) SELECT * FROM t") self.con.unregister("t") self.con.execute("ALTER TABLE catalog ADD nwords INTEGER") - self.con.execute('UPDATE catalog SET nwords = nwords.nwords FROM nwords WHERE "catalog"."@id" = "nwords"."@id"') + self.con.execute('UPDATE catalog SET nwords = nwords.nwords FROM nwords WHERE "catalog"."_ncid" = "nwords"."_ncid"') self.con.execute("ALTER TABLE fastcat ADD nwords INTEGER") - self.con.execute('UPDATE fastcat SET nwords = catalog.nwords FROM catalog WHERE catalog.bookid = fastcat.bookid') + self.con.execute('UPDATE fastcat SET nwords = nwords.nwords FROM nwords WHERE fastcat._ncid = nwords._ncid') def build(self): logger.info("Preparing metadata") @@ -124,10 +126,12 @@ def build(self): logger.info("Creating schemas for load") self.ingest_wordcounts() + self.create_table_schemas() logger.info("Building slow catalog view") self.create_slow_catalog() + self.con.close() -RESERVED_NAMES = ["slowcat", "fastcat", "catalog", "my_nwords", "unigram_bookid"] \ No newline at end of file +RESERVED_NAMES = ["slowcat", "fastcat", "catalog", "my_nwords", "unigram__ncid"] \ No newline at end of file diff --git a/bookwormDB/duckdb.py b/bookwormDB/duckdb.py index 0d7dc1f..0085b35 100644 --- a/bookwormDB/duckdb.py +++ b/bookwormDB/duckdb.py @@ -221,9 +221,9 @@ def make_group_query(self): def main_table(self): if self.gram_size() == 1: - return '"unigram_bookid" as main' + return '"unigram__ncid" as main' if self.gram_size() == 2: - return '"word1_word2_bookid" as main' + return '"word1_word2__ncid" as main' def full_query_tables(self): # Joins are needed to provide groups, but *not* to provide @@ -257,7 +257,7 @@ def base_query(self): SELECT {', '.join(self.set_operations() + self.query_object['groups'])} FROM {self.query_tables} WHERE - {self.bookid_query()} + {self._ncid_query()} AND {self.wordid_query} AND @@ -276,7 +276,7 @@ def catalog_table(self): join to them. So if you query with a limit on LCSH, and LCSH is listed as being in a separate table, it joins the table "LCSH" to catalog; and then that table has one column, ALSO - called "LCSH", which is matched against. 
This allows a bookid + called "LCSH", which is matched against. This allows a _ncid to be a member of multiple catalogs. """ @@ -380,7 +380,7 @@ def wordswhere(self): if limits == []: # In the case that nothing has been found, tell it explicitly to search for # a condition when nothing will be found. - self._wordswhere = "bookid = -1" + self._wordswhere = "_ncid = -1" wordlimits = dict() @@ -429,7 +429,7 @@ def build_wordstables(self): elif needsUnigrams: self.main = ''' - unigram_bookid as main + unigram__ncid as main ''' self.wordstables = """ @@ -466,7 +466,7 @@ def set_operations(self): self.using_nwords = False if with_words: if "TextCount" in self.query_object['counttype']: - output.append("count(DISTINCT main.bookid) as 'TextCount'") + output.append("count(DISTINCT main._ncid) as 'TextCount'") if "WordCount" in self.query_object['counttype']: output.append("sum(main.count) as 'WordCount'") else: @@ -474,19 +474,19 @@ def set_operations(self): if "WordCount" in self.query_object['counttype']: output.append("sum(nwords) as 'WordCount'") if "TextCount" in self.query_object['counttype']: - output.append("count(DISTINCT bookid) as 'TextCount'") + output.append("count(DISTINCT _ncid) as 'TextCount'") return output - def bookid_query(self): + def _ncid_query(self): q = f""" {self.catwhere} """ logger.debug("'{}'".format(self.catwhere)) if self.catwhere == "TRUE": - self.bookid_where = " TRUE " + self._ncid_where = " TRUE " else: - self.bookid_where = q - return self.bookid_where + self._ncid_where = q + return self._ncid_where def query(self): @@ -539,19 +539,19 @@ def bibliography_query(self, limit = "100"): 'limit': limit } - dicto['bookid_where'] = self.bookid_query() + dicto['_ncid_where'] = self._ncid_query() dicto['wordid_where'] = self.wordid_query bibQuery = """ SELECT searchstring FROM catalog RIGHT JOIN ( - SELECT {fastcat}.bookid, {ordertype} as ordering + SELECT {fastcat}._ncid, {ordertype} as ordering FROM {tables} WHERE - {bookid_where} AND {wordid_where} and {catwhere} - GROUP BY bookid ORDER BY {ordertype} DESC LIMIT {limit} - ) as tmp USING (bookid) ORDER BY ordering DESC; + {_ncid_where} AND {wordid_where} and {catwhere} + GROUP BY _ncid ORDER BY {ordertype} DESC LIMIT {limit} + ) as tmp USING (_ncid) ORDER BY ordering DESC; """.format(**dicto) return bibQuery @@ -574,44 +574,6 @@ def getActualSearchedWords(self): self.actualWords = [item[0] for item in self.db.fetchall()] else: raise TypeError("Suspiciously low word count") - """ - def custom_SearchString_additions(self, returnarray): - "" - It's nice to highlight the words searched for. 
This will be on partner web sites, so requires custom code for different databases - "" - db = self.query_object['database'] - if db in ('jstor', 'presidio', 'ChronAm', 'LOC', 'OL'): - self.getActualSearchedWords() - if db == 'jstor': - joiner = "&searchText=" - preface = "?Search=yes&searchText=" - urlRegEx = "http://www.jstor.org/stable/\d+" - if db == 'presidio' or db == 'OL': - joiner = "+" - preface = "# page/1/mode/2up/search/" - urlRegEx = 'http://archive.org/stream/[^"# ><]*' - if db in ('ChronAm', 'LOC'): - preface = "/;words=" - joiner = "+" - urlRegEx = 'http://chroniclingamerica.loc.gov[^\"><]*/seq-\d+' - newarray = [] - for string in returnarray: - try: - base = re.findall(urlRegEx, string)[0] - newcore = ' search inside ' - string = re.sub("^", "", string) - string = re.sub("$", "", string) - string = string+newcore - except IndexError: - pass - newarray.append(string) - # Arxiv is messier, requiring a whole different URL interface: http://search.arxiv.org:8081/paper.jsp?r=1204.3352&qs=netwokr - else: - newarray = returnarray - return newarray - """ - - def pull_keys(entry): val = [] diff --git a/bookwormDB/manager.py b/bookwormDB/manager.py index 18f1176..edc6420 100644 --- a/bookwormDB/manager.py +++ b/bookwormDB/manager.py @@ -544,7 +544,7 @@ def run_arguments(): help=getattr(BookwormManager, "database_wordcounts").__doc__) word_ingest_parser.add_argument("--no-delete", action="store_true", help="Do not delete and rebuild the token tables. Useful for a partially finished ingest.") - word_ingest_parser.add_argument("--no-reverse-index", action="store_true", help="When creating the table, choose not to index bookid/wordid/counts. This is useful for really large builds. Because this is specified at table creation time, it does nothing with --no-delete or --index-only.") + word_ingest_parser.add_argument("--no-reverse-index", action="store_true", help="When creating the table, choose not to index _ncid/wordid/counts. This is useful for really large builds. Because this is specified at table creation time, it does nothing with --no-delete or --index-only.") word_ingest_parser.add_argument("--no-index", action="store_true", help="Do not re-enable keys after ingesting tokens. Only do this if you intent to manually enable keys or will run this command again.") diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/setup.py b/tests/setup.py deleted file mode 100644 index 9d022e0..0000000 --- a/tests/setup.py +++ /dev/null @@ -1,93 +0,0 @@ -from __future__ import print_function -import bookwormDB -import bookwormDB.CreateDatabase -from bookwormDB.general_API import SQLAPIcall as SQLAPIcall -import logging -logger = logging.getLogger("bookworm") -import os -from subprocess import call as call -import sys -import json -from shutil import rmtree - -def setup_bookworm(): - """ - Creates a test bookworm. 
Removes any existing databases called "federalist_bookworm" - """ - logger.info("\n\nTESTING BOOKWORM CREATION\n\n") - import MySQLdb - from warnings import filterwarnings - filterwarnings('ignore', category = MySQLdb.Warning) - - import bookwormDB.configuration - os.chdir(sys.path[0] + "/test_bookworm_files") - rmtree(".bookworm", ignore_errors = True) - - bookwormDB.configuration.create(ask_about_defaults=False, database="federalist_bookworm") - - db = bookwormDB.CreateDatabase.DB(dbname="mysql") - - try: - db.query("DROP DATABASE IF EXISTS federalist_bookworm") - except MySQLdb.OperationalError as e: - if e[0]==1008: - pass - else: - print(e) - raise - except Exception as e: - """ - This is some weird MariaDB exception. It sucks that I'm compensating for it here. - """ - if e[0]=="Cannot load from mysql.proc. The table is probably corrupted": - pass - else: - print(e) - logger.warning("Some mysterious error in attempting to drop previous iterations: just try running it again?") - - call(["bookworm --log-level warning build all"],shell=True,cwd=sys.path[0] + "/test_bookworm_files") - - -def setup_bookworm_unicode(): - """ - Creates a test bookworm. Removes any existing databases called "unicode_test_bookworm" - """ - logger.info("\n\nTESTING BOOKWORM CREATION\n\n") - import MySQLdb - from warnings import filterwarnings - filterwarnings('ignore', category = MySQLdb.Warning) - - import bookwormDB.configuration - os.chdir(sys.path[0] + "/test_bookworm_files_unicode") - rmtree(".bookworm", ignore_errors = True) - - bookwormDB.configuration.create(ask_about_defaults=False,database="unicode_test_bookworm") - - db = bookwormDB.CreateDatabase.DB(dbname="mysql") - - try: - db.query("DROP DATABASE IF EXISTS unicode_test_bookworm") - except MySQLdb.OperationalError as e: - if e[0]==1008: - pass - else: - print(e) - raise - except Exception as e: - """ - This is some weird MariaDB exception. It sucks that I'm compensating for it here. - """ - if e[0]=="Cannot load from mysql.proc. The table is probably corrupted": - pass - else: - logger.warning("Some mysterious error in attempting to drop previous iterations: just try running it again?") - - call(["bookworm --log-level warning build all"], - shell=True, - cwd=sys.path[0] + "/test_bookworm_files_unicode") - - -if __name__=="__main__": - setup_bookworm() - setup_bookworm_unicode() - diff --git a/tests/test_API.py b/tests/test_API.py index ad7c901..3dd2da5 100644 --- a/tests/test_API.py +++ b/tests/test_API.py @@ -40,7 +40,7 @@ def unicode_bookworm(tmpdir_factory): return con class Test_Bookworm_SQL_Creation(): - def test_bookworm_files_exist(self, federalist_bookworm): + def test_nwords_populated(self, federalist_bookworm): wordCount = federalist_bookworm.query('SELECT SUM(nwords) FROM fastcat').fetchall()[0][0] # This should be 212,081, but I don't want the tests to start failing when # we change the tokenization rules or miscellaneous things about encoding. @@ -49,6 +49,15 @@ def test_bookworm_files_exist(self, federalist_bookworm): Then we test whether the API can make queries on that bookworm. """ + def test_fastcat_populated(self, federalist_bookworm): + textCount = federalist_bookworm.query('SELECT COUNT(*) FROM fastcat').fetchall()[0][0] + # This should be 212,081, but I don't want the tests to start failing when + # we change the tokenization rules or miscellaneous things about encoding. + assert textCount == 1333 + """ + Then we test whether the API can make queries on that bookworm. 
+ """ + def test_groups(self, federalist_bookworm): query = { @@ -259,7 +268,7 @@ def test_case_sensitivity(self, federalist_bookworm): assert (val2[0] > val1[0]) - def test_case_insensitivity_works_without_search_term(self, federalist_bookworm): + def test_case_insensitivity_works_without_search_term_existing(self, federalist_bookworm): query = { "database":"federalist_bookworm", "search_limits":{"word":["hOwEvEr"]}, @@ -283,20 +292,22 @@ def test_unicode_search_term(self, unicode_bookworm): val = json.loads(DuckDBCall(unicode_bookworm, query = query).execute())['data'] assert (val[0] > 0) - - def test_various_unicode_cases(self, unicode_bookworm): # There's a 'description_' for each individual item. - catalog_location = sys.path[0] + "/test_bookworm_files_unicode/jsoncatalog.txt" - cases = [json.loads(line)["description_"] for line in open(catalog_location)] + catalog_location = "tests/test_bookworm_files_unicode/jsoncatalog.txt" + cases = [json.loads(line)["description_"] for line in open(catalog_location)] + wordcounts = unicode_bookworm.query("SELECT * FROM nwords").df()['nwords'] + fastcounts = unicode_bookworm.query("SELECT * FROM fastcat").df()['nwords'] + assert (wordcounts > 0).all() + assert (fastcounts > 0).all() for case in cases: query = { "database":"unicode_test_bookworm", - "search_limits":{"description_": case}, - "counttype":"WordCount", - "groups":[], - "words_collation":"Case_Insensitive", - "method":"data", "format":"json" + "search_limits": {"description_": case}, + "counttype": "WordCount", + "groups": [], + "words_collation": "Case_Insensitive", + "method": "data", "format": "json" } val = json.loads(DuckDBCall(unicode_bookworm, query = query).execute())['data'] assert(val[0] > 0) diff --git a/tests/test_creation.py b/tests/test_creation.py index 93d1b5d..df4a836 100644 --- a/tests/test_creation.py +++ b/tests/test_creation.py @@ -2,18 +2,23 @@ from pathlib import Path from bookwormDB.builder import BookwormCorpus import duckdb + class TestCreation(): def test_ascii_creation(self, tmpdir): + path = Path(f"{tmpdir}/federalist.duckdb") + corp = BookwormCorpus( - f"{tmpdir}/federalist.duckdb", + path, texts = Path('tests/test_bookworm_files/input.txt'), metadata = "tests/test_bookworm_files/jsoncatalog.txt", dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) corp.build() + con = duckdb.connect(str(path)) + ts = con.execute("""SELECT sum(nwords) as 'WordCount' FROM "fastcat" """).fetchall()[0][0] + assert ts > 20 def test_unicode_creation(self, tmpdir): path = Path(f"{tmpdir}/unicode.duckdb") - path = Path("/tmp/unicode.duckdb") if path.exists(): path.unlink() corp = BookwormCorpus( path, diff --git a/tests/test_sql_construction.py b/tests/test_sql_construction.py new file mode 100644 index 0000000..631e620 --- /dev/null +++ b/tests/test_sql_construction.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- + +import pytest +import bookwormDB +from bookwormDB.general_API import DuckDBCall as DuckDBCall +from bookwormDB.builder import BookwormCorpus +from pathlib import Path +import logging +logger = logging.getLogger("bookworm") + +import os +import duckdb +from subprocess import call as call +import sys +import json +import pytest + +@pytest.fixture(scope="session") +def federalist_bookworm(tmpdir_factory): + path = tmpdir_factory.mktemp("ascii").join("federalist.duckdb") + tmpdir = tmpdir_factory.mktemp("tmpdir") + corp = BookwormCorpus( + f"{path}", + texts = Path('tests/test_bookworm_files/input.txt'), + metadata = 
"tests/test_bookworm_files/jsoncatalog.txt", + dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) + corp.build() + con = duckdb.connect(str(path), read_only = True) + return con + +@pytest.fixture(scope="session") +def unicode_bookworm(tmpdir_factory): + path = tmpdir_factory.mktemp("unicode").join("unicode.duckdb") + tmpdir = tmpdir_factory.mktemp("tmpdir") + + corp = BookwormCorpus( + f"{path}", + texts = Path('tests/test_bookworm_files_unicode/input.txt'), + metadata = "tests/test_bookworm_files_unicode/jsoncatalog.txt", + dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) + + corp.build() + con = duckdb.connect(str(path), read_only = True) + return con + +class Test_Bookworm_SQL_Creation(): + + def test_ne_with_one_entry(self, federalist_bookworm): + import json + + query = { + "database":"federalist_bookworm", + "search_limits":{ + "author": {"$ne": ["HAMILTON"]} + }, + "counttype":"TextPercent", + "groups":["author"], + "method":"data", "format":"json" + } + + m = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + assert len(m)==4 \ No newline at end of file diff --git a/tests/test_formats.py b/tests/unimplemented_est_formats.py similarity index 71% rename from tests/test_formats.py rename to tests/unimplemented_est_formats.py index 8f8e5a0..ccab3a0 100644 --- a/tests/test_formats.py +++ b/tests/unimplemented_est_formats.py @@ -2,11 +2,11 @@ from builtins import range from builtins import object -import unittest +import pytest import bookwormDB -import bookwormDB.CreateDatabase -from bookwormDB.general_API import SQLAPIcall as SQLAPIcall import logging +logger = logging.getLogger("bookworm") + import os from subprocess import call as call import sys @@ -15,7 +15,7 @@ from pyarrow import feather import io -class Bookworm_Return_Formats(unittest.TestCase): +class TestFormats: def test_feather(self): from bookwormDB.general_API import SQLAPIcall as SQLAPIcall @@ -52,17 +52,4 @@ def test_proxy_API(self): m = json.loads(SQLAPIcall(query).execute())['data'] self.assertEqual(len(m),5) - -if __name__=="__main__": - # The setup is done without verbose logging; any failure - # causes it to try again. 
- logging.basicConfig(level=40) - try: - setup_bookworm() - setup_bookworm_unicode() - except: - logging.basicConfig(level=10) - setup_bookworm() - setup_bookworm_unicode() - logging.basicConfig(level=10) - unittest.main() + \ No newline at end of file From e10b68ee7d76b160389d1666bccb8dec05345754 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Thu, 17 Jun 2021 22:44:21 -0400 Subject: [PATCH 32/41] Align to nc --- bookwormDB/builder.py | 35 ++++++++++++++++++++------------- bookwormDB/schema_primitives.py | 2 +- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/bookwormDB/builder.py b/bookwormDB/builder.py index 6b0e6fb..558ced4 100644 --- a/bookwormDB/builder.py +++ b/bookwormDB/builder.py @@ -22,7 +22,7 @@ def __init__(self, db_location, *args, **kwargs): super().__init__(*args, **kwargs) def encoded_batches(self): - for batch in self.encoded_wordcounts: + for batch in self.encoded_wordcounts(): yield batch def bookworm_name(self): @@ -51,12 +51,17 @@ def con(self): self._connection = duckdb.connect(str(self.db_location)) return self._connection + def ingest_unigrams(self): con = self.con - wordids = self.root / 'unigram__ncid.parquet' - con.execute(f"CREATE TABLE IF NOT EXISTS unigram__ncid AS SELECT * FROM parquet_scan('{wordids}')") + fin = self.root / 'wordids.feather' + word_table = pa.feather.read_table(fin) + pa.parquet.write_table(word_table, fin.with_suffix(".parquet")) con.execute(f"CREATE TABLE words AS SELECT * FROM parquet_scan('{self.root / 'wordids.parquet'}')") con.execute(f"CREATE TABLE wordsheap AS SELECT wordid, token as word, lower(token) as lowercase FROM words") + + wordids = self.root / 'unigram__ncid.parquet' + con.execute(f"CREATE TABLE IF NOT EXISTS unigram__ncid AS SELECT * FROM parquet_scan('{wordids}')") def ingest_metadata(self): for tabpath in self.flat_tabs(): @@ -97,20 +102,22 @@ def create_slow_catalog(self): con.execute(f"CREATE VIEW slowcat AS SELECT {','.join(unique)} FROM catalog") def ingest_wordcounts(self): - self.con.execute('CREATE TABLE nwords ("_ncid" INTEGER, "nwords" INTEGER)') - - for p in (self.root / "document_lengths").glob("*.feather"): - tb = feather.read_table(p) - indices = feather.read_table(self.root / "build/batch_indices" / p.name, columns = ["_ncid"]) - zipped = pa.table([indices['_ncid'], tb['count']], ["_ncid", "nwords"]) - self.con.register_arrow("t", zipped) - self.con.execute("INSERT INTO nwords (_ncid, nwords) SELECT * FROM t") + + self.con.execute('CREATE TABLE nwords ("@id" STRING, "nwords" INTEGER)') + logger.debug("Creating nwords") + for batch in self.iter_over('document_lengths'): + seen_a_word = True + tb = pa.Table.from_batches([batch]) + self.con.register_arrow("t", tb) + self.con.execute('INSERT INTO nwords ("@id", nwords) SELECT * FROM t') self.con.unregister("t") + if not seen_a_word: + raise FileNotFoundError("No document lengths for corpus.") self.con.execute("ALTER TABLE catalog ADD nwords INTEGER") - self.con.execute('UPDATE catalog SET nwords = nwords.nwords FROM nwords WHERE "catalog"."_ncid" = "nwords"."_ncid"') + self.con.execute('UPDATE catalog SET nwords = nwords.nwords FROM nwords WHERE "catalog"."@id" = "nwords"."@id"') self.con.execute("ALTER TABLE fastcat ADD nwords INTEGER") - self.con.execute('UPDATE fastcat SET nwords = nwords.nwords FROM nwords WHERE fastcat._ncid = nwords._ncid') + self.con.execute('UPDATE fastcat SET nwords = catalog.nwords FROM catalog WHERE fastcat._ncid = catalog._ncid') def build(self): logger.info("Preparing metadata") @@ -132,6 +139,6 @@ def build(self): 
logger.info("Building slow catalog view") self.create_slow_catalog() self.con.close() - + self._connection = duckdb.connect(str(self.db_location), read_only = True) RESERVED_NAMES = ["slowcat", "fastcat", "catalog", "my_nwords", "unigram__ncid"] \ No newline at end of file diff --git a/bookwormDB/schema_primitives.py b/bookwormDB/schema_primitives.py index ad62cc4..13fe934 100644 --- a/bookwormDB/schema_primitives.py +++ b/bookwormDB/schema_primitives.py @@ -9,7 +9,7 @@ "$id": "#/properties/counttype/items", "type": "string", "default": "WordCount", - "enum": agg_keys + "enum": agg_keys, "pattern": "^(.*)$" } } From bbedb0d514889427f99d92f8a78d9c1aa9c32a89 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Fri, 18 Jun 2021 19:14:24 -0400 Subject: [PATCH 33/41] broken, waiting on upstream fix in duckdb --- bookwormDB/bin/dbbindings-flask.py | 3 ++- bookwormDB/builder.py | 18 ++++++++++++++---- tests/test_API.py | 8 ++++++-- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/bookwormDB/bin/dbbindings-flask.py b/bookwormDB/bin/dbbindings-flask.py index 281bb47..a2ce05d 100755 --- a/bookwormDB/bin/dbbindings-flask.py +++ b/bookwormDB/bin/dbbindings-flask.py @@ -5,6 +5,8 @@ from flask import Flask, request, Response, jsonify import json import os +import logging +logger = logging.getLogger("bookworm") app = Flask(__name__) @@ -18,7 +20,6 @@ def index(): @app.route('/debug') def debug_api(): - import logging logging.basicConfig(level=logging.INFO) JSONinput = request.args.get('queryTerms') or request.args.get('query') if not JSONinput: diff --git a/bookwormDB/builder.py b/bookwormDB/builder.py index 558ced4..93d14de 100644 --- a/bookwormDB/builder.py +++ b/bookwormDB/builder.py @@ -52,14 +52,18 @@ def con(self): return self._connection - def ingest_unigrams(self): + def ingest_wordids(self): con = self.con fin = self.root / 'wordids.feather' word_table = pa.feather.read_table(fin) pa.parquet.write_table(word_table, fin.with_suffix(".parquet")) + logger.debug("INGESTING INTO words") con.execute(f"CREATE TABLE words AS SELECT * FROM parquet_scan('{self.root / 'wordids.parquet'}')") + logger.debug("INGESTING INTO wordsheap") con.execute(f"CREATE TABLE wordsheap AS SELECT wordid, token as word, lower(token) as lowercase FROM words") + def ingest_unigram__ncid(self): + con = self.con wordids = self.root / 'unigram__ncid.parquet' con.execute(f"CREATE TABLE IF NOT EXISTS unigram__ncid AS SELECT * FROM parquet_scan('{wordids}')") @@ -104,8 +108,9 @@ def create_slow_catalog(self): def ingest_wordcounts(self): self.con.execute('CREATE TABLE nwords ("@id" STRING, "nwords" INTEGER)') - logger.debug("Creating nwords") + logger.info("Creating nwords") for batch in self.iter_over('document_lengths'): + logger.info(f"Ingesting batch of length {len(batch)}") seen_a_word = True tb = pa.Table.from_batches([batch]) self.con.register_arrow("t", tb) @@ -113,10 +118,14 @@ def ingest_wordcounts(self): self.con.unregister("t") if not seen_a_word: raise FileNotFoundError("No document lengths for corpus.") - + logger.info("Creating nwords on `catalog`") self.con.execute("ALTER TABLE catalog ADD nwords INTEGER") + logger.info("Updating nwords on `catalog` from nwords table.") + return self.con.execute('UPDATE catalog SET nwords = nwords.nwords FROM nwords WHERE "catalog"."@id" = "nwords"."@id"') + logger.info("Creating nwords on `fastcat`.") self.con.execute("ALTER TABLE fastcat ADD nwords INTEGER") + logger.info("Updating nwords on `fastcat` from catalog table.") self.con.execute('UPDATE fastcat SET nwords = 
catalog.nwords FROM catalog WHERE fastcat._ncid = catalog._ncid') def build(self): @@ -125,7 +134,8 @@ def build(self): logger.info("Sorting unigrams for duck ingest") self.sort_parquet_unigrams() logger.info("Ingesting unigrams") - self.ingest_unigrams() + self.ingest_wordids() + self.ingest_unigram__ncid() # logger.warning("Ingesting bigrams") logger.info("Ingesting metadata") diff --git a/tests/test_API.py b/tests/test_API.py index 3dd2da5..bfa8834 100644 --- a/tests/test_API.py +++ b/tests/test_API.py @@ -42,7 +42,7 @@ def unicode_bookworm(tmpdir_factory): class Test_Bookworm_SQL_Creation(): def test_nwords_populated(self, federalist_bookworm): wordCount = federalist_bookworm.query('SELECT SUM(nwords) FROM fastcat').fetchall()[0][0] - # This should be 212,081, but I don't want the tests to start failing when + # This should be about 212,081, but I don't want the tests to start failing when # we change the tokenization rules or miscellaneous things about encoding. assert wordCount > 200000 """ @@ -309,7 +309,11 @@ def test_various_unicode_cases(self, unicode_bookworm): "words_collation": "Case_Insensitive", "method": "data", "format": "json" } - val = json.loads(DuckDBCall(unicode_bookworm, query = query).execute())['data'] + try: + val = json.loads(DuckDBCall(unicode_bookworm, query = query).execute())['data'] + except KeyError: + print(DuckDBCall(unicode_bookworm, query = query).execute()) + raise assert(val[0] > 0) def test_asterisks_in_search_limits(self, federalist_bookworm): From f0c74135205cc2b6feb3d95aa3e46d1af6e6e3f0 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Fri, 9 Jul 2021 21:46:24 -0400 Subject: [PATCH 34/41] Link to latest nonconsumptive version --- bookwormDB/builder.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/bookwormDB/builder.py b/bookwormDB/builder.py index 93d14de..26cc098 100644 --- a/bookwormDB/builder.py +++ b/bookwormDB/builder.py @@ -1,4 +1,4 @@ -from nonconsumptive.ducksauce import quacksort +from ducksauce import from_files import duckdb import numpy as np from base64 import b64encode, b64decode @@ -7,7 +7,7 @@ from nonconsumptive.metadata import Catalog from pathlib import Path import logging -from pyarrow import feather +from pyarrow import feather, parquet logger = logging.getLogger("bookworm") class BookwormCorpus(Corpus): @@ -26,14 +26,16 @@ def encoded_batches(self): yield batch def bookworm_name(self): + return self.db_location.with_suffix("").name - def sort_parquet_unigrams(self): - dest = self.root / 'unigram__ncid.parquet' - if dest.exists(): - logger.warning(f"Using existed sorted unigrams at {dest} without checking if they're out of date.") - return - quacksort(self.encoded_batches(), ['wordid', '_ncid'], self.root / 'unigram__ncid.parquet', block_size = 5_000_000_000) + def create_unigrams(self): + self.cache_set.add("ncid_wordid") + for i in self.encoded_wordcounts(): + pass + + def sort_unigrams(self, block_size = 5_000_000): + from_files((self.root / "ncid_wordid").glob("*"), ['wordid', '_ncid'], self.root / 'unigram__ncid.parquet', block_size = block_size) def prepare_metadata(self): self.metadata.to_flat_catalog() @@ -106,11 +108,10 @@ def create_slow_catalog(self): con.execute(f"CREATE VIEW slowcat AS SELECT {','.join(unique)} FROM catalog") def ingest_wordcounts(self): - self.con.execute('CREATE TABLE nwords ("@id" STRING, "nwords" INTEGER)') logger.info("Creating nwords") - for batch in self.iter_over('document_lengths'): - logger.info(f"Ingesting batch of length {len(batch)}") + 
seen_a_word = False + for batch in self.iter_over('document_lengths', ids = "@id"): seen_a_word = True tb = pa.Table.from_batches([batch]) self.con.register_arrow("t", tb) @@ -121,7 +122,6 @@ def ingest_wordcounts(self): logger.info("Creating nwords on `catalog`") self.con.execute("ALTER TABLE catalog ADD nwords INTEGER") logger.info("Updating nwords on `catalog` from nwords table.") - return self.con.execute('UPDATE catalog SET nwords = nwords.nwords FROM nwords WHERE "catalog"."@id" = "nwords"."@id"') logger.info("Creating nwords on `fastcat`.") self.con.execute("ALTER TABLE fastcat ADD nwords INTEGER") @@ -132,7 +132,8 @@ def build(self): logger.info("Preparing metadata") self.prepare_metadata() logger.info("Sorting unigrams for duck ingest") - self.sort_parquet_unigrams() + self.create_unigrams() + self.sort_unigrams() logger.info("Ingesting unigrams") self.ingest_wordids() self.ingest_unigram__ncid() From 86372e62fed06d457b30f9500de6f64f74cf16f1 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Tue, 13 Jul 2021 22:15:15 -0400 Subject: [PATCH 35/41] full nonconsumptive integration --- .vscode/.ropeproject/config.py | 114 ++++++++ .vscode/settings.json | 8 + bookwormDB/DuckSchema.py | 20 ++ bookwormDB/builder.py | 14 +- bookwormDB/countManager.py | 239 --------------- bookwormDB/duckdb.py | 2 + bookwormDB/general_API.py | 54 ++-- bookwormDB/manager.py | 516 ++++----------------------------- 8 files changed, 240 insertions(+), 727 deletions(-) create mode 100644 .vscode/.ropeproject/config.py create mode 100644 .vscode/settings.json diff --git a/.vscode/.ropeproject/config.py b/.vscode/.ropeproject/config.py new file mode 100644 index 0000000..dee2d1a --- /dev/null +++ b/.vscode/.ropeproject/config.py @@ -0,0 +1,114 @@ +# The default ``config.py`` +# flake8: noqa + + +def set_prefs(prefs): + """This function is called before opening the project""" + + # Specify which files and folders to ignore in the project. + # Changes to ignored resources are not added to the history and + # VCSs. Also they are not returned in `Project.get_files()`. + # Note that ``?`` and ``*`` match all characters but slashes. + # '*.pyc': matches 'test.pyc' and 'pkg/test.pyc' + # 'mod*.pyc': matches 'test/mod1.pyc' but not 'mod/1.pyc' + # '.svn': matches 'pkg/.svn' and all of its children + # 'build/*.o': matches 'build/lib.o' but not 'build/sub/lib.o' + # 'build//*.o': matches 'build/lib.o' and 'build/sub/lib.o' + prefs['ignored_resources'] = ['*.pyc', '*~', '.ropeproject', + '.hg', '.svn', '_svn', '.git', '.tox'] + + # Specifies which files should be considered python files. It is + # useful when you have scripts inside your project. Only files + # ending with ``.py`` are considered to be python files by + # default. + # prefs['python_files'] = ['*.py'] + + # Custom source folders: By default rope searches the project + # for finding source folders (folders that should be searched + # for finding modules). You can add paths to that list. Note + # that rope guesses project source folders correctly most of the + # time; use this if you have any problems. + # The folders should be relative to project root and use '/' for + # separating folders regardless of the platform rope is running on. + # 'src/my_source_folder' for instance. + # prefs.add('source_folders', 'src') + + # You can extend python path for looking up modules + # prefs.add('python_path', '~/python/') + + # Should rope save object information or not. 
+ prefs['save_objectdb'] = True + prefs['compress_objectdb'] = False + + # If `True`, rope analyzes each module when it is being saved. + prefs['automatic_soa'] = True + # The depth of calls to follow in static object analysis + prefs['soa_followed_calls'] = 0 + + # If `False` when running modules or unit tests "dynamic object + # analysis" is turned off. This makes them much faster. + prefs['perform_doa'] = True + + # Rope can check the validity of its object DB when running. + prefs['validate_objectdb'] = True + + # How many undos to hold? + prefs['max_history_items'] = 32 + + # Shows whether to save history across sessions. + prefs['save_history'] = True + prefs['compress_history'] = False + + # Set the number spaces used for indenting. According to + # :PEP:`8`, it is best to use 4 spaces. Since most of rope's + # unit-tests use 4 spaces it is more reliable, too. + prefs['indent_size'] = 4 + + # Builtin and c-extension modules that are allowed to be imported + # and inspected by rope. + prefs['extension_modules'] = [] + + # Add all standard c-extensions to extension_modules list. + prefs['import_dynload_stdmods'] = True + + # If `True` modules with syntax errors are considered to be empty. + # The default value is `False`; When `False` syntax errors raise + # `rope.base.exceptions.ModuleSyntaxError` exception. + prefs['ignore_syntax_errors'] = False + + # If `True`, rope ignores unresolvable imports. Otherwise, they + # appear in the importing namespace. + prefs['ignore_bad_imports'] = False + + # If `True`, rope will insert new module imports as + # `from import ` by default. + prefs['prefer_module_from_imports'] = False + + # If `True`, rope will transform a comma list of imports into + # multiple separate import statements when organizing + # imports. + prefs['split_imports'] = False + + # If `True`, rope will remove all top-level import statements and + # reinsert them at the top of the module when making changes. + prefs['pull_imports_to_top'] = True + + # If `True`, rope will sort imports alphabetically by module name instead + # of alphabetically by import statement, with from imports after normal + # imports. + prefs['sort_imports_alphabetically'] = False + + # Location of implementation of + # rope.base.oi.type_hinting.interfaces.ITypeHintingFactory In general + # case, you don't have to change this value, unless you're an rope expert. + # Change this value to inject you own implementations of interfaces + # listed in module rope.base.oi.type_hinting.providers.interfaces + # For example, you can add you own providers for Django Models, or disable + # the search type-hinting in a class hierarchy, etc. + prefs['type_hinting_factory'] = ( + 'rope.base.oi.type_hinting.factory.default_type_hinting_factory') + + +def project_opened(project): + """This function is called after opening the project""" + # Do whatever you like here! diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..adc9fab --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,8 @@ +{ + "python.testing.pytestArgs": [ + "." 
+ ], + "python.testing.unittestEnabled": false, + "python.testing.nosetestsEnabled": false, + "python.testing.pytestEnabled": true +} \ No newline at end of file diff --git a/bookwormDB/DuckSchema.py b/bookwormDB/DuckSchema.py index e816a67..ec2124a 100644 --- a/bookwormDB/DuckSchema.py +++ b/bookwormDB/DuckSchema.py @@ -1,6 +1,7 @@ import pyarrow as pa from base64 import b64decode import logging +import pandas as pd logger = logging.getLogger("bookworm") class DuckSchema(object): @@ -53,11 +54,13 @@ def __init__(self, db): schema = dict(tables) current_anchor = None + self.fields = [] for tablename, tab in schema.items(): sch = pa.ipc.read_schema(pa.py_buffer(b64decode(tab))) if tablename in ["catalog"]: continue for i, field in enumerate(sch): + self.fields.append(field) if i == 0: current_anchor = field.name else: @@ -74,6 +77,23 @@ def __init__(self, db): self.tableToLookIn[field] = "slowcat" self.anchorFields[field] = "_ncid" + def to_pandas(self): + """ + Return a JSON representation of the schema. + """ + fields = [] + for field in self.fields: + name = field.name + if name.endswith("__id"): + continue + elif name in { 'count', 'wordid', '_ncid' }: + continue + elif str(field.type) == 'old_string': + continue + else: + fields.append({'dbname': name, 'dtype': str(field.type)}) + return pd.DataFrame(fields) + def tables_for_variable(self, variable, depth = 0): """ Returns the tables needed to look up a variable, back up to 'fastcat' or 'wordsheap' diff --git a/bookwormDB/builder.py b/bookwormDB/builder.py index 26cc098..1697eba 100644 --- a/bookwormDB/builder.py +++ b/bookwormDB/builder.py @@ -26,16 +26,10 @@ def encoded_batches(self): yield batch def bookworm_name(self): - return self.db_location.with_suffix("").name - - def create_unigrams(self): - self.cache_set.add("ncid_wordid") - for i in self.encoded_wordcounts(): - pass - + def sort_unigrams(self, block_size = 5_000_000): - from_files((self.root / "ncid_wordid").glob("*"), ['wordid', '_ncid'], self.root / 'unigram__ncid.parquet', block_size = block_size) + from_files((self.root / "encoded_unigrams").glob("*"), ['wordid', '_ncid'], self.root / 'unigram__ncid.parquet', block_size = block_size) def prepare_metadata(self): self.metadata.to_flat_catalog() @@ -108,6 +102,7 @@ def create_slow_catalog(self): con.execute(f"CREATE VIEW slowcat AS SELECT {','.join(unique)} FROM catalog") def ingest_wordcounts(self): + self.con.execute('DROP TABLE IF EXISTS nwords') self.con.execute('CREATE TABLE nwords ("@id" STRING, "nwords" INTEGER)') logger.info("Creating nwords") seen_a_word = False @@ -131,8 +126,9 @@ def ingest_wordcounts(self): def build(self): logger.info("Preparing metadata") self.prepare_metadata() + logger.info("Creating unigrams for duck ingest") + self.cache("encoded_unigrams") logger.info("Sorting unigrams for duck ingest") - self.create_unigrams() self.sort_unigrams() logger.info("Ingesting unigrams") self.ingest_wordids() diff --git a/bookwormDB/countManager.py b/bookwormDB/countManager.py index 1012659..e69de29 100644 --- a/bookwormDB/countManager.py +++ b/bookwormDB/countManager.py @@ -1,239 +0,0 @@ -import sys -import os -import bounter -from collections import Counter -from .tokenizer import Tokenizer, tokenBatches, PreTokenized -from multiprocessing import Process, Queue, Pool -from .multiprocessingHelp import mp_stats, running_processes -import multiprocessing as mp -import psutil -import queue -import logging -logger = logging.getLogger("bookworm") - -import fileinput -import time -import csv -from pathlib import 
Path -import gzip -import hashlib - -cpus, memory = mp_stats() - - -# Allocate half of available memory for the bounter, in megabytes. -memory = int(memory/1024/1024/2) - -# Use another third of the memory for storing worker counts; divided -# by number of CPUS. -# Assume 200 bytes per entry in python dict. - -QUEUE_POST_THRESH = int(memory / 3 * 1024 * 1024 / 200 / cpus) -logger.debug("Ideal queue size is {}".format(QUEUE_POST_THRESH)) -QUEUE_POST_THRESH = max([100000, QUEUE_POST_THRESH]) - -logger.info("Filling dicts to size {}".format(QUEUE_POST_THRESH)) - -def flush_counter(counter, qout): - for k in ['', '\x00']: - try: - del counter[k] - except KeyError: - continue - qout.put(counter) - -def counter(qout, i, fin, mode = "count"): - """ - # Counts words exactly in a separate process. - # It runs in place. - If mode is 'encode', this is called for a side-effect of writing - files to disk. - """ - - totals = 0 - errors = 0 - - if mode == "count": - counter = Counter() - encoder = tokenBatches(['words']) - - if mode == "encode": - encoder = tokenBatches(['unigrams', 'bigrams']) - - datatype = "raw" - - count_signals = [".unigrams", ".bigrams", ".trigrams", ".quadgrams"] - logger.info(f"fin is {fin}") - - for signal in count_signals: - if signal in fin: - datatype = signal.strip(".") - if mode == "encode": - encoder = tokenBatches([datatype]) - - for id, text in yield_texts(fin, i, encoder.IDfile): - if datatype == "raw": - tokenizer = Tokenizer(text) - else: - tokenizer = PreTokenized(text, encoder.levels[0]) - - # When encoding - if mode == "encode": - encoder.encodeRow(id, tokenizer, write_completed = True) - continue - - # When building counts - counter.update(tokenizer.counts("words")) - - # When the counter is long, post it to the master and clear it. - if len(counter) > QUEUE_POST_THRESH: - flush_counter(counter=counter, qout = qout) - counter = Counter() - - # Cleanup. - if mode == "count": - logger.debug("Flushing leftover counts from thread {}".format(i)) - flush_counter(counter=counter, qout = qout) - if mode == "encode": - encoder.close() - -def yield_texts(fname, i, IDfile): - p = Path(fname) - if p.is_dir(): - for id, text in yield_texts_from_directory(p, i, IDfile): - yield (id, text) - else: - for id, text in yield_lines_from_single_file(p, i, IDfile): - yield (id, text) - -def yield_texts_from_directory(dir, i, IDfile): - for file in dir.glob('**/*.txt*'): - # Strips _djvu just for Internet Archive. 
- basename = file.name.rsplit(".txt", 1)[0] - # print(basename, file.name) - try: - id = IDfile[basename] - except KeyError: - logger.info(f"No catalog entry for {basename} at {file.name}, skipping") - continue - # Use sha256 - key = int(hashlib.md5(basename.encode('utf-8')).hexdigest(), 16) - logger.info(basename, key) - if key % cpus != i: - continue - if file.name.endswith(".txt.gz"): - try: - fin = gzip.open(file, mode="rt") - except UnicodeDecodeError: - logger.error(f"Unable to read {file}: unicode error") - continue - except gzip.BadGzipFile: - logger.error(f"Unable to read {file}: Bad gzip file") - continue - elif file.name.endswith(".txt"): - fin = open(file) - else: - logger.error(f"Can't handle file {file}") - try: - yield (basename, fin.read().replace("\t", "\f").replace("\n", "\f")) - except UnicodeDecodeError: - logger.error(f"Unable to read {file}") - except gzip.BadGzipFile: - logger.error(f"Unable to read {file}: Bad gzip file") - continue - -def yield_lines_from_single_file(fname, i, IDfile): - - if (str(fname).endswith(".gz")): - fin = gzip.open(fname, mode = 'rt') - else: - fin = open(fname) - totals = 0 - errors = 0 - for ii, row in enumerate(fin): - if ii % cpus != i: - # Don't do anything on most lines. - continue - - totals += 1 - try: - (filename, text) = row.rstrip().split("\t",1) - except ValueError: - errors += 1 - continue - try: - id = IDfile[filename] - except KeyError: - logger.warn(f"No catalog entry for {id} though it appears in {filename}, skipping") - continue - - yield (filename, text) - if totals > 0 and errors/totals > 0.01: - logger.warn("Skipped {} rows without tabs".format(errors)) - - -def create_counts(input): - - """ - The first step of wordcounting is done on a worker--then those - counts are shipped here to a bounter object that counts approximately. 
- """ - - qout = Queue(cpus * 2) - workers = [] - logger.info("Spawning {} count processes on {}".format(cpus, input)) - for i in range(cpus): - p = Process(target = counter, args = (qout, i, input, "count")) - p.start() - workers.append(p) - - wordcounter = bounter.bounter(memory) - - while True: - - try: - input_dict = qout.get_nowait() - logger.debug("inputting queue of length {} from worker".format(len(input_dict))) - wordcounter.update(input_dict) - - except queue.Empty: - if running_processes(workers): - time.sleep(1/100) - else: - break - except ValueError: - for k, v in input_dict.items(): - print("'{}'\t'{}'".format(k, v)) - wordcounter.update({k: v}) - raise - except TypeError: - for k, v in input_dict.items(): - print("'{}'\t'{}'".format(k, v)) - wordcounter.update({k: v}) - raise - - return wordcounter - -def create_wordlist(n, input, output): - - counter = create_counts(input) - counter = sorted(list(counter.iteritems()), key = lambda x: -1 * x[1]) - output = open(output, "w") - logger.info(f"Created wordlist from {input}") - logger.info(f"top 10 words are {[c for c in counter[:10]]}") - for i, (k, v) in enumerate(counter): - output.write("{}\t{}\t{}\n".format(i, k, v)) - if i >= n: - break - -def encode_words(wordlist, input): - qout = Queue(cpus * 2) - workers = [] - - for i in range(cpus): - p = Process(target = counter, args = (qout, i, input, "encode")) - p.start() - workers.append(p) - - while running_processes(workers): - time.sleep(1/30) diff --git a/bookwormDB/duckdb.py b/bookwormDB/duckdb.py index 0085b35..a9669e7 100644 --- a/bookwormDB/duckdb.py +++ b/bookwormDB/duckdb.py @@ -69,6 +69,8 @@ def __init__(self, query_object = {}, db = None, databaseScheme = None): self.databaseScheme = databaseScheme if databaseScheme is None: self.databaseScheme = DuckSchema(self.db) + if query_object['method'] == 'schema': + return self._wordswhere = None self.words = "words" self.defaults() # Take some defaults diff --git a/bookwormDB/general_API.py b/bookwormDB/general_API.py index b3f568c..e504340 100644 --- a/bookwormDB/general_API.py +++ b/bookwormDB/general_API.py @@ -250,6 +250,15 @@ def __init__(self, query): self.set_defaults() + def clone(self, query): + """ + Make a clone of the APIcall object. + Used with multipart queries. + + Should be sure that query itself is deeply cloned. + """ + return APIcall(query) + def set_defaults(self): query = self.query if "search_limits" not in query: @@ -444,7 +453,6 @@ def execute(self): if not 'method' in self.query: return "You must pass a method to the query." 
if method=="returnPossibleFields": - method = "json_c" self.query['method'] = "schema" method = "schema" @@ -462,7 +470,8 @@ def execute(self): frame = self.data() if fmt == "json": - return self.return_json(version=2) + val = frame.to_dict(orient = "records") + return self._prepare_response(val, version = 2) if fmt == "csv": return frame.to_csv(encoding="utf8", index=False) @@ -519,16 +528,16 @@ def multi_execute(self, version=1): for limits in self.query['search_limits']: child = deepcopy(self.query) child['search_limits'] = limits - q = self.__class__(child).return_json(raw_python_object=True, - version=version) + q = self.clone(child).return_json(raw_python_object=True, + version=version) returnable.append(q) return self._prepare_response(returnable, version) - if version == 3: + if version >= 3: for i, limits in enumerate(self.query['search_limits']): child = deepcopy(self.query) child['search_limits'] = limits - f = self.__class__(child).data() + f = self.clone(child).data() f['Search'] = i if i == 0: frame = f @@ -536,11 +545,9 @@ def multi_execute(self, version=1): frame = frame.append(f, ignore_index = True) return frame - def html(self, data): """ - Return data in column-oriented format with run-length encoding - on duplicate values. + return an HTML table. """ if isinstance(data, Series) and 'status' in data: @@ -634,10 +641,7 @@ def _prepare_response(self, data, version=1): resp = dict(status="error", data="Internal error: unknown response version") - try: - return json.dumps(resp) - except ValueError: - return json.dumps(resp) + return json.dumps(resp) class oldSQLAPIcall(APIcall): @@ -704,10 +708,18 @@ class DuckDBCall(APIcall): to discourage on-the-fly creation which is slow. """ - def __init__(self, db, **kwargs): + def __init__(self, query, db): self.db = db - super().__init__(**kwargs) + super().__init__(query) + + def clone(self, query): + """ + Make a clone of the object. + Used with multipart queries. + + """ + return DuckDBCall(query, db = self.db) def generate_pandas_frame(self, call = None): """ @@ -721,10 +733,14 @@ def generate_pandas_frame(self, call = None): """ if call is None: call = self.query - - q = DuckQuery(call, db = self.db).query() - logger.warning("Preparing to execute {}".format(q)) - df = self.db.execute(q).df() + q = DuckQuery(call, db = self.db) + if call['method'] == 'schema': + m = q.databaseScheme.to_pandas() + print(m) + return m + query = q.query() + logger.warning("Preparing to execute {}".format(query)) + df = self.db.execute(query).df() logger.debug("Query retrieved") return df diff --git a/bookwormDB/manager.py b/bookwormDB/manager.py index edc6420..b9c3ad9 100644 --- a/bookwormDB/manager.py +++ b/bookwormDB/manager.py @@ -1,13 +1,15 @@ -from __future__ import print_function import re +from pathlib import Path import sys import os import bookwormDB import argparse +import json import nonconsumptive as nc from .store import store import logging +from nonconsumptive.commander import namespace_to_kwargs, add_builder_parameters logger = logging.getLogger("bookworm") """ @@ -18,12 +20,6 @@ the run_arguments function pulls commands from the command line. Any useful new bookworm methods should be passed through run_arguments to work. - -Some modules, especially bookworm-specific ones, -are imported inline in the code here--that substantially -(as in, 1 second to 0.2 seconds) reduces startup time -for the command-line executable, -even though it's not best practice otherwise. 
""" class BookwormManager(object): @@ -41,25 +37,6 @@ def __init__(self, cnf_file=None, database=None): self.basedir = None self.dbname = None - for i in range(10): - basedir = "../"*i - if os.path.exists(basedir + ".bookworm"): - self.basedir = basedir - break - if self.basedir==None: - logger.debug("No bookworm directory found; hopefully this isn't a build call.") - - if cnf_file is not None: - config = configparser.ConfigParser(allow_no_value=True) - config.read([cnf_file]) - if config.has_section("client"): - """ - Silently go along if the config doesn't exist. - """ - try: - self.dbname = config.get("client", "database") - except configParser.NoOptionError: - pass # More specific options override the config file if database is not None: @@ -70,46 +47,10 @@ def config(self,args): """ Performs useful configuration tasks, such as setting up a MySQL installation. """ - if args.target=="mysql": - import bookwormDB.configuration - bookwormDB.configuration.recommend_my_cnf() - if args.target=="mysql-info": - from bookwormDB.configuration import Configfile - config = Configfile("admin") - print("The admin configuration login currently being used should be the following.\n") - config.write_out() if args.target=="apache": from bookwormDB.configuration import apache apache() - def init(self, args): - """ - Initialize the current directory as a bookworm directory. - """ - # Create a configuration file - if not args.force: - if os.path.exists(".bookworm"): - logger.error(""" - You already have a folder named '.bookworm'. - Probably you've already initialized a Bookworm here. - """) - return - if not os.path.exists("bookworm.cnf"): - fout = open("bookworm.cnf", "w") - if self.dbname: - loc = self.dbname - else: - loc = os.path.relpath(".", "..") - print("Configuring Bookworm named '{}'".format(loc)) - print("Change the file at bookworm.cnf if this is undesirable".format(loc)) - fout.write("[client]\ndatabase = {}\n".format(loc)) - else: - fout = open("bookworm.cnf", "w") - loc = os.path.relpath(".", "..") - print("Configuring Bookworm named '{}'".format(loc)) - print("Change the file at bookworm.cnf if this is undesirable".format(loc)) - fout.write("[client]\ndatabase = {}\n".format(loc)) - def query(self, args): """ Run a query against the API from the command line. @@ -118,10 +59,10 @@ def query(self, args): from bookwormDB.general_API import DuckDBCall import json import duckdb - query = json.loads(args.APIcall) + query = args.APIcall logger.info(query) con = duckdb.connect("/drobo/bookworm_dbs/" + query['database'], read_only = True) - caller = DuckDBCall(con, query = query) + caller = DuckDBCall(query = query, con = con) logger.info(caller.execute()) def serve(self, args): @@ -133,308 +74,20 @@ def serve(self, args): from bookwormDB.wsgi import run run(args.port, args.bind, args.workers) - import http.server - from http.server import HTTPServer - import shutil - - base_dir = args.dir - base_cgi_dir = os.path.normpath(base_dir + "/" + "cgi-bin") - d3_dir = os.path.normpath(base_dir + "/" + "D3") - for dir in [base_dir,base_cgi_dir]: - if not os.path.exists(dir): - os.makedirs(dir) - - API = os.path.normpath(os.path.dirname(bookwormDB.__file__) + "/bin/dbbindings.py") - if not os.path.exists(base_cgi_dir + "/" + API): - shutil.copy(API, base_cgi_dir) - - if not os.path.exists(d3_dir): - call(["git","clone","http://github.com/bmschmidt/BookwormD3",d3_dir]) - - # Use the Makefile to build the linechartGUI. This is a little Rube Goldberg-y. 
- args.target="linechartGUI" - - raise TypeError("The line below this is nonsense") - self.prep(args) - - os.chdir(base_dir) - # Actually serve it. - PORT = args.port - - httpd = HTTPServer(("", PORT), http.server.CGIHTTPRequestHandler) - - print("\n\n" + "****"*20) - print("A local bookworm server is now running") - print("You can now view some charts in a web-browser at http://localhost:%d/D3" % PORT) - print("If you have a time variable, linecharts are at http://localhost:%d/%s" % (PORT,self.dbname)) - print("Please note that this is not a very secure way: if you plan to put your bookworm") - print("on the open web, consider using apache.") - httpd.serve_forever() - - - def extension(self,args): - """ - Creates (or updates) an extension - """ - - if not os.path.exists(self.basedir + ".bookworm/extensions"): - os.makedirs(self.basedir + ".bookworm/extensions") - - my_extension = Extension(args,basedir = self.basedir) - my_extension.clone_or_pull() - my_extension.make() - def build(self, args): - self.prep(args) - - def prep(self, args): - """ - This is a wrapper to all the functions define here: the purpose - is to continue to allow access to internal methods in, for instance, - the Makefile, without documenting all of them in separate functions. - - That's a little groaty, I know. - """ - logger.debug(args) - - getattr(self, args.goal)(args) - - def wordlist(self, args): - """ - Create a wordlist of the top 1.5 million words. - """ - from .countManager import create_wordlist - if os.path.exists(".bookworm/texts/wordlist/wordlist.txt"): - return - try: - os.makedirs(".bookworm/texts/wordlist") - except FileExistsError: - pass - - input = args.input - if args.feature_counts: - logger.info(args.feature_counts) - input = [a for a in args.feature_counts if 'unigrams' in a][0] - create_wordlist(n = 1.5e06, - input = input, - output = ".bookworm/texts/wordlist/wordlist.txt") - - def destroy(self, args): - self.pristine(args) - - def pristine(self, args): - # Old name still works. - import bookwormDB.CreateDatabase - bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname, variableFile=None) - if self.dbname == "mysql": - raise NameError("Don't try to delete the mysql database") - bookworm.db.query("DROP DATABASE IF EXISTS {}".format(self.dbname)) - import shutil - try: - shutil.rmtree('.bookworm') - except FileNotFoundError: - pass - - - def encoded(self, args): - """ - Using the wordlist and catalog, create encoded files. - """ - self.wordlist(args) - self.derived_catalog(args) - - for k in ['unigrams', 'bigrams', 'trigrams', 'quadgrams', 'completed']: - try: - os.makedirs(".bookworm/texts/encoded/{}".format(k)) - except FileExistsError: - pass - from .countManager import encode_words - - if args.feature_counts: - for feature in args.feature_counts: - encode_words(".bookworm/texts/wordlist/wordlist.txt", feature) - else: - encode_words(".bookworm/texts/wordlist/wordlist.txt", args.input) - - def all(self, args): - self.preDatabaseMetadata(args) - self.encoded(args) - self.database_wordcounts(args) - self.database_metadata(args) - - def preDatabaseMetadata(self, args=None, **kwargs): - import os - if not os.path.exists("field_descriptions.json"): - if os.path.exists("field_descriptions.csv"): - self.field_descriptions_from_csv() - else: - self.guess_field_descriptions() - self.derived_catalog(args) - import bookwormDB.CreateDatabase - # Doesn't need a created database yet, just needs access - # to some pieces. 
- Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase() - logger.info("Writing metadata to new catalog file...") - Bookworm.variableSet.writeMetadata() - - # This creates helper files in the /metadata/ folder. - - def derived_catalog(self, args): - - if not os.path.exists(".bookworm/metadata"): - os.makedirs(".bookworm/metadata") - if os.path.exists(".bookworm/metadata/jsoncatalog_derived.txt"): - return - - from bookwormDB.MetaParser import parse_catalog_multicore, ParseFieldDescs - - logger.debug("Preparing to write field descriptions") - ParseFieldDescs(write = True) - logger.debug("Preparing to write catalog") - parse_catalog_multicore() - - def field_descriptions_from_csv(self): - import pandas as pd - import json - jsonified = pd.read_csv("field_descriptions.csv").to_json(orient="records") - with open("field_descriptions.json", "w") as fout: - fout.write(jsonified) - - def guess_field_descriptions(self, args = None, **kwargs): - - """ - Use a number of rules of thumb to automatically generate a field_descriptions.json file. - This may bin some categories incorrectly (depending on names, for example it may treat dates - as either categorical or time variables). - """ - - import bookwormDB.CreateDatabase - import json - import os - import pandas as pd - Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname, variableFile=None) - Bookworm.setVariables("jsoncatalog.txt", jsonDefinition=None) - guess = Bookworm.variableSet.guessAtFieldDescriptions() - guess = pd.DataFrame(guess) - guess.to_csv("field_descriptions.csv", index = False) - raise FileNotFoundError("No field descriptions file found." - "Creating guess for field descriptions at: field_descriptions.csv." - "You should probably inspect and edit this file before you build." 
- "But if you suspect it's right, you can rebuild again immediately.") - - def reload_memory(self,args): - import bookwormDB.CreateDatabase - dbnames = [self.dbname] - if args.all==True: - dbnames = [] - datahandler = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname,variableFile=None) - cursor = datahandler.db.query("SELECT TABLE_SCHEMA FROM information_schema.tables WHERE TABLE_NAME='masterTableTable'") - for row in cursor.fetchall(): - dbnames.append(row[0]) - logger.info("The following databases are bookworms to be reloaded:") - for name in dbnames: - logger.info("\t" + name) - - for database in dbnames: - logger.info("Reloading memory tables for %s" %database) - Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(database,variableFile=None) - Bookworm.reloadMemoryTables(force=args.force) - - def database_metadata(self, args): - import bookwormDB.CreateDatabase - logger.debug("creating metadata db") - Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname) - Bookworm.variableSet.loadMetadata() - - logger.debug("creating metadata variable tables") - - # This creates a table in the database that makes the results of - # field_descriptions accessible through the API, and updates the - - Bookworm.loadVariableDescriptionsIntoDatabase() - - Bookworm.create_fastcat_and_wordsheap_disk_tables() - - Bookworm.grantPrivileges() + from .builder import BookwormCorpus + nc_params = namespace_to_kwargs(args) + db_path = args.db_directory / args.database + corp = BookwormCorpus( + db_location = db_path, + **nc_params, + cache_set = {"tokenization", "word_counts", + "encoded_unigrams", "document_lengths"}) + corp.build() + def add_metadata(self, args): - import bookwormDB.CreateDatabase - import bookwormDB.convertTSVtoJSONarray - bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname,None) - anchorField = args.key - if args.format == "tsv": - # TSV is just converted into JSON in a file at tmp.txt, and slurped in that way. - if args.key is None: - args.key = open(args.file).readline().split("\t")[0] - f = "tmp.txt" - bookwormDB.convertTSVtoJSONarray.convertToJSON(args.file, f) - args.file = f - - bookworm.importNewFile(args.file, - anchorField=args.key, - jsonDefinition=args.field_descriptions) - - - def database_wordcounts(self, args = None, **kwargs): - """ - Builds the wordcount components of the database. This will die - if you can't connect to the database server. - """ - cmd_args = args - import bookwormDB.CreateDatabase - - index = True - reverse_index = True - ingest = True - newtable = True - - if cmd_args and hasattr(cmd_args, "index_only"): - if cmd_args.index_only: - ingest = False - newtable = False - else: - index = not cmd_args.no_index - newtable = not cmd_args.no_delete - reverse_index = not cmd_args.no_reverse_index - if not (newtable and ingest and index): - logging.warn("database_wordcounts args not supported for bigrams yet.") - - Bookworm = bookwormDB.CreateDatabase.BookwormSQLDatabase(self.dbname) - Bookworm.load_word_list() - Bookworm.create_unigram_book_counts(newtable=newtable, ingest=ingest, index=index, reverse_index=reverse_index) - Bookworm.create_bigram_book_counts() - -class Extension(object): - - """ - A bookworm extension. Initialized with an args object, - which has the element url, the location of a clonable git repo. - - Because I don't want people to have to write extensions in python, - they are build using `make`. 
- """ - - def __init__(self,args,basedir="./"): - self.args = args - self.dir = basedir + ".bookworm/extensions/" + re.sub(".*/","",self.args.url) - - def clone_or_pull(self): - if not os.path.exists(self.dir): - logger.info("cloning git repo from " + self.args.url) - call(["git","clone",self.args.url,self.dir]) - else: - logger.info("updating pre-existing git repo at " + self.dir) - Popen(["git","pull"],cwd=self.dir) - - def make(self): - logger.debug("~~~~~~~~~~~~~~~~~~~~~~~~~~~~") - logger.debug("Running make in " + self.dir) - Popen(["make"], cwd=self.dir) - -# Initiate MySQL connection. - - -# Pull a method from command line input. + raise NotImplementedError("Functionality missing in 3.0") def run_arguments(): """ @@ -451,117 +104,49 @@ def run_arguments(): Refactoring pull requests welcome. """ - parser = argparse.ArgumentParser(description='Build and maintain a Bookworm database.',prog="bookworm") + parser = argparse.ArgumentParser( + description='Build and maintain a Bookworm database.', + prog="bookworm") parser.add_argument("--configuration","-c",help="The name of the configuration file to read options from: by default, 'bookworm.cnf' in the current directory.", default="bookworm.cnf") - parser.add_argument("--database","-d",help="The name of the bookworm database in MySQL to connect to: by default, read from the active configuration file.", default=None) - parser.add_argument("--log-level","-l", help="The logging detail to use for errors. Default is 'warning', only significant problems; info gives a fuller record, and 'debug' dumps many MySQL queries, etc.",choices=["warning","info","debug"],type=str.lower,default="warning") + parser.add_argument("--log-level", "-l", + help="The logging detail to use for errors." + "Default is 'warning', only significant problems; info gives a " + "fuller record, and 'debug' dumps many db queries, etc.", + choices=["warning","info","debug"],type=str.lower,default="warning") - parser.add_argument("--input", "-i", - help = "The location of texts for an initial build." - "Either a text file ('input.txt' or 'input.txt.gz')" - "or a folder containing txt or txt.gz files, which may be nested" - "inside other directories", default = "input.txt") - - - parser.add_argument("--feature-counts", action='append', - help="Use pre-calculated feature counts rather than tokenizing complete text on the fly. Supply any number of single files per count level like 'input.unigrams', 'input.bigrams', etc.") - - parser.add_argument("--ngrams",nargs="+",default=["unigrams","bigrams"],help="What levels to parse with. Multiple arguments should be unquoted in spaces. This option currently does nothing.") + parser.add_argument("--ngrams",nargs="+",default=["unigrams"],help="What levels to parse with. Multiple arguments should be unquoted in spaces. This option currently does nothing.") + parser.add_argument("--db-directory", required = True, help = "" + "Directory where duckdb databases live.", type = Path) + parser.add_argument("--database", "-d", help = "" + "The database name inside db-folder for this command. " + "Not relevant for 'serve' commands.", + default = None + ) + # Use subparsers to have an action syntax, like git. 
- subparsers = parser.add_subparsers(title="action", help='The commands to run with Bookworm', dest="action") + subparsers = parser.add_subparsers(title="action", + help='The commands to run with Bookworm', + dest="action") ############# build ################# - build_parser = subparsers.add_parser("build",description = "Create files",help="""Build up the component parts of a Bookworm.\ - if you specify something far along the line (for instance, the linechart GUI), it will\ - build all prior files as well.""") + build_parser = subparsers.add_parser("build", + description = "Create files", + help="Build up the component parts of a Bookworm. " + "if you specify something far along the line") - build_parser.add_argument("target", help="The make that you want to build. To build a full bookworm, type 'build all'.") - - # Grep out all possible targets from the Makefile + # Inherited directly from nonconsumptive.commander. + add_builder_parameters(build_parser) ############# supplement ################# supplement_parser = subparsers.add_parser("add_metadata",help="""Supplement the\ metadata for an already-created Bookworm with new items. They can be keyed to any field already in the database.""") supplement_parser.add_argument("-f","--file",help="""The location of a file with additional metadata to incorporate into your bookworm.""",required=True) - supplement_parser.add_argument( - "--format", - help="""The file format of the new metadata.\ - Must be "json" or "tsv". For JSON, the format is the same as the default\ - jsoncatalog.txt (a text file of json lines, each corresponding to a metadata field);\ - for TSV, a tsv with first line of which is column names,\ - and the first column of which is shared key (like filename). The TSV format,\ - particularly without field descriptions, is much easier to use, but doesn't\ - permit multiple values for the same key.""", - default="json",type=str.lower,choices=["tsv","json"]) - - supplement_parser.add_argument("--key",help="""The name of the key. If not specified and input type is TSV, the first column is used.""",default=None) - supplement_parser.add_argument("--field_descriptions","-d",help="""A description of the new metadata in the format of "field_descriptions.json"; if empty, we'll just guess at some suitable values.""",default=None) - - ######### Reload Memory ############# - memory_tables_parser = subparsers.add_parser("reload_memory",help="Reload the memory\ - tables for the designated Bookworm; this must be done after every MySQL restart") - memory_tables_parser.add_argument("--force-reload",dest="force",action="store_true", - help="Force reload on all memory tables. Use\ - '--skip-reload' for faster execution. On by default\ - .") - memory_tables_parser.add_argument("--skip-reload",dest="force",action="store_false", - help="Don't reload memory tables which have at least\ - one entry in them. Significantly faster, but may produce\ - bad results if the underlying tables have been\ - changed. 
Good for maintenance, bad for actively updated\ - installations.") - memory_tables_parser.set_defaults(force=False) - memory_tables_parser.add_argument("--all",action="store_true",default=False, - help="Search for all bookworm installations on\ - the server, and reload memory tables for each of them.") - - - ########## Clone and run extensions - extensions_parser = subparsers.add_parser("extension", help="Install Extensions to the current directory") - extensions_parser.add_argument("url",help="A cloneable url for the extension you want to pul: passed as an argument to 'git clone,' so may be either using the https protocol or the git protocol") - - - ########## Clone and run extensions - extensions_parser = subparsers.add_parser("query", help="Run a query using the Bookworm API") - extensions_parser.add_argument("APIcall",help="The json-formatted query to be run.") - - - ########## Build components - extensions_parser = subparsers.add_parser("prep", help="Build individual components.", aliases = ['build']) - extensions_subparsers = extensions_parser.add_subparsers(title="goal", help="The name of the target.", dest="goal") - - # Bookworm prep targets that allow additional args - catalog_prep_parser = extensions_subparsers.add_parser("preDatabaseMetadata", - help=getattr(BookwormManager, "preDatabaseMetadata").__doc__) - - word_ingest_parser = extensions_subparsers.add_parser("database_wordcounts", - help=getattr(BookwormManager, "database_wordcounts").__doc__) - word_ingest_parser.add_argument("--no-delete", action="store_true", help="Do not delete and rebuild the token tables. Useful for a partially finished ingest.") - - word_ingest_parser.add_argument("--no-reverse-index", action="store_true", help="When creating the table, choose not to index _ncid/wordid/counts. This is useful for really large builds. Because this is specified at table creation time, it does nothing with --no-delete or --index-only.") - - word_ingest_parser.add_argument("--no-index", action="store_true", help="Do not re-enable keys after ingesting tokens. Only do this if you intent to manually enable keys or will run this command again.") - - word_ingest_parser.add_argument("--index-only", action="store_true", help="Only re-enable keys. Supercedes other flags.") - - # Bookworm prep targets that don't allow additional args - for prep_arg in BookwormManager.__dict__.keys(): - extensions_subparsers.add_parser(prep_arg, help=getattr(BookwormManager, prep_arg).__doc__) - - """ - Some special functions - """ - - init_parser = subparsers.add_parser("init",help="Initialize the current directory as a bookworm directory") - init_parser.add_argument("--force","-f",help="Overwrite some existing files.",default=False,action="store_true") - init_parser.add_argument("--yes","-y",help="Automatically use default values with no prompts",default=False,action="store_true") - # Serve the current bookworm @@ -592,6 +177,11 @@ def run_arguments(): configure_parser.add_argument("--users",nargs="+",choices=["admin","global","root"],help="The user levels you want to act on.",default=["admin","global"]) configure_parser.add_argument("--force","-f",help="Overwrite existing configurations in potentially bad ways.",action="store_true",default=False) + + configure_parser = subparsers.add_parser("query", help="query the API directly. Inefficient compared to using a running host.") + configure_parser.add_argument("APIcall", help="A JSON string.", type = json.loads) + + # Call the function args = parser.parse_args() # stash those away. 
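A minimal sketch, assuming a hypothetical on-disk db directory, of how the reworked CLI wiring above could be driven; the database name and query fields mirror the test fixtures, and the JSON APIcall string is converted to a dict by argparse (type=json.loads) before BookwormManager.query(args) sees it:

    import json
    import sys

    # Hypothetical invocation; "/data/bookworm_dbs" is an illustrative path only.
    sys.argv = [
        "bookworm",
        "--db-directory", "/data/bookworm_dbs",
        "--database", "federalist_bookworm",
        "query",
        json.dumps({
            "database": "federalist_bookworm",
            "method": "data",
            "format": "json",
            "search_limits": {"author": ["HAMILTON"]},
            "counttype": ["TextCount"],
            "groups": ["author"],
        }),
    ]
    # run_arguments() would parse these flags, build a BookwormManager,
    # and dispatch to its query() method with args.APIcall already a dict.
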
@@ -602,12 +192,18 @@ def run_arguments(): raise ValueError('Invalid log level: %s' % args.log_level) # While we're at it, log with line numbers FORMAT = "[%(filename)s:%(lineno)s-%(funcName)s() %(asctime)s.%(msecs)03d] %(message)s" - logging.basicConfig(format=FORMAT, level=numeric_level, datefmt="%I:%M:%S") + logging.basicConfig(format=FORMAT, datefmt="%I:%M:%S") + for logger_name in ["nonconsumptive", "bookworm"]: + logging.getLogger(logger_name).setLevel(numeric_level) + logger.info("Info logging enabled.") - logger.info("Debug logging enabled.") + logger.debug("Debug logging enabled.") # Create the bookworm my_bookworm = BookwormManager(args.configuration, args.database) # Call the current action with the arguments passed in. - getattr(my_bookworm,args.action)(args) + # bookworm build --carefully + # becomes + # BookwormMangager.build(carefully = True) + getattr(my_bookworm, args.action)(args) From fba5d497c3aa1a7a95ca513fdfa559b5dabfdf52 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Sat, 17 Jul 2021 23:31:52 -0400 Subject: [PATCH 36/41] Add API method to list all bookworms on endpoint --- bookwormDB/wsgi.py | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/bookwormDB/wsgi.py b/bookwormDB/wsgi.py index a4ff588..da28148 100644 --- a/bookwormDB/wsgi.py +++ b/bookwormDB/wsgi.py @@ -24,10 +24,7 @@ def content_type(query): if format == "json": return "application/json" - if format == "json_c": - return "application/json" - - if format == "feather": + if format == "feather" or format == "parquet": return "application/octet-stream" if format == "html": @@ -49,10 +46,13 @@ class DuckPool(dict): def __missing__(self, key): # Mother duck said 'quack quack quack quack' # and all of her five little duckies came back. - duck_dir = store()['duckdb_directory'] + duck_dir = args.db_directory self[key] = duckdb.connect(str(Path(duck_dir) / key), read_only = True) return self[key] - + def options(self): + duck_dir = args.db_directory + return [f.name for f in args.db_directory.glob("*") if f.is_file()] + duck_connections = DuckPool() if args.remote_host is None: @@ -103,7 +103,6 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): 'charset': 'utf-8' } - logger.debug("Received query {}".format(query)) start = datetime.now() @@ -120,15 +119,19 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): status = '404' start_response(status, list(headers.items())) return [b'{"status":"error", "message": "You have passed invalid JSON to the Bookworm API"}'] - - args = store()['args'] - - if args.cache == "none": - process = API(query=query, db=duck_connections[query['database']], **API_kwargs) - else: - process = Caching_API(query, query_cache, API, **API_kwargs) - - response_body = process.execute() + + if query['method'] and query['method'] == "endpoints": + query['format'] = "json" + response_body = json.dumps({ + 'status': 'success', + 'data': duck_connections.options() + }) + else: + if args.cache == "none": + process = API(query=query, db=duck_connections[query['database']], **API_kwargs) + else: + process = Caching_API(query, query_cache, API, **API_kwargs) + response_body = process.execute() # It might be binary already. headers['Content-type'] = content_type(query) @@ -151,9 +154,8 @@ def application(environ, start_response, logfile = "bookworm_queries.log"): # Copied from the gunicorn docs. 
- def number_of_workers(): - return (multiprocessing.cpu_count() * 2) + 1 + return (multiprocessing.cpu_count()) + 1 class StandaloneApplication(gunicorn.app.base.BaseApplication): @@ -180,7 +182,7 @@ def run(port = 10012, bind="0.0.0.0", workers = number_of_workers()): port: the service port bind: the host to bind to. Requests that don't match this address will be ignored. The default accepts all connections: 127.0.0.1 listens - only to localhost. + only to localhost, for when you're hiding it behind nginx or apache or something. """ if workers==0: workers = number_of_workers() From 53ff3211217b7ca2406a43add9db5faf48b78f5b Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Sun, 1 Aug 2021 23:19:43 -0400 Subject: [PATCH 37/41] Change json standard to something more modern --- bookwormDB/duckdb.py | 1 - bookwormDB/general_API.py | 16 ++++++++++--- bookwormDB/manager.py | 18 ++++++++++++--- tests/test_API.py | 47 +++++++++++++++++++-------------------- 4 files changed, 51 insertions(+), 31 deletions(-) diff --git a/bookwormDB/duckdb.py b/bookwormDB/duckdb.py index a9669e7..7564818 100644 --- a/bookwormDB/duckdb.py +++ b/bookwormDB/duckdb.py @@ -102,7 +102,6 @@ def defaults(self): self.search_limits = query_object['search_limits'] self.words_collation = query_object.get('words_collation', "Case_Sensitive") - lookups = { "Case_Insensitive":'lowercase', 'lowercase':'lowercase', diff --git a/bookwormDB/general_API.py b/bookwormDB/general_API.py index e504340..781aabd 100644 --- a/bookwormDB/general_API.py +++ b/bookwormDB/general_API.py @@ -200,6 +200,16 @@ def need_comparison_query(count_types): needing_fields = [c for c in count_types if not c in ["WordCount","TextCount"]] return len(needing_fields) != 0 +def dates_to_iso(frame): + + for column in frame.columns: + if "date" in str(frame[column].dtype): + frame[column] = frame[column].apply(lambda x: x.isoformat()) + else: + print(str(frame[column].dtype)) + return frame + + def base_count_types(list_of_final_count_types): """ the final count types are calculated from some base types across both @@ -470,7 +480,8 @@ def execute(self): frame = self.data() if fmt == "json": - val = frame.to_dict(orient = "records") + + val = dates_to_iso(frame).to_dict(orient = "records") return self._prepare_response(val, version = 2) if fmt == "csv": @@ -736,11 +747,10 @@ def generate_pandas_frame(self, call = None): q = DuckQuery(call, db = self.db) if call['method'] == 'schema': m = q.databaseScheme.to_pandas() - print(m) return m query = q.query() logger.warning("Preparing to execute {}".format(query)) - df = self.db.execute(query).df() + df = dates_to_iso(self.db.execute(query).df()) logger.debug("Query retrieved") return df diff --git a/bookwormDB/manager.py b/bookwormDB/manager.py index b9c3ad9..40e8c67 100644 --- a/bookwormDB/manager.py +++ b/bookwormDB/manager.py @@ -9,6 +9,7 @@ import nonconsumptive as nc from .store import store import logging +import yaml from nonconsumptive.commander import namespace_to_kwargs, add_builder_parameters logger = logging.getLogger("bookworm") @@ -116,10 +117,10 @@ def run_arguments(): "fuller record, and 'debug' dumps many db queries, etc.", choices=["warning","info","debug"],type=str.lower,default="warning") - parser.add_argument("--ngrams",nargs="+",default=["unigrams"],help="What levels to parse with. Multiple arguments should be unquoted in spaces. This option currently does nothing.") + parser.add_argument("--ngrams", nargs="+", default=["unigrams"],help="What levels to parse with. 
Multiple arguments should be unquoted in spaces. This option currently does nothing.") - parser.add_argument("--db-directory", required = True, help = "" - "Directory where duckdb databases live.", type = Path) + parser.add_argument("--db-directory", help = "" + "Directory where duckdb databases live.", default = None, type = Path) parser.add_argument("--database", "-d", help = "" "The database name inside db-folder for this command. " @@ -184,6 +185,10 @@ def run_arguments(): # Call the function args = parser.parse_args() + if args.db_directory is None: + args.db_directory = Path(default_db_directory()) + if args.db_directory is None: + raise ValueError("You must specify a db directory or include one in a local config file.") # stash those away. store()['args'] = args # Set the logging level based on the input. @@ -207,3 +212,10 @@ def run_arguments(): # becomes # BookwormMangager.build(carefully = True) getattr(my_bookworm, args.action)(args) + +def default_db_directory(): + for p in [Path.home() / ".bookworm.yml"]: + if p.exists(): + ks = yaml.safe_load(p.open()) + if "db_directory" in ks: + return ks["db_directory"] \ No newline at end of file diff --git a/tests/test_API.py b/tests/test_API.py index bfa8834..4c7e4b9 100644 --- a/tests/test_API.py +++ b/tests/test_API.py @@ -68,7 +68,7 @@ def test_groups(self, federalist_bookworm): "method":"data", "format":"json" } - m = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + m = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] assert len(m) == 5 def test_multiword_search(self, federalist_bookworm): @@ -82,8 +82,8 @@ def test_multiword_search(self, federalist_bookworm): "groups": [] } - m = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] - assert m[0] > 33 + m = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] + assert m[0]["TextPercent"] > 33 def test_ne_with_one_entry(self, federalist_bookworm): import json @@ -98,7 +98,7 @@ def test_ne_with_one_entry(self, federalist_bookworm): "method":"data", "format":"json" } - m = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + m = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] assert len(m)==4 def test_ne_with_two_entries(self, federalist_bookworm): @@ -114,7 +114,7 @@ def test_ne_with_two_entries(self, federalist_bookworm): "method":"data", "format":"json" } - m = json.loads(DuckDBCall(federalist_bookworm, query= query).execute())['data'] + m = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] assert len(m)==3 @@ -131,7 +131,7 @@ def test_ne_with_two_entries(self, federalist_bookworm): "method":"data", "format":"json" } - m = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + m = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] assert len(m)==3 @@ -151,7 +151,7 @@ def test_or_with_two_entries(self, federalist_bookworm): "method":"data", "format":"json" } - m = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + m = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] assert len(m) == 2 def test_lte_and_gte(self, federalist_bookworm): @@ -167,9 +167,8 @@ def test_lte_and_gte(self, federalist_bookworm): "method":"data", "format":"json" } - m = json.loads(DuckDBCall(federalist_bookworm, query = query).execute()) - print(m) - assert len(m['data'])==6 + m = 
json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] + assert len(m)==6 def test_and_with_two_entries(self, federalist_bookworm): import json @@ -187,7 +186,7 @@ def test_and_with_two_entries(self, federalist_bookworm): "method":"data", "format":"json" } - m = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + m = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] assert len(m)==0 def ftest_adding_metadata_to_bookworm(self): @@ -258,14 +257,14 @@ def test_case_sensitivity(self, federalist_bookworm): "method":"data", "format":"json" } - val1 = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] - assert(val1[0] > 0) + val1 = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] + assert(val1[0]["WordCount"] > 0) query["words_collation"] = "Case_Insensitive" - val2= json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + val2= json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] # The words ('The','the') appear more often than ('the') alone. - assert (val2[0] > val1[0]) + assert (val2[0]["WordCount"] > val1[0]["WordCount"]) def test_case_insensitivity_works_without_search_term_existing(self, federalist_bookworm): @@ -277,8 +276,8 @@ def test_case_insensitivity_works_without_search_term_existing(self, federalist_ "words_collation":"Case_Insensitive", "method":"data", "format":"json" } - val = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] - assert (val[0] > 0) + val = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] + assert (val[0]["WordCount"] > 0) def test_unicode_search_term(self, unicode_bookworm): query = { @@ -289,8 +288,8 @@ def test_unicode_search_term(self, unicode_bookworm): "words_collation":"Case_Insensitive", "method":"data", "format":"json" } - val = json.loads(DuckDBCall(unicode_bookworm, query = query).execute())['data'] - assert (val[0] > 0) + val = json.loads(DuckDBCall(db = unicode_bookworm, query = query).execute())['data'] + assert (val[0]["WordCount"] > 0) def test_various_unicode_cases(self, unicode_bookworm): # There's a 'description_' for each individual item. 
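The updated assertions above reflect the newer JSON layout: with format "json", execute() returns a records-oriented payload, one dict per result row keyed by the requested groups and count types, wrapped in a status/data envelope. A hedged illustration with invented numbers:

    # Illustrative shape only; the values are made up.
    example_response = {
        "status": "success",
        "data": [
            {"author": "HAMILTON", "TextCount": 4},
            {"author": "MADISON", "TextCount": 2},
        ],
    }
    hamilton_texts = example_response["data"][0]["TextCount"]
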
@@ -310,11 +309,11 @@ def test_various_unicode_cases(self, unicode_bookworm): "method": "data", "format": "json" } try: - val = json.loads(DuckDBCall(unicode_bookworm, query = query).execute())['data'] + val = json.loads(DuckDBCall(db = unicode_bookworm, query = query).execute())['data'] except KeyError: - print(DuckDBCall(unicode_bookworm, query = query).execute()) + print(DuckDBCall(db = unicode_bookworm, query = query).execute()) raise - assert(val[0] > 0) + assert(val[0]["WordCount"] > 0) def test_asterisks_in_search_limits(self, federalist_bookworm): """ @@ -329,7 +328,7 @@ def test_asterisks_in_search_limits(self, federalist_bookworm): "method":"data", "format":"json" } - val1 = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + val1 = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] query = { "database":"federalist_bookworm", @@ -339,6 +338,6 @@ def test_asterisks_in_search_limits(self, federalist_bookworm): "method":"data", "format":"json" } val2 = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] - assert(val1[0] == val2[0]) + assert(val1[0]["WordsPerMillion"] == val2[0]["WordsPerMillion"]) \ No newline at end of file From 433d2fca1fe4754cef0c4f175655ad5b0a01a52f Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Mon, 2 Aug 2021 23:13:06 -0400 Subject: [PATCH 38/41] bigrams partially --- bookwormDB/__init__.py | 4 +++ bookwormDB/builder.py | 20 +++++++++++-- bookwormDB/duckdb.py | 4 +-- bookwormDB/general_API.py | 63 ++++----------------------------------- 4 files changed, 29 insertions(+), 62 deletions(-) diff --git a/bookwormDB/__init__.py b/bookwormDB/__init__.py index e69de29..67ca668 100644 --- a/bookwormDB/__init__.py +++ b/bookwormDB/__init__.py @@ -0,0 +1,4 @@ +from .builder import BookwormCorpus +from .general_API import DuckDBCall + +n = 1 \ No newline at end of file diff --git a/bookwormDB/builder.py b/bookwormDB/builder.py index 1697eba..73034aa 100644 --- a/bookwormDB/builder.py +++ b/bookwormDB/builder.py @@ -28,9 +28,23 @@ def encoded_batches(self): def bookworm_name(self): return self.db_location.with_suffix("").name - def sort_unigrams(self, block_size = 5_000_000): + def sort_unigrams(self, block_size = 500_000_000): from_files((self.root / "encoded_unigrams").glob("*"), ['wordid', '_ncid'], self.root / 'unigram__ncid.parquet', block_size = block_size) + def ingest_ngrams_ncid(self, levels = ['bigram'], block_size = 500_000_000): + con = self.con + for i, f in enumerate(levels): + con.execute(f"DROP TABLE IF EXISTS {f}__ncid") + ngrams = i + 1 + logging.info(f"Creating {f} table.") + sort_order = [f"word{i + 1}" for i in range(ngrams)] + ["_ncid"] + ingest_file = self.root / f"{f}__ncid.parquet" + inputs = [*(self.root / f"encoded_{f}s").glob("*")] + print(inputs) + from_files(inputs, sort_order, ingest_file, block_size = block_size) + con.execute(f"CREATE TABLE {f}__ncid AS SELECT * FROM parquet_scan('{ingest_file}')") + ingest_file.unlink() + def prepare_metadata(self): self.metadata.to_flat_catalog() @@ -62,7 +76,9 @@ def ingest_unigram__ncid(self): con = self.con wordids = self.root / 'unigram__ncid.parquet' con.execute(f"CREATE TABLE IF NOT EXISTS unigram__ncid AS SELECT * FROM parquet_scan('{wordids}')") - + wordids.unlink() + + def ingest_metadata(self): for tabpath in self.flat_tabs(): name = tabpath.with_suffix("").name diff --git a/bookwormDB/duckdb.py b/bookwormDB/duckdb.py index 7564818..01d877d 100644 --- a/bookwormDB/duckdb.py +++ b/bookwormDB/duckdb.py @@ -224,7 
+224,7 @@ def main_table(self): if self.gram_size() == 1: return '"unigram__ncid" as main' if self.gram_size() == 2: - return '"word1_word2__ncid" as main' + return '"bigram__ncid" as main' def full_query_tables(self): # Joins are needed to provide groups, but *not* to provide @@ -418,7 +418,7 @@ def build_wordstables(self): if needsBigrams: self.main = ''' - bigrams_word1_word2 as main + bigrams__ncid as main ''' self.wordstables = """ diff --git a/bookwormDB/general_API.py b/bookwormDB/general_API.py index 781aabd..d27b619 100644 --- a/bookwormDB/general_API.py +++ b/bookwormDB/general_API.py @@ -2,6 +2,7 @@ from pandas import merge, Series, set_option, DataFrame from pandas.io.sql import read_sql +import pandas as pd from pyarrow import feather from copy import deepcopy from collections import defaultdict @@ -480,8 +481,9 @@ def execute(self): frame = self.data() if fmt == "json": - - val = dates_to_iso(frame).to_dict(orient = "records") + val = dates_to_iso(frame) + val = val.where(pd.notnull(val), None) + val = val.to_dict(orient = "records") return self._prepare_response(val, version = 2) if fmt == "csv": @@ -498,7 +500,7 @@ def execute(self): try: feather.write_feather(frame, fout, compression = compression) except: - logger.warning("You need the pyarrow package installed to export as feather.") + logger.error("You need the pyarrow package installed to export as feather.") raise fout.seek(0) return fout.read() @@ -568,26 +570,6 @@ def html(self, data): set_option('display.max_colwidth', -1) return data.to_html(escape = False, index = False) - - def return_rle_json(self, data): - """ - Return data in column-oriented format with run-length encoding - on duplicate values. - """ - - if isinstance(data, Series) and 'status' in data: - # If data has a status, Bookworm is trying to send us an error - return data.to_json() - - output = {'status':'success', 'data':{}} - - for k in data: - series = data[k] - output['data'][k] = rle(data[k].tolist()) - - return json.dumps(output) - - def return_json(self, raw_python_object=False, version=1): ''' Get JSON data for a single search_limit. @@ -654,41 +636,6 @@ def _prepare_response(self, data, version=1): return json.dumps(resp) - -class oldSQLAPIcall(APIcall): - """ - To make a new backend for the API, you just need to extend the base API - call class like this. - - This one is comically short because all the real work is done in the - userquery object. - - But the point is, you need to define a function "generate_pandas_frame" - that accepts an API call and returns a pandas frame. - - But that API call is more limited than the general API; you only need to - support "WordCount" and "TextCount" methods. - """ - - def generate_pandas_frame(self, call = None): - """ - - This is good example of the query that actually fetches the results. - It creates some SQL, runs it, and returns it as a pandas DataFrame. - - The actual SQL production is handled by the userquery class, which uses - more legacy code. 
- - """ - - if call is None: - call = self.query - - con = DbConnect(prefs, self.query['database']) - q = userquery(call).query() - df = read_sql(q, con.db) - return df - class MetaAPIcall(APIcall): def __init__(self, endpoints): self.endpoints = endpoints From e370fded9ef12f7b915e46ac7c60e5a647f4a6ff Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Fri, 20 Aug 2021 16:00:39 -0400 Subject: [PATCH 39/41] Improved time handling --- bookwormDB/DuckSchema.py | 21 ++++++++---- bookwormDB/builder.py | 19 +++++++---- bookwormDB/duckdb.py | 68 ++++++++++++++++++++++++++++++--------- bookwormDB/general_API.py | 1 + 4 files changed, 82 insertions(+), 27 deletions(-) diff --git a/bookwormDB/DuckSchema.py b/bookwormDB/DuckSchema.py index ec2124a..78f2339 100644 --- a/bookwormDB/DuckSchema.py +++ b/bookwormDB/DuckSchema.py @@ -19,7 +19,7 @@ class DuckSchema(object): def __init__(self, db): # XXXX self.db = db - + self._records = None # hash of what table each variable is in self.tableToLookIn = { '_ncid': 'fastcat', @@ -77,11 +77,14 @@ def __init__(self, db): self.tableToLookIn[field] = "slowcat" self.anchorFields[field] = "_ncid" - def to_pandas(self): + @property + def records(self): """ Return a JSON representation of the schema. """ - fields = [] + if self._records is not None: + return self._records + fields = {} for field in self.fields: name = field.name if name.endswith("__id"): @@ -91,9 +94,15 @@ def to_pandas(self): elif str(field.type) == 'old_string': continue else: - fields.append({'dbname': name, 'dtype': str(field.type)}) - return pd.DataFrame(fields) - + fields[name] = {'dbname': name, 'dtype': str(field.type)} + for k, v in field.metadata.items(): + fields[name][k.decode('utf-8')] = v.decode('utf-8') + self._records = fields + return fields + + def to_pandas(self): + return pd.DataFrame([*self.records.values()]) + def tables_for_variable(self, variable, depth = 0): """ Returns the tables needed to look up a variable, back up to 'fastcat' or 'wordsheap' diff --git a/bookwormDB/builder.py b/bookwormDB/builder.py index 73034aa..5e48561 100644 --- a/bookwormDB/builder.py +++ b/bookwormDB/builder.py @@ -28,10 +28,13 @@ def encoded_batches(self): def bookworm_name(self): return self.db_location.with_suffix("").name - def sort_unigrams(self, block_size = 500_000_000): - from_files((self.root / "encoded_unigrams").glob("*"), ['wordid', '_ncid'], self.root / 'unigram__ncid.parquet', block_size = block_size) + def sort_unigrams(self, block_size = 2_500_000_000): + target = self.root / 'unigram__ncid.parquet' + if target.exists(): + logging.info(f"{target} already exists, skipping sort.") + from_files((self.root / "encoded_unigrams").glob("*"), ['wordid', '_ncid'], target, block_size = block_size) - def ingest_ngrams_ncid(self, levels = ['bigram'], block_size = 500_000_000): + def ingest_ngrams_ncid(self, levels = ['bigram'], block_size = 2_500_000_000): con = self.con for i, f in enumerate(levels): con.execute(f"DROP TABLE IF EXISTS {f}__ncid") @@ -43,7 +46,7 @@ def ingest_ngrams_ncid(self, levels = ['bigram'], block_size = 500_000_000): print(inputs) from_files(inputs, sort_order, ingest_file, block_size = block_size) con.execute(f"CREATE TABLE {f}__ncid AS SELECT * FROM parquet_scan('{ingest_file}')") - ingest_file.unlink() +# ingest_file.unlink() def prepare_metadata(self): self.metadata.to_flat_catalog() @@ -76,7 +79,7 @@ def ingest_unigram__ncid(self): con = self.con wordids = self.root / 'unigram__ncid.parquet' con.execute(f"CREATE TABLE IF NOT EXISTS unigram__ncid AS SELECT * FROM 
parquet_scan('{wordids}')") - wordids.unlink() +# wordids.unlink() def ingest_metadata(self): @@ -143,7 +146,11 @@ def build(self): logger.info("Preparing metadata") self.prepare_metadata() logger.info("Creating unigrams for duck ingest") - self.cache("encoded_unigrams") + for k in ["token_counts", "tokenization"]: + if k in self.cache_set: + self.multiprocess(k) + self.total_wordcounts # To cache it + self.multiprocess("encoded_unigrams") logger.info("Sorting unigrams for duck ingest") self.sort_unigrams() logger.info("Ingesting unigrams") diff --git a/bookwormDB/duckdb.py b/bookwormDB/duckdb.py index 01d877d..ba96b75 100644 --- a/bookwormDB/duckdb.py +++ b/bookwormDB/duckdb.py @@ -109,9 +109,12 @@ def defaults(self): "case_insensitive":"lowercase", "Case_Sensitive":"word", 'stem':'stem'} + try: + self.word_field = lookups[self.words_collation] + except KeyError: + self.word_field = 'word' - self.word_field = lookups[self.words_collation] - + """ @property def groups(self): if self._groups: @@ -164,10 +167,10 @@ def groups(self): except KeyError: self.groups.add('"' + group + '"') - """ + "" There are the selections which can include table refs, and the groupings, which may not: and the final suffix to enable fast lookup - """ + "" self.selections = ",".join(self.groups) self.groupings = ",".join([group for group in self.groups]) @@ -177,7 +180,7 @@ def groups(self): self.counttype = query_object['counttype'] if isinstance(self.counttype, (str)): self.counttype = [self.counttype] - + """ @property def word_limits(self): if 'word' in self.limits: @@ -206,19 +209,53 @@ def wordid_query(self): return " wordid IN ({})".format(f) else: return " TRUE " - + + def time_rounding(self, fieldname): + if not 'date_resolution' in self.query_object: + return ['year'] + elif isinstance(self.query_object['date_resolution'], (str)): + return self.query_object['date_resolution'] + elif fieldname in self.query_object['date_resolution']: + return self.query_object['date_resolution'][fieldname] + else: + return ['year', 'month'] + def make_group_query(self): + # Based on groups, determine what the groupings are, and the selections including alises. + # Includes date parsing. + aliases = [] + fields = [] for g in self.query_object["groups"]: - try: - aliases.append(self.databaseScheme.aliases[g]) - except KeyError: - aliases.append(g) + wrapped = f'"{g}"' + if g in self.databaseScheme.aliases: + alias = f'"{self.databaseScheme.aliases[g]}"' +# aliases.append(alias) + aliases.append(f"FIRST({wrapped}) as {wrapped}") + fields.append(alias) + elif self.databaseScheme.records[g]['dtype'].startswith("date"): + resolutions = set(self.time_rounding(g)) + date_exprs = [] + for res in ['year', 'month', 'day']: + if res in resolutions: + n = 2 + if res == 'year': + n = 4 + date_exprs.append(f'''FIRST(LPAD({res}("{g}")::char, {n}, '0'))''') + fields.append(f'{res}("{g}")') + else: + if res == 'year': + date_exprs.append("'-'") + # year needs a leading dash "--12-07", though support is rare + aliases.append(" || ".join(date_exprs) + f'AS {wrapped}') # Concatenate strings. 
+ else: + aliases.append(wrapped) + fields.append(wrapped) + grouping = " " if len(self.query_object["groups"]) > 0: - return "GROUP BY {}".format(", ".join(self.query_object["groups"])) - else: - return " " + grouping = "GROUP BY " + ", ".join(fields) + return aliases, grouping def main_table(self): if self.gram_size() == 1: @@ -254,8 +291,9 @@ def query_tables(self): return " NATURAL JOIN ".join(tables) def base_query(self): + group_aliases, group_object = self.make_group_query() return f""" - SELECT {', '.join(self.set_operations() + self.query_object['groups'])} + SELECT {', '.join(self.set_operations() + group_aliases)} FROM {self.query_tables} WHERE {self._ncid_query()} @@ -263,7 +301,7 @@ def base_query(self): {self.wordid_query} AND {self.catwhere} - {self.make_group_query()} + {group_object} """ @property diff --git a/bookwormDB/general_API.py b/bookwormDB/general_API.py index d27b619..68785d0 100644 --- a/bookwormDB/general_API.py +++ b/bookwormDB/general_API.py @@ -483,6 +483,7 @@ def execute(self): if fmt == "json": val = dates_to_iso(frame) val = val.where(pd.notnull(val), None) + val.replace([np.inf, -np.inf], None, inplace=True) val = val.to_dict(orient = "records") return self._prepare_response(val, version = 2) From ec01df0a3e68a230116c55f02765742ebe9f7970 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Fri, 10 Sep 2021 10:38:35 -0400 Subject: [PATCH 40/41] Use duckdb to sort the big tables on ingest instead of custom sort routines. --- bookwormDB/builder.py | 31 +++++++++++++++++++++++-------- bookwormDB/duckdb.py | 13 ++++++++++++- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/bookwormDB/builder.py b/bookwormDB/builder.py index 5e48561..f116565 100644 --- a/bookwormDB/builder.py +++ b/bookwormDB/builder.py @@ -7,7 +7,7 @@ from nonconsumptive.metadata import Catalog from pathlib import Path import logging -from pyarrow import feather, parquet +from pyarrow import feather, parquet, dataset logger = logging.getLogger("bookworm") class BookwormCorpus(Corpus): @@ -29,6 +29,7 @@ def bookworm_name(self): return self.db_location.with_suffix("").name def sort_unigrams(self, block_size = 2_500_000_000): + raise("Deprecated method.") target = self.root / 'unigram__ncid.parquet' if target.exists(): logging.info(f"{target} already exists, skipping sort.") @@ -77,10 +78,26 @@ def ingest_wordids(self): def ingest_unigram__ncid(self): con = self.con - wordids = self.root / 'unigram__ncid.parquet' - con.execute(f"CREATE TABLE IF NOT EXISTS unigram__ncid AS SELECT * FROM parquet_scan('{wordids}')") -# wordids.unlink() + encoded = dataset(self.root / 'encoded_unigrams', format = "feather") + con.register_arrow("unigrams_dataset", encoded) + con.execute("CREATE TABLE IF NOT EXISTS unigram__ncid AS SELECT wordid, _ncid, count FROM unigrams_dataset ORDER BY wordid, _ncid") + def ingest_ngram__ncid(self, ngrams = 2): + con = self.con + lookup = [ + None, + None, + "bigram", + "trigram", + "quadgram", + "quintgram" + ] + name = lookup[ngrams] + encoded = dataset(self.root / f'encoded_{name}s', format = "feather") + con.register_arrow("dataset", encoded) + word_cols = ",".join([f"word{i + 1}" for i in range(ngrams)]) + con.execute(f"CREATE TABLE IF NOT EXISTS {name}__ncid AS SELECT {word_cols}, _ncid, count " + "FROM unigrams_dataset ORDER BY {word_cols}, _ncid") def ingest_metadata(self): for tabpath in self.flat_tabs(): @@ -128,7 +145,7 @@ def ingest_wordcounts(self): for batch in self.iter_over('document_lengths', ids = "@id"): seen_a_word = True tb = 
pa.Table.from_batches([batch]) - self.con.register_arrow("t", tb) + self.con.register("t", tb.to_pandas()) self.con.execute('INSERT INTO nwords ("@id", nwords) SELECT * FROM t') self.con.unregister("t") if not seen_a_word: @@ -151,10 +168,8 @@ def build(self): self.multiprocess(k) self.total_wordcounts # To cache it self.multiprocess("encoded_unigrams") - logger.info("Sorting unigrams for duck ingest") - self.sort_unigrams() - logger.info("Ingesting unigrams") self.ingest_wordids() + logger.info("Sorting and ingesting unigrams") self.ingest_unigram__ncid() # logger.warning("Ingesting bigrams") logger.info("Ingesting metadata") diff --git a/bookwormDB/duckdb.py b/bookwormDB/duckdb.py index ba96b75..7859539 100644 --- a/bookwormDB/duckdb.py +++ b/bookwormDB/duckdb.py @@ -10,6 +10,15 @@ import logging logger = logging.getLogger("bookworm") +""" +APOLOGIA: + +This is the oldest part of the code base. There are a lot of places +where I didn't know how to do things yet, and probably more unused functions +than elsewhere. + +""" + def fail_if_nonword_characters_in_columns(input): keys = all_keys(input) for key in keys: @@ -262,7 +271,9 @@ def main_table(self): return '"unigram__ncid" as main' if self.gram_size() == 2: return '"bigram__ncid" as main' - + if self.gram_size() == 3: + return '"trigram__ncid" as main' + def full_query_tables(self): # Joins are needed to provide groups, but *not* to provide # provide evidence for wheres. From 3d785d645eac94c3e47158e09c195cb2d09bcad6 Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Mon, 27 Sep 2021 10:22:54 -0400 Subject: [PATCH 41/41] Change sorting strategy to a more scalable version of quacksort --- bookwormDB/DuckSchema.py | 8 ++- bookwormDB/builder.py | 109 ++++++++++++++++++++++++++------- bookwormDB/manager.py | 3 +- tests/test_API.py | 7 ++- tests/test_creation.py | 8 ++- tests/test_sql_construction.py | 4 +- 6 files changed, 108 insertions(+), 31 deletions(-) diff --git a/bookwormDB/DuckSchema.py b/bookwormDB/DuckSchema.py index 78f2339..d1ae79d 100644 --- a/bookwormDB/DuckSchema.py +++ b/bookwormDB/DuckSchema.py @@ -95,8 +95,10 @@ def records(self): continue else: fields[name] = {'dbname': name, 'dtype': str(field.type)} - for k, v in field.metadata.items(): - fields[name][k.decode('utf-8')] = v.decode('utf-8') + if field.metadata: + for k, v in field.metadata.items(): + fields[name][k.decode('utf-8')] = v.decode('utf-8') + self._records = fields return fields @@ -107,7 +109,7 @@ def tables_for_variable(self, variable, depth = 0): """ Returns the tables needed to look up a variable, back up to 'fastcat' or 'wordsheap' """ - if variable == '_ncid' or variable == 'wordid': + if variable == '_ncid' or variable == 'wordid' or (variable.startswith("word") and len(variable) == 5): return [] vals = [] try: diff --git a/bookwormDB/builder.py b/bookwormDB/builder.py index f116565..b2fc05e 100644 --- a/bookwormDB/builder.py +++ b/bookwormDB/builder.py @@ -8,6 +8,7 @@ from pathlib import Path import logging from pyarrow import feather, parquet, dataset +from ducksauce import from_files logger = logging.getLogger("bookworm") class BookwormCorpus(Corpus): @@ -16,9 +17,10 @@ class BookwormCorpus(Corpus): not be used to managed existing ones or in a multi-threaded context. 
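
The records change in DuckSchema.py above guards against pyarrow fields that carry no metadata at all (field.metadata is None rather than an empty dict), which would otherwise crash the decode loop. A self-contained sketch of that pattern, with invented field names and metadata:

import pyarrow as pa

annotated = pa.field("author", pa.string(), metadata = {"derived": "lowercase"})  # invented metadata
plain = pa.field("nwords", pa.int32())   # field.metadata is None here

for f in (annotated, plain):
    info = {"dbname": f.name, "dtype": str(f.type)}
    if f.metadata:                          # skip the decode loop when there is no metadata
        for k, v in f.metadata.items():     # keys and values come back as bytes
            info[k.decode("utf-8")] = v.decode("utf-8")
    print(info)
# {'dbname': 'author', 'dtype': 'string', 'derived': 'lowercase'}
# {'dbname': 'nwords', 'dtype': 'int32'}
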
""" - def __init__(self, db_location, *args, **kwargs): + def __init__(self, db_location, ngrams, *args, **kwargs): self.db_location = Path(db_location) self._connection = None + self.ngrams = ngrams super().__init__(*args, **kwargs) def encoded_batches(self): @@ -28,13 +30,7 @@ def encoded_batches(self): def bookworm_name(self): return self.db_location.with_suffix("").name - def sort_unigrams(self, block_size = 2_500_000_000): - raise("Deprecated method.") - target = self.root / 'unigram__ncid.parquet' - if target.exists(): - logging.info(f"{target} already exists, skipping sort.") - from_files((self.root / "encoded_unigrams").glob("*"), ['wordid', '_ncid'], target, block_size = block_size) - + """ def ingest_ngrams_ncid(self, levels = ['bigram'], block_size = 2_500_000_000): con = self.con for i, f in enumerate(levels): @@ -46,9 +42,8 @@ def ingest_ngrams_ncid(self, levels = ['bigram'], block_size = 2_500_000_000): inputs = [*(self.root / f"encoded_{f}s").glob("*")] print(inputs) from_files(inputs, sort_order, ingest_file, block_size = block_size) - con.execute(f"CREATE TABLE {f}__ncid AS SELECT * FROM parquet_scan('{ingest_file}')") -# ingest_file.unlink() - + con.execute(f"CREATE TABLE {f}__ncid AS SELECT {} FROM parquet_scan('{ingest_file}')") + """ def prepare_metadata(self): self.metadata.to_flat_catalog() @@ -76,11 +71,47 @@ def ingest_wordids(self): logger.debug("INGESTING INTO wordsheap") con.execute(f"CREATE TABLE wordsheap AS SELECT wordid, token as word, lower(token) as lowercase FROM words") + def create_sorted_ngram_ncid(self, ngrams : int, force : bool = False): + lookup = [ + None, + "unigram", + "bigram", + "trigram", + "quadgram", + "quintgram" + ] + name = lookup[ngrams] + + path = self.root / f'{name}__ncid.parquet' + inputs : List[Path]= [*(self.root / f'encoded_{name}s').glob("*.feather")] + columns : List[str] = [] # Defined below. 
+ if path.exists(): + last_time = max([p.stat().st_mtime for p in inputs]) + if path.stat().st_mtime < last_time or force: + path.unlink() + else: + return + if ngrams > 1: + columns = [f'word{i + 1}' for i in range(ngrams)] + else: + columns = ['wordid'] + logging.getLogger("ducksauce").setLevel(logging.getLogger("bookworm").level) + from_files(inputs, [*columns, '_ncid'], path, block_size = 1_000_000_000) + def ingest_unigram__ncid(self): con = self.con - encoded = dataset(self.root / 'encoded_unigrams', format = "feather") + self.create_sorted_ngram_ncid(ngrams = 1) + wordids = self.root / 'unigram__ncid.parquet' + con.execute(f"CREATE TABLE IF NOT EXISTS unigram__ncid AS SELECT * FROM parquet_scan('{wordids}')") + schema = parquet.ParquetFile(wordids).schema_arrow + self.insert_table_schema("unigram__ncid", schema) + +# wordids.unlink() + """ + encoded = dataset.dataset(self.root / 'encoded_unigrams', format = "feather") con.register_arrow("unigrams_dataset", encoded) con.execute("CREATE TABLE IF NOT EXISTS unigram__ncid AS SELECT wordid, _ncid, count FROM unigrams_dataset ORDER BY wordid, _ncid") + """ def ingest_ngram__ncid(self, ngrams = 2): con = self.con @@ -93,21 +124,31 @@ def ingest_ngram__ncid(self, ngrams = 2): "quintgram" ] name = lookup[ngrams] - encoded = dataset(self.root / f'encoded_{name}s', format = "feather") + + wordids = self.root / f'{name}__ncid.parquet' + self.create_sorted_ngram_ncid(ngrams) + con.execute(f"CREATE TABLE IF NOT EXISTS {name}__ncid AS SELECT * FROM parquet_scan('{wordids}')") + schema = parquet.ParquetFile(wordids).schema_arrow + self.insert_table_schema(f"{name}__ncid", schema) + + """ + encoded = dataset.dataset(self.root / f'encoded_{name}s', format = "feather") con.register_arrow("dataset", encoded) - word_cols = ",".join([f"word{i + 1}" for i in range(ngrams)]) + word_cols = ", ".join([f"word{i + 1}" for i in range(ngrams)]) + + con.execute(f"DROP TABLE IF EXISTS {name}__ncid") con.execute(f"CREATE TABLE IF NOT EXISTS {name}__ncid AS SELECT {word_cols}, _ncid, count " - "FROM unigrams_dataset ORDER BY {word_cols}, _ncid") + f"FROM dataset order by {word_cols}, _ncid") + self.insert_table_schema(name + "__ncid", encoded.schema) + """ - def ingest_metadata(self): + def ingest_metadata(self) -> None: for tabpath in self.flat_tabs(): name = tabpath.with_suffix("").name self.con.execute(f"CREATE TABLE {name} AS SELECT * FROM parquet_scan('{tabpath}')") def create_table_schemas(self): con = self.con - con.execute('DROP TABLE IF EXISTS arrow_schemas') - con.execute('CREATE TABLE arrow_schemas (name VARCHAR, schema VARCHAR, type VARCHAR)') insertion = 'INSERT INTO arrow_schemas VALUES (?, ?, ?)' rich = self.metadata.tb @@ -118,13 +159,18 @@ def create_table_schemas(self): # DuckDB can't yet handle blob inserts from python. 
# https://github.com/duckdb/duckdb/issues/1703 - for tab in [*self.flat_tabs()] + [self.root / Path("unigram__ncid.parquet"), self.root / 'wordids.parquet']: + for tab in [*self.flat_tabs()] + [self.root / 'wordids.parquet']: tabname = tab.with_suffix("").name if tabname in ["sorted", "wordids"]: continue con.execute(insertion, (tabname, b64encode(pa.parquet.ParquetFile(tab).schema_arrow.serialize().to_pybytes()), "table")) + + def insert_table_schema(self, tabname, schema): + con = self.con + insertion = 'INSERT INTO arrow_schemas VALUES (?, ?, ?)' + con.execute(insertion, (tabname, b64encode(schema.serialize().to_pybytes()), "table")) def create_slow_catalog(self): con = self.con @@ -138,6 +184,9 @@ def create_slow_catalog(self): con.execute(f"CREATE VIEW slowcat AS SELECT {','.join(unique)} FROM catalog") def ingest_wordcounts(self): + """ + The total wordcounts for each document. + """ self.con.execute('DROP TABLE IF EXISTS nwords') self.con.execute('CREATE TABLE nwords ("@id" STRING, "nwords" INTEGER)') logger.info("Creating nwords") @@ -162,22 +211,40 @@ def ingest_wordcounts(self): def build(self): logger.info("Preparing metadata") self.prepare_metadata() + logger.info("Creating unigrams for duck ingest") for k in ["token_counts", "tokenization"]: if k in self.cache_set: self.multiprocess(k) self.total_wordcounts # To cache it self.multiprocess("encoded_unigrams") + ngrams = self.ngrams + if ngrams > 1: + self.multiprocess("encoded_bigrams") + if ngrams > 2: + self.multiprocess("encoded_trigrams") + if ngrams > 3: + self.multiprocess("encoded_quadgrams") + if ngrams > 4: + self.multiprocess("encoded_quintgrams") + + con = self.con + con.execute('CREATE TABLE IF NOT EXISTS arrow_schemas (name VARCHAR, schema VARCHAR, type VARCHAR)') + self.ingest_wordids() logger.info("Sorting and ingesting unigrams") self.ingest_unigram__ncid() -# logger.warning("Ingesting bigrams") logger.info("Ingesting metadata") - self.ingest_metadata() logger.info("Creating schemas for load") self.ingest_wordcounts() + for i in range(ngrams): + grams = i + 1 + if i == 0: + continue + logger.info(f"Ingesting {i}grams") + self.ingest_ngram__ncid(grams) self.create_table_schemas() diff --git a/bookwormDB/manager.py b/bookwormDB/manager.py index 40e8c67..8ad32a3 100644 --- a/bookwormDB/manager.py +++ b/bookwormDB/manager.py @@ -81,6 +81,7 @@ def build(self, args): db_path = args.db_directory / args.database corp = BookwormCorpus( db_location = db_path, + ngrams = args.ngrams, **nc_params, cache_set = {"tokenization", "word_counts", "encoded_unigrams", "document_lengths"}) @@ -117,7 +118,7 @@ def run_arguments(): "fuller record, and 'debug' dumps many db queries, etc.", choices=["warning","info","debug"],type=str.lower,default="warning") - parser.add_argument("--ngrams", nargs="+", default=["unigrams"],help="What levels to parse with. Multiple arguments should be unquoted in spaces. This option currently does nothing.") + parser.add_argument("--ngrams", type = int, default = 2, help = "How many ngrams to create count tables for. Maximum 5. 
Large values will dramatically slow creation.") parser.add_argument("--db-directory", help = "" "Directory where duckdb databases live.", default = None, type = Path) diff --git a/tests/test_API.py b/tests/test_API.py index 4c7e4b9..1a728c2 100644 --- a/tests/test_API.py +++ b/tests/test_API.py @@ -19,6 +19,7 @@ def federalist_bookworm(tmpdir_factory): tmpdir = tmpdir_factory.mktemp("tmpdir") corp = BookwormCorpus( f"{path}", + ngrams = 2, texts = Path('tests/test_bookworm_files/input.txt'), metadata = "tests/test_bookworm_files/jsoncatalog.txt", dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) @@ -32,6 +33,7 @@ def unicode_bookworm(tmpdir_factory): tmpdir = tmpdir_factory.mktemp("tmpdir") corp = BookwormCorpus( f"{path}", + ngrams = 1, texts = Path('tests/test_bookworm_files_unicode/input.txt'), metadata = "tests/test_bookworm_files_unicode/jsoncatalog.txt", dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) @@ -65,7 +67,8 @@ def test_groups(self, federalist_bookworm): "search_limits":{}, "counttype":"TextPercent", "groups":["author"], - "method":"data", "format":"json" + "method":"data", + "format":"json" } m = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] @@ -337,7 +340,7 @@ def test_asterisks_in_search_limits(self, federalist_bookworm): "groups":[], "method":"data", "format":"json" } - val2 = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + val2 = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] assert(val1[0]["WordsPerMillion"] == val2[0]["WordsPerMillion"]) \ No newline at end of file diff --git a/tests/test_creation.py b/tests/test_creation.py index df4a836..4162a19 100644 --- a/tests/test_creation.py +++ b/tests/test_creation.py @@ -9,6 +9,7 @@ def test_ascii_creation(self, tmpdir): corp = BookwormCorpus( path, + ngrams = 2, texts = Path('tests/test_bookworm_files/input.txt'), metadata = "tests/test_bookworm_files/jsoncatalog.txt", dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) @@ -22,9 +23,10 @@ def test_unicode_creation(self, tmpdir): if path.exists(): path.unlink() corp = BookwormCorpus( path, - texts = Path('tests/test_bookworm_files_unicode/input.txt'), - metadata = "tests/test_bookworm_files_unicode/jsoncatalog.txt", - dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) + ngrams = 2, + texts = Path('tests/test_bookworm_files_unicode/input.txt'), + metadata = "tests/test_bookworm_files_unicode/jsoncatalog.txt", + dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) corp.build() con = duckdb.connect(str(path)) # There's a 'description_' for each individual item. 
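
The creation tests above open the finished .duckdb file directly; a quick way to eyeball what build() produced is to connect read-only and list the tables. The path below is hypothetical, and the exact table list depends on the --ngrams setting:

import duckdb

con = duckdb.connect("federalist.duckdb", read_only = True)  # hypothetical path to a built bookworm
print(con.execute("SHOW TABLES").fetchall())
# With ngrams = 2 you would expect, among others: catalog, nwords, words, wordsheap,
# unigram__ncid, bigram__ncid, and arrow_schemas.
print(con.execute("SELECT * FROM unigram__ncid LIMIT 3").fetchall())
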
diff --git a/tests/test_sql_construction.py b/tests/test_sql_construction.py index 631e620..0f4cfd9 100644 --- a/tests/test_sql_construction.py +++ b/tests/test_sql_construction.py @@ -21,6 +21,7 @@ def federalist_bookworm(tmpdir_factory): tmpdir = tmpdir_factory.mktemp("tmpdir") corp = BookwormCorpus( f"{path}", + ngrams = 2, texts = Path('tests/test_bookworm_files/input.txt'), metadata = "tests/test_bookworm_files/jsoncatalog.txt", dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) @@ -35,6 +36,7 @@ def unicode_bookworm(tmpdir_factory): corp = BookwormCorpus( f"{path}", + ngrams = 2, texts = Path('tests/test_bookworm_files_unicode/input.txt'), metadata = "tests/test_bookworm_files_unicode/jsoncatalog.txt", dir = tmpdir, cache_set = {"tokenization", "token_counts", "wordids"}) @@ -58,5 +60,5 @@ def test_ne_with_one_entry(self, federalist_bookworm): "method":"data", "format":"json" } - m = json.loads(DuckDBCall(federalist_bookworm, query = query).execute())['data'] + m = json.loads(DuckDBCall(db = federalist_bookworm, query = query).execute())['data'] assert len(m)==4 \ No newline at end of file
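
Finally, a hedged end-to-end sketch of issuing one of the API queries exercised in the tests above against a built database. The import location of DuckDBCall is not shown in this patch, and the tests pass a fixture as db whose exact type is likewise not visible here, so both are assumptions:

import json
import duckdb
from bookwormDB.general_API import DuckDBCall  # assumed import path

con = duckdb.connect("federalist.duckdb", read_only = True)  # hypothetical path; assumes db= accepts a connection

query = {
    "database": "federalist",
    "search_limits": {},
    "counttype": "TextPercent",
    "groups": ["author"],
    "method": "data",
    "format": "json",
}
data = json.loads(DuckDBCall(db = con, query = query).execute())["data"]
print(data[:2])
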