From e5fbc79936a1442753eb84278aae38ad04ecd661 Mon Sep 17 00:00:00 2001
From: James Kunstle
Date: Wed, 1 Nov 2023 17:38:02 -0400
Subject: [PATCH] step forward- test caching w/ postgres implemented

Signed-off-by: James Kunstle
---
 8Knot/cache_manager/cache_facade.py           | 128 ++++++++++++
 .../visualizations/commits_over_time.py       |  57 ++++-
 8Knot/queries/commits_query.py                | 195 +++++++++++++-----
 3 files changed, 319 insertions(+), 61 deletions(-)
 create mode 100644 8Knot/cache_manager/cache_facade.py

diff --git a/8Knot/cache_manager/cache_facade.py b/8Knot/cache_manager/cache_facade.py
new file mode 100644
index 00000000..672005e9
--- /dev/null
+++ b/8Knot/cache_manager/cache_facade.py
@@ -0,0 +1,128 @@
+import psycopg2 as pg
+from psycopg2.extras import execute_values
+from psycopg2 import sql as pg_sql
+import logging
+from uuid import uuid4
+
+
+def cache_query_results(
+    db_connection_string: str,
+    query: str,
+    vars: tuple[tuple],
+    target_table: str,
+    bookkeeping_data: tuple[dict],
+    server_pagination=2000,
+    client_pagination=2000,
+) -> None:
+    """Runs {query} against the primary database specified by {db_connection_string} with variables {vars}.
+    Streams the results out of the primary db and into the cache in pages of {server_pagination} and {client_pagination} rows.
+
+    Args:
+        db_connection_string (str): connection string for the primary (Augur) database.
+        query (str): parameterized SQL to run against the primary database.
+        vars (tuple(tuple)): parameters bound to {query}, e.g. a 1-tuple holding the tuple of repo ids.
+        target_table (str): cache table that the returned rows are inserted into.
+        bookkeeping_data (tuple(dict)): one {"cache_func": ..., "repo_id": ...} record per repo, written after the rows land.
+        server_pagination (int, optional): rows fetched per page from the primary database. Defaults to 2000.
+        client_pagination (int, optional): rows written to the cache per batch. Defaults to 2000.
+    """
+    logging.warning("TESTING -- CACHE_QUERY_RESULTS BEGIN")
+    with pg.connect(db_connection_string) as augur_conn:
+        with augur_conn.cursor(name=f"{target_table}-{uuid4()}") as augur_cur:
+            # set number of rows we want from primary db at a time
+            augur_cur.itersize = server_pagination
+
+            logging.warning("TESTING -- EXECUTING QUERY")
+            # execute query
+            logging.warning(vars)
+            augur_cur.execute(query, vars)
+
+            logging.warning("TESTING -- STARTING TRANSACTION")
+            # connect to cache
+            with pg.connect(
+                dbname="augur_cache",
+                user="postgres",
+                password="password",
+                host="postgres-cache",
+                port="5432",
+            ) as cache_conn:
+                logging.warning("TESTING -- COMPOSING SQL")
+                # compose SQL w/ table name
+                # ref: https://www.psycopg.org/docs/sql.html
+                composed_query = pg_sql.SQL(
+                    "INSERT INTO {tbl_name} VALUES %s ON CONFLICT DO NOTHING".format(
+                        tbl_name=target_table
+                    )
+                ).as_string(cache_conn)
+
+                # iterate through pages of rows from server.
+                logging.warning("TESTING -- FETCHING AND STORING ROWS")
+                while rows := augur_cur.fetchmany(client_pagination):
+                    if not rows:
+                        # we're out of rows
+                        break
+
+                    # write available rows to cache.
+                    with cache_conn.cursor() as cache_cur:
+                        execute_values(
+                            cur=cache_cur,
+                            sql=composed_query,
+                            argslist=rows,
+                            page_size=client_pagination,
+                        )
+
+                # after all data has successfully been written to cache from the primary db,
+                # insert record of existence for each (cache_func, repo_id) pair.
+                logging.warning("TESTING -- UPDATING BOOKKEEPING")
+                with cache_conn.cursor() as cache_cur:
+                    execute_values(
+                        cur=cache_cur,
+                        sql="""
+                        INSERT INTO cache_bookkeeping (cache_func, repo_id)
+                        VALUES %s
+                        """,
+                        template="(%(cache_func)s, %(repo_id)s)",
+                        argslist=bookkeeping_data,
+                    )
+
+                logging.warning("TESTING -- COMMITTING TRANSACTION")
+                # TODO: end of context block, on success, should commit. On failure, should rollback. Need to write test for this.
+
+    # don't need to commit on primary db
+    logging.warning("TESTING -- SUCCESS")
+
+
+def get_uncached(func_name: str, repolist: list[int]) -> list[int]:  # or None
+    with pg.connect(
+        dbname="augur_cache",
+        user="postgres",
+        password="password",
+        host="postgres-cache",
+        port="5432",
+    ) as cache_conn:
+        with cache_conn.cursor() as cache_cur:
+            composed_query = pg_sql.SQL(
+                """
+                SELECT cb.repo_id
+                FROM cache_bookkeeping cb
+                WHERE cb.cache_func = '{cache_func_name}' AND cb.repo_id in %s
+                """.format(
+                    cache_func_name=func_name
+                )
+            ).as_string(cache_conn)
+
+            # exec query
+            cache_cur.execute(query=composed_query, vars=(tuple(repolist),))
+
+            # get list of cached repos
+            already_cached: list[tuple] = cache_cur.fetchall()
+
+            # process list of single-value tuples to get list of values.
+            # looks like: [(val1,), (val2,), ...]
+            already_cached: set[int] = set([v[0] for v in already_cached])
+
+            # repos that are already cached will be removed from repolist set,
+            # leaving uncached remaining.
+            not_cached: list[int] = list(set(repolist) - already_cached)
+
+            return not_cached
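
Note: get_uncached() and cache_query_results() are meant to be composed by a query task:
the first filters a repo list down to the ids with no cache_bookkeeping record yet, and
the second streams the primary-db rows for those ids into the cache table and then
records them as cached. A minimal sketch of that composition follows; the connection
string, query text, and repo ids are placeholders rather than values taken from this
patch (commits_query.py below passes its full repos list, not only the uncached ids,
to bookkeeping_data).

    import cache_manager.cache_facade as cf

    PRIMARY_DB = "dbname=augur user=augur password=... host=primary-db port=5432"  # placeholder
    REPOS = [1, 2, 3]  # placeholder repo ids
    COMMITS_SQL = "SELECT ... FROM augur_data.commits c WHERE c.repo_id in %s"  # placeholder; the real text is query_string in commits_query.py

    # Only hit the primary database for repos that were never cached before.
    uncached = cf.get_uncached(func_name="commits_query", repolist=REPOS)
    if uncached:
        cf.cache_query_results(
            db_connection_string=PRIMARY_DB,
            query=COMMITS_SQL,
            vars=(tuple(uncached),),
            target_table="commits",
            bookkeeping_data=tuple({"cache_func": "commits_query", "repo_id": r} for r in uncached),
        )
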
diff --git a/8Knot/pages/contributions/visualizations/commits_over_time.py b/8Knot/pages/contributions/visualizations/commits_over_time.py
index 7578bd3a..1c071ca3 100644
--- a/8Knot/pages/contributions/visualizations/commits_over_time.py
+++ b/8Knot/pages/contributions/visualizations/commits_over_time.py
@@ -12,6 +12,8 @@
 from pages.utils.job_utils import nodata_graph
 import io
 import time
+import psycopg2
+import cache_manager.cache_facade as cf
 
 PAGE = "contributions"
 VIZ_ID = "commits-over-time"
@@ -116,16 +118,57 @@ def toggle_popover(n, is_open):
 )
 def commits_over_time_graph(repolist, interval):
     # wait for data to asynchronously download and become available.
-    cache = cm()
-    df = cache.grabm(func=cmq, repos=repolist)
-    while df is None:
-        time.sleep(1.0)
-        df = cache.grabm(func=cmq, repos=repolist)
+    while not_cached := cf.get_uncached(func_name=cmq.__name__, repolist=repolist):
+        logging.warning(f"COMMITS_OVER_TIME_VIZ - WAITING ON DATA TO BECOME AVAILABLE")
+        time.sleep(0.5)
 
     # data ready.
     start = time.perf_counter()
     logging.warning("COMMITS_OVER_TIME_VIZ - START")
 
+    # GET ALL DATA FROM POSTGRES CACHE
+    df = None
+    with psycopg2.connect(
+        dbname="augur_cache",
+        user="postgres",
+        password="password",
+        host="postgres-cache",
+        port="5432",
+    ) as cache_conn:
+        with cache_conn.cursor() as cache_cur:
+            logging.critical(
+                cache_cur.mogrify(
+                    """
+                    SELECT *
+                    FROM commits c
+                    WHERE c.repo_id IN %s;
+                    """,
+                    (tuple(repolist),),
+                )
+            )
+            cache_cur.execute(
+                """
+                SELECT *
+                FROM commits c
+                WHERE c.repo_id IN %s;
+                """,
+                (tuple(repolist),),
+            )
+
+            logging.warning("COMMITS OVER TIME - LOADING DATA FROM CACHE")
+            df = pd.DataFrame(
+                cache_cur.fetchall(),
+                columns=[
+                    "id",
+                    "commits",
+                    "author_email",
+                    "date",
+                    "author_timestamp",
+                    "committer_timestamp",
+                ],
+            )
+            logging.warning(f"COMMITS OVER TIME - DATA LOADED {df.shape}, {df.head()}")
+
     # test if there is data
     if df.empty:
         logging.warning("COMMITS OVER TIME - NO DATA AVAILABLE")
@@ -162,7 +205,9 @@ def process_data(df: pd.DataFrame, interval):
 
     # converts date column to a datetime object, converts to string first to handle period information
     # the period slice is to handle weekly corner case
-    df_created["Date"] = pd.to_datetime(df_created["Date"].astype(str).str[:period_slice])
+    df_created["Date"] = pd.to_datetime(
+        df_created["Date"].astype(str).str[:period_slice]
+    )
 
     return df_created
 
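
Note: both the "INSERT INTO commits ..." statements in cache_facade.py above and the
"SELECT * FROM commits" in this callback assume that the augur_cache database already
contains a commits table whose columns line up with the six-element column list used to
build the DataFrame, plus a cache_bookkeeping(cache_func, repo_id) table. Neither table
is created anywhere in this patch. The sketch below is one plausible shape for that
schema -- the column types are guesses, not taken from the patch -- reusing the same
hard-coded cache credentials that the patch uses.

    import psycopg2

    DDL = """
    CREATE TABLE IF NOT EXISTS commits (
        id BIGINT,
        commits TEXT,
        author_email TEXT,
        date DATE,
        author_timestamp TIMESTAMPTZ,
        committer_timestamp TIMESTAMPTZ
    );
    CREATE TABLE IF NOT EXISTS cache_bookkeeping (
        cache_func TEXT NOT NULL,
        repo_id BIGINT NOT NULL
    );
    """

    with psycopg2.connect(
        dbname="augur_cache", user="postgres", password="password", host="postgres-cache", port="5432"
    ) as conn:
        with conn.cursor() as cur:
            cur.execute(DDL)

Without a unique constraint on these tables, the ON CONFLICT DO NOTHING clauses in the
facade's INSERTs never have a conflict to skip, so repeated caching runs can insert
duplicate rows.
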
diff --git a/8Knot/queries/commits_query.py b/8Knot/queries/commits_query.py
index 7f9b0a86..14dc109d 100644
--- a/8Knot/queries/commits_query.py
+++ b/8Knot/queries/commits_query.py
@@ -6,6 +6,9 @@
 import io
 import datetime as dt
 from sqlalchemy.exc import SQLAlchemyError
+import psycopg2 as pg
+from psycopg2.extras import execute_values
+import cache_manager.cache_facade as cf
 
 # DEBUGGING
 import os
@@ -41,7 +44,7 @@ def commits_query(self, repos):
     # commenting-out unused query components. only need the repo_id and the
     # authorship date for our current queries. remove the '--' to re-add
     # the now-removed values.
-    query_string = f"""
+    query_string = """
                    SELECT
                        distinct
                        r.repo_id AS id,
@@ -56,64 +59,146 @@ def commits_query(self, repos):
                        c.cmt_committer_timestamp AS committer_timestamp
                    FROM
-                        repo r
-                    JOIN commits c
+                        augur_data.repo r
+                    JOIN augur_data.commits c
                        ON r.repo_id = c.repo_id
                    WHERE
-                        c.repo_id in ({str(repos)[1:-1]})
+                        c.repo_id in %s
                    """
 
+    # tuple of dictionaries
+    func_name = commits_query.__name__
+
     try:
-        dbm = AugurManager()
-        engine = dbm.get_engine()
-    except KeyError:
-        # noack, data wasn't successfully set.
-        logging.error(f"{QUERY_NAME}_DATA_QUERY - INCOMPLETE ENVIRONMENT")
-        return False
-    except SQLAlchemyError:
-        logging.error(f"{QUERY_NAME}_DATA_QUERY - COULDN'T CONNECT TO DB")
-        # allow retry via Celery rules.
-        raise SQLAlchemyError("DBConnect failed")
-
-    df = dbm.run_query(query_string)
-
-    # change to compatible type and remove all data that has been incorrectly formated
-    df["author_timestamp"] = pd.to_datetime(df["author_timestamp"], utc=True).dt.date
-    df = df[df.author_timestamp < dt.date.today()]
-
-    # break apart returned data per repo
-    # and temporarily store in List to be
-    # stored in Redis.
-    pic = []
-    for r in repos:
-        # convert series to a dataframe
-        # once we've stored the data by ID we no longer need the column.
-        c_df = pd.DataFrame(df.loc[df["id"] == r].drop(columns=["id"])).reset_index(drop=True)
-
-        # bytes buffer to be written to
-        b = io.BytesIO()
-
-        # write dataframe in feather format to BytesIO buffer
-        bs = c_df.to_feather(b)
-
-        # move head of buffer to the beginning
-        b.seek(0)
-
-        # write the bytes of the buffer into the array
-        bs = b.read()
-        pic.append(bs)
-
-    del df
-
-    # store results in Redis
-    cm_o = cm()
-
-    # 'ack' is a boolean of whether data was set correctly or not.
-    ack = cm_o.setm(
-        func=commits_query,
-        repos=repos,
-        datas=pic,
-    )
+        # STEP 1: Which repos need to be queried for?
+        # some might already be in cache.
+        uncached_repos: list[int] | None = cf.get_uncached(
+            func_name=func_name, repolist=repos
+        )
+        if not uncached_repos:
+            logging.warning(f"{QUERY_NAME}_DATA_QUERY - ALL REQUESTED REPOS IN CACHE")
+            return 0
+        else:
+            logging.warning(
+                f"{QUERY_NAME}_DATA_QUERY - CACHING {len(uncached_repos)} NEW REPOS"
+            )
+            uncached_repos: tuple[tuple] = tuple([tuple(uncached_repos)])
+
+        # STEP 2: Query for those repos
+        logging.warning(f"{QUERY_NAME}_DATA_QUERY - EXECUTING CACHING QUERY")
+        cf.cache_query_results(
+            db_connection_string="dbname=padres user=cali password=!baseball21 host=chaoss.tv port=5432",
+            query=query_string,
+            vars=uncached_repos,
+            target_table="commits",
+            bookkeeping_data=tuple(
+                {"cache_func": func_name, "repo_id": r} for r in repos
+            ),
+        )
+    except Exception as e:
+        logging.critical(f"{QUERY_NAME}_POSTGRES ERROR: {e}")
+        return 1
+
+    # try:
+    #     dbm = AugurManager()
+    #     engine = dbm.get_engine()
+    # except KeyError:
+    #     # noack, data wasn't successfully set.
+    #     logging.error(f"{QUERY_NAME}_DATA_QUERY - INCOMPLETE ENVIRONMENT")
+    #     return False
+    # except SQLAlchemyError:
+    #     logging.error(f"{QUERY_NAME}_DATA_QUERY - COULDN'T CONNECT TO DB")
+    #     # allow retry via Celery rules.
+    #     raise SQLAlchemyError("DBConnect failed")
+
+    # df = dbm.run_query(query_string)
+
+    # # change to compatible type and remove all data that has been incorrectly formated
+    # df["author_timestamp"] = pd.to_datetime(df["author_timestamp"], utc=True).dt.date
+    # df = df[df.author_timestamp < dt.date.today()]
+
+    # # break apart returned data per repo
+    # # and temporarily store in List to be
+    # # stored in Redis.
+    # pic = []
+    # for r in repos:
+    #     # convert series to a dataframe
+    #     # once we've stored the data by ID we no longer need the column.
+    #     c_df = pd.DataFrame(df.loc[df["id"] == r].drop(columns=["id"])).reset_index(
+    #         drop=True
+    #     )
+
+    #     # bytes buffer to be written to
+    #     b = io.BytesIO()
+
+    #     # write dataframe in feather format to BytesIO buffer
+    #     bs = c_df.to_feather(b)
+
+    #     # move head of buffer to the beginning
+    #     b.seek(0)
+
+    #     # write the bytes of the buffer into the array
+    #     bs = b.read()
+    #     pic.append(bs)
+
+    # del df
+
+    # # store results in Redis
+    # cm_o = cm()
+
+    # # 'ack' is a boolean of whether data was set correctly or not.
+    # ack = cm_o.setm(
+    #     func=commits_query,
+    #     repos=repos,
+    #     datas=pic,
+    # )
 
     logging.warning(f"{QUERY_NAME}_DATA_QUERY - END")
-    return ack
+    return 0
+
+
+def _query_write(
+    server_sql, query_args, server_pagination=2000, client_pagination=2000
+):
+    # CONNECT TO PRIMARY DB
+    # WITH AUTOCOMMIT OFF, TRANSACTION COMMITS AT END OF CONTEXT
+    # BLOCK, BUT CONNECTION DOESN'T CLOSE.
+    # IF THIS IS PARALLELIZED, DON'T NEED TO WORRY ABOUT REENTRANCY.
+    with pg.connect(
+        dbname="padres",
+        user="cali",
+        password="!baseball21",
+        host="chaoss.tv",
+        port="5432",
+    ) as augur_conn:
+        with augur_conn.cursor("commits-query") as augur_cur:
+            # CONFIGURE PRIMARY DB AND EXECUTE QUERY
+            augur_cur.itersize = server_pagination
+            if query_args:
+                augur_cur.execute(server_sql, (query_args,))
+            else:
+                augur_cur.execute(server_sql)
+
+            # CONNECT TO CACHE DB, CONFIGURE w/ AUTOCOMMIT
+            with pg.connect(
+                dbname="augur_cache",
+                user="postgres",
+                password="password",
+                host="postgres-cache",
+                port="5432",
+            ) as cache_conn:
+                # ITERATE THROUGH BUFFERED ROWS
+                while rows := augur_cur.fetchmany(client_pagination):
+                    if not rows:
+                        break
+
+                    # WRITE ROWS TO CACHE
+                    with cache_conn.cursor() as cache_cur:
+                        execute_values(
+                            cache_cur,
+                            "INSERT INTO commits VALUES %s ON CONFLICT DO NOTHING",
+                            rows,
+                        )
+
+    augur_conn.close()
+    cache_conn.close()
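
Note: both cache_query_results() and _query_write() rely on the same psycopg2 pattern:
a named cursor against the primary database, so execute() opens a server-side cursor
instead of pulling the whole result set to the client, and fetchmany() then drains it
in fixed-size batches (itersize only sizes the batches when the cursor is iterated
directly). _query_write() is not called anywhere in this diff; it reads as an earlier
inline draft of cache_query_results() with the primary-db credentials hard-coded. A
stand-alone sketch of the streaming pattern, with placeholder connection parameters,
query, and repo ids:

    import psycopg2

    total = 0
    with psycopg2.connect("dbname=example user=example host=localhost port=5432") as conn:
        with conn.cursor(name="commits-stream") as cur:  # named => server-side cursor
            cur.itersize = 2000  # rows per round trip if the cursor is iterated
            cur.execute(
                "SELECT repo_id FROM augur_data.commits WHERE repo_id IN %s",
                (tuple([1, 2, 3]),),  # placeholder repo ids
            )
            # fetchmany() pulls one bounded batch at a time, so client memory stays
            # flat no matter how large the full result set is.
            while rows := cur.fetchmany(2000):
                total += len(rows)
    print(total)
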