From fbc368f736b7694eb1e5c39e6432c6893a6dfe9f Mon Sep 17 00:00:00 2001 From: Austin Chen <akrolsmir@gmail.com> Date: Sun, 22 Nov 2020 11:18:08 -0800 Subject: [PATCH 1/3] MVP of a Streamlit app --- .gitignore | 1 + analytics/streamlit/.gitignore | 1 + analytics/streamlit/README.md | 15 ++++++++ analytics/streamlit/requirements.txt | 2 ++ analytics/streamlit/streamlit_app.py | 51 ++++++++++++++++++++++++++++ 5 files changed, 70 insertions(+) create mode 100644 .gitignore create mode 100644 analytics/streamlit/.gitignore create mode 100644 analytics/streamlit/README.md create mode 100644 analytics/streamlit/requirements.txt create mode 100644 analytics/streamlit/streamlit_app.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..dbe9c82b --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.vscode/ \ No newline at end of file diff --git a/analytics/streamlit/.gitignore b/analytics/streamlit/.gitignore new file mode 100644 index 00000000..e3671126 --- /dev/null +++ b/analytics/streamlit/.gitignore @@ -0,0 +1 @@ +firestore-account-key.json \ No newline at end of file diff --git a/analytics/streamlit/README.md b/analytics/streamlit/README.md new file mode 100644 index 00000000..d0937db5 --- /dev/null +++ b/analytics/streamlit/README.md @@ -0,0 +1,15 @@ +## Developing locally + +1. Install the requirements + + `pip install -r requirements.txt` + +2. Download a service account key from the + [Firebase Console](https://console.firebase.google.com/u/0/project/oneword-cf74a/settings/serviceaccounts/adminsdk) + (or ask Austin for the key) + +3. Paste the key in this folder as `firestore-account-key.json` + +4. Start the analytics app + + `streamlit run streamlit_app.py` diff --git a/analytics/streamlit/requirements.txt b/analytics/streamlit/requirements.txt new file mode 100644 index 00000000..41e9eeff --- /dev/null +++ b/analytics/streamlit/requirements.txt @@ -0,0 +1,2 @@ +streamlit +google-cloud-firestore \ No newline at end of file diff --git a/analytics/streamlit/streamlit_app.py b/analytics/streamlit/streamlit_app.py new file mode 100644 index 00000000..07d2d440 --- /dev/null +++ b/analytics/streamlit/streamlit_app.py @@ -0,0 +1,51 @@ +import streamlit as st +from google.cloud import firestore + +import pandas as pd +import datetime + +# TODO: Show a nice st.error when key is missing +db = firestore.Client.from_service_account_json("firestore-account-key.json") + + +@st.cache +def list_rooms(): + def doc_to_room(doc): + room = doc.to_dict() + room["id"] = doc.id + return room + + rooms_ref = ( + db.collection("rooms") + .order_by("lastUpdateTime", direction="DESCENDING") + .limit(200) + ) + return [doc_to_room(doc) for doc in rooms_ref.stream()] + + +def room_to_df(room): + return pd.DataFrame( + [ + [ + room["id"], + len(room["history"]), + len(room["players"]), # Should dig in history for ALL players + datetime.datetime.fromtimestamp(room["lastUpdateTime"] / 1000), + ] + ] + ) + + +room = list_rooms()[0] +df = room_to_df(room) +for room in list_rooms(): + df2 = room_to_df(room) + df = df.append(df2) + +st.write(df) + + +# Questions to answer: +# How many rounds/games are played daily/weekly/monthly? +# Which words are easist? Hardest? +# Which words are commonly clued? From b1bf5d91926b60a0688934046342c1351472e254 Mon Sep 17 00:00:00 2001 From: marsteralex <bob.masteralex@gmail.com> Date: Sun, 22 Nov 2020 14:56:26 -0800 Subject: [PATCH 2/3] Update streamlit_app.py --- analytics/streamlit/streamlit_app.py | 49 ++++++++++++++++++---------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/analytics/streamlit/streamlit_app.py b/analytics/streamlit/streamlit_app.py index 07d2d440..23173f52 100644 --- a/analytics/streamlit/streamlit_app.py +++ b/analytics/streamlit/streamlit_app.py @@ -17,33 +17,48 @@ def doc_to_room(doc): rooms_ref = ( db.collection("rooms") - .order_by("lastUpdateTime", direction="DESCENDING") - .limit(200) + .order_by("lastUpdateTime", direction="DESCENDING") + .limit(50) ) return [doc_to_room(doc) for doc in rooms_ref.stream()] -def room_to_df(room): - return pd.DataFrame( - [ - [ - room["id"], - len(room["history"]), - len(room["players"]), # Should dig in history for ALL players +def essential_room_info(room): + return [ + room["id"], + len(room["history"]), + len(room["playerData"]), # Should dig in history for ALL players + len(room["players"]), datetime.datetime.fromtimestamp(room["lastUpdateTime"] / 1000), ] - ] - ) -room = list_rooms()[0] -df = room_to_df(room) -for room in list_rooms(): - df2 = room_to_df(room) - df = df.append(df2) -st.write(df) +def filter_rooms(room): + return len(room["history"]) > 0 + +all_rooms = list_rooms() +filtered_rooms = filter(filter_rooms, all_rooms) +all_rooms_df = pd.DataFrame([essential_room_info(room) for room in filtered_rooms]) +all_rooms_df.columns = ["room_name", "num_rounds", "num_players_ever","players_end_of_day", "timestamp"] +st.write("mean number of rounds per room = " + str(all_rooms_df["num_rounds"].mean())) +st.write("median number of rounds per room = " + str(all_rooms_df["num_rounds"].median())) +st.write("mode number of rounds per room = " + str(all_rooms_df["num_rounds"].value_counts().idxmax())) +st.write(filtered_rooms) +st.write(all_rooms_df) +st.line_chart(all_rooms_df) +# st.area_chart(data) +# st.bar_chart(data) +# st.pyplot(fig) +# st.altair_chart(data) +# st.vega_lite_chart(data) +# st.plotly_chart(data) +# st.bokeh_chart(data) +# st.pydeck_chart(data) +# st.deck_gl_chart(data) +# st.graphviz_chart(data) +# st.map(data) # Questions to answer: # How many rounds/games are played daily/weekly/monthly? From 66ebd1feabc2c94146311087fdfd5ae71ad50aea Mon Sep 17 00:00:00 2001 From: marsteralex <bob.masteralex@gmail.com> Date: Mon, 30 Nov 2020 10:06:56 -0800 Subject: [PATCH 3/3] Update streamlit_app.py --- analytics/streamlit/streamlit_app.py | 251 ++++++++++++++++++++++----- 1 file changed, 210 insertions(+), 41 deletions(-) diff --git a/analytics/streamlit/streamlit_app.py b/analytics/streamlit/streamlit_app.py index 23173f52..8183ec37 100644 --- a/analytics/streamlit/streamlit_app.py +++ b/analytics/streamlit/streamlit_app.py @@ -3,13 +3,25 @@ import pandas as pd import datetime +import itertools +import csv # TODO: Show a nice st.error when key is missing db = firestore.Client.from_service_account_json("firestore-account-key.json") +type(db) + +st.write(type(db)) + +global rounds_without_clues +global rounds_without_guesses +global total_rounds +rounds_without_clues = 0 +rounds_without_guesses = 0 +total_rounds = 0 @st.cache -def list_rooms(): +def list_rooms(number_of_rooms): def doc_to_room(doc): room = doc.to_dict() room["id"] = doc.id @@ -18,49 +30,206 @@ def doc_to_room(doc): rooms_ref = ( db.collection("rooms") .order_by("lastUpdateTime", direction="DESCENDING") - .limit(50) + .limit(number_of_rooms) ) - return [doc_to_room(doc) for doc in rooms_ref.stream()] - - -def essential_room_info(room): - return [ - room["id"], - len(room["history"]), - len(room["playerData"]), # Should dig in history for ALL players - len(room["players"]), - datetime.datetime.fromtimestamp(room["lastUpdateTime"] / 1000), - ] - - - -def filter_rooms(room): - return len(room["history"]) > 0 - - -all_rooms = list_rooms() -filtered_rooms = filter(filter_rooms, all_rooms) -all_rooms_df = pd.DataFrame([essential_room_info(room) for room in filtered_rooms]) -all_rooms_df.columns = ["room_name", "num_rounds", "num_players_ever","players_end_of_day", "timestamp"] -st.write("mean number of rounds per room = " + str(all_rooms_df["num_rounds"].mean())) -st.write("median number of rounds per room = " + str(all_rooms_df["num_rounds"].median())) -st.write("mode number of rounds per room = " + str(all_rooms_df["num_rounds"].value_counts().idxmax())) -st.write(filtered_rooms) -st.write(all_rooms_df) -st.line_chart(all_rooms_df) -# st.area_chart(data) -# st.bar_chart(data) -# st.pyplot(fig) -# st.altair_chart(data) -# st.vega_lite_chart(data) -# st.plotly_chart(data) -# st.bokeh_chart(data) -# st.pydeck_chart(data) -# st.deck_gl_chart(data) -# st.graphviz_chart(data) -# st.map(data) + temp = [doc_to_room(doc) for doc in rooms_ref.stream()] + return temp + + +def room_to_essential_info(room): + return [room["id"], + len(room["history"]), + len(room["players"]), # Should dig in history for ALL players + len(room["playerData"]), + datetime.datetime.fromtimestamp(room["lastUpdateTime"] / 1000), ] + + +def room_to_history_list(room): + global rounds_without_clues, rounds_without_guesses, total_rounds + history_list = [] + for history in room["history"]: + total_rounds += 1 + + clues_list = [str(clueTM).lower() for clueTM in + list(history["clues"].values())] # Hack because Austin doesn't sanitize inputs + user_guess = history["guess"] + + if len(clues_list) == 0: + rounds_without_clues += 1 + if should_filter_rounds_without_clues: + continue + if user_guess == "": + rounds_without_guesses += 1 + if should_filter_rounds_without_guesses: + continue + + history_list.append([ + history["category"], + history["word"].lower(), + clues_list, + user_guess.lower(), + history["guess"] == history["word"], len(room["playerData"]), 1 + len(clues_list) + ]) + return history_list + + +def round_to_clue_list(input_round): + clue_list = [] + for c in input_round[2]: # for clue in clues + clue_list.append([ + c, # the clue. + input_round[0], # category + input_round[1], # word + input_round[3], # guess + input_round[4], # success + input_round[6], # active players + + ]) + return clue_list + + +def round_to_pairwise_list(input_round): + pairwise_list = [] + stuff = itertools.combinations(input_round[2], 2) + for pairs in stuff: + sorted_pairs = sorted(pairs) + pairwise_list.append([sorted_pairs[0], sorted_pairs[1] + , input_round[0], # category + input_round[1], # word + input_round[3], # guess + input_round[4], # success + input_round[6], # active players + ]) + return pairwise_list + + +st.title("Simple analytics for One Word") +st.header("Options") +should_filter_empty_room = st.sidebar.checkbox("Filter rooms without games") +should_filter_rounds_without_clues = st.sidebar.checkbox("Filter rounds without clues") +should_filter_rounds_without_guesses = st.sidebar.checkbox("Filter rounds without guesses") +num_rooms = st.sidebar.slider("Number of rooms to query", min_value=1, max_value=2000, value=500) +raw_rooms = list_rooms(num_rooms) +nonempty_rooms = [room for room in raw_rooms if len(room["history"]) > 0] +filtered_rooms = nonempty_rooms if should_filter_empty_room else raw_rooms +st.write(filtered_rooms[0]) +room_id, num_rounds, end_num_players, num_players, date = "id", "number of rounds", "number of players at end", "number of players", "date of game" +df = pd.DataFrame([room_to_essential_info(room) for room in filtered_rooms], + columns=[room_id, num_rounds, end_num_players, num_players, date]) + +st.header("Analysis By Games") +st.subheader("Raw Data") +st.write("Rooms excluded: " + str(len(raw_rooms) - len(nonempty_rooms))) +st.write("Empty room percentage: " + str(len(nonempty_rooms) / (len(raw_rooms) * 1.0))) +st.write(df) + +st.subheader("Daily Breakdown") +df_daily = df.set_index(date).groupby(pd.Grouper(freq="D")).agg( + total_games=(room_id, "count"), + average_rounds_per_game=(num_rounds, "mean"), + mean_end_players_per_game=(end_num_players, "mean"), + mean_players_per_game=(num_players, "mean"), +) +st.write(df_daily) +st.line_chart(df_daily) + +st.subheader("Monthly Breakdown") +df_monthly = df.set_index(date).groupby(pd.Grouper(freq="M")).agg( + total_games=(room_id, "count"), + average_rounds_per_game=(num_rounds, "mean"), + mean_end_players_per_game=(end_num_players, "mean"), + mean_players_per_game=(num_players, "mean"), +) +st.write(df_monthly) +st.line_chart(df_monthly) + +rounds = [] +for room in filtered_rooms: + rounds += room_to_history_list(room) + +category, word, clues, guess, success, players, active_players = "category", "word", "clues", "guess", "success", "total_players", "active_players" +rounds_df = pd.DataFrame(rounds, columns=[category, word, clues, guess, success, players, active_players]) + +st.header("Analysis By Rounds") +st.subheader("Raw Data") +st.write("Rounds excluded without clues: " + str(rounds_without_clues)) +st.write("Rounds excluded without guesses: " + str(rounds_without_guesses)) +st.write("Rounds without clues percentage: " + str(rounds_without_clues / (total_rounds * 1.0))) +st.write("Rounds without guesses percentage: " + str(rounds_without_guesses / (total_rounds * 1.0))) + +st.write(rounds_df) + +st.subheader("By Category") +category_df = rounds_df.set_index(category).groupby(category).agg( + rounds_played=(success, "count"), + success_rate=(success, "mean"), +) +st.write(category_df) + +st.subheader("By Word") +min_rounds_played = st.sidebar.slider("Minimum Rounds Played", min_value=1, max_value=100) +min_success_rate_word = st.sidebar.slider("Minimum Success Rate (Word)", min_value=0.0, max_value=1.0, step=0.01) +word_df = rounds_df.set_index(word).groupby(word).agg( + rounds_played=(success, "count"), + success_rate=(success, "mean"), +) +word_df = word_df[word_df["rounds_played"] >= min_rounds_played] +word_df = word_df[word_df["success_rate"] >= min_success_rate_word] +st.write(word_df) + +clues = [] +for r in rounds: + clues += round_to_clue_list(r) + +st.subheader("By Clue") +min_times_clued = st.sidebar.slider("Minimum Times Clued", min_value=1, max_value=100) +min_active_players = st.sidebar.slider("Minimum Number of Active Players", min_value=0, max_value=100) +max_active_players = st.sidebar.slider("Maximum Number of Active Players", min_value=0, max_value=100, value=100) +clue = "clue" +clues_df = pd.DataFrame(clues, columns=[clue, category, word, guess, success, active_players]) +st.write(clues_df) +clues_df = clues_df[min_active_players <= clues_df[active_players]] +clues_df= clues_df[max_active_players >= clues_df[active_players]] +clues_df_agg = clues_df.groupby([word, clue]).agg( + times_clued=(success, "count"), + success_rate=(success, "mean"), + most_common_guess=(guess, lambda x: x.value_counts().index[0]), + most_common_category=(category, lambda x: x.value_counts().index[0]) +) +clues_df_agg = clues_df_agg[clues_df_agg["times_clued"] >= min_times_clued] +clues_df_agg = clues_df_agg[clues_df_agg["success_rate"] >= min_success_rate_word] +st.write(clues_df_agg) + +pairwise = [] +for r in rounds: + if len(r[2]) > 1: + pairwise +=round_to_pairwise_list(r) + +st.subheader("Pairwise Clues") +clue1="clue1" +clue2="clue2" +clues_df = pd.DataFrame(pairwise, columns=[clue1, clue2, category, word, guess, success, active_players]) +st.write(clues_df) +clues_df = clues_df[min_active_players <= clues_df[active_players]] +clues_df= clues_df[max_active_players >= clues_df[active_players]] +clues_df_agg = clues_df.groupby([word, clue1, clue2]).agg( + times_clued=(success, "count"), + success_rate=(success, "mean"), + clue1=(clue1, lambda x: x.value_counts().index[0]), + clue2=(clue2, lambda x: x.value_counts().index[0]), + most_common_guess=(guess, lambda x: x.value_counts().index[0]), + most_common_category=(category, lambda x: x.value_counts().index[0]) +) +clues_df_agg = clues_df_agg[clues_df_agg["times_clued"] >= min_times_clued] +clues_df_agg = clues_df_agg[clues_df_agg["success_rate"] >= min_success_rate_word] +st.write(clues_df_agg) # Questions to answer: # How many rounds/games are played daily/weekly/monthly? # Which words are easist? Hardest? # Which words are commonly clued? + + +# filter custom + +# does total_players matter or does only active_players matter?