From fbc368f736b7694eb1e5c39e6432c6893a6dfe9f Mon Sep 17 00:00:00 2001
From: Austin Chen <akrolsmir@gmail.com>
Date: Sun, 22 Nov 2020 11:18:08 -0800
Subject: [PATCH 1/3] MVP of a Streamlit app

---
 .gitignore                           |  1 +
 analytics/streamlit/.gitignore       |  1 +
 analytics/streamlit/README.md        | 15 ++++++++
 analytics/streamlit/requirements.txt |  2 ++
 analytics/streamlit/streamlit_app.py | 51 ++++++++++++++++++++++++++++
 5 files changed, 70 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 analytics/streamlit/.gitignore
 create mode 100644 analytics/streamlit/README.md
 create mode 100644 analytics/streamlit/requirements.txt
 create mode 100644 analytics/streamlit/streamlit_app.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..dbe9c82b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.vscode/
\ No newline at end of file
diff --git a/analytics/streamlit/.gitignore b/analytics/streamlit/.gitignore
new file mode 100644
index 00000000..e3671126
--- /dev/null
+++ b/analytics/streamlit/.gitignore
@@ -0,0 +1 @@
+firestore-account-key.json
\ No newline at end of file
diff --git a/analytics/streamlit/README.md b/analytics/streamlit/README.md
new file mode 100644
index 00000000..d0937db5
--- /dev/null
+++ b/analytics/streamlit/README.md
@@ -0,0 +1,15 @@
+## Developing locally
+
+1. Install the requirements
+
+   `pip install -r requirements.txt`
+
+2. Download a service account key from the
+   [Firebase Console](https://console.firebase.google.com/u/0/project/oneword-cf74a/settings/serviceaccounts/adminsdk)
+   (or ask Austin for the key)
+
+3. Save the downloaded key file in this folder as `firestore-account-key.json`
+
+4. Start the analytics app
+
+   `streamlit run streamlit_app.py`
diff --git a/analytics/streamlit/requirements.txt b/analytics/streamlit/requirements.txt
new file mode 100644
index 00000000..41e9eeff
--- /dev/null
+++ b/analytics/streamlit/requirements.txt
@@ -0,0 +1,2 @@
+streamlit
+google-cloud-firestore
\ No newline at end of file
diff --git a/analytics/streamlit/streamlit_app.py b/analytics/streamlit/streamlit_app.py
new file mode 100644
index 00000000..07d2d440
--- /dev/null
+++ b/analytics/streamlit/streamlit_app.py
@@ -0,0 +1,51 @@
+import streamlit as st
+from google.cloud import firestore
+
+import pandas as pd
+import datetime
+
+# TODO: Show a nice st.error when key is missing
+db = firestore.Client.from_service_account_json("firestore-account-key.json")
+
+
+@st.cache
+def list_rooms():
+    def doc_to_room(doc):
+        room = doc.to_dict()
+        room["id"] = doc.id
+        return room
+
+    rooms_ref = (
+        db.collection("rooms")
+        .order_by("lastUpdateTime", direction="DESCENDING")
+        .limit(200)
+    )
+    return [doc_to_room(doc) for doc in rooms_ref.stream()]
+
+
+def room_to_df(room):
+    return pd.DataFrame(
+        [
+            [
+                room["id"],
+                len(room["history"]),
+                len(room["players"]),  # Should dig in history for ALL players
+                datetime.datetime.fromtimestamp(room["lastUpdateTime"] / 1000),
+            ]
+        ]
+    )
+
+
+room = list_rooms()[0]
+df = room_to_df(room)
+for room in list_rooms():
+    df2 = room_to_df(room)
+    df = df.append(df2)
+
+st.write(df)
+
+
+# Questions to answer:
+# How many rounds/games are played daily/weekly/monthly?
+# Which words are easist? Hardest?
+# Which words are commonly clued?

From b1bf5d91926b60a0688934046342c1351472e254 Mon Sep 17 00:00:00 2001
From: marsteralex <bob.masteralex@gmail.com>
Date: Sun, 22 Nov 2020 14:56:26 -0800
Subject: [PATCH 2/3] streamlit_app.py: summarize rooms in one DataFrame

---
 analytics/streamlit/streamlit_app.py | 49 ++++++++++++++++++----------
 1 file changed, 32 insertions(+), 17 deletions(-)

diff --git a/analytics/streamlit/streamlit_app.py b/analytics/streamlit/streamlit_app.py
index 07d2d440..23173f52 100644
--- a/analytics/streamlit/streamlit_app.py
+++ b/analytics/streamlit/streamlit_app.py
@@ -17,33 +17,48 @@ def doc_to_room(doc):
 
     rooms_ref = (
         db.collection("rooms")
-        .order_by("lastUpdateTime", direction="DESCENDING")
-        .limit(200)
+            .order_by("lastUpdateTime", direction="DESCENDING")
+            .limit(50)
     )
     return [doc_to_room(doc) for doc in rooms_ref.stream()]
 
 
-def room_to_df(room):
-    return pd.DataFrame(
-        [
-            [
-                room["id"],
-                len(room["history"]),
-                len(room["players"]),  # Should dig in history for ALL players
+def essential_room_info(room):
+    return [
+        room["id"],
+        len(room["history"]),
+        len(room["playerData"]),  # Should dig in history for ALL players
+        len(room["players"]),
                 datetime.datetime.fromtimestamp(room["lastUpdateTime"] / 1000),
             ]
-        ]
-    )
 
 
-room = list_rooms()[0]
-df = room_to_df(room)
-for room in list_rooms():
-    df2 = room_to_df(room)
-    df = df.append(df2)
 
-st.write(df)
+def filter_rooms(room):
+    return len(room["history"]) > 0
+
 
+all_rooms = list_rooms()
+filtered_rooms = filter(filter_rooms, all_rooms)
+all_rooms_df = pd.DataFrame([essential_room_info(room) for room in filtered_rooms])
+all_rooms_df.columns = ["room_name", "num_rounds", "num_players_ever","players_end_of_day", "timestamp"]
+st.write("mean number of rounds per room = " + str(all_rooms_df["num_rounds"].mean()))
+st.write("median number of rounds per room = " + str(all_rooms_df["num_rounds"].median()))
+st.write("mode number of rounds per room = " + str(all_rooms_df["num_rounds"].value_counts().idxmax()))
+st.write(filtered_rooms)
+st.write(all_rooms_df)
+st.line_chart(all_rooms_df)
+# st.area_chart(data)
+# st.bar_chart(data)
+# st.pyplot(fig)
+# st.altair_chart(data)
+# st.vega_lite_chart(data)
+# st.plotly_chart(data)
+# st.bokeh_chart(data)
+# st.pydeck_chart(data)
+# st.deck_gl_chart(data)
+# st.graphviz_chart(data)
+# st.map(data)
 
 # Questions to answer:
 # How many rounds/games are played daily/weekly/monthly?

From 66ebd1feabc2c94146311087fdfd5ae71ad50aea Mon Sep 17 00:00:00 2001
From: marsteralex <bob.masteralex@gmail.com>
Date: Mon, 30 Nov 2020 10:06:56 -0800
Subject: [PATCH 3/3] streamlit_app.py: add game, round, and clue analytics

---
 analytics/streamlit/streamlit_app.py | 251 ++++++++++++++++++++++-----
 1 file changed, 210 insertions(+), 41 deletions(-)

diff --git a/analytics/streamlit/streamlit_app.py b/analytics/streamlit/streamlit_app.py
index 23173f52..8183ec37 100644
--- a/analytics/streamlit/streamlit_app.py
+++ b/analytics/streamlit/streamlit_app.py
@@ -3,13 +3,25 @@
 
 import pandas as pd
 import datetime
+import itertools
+import csv
 
 # TODO: Show a nice st.error when key is missing
 db = firestore.Client.from_service_account_json("firestore-account-key.json")
 
+type(db)
+
+st.write(type(db))
+
+global rounds_without_clues
+global rounds_without_guesses
+global total_rounds
+rounds_without_clues = 0
+rounds_without_guesses = 0
+total_rounds = 0
 
 @st.cache
-def list_rooms():
+def list_rooms(number_of_rooms):
     def doc_to_room(doc):
         room = doc.to_dict()
         room["id"] = doc.id
@@ -18,49 +30,206 @@ def doc_to_room(doc):
     rooms_ref = (
         db.collection("rooms")
             .order_by("lastUpdateTime", direction="DESCENDING")
-            .limit(50)
+            .limit(number_of_rooms)
     )
-    return [doc_to_room(doc) for doc in rooms_ref.stream()]
-
-
-def essential_room_info(room):
-    return [
-        room["id"],
-        len(room["history"]),
-        len(room["playerData"]),  # Should dig in history for ALL players
-        len(room["players"]),
-                datetime.datetime.fromtimestamp(room["lastUpdateTime"] / 1000),
-            ]
-
-
-
-def filter_rooms(room):
-    return len(room["history"]) > 0
-
-
-all_rooms = list_rooms()
-filtered_rooms = filter(filter_rooms, all_rooms)
-all_rooms_df = pd.DataFrame([essential_room_info(room) for room in filtered_rooms])
-all_rooms_df.columns = ["room_name", "num_rounds", "num_players_ever","players_end_of_day", "timestamp"]
-st.write("mean number of rounds per room = " + str(all_rooms_df["num_rounds"].mean()))
-st.write("median number of rounds per room = " + str(all_rooms_df["num_rounds"].median()))
-st.write("mode number of rounds per room = " + str(all_rooms_df["num_rounds"].value_counts().idxmax()))
-st.write(filtered_rooms)
-st.write(all_rooms_df)
-st.line_chart(all_rooms_df)
-# st.area_chart(data)
-# st.bar_chart(data)
-# st.pyplot(fig)
-# st.altair_chart(data)
-# st.vega_lite_chart(data)
-# st.plotly_chart(data)
-# st.bokeh_chart(data)
-# st.pydeck_chart(data)
-# st.deck_gl_chart(data)
-# st.graphviz_chart(data)
-# st.map(data)
+    temp = [doc_to_room(doc) for doc in rooms_ref.stream()]
+    return temp
+
+
+def room_to_essential_info(room):
+    return [room["id"],
+            len(room["history"]),
+            len(room["players"]),  # Should dig in history for ALL players
+            len(room["playerData"]),
+            datetime.datetime.fromtimestamp(room["lastUpdateTime"] / 1000), ]
+
+
+def room_to_history_list(room):
+    global rounds_without_clues, rounds_without_guesses, total_rounds
+    history_list = []
+    for history in room["history"]:
+        total_rounds += 1
+
+        clues_list = [str(clueTM).lower() for clueTM in
+                      list(history["clues"].values())]  # Hack because Austin doesn't sanitize inputs
+        user_guess = history["guess"]
+
+        if len(clues_list) == 0:
+            rounds_without_clues += 1
+            if should_filter_rounds_without_clues:
+                continue
+        if user_guess == "":
+            rounds_without_guesses += 1
+            if should_filter_rounds_without_guesses:
+                continue
+
+        history_list.append([
+            history["category"],
+            history["word"].lower(),
+            clues_list,
+            user_guess.lower(),
+            history["guess"] == history["word"], len(room["playerData"]), 1 + len(clues_list)
+        ])
+    return history_list
+
+
+def round_to_clue_list(input_round):
+    clue_list = []
+    for c in input_round[2]:  # for clue in clues
+        clue_list.append([
+            c,  # the clue.
+            input_round[0],  # category
+            input_round[1],  # word
+            input_round[3],  # guess
+            input_round[4],  # success
+            input_round[6],  # active players
+
+        ])
+    return clue_list
+
+
+def round_to_pairwise_list(input_round):
+    pairwise_list = []
+    stuff = itertools.combinations(input_round[2], 2)
+    for pairs in stuff:
+        sorted_pairs = sorted(pairs)
+        pairwise_list.append([sorted_pairs[0], sorted_pairs[1]
+                                 , input_round[0],  # category
+                              input_round[1],  # word
+                              input_round[3],  # guess
+                              input_round[4],  # success
+                              input_round[6],  # active players
+                              ])
+    return pairwise_list
+
+
+st.title("Simple analytics for One Word")
+st.header("Options")
+should_filter_empty_room = st.sidebar.checkbox("Filter rooms without games")
+should_filter_rounds_without_clues = st.sidebar.checkbox("Filter rounds without clues")
+should_filter_rounds_without_guesses = st.sidebar.checkbox("Filter rounds without guesses")
+num_rooms = st.sidebar.slider("Number of rooms to query", min_value=1, max_value=2000, value=500)
+raw_rooms = list_rooms(num_rooms)
 
+nonempty_rooms = [room for room in raw_rooms if len(room["history"]) > 0]
+filtered_rooms = nonempty_rooms if should_filter_empty_room else raw_rooms
+st.write(filtered_rooms[0])
+room_id, num_rounds, end_num_players, num_players, date = "id", "number of rounds", "number of players at end", "number of players", "date of game"
+df = pd.DataFrame([room_to_essential_info(room) for room in filtered_rooms],
+                  columns=[room_id, num_rounds, end_num_players, num_players, date])
+
+st.header("Analysis By Games")
+st.subheader("Raw Data")
+st.write("Rooms excluded: " + str(len(raw_rooms) - len(nonempty_rooms)))
+st.write("Empty room percentage: " + str(len(nonempty_rooms) / (len(raw_rooms) * 1.0)))
+st.write(df)
+
+st.subheader("Daily Breakdown")
+df_daily = df.set_index(date).groupby(pd.Grouper(freq="D")).agg(
+    total_games=(room_id, "count"),
+    average_rounds_per_game=(num_rounds, "mean"),
+    mean_end_players_per_game=(end_num_players, "mean"),
+    mean_players_per_game=(num_players, "mean"),
+)
+st.write(df_daily)
+st.line_chart(df_daily)
+
+st.subheader("Monthly Breakdown")
+df_monthly = df.set_index(date).groupby(pd.Grouper(freq="M")).agg(
+    total_games=(room_id, "count"),
+    average_rounds_per_game=(num_rounds, "mean"),
+    mean_end_players_per_game=(end_num_players, "mean"),
+    mean_players_per_game=(num_players, "mean"),
+)
+st.write(df_monthly)
+st.line_chart(df_monthly)
+
+rounds = []
+for room in filtered_rooms:
+    rounds += room_to_history_list(room)
+
+category, word, clues, guess, success, players, active_players = "category", "word", "clues", "guess", "success", "total_players", "active_players"
+rounds_df = pd.DataFrame(rounds, columns=[category, word, clues, guess, success, players, active_players])
+
+st.header("Analysis By Rounds")
+st.subheader("Raw Data")
+st.write("Rounds excluded without clues: " + str(rounds_without_clues))
+st.write("Rounds excluded without guesses: " + str(rounds_without_guesses))
+st.write("Rounds without clues percentage: " + str(rounds_without_clues / (total_rounds * 1.0)))
+st.write("Rounds without guesses percentage: " + str(rounds_without_guesses / (total_rounds * 1.0)))
+
+st.write(rounds_df)
+
+st.subheader("By Category")
+category_df = rounds_df.set_index(category).groupby(category).agg(
+    rounds_played=(success, "count"),
+    success_rate=(success, "mean"),
+)
+st.write(category_df)
+
+st.subheader("By Word")
+min_rounds_played = st.sidebar.slider("Minimum Rounds Played", min_value=1, max_value=100)
+min_success_rate_word = st.sidebar.slider("Minimum Success Rate (Word)", min_value=0.0, max_value=1.0, step=0.01)
+word_df = rounds_df.set_index(word).groupby(word).agg(
+    rounds_played=(success, "count"),
+    success_rate=(success, "mean"),
+)
+word_df = word_df[word_df["rounds_played"] >= min_rounds_played]
+word_df = word_df[word_df["success_rate"] >= min_success_rate_word]
+st.write(word_df)
+
+clues = []
+for r in rounds:
+    clues += round_to_clue_list(r)
+
+st.subheader("By Clue")
+min_times_clued = st.sidebar.slider("Minimum Times Clued", min_value=1, max_value=100)
+min_active_players = st.sidebar.slider("Minimum Number of Active Players", min_value=0, max_value=100)
+max_active_players = st.sidebar.slider("Maximum Number of Active Players", min_value=0, max_value=100, value=100)
+clue = "clue"
+clues_df = pd.DataFrame(clues, columns=[clue, category, word, guess, success, active_players])
+st.write(clues_df)
+clues_df = clues_df[min_active_players <= clues_df[active_players]]
+clues_df= clues_df[max_active_players >= clues_df[active_players]]
+clues_df_agg = clues_df.groupby([word, clue]).agg(
+    times_clued=(success, "count"),
+    success_rate=(success, "mean"),
+    most_common_guess=(guess, lambda x: x.value_counts().index[0]),
+    most_common_category=(category, lambda x: x.value_counts().index[0])
+)
+clues_df_agg = clues_df_agg[clues_df_agg["times_clued"] >= min_times_clued]
+clues_df_agg = clues_df_agg[clues_df_agg["success_rate"] >= min_success_rate_word]
+st.write(clues_df_agg)
+
+pairwise = []
+for r in rounds:
+    if len(r[2]) > 1:
+        pairwise +=round_to_pairwise_list(r)
+
+st.subheader("Pairwise Clues")
+clue1="clue1"
+clue2="clue2"
+clues_df = pd.DataFrame(pairwise, columns=[clue1, clue2, category, word, guess, success, active_players])
+st.write(clues_df)
+clues_df = clues_df[min_active_players <= clues_df[active_players]]
+clues_df= clues_df[max_active_players >= clues_df[active_players]]
+clues_df_agg = clues_df.groupby([word, clue1, clue2]).agg(
+    times_clued=(success, "count"),
+    success_rate=(success, "mean"),
+    clue1=(clue1, lambda x: x.value_counts().index[0]),
+    clue2=(clue2, lambda x: x.value_counts().index[0]),
+    most_common_guess=(guess, lambda x: x.value_counts().index[0]),
+    most_common_category=(category, lambda x: x.value_counts().index[0])
+)
+clues_df_agg = clues_df_agg[clues_df_agg["times_clued"] >= min_times_clued]
+clues_df_agg = clues_df_agg[clues_df_agg["success_rate"] >= min_success_rate_word]
+st.write(clues_df_agg)
 # Questions to answer:
 # How many rounds/games are played daily/weekly/monthly?
 # Which words are easist? Hardest?
 # Which words are commonly clued?
+
+
+# filter custom
+
+# Open question: does total_players matter, or only active_players?