
Commit: add tables and cols
noah-art3mis committed Jul 29, 2024
1 parent 0711af1 commit 1257196
Showing 7 changed files with 42 additions and 38 deletions.
11 changes: 4 additions & 7 deletions README.md
@@ -2,9 +2,9 @@

Lightweight prompt evaluation package.

Uses [ollama](https://github.com/ollama/ollama-python) to run LLMs locally if needed.
Use it online, or run it locally through Streamlit. It can use [ollama](https://github.com/ollama/ollama-python) to run LLMs locally if necessary.

Cost estimation gives very rough estimates (input times 2).
Cost estimation is very rough (input \* 2).
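The "input \* 2" heuristic just assumes the completion is about as long as the prompt. A minimal sketch of that estimate, assuming a tiktoken encoding and an illustrative price table (not the package's actual API):

```python
import tiktoken

# Hypothetical price table; the entry below is illustrative, not a quoted rate.
PRICE_PER_1K_INPUT_TOKENS = {"gpt-4o": 0.005}  # USD

def rough_cost_estimate(prompt: str, model: str = "gpt-4o") -> float:
    enc = tiktoken.encoding_for_model(model)
    input_tokens = len(enc.encode(prompt))
    # Assume the completion is about as long as the prompt, hence "input * 2".
    return (input_tokens * 2 / 1000) * PRICE_PER_1K_INPUT_TOKENS[model]
```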

## How to use

@@ -59,11 +59,8 @@ Cost estimation gives very rough estimates (input times 2).

## TODO

- add 'get report' to runner
- fix: urgent does not call proper model
- turn query into an interface
- save logs in txt
- add [asyncio](https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client)
- add tables
- add tests
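The asyncio item above links to ollama's async client; a minimal sketch of what wiring it in could look like, following the linked README (the model name is illustrative):

```python
import asyncio
from ollama import AsyncClient

async def chat() -> None:
    message = {"role": "user", "content": "Why is the sky blue?"}
    # AsyncClient.chat mirrors the synchronous API but is awaitable.
    response = await AsyncClient().chat(model="llama3", messages=[message])
    print(response["message"]["content"])

asyncio.run(chat())
```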

## Resources

2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -14,6 +14,7 @@ tiktoken = "^0.7.0"
python-dotenv = "^1.0.1"
ollama = "^0.2.1"
streamlit = "^1.37.0"
pandas = "^2.2.2"


[tool.poetry.group.dev.dependencies]
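The new pandas dependency backs the tables added to the Streamlit app below: summary records are turned into DataFrames and rendered with st.data_editor. A small sketch of that pairing, with illustrative field names rather than the package's actual schema:

```python
import pandas as pd
import streamlit as st

# Records like these would come out of a report; the fields are made up.
per_model = [
    {"model": "gpt-4o", "avg_grade": 0.8, "cost_usd": 0.012},
    {"model": "llama3", "avg_grade": 0.7, "cost_usd": 0.0},
]

# st.data_editor renders a DataFrame as an editable table.
st.data_editor(pd.DataFrame(per_model))
```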
Binary file added simulacro_1.jpeg
10 changes: 1 addition & 9 deletions src/crucible/classes/Runner.py
@@ -124,12 +124,4 @@ def calculate_costs(self) -> float:
def _check_exists(self, *args) -> None:
for arg in args:
if arg is None:
raise ValueError(f"No globals provided for {arg}")

def generate_report(self) -> str:
for task in self.tasks:
self.report.add_result(task)

self.report.calculate_total_cost()

return self.report.generate_report()
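An aside on the _check_exists helper shown above: once arg is None, the f-string can only ever report "None", so the error never names the missing global. A sketch of a more informative variant, assuming keyword arguments (this is not the package's code):

```python
def check_exists(**kwargs) -> None:
    # Receiving name/value pairs lets the error name what is missing.
    for name, value in kwargs.items():
        if value is None:
            raise ValueError(f"No globals provided for {name}")

# usage sketch: check_exists(models=models, prompts=prompts)
```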
53 changes: 34 additions & 19 deletions src/crucible/streamlit_app.py
@@ -1,4 +1,6 @@
import pandas as pd
import streamlit as st

from crucible.utils.grading import GradingType
from crucible.classes.Runner import Runner
from crucible.classes.Model import Source
@@ -22,8 +24,7 @@
st.caption("Lightweight prompt evaluation")
st.caption("An AUTOMATON tool")

st.markdown("---")
st.subheader("Configuration")
st.header("Configuration")

with st.expander("Models"):

@@ -69,7 +70,7 @@
prompts.append(Prompt(prompt_id, prompt_slot, prompt_content))


with st.expander("Variable"):
with st.expander("Variables"):
n_var = st.number_input("Number of variables", min_value=1, max_value=20, value=2)
variables = []
for i in range(int(n_var)):
@@ -91,10 +92,11 @@

st.markdown("---")
st.subheader("Other configs")
grading_type = st.selectbox(
"Select grading type", available_gradings, format_func=lambda x: x.name
a1, a2 = st.columns(2)
grading_type = a1.selectbox(
"Select grading type", available_gradings, format_func=lambda x: x.name.lower()
)
temperature = st.slider("Select temperature", 0.0, 1.0, 0.0, 0.2)
temperature = a2.slider("Select temperature", 0.0, 1.0, 0.0, 0.2)


st.markdown("---")
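For context on the Variables expander edited above: the app renders a variable-length form by letting a number_input drive a widget loop. A minimal sketch of that pattern, with illustrative labels and keys, not the app's exact fields:

```python
import streamlit as st

with st.expander("Variables"):
    n_var = st.number_input("Number of variables", min_value=1, max_value=20, value=2)
    variables = []
    for i in range(int(n_var)):
        # Unique keys keep each row's widget state separate across reruns.
        var_id = st.text_input(f"Variable {i} id", key=f"var_id_{i}")
        var_content = st.text_area(f"Variable {i} content", key=f"var_content_{i}")
        variables.append((var_id, var_content))
```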
@@ -110,7 +112,7 @@ def click_button():
st.button("Compile", on_click=click_button)

if st.session_state.compiled:
st.subheader("Summary:")
st.header("Summary:")

runner = Runner(
models=models,
@@ -124,19 +126,20 @@ def click_button():
)
estimated_costs = runner.estimate_all_costs()

st.markdown("- **Models**")
c1, c2, c3 = st.columns(3)
c1.markdown("**Models**")
for model in runner.models:
st.markdown(f"\t- {model.id}")
c1.markdown(f"\t- {model.id}")

st.markdown("- **Prompts**")
c2.markdown("**Prompts**")
for prompt in runner.prompts:
st.markdown(f"\t- {prompt.id}")
c2.markdown(f"\t- {prompt.id}")

st.markdown("- **Variables**")
c3.markdown("**Variables**")
for variable in runner.variables:
st.markdown(f"\t- {variable.id}")
c3.markdown(f"\t- {variable.id}")

st.markdown(f"**Grading Type**: {runner.grading_type}")
st.markdown(f"**Grading Type**: {runner.grading_type.name.lower()}")
st.markdown(f"**Temperature**: {runner.temperature}")
st.markdown(f"**Total cases**: {len(runner.tasks)}")
st.markdown(f"**Estimated costs**: ${estimated_costs:.2f} USD")
@@ -153,9 +156,21 @@ def click_button():
progress_bar.progress(progress, text="Running CRUCIBLE...")

report = Report(runner)
result = report.__dict__
st.subheader("Results:")
st.write(result)

# Uncomment and implement if needed
# st.download_button("Download report", file)
st.header("Results:")
col1, col2 = st.columns(2)
col1.metric("Cost (USD)", report.cost)
col2.metric("Time (seconds)", report.time)

st.subheader("Per model:")
st.data_editor(pd.DataFrame(report.per_model))
st.subheader("Per prompt:")
st.data_editor(pd.DataFrame(report.per_prompt))
st.subheader("Per variable:")
st.data_editor(pd.DataFrame(report.per_variable))

st.subheader("All tasks:")
st.data_editor(pd.DataFrame(report.tasks))

st.subheader("Result(json):")
st.write(report.__dict__)
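The hunk above is the commit's namesake: results move from a raw dict dump into columns, metrics, and editable tables. A condensed sketch of the layout pattern, with made-up numbers and field names:

```python
import pandas as pd
import streamlit as st

# st.columns returns container objects; calling a widget method on one
# places that widget inside it.
col1, col2 = st.columns(2)
col1.metric("Cost (USD)", 0.12)
col2.metric("Time (seconds)", 34)

# Each summary table is a list of records rendered as a DataFrame.
per_model = [{"model": "gpt-4o", "avg_grade": 0.8}]
st.subheader("Per model:")
st.data_editor(pd.DataFrame(per_model))
```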
3 changes: 1 addition & 2 deletions src/crucible/utils/grading.py
@@ -41,11 +41,10 @@ def grade_response(
content=GRADING_PROMPT.replace("{expected}", variable.expected[0]),
)

response = _model.query(
response, cost = _model.query(
prompt=_prompt,
variable=_variable,
temp=0,
danger_mode=True,
api_key=openai_api_key,
)

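A note on the grading change above: grade_response templates the expected answer into a grading prompt and asks a grader model for a verdict, and query now returns a (text, cost) tuple so grading costs are tracked too. A self-contained sketch of that flow, with an injected query callable standing in for the model; the prompt wording and names are illustrative:

```python
GRADING_PROMPT = (
    "Expected: {expected}\n"
    "Answer: {answer}\n"
    "Reply PASS or FAIL."
)

def grade_response(expected: str, answer: str, query) -> tuple[str, float]:
    content = GRADING_PROMPT.replace("{expected}", expected).replace("{answer}", answer)
    verdict, cost = query(content)  # query returns (text, cost), as in the diff
    return verdict.strip(), cost

# usage sketch with a stub grader in place of a real model:
print(grade_response("4", "4", lambda _: ("PASS", 0.0)))
```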
