diff --git a/.gitignore b/.gitignore
index 016b59e..1f17760 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,7 @@ pnpm-debug.log*
# jetbrains setting folder
.idea/
+
+# temporary files
+temp.*
+tmp.*
\ No newline at end of file
diff --git a/src/assets/img1.png b/src/assets/img1.png
new file mode 100644
index 0000000..bc334e6
Binary files /dev/null and b/src/assets/img1.png differ
diff --git a/src/pages/index.mdx b/src/pages/index.mdx
index 2a24897..5febfef 100644
--- a/src/pages/index.mdx
+++ b/src/pages/index.mdx
@@ -26,6 +26,7 @@ export const components = {pre: CodeBlock}
import demo_placeholder from "../assets/demo_placeholder_trimed.mp4";
import flowchart12 from "../assets/flowchart12.png";
import table1 from "../assets/table1.png";
+import img1 from "../assets/img1.png";
import multi_turn_examples_5 from "../assets/multi_turn_examples_5.svg";
import multiturn_results_main_4 from "../assets/multiturn_results_main_4.jpg";
import exp1 from "../assets/exp1.png";
@@ -108,7 +109,7 @@ We evaluated 8 commercial models (GPT-4o, GPT-4o mini, Gemini 1.5 Pro, Gemini 1.
-
+
## Benchmark Performance: Multi-turn Evaluations
@@ -116,7 +117,7 @@ We evaluated 8 commercial models (GPT-4o, GPT-4o mini, Gemini 1.5 Pro, Gemini 1.
-
+
We find that all models displayed noticeable improvements in feedback following. The best commercial models achieve improvements of up to 7.1% in visual similarity and 2.7% in IoU-based layout similarity within five rounds of interaction. Question asking, however, appears to be a more challenging task as all models struggled to pose effective questions about the sketches and showed very few improvements with statistical significance.
@@ -124,7 +125,7 @@ We find that all models displayed noticeable improvements in feedback following.
-
+
{/*