From fd9cd48ac666b995150dca643059fe714dc97e13 Mon Sep 17 00:00:00 2001
From: Rico Kahler
Date: Thu, 19 Sep 2024 14:28:55 -0500
Subject: [PATCH 01/21] test: compare local benchmark to `latest` and exit 1

---
 perf/efps/index.ts   | 136 ++++++++++++++++++++++++++++++++++++++-----
 perf/efps/runTest.ts |  24 +++++---
 2 files changed, 137 insertions(+), 23 deletions(-)

diff --git a/perf/efps/index.ts b/perf/efps/index.ts
index c7cb95e7703..2eb215936f4 100644
--- a/perf/efps/index.ts
+++ b/perf/efps/index.ts
@@ -2,6 +2,8 @@
 // eslint-disable-next-line import/no-unassigned-import
 import 'dotenv/config'
 
+import fs from 'node:fs'
+import os from 'node:os'
 import path from 'node:path'
 import process from 'node:process'
 import {fileURLToPath} from 'node:url'
@@ -11,13 +13,13 @@ import chalk from 'chalk'
 import Table from 'cli-table3'
 import Ora from 'ora'
 
-// eslint-disable-next-line import/no-extraneous-dependencies
 import {exec} from './helpers/exec'
 import {runTest} from './runTest'
 import article from './tests/article/article'
 import recipe from './tests/recipe/recipe'
 import singleString from './tests/singleString/singleString'
 import synthetic from './tests/synthetic/synthetic'
+import {type EfpsResult} from './types'
 
 const headless = true
 const tests = [singleString, recipe, article, synthetic]
@@ -63,6 +65,18 @@ await exec({
   cwd: monorepoRoot,
 })
 
+// Prepare the latest version of the 'sanity' package
+const tmpDir = path.join(os.tmpdir(), `sanity-latest-${Date.now()}`)
+await fs.promises.mkdir(tmpDir, {recursive: true})
+spinner.start('')
+await exec({
+  command: 'npm install sanity@latest --no-save',
+  cwd: tmpDir,
+  spinner,
+  text: ['Downloading latest sanity package…', 'Downloaded latest sanity package'],
+})
+const sanityPackagePath = path.join(tmpDir, 'node_modules', 'sanity')
+
 await exec({
   text: ['Ensuring playwright is installed…', 'Playwright is installed'],
   command: 'npx playwright install',
@@ -70,9 +84,15 @@ await exec({
 })
 
 const table = new Table({
-  head: [chalk.bold('benchmark'), 'eFPS p50', 'eFPS p75', 'eFPS p90'].map((cell) =>
-    chalk.cyan(cell),
-  ),
+  head: [
+    chalk.bold('benchmark'),
+    'latest p50',
+    'local p50 (Δ%)',
+    'latest p75',
+    'local p75 (Δ%)',
+    'latest p90',
+    'local p90 (Δ%)',
+  ].map((cell) => chalk.cyan(cell)),
 })
 
 const formatFps = (fps: number) => {
@@ -82,26 +102,108 @@ const formatFps = (fps: number) => {
   return chalk.yellow(rounded)
 }
 
+const formatPercentage = (value: number): string => {
+  const rounded = value.toFixed(1)
+  const sign = value >= 0 ? '+' : ''
+  if (value >= 0) return chalk.green(`${sign}${rounded}%`)
+  return chalk.red(`${rounded}%`)
+}
+
+// Initialize the regression flag
+let hasSignificantRegression = false
+
+interface TestResult {
+  testName: string
+  version: 'local' | 'latest'
+  results: EfpsResult[]
+}
+
+const allResults: TestResult[] = []
+
 for (let i = 0; i < tests.length; i++) {
   const test = tests[i]
-  const results = await runTest({
-    prefix: `Running '${test.name}' [${i + 1}/${tests.length}]…`,
+
+  // Run with local 'sanity' package
+  const localResults = await runTest({
+    prefix: `Running '${test.name}' [${i + 1}/${tests.length}] with local 'sanity'…`,
+    test,
+    resultsDir,
+    spinner,
+    client,
+    headless,
+    projectId,
+  })
+
+  allResults.push({
+    testName: test.name,
+    version: 'local',
+    results: localResults,
+  })
+
+  // Run with latest 'sanity' package
+  const latestResults = await runTest({
+    prefix: `Running '${test.name}' [${i + 1}/${tests.length}] with 'sanity@latest'…`,
     test,
     resultsDir,
    spinner,
     client,
     headless,
     projectId,
+    sanityPackagePath,
+  })
+
+  allResults.push({
+    testName: test.name,
+    version: 'latest',
+    results: latestResults,
   })
+}
 
-  for (const result of results) {
-    table.push({
-      [[chalk.bold(test.name), result.label ? `(${result.label})` : ''].join(' ')]: [
-        formatFps(result.p50),
-        formatFps(result.p75),
-        formatFps(result.p90),
-      ],
-    })
+for (const test of tests) {
+  const localResult = allResults.find((r) => r.testName === test.name && r.version === 'local')
+  const latestResult = allResults.find((r) => r.testName === test.name && r.version === 'latest')
+
+  if (localResult && latestResult) {
+    const localResultsMap = new Map()
+    for (const res of localResult.results) {
+      localResultsMap.set(res.label, res)
+    }
+    const latestResultsMap = new Map()
+    for (const res of latestResult.results) {
+      latestResultsMap.set(res.label, res)
+    }
+
+    for (const [label, latest] of latestResultsMap) {
+      const local = localResultsMap.get(label)
+      if (local) {
+        // Compute percentage differences
+        const p50Diff = ((local.p50 - latest.p50) / latest.p50) * 100
+        const p75Diff = ((local.p75 - latest.p75) / latest.p75) * 100
+        const p90Diff = ((local.p90 - latest.p90) / latest.p90) * 100
+
+        // Check for significant regression
+        // eslint-disable-next-line max-depth
+        if (p50Diff < -50 || p75Diff < -50 || p90Diff < -50) {
+          hasSignificantRegression = true
+        }
+
+        const rowLabel = [chalk.bold(test.name), label ? `(${label})` : ''].join(' ')
+
+        table.push([
+          rowLabel,
+          formatFps(latest.p50),
+          `${formatFps(local.p50)} (${formatPercentage(p50Diff)})`,
+          formatFps(latest.p75),
+          `${formatFps(local.p75)} (${formatPercentage(p75Diff)})`,
+          formatFps(latest.p90),
+          `${formatFps(local.p90)} (${formatPercentage(p90Diff)})`,
+        ])
+      } else {
+        spinner.fail(`Missing local result for test '${test.name}', label '${label}'`)
+      }
+    }
+  } else {
+    spinner.fail(`Missing results for test '${test.name}'`)
   }
 }
 
@@ -113,3 +215,9 @@ console.log(`
 │ The number of renders ("frames") that is assumed to be possible
 │ within a second. Derived from input latency. ${chalk.green('Higher')} is better.
 `)
+
+// Exit with code 1 if regression detected
+if (hasSignificantRegression) {
+  console.error(chalk.red('Performance regression detected exceeding 50% threshold.'))
+  process.exit(1)
+}
diff --git a/perf/efps/runTest.ts b/perf/efps/runTest.ts
index 201487686df..0bf11f3b826 100644
--- a/perf/efps/runTest.ts
+++ b/perf/efps/runTest.ts
@@ -24,6 +24,7 @@ interface RunTestOptions {
   projectId: string
   headless: boolean
   client: SanityClient
+  sanityPackagePath?: string // Add this line
 }
 
 export async function runTest({
@@ -34,6 +35,7 @@ export async function runTest({
   projectId,
   headless,
   client,
+  sanityPackagePath,
 }: RunTestOptions): Promise<EfpsResult[]> {
   const log = (text: string) => {
     spinner.text = `${prefix}\n  └ ${text}`
   }
 
@@ -41,19 +43,23 @@ export async function runTest({
   spinner.start(prefix)
 
-  const outDir = path.join(workspaceDir, 'builds', test.name)
-  const testResultsDir = path.join(resultsDir, test.name)
+  const versionLabel = sanityPackagePath ? 'latest' : 'local'
+  const outDir = path.join(workspaceDir, 'builds', test.name, versionLabel)
+  const testResultsDir = path.join(resultsDir, test.name, versionLabel)
 
   await fs.promises.mkdir(outDir, {recursive: true})
   log('Building…')
 
+  const alias: Record<string, string> = {'#config': fileURLToPath(test.configPath!)}
+  if (sanityPackagePath) {
+    alias.sanity = sanityPackagePath
+  }
+
   await vite.build({
     appType: 'spa',
     build: {outDir, sourcemap: true},
     plugins: [{...sourcemaps(), enforce: 'pre'}, react()],
-    resolve: {
-      alias: {'#config': fileURLToPath(test.configPath!)},
-    },
+    resolve: {alias},
     logLevel: 'silent',
   })
 
@@ -107,9 +113,9 @@ export async function runTest({
 
   log('Loading editor…')
   await page.goto(
-    `http://localhost:3300/intent/edit/id=${encodeURIComponent(document._id)};type=${encodeURIComponent(
-      documentToCreate._type,
-    )}`,
+    `http://localhost:3300/intent/edit/id=${encodeURIComponent(
+      document._id,
+    )};type=${encodeURIComponent(documentToCreate._type)}`,
   )
 
   await cdp.send('Profiler.enable')
@@ -138,7 +144,7 @@ export async function runTest({
     JSON.stringify(remappedProfile),
   )
 
-  spinner.succeed(`Ran benchmark '${test.name}'`)
+  spinner.succeed(`Ran benchmark '${test.name}' (${versionLabel})`)
 
   return results
 } finally {
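The gate above fails the process when any local percentile drops more than 50% against the published `latest` baseline. A minimal sketch of that comparison in isolation — `PercentileResult` and the sample numbers are illustrative stand-ins, not the real `EfpsResult` type or measured data:

```ts
interface PercentileResult {
  p50: number
  p75: number
  p90: number
}

// Positive means the local build performs better than the baseline
function percentDiff(local: number, latest: number): number {
  return ((local - latest) / latest) * 100
}

function hasRegression(local: PercentileResult, latest: PercentileResult): boolean {
  return (
    percentDiff(local.p50, latest.p50) < -50 ||
    percentDiff(local.p75, latest.p75) < -50 ||
    percentDiff(local.p90, latest.p90) < -50
  )
}

// A local p50 of 25 eFPS against a baseline of 60 eFPS is a -58.3% diff → regression
console.log(hasRegression({p50: 25, p75: 30, p90: 40}, {p50: 60, p75: 60, p90: 60})) // true
```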
From a7a13ddfe8f9ee93d063d00b5425883a173cd1d5 Mon Sep 17 00:00:00 2001
From: Rico Kahler
Date: Thu, 19 Sep 2024 15:31:05 -0500
Subject: [PATCH 02/21] feat: add markdown table

---
 perf/efps/index.ts | 48 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/perf/efps/index.ts b/perf/efps/index.ts
index 2eb215936f4..84ca0debd49 100644
--- a/perf/efps/index.ts
+++ b/perf/efps/index.ts
@@ -95,6 +95,8 @@ const table = new Table({
   ].map((cell) => chalk.cyan(cell)),
 })
 
+const markdownLines: string[] = []
+
 const formatFps = (fps: number) => {
   const rounded = fps.toFixed(1)
   if (fps >= 60) return chalk.green(rounded)
@@ -109,6 +111,18 @@ const formatPercentage = (value: number): string => {
   return chalk.red(`${rounded}%`)
 }
 
+// For markdown formatting without colors
+const formatFpsPlain = (fps: number) => {
+  const rounded = fps.toFixed(1)
+  return rounded
+}
+
+const formatPercentagePlain = (value: number): string => {
+  const rounded = value.toFixed(1)
+  const sign = value >= 0 ? '+' : ''
+  return `${sign}${rounded}%`
+}
+
 // Initialize the regression flag
 let hasSignificantRegression = false
 
@@ -159,6 +173,15 @@ for (let i = 0; i < tests.length; i++) {
   })
 }
 
+// Prepare markdown header
+markdownLines.push('# Benchmark Results\n')
+markdownLines.push(
+  '| Benchmark | Latest p50 | Local p50 (Δ%) | Latest p75 | Local p75 (Δ%) | Latest p90 | Local p90 (Δ%) |',
+)
+markdownLines.push(
+  '|-----------|------------|----------------|------------|----------------|------------|----------------|',
+)
+
 for (const test of tests) {
   const localResult = allResults.find((r) => r.testName === test.name && r.version === 'local')
   const latestResult = allResults.find((r) => r.testName === test.name && r.version === 'latest')
@@ -198,6 +221,18 @@ for (const test of tests) {
           formatFps(latest.p90),
           `${formatFps(local.p90)} (${formatPercentage(p90Diff)})`,
         ])
+
+        // Add to markdown table
+        const markdownRow = [
+          [test.name, label ? `(${label})` : ''].join(' '),
+          formatFpsPlain(latest.p50),
+          `${formatFpsPlain(local.p50)} (${formatPercentagePlain(p50Diff)})`,
+          formatFpsPlain(latest.p75),
+          `${formatFpsPlain(local.p75)} (${formatPercentagePlain(p75Diff)})`,
+          formatFpsPlain(latest.p90),
+          `${formatFpsPlain(local.p90)} (${formatPercentagePlain(p90Diff)})`,
+        ]
+        markdownLines.push(`| ${markdownRow.join(' | ')} |`)
       } else {
         spinner.fail(`Missing local result for test '${test.name}', label '${label}'`)
       }
@@ -216,6 +251,19 @@ for (const test of tests) {
 │ within a second. Derived from input latency. ${chalk.green('Higher')} is better.
 `)
 
+// Add explanation to markdown
+markdownLines.push('\n')
+markdownLines.push('**eFPS — editor "Frames Per Second"**')
+markdownLines.push('\n')
+markdownLines.push(
+  'The number of renders ("frames") that is assumed to be possible within a second. Derived from input latency. **Higher** is better.',
+)
+markdownLines.push('\n')
+
+// Write markdown file
+const markdownOutputPath = path.join(resultsDir, 'benchmark-results.md')
+await fs.promises.writeFile(markdownOutputPath, markdownLines.join('\n'))
+
 // Exit with code 1 if regression detected
 if (hasSignificantRegression) {
   console.error(chalk.red('Performance regression detected exceeding 50% threshold.'))
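Each benchmark contributes one pipe-delimited row to `markdownLines`, which is then joined with newlines into a GitHub-flavored table. A small sketch of how those rows assemble (the benchmark name and numbers are made up):

```ts
const markdownLines: string[] = []
markdownLines.push('| Benchmark | Latest p50 | Local p50 (Δ%) |')
markdownLines.push('|-----------|------------|----------------|')

// One row per benchmark result, joined with ' | ' separators
const markdownRow = ['article', '59.9', '58.2 (-2.8%)']
markdownLines.push(`| ${markdownRow.join(' | ')} |`)

console.log(markdownLines.join('\n'))
// | Benchmark | Latest p50 | Local p50 (Δ%) |
// |-----------|------------|----------------|
// | article | 59.9 | 58.2 (-2.8%) |
```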
From e3fc9faa4fe312513a7930b41981f01a17fc9f19 Mon Sep 17 00:00:00 2001
From: Rico Kahler
Date: Fri, 20 Sep 2024 16:04:57 -0500
Subject: [PATCH 03/21] feat: clean up table output

---
 perf/efps/index.ts | 110 ++++++++++++++++++++++++++-------------------
 1 file changed, 65 insertions(+), 45 deletions(-)

diff --git a/perf/efps/index.ts b/perf/efps/index.ts
index 84ca0debd49..13b45355029 100644
--- a/perf/efps/index.ts
+++ b/perf/efps/index.ts
@@ -1,3 +1,4 @@
+/* eslint-disable max-depth */
 /* eslint-disable no-console */
 // eslint-disable-next-line import/no-unassigned-import
 import 'dotenv/config'
@@ -84,18 +85,12 @@ await exec({
 })
 
 const table = new Table({
-  head: [
-    chalk.bold('benchmark'),
-    'latest p50',
-    'local p50 (Δ%)',
-    'latest p75',
-    'local p75 (Δ%)',
-    'latest p90',
-    'local p90 (Δ%)',
-  ].map((cell) => chalk.cyan(cell)),
+  head: [chalk.bold('benchmark'), 'Passed?', 'p50 eFPS (Δ%)', 'p75 eFPS (Δ%)', 'p90 eFPS (Δ%)'].map(
+    (cell) => chalk.cyan(cell),
+  ),
 })
 
-const markdownLines: string[] = []
+const markdownRows: string[] = []
 
 const formatFps = (fps: number) => {
   const rounded = fps.toFixed(1)
@@ -107,8 +102,8 @@ const formatFps = (fps: number) => {
 const formatPercentage = (value: number): string => {
   const rounded = value.toFixed(1)
   const sign = value >= 0 ? '+' : ''
-  if (value >= 0) return chalk.green(`${sign}${rounded}%`)
-  return chalk.red(`${rounded}%`)
+  if (value > -50) return `${sign}${rounded}%`
+  return chalk.red(`${sign}${rounded}%`)
 }
 
 // For markdown formatting without colors
@@ -123,8 +118,27 @@ const formatPercentagePlain = (value: number): string => {
   return `${sign}${rounded}%`
 }
 
+function getStatus(
+  p50Diff: number,
+  p75Diff: number,
+  p90Diff: number,
+): 'error' | 'warning' | 'passed' {
+  if (p50Diff < -50 || p75Diff < -50 || p90Diff < -50) {
+    return 'error'
+  } else if (p50Diff < -20 || p75Diff < -20 || p90Diff < -20) {
+    return 'warning'
+  }
+  return 'passed'
+}
+
+function getStatusEmoji(status: 'error' | 'warning' | 'passed'): string {
+  if (status === 'error') return '🔴'
+  if (status === 'warning') return '⚠️'
+  return '✅'
+}
+
-// Initialize the regression flag
-let hasSignificantRegression = false
+// Initialize the overall status
+let overallStatus: 'error' | 'warning' | 'passed' = 'passed'
 
 interface TestResult {
   testName: string
@@ -173,15 +187,6 @@ for (let i = 0; i < tests.length; i++) {
   })
 }
 
-// Prepare markdown header
-markdownLines.push('# Benchmark Results\n')
-markdownLines.push(
-  '| Benchmark | Latest p50 | Local p50 (Δ%) | Latest p75 | Local p75 (Δ%) | Latest p90 | Local p90 (Δ%) |',
-)
-markdownLines.push(
-  '|-----------|------------|----------------|------------|----------------|------------|----------------|',
-)
-
 for (const test of tests) {
   const localResult = allResults.find((r) => r.testName === test.name && r.version === 'local')
   const latestResult = allResults.find((r) => r.testName === test.name && r.version === 'latest')
@@ -204,35 +209,35 @@ for (const test of tests) {
         const p75Diff = ((local.p75 - latest.p75) / latest.p75) * 100
         const p90Diff = ((local.p90 - latest.p90) / latest.p90) * 100
 
-        // Check for significant regression
-        // eslint-disable-next-line max-depth
-        if (p50Diff < -50 || p75Diff < -50 || p90Diff < -50) {
-          hasSignificantRegression = true
+        // Determine test status
+        const testStatus = getStatus(p50Diff, p75Diff, p90Diff)
+
+        // Update overall status
+        if (testStatus === 'error') {
+          overallStatus = 'error'
+        } else if (testStatus === 'warning' && overallStatus === 'passed') {
+          overallStatus = 'warning'
         }
 
         const rowLabel = [chalk.bold(test.name), label ? `(${label})` : ''].join(' ')
 
         table.push([
           rowLabel,
-          formatFps(latest.p50),
+          getStatusEmoji(testStatus),
           `${formatFps(local.p50)} (${formatPercentage(p50Diff)})`,
-          formatFps(latest.p75),
           `${formatFps(local.p75)} (${formatPercentage(p75Diff)})`,
-          formatFps(latest.p90),
           `${formatFps(local.p90)} (${formatPercentage(p90Diff)})`,
         ])
 
-        // Add to markdown table
+        // Add to markdown rows
         const markdownRow = [
           [test.name, label ? `(${label})` : ''].join(' '),
-          formatFpsPlain(latest.p50),
+          getStatusEmoji(testStatus),
           `${formatFpsPlain(local.p50)} (${formatPercentagePlain(p50Diff)})`,
-          formatFpsPlain(latest.p75),
           `${formatFpsPlain(local.p75)} (${formatPercentagePlain(p75Diff)})`,
-          formatFpsPlain(latest.p90),
           `${formatFpsPlain(local.p90)} (${formatPercentagePlain(p90Diff)})`,
         ]
-        markdownLines.push(`| ${markdownRow.join(' | ')} |`)
+        markdownRows.push(`| ${markdownRow.join(' | ')} |`)
       } else {
         spinner.fail(`Missing local result for test '${test.name}', label '${label}'`)
       }
@@ -251,21 +256,36 @@ console.log(`
 │ within a second. Derived from input latency. ${chalk.green('Higher')} is better.
 `)
 
-// Add explanation to markdown
-markdownLines.push('\n')
-markdownLines.push('**eFPS — editor "Frames Per Second"**')
-markdownLines.push('\n')
-markdownLines.push(
-  'The number of renders ("frames") that is assumed to be possible within a second. Derived from input latency. **Higher** is better.',
-)
-markdownLines.push('\n')
+// Map overallStatus to status text
+const statusText =
+  // eslint-disable-next-line no-nested-ternary
+  overallStatus === 'error' ? 'Error' : overallStatus === 'warning' ? 'Warning' : 'Passed'
+const statusEmoji = getStatusEmoji(overallStatus)
+
+// Build the markdown content
+const markdownContent = [
+  '# Benchmark Results',
+  '',
+  `<details>`,
+  `<summary>${statusEmoji} Performance Benchmark Results — Status: **${statusText}**</summary>`,
+  '',
+  '| Benchmark | Passed? | p50 eFPS (Δ%) | p75 eFPS (Δ%) | p90 eFPS (Δ%) |',
+  '|-----------|---------|---------------|---------------|---------------|',
+  ...markdownRows,
+  '</details>',
+  '',
+  '> **eFPS — editor "Frames Per Second"**',
+  '> ',
+  '> The number of renders ("frames") that is assumed to be possible within a second. Derived from input latency. **Higher** is better.',
+  '',
+].join('\n')
 
 // Write markdown file
 const markdownOutputPath = path.join(resultsDir, 'benchmark-results.md')
-await fs.promises.writeFile(markdownOutputPath, markdownLines.join('\n'))
+await fs.promises.writeFile(markdownOutputPath, markdownContent)
 
 // Exit with code 1 if regression detected
-if (hasSignificantRegression) {
+if (overallStatus === 'error') {
   console.error(chalk.red('Performance regression detected exceeding 50% threshold.'))
   process.exit(1)
 }
From 8c87f8ef459645d051ee21b2632551a3ed2db380 Mon Sep 17 00:00:00 2001
From: Rico Kahler
Date: Fri, 20 Sep 2024 16:23:49 -0500
Subject: [PATCH 04/21] refactor: remove warning

---
 perf/efps/index.ts | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/perf/efps/index.ts b/perf/efps/index.ts
index 13b45355029..87fefb201af 100644
--- a/perf/efps/index.ts
+++ b/perf/efps/index.ts
@@ -118,27 +118,20 @@ const formatPercentagePlain = (value: number): string => {
   return `${sign}${rounded}%`
 }
 
-function getStatus(
-  p50Diff: number,
-  p75Diff: number,
-  p90Diff: number,
-): 'error' | 'warning' | 'passed' {
+function getStatus(p50Diff: number, p75Diff: number, p90Diff: number): 'error' | 'passed' {
   if (p50Diff < -50 || p75Diff < -50 || p90Diff < -50) {
     return 'error'
-  } else if (p50Diff < -20 || p75Diff < -20 || p90Diff < -20) {
-    return 'warning'
   }
   return 'passed'
 }
 
-function getStatusEmoji(status: 'error' | 'warning' | 'passed'): string {
+function getStatusEmoji(status: 'error' | 'passed'): string {
   if (status === 'error') return '🔴'
-  if (status === 'warning') return '⚠️'
   return '✅'
 }
 
 // Initialize the overall status
-let overallStatus: 'error' | 'warning' | 'passed' = 'passed'
+let overallStatus: 'error' | 'passed' = 'passed'
 
@@ -208,8 +201,6 @@ for (const test of tests) {
         // Update overall status
         if (testStatus === 'error') {
           overallStatus = 'error'
-        } else if (testStatus === 'warning' && overallStatus === 'passed') {
-          overallStatus = 'warning'
         }
 
         const rowLabel = [chalk.bold(test.name), label ? `(${label})` : ''].join(' ')
@@ -248,9 +241,7 @@ console.log(`
 `)
 
 // Map overallStatus to status text
-const statusText =
-  // eslint-disable-next-line no-nested-ternary
-  overallStatus === 'error' ? 'Error' : overallStatus === 'warning' ? 'Warning' : 'Passed'
+const statusText = overallStatus === 'error' ? 'Error' : 'Passed'
 const statusEmoji = getStatusEmoji(overallStatus)
 
 // Build the markdown content
@@ -269,8 +260,8 @@ const markdownContent = [
   '',
 ].join('\n')
 
-// Write markdown file
-const markdownOutputPath = path.join(resultsDir, 'benchmark-results.md')
+// Write markdown file to root of results
+const markdownOutputPath = path.join(workspaceDir, 'results', 'benchmark-results.md')
 await fs.promises.writeFile(markdownOutputPath, markdownContent)
 
 // Exit with code 1 if regression detected
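The `<details>`/`<summary>` pair in the generated report renders as a collapsible block on GitHub, keeping the PR comment compact until expanded. A sketch of the assembled shape, with placeholder values standing in for the computed status:

```ts
const statusEmoji = '✅' // placeholder for getStatusEmoji(overallStatus)
const statusText = 'Passed' // placeholder for the mapped overallStatus

const markdownContent = [
  '<details>',
  `<summary>${statusEmoji} Performance Benchmark Results — Status: **${statusText}**</summary>`,
  '',
  '| Benchmark | Passed? |',
  '|-----------|---------|',
  '| article | ✅ |',
  '</details>',
].join('\n')

console.log(markdownContent)
```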
From f3ddff07b05e93869cb4c6587789bf06200fc8de Mon Sep 17 00:00:00 2001
From: Binoy Patel
Date: Thu, 19 Sep 2024 09:37:40 -0400
Subject: [PATCH 05/21] chore: add efps github actions

---
 .github/workflows/efps.yml | 67 ++++++++++++++++++++++++++++++++++++++
 package.json               |  1 +
 2 files changed, 68 insertions(+)
 create mode 100644 .github/workflows/efps.yml

diff --git a/.github/workflows/efps.yml b/.github/workflows/efps.yml
new file mode 100644
index 00000000000..ea93fc3218f
--- /dev/null
+++ b/.github/workflows/efps.yml
@@ -0,0 +1,67 @@
+name: eFPS Test
+on:
+  pull_request:
+jobs:
+  install:
+    timeout-minutes: 30
+    runs-on: ubuntu-latest
+    env:
+      TURBO_TOKEN: ${{ secrets.TURBO_TOKEN }}
+      TURBO_TEAM: ${{ vars.TURBO_TEAM }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-node@v4
+        with:
+          node-version: 18
+
+      - uses: pnpm/action-setup@v4
+        name: Install pnpm
+        id: pnpm-install
+        with:
+          run_install: false
+
+      - name: Get pnpm store directory
+        id: pnpm-cache
+        shell: bash
+        run: |
+          echo "STORE_PATH=$(pnpm store path)" >> $GITHUB_OUTPUT
+
+      - name: Cache node modules
+        id: cache-node-modules
+        uses: actions/cache@v4
+        env:
+          cache-name: cache-node-modules
+        with:
+          path: ${{ steps.pnpm-cache.outputs.STORE_PATH }}
+          key: ${{ runner.os }}-pnpm-store-${{ env.cache-name }}-${{ hashFiles('**/pnpm-lock.yaml') }}
+          restore-keys: |
+            v1-${{ runner.os }}-pnpm-store-${{ env.cache-name }}-
+            v1-${{ runner.os }}-pnpm-store-
+            v1-${{ runner.os }}-
+
+      - name: Install project dependencies
+        run: pnpm install
+
+      - name: Store Playwright's Version
+        run: |
+          PLAYWRIGHT_VERSION=$(npx playwright --version | sed 's/Version //')
+          echo "Playwright's Version: $PLAYWRIGHT_VERSION"
+          echo "PLAYWRIGHT_VERSION=$PLAYWRIGHT_VERSION" >> $GITHUB_ENV
+
+      - name: Cache Playwright Browsers for Playwright's Version
+        id: cache-playwright-browsers
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/ms-playwright
+          key: playwright-browsers-${{ env.PLAYWRIGHT_VERSION }}
+
+      - name: Install Playwright Browsers
+        if: steps.cache-playwright-browsers.outputs.cache-hit != 'true'
+        run: npx playwright install --with-deps
+
+      - name: Run eFPS tests
+        env:
+          VITE_PERF_EFPS_PROJECT_ID: ${{ secrets.PERF_EFPS_PROJECT_ID }}
+          VITE_PERF_EFPS_DATASET: ${{ secrets.PERF_EFPS_DATASET }}
+          PERF_EFPS_SANITY_TOKEN: ${{ secrets.SANITY_E2E_SESSION_TOKEN_NEW }}
+        run: pnpm efps:test
diff --git a/package.json b/package.json
index 90b1b6822ee..d6622afb2da 100644
--- a/package.json
+++ b/package.json
@@ -54,6 +54,7 @@
     "e2e:setup": "node -r dotenv-flow/config -r esbuild-register scripts/e2e/setup",
     "e2e:start": "pnpm --filter studio-e2e-testing preview",
     "etl": "node -r dotenv-flow/config -r esbuild-register scripts/etl",
+    "efps:test": "cd perf/efps && pnpm test",
     "example:blog-studio": "cd examples/blog-studio && pnpm start",
     "example:clean-studio": "cd examples/blog-studio && pnpm start",
     "example:ecommerce-studio": "cd examples/blog-studio && pnpm start",
From 97136baeae566978fc619d7d6700ee8bdedf1a08 Mon Sep 17 00:00:00 2001
From: Binoy Patel
Date: Thu, 19 Sep 2024 09:54:37 -0400
Subject: [PATCH 06/21] chore: use different token for efps tests

---
 .github/workflows/efps.yml | 2 +-
 perf/efps/index.ts         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/efps.yml b/.github/workflows/efps.yml
index ea93fc3218f..80f15bc8573 100644
--- a/.github/workflows/efps.yml
+++ b/.github/workflows/efps.yml
@@ -63,5 +63,5 @@ jobs:
         env:
           VITE_PERF_EFPS_PROJECT_ID: ${{ secrets.PERF_EFPS_PROJECT_ID }}
           VITE_PERF_EFPS_DATASET: ${{ secrets.PERF_EFPS_DATASET }}
-          PERF_EFPS_SANITY_TOKEN: ${{ secrets.SANITY_E2E_SESSION_TOKEN_NEW }}
+          PERF_EFPS_SANITY_TOKEN: ${{ secrets.PERF_EFPS_SANITY_TOKEN }}
         run: pnpm efps:test
diff --git a/perf/efps/index.ts b/perf/efps/index.ts
index 87fefb201af..81f382e8b91 100644
--- a/perf/efps/index.ts
+++ b/perf/efps/index.ts
@@ -80,7 +80,7 @@ const sanityPackagePath = path.join(tmpDir, 'node_modules', 'sanity')
 
 await exec({
   text: ['Ensuring playwright is installed…', 'Playwright is installed'],
-  command: 'npx playwright install',
+  command: 'npx playwright install --with-deps',
   spinner,
 })
From 7cf935bee69f188a0469e7128324833dc71ae5c3 Mon Sep 17 00:00:00 2001
From: Binoy Patel
Date: Thu, 19 Sep 2024 22:20:21 -0400
Subject: [PATCH 07/21] test: add comment with perf report result

---
 .github/workflows/efps.yml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/.github/workflows/efps.yml b/.github/workflows/efps.yml
index 80f15bc8573..dbe3c389a63 100644
--- a/.github/workflows/efps.yml
+++ b/.github/workflows/efps.yml
@@ -65,3 +65,17 @@ jobs:
           VITE_PERF_EFPS_DATASET: ${{ secrets.PERF_EFPS_DATASET }}
           PERF_EFPS_SANITY_TOKEN: ${{ secrets.PERF_EFPS_SANITY_TOKEN }}
         run: pnpm efps:test
+
+      - name: PR comment with report
+        uses: thollander/actions-comment-pull-request@fabd468d3a1a0b97feee5f6b9e499eab0dd903f6 # v2
+        if: ${{ github.event_name == 'pull_request' }}
+        with:
+          comment_tag: "efps-report"
+          filePath: ${{ github.workspace }}/perf/efps/results/benchmark-results.md
+
+      - uses: actions/upload-artifact@v3
+        if: always()
+        with:
+          name: efps-report
+          path: perf/efps/results
+          retention-days: 30
From 626a47abacce37e9b38534ce3c5059f0beb18a47 Mon Sep 17 00:00:00 2001
From: Binoy Patel
Date: Thu, 19 Sep 2024 23:01:39 -0400
Subject: [PATCH 08/21] test: keep playwright install script

---
 perf/efps/index.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf/efps/index.ts b/perf/efps/index.ts
index 81f382e8b91..87fefb201af 100644
--- a/perf/efps/index.ts
+++ b/perf/efps/index.ts
@@ -80,7 +80,7 @@ const sanityPackagePath = path.join(tmpDir, 'node_modules', 'sanity')
 
 await exec({
   text: ['Ensuring playwright is installed…', 'Playwright is installed'],
-  command: 'npx playwright install --with-deps',
+  command: 'npx playwright install',
   spinner,
 })
From f03831b9113af100a66040afe56c4eb312ae8b64 Mon Sep 17 00:00:00 2001
From: Rico Kahler
Date: Tue, 24 Sep 2024 09:37:44 -0500
Subject: [PATCH 09/21] comment out alias for now

---
 perf/efps/runTest.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf/efps/runTest.ts b/perf/efps/runTest.ts
index 0bf11f3b826..a7991d1916f 100644
--- a/perf/efps/runTest.ts
+++ b/perf/efps/runTest.ts
@@ -52,7 +52,7 @@ export async function runTest({
 
   const alias: Record<string, string> = {'#config': fileURLToPath(test.configPath!)}
   if (sanityPackagePath) {
-    alias.sanity = sanityPackagePath
+    // alias.sanity = sanityPackagePath
   }
 
   await vite.build({
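The workflow above keys its browser cache on the installed Playwright version, parsed from the CLI's `Version x.y.z` output. The same derivation as a Node sketch (the printed version is only an example):

```ts
import {execSync} from 'node:child_process'

// Equivalent of: npx playwright --version | sed 's/Version //'
const raw = execSync('npx playwright --version', {encoding: 'utf8'}).trim()
const version = raw.replace(/^Version /, '')

console.log(version) // e.g. "1.47.0" → cache key "playwright-browsers-1.47.0"
```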
From 43b3aaeb401201ebf3ee39524b119afa9d7cc397 Mon Sep 17 00:00:00 2001
From: Rico Kahler
Date: Tue, 24 Sep 2024 09:59:07 -0500
Subject: [PATCH 10/21] remove profiling for now

---
 perf/efps/runTest.ts | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/perf/efps/runTest.ts b/perf/efps/runTest.ts
index a7991d1916f..805fb78f93f 100644
--- a/perf/efps/runTest.ts
+++ b/perf/efps/runTest.ts
@@ -11,7 +11,6 @@ import sourcemaps from 'rollup-plugin-sourcemaps'
 import handler from 'serve-handler'
 import * as vite from 'vite'
 
-import {remapCpuProfile} from './helpers/remapCpuProfile'
 import {type EfpsResult, type EfpsTest, type EfpsTestRunnerContext} from './types'
 
 const workspaceDir = path.dirname(fileURLToPath(import.meta.url))
@@ -109,7 +108,7 @@ export async function runTest({
       typeof test.document === 'function' ? await test.document(runnerContext) : test.document
     document = await client.create(documentToCreate)
 
-    const cdp = await context.newCDPSession(page)
+    // const cdp = await context.newCDPSession(page)
 
     log('Loading editor…')
     await page.goto(
@@ -118,8 +117,8 @@ export async function runTest({
       )};type=${encodeURIComponent(documentToCreate._type)}`,
     )
 
-    await cdp.send('Profiler.enable')
-    await cdp.send('Profiler.start')
+    // await cdp.send('Profiler.enable')
+    // await cdp.send('Profiler.start')
 
     log('Benchmarking…')
     const result = await test.run({...runnerContext, document})
@@ -127,22 +126,22 @@ export async function runTest({
     log('Saving results…')
     const results = Array.isArray(result) ? result : [result]
 
-    const {profile} = await cdp.send('Profiler.stop')
-    const remappedProfile = await remapCpuProfile(profile, outDir)
+    // const {profile} = await cdp.send('Profiler.stop')
+    // const remappedProfile = await remapCpuProfile(profile, outDir)
 
     await fs.promises.mkdir(testResultsDir, {recursive: true})
     await fs.promises.writeFile(
       path.join(testResultsDir, 'results.json'),
       JSON.stringify(results, null, 2),
     )
-    await fs.promises.writeFile(
-      path.join(testResultsDir, 'raw.cpuprofile'),
-      JSON.stringify(profile),
-    )
-    await fs.promises.writeFile(
-      path.join(testResultsDir, 'mapped.cpuprofile'),
-      JSON.stringify(remappedProfile),
-    )
+    // await fs.promises.writeFile(
+    //   path.join(testResultsDir, 'raw.cpuprofile'),
+    //   JSON.stringify(profile),
+    // )
+    // await fs.promises.writeFile(
+    //   path.join(testResultsDir, 'mapped.cpuprofile'),
+    //   JSON.stringify(remappedProfile),
+    // )
 
     spinner.succeed(`Ran benchmark '${test.name}' (${versionLabel})`)
 
     return results
From ea871aec4ec1062f1f0311cf80b7fb2dcd00b412 Mon Sep 17 00:00:00 2001
From: Rico Kahler
Date: Tue, 24 Sep 2024 12:08:22 -0500
Subject: [PATCH 11/21] cap fps to 60

---
 perf/efps/helpers/measureFpsForInput.ts | 6 +++---
 perf/efps/helpers/measureFpsForPte.ts   | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/perf/efps/helpers/measureFpsForInput.ts b/perf/efps/helpers/measureFpsForInput.ts
index 4b0c584c795..93f13b1dc37 100644
--- a/perf/efps/helpers/measureFpsForInput.ts
+++ b/perf/efps/helpers/measureFpsForInput.ts
@@ -74,9 +74,9 @@ export async function measureFpsForInput(input: Locator): Promise<EfpsResult> {
     return matchingEvent.timestamp - inputEvent.timestamp
   })
 
-  const p50 = 1000 / calculatePercentile(latencies, 0.5)
-  const p75 = 1000 / calculatePercentile(latencies, 0.75)
-  const p90 = 1000 / calculatePercentile(latencies, 0.9)
+  const p50 = Math.min(1000 / calculatePercentile(latencies, 0.5), 60)
+  const p75 = Math.min(1000 / calculatePercentile(latencies, 0.75), 60)
+  const p90 = Math.min(1000 / calculatePercentile(latencies, 0.9), 60)
 
   return {p50, p75, p90, latencies}
 }
diff --git a/perf/efps/helpers/measureFpsForPte.ts b/perf/efps/helpers/measureFpsForPte.ts
index ffa233e9064..ee54cd719f1 100644
--- a/perf/efps/helpers/measureFpsForPte.ts
+++ b/perf/efps/helpers/measureFpsForPte.ts
@@ -86,9 +86,9 @@ export async function measureFpsForPte(pteField: Locator): Promise<EfpsResult> {
     return matchingEvent.timestamp - inputEvent.timestamp - matchingEvent.textContentProcessingTime
   })
 
-  const p50 = 1000 / calculatePercentile(latencies, 0.5)
-  const p75 = 1000 / calculatePercentile(latencies, 0.75)
-  const p90 = 1000 / calculatePercentile(latencies, 0.9)
+  const p50 = Math.min(1000 / calculatePercentile(latencies, 0.5), 60)
+  const p75 = Math.min(1000 / calculatePercentile(latencies, 0.75), 60)
+  const p90 = Math.min(1000 / calculatePercentile(latencies, 0.9), 60)
 
   return {p50, p75, p90, latencies}
 }
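Capping the derived rate at 60 reflects the ceiling a typical 60 Hz display imposes: at most one render per frame. The latency-to-eFPS conversion in isolation, with illustrative latencies:

```ts
// Convert an input-to-paint latency percentile (ms) into eFPS, clamped at 60
const toEfps = (latencyMs: number) => Math.min(1000 / latencyMs, 60)

console.log(toEfps(12)) // 60 — capped, since 1000 / 12 ≈ 83.3
console.log(toEfps(25)) // 40
```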
From c65513cf6d036e44cb9d4b3e5bc99511a0d5e041 Mon Sep 17 00:00:00 2001
From: Rico Kahler
Date: Wed, 25 Sep 2024 06:38:38 -0500
Subject: [PATCH 12/21] wip

---
 perf/efps/helpers/measureFpsForInput.ts |   6 +++---
 perf/efps/helpers/measureFpsForPte.ts   |   6 +++---
 perf/efps/index.ts                      | 244 +++++++++++-------------
 perf/efps/runTest.ts                    |  30 +++---
 4 files changed, 132 insertions(+), 154 deletions(-)

diff --git a/perf/efps/helpers/measureFpsForInput.ts b/perf/efps/helpers/measureFpsForInput.ts
index 93f13b1dc37..417cd3f191b 100644
--- a/perf/efps/helpers/measureFpsForInput.ts
+++ b/perf/efps/helpers/measureFpsForInput.ts
@@ -74,9 +74,9 @@ export async function measureFpsForInput(input: Locator): Promise<EfpsResult> {
     return matchingEvent.timestamp - inputEvent.timestamp
   })
 
-  const p50 = Math.min(1000 / calculatePercentile(latencies, 0.5), 60)
-  const p75 = Math.min(1000 / calculatePercentile(latencies, 0.75), 60)
-  const p90 = Math.min(1000 / calculatePercentile(latencies, 0.9), 60)
+  const p50 = Math.min(1000 / calculatePercentile(latencies, 0.5), 100)
+  const p75 = Math.min(1000 / calculatePercentile(latencies, 0.75), 100)
+  const p90 = Math.min(1000 / calculatePercentile(latencies, 0.9), 100)
 
   return {p50, p75, p90, latencies}
 }
diff --git a/perf/efps/helpers/measureFpsForPte.ts b/perf/efps/helpers/measureFpsForPte.ts
index ee54cd719f1..6b6a294af4d 100644
--- a/perf/efps/helpers/measureFpsForPte.ts
+++ b/perf/efps/helpers/measureFpsForPte.ts
@@ -86,9 +86,9 @@ export async function measureFpsForPte(pteField: Locator): Promise<EfpsResult> {
     return matchingEvent.timestamp - inputEvent.timestamp - matchingEvent.textContentProcessingTime
   })
 
-  const p50 = Math.min(1000 / calculatePercentile(latencies, 0.5), 60)
-  const p75 = Math.min(1000 / calculatePercentile(latencies, 0.75), 60)
-  const p90 = Math.min(1000 / calculatePercentile(latencies, 0.9), 60)
+  const p50 = Math.min(1000 / calculatePercentile(latencies, 0.5), 100)
+  const p75 = Math.min(1000 / calculatePercentile(latencies, 0.75), 100)
+  const p90 = Math.min(1000 / calculatePercentile(latencies, 0.9), 100)
 
   return {p50, p75, p90, latencies}
 }
diff --git a/perf/efps/index.ts b/perf/efps/index.ts
index 87fefb201af..de7940542b8 100644
--- a/perf/efps/index.ts
+++ b/perf/efps/index.ts
@@ -22,6 +22,9 @@ import singleString from './tests/singleString/singleString'
 import synthetic from './tests/synthetic/synthetic'
 import {type EfpsResult} from './types'
 
+const tag = 'latest'
+const deltaThreshold = 0.1
+
 const headless = true
 const tests = [singleString, recipe, article, synthetic]
 
 const projectId = process.env.VITE_PERF_EFPS_PROJECT_ID!
 const dataset = process.env.VITE_PERF_EFPS_DATASET!
@@ -66,17 +69,17 @@ await exec({
   cwd: monorepoRoot,
 })
 
-// Prepare the latest version of the 'sanity' package
-const tmpDir = path.join(os.tmpdir(), `sanity-latest-${Date.now()}`)
+const tmpDir = path.join(os.tmpdir(), `sanity-${tag}-${Date.now()}`)
 await fs.promises.mkdir(tmpDir, {recursive: true})
 spinner.start('')
 await exec({
-  command: 'npm install sanity@latest --no-save',
+  command: `pnpm install sanity@${tag}`,
   cwd: tmpDir,
   spinner,
-  text: ['Downloading latest sanity package…', 'Downloaded latest sanity package'],
+  text: [`Downloading sanity@${tag} package…`, `Downloaded sanity@${tag}`],
 })
-const sanityPackagePath = path.join(tmpDir, 'node_modules', 'sanity')
+const baseSanityPkgPath = path.join(tmpDir, 'node_modules', 'sanity')
+const localSanityPkgPath = path.dirname(fileURLToPath(import.meta.resolve('sanity/package.json')))
 
 await exec({
   text: ['Ensuring playwright is installed…', 'Playwright is installed'],
   command: 'npx playwright install',
   spinner,
 })
 
-const table = new Table({
-  head: [chalk.bold('benchmark'), 'Passed?', 'p50 eFPS (Δ%)', 'p75 eFPS (Δ%)', 'p90 eFPS (Δ%)'].map(
-    (cell) => chalk.cyan(cell),
-  ),
-})
-
-const markdownRows: string[] = []
-
 const formatFps = (fps: number) => {
   const rounded = fps.toFixed(1)
+  if (fps >= 100) return chalk.green('99.9+')
   if (fps >= 60) return chalk.green(rounded)
-  if (fps < 20) return chalk.red(rounded)
-  return chalk.yellow(rounded)
+  if (fps >= 20) return chalk.yellow(rounded)
+  return chalk.red(rounded)
 }
 
-const formatPercentage = (value: number): string => {
-  const rounded = value.toFixed(1)
-  const sign = value >= 0 ? '+' : ''
-  if (value > -50) return `${sign}${rounded}%`
+const formatPercentage = (delta: number): string => {
+  const percentage = delta * 100
+  const rounded = percentage.toFixed(1)
+  const sign = delta >= 0 ? '+' : ''
+  if (delta >= -deltaThreshold) return `${sign}${rounded}%`
   return chalk.red(`${sign}${rounded}%`)
 }
 
 // For markdown formatting without colors
 const formatFpsPlain = (fps: number) => {
   const rounded = fps.toFixed(1)
   return rounded
 }
 
 const formatPercentagePlain = (value: number): string => {
   const rounded = value.toFixed(1)
   const sign = value >= 0 ? '+' : ''
   return `${sign}${rounded}%`
 }
 
-function getStatus(p50Diff: number, p75Diff: number, p90Diff: number): 'error' | 'passed' {
-  if (p50Diff < -50 || p75Diff < -50 || p90Diff < -50) {
-    return 'error'
-  }
-  return 'passed'
-}
-
-function getStatusEmoji(status: 'error' | 'passed'): string {
-  if (status === 'error') return '🔴'
-  return '✅'
-}
-
-// Initialize the overall status
-let overallStatus: 'error' | 'passed' = 'passed'
-
-interface TestResult {
-  testName: string
-  version: 'local' | 'latest'
-  results: EfpsResult[]
-}
-
-const allResults: TestResult[] = []
-
+const testOutput: Array<{
+  name: string
+  results: Array<EfpsResult & {delta: number; passed: boolean}>
+}> = []
 for (let i = 0; i < tests.length; i++) {
   const test = tests[i]
 
   // Run with local 'sanity' package
+  // [RUNS] [ singleString ] [local] [latest] [...]
+  //
   const localResults = await runTest({
-    prefix: `Running '${test.name}' [${i + 1}/${tests.length}] with local 'sanity'…`,
+    prefix: `Running test '${test.name}' [${i + 1}/${tests.length}] with local 'sanity'…`,
     test,
     resultsDir,
-    spinner,
     client,
     headless,
     projectId,
+    sanityPkgPath: localSanityPkgPath,
+    log: () => {},
   })
 
-  allResults.push({
-    testName: test.name,
-    version: 'local',
-    results: localResults,
-  })
-
   // Run with latest 'sanity' package
-  const latestResults = await runTest({
+  const baseResults = await runTest({
     prefix: `Running '${test.name}' [${i + 1}/${tests.length}] with 'sanity@latest'…`,
     test,
     resultsDir,
-    spinner,
     client,
     headless,
     projectId,
-    sanityPackagePath,
+    sanityPkgPath: baseSanityPkgPath,
+    log: () => {},
   })
 
-  allResults.push({
-    testName: test.name,
-    version: 'latest',
-    results: latestResults,
+  const combinedResults = localResults.map((localResult, index) => {
+    const baseResult = baseResults[index]
+    const delta = (baseResult.p50 - localResult.p50) / baseResult.p50
+    return {
+      ...localResult,
+      delta,
+      passed: delta >= -deltaThreshold,
+    }
   })
+
+  testOutput.push({name: test.name, results: combinedResults})
 }
 
+const p50Min = testOutput.flatMap((i) => i.results).sort((a, b) => a.p50 - b.p50)[0]
+
+const table = new Table({
+  head: [chalk.bold('Benchmark'), 'eFPS', `vs \`${tag}\``, 'Passed?'].map((cell) =>
+    chalk.cyan(cell),
+  ),
+})
+
-for (const test of tests) {
-  const localResult = allResults.find((r) => r.testName === test.name && r.version === 'local')
-  const latestResult = allResults.find((r) => r.testName === test.name && r.version === 'latest')
-
-  if (localResult && latestResult) {
-    const localResultsMap = new Map()
-    for (const res of localResult.results) {
-      localResultsMap.set(res.label, res)
-    }
-    const latestResultsMap = new Map()
-    for (const res of latestResult.results) {
-      latestResultsMap.set(res.label, res)
-    }
-
-    for (const [label, latest] of latestResultsMap) {
-      const local = localResultsMap.get(label)
-      if (local) {
-        // Compute percentage differences
-        const p50Diff = ((local.p50 - latest.p50) / latest.p50) * 100
-        const p75Diff = ((local.p75 - latest.p75) / latest.p75) * 100
-        const p90Diff = ((local.p90 - latest.p90) / latest.p90) * 100
-
-        // Determine test status
-        const testStatus = getStatus(p50Diff, p75Diff, p90Diff)
-
-        // Update overall status
-        if (testStatus === 'error') {
-          overallStatus = 'error'
-        }
-
-        const rowLabel = [chalk.bold(test.name), label ? `(${label})` : ''].join(' ')
-
-        table.push([
-          rowLabel,
-          getStatusEmoji(testStatus),
-          `${formatFps(local.p50)} (${formatPercentage(p50Diff)})`,
-          `${formatFps(local.p75)} (${formatPercentage(p75Diff)})`,
-          `${formatFps(local.p90)} (${formatPercentage(p90Diff)})`,
-        ])
-
-        // Add to markdown rows
-        const markdownRow = [
-          [test.name, label ? `(${label})` : ''].join(' '),
-          getStatusEmoji(testStatus),
-          `${formatFpsPlain(local.p50)} (${formatPercentagePlain(p50Diff)})`,
-          `${formatFpsPlain(local.p75)} (${formatPercentagePlain(p75Diff)})`,
-          `${formatFpsPlain(local.p90)} (${formatPercentagePlain(p90Diff)})`,
-        ]
-        markdownRows.push(`| ${markdownRow.join(' | ')} |`)
-      } else {
-        spinner.fail(`Missing local result for test '${test.name}', label '${label}'`)
-      }
-    }
-  } else {
-    spinner.fail(`Missing results for test '${test.name}'`)
+for (const {name, results} of testOutput) {
+  for (const {delta, p50, label, passed} of results) {
+    table.push([
+      label ? `${name} (${label})` : name,
+      formatFps(p50),
+      formatPercentage(delta),
+      passed ? '✅' : '🔴',
+    ])
   }
 }
 
+const allPassed = testOutput.flatMap((i) => i.results).every((i) => i.passed)
+
+// const markdownRows: string[] = []
+
 console.log(table.toString())
 console.log(`
+${chalk.bold('Lowest eFPS:')} ${formatFps(p50Min.p50)}`)
+console.log(`
 
-│ ${chalk.bold('eFPS — editor "Frames Per Second"')}
-│
-│ The number of renders ("frames") that is assumed to be possible
+│ ${chalk.bold('eFPS — Editor "Frames Per Second"')}
+│ The number of renders, aka "frames", that are assumed to be possible
 │ within a second. Derived from input latency. ${chalk.green('Higher')} is better.
+│
+│ ${chalk.bold(`vs \`${tag}\``)}
+│ The percentage difference of the current branch when compared to \`sanity@${tag}\`.
+│
+│ ${chalk.bold('Passed?')}
+│ Tests are failed when any of the median eFPS results perform more than 10% worse.
 `)
 
 // Map overallStatus to status text
-const statusText = overallStatus === 'error' ? 'Error' : 'Passed'
-const statusEmoji = getStatusEmoji(overallStatus)
+// const statusText = overallStatus === 'error' ? 'Error' : 'Passed'
+// const statusEmoji = getStatusEmoji(overallStatus)
+
+const markdownRows = testOutput
+  .flatMap((test) =>
+    test.results.map((result) => ({
+      ...result,
+      label: result.label ? `${test.name} (${result.label})` : test.name,
+    })),
+  )
+  .map(
+    ({label, p50, delta, passed}) =>
+      `| ${label} | ${formatFpsPlain(p50)} | ${formatPercentagePlain(delta)} | ${passed ? '✅' : '🔴'} |`,
+  )
+  .join('\n')
 
 // Build the markdown content
-const markdownContent = [
-  '# Benchmark Results',
-  '',
-  `<details>`,
-  `<summary>${statusEmoji} Performance Benchmark Results — Status: **${statusText}**</summary>`,
-  '',
-  '| Benchmark | Passed? | p50 eFPS (Δ%) | p75 eFPS (Δ%) | p90 eFPS (Δ%) |',
-  '|-----------|---------|---------------|---------------|---------------|',
-  ...markdownRows,
-  '</details>',
-  '',
-  '> **eFPS — editor "Frames Per Second"**',
-  '> ',
-  '> The number of renders ("frames") that is assumed to be possible within a second. Derived from input latency. **Higher** is better.',
-  '',
-].join('\n')
+const markdown = `
+<details>
+<summary>
+⚡️ Editor Performance Report
+
+| ${formatFpsPlain(p50Min.p50)}<br/>eFPS | ${formatPercentagePlain(p50Min.delta)}<br/>vs ${tag} | ${allPassed ? '✅' : '🔴'}<br/>${allPassed ? 'Passed' : 'Failed'} |
+| --- | --- | --- |
+
+> **eFPS** — Editor "Frames Per Second"
+> The number of renders aka "frames" that is assumed to be possible within a second. Derived from input latency. _Higher_ is better.
+
+↓ expand for details
+</summary>
+
+| Benchmark | eFPS | vs \`${tag}\` | Passed? |
+|-----------| ---: | ------------: | :-----: |
+${markdownRows}
+
+> **eFPS — Editor "Frames Per Second"**
+> The number of renders, aka "frames", that are assumed to be possible within a second.
+> Derived from input latency. _Higher_ is better.
+>
+> **vs \`${tag}\`**
+> The percentage difference of the current branch when compared to \`sanity@${tag}\`.
+>
+> **Passed?**
+> Tests are failed when any of the median eFPS results perform more than 10% worse.
+</details>
+`
 
 // Write markdown file to root of results
 const markdownOutputPath = path.join(workspaceDir, 'results', 'benchmark-results.md')
-await fs.promises.writeFile(markdownOutputPath, markdownContent)
+await fs.promises.writeFile(markdownOutputPath, markdown)
 
 // Exit with code 1 if regression detected
-if (overallStatus === 'error') {
-  console.error(chalk.red('Performance regression detected exceeding 50% threshold.'))
+if (!allPassed) {
   process.exit(1)
 }
diff --git a/perf/efps/runTest.ts b/perf/efps/runTest.ts
index 805fb78f93f..29cf727b598 100644
--- a/perf/efps/runTest.ts
+++ b/perf/efps/runTest.ts
@@ -5,7 +5,6 @@ import {fileURLToPath} from 'node:url'
 
 import {type SanityClient} from '@sanity/client'
 import react from '@vitejs/plugin-react'
-import {type Ora} from 'ora'
 import {chromium} from 'playwright'
 import sourcemaps from 'rollup-plugin-sourcemaps'
 import handler from 'serve-handler'
@@ -19,39 +18,42 @@ interface RunTestOptions {
   prefix: string
   test: EfpsTest
   resultsDir: string
-  spinner: Ora
+  // spinner: Ora
   projectId: string
   headless: boolean
   client: SanityClient
-  sanityPackagePath?: string // Add this line
+  sanityPkgPath: string
+  log: (text: string) => void
 }
 
 export async function runTest({
   prefix,
   test,
   resultsDir,
-  spinner,
+  // spinner,
   projectId,
   headless,
   client,
-  sanityPackagePath,
+  sanityPkgPath,
+  log,
 }: RunTestOptions): Promise<EfpsResult[]> {
-  const log = (text: string) => {
-    spinner.text = `${prefix}\n  └ ${text}`
-  }
+  console.log(prefix)
+  // const log = (text: string) => {
+  //   // spinner.text = `${prefix}\n  └ ${text}`
+  // }
 
-  spinner.start(prefix)
+  // spinner.start(prefix)
 
-  const versionLabel = sanityPackagePath ? 'latest' : 'local'
+  const versionLabel = sanityPkgPath ? 'latest' : 'local'
   const outDir = path.join(workspaceDir, 'builds', test.name, versionLabel)
   const testResultsDir = path.join(resultsDir, test.name, versionLabel)
 
   await fs.promises.mkdir(outDir, {recursive: true})
   log('Building…')
 
-  const alias: Record<string, string> = {'#config': fileURLToPath(test.configPath!)}
-  if (sanityPackagePath) {
-    // alias.sanity = sanityPackagePath
+  const alias: Record<string, string> = {
+    '#config': fileURLToPath(test.configPath!),
+    'sanity': sanityPkgPath,
   }
 
   await vite.build({
@@ -143,7 +145,7 @@ export async function runTest({
   //   JSON.stringify(remappedProfile),
   // )
 
-  spinner.succeed(`Ran benchmark '${test.name}' (${versionLabel})`)
+  // spinner.succeed(`Ran benchmark '${test.name}' (${versionLabel})`)
 
   return results
 } finally {
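Note the sign convention chosen here: `(base - local) / base` comes out positive when the local build is slower. The next patch flips it so that a negative delta reads as a slowdown. Both variants side by side, with made-up medians:

```ts
const baseP50 = 60 // baseline median eFPS
const localP50 = 48 // local median eFPS (slower)

const deltaAsCommitted = (baseP50 - localP50) / baseP50 // +0.2 — positive despite the slowdown
const deltaAsFlipped = (localP50 - baseP50) / baseP50 // -0.2 — negative means slower

console.log(deltaAsCommitted, deltaAsFlipped)
```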
From c93d109c573198be5d6d787dc4fb199a24d8a1ad Mon Sep 17 00:00:00 2001
From: Rico Kahler
Date: Wed, 25 Sep 2024 07:20:19 -0500
Subject: [PATCH 13/21] wip

---
 perf/efps/index.ts | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/perf/efps/index.ts b/perf/efps/index.ts
index de7940542b8..4074a42b9a7 100644
--- a/perf/efps/index.ts
+++ b/perf/efps/index.ts
@@ -105,13 +105,15 @@ const formatPercentage = (delta: number): string => {
 
 // For markdown formatting without colors
 const formatFpsPlain = (fps: number) => {
+  if (fps >= 100) return '99.9+'
   const rounded = fps.toFixed(1)
   return rounded
 }
 
-const formatPercentagePlain = (value: number): string => {
-  const rounded = value.toFixed(1)
-  const sign = value >= 0 ? '+' : ''
+const formatPercentagePlain = (delta: number): string => {
+  const percentage = delta * 100
+  const rounded = percentage.toFixed(1)
+  const sign = delta >= 0 ? '+' : ''
   return `${sign}${rounded}%`
 }
 
@@ -122,9 +124,6 @@ const testOutput: Array<{
 for (let i = 0; i < tests.length; i++) {
   const test = tests[i]
 
-  // Run with local 'sanity' package
-  // [RUNS] [ singleString ] [local] [latest] [...]
-  //
   const localResults = await runTest({
     prefix: `Running test '${test.name}' [${i + 1}/${tests.length}] with local 'sanity'…`,
     test,
@@ -150,7 +149,7 @@ for (let i = 0; i < tests.length; i++) {
 
   const combinedResults = localResults.map((localResult, index) => {
     const baseResult = baseResults[index]
-    const delta = (baseResult.p50 - localResult.p50) / baseResult.p50
+    const delta = (localResult.p50 - baseResult.p50) / baseResult.p50
     return {
       ...localResult,
       delta,
@@ -182,8 +181,6 @@ for (const {name, results} of testOutput) {
 
 const allPassed = testOutput.flatMap((i) => i.results).every((i) => i.passed)
 
-// const markdownRows: string[] = []
-
 console.log(table.toString())
 console.log(`
 ${chalk.bold('Lowest eFPS:')} ${formatFps(p50Min.p50)}`)
@@ -200,10 +197,6 @@ console.log(`
 │ Tests are failed when any of the median eFPS results perform more than 10% worse.
 `)
 
-// Map overallStatus to status text
-// const statusText = overallStatus === 'error' ? 'Error' : 'Passed'
-// const statusEmoji = getStatusEmoji(overallStatus)
-
 const markdownRows = testOutput
   .flatMap((test) =>
     test.results.map((result) => ({
From e74cdf1c71e36aec6221480e570929c1efc540a4 Mon Sep 17 00:00:00 2001
From: Rico Kahler
Date: Wed, 25 Sep 2024 07:32:51 -0500
Subject: [PATCH 14/21] remove useless test

---
 perf/efps/index.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/perf/efps/index.ts b/perf/efps/index.ts
index 4074a42b9a7..cffb809f1b5 100644
--- a/perf/efps/index.ts
+++ b/perf/efps/index.ts
@@ -18,7 +18,7 @@ import {exec} from './helpers/exec'
 import {runTest} from './runTest'
 import article from './tests/article/article'
 import recipe from './tests/recipe/recipe'
-import singleString from './tests/singleString/singleString'
+// import singleString from './tests/singleString/singleString'
 import synthetic from './tests/synthetic/synthetic'
 import {type EfpsResult} from './types'
 
@@ -26,7 +26,7 @@ const tag = 'latest'
 const deltaThreshold = 0.1
 
 const headless = true
-const tests = [singleString, recipe, article, synthetic]
+const tests = [recipe, article, synthetic]
 
 const projectId = process.env.VITE_PERF_EFPS_PROJECT_ID!
 const dataset = process.env.VITE_PERF_EFPS_DATASET!
From 524bed595f38529fe075b004910716ba294b52f7 Mon Sep 17 00:00:00 2001
From: Rico Kahler
Date: Wed, 25 Sep 2024 07:43:54 -0500
Subject: [PATCH 15/21] change to .2

---
 perf/efps/index.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf/efps/index.ts b/perf/efps/index.ts
index cffb809f1b5..82655d2340b 100644
--- a/perf/efps/index.ts
+++ b/perf/efps/index.ts
@@ -23,7 +23,7 @@ import synthetic from './tests/synthetic/synthetic'
 import {type EfpsResult} from './types'
 
 const tag = 'latest'
-const deltaThreshold = 0.1
+const deltaThreshold = 0.2
 
 const headless = true
 const tests = [recipe, article, synthetic]
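With `deltaThreshold` raised to 0.2, a result passes as long as its median does not fall more than 20% below the baseline. The gate in isolation, with illustrative deltas:

```ts
const deltaThreshold = 0.2

const passed = (delta: number) => delta >= -deltaThreshold

console.log(passed(-0.12)) // true  — a 12% drop is tolerated
console.log(passed(-0.27)) // false — a 27% drop fails the run
```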
From a3164a371d56e9dd3fea40378a193e179baf389f Mon Sep 17 00:00:00 2001
From: Rico Kahler
Date: Thu, 26 Sep 2024 13:25:34 -0500
Subject: [PATCH 16/21] what if we ran it 3 times

---
 perf/efps/index.ts   | 123 +++++++++++++++++++++++++++++++++++--------
 perf/efps/runTest.ts |   1 +
 2 files changed, 101 insertions(+), 23 deletions(-)

diff --git a/perf/efps/index.ts b/perf/efps/index.ts
index 82655d2340b..beec496f4eb 100644
--- a/perf/efps/index.ts
+++ b/perf/efps/index.ts
@@ -18,7 +18,6 @@ import {exec} from './helpers/exec'
 import {runTest} from './runTest'
 import article from './tests/article/article'
 import recipe from './tests/recipe/recipe'
-// import singleString from './tests/singleString/singleString'
 import synthetic from './tests/synthetic/synthetic'
 import {type EfpsResult} from './types'
 
@@ -81,6 +80,16 @@ await exec({
 const baseSanityPkgPath = path.join(tmpDir, 'node_modules', 'sanity')
 const localSanityPkgPath = path.dirname(fileURLToPath(import.meta.resolve('sanity/package.json')))
 
+// const compareTag = 'v3.57.0'
+
+// await exec({
+//   command: `pnpm install sanity@${compareTag}`,
+//   cwd: tmpDir,
+//   spinner,
+//   text: [`Downloading sanity@${compareTag} package…`, `Downloaded sanity@${compareTag}`],
+// })
+// const compareSanityPkgPath = path.join(tmpDir, 'node_modules', 'sanity')
+
 await exec({
   text: ['Ensuring playwright is installed…', 'Playwright is installed'],
   command: 'npx playwright install',
@@ -124,8 +133,28 @@ const testOutput: Array<{
 for (let i = 0; i < tests.length; i++) {
   const test = tests[i]
 
-  const localResults = await runTest({
-    prefix: `Running test '${test.name}' [${i + 1}/${tests.length}] with local 'sanity'…`,
+  const localResult1 = await runTest({
+    prefix: `Running test '${test.name}' [${i + 1}/${tests.length}] with local… 1`,
+    test,
+    resultsDir,
+    client,
+    headless,
+    projectId,
+    sanityPkgPath: localSanityPkgPath,
+    log: () => {},
+  })
+  const localResult2 = await runTest({
+    prefix: `Running test '${test.name}' [${i + 1}/${tests.length}] with local… 2`,
     test,
     resultsDir,
     client,
     headless,
     projectId,
     sanityPkgPath: localSanityPkgPath,
     log: () => {},
   })
+  const localResult3 = await runTest({
+    prefix: `Running test '${test.name}' [${i + 1}/${tests.length}] with local… 3`,
+    test,
+    resultsDir,
+    client,
+    headless,
+    projectId,
+    sanityPkgPath: localSanityPkgPath,
+    log: () => {},
+  })
 
-  // Run with latest 'sanity' package
-  const baseResults = await runTest({
-    prefix: `Running '${test.name}' [${i + 1}/${tests.length}] with 'sanity@latest'…`,
+  const localResults = localResult1.map((result1, index) => {
+    const result2 = localResult2[index]
+    const result3 = localResult3[index]
+
+    let min = result1
+    const arr = [result1, result2, result3]
+
+    for (const item of arr) {
+      if (item.p50 < min.p50) {
+        min = item
+      }
+    }
+
+    return min
+  })
+
+  const baseResult1 = await runTest({
+    prefix: `Running '${test.name}' [${i + 1}/${tests.length}] with 'sanity@${tag}'… 1`,
     test,
     resultsDir,
     client,
     headless,
     projectId,
     sanityPkgPath: baseSanityPkgPath,
     log: () => {},
   })
+  const baseResult2 = await runTest({
+    prefix: `Running '${test.name}' [${i + 1}/${tests.length}] with 'sanity@${tag}'… 2`,
+    test,
+    resultsDir,
+    client,
+    headless,
+    projectId,
+    sanityPkgPath: baseSanityPkgPath,
+    log: () => {},
+  })
+  const baseResult3 = await runTest({
+    prefix: `Running '${test.name}' [${i + 1}/${tests.length}] with 'sanity@${tag}'… 3`,
+    test,
+    resultsDir,
+    client,
+    headless,
+    projectId,
+    sanityPkgPath: baseSanityPkgPath,
+    log: () => {},
+  })
+
+  const baseResults = baseResult1.map((result1, index) => {
+    const result2 = baseResult2[index]
+    const result3 = baseResult3[index]
+
+    let min = result1
+    const arr = [result1, result2, result3]
+
+    for (const item of arr) {
+      if (item.p50 < min.p50) {
+        min = item
+      }
+    }
+
+    return min
+  })
 
   const combinedResults = localResults.map((localResult, index) => {
     const baseResult = baseResults[index]
@@ -194,23 +274,9 @@ console.log(`
 │ The percentage difference of the current branch when compared to \`sanity@${tag}\`.
 │
 │ ${chalk.bold('Passed?')}
-│ Tests are failed when any of the median eFPS results perform more than 10% worse.
+│ Tests are failed when any of the median eFPS results perform more than ${deltaThreshold * 100}% worse.
 `)
 
-const markdownRows = testOutput
-  .flatMap((test) =>
-    test.results.map((result) => ({
-      ...result,
-      label: result.label ? `${test.name} (${result.label})` : test.name,
-    })),
-  )
-  .map(
-    ({label, p50, delta, passed}) =>
-      `| ${label} | ${formatFpsPlain(p50)} | ${formatPercentagePlain(delta)} | ${passed ? '✅' : '🔴'} |`,
-  )
-  .join('\n')
-
-// Build the markdown content
 const markdown = `
 <details>
 <summary>
 ⚡️ Editor Performance Report
 
 | ${formatFpsPlain(p50Min.p50)}<br/>eFPS | ${formatPercentagePlain(p50Min.delta)}<br/>vs ${tag} | ${allPassed ? '✅' : '🔴'}<br/>${allPassed ? 'Passed' : 'Failed'} |
 | --- | --- | --- |
 
 > **eFPS** — Editor "Frames Per Second"
 > The number of renders aka "frames" that is assumed to be possible within a second. Derived from input latency. _Higher_ is better.
 
 ↓ expand for details
 </summary>
 
 | Benchmark | eFPS | vs \`${tag}\` | Passed? |
 |-----------| ---: | ------------: | :-----: |
-${markdownRows}
+${testOutput
+  .flatMap((test) =>
+    test.results.map((result) => ({
+      ...result,
+      label: result.label ? `${test.name} (${result.label})` : test.name,
+    })),
+  )
+  .map(
+    ({label, p50, delta, passed}) =>
+      `| ${label} | ${formatFpsPlain(p50)} | ${formatPercentagePlain(delta)} | ${passed ? '✅' : '🔴'} |`,
+  )
+  .join('\n')}
 
 > **eFPS — Editor "Frames Per Second"**
 > The number of renders, aka "frames", that are assumed to be possible within a second.
 > Derived from input latency. _Higher_ is better.
 >
 > **vs \`${tag}\`**
 > The percentage difference of the current branch when compared to \`sanity@${tag}\`.
 >
 > **Passed?**
-> Tests are failed when any of the median eFPS results perform more than 10% worse.
+> Tests are failed when any of the median eFPS results perform more than ${deltaThreshold * 100}% worse.
 `
diff --git a/perf/efps/runTest.ts b/perf/efps/runTest.ts
index 29cf727b598..722c53516ed 100644
--- a/perf/efps/runTest.ts
+++ b/perf/efps/runTest.ts
@@ -10,6 +10,7 @@ import sourcemaps from 'rollup-plugin-sourcemaps'
 import handler from 'serve-handler'
 import * as vite from 'vite'
 
+// TODO: add test duration to metrics
 import {type EfpsResult, type EfpsTest, type EfpsTestRunnerContext} from './types'
 
 const workspaceDir = path.dirname(fileURLToPath(import.meta.url))
From 2d72c2e1ca7f78c0643bc071d310332ef2222e0a Mon Sep 17 00:00:00 2001
From: Rico Kahler
Date: Thu, 26 Sep 2024 13:42:56 -0500
Subject: [PATCH 17/21] exact same

---
 perf/efps/index.ts | 55 +++++++++++++++++++++++-----------------------
 1 file changed, 27 insertions(+), 28 deletions(-)

diff --git a/perf/efps/index.ts b/perf/efps/index.ts
index beec496f4eb..34b51c35dd2 100644
--- a/perf/efps/index.ts
+++ b/perf/efps/index.ts
@@ -143,8 +143,8 @@ for (let i = 0; i < tests.length; i++) {
     sanityPkgPath: localSanityPkgPath,
     log: () => {},
   })
-  const localResult2 = await runTest({
-    prefix: `Running test '${test.name}' [${i + 1}/${tests.length}] with local… 2`,
+  const baseResult1 = await runTest({
+    prefix: `Running '${test.name}' [${i + 1}/${tests.length}] with 'sanity@${tag}'… 1`,
     test,
     resultsDir,
     client,
@@ -153,8 +153,8 @@ for (let i = 0; i < tests.length; i++) {
     sanityPkgPath: localSanityPkgPath,
     log: () => {},
   })
-  const localResult3 = await runTest({
-    prefix: `Running test '${test.name}' [${i + 1}/${tests.length}] with local… 3`,
+  const localResult2 = await runTest({
+    prefix: `Running test '${test.name}' [${i + 1}/${tests.length}] with local… 2`,
     test,
     resultsDir,
     client,
     headless,
     projectId,
     sanityPkgPath: localSanityPkgPath,
     log: () => {},
   })
-  const baseResult2 = await runTest({
-    prefix: `Running '${test.name}' [${i + 1}/${tests.length}] with 'sanity@${tag}'… 2`,
+  const baseResult2 = await runTest({
+    prefix: `Running '${test.name}' [${i + 1}/${tests.length}] with 'sanity@${tag}'… 2`,
    test,
     resultsDir,
     client,
     headless,
     projectId,
-    sanityPkgPath: baseSanityPkgPath,
+    sanityPkgPath: localSanityPkgPath,
     log: () => {},
   })
+  const localResult3 = await runTest({
+    prefix: `Running test '${test.name}' [${i + 1}/${tests.length}] with local… 3`,
+    test,
+    resultsDir,
+    client,
+    headless,
+    projectId,
+    sanityPkgPath: localSanityPkgPath,
+    log: () => {},
+  })
   const baseResult3 = await runTest({
     prefix: `Running '${test.name}' [${i + 1}/${tests.length}] with 'sanity@${tag}'… 3`,
     test,
     resultsDir,
     client,
     headless,
     projectId,
-    sanityPkgPath: baseSanityPkgPath,
+    sanityPkgPath: localSanityPkgPath,
     log: () => {},
   })
 
-  const localResults = localResult1.map((result1, index) => {
-    const result2 = localResult2[index]
-    const result3 = localResult3[index]
-
-    let min = result1
-    const arr = [result1, result2, result3]
-
-    for (const item of arr) {
-      if (item.p50 < min.p50) {
-        min = item
-      }
-    }
-
-    return min
-  })
-
+  const localResults = localResult1.map((result1, index) => {
+    const result2 = localResult2[index]
+    const result3 = localResult3[index]
+
+    let min = result1
+    const arr = [result1, result2, result3]
+
+    for (const item of arr) {
+      if (item.p50 < min.p50) {
+        min = item
+      }
+    }
+
+    return min
+  })
+
   const baseResults = baseResult1.map((result1, index) => {
     const result2 = baseResult2[index]
     const result3 = baseResult3[index]
0.75), + p90: calculatePercentile(values, 0.9), + p99: calculatePercentile(values, 0.99), + } +} diff --git a/perf/efps/helpers/calculatePercentile.ts b/perf/efps/helpers/calculatePercentile.ts deleted file mode 100644 index 54acbded118..00000000000 --- a/perf/efps/helpers/calculatePercentile.ts +++ /dev/null @@ -1,21 +0,0 @@ -export function calculatePercentile(numbers: number[], percentile: number): number { - // Sort the array in ascending order - const sorted = numbers.slice().sort((a, b) => a - b) - - // Calculate the index - const index = percentile * (sorted.length - 1) - - // If the index is an integer, return the value at that index - if (Number.isInteger(index)) { - return sorted[index] - } - - // Otherwise, interpolate between the two nearest values - const lowerIndex = Math.floor(index) - const upperIndex = Math.ceil(index) - const lowerValue = sorted[lowerIndex] - const upperValue = sorted[upperIndex] - - const fraction = index - lowerIndex - return lowerValue + (upperValue - lowerValue) * fraction -} diff --git a/perf/efps/helpers/measureBlockingTime.ts b/perf/efps/helpers/measureBlockingTime.ts new file mode 100644 index 00000000000..3828b1eff36 --- /dev/null +++ b/perf/efps/helpers/measureBlockingTime.ts @@ -0,0 +1,47 @@ +import {type Page} from 'playwright' + +const BLOCKING_TASK_THRESHOLD = 50 + +export function measureBlockingTime(page: Page): () => Promise { + const idleGapLengthsPromise = page.evaluate(async () => { + const idleGapLengths: number[] = [] + const done = false + let last = performance.now() + + const handler = () => { + const current = performance.now() + idleGapLengths.push(current - last) + last = current + + if (done) return + requestIdleCallback(handler) + } + + requestIdleCallback(handler) + + await new Promise((resolve) => { + document.addEventListener('__blockingTimeFinish', resolve, {once: true}) + }) + + return idleGapLengths + }) + + async function getBlockingTime() { + await page.evaluate(() => { + document.dispatchEvent(new CustomEvent('__blockingTimeFinish')) + }) + + const idleGapLengths = await idleGapLengthsPromise + + const blockingTime = idleGapLengths + // only consider the gap lengths that are blocking + .filter((idleGapLength) => idleGapLength > BLOCKING_TASK_THRESHOLD) + // subtract the allowed time so we're only left with blocking time + .map((idleGapLength) => idleGapLength - BLOCKING_TASK_THRESHOLD) + .reduce((sum, next) => sum + next, 0) + + return blockingTime + } + + return getBlockingTime +} diff --git a/perf/efps/helpers/measureFpsForInput.ts b/perf/efps/helpers/measureFpsForInput.ts index 417cd3f191b..22310a92fb2 100644 --- a/perf/efps/helpers/measureFpsForInput.ts +++ b/perf/efps/helpers/measureFpsForInput.ts @@ -1,9 +1,28 @@ -import {type Locator} from 'playwright' +import {type Page} from 'playwright' import {type EfpsResult} from '../types' -import {calculatePercentile} from './calculatePercentile' +import {aggregateLatencies} from './aggregateLatencies' +import {measureBlockingTime} from './measureBlockingTime' -export async function measureFpsForInput(input: Locator): Promise { +interface MeasureFpsForInputOptions { + label?: string + page: Page + fieldName: string +} + +export async function measureFpsForInput({ + label, + fieldName, + page, +}: MeasureFpsForInputOptions): Promise { + const start = Date.now() + + const input = page + .locator( + `[data-testid="field-${fieldName}"] input[type="text"], ` + + `[data-testid="field-${fieldName}"] textarea`, + ) + .first() await input.waitFor({state: 'visible'}) const 
characters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' @@ -49,6 +68,8 @@ export async function measureFpsForInput(input: Locator): Promise { await input.pressSequentially(startingMarker) await new Promise((resolve) => setTimeout(resolve, 500)) + const getBlockingTime = measureBlockingTime(page) + for (const character of characters) { inputEvents.push({character, timestamp: Date.now()}) await input.press(character) @@ -57,6 +78,9 @@ export async function measureFpsForInput(input: Locator): Promise { await input.blur() + await page.evaluate(() => window.document.dispatchEvent(new CustomEvent('__finish'))) + + const blockingTime = await getBlockingTime() const renderEvents = await rendersPromise await new Promise((resolve) => setTimeout(resolve, 500)) @@ -74,9 +98,10 @@ export async function measureFpsForInput(input: Locator): Promise { return matchingEvent.timestamp - inputEvent.timestamp }) - const p50 = Math.min(1000 / calculatePercentile(latencies, 0.5), 100) - const p75 = Math.min(1000 / calculatePercentile(latencies, 0.75), 100) - const p90 = Math.min(1000 / calculatePercentile(latencies, 0.9), 100) - - return {p50, p75, p90, latencies} + return { + latency: aggregateLatencies(latencies), + blockingTime, + label: label || fieldName, + runDuration: Date.now() - start, + } } diff --git a/perf/efps/helpers/measureFpsForPte.ts b/perf/efps/helpers/measureFpsForPte.ts index 6b6a294af4d..06f89786c1b 100644 --- a/perf/efps/helpers/measureFpsForPte.ts +++ b/perf/efps/helpers/measureFpsForPte.ts @@ -1,9 +1,22 @@ -import {type Locator} from 'playwright' +import {type Page} from 'playwright' import {type EfpsResult} from '../types' -import {calculatePercentile} from './calculatePercentile' +import {aggregateLatencies} from './aggregateLatencies' +import {measureBlockingTime} from './measureBlockingTime' -export async function measureFpsForPte(pteField: Locator): Promise { +interface MeasureFpsForPteOptions { + fieldName: string + label?: string + page: Page +} + +export async function measureFpsForPte({ + fieldName, + page, + label, +}: MeasureFpsForPteOptions): Promise { + const start = Date.now() + const pteField = page.locator(`[data-testid="field-${fieldName}"]`) const characters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' await pteField.waitFor({state: 'visible'}) @@ -24,14 +37,14 @@ export async function measureFpsForPte(pteField: Locator): Promise { }[] = [] const mutationObserver = new MutationObserver(() => { - const start = performance.now() + const textStart = performance.now() const textContent = el.textContent || '' - const end = performance.now() + const textEnd = performance.now() updates.push({ value: textContent, timestamp: Date.now(), - textContentProcessingTime: end - start, + textContentProcessingTime: textEnd - textStart, }) }) @@ -63,6 +76,7 @@ export async function measureFpsForPte(pteField: Locator): Promise { await contentEditable.pressSequentially(startingMarker) await new Promise((resolve) => setTimeout(resolve, 500)) + const getBlockingTime = measureBlockingTime(page) for (const character of characters) { inputEvents.push({character, timestamp: Date.now()}) await contentEditable.press(character) @@ -71,6 +85,7 @@ export async function measureFpsForPte(pteField: Locator): Promise { await contentEditable.blur() + const blockingTime = await getBlockingTime() const renderEvents = await rendersPromise const latencies = inputEvents.map((inputEvent) => { @@ -86,9 +101,10 @@ export async function measureFpsForPte(pteField: Locator): Promise { return 
matchingEvent.timestamp - inputEvent.timestamp - matchingEvent.textContentProcessingTime }) - const p50 = Math.min(1000 / calculatePercentile(latencies, 0.5), 100) - const p75 = Math.min(1000 / calculatePercentile(latencies, 0.75), 100) - const p90 = Math.min(1000 / calculatePercentile(latencies, 0.9), 100) - - return {p50, p75, p90, latencies} + return { + latency: aggregateLatencies(latencies), + blockingTime, + label: label || fieldName, + runDuration: Date.now() - start, + } } diff --git a/perf/efps/index.ts b/perf/efps/index.ts index 34b51c35dd2..65b5f46142f 100644 --- a/perf/efps/index.ts +++ b/perf/efps/index.ts @@ -19,13 +19,13 @@ import {runTest} from './runTest' import article from './tests/article/article' import recipe from './tests/recipe/recipe' import synthetic from './tests/synthetic/synthetic' -import {type EfpsResult} from './types' +import {type EfpsAbResult, type EfpsResult, type EfpsTest} from './types' -const tag = 'latest' -const deltaThreshold = 0.2 +const warningThreshold = 0.2 +const testAttemptCount = process.env.CI ? 3 : 1 const headless = true -const tests = [recipe, article, synthetic] +const tests = [article, recipe, synthetic] const projectId = process.env.VITE_PERF_EFPS_PROJECT_ID! const dataset = process.env.VITE_PERF_EFPS_DATASET! @@ -57,44 +57,25 @@ const resultsDir = path.join( .toLowerCase()}`, ) -const spinner = Ora() - -spinner.info(`Running ${tests.length} tests: ${tests.map((t) => `'${t.name}'`).join(', ')}`) - -await exec({ - text: ['Building the monorepo…', 'Built monorepo'], - command: 'pnpm run build', - spinner, - cwd: monorepoRoot, -}) - -const tmpDir = path.join(os.tmpdir(), `sanity-${tag}-${Date.now()}`) -await fs.promises.mkdir(tmpDir, {recursive: true}) -spinner.start('') -await exec({ - command: `pnpm install sanity@${tag}`, - cwd: tmpDir, - spinner, - text: [`Downloading sanity@${tag} package…`, `Downloaded sanity@${tag}`], -}) -const baseSanityPkgPath = path.join(tmpDir, 'node_modules', 'sanity') -const localSanityPkgPath = path.dirname(fileURLToPath(import.meta.resolve('sanity/package.json'))) +const getSanityPkgPathForTag = async (tag: string) => { + const tmpDir = path.join(os.tmpdir(), `sanity-${tag}`) -// const compareTag = 'v3.57.0' + try { + await fs.promises.rm(tmpDir, {recursive: true}) + } catch { + // intentionally blank + } + await fs.promises.mkdir(tmpDir, {recursive: true}) -// await exec({ -// command: `pnpm install sanity@${compareTag}`, -// cwd: tmpDir, -// spinner, -// text: [`Downloading sanity@${compareTag} package…`, `Downloaded sanity@${compareTag}`], -// }) -// const compareSanityPkgPath = path.join(tmpDir, 'node_modules', 'sanity') + await exec({ + command: `npm install sanity@${tag}`, + cwd: tmpDir, + spinner, + text: [`Downloading sanity@${tag} package…`, `Downloaded sanity@${tag}`], + }) -await exec({ - text: ['Ensuring playwright is installed…', 'Playwright is installed'], - command: 'npx playwright install', - spinner, -}) + return path.join(tmpDir, 'node_modules', 'sanity') +} const formatFps = (fps: number) => { const rounded = fps.toFixed(1) @@ -104,18 +85,40 @@ const formatFps = (fps: number) => { return chalk.red(rounded) } +const formatEfps = (latencyMs: number) => { + const efps = 1000 / latencyMs + const rounded = efps.toFixed(1) + + if (efps >= 100) return chalk.green('99.9+') + if (efps >= 60) return chalk.green(rounded) + if (efps >= 20) return chalk.yellow(rounded) + return chalk.red(rounded) +} + +const formatPercentageChange = (experiment: number, reference: number): string => { + if 
(experiment < 16 && reference < 16) return '-/-%' + const delta = (experiment - reference) / reference + if (!delta) return '-/-%' + const percentage = delta * 100 + const rounded = percentage.toFixed(1) + const sign = delta >= 0 ? '+' : '' + return `${sign}${rounded}%` +} + const formatPercentage = (delta: number): string => { const percentage = delta * 100 const rounded = percentage.toFixed(1) const sign = delta >= 0 ? '+' : '' - if (delta >= -deltaThreshold) return `${sign}${rounded}%` + if (delta >= -warningThreshold) return `${sign}${rounded}%` return chalk.red(`${sign}${rounded}%`) } // For markdown formatting without colors -const formatFpsPlain = (fps: number) => { - if (fps >= 100) return '99.9+' - const rounded = fps.toFixed(1) +const formatEfpsPlain = (latencyMs: number) => { + const efps = 1000 / latencyMs + const rounded = efps.toFixed(1) + + if (efps >= 100) return '99.9+' return rounded } @@ -126,200 +129,259 @@ const formatPercentagePlain = (delta: number): string => { return `${sign}${rounded}%` } -const testOutput: Array<{ - name: string - results: Array -}> = [] -for (let i = 0; i < tests.length; i++) { - const test = tests[i] +// START - const localResult1 = await runTest({ - prefix: `Running test '${test.name}' [${i + 1}/${tests.length}] with local… 1`, - test, - resultsDir, - client, - headless, - projectId, - sanityPkgPath: localSanityPkgPath, - log: () => {}, - }) - const baseResult1 = await runTest({ - prefix: `Running '${test.name}' [${i + 1}/${tests.length}] with 'sanity@${tag}'… 1`, - test, - resultsDir, - client, - headless, - projectId, - sanityPkgPath: localSanityPkgPath, - log: () => {}, - }) - const localResult2 = await runTest({ - prefix: `Running test '${test.name}' [${i + 1}/${tests.length}] with local… 2`, - test, - resultsDir, - client, - headless, - projectId, - sanityPkgPath: localSanityPkgPath, - log: () => {}, - }) - const baseResult2 = await runTest({ - prefix: `Running '${test.name}' [${i + 1}/${tests.length}] with 'sanity@${tag}'… 2`, - test, - resultsDir, - client, - headless, - projectId, - sanityPkgPath: localSanityPkgPath, - log: () => {}, - }) - const localResult3 = await runTest({ - prefix: `Running test '${test.name}' [${i + 1}/${tests.length}] with local… 3`, - test, - resultsDir, - client, - headless, - projectId, - sanityPkgPath: localSanityPkgPath, - log: () => {}, - }) - const baseResult3 = await runTest({ - prefix: `Running '${test.name}' [${i + 1}/${tests.length}] with 'sanity@${tag}'… 3`, - test, - resultsDir, - client, - headless, - projectId, - sanityPkgPath: localSanityPkgPath, - log: () => {}, - }) +const spinner = Ora() + +spinner.info(`Running ${tests.length} tests: ${tests.map((t) => `'${t.name}'`).join(', ')}`) - const localResults = localResult1.map((result1, index) => { - const result2 = localResult2[index] - const result3 = localResult3[index] +await exec({ + text: ['Building the monorepo…', 'Built monorepo'], + command: 'pnpm run build', + spinner, + cwd: monorepoRoot, +}) - let min = result1 - const arr = [result1, result2, result3] +await exec({ + text: ['Ensuring playwright is installed…', 'Playwright is installed'], + command: 'npx playwright install', + spinner, +}) - for (const item of arr) { - if (item.p50 < min.p50) { - min = item - } - } +const localSanityPkgPath = path.dirname(fileURLToPath(import.meta.resolve('sanity/package.json'))) - return min - }) +const referenceSanityPkgPath = await getSanityPkgPathForTag('v3.57.4') +const experimentSanityPkgPath = await getSanityPkgPathForTag('v3.58.0') - const 
baseResults = baseResult1.map((result1, index) => { - const result2 = baseResult2[index] - const result3 = baseResult3[index] +function mergeResults(baseResults: EfpsResult[] | undefined, incomingResults: EfpsResult[]) { + if (!baseResults) return incomingResults - let min = result1 - const arr = [result1, result2, result3] + return incomingResults.map((incomingResult, index) => { + const baseResult = baseResults[index] - for (const item of arr) { - if (item.p50 < min.p50) { - min = item - } - } + const incomingMedianLatency = incomingResult.latency.median + const baseMedianLatency = baseResult.latency.median - return min + // if the incoming test run performed better, we'll take that one + if (incomingMedianLatency < baseMedianLatency) return incomingResult + // otherwise, use the previous run + return baseResult }) +} - const combinedResults = localResults.map((localResult, index) => { - const baseResult = baseResults[index] - const delta = (localResult.p50 - baseResult.p50) / baseResult.p50 - return { - ...localResult, - delta, - passed: delta >= -deltaThreshold, - } - }) +const testResults: Array<{ + name: string + results: EfpsAbResult[] +}> = [] - testOutput.push({name: test.name, results: combinedResults}) +async function runAbTest(test: EfpsTest) { + let referenceResults: EfpsResult[] | undefined + let experimentResults: EfpsResult[] | undefined + + for (let attempt = 0; attempt < testAttemptCount; attempt++) { + referenceResults = mergeResults( + referenceResults, + await runTest({ + prefix: `running ${referenceSanityPkgPath}`, + test, + resultsDir, + client, + headless, + projectId, + sanityPkgPath: referenceSanityPkgPath, + log: () => {}, + }), + ) + + experimentResults = mergeResults( + experimentResults, + await runTest({ + prefix: `running ${experimentSanityPkgPath}`, + test, + resultsDir, + client, + headless, + projectId, + sanityPkgPath: experimentSanityPkgPath, + log: () => {}, + }), + ) + } + + return experimentResults!.map( + (experimentResult, index): EfpsAbResult => ({ + experiment: experimentResult, + reference: referenceResults![index], + }), + ) } -const p50Min = testOutput.flatMap((i) => i.results).sort((a, b) => a.p50 - b.p50)[0] +for (let i = 0; i < tests.length; i++) { + const test = tests[i] + testResults.push({ + name: test.name, + results: await runAbTest(test), + }) +} const table = new Table({ - head: [chalk.bold('Benchmark'), 'eFPS', `vs \`${tag}\``, 'Passed?'].map((cell) => + head: [chalk.bold('Benchmark'), 'reference', 'experiment', 'Δ (%)', ''].map((cell) => chalk.cyan(cell), ), }) -for (const {name, results} of testOutput) { - for (const {delta, p50, label, passed} of results) { +function isSignificantlyDifferent(experiment: number, reference: number) { + // values are too small to and are already performing well + if (experiment < 16 && reference < 16) return false + const delta = (experiment - reference) / reference + return delta >= warningThreshold +} + +for (const {name, results} of testResults) { + for (const {experiment, reference} of results) { + const significantlyDifferent = isSignificantlyDifferent( + experiment.latency.median, + reference.latency.median, + ) + + const sign = experiment.latency.median >= reference.latency.median ? '+' : '' + const msDifference = `${sign}${(experiment.latency.median - reference.latency.median).toFixed(0)}ms` + const percentageChange = formatPercentageChange( + experiment.latency.median, + reference.latency.median, + ) + table.push([ - label ? 
`${name} (${label})` : name, - formatFps(p50), - formatPercentage(delta), - passed ? '✅' : '🔴', + `${name} (${experiment.label})`, + `${formatEfps(reference.latency.median)} efps (${reference.latency.median.toFixed(0)}ms)`, + `${formatEfps(experiment.latency.median)} efps (${experiment.latency.median.toFixed(0)}ms)`, + `${significantlyDifferent ? chalk.red(msDifference) : msDifference} (${percentageChange})`, + significantlyDifferent ? '🔴' : '✅', ]) } } -const allPassed = testOutput.flatMap((i) => i.results).every((i) => i.passed) - console.log(table.toString()) -console.log(` -${chalk.bold('Lowest eFPS:')} ${formatFps(p50Min.p50)}`) -console.log(` - -│ ${chalk.bold('eFPS — Editor "Frames Per Second"')} -│ The number of renders, aka "frames", that are assumed to be possible -│ within a second. Derived from input latency. ${chalk.green('Higher')} is better. -│ -│ ${chalk.bold(`vs \`${tag}\``)} -│ The percentage difference of the current branch when compared to \`sanity@${tag}\`. -│ -│ ${chalk.bold('Passed?')} -│ Tests are failed when any of the median eFPS results perform more than ${deltaThreshold * 100}% worse. -`) - -const markdown = ` -
-<summary>⚡️ Editor Performance Report</summary>

-| ${formatFpsPlain(p50Min.p50)}<br/>eFPS | ${formatPercentagePlain(p50Min.delta)}<br/>vs ${tag} | ${allPassed ? '✅' : '🔴'}<br/>${allPassed ? 'Passed' : 'Failed'} |
-| --- | --- | --- |
+let comparisonTable = `
+| | Benchmark | reference<br/>input latency | experiment<br/>input latency | Δ (%)<br/>
latency difference | +| --- | :-- | :-- | :-- | :-- | +` -> **eFPS** — Editor "Frames Per Second" -> The number of renders aka "frames" that is assumed to be possible within a second. Derived from input latency. _Higher_ is better. +const detailedInformationHeader = ` +| Benchmark | latency | p75 | p90 | p99 | blocking time | test duration | +| --------- | ------: | --: | --: | --: | ------------: | ------------: | +` -↓ expand for details -
+let referenceTable = detailedInformationHeader +let experimentTable = detailedInformationHeader + +for (const {name, results} of testResults) { + for (const {experiment, reference} of results) { + const significantlyDifferent = isSignificantlyDifferent( + experiment.latency.median, + reference.latency.median, + ) + + const sign = experiment.latency.median >= reference.latency.median ? '+' : '' + const msDifference = `${sign}${(experiment.latency.median - reference.latency.median).toFixed(0)}ms` + const percentageChange = formatPercentageChange( + experiment.latency.median, + reference.latency.median, + ) + + const benchmarkName = `${name} (${experiment.label})` + + comparisonTable += + // status + `| ${significantlyDifferent ? '🔴' : '✅'} ` + + // benchmark name + `| ${benchmarkName} ` + + // reference latency + `| ${formatEfpsPlain(reference.latency.median)} efps (${reference.latency.median.toFixed(0)}ms) ` + + // experiment latency + `| ${formatEfpsPlain(experiment.latency.median)} efps (${experiment.latency.median.toFixed(0)}ms) ` + + // difference + `| ${msDifference} (${percentageChange}) |\n` + + referenceTable += + // benchmark name + `| ${benchmarkName} ` + + // latency + `| ${reference.latency.median.toFixed(0)}±${reference.latency.error.toFixed(0)}ms ` + + // p75 + `| ${reference.latency.p75.toFixed(0)}ms ` + + // p90 + `| ${reference.latency.p90.toFixed(0)}ms ` + + // p99 + `| ${reference.latency.p99.toFixed(0)}ms ` + + // blocking time + `| ${reference.blockingTime.toFixed(0)}ms ` + + // test duration + `| ${(reference.runDuration / 1000).toFixed(1)}s ` + + `|\n` + + experimentTable += + // benchmark name + `| ${benchmarkName} ` + + // latency + `| ${experiment.latency.median.toFixed(0)}±${experiment.latency.error.toFixed(0)}ms ` + + // p75 + `| ${experiment.latency.p75.toFixed(0)}ms ` + + // p90 + `| ${experiment.latency.p90.toFixed(0)}ms ` + + // p99 + `| ${experiment.latency.p99.toFixed(0)}ms ` + + // blocking time + `| ${experiment.blockingTime.toFixed(0)}ms ` + + // test duration + `| ${(experiment.runDuration / 1000).toFixed(1)}s ` + + `|\n` + } +} -| Benchmark | eFPS | vs \`${tag}\` | Passed? | -|-----------| ---: | ------------: | :-----: | -${testOutput - .flatMap((test) => - test.results.map((result) => ({ - ...result, - label: result.label ? `${test.name} (${result.label})` : test.name, - })), - ) - .map( - ({label, p50, delta, passed}) => - `| ${label} | ${formatFpsPlain(p50)} | ${formatPercentagePlain(delta)} | ${passed ? '✅' : '🔴'} |`, - ) - .join('\n')} +const markdown = `### ⚡️ Editor Performance Report + +Updated ${new Date().toUTCString()} -> **eFPS — Editor "Frames Per Second"** -> The number of renders, aka "frames", that are assumed to be possible within a second. -> Derived from input latency. _Higher_ is better. +${comparisonTable} + +> **efps** — editor "frames per second". The number of updates assumed to be possible within a second. > -> **vs \`${tag}\`** -> The percentage difference of the current branch when compared to \`sanity@${tag}\`. +> Derived from input latency. \`efps = 1000 / input_latency\` + +
+ +Detailed information + +### 🏠 Reference result + +The performance result of \`sanity@latest\` + + +${referenceTable} + +### 🧪 Experiment result + +The performance result of this branch + +${experimentTable} + +### 📚 Glossary + +> #### column definitions > -> **Passed?** -> Tests are failed when any of the median eFPS results perform more than ${deltaThreshold * 100}% worse. +> - **benchmark** — the name of the test, e.g. "article", followed by the label of the field being measured, e.g. "(title)". +> - **latency** — the time between when a key was pressed and when it was rendered. derived from a set of samples. the median (p50) is shown along with a margin of error. +> - **p75** — the 75th percentile of the input latency in the test run. 75% of the sampled inputs in this benchmark were processed faster than this value. this provides insight into the upper range of typical performance. +> - **p90** — the 90th percentile of the input latency in the test run. 90% of the sampled inputs were faster than this. this metric helps identify slower interactions that occurred less frequently during the benchmark. +> - **p99** — the 99th percentile of the input latency in the test run. only 1% of sampled inputs were slower than this. this represents the worst-case scenarios encountered during the benchmark, useful for identifying potential performance outliers. +> - **blocking time** — the total time during which the main thread was blocked, preventing user input and UI updates. this metric helps identify performance bottlenecks that may cause the interface to feel unresponsive. +> - **test duration** — how long the test run took to complete. + +
` // Write markdown file to root of results const markdownOutputPath = path.join(workspaceDir, 'results', 'benchmark-results.md') await fs.promises.writeFile(markdownOutputPath, markdown) - -// Exit with code 1 if regression detected -if (!allPassed) { - process.exit(1) -} diff --git a/perf/efps/tests/article/article.ts b/perf/efps/tests/article/article.ts index f99cdde3b3f..25d88082d88 100644 --- a/perf/efps/tests/article/article.ts +++ b/perf/efps/tests/article/article.ts @@ -140,30 +140,10 @@ export default defineEfpsTest({ return document }, - run: async ({page}) => { - return [ - { - label: 'title', - ...(await measureFpsForInput( - page.locator('[data-testid="field-title"] input[type="text"]'), - )), - }, - { - label: 'body', - ...(await measureFpsForPte(page.locator('[data-testid="field-body"]'))), - }, - { - label: 'string in object', - ...(await measureFpsForInput( - page.locator('[data-testid="field-seo.metaTitle"] input[type="text"]'), - )), - }, - { - label: 'string in array', - ...(await measureFpsForInput( - page.locator('[data-testid="field-tags"] input[type="text"]').first(), - )), - }, - ] - }, + run: async ({page}) => [ + await measureFpsForInput({page, fieldName: 'title'}), + await measureFpsForPte({page, fieldName: 'body'}), + await measureFpsForInput({page, fieldName: 'seo.metaTitle', label: 'string inside object'}), + await measureFpsForInput({page, fieldName: 'tags', label: 'string inside array'}), + ], }) diff --git a/perf/efps/tests/recipe/recipe.ts b/perf/efps/tests/recipe/recipe.ts index 3c9f8e4c230..9b6fc523076 100644 --- a/perf/efps/tests/recipe/recipe.ts +++ b/perf/efps/tests/recipe/recipe.ts @@ -160,22 +160,9 @@ export default defineEfpsTest({ return recipe }, - run: async ({page}) => { - return [ - { - label: 'name', - ...(await measureFpsForInput( - page.locator('[data-testid="field-name"] input[type="text"]'), - )), - }, - { - label: 'description', - ...(await measureFpsForInput(page.locator('[data-testid="field-description"] textarea'))), - }, - { - label: 'instructions', - ...(await measureFpsForPte(page.locator('[data-testid="field-instructions"]'))), - }, - ] - }, + run: async ({page}) => [ + await measureFpsForInput({page, fieldName: 'name'}), + await measureFpsForInput({page, fieldName: 'description'}), + await measureFpsForPte({page, fieldName: 'instructions'}), + ], }) diff --git a/perf/efps/tests/synthetic/synthetic.ts b/perf/efps/tests/synthetic/synthetic.ts index 2ed731428b8..e0cdc573180 100644 --- a/perf/efps/tests/synthetic/synthetic.ts +++ b/perf/efps/tests/synthetic/synthetic.ts @@ -120,20 +120,12 @@ export default defineEfpsTest({ return synthetic }, - run: async ({page}) => { - return [ - { - label: 'title', - ...(await measureFpsForInput( - page.locator('[data-testid="field-title"] input[type="text"]'), - )), - }, - { - label: 'string in object', - ...(await measureFpsForInput( - page.locator('[data-testid="field-syntheticObject.name"] input[type="text"]'), - )), - }, - ] - }, + run: async ({page}) => [ + await measureFpsForInput({page, fieldName: 'title'}), + await measureFpsForInput({ + page, + fieldName: 'syntheticObject.name', + label: 'string inside object', + }), + ], }) diff --git a/perf/efps/types.ts b/perf/efps/types.ts index 910932926e3..c4d71c8b61c 100644 --- a/perf/efps/types.ts +++ b/perf/efps/types.ts @@ -12,18 +12,26 @@ export interface EfpsTest { name: string configPath: string | undefined document: SanityDocumentStub | ((context: EfpsTestRunnerContext) => Promise) - run: (context: EfpsTestRunnerContext & {document: 
SanityDocument}) => Promise + run: (context: EfpsTestRunnerContext & {document: SanityDocument}) => Promise } export interface EfpsResult { - label?: string - p50: number - p75: number - p90: number - latencies: number[] + label: string + runDuration: number + blockingTime: number + latency: { + median: number + error: number + p75: number + p90: number + p99: number + } } -export type EfpsTestResult = EfpsResult | EfpsResult[] +export interface EfpsAbResult { + experiment: EfpsResult + reference: EfpsResult +} export function defineEfpsTest(config: EfpsTest): EfpsTest { return config From 7a75792b069f632f81975c69ae02b5c54c8565d2 Mon Sep 17 00:00:00 2001 From: Rico Kahler Date: Sat, 28 Sep 2024 22:41:30 -0500 Subject: [PATCH 19/21] test --- perf/efps/helpers/aggregateLatencies.ts | 4 +- perf/efps/index.ts | 131 ++++++++++++++---------- perf/efps/runTest.ts | 8 +- perf/efps/types.ts | 2 +- 4 files changed, 82 insertions(+), 63 deletions(-) diff --git a/perf/efps/helpers/aggregateLatencies.ts b/perf/efps/helpers/aggregateLatencies.ts index 9611060cae6..c058d36785b 100644 --- a/perf/efps/helpers/aggregateLatencies.ts +++ b/perf/efps/helpers/aggregateLatencies.ts @@ -22,7 +22,7 @@ function calculatePercentile(numbers: number[], percentile: number): number { return lowerValue + (upperValue - lowerValue) * fraction } -function calculateError(numbers: number[]) { +function calculateSpread(numbers: number[]) { const mean = numbers.reduce((sum, num) => sum + num, 0) / numbers.length // calculate the sum of squared differences from the mean @@ -40,7 +40,7 @@ function calculateError(numbers: number[]) { export function aggregateLatencies(values: number[]): EfpsResult['latency'] { return { median: calculatePercentile(values, 0.5), - error: calculateError(values), + spread: calculateSpread(values), p75: calculatePercentile(values, 0.75), p90: calculatePercentile(values, 0.9), p99: calculatePercentile(values, 0.99), diff --git a/perf/efps/index.ts b/perf/efps/index.ts index 65b5f46142f..57e68b03acc 100644 --- a/perf/efps/index.ts +++ b/perf/efps/index.ts @@ -1,4 +1,3 @@ -/* eslint-disable max-depth */ /* eslint-disable no-console */ // eslint-disable-next-line import/no-unassigned-import import 'dotenv/config' @@ -21,11 +20,12 @@ import recipe from './tests/recipe/recipe' import synthetic from './tests/synthetic/synthetic' import {type EfpsAbResult, type EfpsResult, type EfpsTest} from './types' -const warningThreshold = 0.2 -const testAttemptCount = process.env.CI ? 3 : 1 +const WARNING_THRESHOLD = 0.2 +const TEST_ATTEMPTS = process.env.CI ? 3 : 1 -const headless = true -const tests = [article, recipe, synthetic] +const HEADLESS = true +const REFERENCE_TAG = 'latest' +const TESTS = [article, recipe, synthetic] const projectId = process.env.VITE_PERF_EFPS_PROJECT_ID! const dataset = process.env.VITE_PERF_EFPS_DATASET! 
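// Illustrative sketch of the retry strategy behind `TEST_ATTEMPTS` above,
// mirroring this patch's `mergeResults`: each benchmark is run several times
// in CI and, per measured field, the run with the lower median latency wins,
// so a single noisy run doesn't sink the comparison. `Result` here is a
// stand-in for this patch's `EfpsResult`.
type Result = {latency: {median: number}}
const keepBest = (prev: Result[] | undefined, next: Result[]): Result[] =>
  prev ? next.map((r, i) => (r.latency.median < prev[i].latency.median ? r : prev[i])) : next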
@@ -77,14 +77,6 @@ const getSanityPkgPathForTag = async (tag: string) => { return path.join(tmpDir, 'node_modules', 'sanity') } -const formatFps = (fps: number) => { - const rounded = fps.toFixed(1) - if (fps >= 100) return chalk.green('99.9+') - if (fps >= 60) return chalk.green(rounded) - if (fps >= 20) return chalk.yellow(rounded) - return chalk.red(rounded) -} - const formatEfps = (latencyMs: number) => { const efps = 1000 / latencyMs const rounded = efps.toFixed(1) @@ -105,14 +97,6 @@ const formatPercentageChange = (experiment: number, reference: number): string = return `${sign}${rounded}%` } -const formatPercentage = (delta: number): string => { - const percentage = delta * 100 - const rounded = percentage.toFixed(1) - const sign = delta >= 0 ? '+' : '' - if (delta >= -warningThreshold) return `${sign}${rounded}%` - return chalk.red(`${sign}${rounded}%`) -} - // For markdown formatting without colors const formatEfpsPlain = (latencyMs: number) => { const efps = 1000 / latencyMs @@ -122,18 +106,9 @@ const formatEfpsPlain = (latencyMs: number) => { return rounded } -const formatPercentagePlain = (delta: number): string => { - const percentage = delta * 100 - const rounded = percentage.toFixed(1) - const sign = delta >= 0 ? '+' : '' - return `${sign}${rounded}%` -} - -// START - const spinner = Ora() -spinner.info(`Running ${tests.length} tests: ${tests.map((t) => `'${t.name}'`).join(', ')}`) +spinner.info(`Running ${TESTS.length} tests: ${TESTS.map((t) => `'${t.name}'`).join(', ')}`) await exec({ text: ['Building the monorepo…', 'Built monorepo'], @@ -150,8 +125,8 @@ await exec({ const localSanityPkgPath = path.dirname(fileURLToPath(import.meta.resolve('sanity/package.json'))) -const referenceSanityPkgPath = await getSanityPkgPathForTag('v3.57.4') -const experimentSanityPkgPath = await getSanityPkgPathForTag('v3.58.0') +const referenceSanityPkgPath = await getSanityPkgPathForTag(REFERENCE_TAG) +const experimentSanityPkgPath = localSanityPkgPath function mergeResults(baseResults: EfpsResult[] | undefined, incomingResults: EfpsResult[]) { if (!baseResults) return incomingResults @@ -178,15 +153,16 @@ async function runAbTest(test: EfpsTest) { let referenceResults: EfpsResult[] | undefined let experimentResults: EfpsResult[] | undefined - for (let attempt = 0; attempt < testAttemptCount; attempt++) { + for (let attempt = 0; attempt < TEST_ATTEMPTS; attempt++) { referenceResults = mergeResults( referenceResults, await runTest({ prefix: `running ${referenceSanityPkgPath}`, + key: 'reference', test, resultsDir, client, - headless, + headless: HEADLESS, projectId, sanityPkgPath: referenceSanityPkgPath, log: () => {}, @@ -197,10 +173,11 @@ async function runAbTest(test: EfpsTest) { experimentResults, await runTest({ prefix: `running ${experimentSanityPkgPath}`, + key: 'experiment', test, resultsDir, client, - headless, + headless: HEADLESS, projectId, sanityPkgPath: experimentSanityPkgPath, log: () => {}, @@ -216,25 +193,36 @@ async function runAbTest(test: EfpsTest) { ) } -for (let i = 0; i < tests.length; i++) { - const test = tests[i] +for (let i = 0; i < TESTS.length; i++) { + const test = TESTS[i] testResults.push({ name: test.name, results: await runAbTest(test), }) } -const table = new Table({ - head: [chalk.bold('Benchmark'), 'reference', 'experiment', 'Δ (%)', ''].map((cell) => - chalk.cyan(cell), - ), +const comparisonTableCli = new Table({ + head: ['Benchmark', 'reference', 'experiment', 'Δ (%)', ''].map((cell) => chalk.cyan(cell)), }) +const detailedInformationCliHead = [ + 
'Benchmark', + 'latency', + 'p75', + 'p90', + 'p99', + 'blocking time', + 'test duration', +].map((i) => chalk.cyan(i)) + +const referenceTableCli = new Table({head: detailedInformationCliHead}) +const experimentTableCli = new Table({head: detailedInformationCliHead}) + function isSignificantlyDifferent(experiment: number, reference: number) { // values are too small to and are already performing well if (experiment < 16 && reference < 16) return false const delta = (experiment - reference) / reference - return delta >= warningThreshold + return delta >= WARNING_THRESHOLD } for (const {name, results} of testResults) { @@ -251,21 +239,51 @@ for (const {name, results} of testResults) { reference.latency.median, ) - table.push([ - `${name} (${experiment.label})`, + const benchmarkName = `${name} (${experiment.label})` + + comparisonTableCli.push([ + benchmarkName, `${formatEfps(reference.latency.median)} efps (${reference.latency.median.toFixed(0)}ms)`, `${formatEfps(experiment.latency.median)} efps (${experiment.latency.median.toFixed(0)}ms)`, `${significantlyDifferent ? chalk.red(msDifference) : msDifference} (${percentageChange})`, significantlyDifferent ? '🔴' : '✅', ]) + + referenceTableCli.push([ + benchmarkName, + `${reference.latency.median.toFixed(0)}±${reference.latency.spread.toFixed(0)}ms`, + `${reference.latency.p75.toFixed(0)}ms`, + `${reference.latency.p90.toFixed(0)}ms`, + `${reference.latency.p99.toFixed(0)}ms`, + `${reference.blockingTime.toFixed(0)}ms`, + `${(reference.runDuration / 1000).toFixed(1)}s`, + ]) + + experimentTableCli.push([ + benchmarkName, + `${experiment.latency.median.toFixed(0)}±${experiment.latency.spread.toFixed(0)}ms`, + `${experiment.latency.p75.toFixed(0)}ms`, + `${experiment.latency.p90.toFixed(0)}ms`, + `${experiment.latency.p99.toFixed(0)}ms`, + `${experiment.blockingTime.toFixed(0)}ms`, + `${(experiment.runDuration / 1000).toFixed(1)}s`, + ]) } } -console.log(table.toString()) +console.log() +console.log('Reference vs experiment') +console.log(comparisonTableCli.toString()) +console.log() +console.log('Reference result') +console.log(referenceTableCli.toString()) +console.log() +console.log('Experiment result') +console.log(experimentTableCli.toString()) let comparisonTable = ` -| | Benchmark | reference
input latency | experiment<br/>input latency | Δ (%)<br/>latency difference |
-| --- | :-- | :-- | :-- | :-- |
+| Benchmark | reference<br/>input latency | experiment<br/>input latency | Δ (%)<br/>
latency difference | | +| :-- | :-- | :-- | :-- | --- | ` const detailedInformationHeader = ` @@ -293,8 +311,6 @@ for (const {name, results} of testResults) { const benchmarkName = `${name} (${experiment.label})` comparisonTable += - // status - `| ${significantlyDifferent ? '🔴' : '✅'} ` + // benchmark name `| ${benchmarkName} ` + // reference latency @@ -302,13 +318,16 @@ for (const {name, results} of testResults) { // experiment latency `| ${formatEfpsPlain(experiment.latency.median)} efps (${experiment.latency.median.toFixed(0)}ms) ` + // difference - `| ${msDifference} (${percentageChange}) |\n` + `| ${msDifference} (${percentageChange}) ` + + // status + `| ${significantlyDifferent ? '🔴' : '✅'} ` + + `|\n` referenceTable += // benchmark name `| ${benchmarkName} ` + // latency - `| ${reference.latency.median.toFixed(0)}±${reference.latency.error.toFixed(0)}ms ` + + `| ${reference.latency.median.toFixed(0)}±${reference.latency.spread.toFixed(0)}ms ` + // p75 `| ${reference.latency.p75.toFixed(0)}ms ` + // p90 @@ -325,7 +344,7 @@ for (const {name, results} of testResults) { // benchmark name `| ${benchmarkName} ` + // latency - `| ${experiment.latency.median.toFixed(0)}±${experiment.latency.error.toFixed(0)}ms ` + + `| ${experiment.latency.median.toFixed(0)}±${experiment.latency.spread.toFixed(0)}ms ` + // p75 `| ${experiment.latency.p75.toFixed(0)}ms ` + // p90 @@ -346,7 +365,7 @@ Updated ${new Date().toUTCString()} ${comparisonTable} -> **efps** — editor "frames per second". The number of updates assumed to be possible within a second. +> **efps** — editor "frames per second". The number of updates assumed to be possible within a second. > > Derived from input latency. \`efps = 1000 / input_latency\` @@ -356,7 +375,7 @@ ${comparisonTable} ### 🏠 Reference result -The performance result of \`sanity@latest\` +The performance result of \`sanity@${REFERENCE_TAG}\` ${referenceTable} @@ -372,7 +391,7 @@ ${experimentTable} > #### column definitions > > - **benchmark** — the name of the test, e.g. "article", followed by the label of the field being measured, e.g. "(title)". -> - **latency** — the time between when a key was pressed and when it was rendered. derived from a set of samples. the median (p50) is shown along with a margin of error. +> - **latency** — the time between when a key was pressed and when it was rendered. derived from a set of samples. the median (p50) is shown along with its spread. > - **p75** — the 75th percentile of the input latency in the test run. 75% of the sampled inputs in this benchmark were processed faster than this value. this provides insight into the upper range of typical performance. > - **p90** — the 90th percentile of the input latency in the test run. 90% of the sampled inputs were faster than this. this metric helps identify slower interactions that occurred less frequently during the benchmark. > - **p99** — the 99th percentile of the input latency in the test run. only 1% of sampled inputs were slower than this. this represents the worst-case scenarios encountered during the benchmark, useful for identifying potential performance outliers. 
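// Worked example (illustrative) of the percentile columns defined in the
// glossary above, using the same linear interpolation as `calculatePercentile`
// in aggregateLatencies.ts:
const sorted = [10, 20, 30, 40] // latency samples in ms, ascending
// p75 → index = 0.75 * (sorted.length - 1) = 2.25, which falls between
// sorted[2] and sorted[3]: 30 + 0.25 * (40 - 30) = 32.5ms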
diff --git a/perf/efps/runTest.ts b/perf/efps/runTest.ts index 722c53516ed..89b272c5132 100644 --- a/perf/efps/runTest.ts +++ b/perf/efps/runTest.ts @@ -10,7 +10,6 @@ import sourcemaps from 'rollup-plugin-sourcemaps' import handler from 'serve-handler' import * as vite from 'vite' -// TODO: add test duration to metrics import {type EfpsResult, type EfpsTest, type EfpsTestRunnerContext} from './types' const workspaceDir = path.dirname(fileURLToPath(import.meta.url)) @@ -24,6 +23,7 @@ interface RunTestOptions { headless: boolean client: SanityClient sanityPkgPath: string + key: string log: (text: string) => void } @@ -36,6 +36,7 @@ export async function runTest({ headless, client, sanityPkgPath, + key, log, }: RunTestOptions): Promise { console.log(prefix) @@ -45,9 +46,8 @@ export async function runTest({ // spinner.start(prefix) - const versionLabel = sanityPkgPath ? 'latest' : 'local' - const outDir = path.join(workspaceDir, 'builds', test.name, versionLabel) - const testResultsDir = path.join(resultsDir, test.name, versionLabel) + const outDir = path.join(workspaceDir, 'builds', test.name, key) + const testResultsDir = path.join(resultsDir, test.name, key) await fs.promises.mkdir(outDir, {recursive: true}) log('Building…') diff --git a/perf/efps/types.ts b/perf/efps/types.ts index c4d71c8b61c..ad9df28fb3d 100644 --- a/perf/efps/types.ts +++ b/perf/efps/types.ts @@ -21,7 +21,7 @@ export interface EfpsResult { blockingTime: number latency: { median: number - error: number + spread: number p75: number p90: number p99: number From 93886e56d6ffb699bee98da010956dcaff26d942 Mon Sep 17 00:00:00 2001 From: Rico Kahler Date: Sun, 29 Sep 2024 09:57:16 -0500 Subject: [PATCH 20/21] stuff --- perf/efps/helpers/aggregateLatencies.ts | 18 +------ perf/efps/index.ts | 71 ++++++++++++++----------- perf/efps/runTest.ts | 48 ++++++++--------- perf/efps/types.ts | 3 +- 4 files changed, 62 insertions(+), 78 deletions(-) diff --git a/perf/efps/helpers/aggregateLatencies.ts b/perf/efps/helpers/aggregateLatencies.ts index c058d36785b..e3f82202a77 100644 --- a/perf/efps/helpers/aggregateLatencies.ts +++ b/perf/efps/helpers/aggregateLatencies.ts @@ -22,25 +22,9 @@ function calculatePercentile(numbers: number[], percentile: number): number { return lowerValue + (upperValue - lowerValue) * fraction } -function calculateSpread(numbers: number[]) { - const mean = numbers.reduce((sum, num) => sum + num, 0) / numbers.length - - // calculate the sum of squared differences from the mean - const squaredDiffs = numbers.map((num) => Math.pow(num - mean, 2)) - const sumSquaredDiffs = squaredDiffs.reduce((sum, diff) => sum + diff, 0) - - const variance = sumSquaredDiffs / (numbers.length - 1) - const standardDeviation = Math.sqrt(variance) - - // We assume normal distribution and multiply the standard deviations by 1.96 - // which aims to represent 95% of the population - return 1.96 * standardDeviation -} - export function aggregateLatencies(values: number[]): EfpsResult['latency'] { return { - median: calculatePercentile(values, 0.5), - spread: calculateSpread(values), + p50: calculatePercentile(values, 0.5), p75: calculatePercentile(values, 0.75), p90: calculatePercentile(values, 0.9), p99: calculatePercentile(values, 0.99), diff --git a/perf/efps/index.ts b/perf/efps/index.ts index 57e68b03acc..85efd2ccf99 100644 --- a/perf/efps/index.ts +++ b/perf/efps/index.ts @@ -24,6 +24,7 @@ const WARNING_THRESHOLD = 0.2 const TEST_ATTEMPTS = process.env.CI ? 
3 : 1 const HEADLESS = true +const ENABLE_PROFILER = false const REFERENCE_TAG = 'latest' const TESTS = [article, recipe, synthetic] @@ -134,8 +135,8 @@ function mergeResults(baseResults: EfpsResult[] | undefined, incomingResults: Ef return incomingResults.map((incomingResult, index) => { const baseResult = baseResults[index] - const incomingMedianLatency = incomingResult.latency.median - const baseMedianLatency = baseResult.latency.median + const incomingMedianLatency = incomingResult.latency.p50 + const baseMedianLatency = baseResult.latency.p50 // if the incoming test run performed better, we'll take that one if (incomingMedianLatency < baseMedianLatency) return incomingResult @@ -154,35 +155,47 @@ async function runAbTest(test: EfpsTest) { let experimentResults: EfpsResult[] | undefined for (let attempt = 0; attempt < TEST_ATTEMPTS; attempt++) { + const attemptMessage = TEST_ATTEMPTS > 1 ? ` [${attempt + 1}/${TEST_ATTEMPTS}]` : '' + const referenceMessage = `Running test '${test.name}' on \`sanity@${REFERENCE_TAG}\`${attemptMessage}` + spinner.start(referenceMessage) + referenceResults = mergeResults( referenceResults, await runTest({ - prefix: `running ${referenceSanityPkgPath}`, key: 'reference', test, resultsDir, client, headless: HEADLESS, + enableProfiler: ENABLE_PROFILER, projectId, sanityPkgPath: referenceSanityPkgPath, - log: () => {}, + log: (message) => { + spinner.text = `${referenceMessage}: ${message}` + }, }), ) + spinner.succeed(`Ran test '${test.name}' on \`sanity@${REFERENCE_TAG}\`${attemptMessage}`) + const experimentMessage = `Running test '${test.name}' on this branch${attemptMessage}` + spinner.start(experimentMessage) experimentResults = mergeResults( experimentResults, await runTest({ - prefix: `running ${experimentSanityPkgPath}`, key: 'experiment', test, resultsDir, client, headless: HEADLESS, + enableProfiler: ENABLE_PROFILER, projectId, sanityPkgPath: experimentSanityPkgPath, - log: () => {}, + log: (message) => { + spinner.text = `${experimentMessage}: ${message}` + }, }), ) + spinner.succeed(`Ran test '${test.name}' on this branch${attemptMessage}`) } return experimentResults!.map( @@ -228,30 +241,27 @@ function isSignificantlyDifferent(experiment: number, reference: number) { for (const {name, results} of testResults) { for (const {experiment, reference} of results) { const significantlyDifferent = isSignificantlyDifferent( - experiment.latency.median, - reference.latency.median, + experiment.latency.p50, + reference.latency.p50, ) - const sign = experiment.latency.median >= reference.latency.median ? '+' : '' - const msDifference = `${sign}${(experiment.latency.median - reference.latency.median).toFixed(0)}ms` - const percentageChange = formatPercentageChange( - experiment.latency.median, - reference.latency.median, - ) + const sign = experiment.latency.p50 >= reference.latency.p50 ? 
'+' : '' + const msDifference = `${sign}${(experiment.latency.p50 - reference.latency.p50).toFixed(0)}ms` + const percentageChange = formatPercentageChange(experiment.latency.p50, reference.latency.p50) const benchmarkName = `${name} (${experiment.label})` comparisonTableCli.push([ benchmarkName, - `${formatEfps(reference.latency.median)} efps (${reference.latency.median.toFixed(0)}ms)`, - `${formatEfps(experiment.latency.median)} efps (${experiment.latency.median.toFixed(0)}ms)`, + `${formatEfps(reference.latency.p50)} efps (${reference.latency.p50.toFixed(0)}ms)`, + `${formatEfps(experiment.latency.p50)} efps (${experiment.latency.p50.toFixed(0)}ms)`, `${significantlyDifferent ? chalk.red(msDifference) : msDifference} (${percentageChange})`, significantlyDifferent ? '🔴' : '✅', ]) referenceTableCli.push([ benchmarkName, - `${reference.latency.median.toFixed(0)}±${reference.latency.spread.toFixed(0)}ms`, + `${reference.latency.p50.toFixed(0)}ms`, `${reference.latency.p75.toFixed(0)}ms`, `${reference.latency.p90.toFixed(0)}ms`, `${reference.latency.p99.toFixed(0)}ms`, @@ -261,7 +271,7 @@ for (const {name, results} of testResults) { experimentTableCli.push([ benchmarkName, - `${experiment.latency.median.toFixed(0)}±${experiment.latency.spread.toFixed(0)}ms`, + `${experiment.latency.p50.toFixed(0)}ms`, `${experiment.latency.p75.toFixed(0)}ms`, `${experiment.latency.p90.toFixed(0)}ms`, `${experiment.latency.p99.toFixed(0)}ms`, @@ -282,7 +292,7 @@ console.log('Experiment result') console.log(experimentTableCli.toString()) let comparisonTable = ` -| Benchmark | reference
input latency | experiment<br/>input latency | Δ (%)<br/>latency difference | |
+| Benchmark | reference<br/>latency of \`sanity@${REFERENCE_TAG}\` | experiment<br/>latency of this branch | Δ (%)<br/>
latency difference | | | :-- | :-- | :-- | :-- | --- | ` @@ -297,16 +307,13 @@ let experimentTable = detailedInformationHeader for (const {name, results} of testResults) { for (const {experiment, reference} of results) { const significantlyDifferent = isSignificantlyDifferent( - experiment.latency.median, - reference.latency.median, + experiment.latency.p50, + reference.latency.p50, ) - const sign = experiment.latency.median >= reference.latency.median ? '+' : '' - const msDifference = `${sign}${(experiment.latency.median - reference.latency.median).toFixed(0)}ms` - const percentageChange = formatPercentageChange( - experiment.latency.median, - reference.latency.median, - ) + const sign = experiment.latency.p50 >= reference.latency.p50 ? '+' : '' + const msDifference = `${sign}${(experiment.latency.p50 - reference.latency.p50).toFixed(0)}ms` + const percentageChange = formatPercentageChange(experiment.latency.p50, reference.latency.p50) const benchmarkName = `${name} (${experiment.label})` @@ -314,9 +321,9 @@ for (const {name, results} of testResults) { // benchmark name `| ${benchmarkName} ` + // reference latency - `| ${formatEfpsPlain(reference.latency.median)} efps (${reference.latency.median.toFixed(0)}ms) ` + + `| ${formatEfpsPlain(reference.latency.p50)} efps (${reference.latency.p50.toFixed(0)}ms) ` + // experiment latency - `| ${formatEfpsPlain(experiment.latency.median)} efps (${experiment.latency.median.toFixed(0)}ms) ` + + `| ${formatEfpsPlain(experiment.latency.p50)} efps (${experiment.latency.p50.toFixed(0)}ms) ` + // difference `| ${msDifference} (${percentageChange}) ` + // status @@ -327,7 +334,7 @@ for (const {name, results} of testResults) { // benchmark name `| ${benchmarkName} ` + // latency - `| ${reference.latency.median.toFixed(0)}±${reference.latency.spread.toFixed(0)}ms ` + + `| ${reference.latency.p50.toFixed(0)}ms ` + // p75 `| ${reference.latency.p75.toFixed(0)}ms ` + // p90 @@ -344,7 +351,7 @@ for (const {name, results} of testResults) { // benchmark name `| ${benchmarkName} ` + // latency - `| ${experiment.latency.median.toFixed(0)}±${experiment.latency.spread.toFixed(0)}ms ` + + `| ${experiment.latency.p50.toFixed(0)}ms ` + // p75 `| ${experiment.latency.p75.toFixed(0)}ms ` + // p90 @@ -391,7 +398,7 @@ ${experimentTable} > #### column definitions > > - **benchmark** — the name of the test, e.g. "article", followed by the label of the field being measured, e.g. "(title)". -> - **latency** — the time between when a key was pressed and when it was rendered. derived from a set of samples. the median (p50) is shown along with its spread. +> - **latency** — the time between when a key was pressed and when it was rendered. derived from a set of samples. the median (p50) is shown to show the most common latency. > - **p75** — the 75th percentile of the input latency in the test run. 75% of the sampled inputs in this benchmark were processed faster than this value. this provides insight into the upper range of typical performance. > - **p90** — the 90th percentile of the input latency in the test run. 90% of the sampled inputs were faster than this. this metric helps identify slower interactions that occurred less frequently during the benchmark. > - **p99** — the 99th percentile of the input latency in the test run. only 1% of sampled inputs were slower than this. this represents the worst-case scenarios encountered during the benchmark, useful for identifying potential performance outliers. 
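// Illustrative restatement of the efps/latency conversion used in the tables
// above: efps is derived from the median input latency and clamped for
// display, so a latency of 10ms or less renders as "99.9+".
const toEfps = (latencyMs: number): number => 1000 / latencyMs
// e.g. a 25ms median input latency reads as 40.0 efps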
diff --git a/perf/efps/runTest.ts b/perf/efps/runTest.ts index 89b272c5132..1f93a34f9fc 100644 --- a/perf/efps/runTest.ts +++ b/perf/efps/runTest.ts @@ -10,42 +10,34 @@ import sourcemaps from 'rollup-plugin-sourcemaps' import handler from 'serve-handler' import * as vite from 'vite' +import {remapCpuProfile} from './helpers/remapCpuProfile' import {type EfpsResult, type EfpsTest, type EfpsTestRunnerContext} from './types' const workspaceDir = path.dirname(fileURLToPath(import.meta.url)) interface RunTestOptions { - prefix: string test: EfpsTest resultsDir: string - // spinner: Ora projectId: string headless: boolean client: SanityClient sanityPkgPath: string key: string + enableProfiler: boolean log: (text: string) => void } export async function runTest({ - prefix, test, resultsDir, - // spinner, projectId, headless, client, sanityPkgPath, key, + enableProfiler, log, }: RunTestOptions): Promise { - console.log(prefix) - // const log = (text: string) => { - // // spinner.text = `${prefix}\n └ ${text}` - // } - - // spinner.start(prefix) - const outDir = path.join(workspaceDir, 'builds', test.name, key) const testResultsDir = path.join(resultsDir, test.name, key) @@ -111,7 +103,7 @@ export async function runTest({ typeof test.document === 'function' ? await test.document(runnerContext) : test.document document = await client.create(documentToCreate) - // const cdp = await context.newCDPSession(page) + const cdp = enableProfiler ? await context.newCDPSession(page) : null log('Loading editor…') await page.goto( @@ -120,8 +112,10 @@ export async function runTest({ )};type=${encodeURIComponent(documentToCreate._type)}`, ) - // await cdp.send('Profiler.enable') - // await cdp.send('Profiler.start') + if (cdp) { + await cdp.send('Profiler.enable') + await cdp.send('Profiler.start') + } log('Benchmarking…') const result = await test.run({...runnerContext, document}) @@ -129,24 +123,24 @@ export async function runTest({ log('Saving results…') const results = Array.isArray(result) ? 
result : [result] - // const {profile} = await cdp.send('Profiler.stop') - // const remappedProfile = await remapCpuProfile(profile, outDir) - await fs.promises.mkdir(testResultsDir, {recursive: true}) await fs.promises.writeFile( path.join(testResultsDir, 'results.json'), JSON.stringify(results, null, 2), ) - // await fs.promises.writeFile( - // path.join(testResultsDir, 'raw.cpuprofile'), - // JSON.stringify(profile), - // ) - // await fs.promises.writeFile( - // path.join(testResultsDir, 'mapped.cpuprofile'), - // JSON.stringify(remappedProfile), - // ) - - // spinner.succeed(`Ran benchmark '${test.name}' (${versionLabel})`) + + if (cdp) { + const {profile} = await cdp.send('Profiler.stop') + await fs.promises.writeFile( + path.join(testResultsDir, 'raw.cpuprofile'), + JSON.stringify(profile), + ) + const remappedProfile = await remapCpuProfile(profile, outDir) + await fs.promises.writeFile( + path.join(testResultsDir, 'mapped.cpuprofile'), + JSON.stringify(remappedProfile), + ) + } return results } finally { diff --git a/perf/efps/types.ts b/perf/efps/types.ts index ad9df28fb3d..835c64c8bde 100644 --- a/perf/efps/types.ts +++ b/perf/efps/types.ts @@ -20,8 +20,7 @@ export interface EfpsResult { runDuration: number blockingTime: number latency: { - median: number - spread: number + p50: number p75: number p90: number p99: number From 00d438c74b26aae0b115c6f9587b3c4b59676eb9 Mon Sep 17 00:00:00 2001 From: Rico Kahler Date: Sun, 29 Sep 2024 10:04:08 -0500 Subject: [PATCH 21/21] add manual workflow dispatch --- .github/workflows/efps.yml | 14 ++++++++++++++ perf/efps/index.ts | 6 ++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/.github/workflows/efps.yml b/.github/workflows/efps.yml index dbe3c389a63..d46e9c6d557 100644 --- a/.github/workflows/efps.yml +++ b/.github/workflows/efps.yml @@ -1,6 +1,18 @@ name: eFPS Test on: pull_request: + workflow_dispatch: + inputs: + reference_tag: + description: "Reference tag for comparison" + required: true + default: "latest" + enable_profiler: + description: "Enable profiler" + required: true + type: boolean + default: false + jobs: install: timeout-minutes: 30 @@ -64,6 +76,8 @@ jobs: VITE_PERF_EFPS_PROJECT_ID: ${{ secrets.PERF_EFPS_PROJECT_ID }} VITE_PERF_EFPS_DATASET: ${{ secrets.PERF_EFPS_DATASET }} PERF_EFPS_SANITY_TOKEN: ${{ secrets.PERF_EFPS_SANITY_TOKEN }} + REFERENCE_TAG: ${{ github.event.inputs.reference_tag || 'latest' }} + ENABLE_PROFILER: ${{ github.event.inputs.enable_profiler || false }} run: pnpm efps:test - name: PR comment with report diff --git a/perf/efps/index.ts b/perf/efps/index.ts index 85efd2ccf99..6e7e864e962 100644 --- a/perf/efps/index.ts +++ b/perf/efps/index.ts @@ -24,8 +24,10 @@ const WARNING_THRESHOLD = 0.2 const TEST_ATTEMPTS = process.env.CI ? 3 : 1 const HEADLESS = true -const ENABLE_PROFILER = false -const REFERENCE_TAG = 'latest' +// eslint-disable-next-line turbo/no-undeclared-env-vars +const ENABLE_PROFILER = process.env.ENABLE_PROFILER === 'true' +// eslint-disable-next-line turbo/no-undeclared-env-vars +const REFERENCE_TAG = process.env.REFERENCE_TAG || 'latest' const TESTS = [article, recipe, synthetic] const projectId = process.env.VITE_PERF_EFPS_PROJECT_ID!
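// Closing note (illustrative): GitHub Actions passes `workflow_dispatch`
// inputs through the environment as strings, which is why the script compares
// `ENABLE_PROFILER` against the literal 'true' and falls back to 'latest'
// when `REFERENCE_TAG` is unset — dispatching with reference_tag set to a
// pinned version such as v3.57.4 benchmarks this branch against that tag.
const referenceTag = process.env.REFERENCE_TAG || 'latest' // string input from the workflow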