From 704679c5ebd8fd00f063349aac96d6f874848e32 Mon Sep 17 00:00:00 2001
From: Scott Morrison <scott.morrison@gmail.com>
Date: Mon, 21 Aug 2023 22:21:23 +1000
Subject: [PATCH] feat: initial commit of lean-training-data tools

---
 .gitignore                                    |   5 +
 .vscode/settings.json                         |  10 +
 README.md                                     | 257 ++++++++++++++++++
 TrainingData.lean                             |   5 +
 TrainingData/Frontend.lean                    | 238 ++++++++++++++++
 TrainingData/InfoTree/Basic.lean              | 169 ++++++++++++
 .../InfoTree/TacticInvocation/Basic.lean      |  74 +++++
 TrainingData/InfoTree/ToJson.lean             | 114 ++++++++
 TrainingData/Mathlib.lean                     |   1 +
 lake-manifest.json                            |  51 ++++
 lakefile.lean                                 |  30 ++
 lean-toolchain                                |   1 +
 make.sh                                       |  40 +++
 scripts/declaration_types.lean                |  27 ++
 scripts/export_infotree.lean                  |  47 ++++
 scripts/export_infotree.sh                    |  38 +++
 scripts/premises.lean                         | 130 +++++++++
 scripts/tactic_benchmark.lean                 | 150 ++++++++++
 scripts/tactic_benchmark.sh                   |  23 ++
 scripts/tactic_benchmark_summary.sh           |  28 ++
 scripts/training_data.lean                    | 115 ++++++++
 scripts/training_data.sh                      |  37 +++
 22 files changed, 1590 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .vscode/settings.json
 create mode 100644 README.md
 create mode 100644 TrainingData.lean
 create mode 100644 TrainingData/Frontend.lean
 create mode 100644 TrainingData/InfoTree/Basic.lean
 create mode 100644 TrainingData/InfoTree/TacticInvocation/Basic.lean
 create mode 100644 TrainingData/InfoTree/ToJson.lean
 create mode 100644 TrainingData/Mathlib.lean
 create mode 100644 lake-manifest.json
 create mode 100644 lakefile.lean
 create mode 100644 lean-toolchain
 create mode 100644 make.sh
 create mode 100644 scripts/declaration_types.lean
 create mode 100644 scripts/export_infotree.lean
 create mode 100755 scripts/export_infotree.sh
 create mode 100644 scripts/premises.lean
 create mode 100644 scripts/tactic_benchmark.lean
 create mode 100755 scripts/tactic_benchmark.sh
 create mode 100755 scripts/tactic_benchmark_summary.sh
 create mode 100644 scripts/training_data.lean
 create mode 100755 scripts/training_data.sh

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e8758e3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+/build
+/lake-packages
+/.cache
+.DS_Store
+/out
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..89f72cb
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,10 @@
+{
+  "editor.insertSpaces": true,
+  "editor.tabSize": 2,
+  "editor.rulers" : [100],
+  "files.encoding": "utf8",
+  "files.eol": "\n",
+  "files.insertFinalNewline": true,
+  "files.trimFinalNewlines": true,
+  "files.trimTrailingWhitespace": true,
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8e96679
--- /dev/null
+++ b/README.md
@@ -0,0 +1,257 @@
+# Data extraction from Lean libraries
+
+This repository provides some tools for extracting data from Lean libraries (particularly Mathlib).
+Most of the focus is data which may be useful for training ML models.
+If you're looking for Lean training data and these tools don't give you what you want, please ask!
+We would like to enable sharing of data and easy access to the Lean libraries for everyone.
+
+## Tools
+
+`declaration_types`
+: For each declaration imported in the target file (e.g. `Mathlib`),
+  print the name and type of the declaration.
+
+`premises`
+: For each declaration imported in a target file (e.g. `Mathlib`),
+  list all of its direct dependencies (i.e. constants referenced from its type or proof).
+  Constants appearing in explicit arguments are prefixed `*`,
+  and constants used by the simplifier are prefixed `s`.
+
+`training_data`
+: Export all goal / tactic pairs from the library, with additional context.
+  `--proofstep` outputs in `[GOAL]...[PROOFSTEP]...` format.
+
+`tactic_benchmark`
+: Run a tactic against every declaration in the library, tracking runtimes and successes.
+
+`export_infotree`
+: Export the `InfoTree` for each module as JSON.
+  Filtering flags: `--tactics` (only tactic nodes) `--substantive` (no structuring tactics) and
+  `--original` (no synthetic syntax nodes, e.g. from macro expansions)
+
+## Usage instructions
+
+* `git clone https://github.com/semorrison/lean-training-data.git`
+
+* Install [`elan`](https://github.com/leanprover/elan) by running
+
+```shell
+curl https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh -sSf | sh
+```
+
+* Go into the `lean-training-data` directory.
+* Run `lake exe cache get` (this downloads precompiled binaries for `Mathlib`).
+* Run `lake exe <tool>`, where `<tool>` is one of the programs documented below.
+
+## Detailed instructions
+
+### `declaration_types`
+
+`lake exe declaration_types Mathlib` will import `Mathlib`, and then print the names and types
+of every declaration in the environment.
+
+Sample output:
+
+```lean
+---
+theorem
+TopologicalSpace.OpenNhds.map_id_obj
+∀ {X : TopCat} (x : ↑X) (U : TopologicalSpace.OpenNhds (↑(CategoryTheory.CategoryStruct.id X) x)),
+  (TopologicalSpace.OpenNhds.map (CategoryTheory.CategoryStruct.id X) x).obj U = U
+```
+
+The first line is the declaration type (`theorem`, `definition`, `inductive`, etc), the second line
+is the name, and subsequent lines are the type. Each block is separated by `---`.
+
+This takes about 16 minutes on my system (probably parallelizes if needed?)
+producing a 45mb file, containing the types for ~190000 declarations in Mathlib/Std/Aesop/Qq/Cli.
+
+### `premises`
+
+`lake exe premises Mathlib` will calculate declaration dependencies up to a target file
+(defaulting to all of Mathlib).
+
+* Declarations are separated by `---`.
+* In each block the first declaration is the theorem or definition we are analyzing,
+* Subsequent indented declarations are those used in its proof or definition.
+* Declarations prefixed with a `*` appear in explicit arguments.
+  (This approximates "is visible in the pretty printed form".)
+* Declarations prefixed with a `s` are used by the simplifier.
+
+Sample output:
+
+```lean
+---
+List.toFinset.ext_iff
+* congrArg
+  List.instMembershipList
+  Finset
+  Finset.instMembershipFinset
+* Membership.mem
+  List.toFinset
+* iff_self
+  List
+* Iff
+* congrFun
+* congr
+  True
+* of_eq_true
+  Eq
+* Eq.trans
+  DecidableEq
+* forall_congr
+s List.mem_toFinset
+s Finset.ext_iff
+s propext
+```
+
+This takes about 4 minutes on my system, producing a 115mb file,
+containing information for ~170000 declarations in Mathlib and Std.
+
+### `training_data`
+
+`lake exe export_infotree Mathlib.Logic.Hydra` will recompile the target module,
+and output all the tactic invocations appearing in the file.
+
+The default output is a verbose human/machine readable format described below,
+or the `--proofstep` flag just gives `[GOAL]...[PROOFSTEP]...` output
+as used for training some models.
+
+TODO: Break out individual steps of `rw` and `simp_rw`, with intermediate goals.
+This is easy to do, just needs some plumbing.
+
+The output consists of blocks of the form:
+
+```text
+===
+Mathlib.Logic.Hydra
+---
+61
+---
+theorem cutExpand_le_invImage_lex [DecidableEq α] [IsIrrefl α r] :
+    CutExpand r ≤ InvImage (Finsupp.Lex (rᶜ ⊓ (· ≠ ·)) (· < ·)) toFinsupp := by
+
+---
+α : Type u_1
+r : α → α → Prop
+inst✝¹ : DecidableEq α
+inst✝ : IsIrrefl α r
+⊢ CutExpand r ≤ InvImage (Finsupp.Lex (rᶜ ⊓ fun x x_1 => x ≠ x_1) fun x x_1 => x < x_1) ↑toFinsupp
+---
+64:2-64:27
+---
+rintro s t ⟨u, a, hr, he⟩
+---
+case intro.intro.intro
+α : Type u_1
+r : α → α → Prop
+inst✝¹ : DecidableEq α
+inst✝ : IsIrrefl α r
+s t u : Multiset α
+a : α
+hr : ∀ (a' : α), a' ∈ u → r a' a
+he : s + {a} = t + u
+⊢ InvImage (Finsupp.Lex (rᶜ ⊓ fun x x_1 => x ≠ x_1) fun x x_1 => x < x_1) (↑toFinsupp) s t
+---
+```
+
+Here:
+
+* `Mathlib.Logic.Hydra` is the name of the module where this goal occurs.
+* `61` is the number of lines before the declaration (i.e. the `theorem` statement is on line `62`.)
+* `theorem ...` is the *partial* declaration, including a fragment of the tactic proof.
+* Next is the goal state at that point.
+  (Typically just one goal, as we don't report the goal states before structuring commands.)
+  Note that there is no guarantee that truncating the file at this point will actually cause Lean
+  to display this goal: the presence of earlier structuring commands could result in Lean showing
+  an error message instead.
+* `64:2-64:27` is the range of positions occupied by the tactic invocation in the original file.
+  This allows us to look up this goal in a live Lean session, so we can try out alternative tactics.
+* `rintro s t ⟨u, a, hr, he⟩` is the tactic used in the library.
+* After that is the goal state after running the tactic.
+  (Often multiple goals.)"
+
+There is also `scripts/training_data.sh`, which will run in parallel over all of Mathlib,
+recording results in `out/training_data/`.
+
+### `tactic_benchmark`
+
+`lake exe tactic_benchmark --aesop Mathlib.Topology.ContinuousFunction.Ordered`
+will run `aesop` on the type of each
+declaration in the target module, reporting success or failure,
+and the runtime (in seconds and heartbeats).
+
+Sample output:
+
+```shell
+% lake exe tactic_benchmark --aesop Mathlib.Topology.ContinuousFunction.Ordered
+Mathlib.Topology.ContinuousFunction.Ordered 7 21
+❌ ContinuousMap.abs (0.006912s) (111 heartbeats)
+❌ ContinuousMap.instAbsContinuousMap (0.007926s) (115 heartbeats)
+✅ ContinuousMap.abs_apply (0.027873s) (401 heartbeats)
+❌ ContinuousMap.partialOrder (0.008932s) (105 heartbeats)
+✅ ContinuousMap.le_def (0.071422s) (984 heartbeats)
+❌ ContinuousMap.lt_def (0.162438s) (2284 heartbeats)
+❌ ContinuousMap.sup (0.012593s) (118 heartbeats)
+✅ ContinuousMap.sup_coe (0.204213s) (2838 heartbeats)
+✅ ContinuousMap.sup_apply (0.201279s) (2799 heartbeats)
+❌ ContinuousMap.semilatticeSup (0.007805s) (119 heartbeats)
+❌ ContinuousMap.inf (0.010117s) (119 heartbeats)
+✅ ContinuousMap.inf_coe (0.175238s) (2798 heartbeats)
+✅ ContinuousMap.inf_apply (0.174505s) (2683 heartbeats)
+❌ ContinuousMap.semilatticeInf (0.007189s) (119 heartbeats)
+❌ ContinuousMap.instLatticeContinuousMap (0.007780s) (119 heartbeats)
+❌ ContinuousMap.sup'_apply (0.049451s) (727 heartbeats)
+❌ ContinuousMap.sup'_coe (0.069892s) (1101 heartbeats)
+❌ ContinuousMap.inf'_apply (0.049896s) (728 heartbeats)
+❌ ContinuousMap.inf'_coe (0.070128s) (1102 heartbeats)
+❌ ContinuousMap.IccExtend (0.030286s) (466 heartbeats)
+✅ ContinuousMap.coe_IccExtend (0.148204s) (2390 heartbeats)
+```
+
+Currently supported flags are `--aesop`, `--simp_all` (which runs `intros; simp_all`),
+`--rfl` (which runs `intros; rfl`), and `--exact` (which runs `exact?`),
+but it is trivial to add more by editing `tactic_benchmark.lean`.
+
+There is also `scripts/tactic_benchmark.sh`, which will run in parallel over all of Mathlib,
+recording results in `out/tactic_benchmark/`.
+
+After you've run it, `scripts/tactic_benchmark_summary.sh` will report the success rates,
+as well as the differential success rates
+(e.g. number of goals solved by `simp_all` but not by `aesop`).
+
+TODO: run on all tactic goals in the library, not just the types of declarations.
+Should not be difficult given the existing components.
+
+### `export_infotree`
+
+`lake exe export_infotree Mathlib.Logic.Hydra` will recompile the target module,
+extract the `InfoTree`s, and then write these out as JSON to stdout.
+The JSON contains pretty-printed goals before and after every tactic invocation,
+and the pretty-printed syntax of every tactic invocation, and explicitly constructed term.
+
+There is also `scripts/export_infotree.sh`, which will run in parallel over all of Mathlib,
+recording results in `out/export_infotree/`.
+
+If you need to use this tool,
+consider modifying one of the other tools to give you directly what you want!
+
+## See also
+
+### LeanDojo
+
+[LeanDojo] provides similar tools.
+
+I like that the tools provided here are standalone tools provided separately from any model or benchmark.
+They are "pure lean" (plus a little bash scripting), and may be useful models for anyone interested
+in metaprogramming tools for examining compiled Lean code.
+
+### Releases
+
+You can find a downloadable copy of the output of all these tools under `Releases`.
+Please ping me if you'd like these to be updated. (We could run a CI job.)
+
+### Derivatives
+
+If you use these tools or the downloadable releases to prepare other publicly available datasets
+(e.g. train/test splits) or models, please reference this repository to help others find it.
diff --git a/TrainingData.lean b/TrainingData.lean
new file mode 100644
index 0000000..b2bc35b
--- /dev/null
+++ b/TrainingData.lean
@@ -0,0 +1,5 @@
+import TrainingData.Frontend
+import TrainingData.InfoTree.Basic
+import TrainingData.InfoTree.ToJson
+import TrainingData.InfoTree.TacticInvocation.Basic
+import TrainingData.Mathlib
\ No newline at end of file
diff --git a/TrainingData/Frontend.lean b/TrainingData/Frontend.lean
new file mode 100644
index 0000000..b97a871
--- /dev/null
+++ b/TrainingData/Frontend.lean
@@ -0,0 +1,238 @@
+/-
+Copyright (c) 2023 Scott Morrison. All rights reserved.
+Released under Apache 2.0 license as described in the file LICENSE.
+Authors: Scott Morrison
+-/
+import Lean.Elab.Frontend
+import Std.Util.TermUnsafe
+import Std.Data.MLList.Basic
+
+/-!
+# Compiling Lean sources to obtain `Environment`, `Message`s and `InfoTree`s.
+
+The main entry point is
+
+```
+def processInput (input : String) (env? : Option Environment := none)
+    (opts : Options := {}) (fileName : Option String := none) (info : Bool := true) :
+    IO (Environment × List Message × List InfoTree) :=
+  ...
+```
+
+which attempts to compile Lean source code, returning an `Environment`,
+along with any generated `Message`s and `InfoTree`s.
+
+The optional argument `env?` allows specifying an existing `Environment`, for partial compilation.
+If this is non-empty, then the source code may not contain any `import` statements.
+
+You may suppress the generation of `InfoTree`s using `info := false`.
+
+For finer-grained control of compilation, we define a `CompilationStep` structure
+which contains information about the results of each command.
+
+You can use `processInput'` to obtain a monadic lazy list of `CompilationStep`s.
+
+The functions `compileModule : Name → IO (List CompilationStep)` and
+`moduleInfoTrees : Name → IO (List InfoTree)` are useful for compiling single modules from source.
+-/
+
+set_option autoImplicit true
+
+open Lean Elab Frontend
+
+namespace Lean.PersistentArray
+
+/--
+Drop the first `n` elements of a `PersistentArray`, returning the results as a `List`.
+-/
+-- We can't remove the `[Inhabited α]` hypotheses here until
+-- `PersistentArray`'s `GetElem` instance also does.
+def drop [Inhabited α] (t : PersistentArray α) (n : Nat) : List α :=
+  List.range (t.size - n) |>.map fun i => t.get! (n + i)
+
+end Lean.PersistentArray
+
+namespace MLList
+
+/-- Run a lazy list in a `ReaderT` monad on some fixed state. -/
+partial def runReaderT [Monad m] (L : MLList (ReaderT.{u, u} ρ m) α) (r : ρ) : MLList m α :=
+  squash fun _ =>
+    return match ← (uncons L).run r with
+    | none => nil
+    | some (a, L') => cons a (L'.runReaderT r)
+
+/-- Run a lazy list in a `StateRefT'` monad on some initial state. -/
+partial def runStateRefT [Monad m] [MonadLiftT (ST ω) m] (L : MLList (StateRefT' ω σ m) α) (s : σ) :
+    MLList m α :=
+  squash fun _ =>
+    return match ← (uncons L).run s with
+    | (none, _) => nil
+    | (some (a, L'), s') => cons a (L'.runStateRefT s')
+
+end MLList
+
+namespace Lean.Elab.IO
+
+/--
+Results from processing a command.
+
+Contains the `Environment` before and after,
+the `src : Substring` and `stx : Syntax` of the command,
+and any `Message`s and `InfoTree`s produced while processing.
+-/
+structure CompilationStep where
+  src : Substring
+  stx : Syntax
+  before : Environment
+  after : Environment
+  msgs : List Message
+  trees : List InfoTree
+
+namespace CompilationStep
+
+/--
+Process one command, returning a `CompilationStep` and
+`done : Bool`, indicating whether this was the last command.
+-/
+def one : FrontendM (CompilationStep × Bool) := do
+  let s := (← get).commandState
+  let beforePos := (← get).cmdPos
+  let before := s.env
+  let done ← processCommand
+  let stx := (← get).commands.back
+  let src := ⟨(← read).inputCtx.input, beforePos, (← get).cmdPos⟩ -- FIXME this is incorrect
+  let s' := (← get).commandState
+  let after := s'.env
+  let msgs := s'.messages.msgs.drop s.messages.msgs.size
+  let trees := s'.infoState.trees.drop s.infoState.trees.size
+  return ({ src, stx, before, after, msgs, trees }, done)
+
+/-- Process all commands in the input. -/
+partial def all : FrontendM (List CompilationStep) := do
+  let (cmd, done) ← CompilationStep.one
+  if done then
+    return [cmd]
+  else
+    return cmd :: (← all)
+
+/-- Return all new `ConstantInfo`s added during the processed command. -/
+def diff (cmd : CompilationStep) : List ConstantInfo :=
+  cmd.after.constants.map₂.toList.filterMap
+    fun (c, i) => if cmd.before.constants.map₂.contains c then none else some i
+
+end CompilationStep
+
+/--
+Returns a monadic lazy list of `CompilationStep`s.
+This needs to be provided with initial state, see `compilationSteps`.
+-/
+partial def compilationSteps_aux :  MLList FrontendM CompilationStep :=
+  .squash fun _ => aux
+where
+  /-- Implementation of `compilationSteps_aux`.  -/
+  aux := do
+    let (cmd, done) ← CompilationStep.one
+    if done then
+      return .ofList [cmd]
+    else
+      return .cons cmd (← aux)
+
+/-- Return the the `CompilationStep`s, as a monadic lazy list in `IO`. -/
+def compilationSteps (inputCtx : Parser.InputContext) (parserState : Parser.ModuleParserState)
+    (commandState : Command.State) : MLList IO CompilationStep :=
+  compilationSteps_aux.runReaderT { inputCtx }
+    |>.runStateRefT { commandState, parserState, cmdPos := parserState.pos }
+
+/--
+Process some text input, with or without an existing environment.
+If there is no existing environment, we parse the input for headers (e.g. import statements),
+and create a new environment.
+Otherwise, we add to the existing environment.
+Returns a list containing data about each processed command.
+
+Be aware that Lean does not support compiling multiple files in the same sessions.
+Often it works, but if the compiled files do anything complicated with initializers then
+nothing is gauranteed.
+-/
+def processInput' (input : String) (env? : Option Environment := none)
+    (opts : Options := {}) (fileName : Option String := none) (info : Bool := true) :
+    MLList IO CompilationStep := unsafe do
+  let fileName   := fileName.getD "<input>"
+  let inputCtx   := Parser.mkInputContext input fileName
+  let (parserState, commandState) ← match env? with
+  | none => do
+    enableInitializersExecution
+    let (header, parserState, messages) ← Parser.parseHeader inputCtx
+    let (env, messages) ← processHeader header opts messages inputCtx
+    pure (parserState, (Command.mkState env messages opts))
+  | some env => do
+    pure ({ : Parser.ModuleParserState }, Command.mkState env {} opts)
+  compilationSteps inputCtx parserState { commandState with infoState.enabled := info }
+
+/--
+Process some text input, with or without an existing environment.
+If there is no existing environment, we parse the input for headers (e.g. import statements),
+and create a new environment.
+Otherwise, we add to the existing environment.
+Returns the resulting environment, along with a list of messages and info trees.
+-/
+def processInput (input : String) (env? : Option Environment := none)
+    (opts : Options := {}) (fileName : Option String := none) (info : Bool := true) :
+    IO (Environment × List Message × List InfoTree) := do
+  let steps ← processInput' input env? opts fileName info |>.force
+  match steps.getLast? with
+  | none => throw <| IO.userError "No commands found in input."
+  | some { after, .. } =>
+    return (after, steps.bind CompilationStep.msgs, steps.bind CompilationStep.trees)
+
+open System
+
+-- TODO allow finding Lean 4 sources from the toolchain.
+def findLean (mod : Name) : IO FilePath := do
+  return FilePath.mk ((← findOLean mod).toString.replace "build/lib/" "") |>.withExtension "lean"
+
+/-- Implementation of `moduleSource`, which is the cached version of this function. -/
+def moduleSource' (mod : Name) : IO String := do
+  IO.FS.readFile (← findLean mod)
+
+initialize sourceCache : IO.Ref <| HashMap Name String ←
+  IO.mkRef .empty
+
+/-- Read the source code of the named module. The results are cached. -/
+def moduleSource (mod : Name) : IO String := do
+  let m ← sourceCache.get
+  match m.find? mod with
+  | some r => return r
+  | none => do
+    let v ← moduleSource' mod
+    sourceCache.set (m.insert mod v)
+    return v
+
+/-- Implementation of `compileModule`, which is the cached version of this function. -/
+def compileModule' (mod : Name) : MLList IO CompilationStep := do
+  Lean.Elab.IO.processInput' (← moduleSource mod) none {} (← findLean mod).toString
+
+initialize compilationCache : IO.Ref <| HashMap Name (List CompilationStep) ←
+  IO.mkRef .empty
+
+/--
+Compile the source file for the named module, returning the
+resulting environment, any generated messages, and all info trees.
+
+The results are cached, although be aware that compiling multiple files in the same session
+is unsupported, and may lead to exciting results:
+you should check all compiled files for error messages if attempting this.
+-/
+def compileModule (mod : Name) : IO (List CompilationStep) := do
+  let m ← compilationCache.get
+  match m.find? mod with
+  | some r => return r
+  | none => do
+    let v ← compileModule' mod |>.force
+    compilationCache.set (m.insert mod v)
+    return v
+
+/-- Compile the source file for the named module, returning all info trees. -/
+def moduleInfoTrees (mod : Name) : IO (List InfoTree) := do
+  let steps ← compileModule mod
+  return steps.bind (fun c => c.trees)
diff --git a/TrainingData/InfoTree/Basic.lean b/TrainingData/InfoTree/Basic.lean
new file mode 100644
index 0000000..e107880
--- /dev/null
+++ b/TrainingData/InfoTree/Basic.lean
@@ -0,0 +1,169 @@
+import Lean
+
+open Lean Elab
+
+namespace Lean.FileMap
+
+/-- Extract the range of a `Syntax` expressed as lines and columns. -/
+-- Extracted from the private declaration `Lean.Elab.formatStxRange`,
+-- in `Lean.Elab.InfoTree.Main`.
+def stxRange (fileMap : FileMap) (stx : Syntax) : Position × Position :=
+  let pos    := stx.getPos?.getD 0
+  let endPos := stx.getTailPos?.getD pos
+  (fileMap.toPosition pos, fileMap.toPosition endPos)
+
+end Lean.FileMap
+
+namespace Lean.Elab.Info
+
+/-- The type of a `Lean.Elab.Info`, as a string. -/
+def kind : Info → String
+  | .ofTacticInfo         _ => "TacticInfo"
+  | .ofTermInfo           _ => "TermInfo"
+  | .ofCommandInfo        _ => "CommmandInfo"
+  | .ofMacroExpansionInfo _ => "MacroExpansionInfo"
+  | .ofOptionInfo         _ => "OptionInfo"
+  | .ofFieldInfo          _ => "FieldInfo"
+  | .ofCompletionInfo     _ => "CompletionInfo"
+  | .ofUserWidgetInfo     _ => "UserWidgetInfo"
+  | .ofCustomInfo         _ => "CustomInfo"
+  | .ofFVarAliasInfo      _ => "FVarAliasInfo"
+  | .ofFieldRedeclInfo    _ => "FieldRedeclInfo"
+
+/-- The `Syntax` for a `Lean.Elab.Info`, if there is one. -/
+def stx? : Info → Option Syntax
+  | .ofTacticInfo         info => info.stx
+  | .ofTermInfo           info => info.stx
+  | .ofCommandInfo        info => info.stx
+  | .ofMacroExpansionInfo info => info.stx
+  | .ofOptionInfo         info => info.stx
+  | .ofFieldInfo          info => info.stx
+  | .ofCompletionInfo     info => info.stx
+  | .ofUserWidgetInfo     info => info.stx
+  | .ofCustomInfo         info => info.stx
+  | .ofFVarAliasInfo      _    => none
+  | .ofFieldRedeclInfo    info => info.stx
+
+/-- Is the `Syntax` for this `Lean.Elab.Info` original, or synthetic? -/
+def isOriginal (i : Info) : Bool :=
+  match i.stx? with
+  | none => true   -- Somewhat unclear what to do with `FVarAliasInfo`, so be conservative.
+  | some stx => match stx.getHeadInfo with
+    | .original .. => true
+    | _ => false
+
+end Lean.Elab.Info
+
+namespace Lean.Elab.ContextInfo
+
+/-- Pretty print an expression in the given `ContextInfo` with the given `LocalContext`. -/
+def ppExpr (ctx : ContextInfo) (lctx : LocalContext) (e : Expr) : IO Format :=
+  ctx.runMetaM lctx (do Meta.ppExpr (← instantiateMVars e))
+
+end Lean.Elab.ContextInfo
+
+namespace Lean.Elab.TacticInfo
+
+/-- Find the name for the outermost `Syntax` in this `TacticInfo`. -/
+def name? (t : TacticInfo) : Option Name :=
+  match t.stx with
+  | Syntax.node _ n _ => some n
+  | _ => none
+
+/-- Decide whether a tactic is "substantive",
+or is merely a tactic combinator (e.g. `by`, `;`, multiline tactics, parenthesized tactics). -/
+def isSubstantive (t : TacticInfo) : Bool :=
+  match t.name? with
+  | none => false
+  | some `null => false
+  | some ``cdot => false
+  | some ``cdotTk => false
+  | some ``Lean.Parser.Term.byTactic => false
+  | some ``Lean.Parser.Tactic.tacticSeq => false
+  | some ``Lean.Parser.Tactic.tacticSeq1Indented => false
+  | some ``Lean.Parser.Tactic.«tactic_<;>_» => false
+  | some ``Lean.Parser.Tactic.paren => false
+  | _ => true
+
+end Lean.Elab.TacticInfo
+
+namespace Lean.Elab.InfoTree
+
+/--
+Keep `.node` nodes and `.hole` nodes satisfying predicates.
+
+Returns a `List InfoTree`, although in most situations this will be a singleton.
+-/
+partial def filter (p : Info → Bool) (m : MVarId → Bool := fun _ => false) :
+    InfoTree → List InfoTree
+  | .context ctx tree => tree.filter p m |>.map (.context ctx)
+  | .node info children =>
+    if p info then
+      [.node info (children.toList.map (filter p m)).join.toPArray']
+    else
+      (children.toList.map (filter p m)).join
+  | .hole mvar => if m mvar then [.hole mvar] else []
+
+/-- Discard all nodes besides `.context` nodes and `TacticInfo` nodes. -/
+partial def retainTacticInfo (tree : InfoTree) : List InfoTree :=
+  tree.filter fun | .ofTacticInfo _ => true | _ => false
+
+/-- Retain only nodes with "original" syntax. -/
+partial def retainOriginal (tree : InfoTree) : List InfoTree :=
+  tree.filter Info.isOriginal
+
+/-- Discard all TacticInfo nodes that are tactic combinators or structuring tactics. -/
+-- There is considerable grey area here: what to do with `classical`?
+partial def retainSubstantive (tree : InfoTree) : List InfoTree :=
+  tree.filter fun | .ofTacticInfo i => i.isSubstantive | _ => true
+
+/-- Discard any enclosing `InfoTree.context` layers. -/
+def consumeContext : InfoTree → InfoTree
+  | .context _ t => t.consumeContext
+  | t => t
+
+/-- Is this `InfoTree` a `TermInfo` for some `Expr`? -/
+def ofExpr? (i : InfoTree) : Option Expr := match i with
+  | .node (.ofTermInfo i) _ => some i.expr
+  | _ => none
+
+/-- Is this `InfoTree` a `TermInfo` for some `Name`? -/
+def ofName? (i : InfoTree) : Option Name := i.ofExpr?.bind Expr.constName?
+
+/-- Check if the `InfoTree` is the top level `InfoTree` for a declaration,
+if so, return it along with the declaration name. -/
+def elabDecl? (t : InfoTree) : Option Name :=
+  match t.consumeContext with
+  | .node (.ofCommandInfo i) c =>
+    if i.elaborator == `Lean.Elab.Command.elabDeclaration
+    then
+      -- this is hacky: we are relying on the ordering of the child nodes.
+      c.toList.foldr (fun cc acc => match (cc.consumeContext.ofName?, acc) with
+                       | (_, some r) => some r
+                       | (some n, none) => some n
+                       | (none, none) => none )
+                     none
+    else
+      none
+  | _ => none
+
+/-- Analogue of `Lean.Elab.InfoTree.findInfo?`, but that returns a list of all results. -/
+partial def findAllInfo (t : InfoTree) (ctx : Option ContextInfo) (p : Info → Bool) :
+    List (Info × Option ContextInfo × PersistentArray InfoTree) :=
+  match t with
+  | context ctx t => t.findAllInfo ctx p
+  | node i ts  =>
+      (if p i then [(i, ctx, ts)] else []) ++ ts.toList.bind (fun t => t.findAllInfo ctx p)
+  | _ => []
+
+/-- Return all `TacticInfo` nodes in an `InfoTree` corresponding to tactics,
+each equipped with its relevant `ContextInfo`, and any children info trees. -/
+def findTacticNodes (t : InfoTree) : List (TacticInfo × ContextInfo × PersistentArray InfoTree) :=
+  let infos := t.findAllInfo none fun i => match i with
+    | .ofTacticInfo _ => true
+    | _ => false
+  infos.filterMap fun p => match p with
+  | (.ofTacticInfo i, some ctx, children) => (i, ctx, children)
+  | _ => none
+
+end Lean.Elab.InfoTree
diff --git a/TrainingData/InfoTree/TacticInvocation/Basic.lean b/TrainingData/InfoTree/TacticInvocation/Basic.lean
new file mode 100644
index 0000000..0874700
--- /dev/null
+++ b/TrainingData/InfoTree/TacticInvocation/Basic.lean
@@ -0,0 +1,74 @@
+import TrainingData.InfoTree.Basic
+
+set_option autoImplicit true
+
+open Lean Elab
+
+/--
+A helper structure containing a `TacticInfo` and `ContextInfo`,
+along with children `InfoTree`s.
+
+It is convenient to bundle these together because
+many functions rely on both the `TacticInfo` and the `ContextInfo`.
+-/
+structure Lean.Elab.TacticInvocation where
+  info : TacticInfo
+  ctx : ContextInfo
+  children : PersistentArray InfoTree
+
+namespace Lean.Elab.TacticInvocation
+
+/-- Return the range of the tactic, as a pair of file positions. -/
+def range (t : TacticInvocation) : Position × Position := t.ctx.fileMap.stxRange t.info.stx
+
+/-- Pretty print a tactic. -/
+def pp (t : TacticInvocation) : IO Format :=
+  t.ctx.runMetaM {} try
+    Lean.PrettyPrinter.ppTactic ⟨t.info.stx⟩
+  catch _ =>
+    pure "<failed to pretty print>"
+
+open Meta
+
+/-- Run a tactic on the goals stored in a `TacticInvocation`. -/
+def runMetaMGoalsBefore (t : TacticInvocation) (x : List MVarId → MetaM α) : IO α := do
+  t.ctx.runMetaM {} <| Meta.withMCtx t.info.mctxBefore <| x t.info.goalsBefore
+
+/-- Run a tactic on the after goals stored in a `TacticInvocation`. -/
+def runMetaMGoalsAfter (t : TacticInvocation) (x : List MVarId → MetaM α) : IO α := do
+  t.ctx.runMetaM {} <| Meta.withMCtx t.info.mctxAfter <| x t.info.goalsAfter
+
+/-- Run a tactic on the main goal stored in a `TacticInvocation`. -/
+def runMetaM (t : TacticInvocation) (x : MVarId → MetaM α) : IO α := do
+  match t.info.goalsBefore.head? with
+  | none => throw <| IO.userError s!"No goals at {← t.pp}"
+  | some g => t.runMetaMGoalsBefore fun _ => do g.withContext <| x g
+
+def mainGoal (t : TacticInvocation) : IO Expr :=
+  t.runMetaM (fun g => do instantiateMVars (← g.getType))
+
+def formatMainGoal (t : TacticInvocation) : IO Format :=
+  t.runMetaM (fun g => do ppExpr (← instantiateMVars (← g.getType)))
+
+def goalState (t : TacticInvocation) : IO (List Format) := do
+  t.runMetaMGoalsBefore (fun gs => gs.mapM fun g => do Meta.ppGoal g)
+
+def goalStateAfter (t : TacticInvocation) : IO (List Format) := do
+  t.runMetaMGoalsAfter (fun gs => gs.mapM fun g => do Meta.ppGoal g)
+
+def ppExpr (t : TacticInvocation) (e : Expr) : IO Format :=
+  t.runMetaM (fun _ => do Meta.ppExpr (← instantiateMVars e))
+
+end Lean.Elab.TacticInvocation
+
+namespace Lean.Elab.InfoTree
+
+/--
+Finds all tactic invocations in an `InfoTree`,
+ignoring structuring tactics (e.g. `by`, `;`, multiline tactics, parenthesized tactics).
+-/
+def tactics (t : InfoTree) : List TacticInvocation :=
+  t.findTacticNodes.map (fun ⟨i, ctx, children⟩ => ⟨i, ctx, children⟩)
+    |>.filter fun i => i.info.isSubstantive
+
+end Lean.Elab.InfoTree
diff --git a/TrainingData/InfoTree/ToJson.lean b/TrainingData/InfoTree/ToJson.lean
new file mode 100644
index 0000000..65ab8cb
--- /dev/null
+++ b/TrainingData/InfoTree/ToJson.lean
@@ -0,0 +1,114 @@
+import TrainingData.InfoTree.TacticInvocation.Basic
+
+/-!
+# Exporting an `InfoTree` as Json
+
+-/
+
+namespace Lean.Elab
+
+structure InfoTreeNode (α : Type) where
+  kind : String
+  node : Option α
+  children : List Json
+deriving ToJson
+
+deriving instance ToJson for Lean.Position
+
+structure Syntax.Range where
+  synthetic : Bool
+  start : Lean.Position
+  finish : Lean.Position
+deriving ToJson
+
+structure Syntax.Json where
+  pp : Option String
+  -- raw : String
+  range : Range
+deriving ToJson
+
+def _root_.Lean.Syntax.toRange (stx : Syntax) (ctx : ContextInfo) : Syntax.Range :=
+  let pos    := stx.getPos?.getD 0
+  let endPos := stx.getTailPos?.getD pos
+  { start := ctx.fileMap.toPosition pos
+    finish := ctx.fileMap.toPosition endPos
+    synthetic := match stx.getHeadInfo with
+    | .original .. => false
+    | _ => true }
+
+def _root_.Lean.Syntax.toJson (stx : Syntax) (ctx : ContextInfo) (lctx : LocalContext) : IO Syntax.Json := do
+  return {
+    pp := match (← ctx.ppSyntax lctx stx).pretty with
+      | "failed to pretty print term (use 'set_option pp.rawOnError true' for raw representation)" => none
+      | pp => some pp
+    -- raw := toString stx
+    range := stx.toRange ctx }
+
+structure TacticInfo.Json where
+  name : Option Name
+  stx : Syntax.Json
+  goalsBefore : List String
+  goalsAfter : List String
+deriving ToJson
+
+-- Note: this is not responsible for converting the children to Json.
+def TacticInfo.toJson (i : TacticInfo) (ctx : ContextInfo) : IO TacticInfo.Json := do
+  let I : TacticInvocation := ⟨i, ctx, .empty⟩
+  return {
+    name := i.name?
+    stx :=
+    { pp := Format.pretty (← I.pp),
+      -- raw := toString i.info.stx,
+      range := i.stx.toRange ctx },
+    goalsBefore := (← I.goalState).map Format.pretty,
+    goalsAfter := (← I.goalStateAfter).map Format.pretty }
+
+structure CommandInfo.Json where
+  elaborator : Option Name
+  stx : Syntax.Json
+deriving ToJson
+
+def CommandInfo.toJson (info : CommandInfo) (ctx : ContextInfo) : IO CommandInfo.Json := do
+  return {
+    elaborator := match info.elaborator with | .anonymous => none | n => some n,
+    stx := ← info.stx.toJson ctx {} }
+
+structure TermInfo.Json where
+  elaborator : Option Name
+  stx : Syntax.Json
+  expectedType? : Option String
+  expr : String
+  isBinder : Bool
+deriving ToJson
+
+def TermInfo.toJson (info : TermInfo) (ctx : ContextInfo) : IO TermInfo.Json := do
+  return {
+    elaborator := match info.elaborator with | .anonymous => none | n => some n,
+    stx := ← info.stx.toJson ctx info.lctx,
+    expectedType? := ← info.expectedType?.mapM fun ty => do
+      pure (← ctx.ppExpr info.lctx ty).pretty
+    expr := (← ctx.ppExpr info.lctx info.expr).pretty
+    isBinder := info.isBinder }
+
+structure InfoTree.HoleJson where
+  goalState : String
+deriving ToJson
+
+partial def InfoTree.toJson (t : InfoTree) (ctx? : Option ContextInfo) : IO Json := do
+  match t with
+  | .context i t => t.toJson i
+  | .node info children =>
+    if let some ctx := ctx? then
+      let node : Option Json ← match info with
+      | .ofTermInfo    info => some <$> (do pure <| Lean.toJson (← info.toJson ctx))
+      | .ofCommandInfo info => some <$> (do pure <| Lean.toJson (← info.toJson ctx))
+      | .ofTacticInfo  info => some <$> (do pure <| Lean.toJson (← info.toJson ctx))
+      | _                   => pure none
+      return Lean.toJson (InfoTreeNode.mk info.kind node (← children.toList.mapM fun t' => t'.toJson ctx))
+    else throw <| IO.userError "No `ContextInfo` available."
+  | .hole mvarId =>
+    if let some ctx := ctx? then
+     return Lean.toJson (InfoTree.HoleJson.mk (← ctx.runMetaM {} (do Meta.ppGoal mvarId)).pretty)
+    else throw <| IO.userError "No `ContextInfo` available."
+
+end Lean.Elab
diff --git a/TrainingData/Mathlib.lean b/TrainingData/Mathlib.lean
new file mode 100644
index 0000000..befa7a0
--- /dev/null
+++ b/TrainingData/Mathlib.lean
@@ -0,0 +1 @@
+import Mathlib
\ No newline at end of file
diff --git a/lake-manifest.json b/lake-manifest.json
new file mode 100644
index 0000000..f95ec2a
--- /dev/null
+++ b/lake-manifest.json
@@ -0,0 +1,51 @@
+{"version": 5,
+ "packagesDir": "lake-packages",
+ "packages":
+ [{"git":
+   {"url": "https://github.com/EdAyers/ProofWidgets4",
+    "subDir?": null,
+    "rev": "a0c2cd0ac3245a0dade4f925bcfa97e06dd84229",
+    "opts": {},
+    "name": "proofwidgets",
+    "inputRev?": "v0.0.13",
+    "inherited": true}},
+  {"git":
+   {"url": "https://github.com/mhuisi/lean4-cli.git",
+    "subDir?": null,
+    "rev": "5a858c32963b6b19be0d477a30a1f4b6c120be7e",
+    "opts": {},
+    "name": "Cli",
+    "inputRev?": "nightly",
+    "inherited": true}},
+  {"git":
+   {"url": "https://github.com/leanprover-community/mathlib4.git",
+    "subDir?": null,
+    "rev": "c779bd594f3dbfeb8d26e4ca968eb7b3831f369e",
+    "opts": {},
+    "name": "mathlib",
+    "inputRev?": "withImportModules",
+    "inherited": false}},
+  {"git":
+   {"url": "https://github.com/gebner/quote4",
+    "subDir?": null,
+    "rev": "81cc13c524a68d0072561dbac276cd61b65872a6",
+    "opts": {},
+    "name": "Qq",
+    "inputRev?": "master",
+    "inherited": true}},
+  {"git":
+   {"url": "https://github.com/JLimperg/aesop",
+    "subDir?": null,
+    "rev": "086c98bb129ca856381d4414dc0afd6e3e4ae2ef",
+    "opts": {},
+    "name": "aesop",
+    "inputRev?": "master",
+    "inherited": true}},
+  {"git":
+   {"url": "https://github.com/leanprover/std4",
+    "subDir?": null,
+    "rev": "61a6507eff4dd71c01e0c7e194bc569d4719a6d3",
+    "opts": {},
+    "name": "std",
+    "inputRev?": "main",
+    "inherited": true}}]}
diff --git a/lakefile.lean b/lakefile.lean
new file mode 100644
index 0000000..a2d39de
--- /dev/null
+++ b/lakefile.lean
@@ -0,0 +1,30 @@
+import Lake
+open Lake DSL
+
+package «lean-training-data» {
+  -- add any package configuration options here
+}
+
+require mathlib from git
+  "https://github.com/leanprover-community/mathlib4.git" @ "withImportModules"
+
+@[default_target]
+lean_lib TrainingData where
+
+lean_exe tactic_benchmark where
+  root := `scripts.tactic_benchmark
+  supportInterpreter := true
+
+lean_exe export_infotree where
+  root := `scripts.export_infotree
+  supportInterpreter := true
+
+lean_exe training_data where
+  root := `scripts.training_data
+  supportInterpreter := true
+
+lean_exe premises where
+  root := `scripts.premises
+
+lean_exe declaration_types where
+  root := `scripts.declaration_types
diff --git a/lean-toolchain b/lean-toolchain
new file mode 100644
index 0000000..1b34910
--- /dev/null
+++ b/lean-toolchain
@@ -0,0 +1 @@
+leanprover/lean4:nightly-2023-08-19
diff --git a/make.sh b/make.sh
new file mode 100644
index 0000000..fd8a446
--- /dev/null
+++ b/make.sh
@@ -0,0 +1,40 @@
+# This script prepares the following files:
+# * YYYY-MM-DD-declaration_types.gz
+# * YYYY-MM-DD-premises.gz
+# * YYYY-MM-DD-export_infotree_full.tgz
+# * YYYY-MM-DD-export_infotree_min.tgz
+# * YYYY-MM-DD-training_data.tgz
+# * YYYY-MM-DD-proof_step.tgz
+
+DATE=`date "+%Y-%m-%d"`
+
+lake exe declaration_types > out/$DATE-declaration_types
+gzip out/$DATE-declaration_types
+
+lake exe premises > out/$DATE-premises
+gzip out/$DATE-premises
+
+rm -rf out/export_infotree
+./scripts/export_infotree.sh
+cd out
+tar cvzf $DATE-export_infotree_full.tgz export_infotree
+cd ..
+rm -rf out/export_infotree
+./scripts/export_infotree.sh --tactics --original --substantive
+cd out
+tar cvzf $DATE-export_infotree_min.tgz export_infotree
+cd ..
+
+rm -rf out/training_data
+./scripts/training_data.sh
+cd out
+tar cvzf $DATE-training_data.tgz training_data
+cd ..
+rm -rf out/training_data
+./scripts/training_data.sh --proofstep
+cd out
+tar cvzf $DATE-proof_step.tgz training_data
+cd ..
+
+rm -rf out/tactic_benchmark
+./scripts/tactic_benchmark.sh --aesop
diff --git a/scripts/declaration_types.lean b/scripts/declaration_types.lean
new file mode 100644
index 0000000..11eecae
--- /dev/null
+++ b/scripts/declaration_types.lean
@@ -0,0 +1,27 @@
+import Mathlib.Lean.CoreM
+
+open Lean Meta
+
+def Lean.ConstantInfo.kind : ConstantInfo → String
+  | .axiomInfo  _ => "axiom"
+  | .defnInfo   _ => "def"
+  | .thmInfo    _ => "theorem"
+  | .opaqueInfo _ => "opaque"
+  | .quotInfo   _ => "quot" -- Not sure what this is!
+  | .inductInfo _ => "inductive"
+  | .ctorInfo   _ => "constructor"
+  | .recInfo    _ => "recursor"
+
+def main (args : List String) : IO UInt32 := do
+  let modules := match args with
+  | [] => [`Mathlib]
+  | args => args.map fun s => s.toName
+  searchPathRef.set compileTimeSearchPath%
+  CoreM.withImportModules modules do
+    for (n, c) in (← getEnv).constants.map₁ do
+      if ! (← n.isBlackListed) then
+        IO.println "---"
+        IO.println c.kind
+        IO.println n
+        IO.println (← MetaM.run' do ppExpr c.type)
+  return 0
diff --git a/scripts/export_infotree.lean b/scripts/export_infotree.lean
new file mode 100644
index 0000000..6f57271
--- /dev/null
+++ b/scripts/export_infotree.lean
@@ -0,0 +1,47 @@
+import TrainingData.Frontend
+import TrainingData.InfoTree.ToJson
+import Mathlib.Tactic.ToExpr -- Upstreamed to std4 as https://github.com/leanprover/std4/pull/221
+import Mathlib.Util.Cli -- Upstreamed to lean4-cli as https://github.com/mhuisi/lean4-cli/pull/15
+import Mathlib.Lean.CoreM
+
+open Lean Elab IO Meta
+open Cli System
+
+structure InfoTreeExport where
+  name : Name
+  trees : List Json
+deriving ToJson
+
+def exportInfoTree (args : Cli.Parsed) : IO UInt32 := do
+    searchPathRef.set compileTimeSearchPath%
+    let target := args.positionalArg! "module" |>.as! Name
+    let mut trees ← moduleInfoTrees target
+    if args.hasFlag "tactics" then
+      trees := (trees.map InfoTree.retainTacticInfo).join
+    if args.hasFlag "original" then
+      trees := (trees.map InfoTree.retainOriginal).join
+    if args.hasFlag "substantive" then
+      trees := (trees.map InfoTree.retainSubstantive).join
+    let json ← trees.mapM fun t => t.toJson none
+    IO.println <| toJson <| InfoTreeExport.mk target json
+    return 0
+
+/-- Setting up command line options and help text for `lake exe export_infotree`. -/
+def export_infotree : Cmd := `[Cli|
+  export_infotree VIA exportInfoTree; ["0.0.1"]
+  "Export the InfoTrees for a file as JSON."
+
+  FLAGS:
+    "tactics";      "Only export TacticInfo nodes."
+    "original";     "Skip nodes with synthetic syntax."
+    "substantive";  "Skip tactic combinators."
+
+  ARGS:
+    module : Name; "Lean module to compile and export InfoTrees."
+]
+
+/-- `lake exe export_infotree` -/
+def main (args : List String) : IO UInt32 :=
+  export_infotree.validate args
+
+-- See `./scripts/export_infotree.sh` to run this against all of Mathlib.
diff --git a/scripts/export_infotree.sh b/scripts/export_infotree.sh
new file mode 100755
index 0000000..6c09b63
--- /dev/null
+++ b/scripts/export_infotree.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+# Run either as `scripts/export_infotree.sh` to run on all of Mathlib,
+# or `scripts/export_infotree.sh Mathlib.Logic.Hydra` to run on just one file.
+# Results will go in `build/export_infotree/Mathlib.Logic.Hydra.json`.
+
+# To pass the flags `--tactics --original --substantive` you have to modify the script below.
+# TODO: proper command line arguments.
+
+
+FLAGS=()
+ARGS=()
+
+for arg in "$@"; do
+    if [[ $arg == --* ]]; then
+        FLAGS+=("$arg")
+    else
+        ARGS+=("$arg")
+    fi
+done
+
+if [ ${#ARGS[@]} -eq 0 ]; then
+  rm -f ./build/lake.lock
+  lake build export_infotree
+  parallel -j32 ./scripts/export_infotree.sh ${FLAGS[@]} -- `cat lake-packages/mathlib/Mathlib.lean | sed -e 's/import //'`
+else
+  DIR=out/export_infotree
+  mkdir -p $DIR
+  mod=${ARGS[0]}
+  if [ ! -f $DIR/$mod.json ]; then
+    echo $mod
+    if [ ! -f build/bin/export_infotree ]; then
+      lake build export_infotree
+    fi
+    build/bin/export_infotree ${FLAGS[@]} $mod > $DIR/$mod._json && mv $DIR/$mod._json $DIR/$mod.json
+    # build/bin/export_infotree $mod > $DIR/$mod._json && mv $DIR/$mod._json $DIR/$mod.json
+  fi
+fi
diff --git a/scripts/premises.lean b/scripts/premises.lean
new file mode 100644
index 0000000..b1f0ae1
--- /dev/null
+++ b/scripts/premises.lean
@@ -0,0 +1,130 @@
+import Mathlib.Lean.CoreM
+import Mathlib.Control.Basic
+
+/-!
+Generate declaration dependencies up to a target file (defaulting to all of Mathlib).
+
+* Declarations are separated by `---`.
+* In each block the first declaration is the theorem or definition we are analyzing,
+* Subsequent indented declarations are those used in its proof or definition.
+* Declarations prefixed with a `* ` appear in explicit arguments.
+  (This approximates "is visible in the pretty printed form".)
+* Declarations prefixed with a `s ` are used by the simplifier.
+
+-/
+
+open Lean Meta
+
+def isAuxLemma : Name → Bool
+| .num (.str _ "_auxLemma") _ => true
+| _ => false
+
+partial def Lean.ConstantInfo.getUsedConstants' (c : ConstantInfo)
+    (constantsMap : HashMap Name ConstantInfo)
+    (unfolding : Name → Bool := isAuxLemma) : NameSet × NameSet := Id.run do
+  let mut direct : NameSet := ∅
+  let mut unfolded : NameSet := ∅
+  for n in c.getUsedConstants do
+    if unfolding n then
+      if let some c := constantsMap.find? n then
+        unfolded := unfolded ++ c.getUsedConstants
+    else
+      direct := direct.insert n
+  return (direct, unfolded)
+
+/--
+Traverse all constants from imported files,
+collecting the constants which are used in either the type or value of the theorem or definition.
+
+A predicate `unfolding` picks out a class of constants which should not be returned,
+but instead replaced with the set of constants appearing in their type or value.
+The typical use case is to unfold `_auxLemma`s generated dynamically by the simplifier.
+-/
+def allUsedConstants (unfolding : Name → Bool := isAuxLemma) :
+    CoreM (NameMap (NameSet × NameSet)) := do
+  let constantsMap := (← getEnv).constants.map₁
+  let mut result : NameMap (NameSet × NameSet) := ∅
+  for (n, c) in constantsMap do
+    -- We omit all internally generated auxiliary statements.
+    if ! (← n.isBlackListed) then
+      result := result.insert n (c.getUsedConstants' constantsMap unfolding)
+  return result
+
+open Lean in
+elab "#printNum " n:ident i:num : command => do
+  let name := Name.num n.getId i.getNat
+  let some decl := (← getEnv).find? name | throwError "no such declaration {name}"
+  logInfo m!"{name} : {decl.type} :=\n{decl.value?.getD (.bvar 0)}"
+
+/--
+Traverse an expression, collecting `Name`s,
+but do not descend into implicit arguments.
+-/
+partial def Lean.Expr.explicitConstants : Expr → MetaM NameSet
+| .app f x => do
+  -- We wrap with `try?` here because on tiny fraction of declarations in Mathlib,
+  -- e.g. `Computation.exists_of_mem_parallel`, this fails with an error like
+  -- `function expected ?m.88 ?m.93`.
+  match (← try? (inferType f)) with
+  | some (.forallE _ _ _ .default) => return (← f.explicitConstants) ++ (← x.explicitConstants)
+  | _ => f.explicitConstants
+| .lam _ t b _ => do b.instantiate1 (← mkFreshExprMVar t) |>.explicitConstants
+| .forallE _ t b _ => do b.instantiate1 (← mkFreshExprMVar t) |>.explicitConstants
+| .letE n t v b _ => return (← v.explicitConstants)
+    ++ (← withLetDecl n t v fun fvar => (b.instantiate1 fvar).explicitConstants)
+| .const n _ => return NameSet.empty.insert n
+| .mdata _ e => e.explicitConstants
+| _ => return NameSet.empty
+
+/--
+Collect all the constants used in the values of theorem or definition,
+ignoring implicit arguments of functions.
+-/
+def Lean.ConstantInfo.explicitConstants (c : ConstantInfo) : MetaM NameSet := do
+  match c.value? with
+  | some e => e.explicitConstants
+  | none => return ∅
+
+/--
+Traverse all constants from imported files,
+collecting the constants which are used in the value of the theorem or definition,
+ignoring implicit arguments of functions.
+-/
+def allExplicitConstants : MetaM (NameMap NameSet) := do
+  let r := (← getEnv).constants.map₁
+  let mut result : NameMap NameSet := ∅
+  for (n, c) in r do
+    -- We omit all internally generated auxiliary statements.
+    if ! (← n.isBlackListed) then
+      result := result.insert n (← c.explicitConstants)
+  return result
+
+def main (args : List String) : IO UInt32 := do
+  let options := Options.empty.insert `maxHeartbeats (0 : Nat)
+  let modules := match args with
+  | [] => [`Mathlib]
+  | args => args.map fun s => s.toName
+  searchPathRef.set compileTimeSearchPath%
+  CoreM.withImportModules modules (options := options) do
+    let allConstants ← allUsedConstants
+    let explicitConstants ← MetaM.run' allExplicitConstants
+    for (n, (d, u)) in allConstants do
+      match n.components with
+      -- Note we keep `Std` as it has many lemmas about numbers and data structures.
+      | "Lean" :: _
+      | "Qq" :: _
+      | "Cli" :: _
+      | "Aesop" :: _ => continue
+      | components => if components.contains "Tactic" then continue
+      let explicit := explicitConstants.find? n |>.getD ∅
+      IO.println "---"
+      IO.println n
+      for m in d do
+        if explicit.contains m then
+          IO.println s!"* {m}"
+        else
+          IO.println s!"  {m}"
+      for m in u do
+        if ! d.contains m then
+          IO.println s!"s {m}"
+  return 0
diff --git a/scripts/tactic_benchmark.lean b/scripts/tactic_benchmark.lean
new file mode 100644
index 0000000..808662f
--- /dev/null
+++ b/scripts/tactic_benchmark.lean
@@ -0,0 +1,150 @@
+import TrainingData.Frontend
+import Mathlib.Control.Basic
+import Mathlib.Lean.Expr.Basic
+import Mathlib.Tactic.Common
+import Mathlib.Tactic.ToExpr
+import Mathlib.Util.Cli
+import Aesop
+import Lean.Util.Trace
+
+open Lean Core Elab IO Meta Term Tactic -- All the monads!
+
+
+/-- Count the number of heartbeats used during a monadic function. -/
+-- Upstreamed in https://github.com/leanprover/std4/pull/212
+def Lean.withHeartbeats [Monad m] [MonadLiftT BaseIO m] (x : m α) : m (α × Nat) := do
+  let start ← IO.getNumHeartbeats
+  let r ← x
+  let finish ← IO.getNumHeartbeats
+  return (r, (finish - start) / 1000)
+
+set_option autoImplicit true
+
+/--
+Compile the designated module, and run a monadic function with each new `ConstantInfo`,
+with the `Environment` as it was *before* the command which created that declaration.
+
+(Internal declarations according to `Name.isBlackListed` are skipped.)
+-/
+def runAtDecls (mod : Name) (tac : ConstantInfo → MetaM (Option α)) :
+    MLList IO (ConstantInfo × α) := do
+  let fileName := (← findLean mod).toString
+  let steps := compileModule' mod
+  let targets := steps.bind fun c => (MLList.ofList c.diff).map fun i => (c, i)
+
+  targets.filterMapM fun (cmd, ci) => do
+    for m in cmd.msgs do IO.eprintln (bombEmoji ++ (← m.data.toString))
+    unless cmd.msgs.isEmpty do
+      throw <| IO.userError s!"Unexpected messages in: {mod} during elaboration of {cmd.stx}"
+
+    let options := ({} : KVMap).insert `maxHeartbeats (.ofNat 20000)
+    let ctx := { fileName, options, fileMap := default }
+    let state := { env := cmd.before }
+    -- From `IO` to `CoreM`:
+    Prod.fst <$> (CoreM.toIO · ctx state) do
+      if ← ci.name.isBlackListed then
+        pure none
+      else
+        -- From `CoreM` to `MetaM`:
+        MetaM.run' (ctx := {}) (s := {}) do
+          match ← tac ci with
+          | some r => pure (ci, r)
+          | none => pure none
+
+inductive ResultType
+| failure
+| subgoals
+| notDefEq
+| success
+deriving Repr, BEq
+
+instance : ToString ResultType where
+  toString := fun
+  | .failure => "failure"
+  | .subgoals => "subgoals"
+  | .notDefEq => "notDefEq"
+  | .success => "success"
+
+structure Result where
+  type : ResultType
+  seconds : Float
+  heartbeats : Nat
+
+/--
+Compile the designated module, select declarations satisfying a predicate,
+and run a tactic on the type of each declaration.
+-/
+def runTacticAtDecls (mod : Name) (decls : ConstantInfo → CoreM Bool) (tac : TacticM Unit) :
+    MLList IO (ConstantInfo × Result) := do
+  runAtDecls mod fun ci => do
+    if ! (← decls ci) then return none
+    let g ← mkFreshExprMVar ci.type
+    -- From `MetaM` to `TermElabM`
+    let ((gs, seconds), heartbeats) ← withHeartbeats <| withSeconds <|
+      try? <| TermElabM.run' do
+        -- From `TermElabM` to `TacticM`!
+        Tactic.run g.mvarId! tac
+    let type : ResultType ← match gs with
+    | none => pure .failure
+    | some (_ :: _) => pure .subgoals
+    | some [] =>
+      match ci.value? with
+      | none => pure .success
+      | some v =>
+        if ← isProp ci.type then
+          pure .success
+        else
+        match ← try? (isDefEq g v) with
+        | none
+          -- In this case we should perhaps return an "uncertain" value.
+          -- The problem is that `v` may contain constants generated by the simplifier
+          -- during elaboration of the original proof,
+          -- and which aren't in the current environment, so we can't really compare `g` and `v`
+        | some false => pure .notDefEq
+        | some true => pure .success
+    return some ⟨type, seconds, heartbeats⟩
+
+
+def useAesop : TacticM Unit := do evalTactic (← `(tactic| aesop))
+def useExact? : TacticM Unit := do evalTactic (← `(tactic| exact?))
+def useRfl : TacticM Unit := do evalTactic (← `(tactic| intros; rfl))
+def useSimpAll : TacticM Unit := do evalTactic (← `(tactic| intros; simp_all))
+
+open Cli System
+
+def tacticBenchmarkMain (args : Cli.Parsed) : IO UInt32 := do
+  let module := args.positionalArg! "module" |>.as! Name
+  searchPathRef.set compileTimeSearchPath%
+  let tac ←
+    if args.hasFlag "aesop" then pure useAesop else
+    if args.hasFlag "exact" then pure useExact? else
+    if args.hasFlag "rfl" then pure useRfl else
+    if args.hasFlag "simp_all" then pure useSimpAll else
+    throw <| IO.userError "Specify a tactic, e.g. `--aesop`"
+  let result := runTacticAtDecls module (fun _ => pure true) tac
+  IO.println s!"{module}"
+  for (ci, ⟨type, seconds, heartbeats⟩) in result do
+    IO.println <| (if type == .success then checkEmoji else crossEmoji) ++ " " ++ ci.name.toString ++
+      s!" ({seconds}s) ({heartbeats} heartbeats)"
+  return 0
+
+/-- Setting up command line options and help text for `lake exe tactic_benchmark`. -/
+def tactic_benchmark : Cmd := `[Cli|
+  tactic_benchmark VIA tacticBenchmarkMain; ["0.0.1"]
+  "Run a customisable tactic at all declarations in a file."
+
+  FLAGS:
+    "aesop";       "Use `aesop`."
+    "exact";       "Use `exact?`."
+    "rfl";         "Use `intros; rfl`."
+    "simp_all";    "Use `intros; simp_all`."
+
+  ARGS:
+    module : Name; "Lean module to compile and export InfoTrees."
+]
+
+/-- `lake exe tactic_benchmark` -/
+def main (args : List String) : IO UInt32 :=
+  tactic_benchmark.validate args
+
+-- See `scripts/tactic_benchmark.sh` for a script to run this on all of Mathlib.
diff --git a/scripts/tactic_benchmark.sh b/scripts/tactic_benchmark.sh
new file mode 100755
index 0000000..321738f
--- /dev/null
+++ b/scripts/tactic_benchmark.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+# Run either as `scripts/tactic_benchmark.sh --aesop` to run aesop on all of Mathlib,
+# or `scripts/tactic_benchmark.sh --simp_all Mathlib.Logic.Hydra` to run on just one file.
+# Results will go in `build/tactic_benchmark/simp_all/Mathlib.Logic.Hydra.bench`.
+
+
+if [ "$#" -eq 1 ]; then
+  rm -f ./build/lake.lock
+  lake build tactic_benchmark
+  parallel -j32 ./scripts/tactic_benchmark.sh $1 -- `cat lake-packages/mathlib/Mathlib.lean | sed -e 's/import //'`
+else
+  DIR=out/tactic_benchmark/${1#--}
+  mkdir -p $DIR
+  mod=$2
+  if [ ! -f $DIR/$mod.bench ]; then
+    echo $mod
+    if [ ! -f build/bin/tactic_benchmark ]; then
+      lake build tactic_benchmark
+    fi
+    timeout 5m build/bin/tactic_benchmark $1 $mod > $DIR/$mod._bench && mv $DIR/$mod._bench $DIR/$mod.bench
+  fi
+fi
diff --git a/scripts/tactic_benchmark_summary.sh b/scripts/tactic_benchmark_summary.sh
new file mode 100755
index 0000000..7547096
--- /dev/null
+++ b/scripts/tactic_benchmark_summary.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+## Writes a human readable summary of the tactics tested so far using `tactic_benchmark`.
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+mkdir -p $SCRIPT_DIR/../out/tactic_benchmark
+cd $SCRIPT_DIR/../out/tactic_benchmark
+
+tactics=`ls -d */ | cut -f1 -d'/'`
+
+for tac in $tactics; do
+  mkdir -p $tac
+  cd $tac
+  grep ✅ * | awk '{print $1 " " $2}' | sort > ../$tac.out
+  grep ❌ * | awk '{print $1 " " $2}' | sort > ../$tac.failure
+  grep -h "^\(✅\|❌\)" * | awk '{gsub(/[()s]/,""); s+=$3;}END{printf "%.0f\n",s}' > ../$tac.time
+  cd ..
+done
+
+for tac in $tactics; do
+  echo "Solved by $tac: " `cat $tac.out | wc -l` / `cat $tac.failure | wc -l` " in " `cat $tac.time`"s"
+  for other in $tactics; do
+    if [ "$tac" != "$other" ]; then
+      echo "  but not $other: " `comm -23 $tac.out $other.out | wc -l`
+    fi
+  done
+done
diff --git a/scripts/training_data.lean b/scripts/training_data.lean
new file mode 100644
index 0000000..4063d5e
--- /dev/null
+++ b/scripts/training_data.lean
@@ -0,0 +1,115 @@
+import TrainingData.Frontend
+import TrainingData.InfoTree.ToJson
+import TrainingData.InfoTree.TacticInvocation.Basic
+import Mathlib.Data.String.Defs
+import Mathlib.Util.Cli
+import Mathlib.Lean.CoreM
+import Cli
+
+open Lean Elab IO Meta
+open Cli System
+
+namespace Lean.Elab.TacticInvocation
+
+def verboseTrainingData (module : Name) (i : TacticInvocation) : IO String := do
+  let mut result := "===\n"
+  result := result ++ s!"{module}\n---\n"
+  let sourceUpToTactic := Substring.mk (← moduleSource module) 0 (i.info.stx.getPos?.getD 0)
+  let chunks := sourceUpToTactic.splitOn "\n\n"
+  let declUpToTactic := chunks.getLast!.toString
+  let offset := chunks.dropLast.foldl (init := 0) (fun t c => t + (c.toString.count '\n') + 2)
+  result := result ++ s!"{offset}\n---\n{declUpToTactic}\n---\n"
+  result := result ++ (Format.joinSep (← i.goalState) "\n").pretty ++ "\n---\n"
+  let ⟨⟨startLine, startCol⟩, ⟨endLine, endCol⟩⟩ := i.range
+  result := result ++ s!"{startLine}:{startCol}-{endLine}:{endCol}\n---\n"
+  result := result ++ (← i.pp).pretty ++ "\n---\n"
+  result := result ++ (Format.joinSep (← i.goalStateAfter) "\n").pretty ++ "\n---\n"
+  return result
+
+def proofStepData (i : TacticInvocation) : IO String := do
+  let mut result := "[GOAL]\n" ++ (Format.joinSep (← i.goalState) "\n").pretty ++ "\n[PROOFSTEP]\n" ++ (← i.pp).pretty
+
+  return result
+
+end Lean.Elab.TacticInvocation
+
+def trainingData (args : Cli.Parsed) : IO UInt32 := do
+    searchPathRef.set compileTimeSearchPath%
+    let module := args.positionalArg! "module" |>.as! Name
+    let mut trees ← moduleInfoTrees module
+    trees := trees.bind InfoTree.retainTacticInfo
+    trees := trees.bind InfoTree.retainOriginal
+    trees := trees.bind InfoTree.retainSubstantive
+    for t in trees do
+      for t in t.tactics do
+        if args.hasFlag "proofstep" then
+          IO.println (← t.proofStepData)
+        else
+          IO.println (← t.verboseTrainingData module)
+    return 0
+
+/-- Setting up command line options and help text for `lake exe training_data`. -/
+def training_data : Cmd := `[Cli|
+  training_data VIA trainingData; ["0.0.1"]
+"Export training data consisting of goal states and tactic invocations from the given file.
+
+The output consists of blocks of the form:
+```
+===
+Mathlib.Logic.Hydra
+---
+61
+---
+theorem cutExpand_le_invImage_lex [DecidableEq α] [IsIrrefl α r] :
+    CutExpand r ≤ InvImage (Finsupp.Lex (rᶜ ⊓ (· ≠ ·)) (· < ·)) toFinsupp := by
+
+---
+α : Type u_1
+r : α → α → Prop
+inst✝¹ : DecidableEq α
+inst✝ : IsIrrefl α r
+⊢ CutExpand r ≤ InvImage (Finsupp.Lex (rᶜ ⊓ fun x x_1 => x ≠ x_1) fun x x_1 => x < x_1) ↑toFinsupp
+---
+64:2-64:27
+---
+rintro s t ⟨u, a, hr, he⟩
+---
+case intro.intro.intro
+α : Type u_1
+r : α → α → Prop
+inst✝¹ : DecidableEq α
+inst✝ : IsIrrefl α r
+s t u : Multiset α
+a : α
+hr : ∀ (a' : α), a' ∈ u → r a' a
+he : s + {a} = t + u
+⊢ InvImage (Finsupp.Lex (rᶜ ⊓ fun x x_1 => x ≠ x_1) fun x x_1 => x < x_1) (↑toFinsupp) s t
+---
+```
+Here:
+* `Mathlib.Logic.Hydra` is the name of the module where this goal occurs.
+* `61` is the number of lines before the declaration (i.e. the `theorem` statement is on line `62`.)
+* `theorem ...` is the *partial* declaration, including a fragment of the tactic proof.
+* Next is the goal state at that point.
+  (Typically just one goal, as we don't report the goal states before structuring commands.)
+  Note that there is no guarantee that truncating the file at this point will actually cause Lean
+  to display this goal: the presence of earlier structuring commands could result in Lean showing
+  an error message instead.
+* `64:2-64:27` is the range of positions occupied by the tactic invocation in the original file.
+  This allows us to look up this goal in a live Lean session, so we can try out alternative tactics.
+* `rintro s t ⟨u, a, hr, he⟩` is the tactic used in the library.
+* After that is the goal state after running the tactic.
+  (Often multiple goals.)"
+
+  FLAGS:
+    "proofstep";       "Use the proofstep format: [GOAL]tactic-state[PROOFSTEP]next-tactic"
+
+  ARGS:
+    module : Name; "Lean module to compile and export training data."
+]
+
+/-- `lake exe training_data` -/
+def main (args : List String) : IO UInt32 :=
+  training_data.validate args
+
+-- See `scripts/training_data.sh` for a script to run this on all of Mathlib.
diff --git a/scripts/training_data.sh b/scripts/training_data.sh
new file mode 100755
index 0000000..a7f25e4
--- /dev/null
+++ b/scripts/training_data.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+
+## Generates pretty-printed descriptions of every goal/tactic pair.
+
+# See the help text in `lake exe training_data` for a description of the output format.
+
+# Run either as `scripts/training_data.sh` to run on all of Mathlib (several hours),
+# or `scripts/training_data.sh Mathlib.Logic.Hydra` to run on just one file.
+# Results will go in `build/training_data/Mathlib.Logic.Hydra.train`.
+
+FLAGS=()
+ARGS=()
+
+for arg in "$@"; do
+    if [[ $arg == --* ]]; then
+        FLAGS+=("$arg")
+    else
+        ARGS+=("$arg")
+    fi
+done
+
+if [ ${#ARGS[@]} -eq 0 ]; then
+  rm -f ./build/lake.lock
+  lake build training_data
+  parallel -j32 ./scripts/training_data.sh ${FLAGS[@]} -- `cat lake-packages/mathlib/Mathlib.lean | sed -e 's/import //'`
+else
+  DIR=out/training_data
+  mkdir -p $DIR
+  mod=${ARGS[0]}
+  if [ ! -f $DIR/$mod.train ]; then
+    echo $mod
+    if [ ! -f build/bin/training_data ]; then
+      lake build training_data
+    fi
+    build/bin/training_data ${FLAGS[@]} $mod > $DIR/$mod._train && mv $DIR/$mod._train $DIR/$mod.train
+  fi
+fi