From 704679c5ebd8fd00f063349aac96d6f874848e32 Mon Sep 17 00:00:00 2001 From: Scott Morrison Date: Mon, 21 Aug 2023 22:21:23 +1000 Subject: [PATCH] feat: initial commit of lean-training-data tools --- .gitignore | 5 + .vscode/settings.json | 10 + README.md | 257 ++++++++++++++++++ TrainingData.lean | 5 + TrainingData/Frontend.lean | 238 ++++++++++++++++ TrainingData/InfoTree/Basic.lean | 169 ++++++++++++ .../InfoTree/TacticInvocation/Basic.lean | 74 +++++ TrainingData/InfoTree/ToJson.lean | 114 ++++++++ TrainingData/Mathlib.lean | 1 + lake-manifest.json | 51 ++++ lakefile.lean | 30 ++ lean-toolchain | 1 + make.sh | 40 +++ scripts/declaration_types.lean | 27 ++ scripts/export_infotree.lean | 47 ++++ scripts/export_infotree.sh | 38 +++ scripts/premises.lean | 130 +++++++++ scripts/tactic_benchmark.lean | 150 ++++++++++ scripts/tactic_benchmark.sh | 23 ++ scripts/tactic_benchmark_summary.sh | 28 ++ scripts/training_data.lean | 115 ++++++++ scripts/training_data.sh | 37 +++ 22 files changed, 1590 insertions(+) create mode 100644 .gitignore create mode 100644 .vscode/settings.json create mode 100644 README.md create mode 100644 TrainingData.lean create mode 100644 TrainingData/Frontend.lean create mode 100644 TrainingData/InfoTree/Basic.lean create mode 100644 TrainingData/InfoTree/TacticInvocation/Basic.lean create mode 100644 TrainingData/InfoTree/ToJson.lean create mode 100644 TrainingData/Mathlib.lean create mode 100644 lake-manifest.json create mode 100644 lakefile.lean create mode 100644 lean-toolchain create mode 100644 make.sh create mode 100644 scripts/declaration_types.lean create mode 100644 scripts/export_infotree.lean create mode 100755 scripts/export_infotree.sh create mode 100644 scripts/premises.lean create mode 100644 scripts/tactic_benchmark.lean create mode 100755 scripts/tactic_benchmark.sh create mode 100755 scripts/tactic_benchmark_summary.sh create mode 100644 scripts/training_data.lean create mode 100755 scripts/training_data.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e8758e3 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +/build +/lake-packages +/.cache +.DS_Store +/out diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..89f72cb --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,10 @@ +{ + "editor.insertSpaces": true, + "editor.tabSize": 2, + "editor.rulers" : [100], + "files.encoding": "utf8", + "files.eol": "\n", + "files.insertFinalNewline": true, + "files.trimFinalNewlines": true, + "files.trimTrailingWhitespace": true, +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..8e96679 --- /dev/null +++ b/README.md @@ -0,0 +1,257 @@ +# Data extraction from Lean libraries + +This repository provides some tools for extracting data from Lean libraries (particularly Mathlib). +Most of the focus is data which may be useful for training ML models. +If you're looking for Lean training data and these tools don't give you what you want, please ask! +We would like to enable sharing of data and easy access to the Lean libraries for everyone. + +## Tools + +`declaration_types` +: For each declaration imported in the target file (e.g. `Mathlib`), + print the name and type of the declaration. + +`premises` +: For each declaration imported in a target file (e.g. `Mathlib`), + list all of its direct dependencies (i.e. constants referenced from its type or proof). + Constants appearing in explicit arguments are prefixed `*`, + and constants used by the simplifier are prefixed `s`. + +`training_data` +: Export all goal / tactic pairs from the library, with additional context. + `--proofstep` outputs in `[GOAL]...[PROOFSTEP]...` format. + +`tactic_benchmark` +: Run a tactic against every declaration in the library, tracking runtimes and successes. + +`export_infotree` +: Export the `InfoTree` for each module as JSON. + Filtering flags: `--tactics` (only tactic nodes) `--substantive` (no structuring tactics) and + `--original` (no synthetic syntax nodes, e.g. from macro expansions) + +## Usage instructions + +* `git clone https://github.com/semorrison/lean-training-data.git` + +* Install [`elan`](https://github.com/leanprover/elan) by running + +```shell +curl https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh -sSf | sh +``` + +* Go into the `lean-training-data` directory. +* Run `lake exe cache get` (this downloads precompiled binaries for `Mathlib`). +* Run `lake exe `, where `` is one of the programs documented below. + +## Detailed instructions + +### `declaration_types` + +`lake exe declaration_types Mathlib` will import `Mathlib`, and then print the names and types +of every declaration in the environment. + +Sample output: + +```lean +--- +theorem +TopologicalSpace.OpenNhds.map_id_obj +∀ {X : TopCat} (x : ↑X) (U : TopologicalSpace.OpenNhds (↑(CategoryTheory.CategoryStruct.id X) x)), + (TopologicalSpace.OpenNhds.map (CategoryTheory.CategoryStruct.id X) x).obj U = U +``` + +The first line is the declaration type (`theorem`, `definition`, `inductive`, etc), the second line +is the name, and subsequent lines are the type. Each block is separated by `---`. + +This takes about 16 minutes on my system (probably parallelizes if needed?) +producing a 45mb file, containing the types for ~190000 declarations in Mathlib/Std/Aesop/Qq/Cli. + +### `premises` + +`lake exe premises Mathlib` will calculate declaration dependencies up to a target file +(defaulting to all of Mathlib). + +* Declarations are separated by `---`. +* In each block the first declaration is the theorem or definition we are analyzing, +* Subsequent indented declarations are those used in its proof or definition. +* Declarations prefixed with a `*` appear in explicit arguments. + (This approximates "is visible in the pretty printed form".) +* Declarations prefixed with a `s` are used by the simplifier. + +Sample output: + +```lean +--- +List.toFinset.ext_iff +* congrArg + List.instMembershipList + Finset + Finset.instMembershipFinset +* Membership.mem + List.toFinset +* iff_self + List +* Iff +* congrFun +* congr + True +* of_eq_true + Eq +* Eq.trans + DecidableEq +* forall_congr +s List.mem_toFinset +s Finset.ext_iff +s propext +``` + +This takes about 4 minutes on my system, producing a 115mb file, +containing information for ~170000 declarations in Mathlib and Std. + +### `training_data` + +`lake exe export_infotree Mathlib.Logic.Hydra` will recompile the target module, +and output all the tactic invocations appearing in the file. + +The default output is a verbose human/machine readable format described below, +or the `--proofstep` flag just gives `[GOAL]...[PROOFSTEP]...` output +as used for training some models. + +TODO: Break out individual steps of `rw` and `simp_rw`, with intermediate goals. +This is easy to do, just needs some plumbing. + +The output consists of blocks of the form: + +```text +=== +Mathlib.Logic.Hydra +--- +61 +--- +theorem cutExpand_le_invImage_lex [DecidableEq α] [IsIrrefl α r] : + CutExpand r ≤ InvImage (Finsupp.Lex (rᶜ ⊓ (· ≠ ·)) (· < ·)) toFinsupp := by + +--- +α : Type u_1 +r : α → α → Prop +inst✝¹ : DecidableEq α +inst✝ : IsIrrefl α r +⊢ CutExpand r ≤ InvImage (Finsupp.Lex (rᶜ ⊓ fun x x_1 => x ≠ x_1) fun x x_1 => x < x_1) ↑toFinsupp +--- +64:2-64:27 +--- +rintro s t ⟨u, a, hr, he⟩ +--- +case intro.intro.intro +α : Type u_1 +r : α → α → Prop +inst✝¹ : DecidableEq α +inst✝ : IsIrrefl α r +s t u : Multiset α +a : α +hr : ∀ (a' : α), a' ∈ u → r a' a +he : s + {a} = t + u +⊢ InvImage (Finsupp.Lex (rᶜ ⊓ fun x x_1 => x ≠ x_1) fun x x_1 => x < x_1) (↑toFinsupp) s t +--- +``` + +Here: + +* `Mathlib.Logic.Hydra` is the name of the module where this goal occurs. +* `61` is the number of lines before the declaration (i.e. the `theorem` statement is on line `62`.) +* `theorem ...` is the *partial* declaration, including a fragment of the tactic proof. +* Next is the goal state at that point. + (Typically just one goal, as we don't report the goal states before structuring commands.) + Note that there is no guarantee that truncating the file at this point will actually cause Lean + to display this goal: the presence of earlier structuring commands could result in Lean showing + an error message instead. +* `64:2-64:27` is the range of positions occupied by the tactic invocation in the original file. + This allows us to look up this goal in a live Lean session, so we can try out alternative tactics. +* `rintro s t ⟨u, a, hr, he⟩` is the tactic used in the library. +* After that is the goal state after running the tactic. + (Often multiple goals.)" + +There is also `scripts/training_data.sh`, which will run in parallel over all of Mathlib, +recording results in `out/training_data/`. + +### `tactic_benchmark` + +`lake exe tactic_benchmark --aesop Mathlib.Topology.ContinuousFunction.Ordered` +will run `aesop` on the type of each +declaration in the target module, reporting success or failure, +and the runtime (in seconds and heartbeats). + +Sample output: + +```shell +% lake exe tactic_benchmark --aesop Mathlib.Topology.ContinuousFunction.Ordered +Mathlib.Topology.ContinuousFunction.Ordered 7 21 +❌ ContinuousMap.abs (0.006912s) (111 heartbeats) +❌ ContinuousMap.instAbsContinuousMap (0.007926s) (115 heartbeats) +✅ ContinuousMap.abs_apply (0.027873s) (401 heartbeats) +❌ ContinuousMap.partialOrder (0.008932s) (105 heartbeats) +✅ ContinuousMap.le_def (0.071422s) (984 heartbeats) +❌ ContinuousMap.lt_def (0.162438s) (2284 heartbeats) +❌ ContinuousMap.sup (0.012593s) (118 heartbeats) +✅ ContinuousMap.sup_coe (0.204213s) (2838 heartbeats) +✅ ContinuousMap.sup_apply (0.201279s) (2799 heartbeats) +❌ ContinuousMap.semilatticeSup (0.007805s) (119 heartbeats) +❌ ContinuousMap.inf (0.010117s) (119 heartbeats) +✅ ContinuousMap.inf_coe (0.175238s) (2798 heartbeats) +✅ ContinuousMap.inf_apply (0.174505s) (2683 heartbeats) +❌ ContinuousMap.semilatticeInf (0.007189s) (119 heartbeats) +❌ ContinuousMap.instLatticeContinuousMap (0.007780s) (119 heartbeats) +❌ ContinuousMap.sup'_apply (0.049451s) (727 heartbeats) +❌ ContinuousMap.sup'_coe (0.069892s) (1101 heartbeats) +❌ ContinuousMap.inf'_apply (0.049896s) (728 heartbeats) +❌ ContinuousMap.inf'_coe (0.070128s) (1102 heartbeats) +❌ ContinuousMap.IccExtend (0.030286s) (466 heartbeats) +✅ ContinuousMap.coe_IccExtend (0.148204s) (2390 heartbeats) +``` + +Currently supported flags are `--aesop`, `--simp_all` (which runs `intros; simp_all`), +`--rfl` (which runs `intros; rfl`), and `--exact` (which runs `exact?`), +but it is trivial to add more by editing `tactic_benchmark.lean`. + +There is also `scripts/tactic_benchmark.sh`, which will run in parallel over all of Mathlib, +recording results in `out/tactic_benchmark/`. + +After you've run it, `scripts/tactic_benchmark_summary.sh` will report the success rates, +as well as the differential success rates +(e.g. number of goals solved by `simp_all` but not by `aesop`). + +TODO: run on all tactic goals in the library, not just the types of declarations. +Should not be difficult given the existing components. + +### `export_infotree` + +`lake exe export_infotree Mathlib.Logic.Hydra` will recompile the target module, +extract the `InfoTree`s, and then write these out as JSON to stdout. +The JSON contains pretty-printed goals before and after every tactic invocation, +and the pretty-printed syntax of every tactic invocation, and explicitly constructed term. + +There is also `scripts/export_infotree.sh`, which will run in parallel over all of Mathlib, +recording results in `out/export_infotree/`. + +If you need to use this tool, +consider modifying one of the other tools to give you directly what you want! + +## See also + +### LeanDojo + +[LeanDojo] provides similar tools. + +I like that the tools provided here are standalone tools provided separately from any model or benchmark. +They are "pure lean" (plus a little bash scripting), and may be useful models for anyone interested +in metaprogramming tools for examining compiled Lean code. + +### Releases + +You can find a downloadable copy of the output of all these tools under `Releases`. +Please ping me if you'd like these to be updated. (We could run a CI job.) + +### Derivatives + +If you use these tools or the downloadable releases to prepare other publicly available datasets +(e.g. train/test splits) or models, please reference this repository to help others find it. diff --git a/TrainingData.lean b/TrainingData.lean new file mode 100644 index 0000000..b2bc35b --- /dev/null +++ b/TrainingData.lean @@ -0,0 +1,5 @@ +import TrainingData.Frontend +import TrainingData.InfoTree.Basic +import TrainingData.InfoTree.ToJson +import TrainingData.InfoTree.TacticInvocation.Basic +import TrainingData.Mathlib \ No newline at end of file diff --git a/TrainingData/Frontend.lean b/TrainingData/Frontend.lean new file mode 100644 index 0000000..b97a871 --- /dev/null +++ b/TrainingData/Frontend.lean @@ -0,0 +1,238 @@ +/- +Copyright (c) 2023 Scott Morrison. All rights reserved. +Released under Apache 2.0 license as described in the file LICENSE. +Authors: Scott Morrison +-/ +import Lean.Elab.Frontend +import Std.Util.TermUnsafe +import Std.Data.MLList.Basic + +/-! +# Compiling Lean sources to obtain `Environment`, `Message`s and `InfoTree`s. + +The main entry point is + +``` +def processInput (input : String) (env? : Option Environment := none) + (opts : Options := {}) (fileName : Option String := none) (info : Bool := true) : + IO (Environment × List Message × List InfoTree) := + ... +``` + +which attempts to compile Lean source code, returning an `Environment`, +along with any generated `Message`s and `InfoTree`s. + +The optional argument `env?` allows specifying an existing `Environment`, for partial compilation. +If this is non-empty, then the source code may not contain any `import` statements. + +You may suppress the generation of `InfoTree`s using `info := false`. + +For finer-grained control of compilation, we define a `CompilationStep` structure +which contains information about the results of each command. + +You can use `processInput'` to obtain a monadic lazy list of `CompilationStep`s. + +The functions `compileModule : Name → IO (List CompilationStep)` and +`moduleInfoTrees : Name → IO (List InfoTree)` are useful for compiling single modules from source. +-/ + +set_option autoImplicit true + +open Lean Elab Frontend + +namespace Lean.PersistentArray + +/-- +Drop the first `n` elements of a `PersistentArray`, returning the results as a `List`. +-/ +-- We can't remove the `[Inhabited α]` hypotheses here until +-- `PersistentArray`'s `GetElem` instance also does. +def drop [Inhabited α] (t : PersistentArray α) (n : Nat) : List α := + List.range (t.size - n) |>.map fun i => t.get! (n + i) + +end Lean.PersistentArray + +namespace MLList + +/-- Run a lazy list in a `ReaderT` monad on some fixed state. -/ +partial def runReaderT [Monad m] (L : MLList (ReaderT.{u, u} ρ m) α) (r : ρ) : MLList m α := + squash fun _ => + return match ← (uncons L).run r with + | none => nil + | some (a, L') => cons a (L'.runReaderT r) + +/-- Run a lazy list in a `StateRefT'` monad on some initial state. -/ +partial def runStateRefT [Monad m] [MonadLiftT (ST ω) m] (L : MLList (StateRefT' ω σ m) α) (s : σ) : + MLList m α := + squash fun _ => + return match ← (uncons L).run s with + | (none, _) => nil + | (some (a, L'), s') => cons a (L'.runStateRefT s') + +end MLList + +namespace Lean.Elab.IO + +/-- +Results from processing a command. + +Contains the `Environment` before and after, +the `src : Substring` and `stx : Syntax` of the command, +and any `Message`s and `InfoTree`s produced while processing. +-/ +structure CompilationStep where + src : Substring + stx : Syntax + before : Environment + after : Environment + msgs : List Message + trees : List InfoTree + +namespace CompilationStep + +/-- +Process one command, returning a `CompilationStep` and +`done : Bool`, indicating whether this was the last command. +-/ +def one : FrontendM (CompilationStep × Bool) := do + let s := (← get).commandState + let beforePos := (← get).cmdPos + let before := s.env + let done ← processCommand + let stx := (← get).commands.back + let src := ⟨(← read).inputCtx.input, beforePos, (← get).cmdPos⟩ -- FIXME this is incorrect + let s' := (← get).commandState + let after := s'.env + let msgs := s'.messages.msgs.drop s.messages.msgs.size + let trees := s'.infoState.trees.drop s.infoState.trees.size + return ({ src, stx, before, after, msgs, trees }, done) + +/-- Process all commands in the input. -/ +partial def all : FrontendM (List CompilationStep) := do + let (cmd, done) ← CompilationStep.one + if done then + return [cmd] + else + return cmd :: (← all) + +/-- Return all new `ConstantInfo`s added during the processed command. -/ +def diff (cmd : CompilationStep) : List ConstantInfo := + cmd.after.constants.map₂.toList.filterMap + fun (c, i) => if cmd.before.constants.map₂.contains c then none else some i + +end CompilationStep + +/-- +Returns a monadic lazy list of `CompilationStep`s. +This needs to be provided with initial state, see `compilationSteps`. +-/ +partial def compilationSteps_aux : MLList FrontendM CompilationStep := + .squash fun _ => aux +where + /-- Implementation of `compilationSteps_aux`. -/ + aux := do + let (cmd, done) ← CompilationStep.one + if done then + return .ofList [cmd] + else + return .cons cmd (← aux) + +/-- Return the the `CompilationStep`s, as a monadic lazy list in `IO`. -/ +def compilationSteps (inputCtx : Parser.InputContext) (parserState : Parser.ModuleParserState) + (commandState : Command.State) : MLList IO CompilationStep := + compilationSteps_aux.runReaderT { inputCtx } + |>.runStateRefT { commandState, parserState, cmdPos := parserState.pos } + +/-- +Process some text input, with or without an existing environment. +If there is no existing environment, we parse the input for headers (e.g. import statements), +and create a new environment. +Otherwise, we add to the existing environment. +Returns a list containing data about each processed command. + +Be aware that Lean does not support compiling multiple files in the same sessions. +Often it works, but if the compiled files do anything complicated with initializers then +nothing is gauranteed. +-/ +def processInput' (input : String) (env? : Option Environment := none) + (opts : Options := {}) (fileName : Option String := none) (info : Bool := true) : + MLList IO CompilationStep := unsafe do + let fileName := fileName.getD "" + let inputCtx := Parser.mkInputContext input fileName + let (parserState, commandState) ← match env? with + | none => do + enableInitializersExecution + let (header, parserState, messages) ← Parser.parseHeader inputCtx + let (env, messages) ← processHeader header opts messages inputCtx + pure (parserState, (Command.mkState env messages opts)) + | some env => do + pure ({ : Parser.ModuleParserState }, Command.mkState env {} opts) + compilationSteps inputCtx parserState { commandState with infoState.enabled := info } + +/-- +Process some text input, with or without an existing environment. +If there is no existing environment, we parse the input for headers (e.g. import statements), +and create a new environment. +Otherwise, we add to the existing environment. +Returns the resulting environment, along with a list of messages and info trees. +-/ +def processInput (input : String) (env? : Option Environment := none) + (opts : Options := {}) (fileName : Option String := none) (info : Bool := true) : + IO (Environment × List Message × List InfoTree) := do + let steps ← processInput' input env? opts fileName info |>.force + match steps.getLast? with + | none => throw <| IO.userError "No commands found in input." + | some { after, .. } => + return (after, steps.bind CompilationStep.msgs, steps.bind CompilationStep.trees) + +open System + +-- TODO allow finding Lean 4 sources from the toolchain. +def findLean (mod : Name) : IO FilePath := do + return FilePath.mk ((← findOLean mod).toString.replace "build/lib/" "") |>.withExtension "lean" + +/-- Implementation of `moduleSource`, which is the cached version of this function. -/ +def moduleSource' (mod : Name) : IO String := do + IO.FS.readFile (← findLean mod) + +initialize sourceCache : IO.Ref <| HashMap Name String ← + IO.mkRef .empty + +/-- Read the source code of the named module. The results are cached. -/ +def moduleSource (mod : Name) : IO String := do + let m ← sourceCache.get + match m.find? mod with + | some r => return r + | none => do + let v ← moduleSource' mod + sourceCache.set (m.insert mod v) + return v + +/-- Implementation of `compileModule`, which is the cached version of this function. -/ +def compileModule' (mod : Name) : MLList IO CompilationStep := do + Lean.Elab.IO.processInput' (← moduleSource mod) none {} (← findLean mod).toString + +initialize compilationCache : IO.Ref <| HashMap Name (List CompilationStep) ← + IO.mkRef .empty + +/-- +Compile the source file for the named module, returning the +resulting environment, any generated messages, and all info trees. + +The results are cached, although be aware that compiling multiple files in the same session +is unsupported, and may lead to exciting results: +you should check all compiled files for error messages if attempting this. +-/ +def compileModule (mod : Name) : IO (List CompilationStep) := do + let m ← compilationCache.get + match m.find? mod with + | some r => return r + | none => do + let v ← compileModule' mod |>.force + compilationCache.set (m.insert mod v) + return v + +/-- Compile the source file for the named module, returning all info trees. -/ +def moduleInfoTrees (mod : Name) : IO (List InfoTree) := do + let steps ← compileModule mod + return steps.bind (fun c => c.trees) diff --git a/TrainingData/InfoTree/Basic.lean b/TrainingData/InfoTree/Basic.lean new file mode 100644 index 0000000..e107880 --- /dev/null +++ b/TrainingData/InfoTree/Basic.lean @@ -0,0 +1,169 @@ +import Lean + +open Lean Elab + +namespace Lean.FileMap + +/-- Extract the range of a `Syntax` expressed as lines and columns. -/ +-- Extracted from the private declaration `Lean.Elab.formatStxRange`, +-- in `Lean.Elab.InfoTree.Main`. +def stxRange (fileMap : FileMap) (stx : Syntax) : Position × Position := + let pos := stx.getPos?.getD 0 + let endPos := stx.getTailPos?.getD pos + (fileMap.toPosition pos, fileMap.toPosition endPos) + +end Lean.FileMap + +namespace Lean.Elab.Info + +/-- The type of a `Lean.Elab.Info`, as a string. -/ +def kind : Info → String + | .ofTacticInfo _ => "TacticInfo" + | .ofTermInfo _ => "TermInfo" + | .ofCommandInfo _ => "CommmandInfo" + | .ofMacroExpansionInfo _ => "MacroExpansionInfo" + | .ofOptionInfo _ => "OptionInfo" + | .ofFieldInfo _ => "FieldInfo" + | .ofCompletionInfo _ => "CompletionInfo" + | .ofUserWidgetInfo _ => "UserWidgetInfo" + | .ofCustomInfo _ => "CustomInfo" + | .ofFVarAliasInfo _ => "FVarAliasInfo" + | .ofFieldRedeclInfo _ => "FieldRedeclInfo" + +/-- The `Syntax` for a `Lean.Elab.Info`, if there is one. -/ +def stx? : Info → Option Syntax + | .ofTacticInfo info => info.stx + | .ofTermInfo info => info.stx + | .ofCommandInfo info => info.stx + | .ofMacroExpansionInfo info => info.stx + | .ofOptionInfo info => info.stx + | .ofFieldInfo info => info.stx + | .ofCompletionInfo info => info.stx + | .ofUserWidgetInfo info => info.stx + | .ofCustomInfo info => info.stx + | .ofFVarAliasInfo _ => none + | .ofFieldRedeclInfo info => info.stx + +/-- Is the `Syntax` for this `Lean.Elab.Info` original, or synthetic? -/ +def isOriginal (i : Info) : Bool := + match i.stx? with + | none => true -- Somewhat unclear what to do with `FVarAliasInfo`, so be conservative. + | some stx => match stx.getHeadInfo with + | .original .. => true + | _ => false + +end Lean.Elab.Info + +namespace Lean.Elab.ContextInfo + +/-- Pretty print an expression in the given `ContextInfo` with the given `LocalContext`. -/ +def ppExpr (ctx : ContextInfo) (lctx : LocalContext) (e : Expr) : IO Format := + ctx.runMetaM lctx (do Meta.ppExpr (← instantiateMVars e)) + +end Lean.Elab.ContextInfo + +namespace Lean.Elab.TacticInfo + +/-- Find the name for the outermost `Syntax` in this `TacticInfo`. -/ +def name? (t : TacticInfo) : Option Name := + match t.stx with + | Syntax.node _ n _ => some n + | _ => none + +/-- Decide whether a tactic is "substantive", +or is merely a tactic combinator (e.g. `by`, `;`, multiline tactics, parenthesized tactics). -/ +def isSubstantive (t : TacticInfo) : Bool := + match t.name? with + | none => false + | some `null => false + | some ``cdot => false + | some ``cdotTk => false + | some ``Lean.Parser.Term.byTactic => false + | some ``Lean.Parser.Tactic.tacticSeq => false + | some ``Lean.Parser.Tactic.tacticSeq1Indented => false + | some ``Lean.Parser.Tactic.«tactic_<;>_» => false + | some ``Lean.Parser.Tactic.paren => false + | _ => true + +end Lean.Elab.TacticInfo + +namespace Lean.Elab.InfoTree + +/-- +Keep `.node` nodes and `.hole` nodes satisfying predicates. + +Returns a `List InfoTree`, although in most situations this will be a singleton. +-/ +partial def filter (p : Info → Bool) (m : MVarId → Bool := fun _ => false) : + InfoTree → List InfoTree + | .context ctx tree => tree.filter p m |>.map (.context ctx) + | .node info children => + if p info then + [.node info (children.toList.map (filter p m)).join.toPArray'] + else + (children.toList.map (filter p m)).join + | .hole mvar => if m mvar then [.hole mvar] else [] + +/-- Discard all nodes besides `.context` nodes and `TacticInfo` nodes. -/ +partial def retainTacticInfo (tree : InfoTree) : List InfoTree := + tree.filter fun | .ofTacticInfo _ => true | _ => false + +/-- Retain only nodes with "original" syntax. -/ +partial def retainOriginal (tree : InfoTree) : List InfoTree := + tree.filter Info.isOriginal + +/-- Discard all TacticInfo nodes that are tactic combinators or structuring tactics. -/ +-- There is considerable grey area here: what to do with `classical`? +partial def retainSubstantive (tree : InfoTree) : List InfoTree := + tree.filter fun | .ofTacticInfo i => i.isSubstantive | _ => true + +/-- Discard any enclosing `InfoTree.context` layers. -/ +def consumeContext : InfoTree → InfoTree + | .context _ t => t.consumeContext + | t => t + +/-- Is this `InfoTree` a `TermInfo` for some `Expr`? -/ +def ofExpr? (i : InfoTree) : Option Expr := match i with + | .node (.ofTermInfo i) _ => some i.expr + | _ => none + +/-- Is this `InfoTree` a `TermInfo` for some `Name`? -/ +def ofName? (i : InfoTree) : Option Name := i.ofExpr?.bind Expr.constName? + +/-- Check if the `InfoTree` is the top level `InfoTree` for a declaration, +if so, return it along with the declaration name. -/ +def elabDecl? (t : InfoTree) : Option Name := + match t.consumeContext with + | .node (.ofCommandInfo i) c => + if i.elaborator == `Lean.Elab.Command.elabDeclaration + then + -- this is hacky: we are relying on the ordering of the child nodes. + c.toList.foldr (fun cc acc => match (cc.consumeContext.ofName?, acc) with + | (_, some r) => some r + | (some n, none) => some n + | (none, none) => none ) + none + else + none + | _ => none + +/-- Analogue of `Lean.Elab.InfoTree.findInfo?`, but that returns a list of all results. -/ +partial def findAllInfo (t : InfoTree) (ctx : Option ContextInfo) (p : Info → Bool) : + List (Info × Option ContextInfo × PersistentArray InfoTree) := + match t with + | context ctx t => t.findAllInfo ctx p + | node i ts => + (if p i then [(i, ctx, ts)] else []) ++ ts.toList.bind (fun t => t.findAllInfo ctx p) + | _ => [] + +/-- Return all `TacticInfo` nodes in an `InfoTree` corresponding to tactics, +each equipped with its relevant `ContextInfo`, and any children info trees. -/ +def findTacticNodes (t : InfoTree) : List (TacticInfo × ContextInfo × PersistentArray InfoTree) := + let infos := t.findAllInfo none fun i => match i with + | .ofTacticInfo _ => true + | _ => false + infos.filterMap fun p => match p with + | (.ofTacticInfo i, some ctx, children) => (i, ctx, children) + | _ => none + +end Lean.Elab.InfoTree diff --git a/TrainingData/InfoTree/TacticInvocation/Basic.lean b/TrainingData/InfoTree/TacticInvocation/Basic.lean new file mode 100644 index 0000000..0874700 --- /dev/null +++ b/TrainingData/InfoTree/TacticInvocation/Basic.lean @@ -0,0 +1,74 @@ +import TrainingData.InfoTree.Basic + +set_option autoImplicit true + +open Lean Elab + +/-- +A helper structure containing a `TacticInfo` and `ContextInfo`, +along with children `InfoTree`s. + +It is convenient to bundle these together because +many functions rely on both the `TacticInfo` and the `ContextInfo`. +-/ +structure Lean.Elab.TacticInvocation where + info : TacticInfo + ctx : ContextInfo + children : PersistentArray InfoTree + +namespace Lean.Elab.TacticInvocation + +/-- Return the range of the tactic, as a pair of file positions. -/ +def range (t : TacticInvocation) : Position × Position := t.ctx.fileMap.stxRange t.info.stx + +/-- Pretty print a tactic. -/ +def pp (t : TacticInvocation) : IO Format := + t.ctx.runMetaM {} try + Lean.PrettyPrinter.ppTactic ⟨t.info.stx⟩ + catch _ => + pure "" + +open Meta + +/-- Run a tactic on the goals stored in a `TacticInvocation`. -/ +def runMetaMGoalsBefore (t : TacticInvocation) (x : List MVarId → MetaM α) : IO α := do + t.ctx.runMetaM {} <| Meta.withMCtx t.info.mctxBefore <| x t.info.goalsBefore + +/-- Run a tactic on the after goals stored in a `TacticInvocation`. -/ +def runMetaMGoalsAfter (t : TacticInvocation) (x : List MVarId → MetaM α) : IO α := do + t.ctx.runMetaM {} <| Meta.withMCtx t.info.mctxAfter <| x t.info.goalsAfter + +/-- Run a tactic on the main goal stored in a `TacticInvocation`. -/ +def runMetaM (t : TacticInvocation) (x : MVarId → MetaM α) : IO α := do + match t.info.goalsBefore.head? with + | none => throw <| IO.userError s!"No goals at {← t.pp}" + | some g => t.runMetaMGoalsBefore fun _ => do g.withContext <| x g + +def mainGoal (t : TacticInvocation) : IO Expr := + t.runMetaM (fun g => do instantiateMVars (← g.getType)) + +def formatMainGoal (t : TacticInvocation) : IO Format := + t.runMetaM (fun g => do ppExpr (← instantiateMVars (← g.getType))) + +def goalState (t : TacticInvocation) : IO (List Format) := do + t.runMetaMGoalsBefore (fun gs => gs.mapM fun g => do Meta.ppGoal g) + +def goalStateAfter (t : TacticInvocation) : IO (List Format) := do + t.runMetaMGoalsAfter (fun gs => gs.mapM fun g => do Meta.ppGoal g) + +def ppExpr (t : TacticInvocation) (e : Expr) : IO Format := + t.runMetaM (fun _ => do Meta.ppExpr (← instantiateMVars e)) + +end Lean.Elab.TacticInvocation + +namespace Lean.Elab.InfoTree + +/-- +Finds all tactic invocations in an `InfoTree`, +ignoring structuring tactics (e.g. `by`, `;`, multiline tactics, parenthesized tactics). +-/ +def tactics (t : InfoTree) : List TacticInvocation := + t.findTacticNodes.map (fun ⟨i, ctx, children⟩ => ⟨i, ctx, children⟩) + |>.filter fun i => i.info.isSubstantive + +end Lean.Elab.InfoTree diff --git a/TrainingData/InfoTree/ToJson.lean b/TrainingData/InfoTree/ToJson.lean new file mode 100644 index 0000000..65ab8cb --- /dev/null +++ b/TrainingData/InfoTree/ToJson.lean @@ -0,0 +1,114 @@ +import TrainingData.InfoTree.TacticInvocation.Basic + +/-! +# Exporting an `InfoTree` as Json + +-/ + +namespace Lean.Elab + +structure InfoTreeNode (α : Type) where + kind : String + node : Option α + children : List Json +deriving ToJson + +deriving instance ToJson for Lean.Position + +structure Syntax.Range where + synthetic : Bool + start : Lean.Position + finish : Lean.Position +deriving ToJson + +structure Syntax.Json where + pp : Option String + -- raw : String + range : Range +deriving ToJson + +def _root_.Lean.Syntax.toRange (stx : Syntax) (ctx : ContextInfo) : Syntax.Range := + let pos := stx.getPos?.getD 0 + let endPos := stx.getTailPos?.getD pos + { start := ctx.fileMap.toPosition pos + finish := ctx.fileMap.toPosition endPos + synthetic := match stx.getHeadInfo with + | .original .. => false + | _ => true } + +def _root_.Lean.Syntax.toJson (stx : Syntax) (ctx : ContextInfo) (lctx : LocalContext) : IO Syntax.Json := do + return { + pp := match (← ctx.ppSyntax lctx stx).pretty with + | "failed to pretty print term (use 'set_option pp.rawOnError true' for raw representation)" => none + | pp => some pp + -- raw := toString stx + range := stx.toRange ctx } + +structure TacticInfo.Json where + name : Option Name + stx : Syntax.Json + goalsBefore : List String + goalsAfter : List String +deriving ToJson + +-- Note: this is not responsible for converting the children to Json. +def TacticInfo.toJson (i : TacticInfo) (ctx : ContextInfo) : IO TacticInfo.Json := do + let I : TacticInvocation := ⟨i, ctx, .empty⟩ + return { + name := i.name? + stx := + { pp := Format.pretty (← I.pp), + -- raw := toString i.info.stx, + range := i.stx.toRange ctx }, + goalsBefore := (← I.goalState).map Format.pretty, + goalsAfter := (← I.goalStateAfter).map Format.pretty } + +structure CommandInfo.Json where + elaborator : Option Name + stx : Syntax.Json +deriving ToJson + +def CommandInfo.toJson (info : CommandInfo) (ctx : ContextInfo) : IO CommandInfo.Json := do + return { + elaborator := match info.elaborator with | .anonymous => none | n => some n, + stx := ← info.stx.toJson ctx {} } + +structure TermInfo.Json where + elaborator : Option Name + stx : Syntax.Json + expectedType? : Option String + expr : String + isBinder : Bool +deriving ToJson + +def TermInfo.toJson (info : TermInfo) (ctx : ContextInfo) : IO TermInfo.Json := do + return { + elaborator := match info.elaborator with | .anonymous => none | n => some n, + stx := ← info.stx.toJson ctx info.lctx, + expectedType? := ← info.expectedType?.mapM fun ty => do + pure (← ctx.ppExpr info.lctx ty).pretty + expr := (← ctx.ppExpr info.lctx info.expr).pretty + isBinder := info.isBinder } + +structure InfoTree.HoleJson where + goalState : String +deriving ToJson + +partial def InfoTree.toJson (t : InfoTree) (ctx? : Option ContextInfo) : IO Json := do + match t with + | .context i t => t.toJson i + | .node info children => + if let some ctx := ctx? then + let node : Option Json ← match info with + | .ofTermInfo info => some <$> (do pure <| Lean.toJson (← info.toJson ctx)) + | .ofCommandInfo info => some <$> (do pure <| Lean.toJson (← info.toJson ctx)) + | .ofTacticInfo info => some <$> (do pure <| Lean.toJson (← info.toJson ctx)) + | _ => pure none + return Lean.toJson (InfoTreeNode.mk info.kind node (← children.toList.mapM fun t' => t'.toJson ctx)) + else throw <| IO.userError "No `ContextInfo` available." + | .hole mvarId => + if let some ctx := ctx? then + return Lean.toJson (InfoTree.HoleJson.mk (← ctx.runMetaM {} (do Meta.ppGoal mvarId)).pretty) + else throw <| IO.userError "No `ContextInfo` available." + +end Lean.Elab diff --git a/TrainingData/Mathlib.lean b/TrainingData/Mathlib.lean new file mode 100644 index 0000000..befa7a0 --- /dev/null +++ b/TrainingData/Mathlib.lean @@ -0,0 +1 @@ +import Mathlib \ No newline at end of file diff --git a/lake-manifest.json b/lake-manifest.json new file mode 100644 index 0000000..f95ec2a --- /dev/null +++ b/lake-manifest.json @@ -0,0 +1,51 @@ +{"version": 5, + "packagesDir": "lake-packages", + "packages": + [{"git": + {"url": "https://github.com/EdAyers/ProofWidgets4", + "subDir?": null, + "rev": "a0c2cd0ac3245a0dade4f925bcfa97e06dd84229", + "opts": {}, + "name": "proofwidgets", + "inputRev?": "v0.0.13", + "inherited": true}}, + {"git": + {"url": "https://github.com/mhuisi/lean4-cli.git", + "subDir?": null, + "rev": "5a858c32963b6b19be0d477a30a1f4b6c120be7e", + "opts": {}, + "name": "Cli", + "inputRev?": "nightly", + "inherited": true}}, + {"git": + {"url": "https://github.com/leanprover-community/mathlib4.git", + "subDir?": null, + "rev": "c779bd594f3dbfeb8d26e4ca968eb7b3831f369e", + "opts": {}, + "name": "mathlib", + "inputRev?": "withImportModules", + "inherited": false}}, + {"git": + {"url": "https://github.com/gebner/quote4", + "subDir?": null, + "rev": "81cc13c524a68d0072561dbac276cd61b65872a6", + "opts": {}, + "name": "Qq", + "inputRev?": "master", + "inherited": true}}, + {"git": + {"url": "https://github.com/JLimperg/aesop", + "subDir?": null, + "rev": "086c98bb129ca856381d4414dc0afd6e3e4ae2ef", + "opts": {}, + "name": "aesop", + "inputRev?": "master", + "inherited": true}}, + {"git": + {"url": "https://github.com/leanprover/std4", + "subDir?": null, + "rev": "61a6507eff4dd71c01e0c7e194bc569d4719a6d3", + "opts": {}, + "name": "std", + "inputRev?": "main", + "inherited": true}}]} diff --git a/lakefile.lean b/lakefile.lean new file mode 100644 index 0000000..a2d39de --- /dev/null +++ b/lakefile.lean @@ -0,0 +1,30 @@ +import Lake +open Lake DSL + +package «lean-training-data» { + -- add any package configuration options here +} + +require mathlib from git + "https://github.com/leanprover-community/mathlib4.git" @ "withImportModules" + +@[default_target] +lean_lib TrainingData where + +lean_exe tactic_benchmark where + root := `scripts.tactic_benchmark + supportInterpreter := true + +lean_exe export_infotree where + root := `scripts.export_infotree + supportInterpreter := true + +lean_exe training_data where + root := `scripts.training_data + supportInterpreter := true + +lean_exe premises where + root := `scripts.premises + +lean_exe declaration_types where + root := `scripts.declaration_types diff --git a/lean-toolchain b/lean-toolchain new file mode 100644 index 0000000..1b34910 --- /dev/null +++ b/lean-toolchain @@ -0,0 +1 @@ +leanprover/lean4:nightly-2023-08-19 diff --git a/make.sh b/make.sh new file mode 100644 index 0000000..fd8a446 --- /dev/null +++ b/make.sh @@ -0,0 +1,40 @@ +# This script prepares the following files: +# * YYYY-MM-DD-declaration_types.gz +# * YYYY-MM-DD-premises.gz +# * YYYY-MM-DD-export_infotree_full.tgz +# * YYYY-MM-DD-export_infotree_min.tgz +# * YYYY-MM-DD-training_data.tgz +# * YYYY-MM-DD-proof_step.tgz + +DATE=`date "+%Y-%m-%d"` + +lake exe declaration_types > out/$DATE-declaration_types +gzip out/$DATE-declaration_types + +lake exe premises > out/$DATE-premises +gzip out/$DATE-premises + +rm -rf out/export_infotree +./scripts/export_infotree.sh +cd out +tar cvzf $DATE-export_infotree_full.tgz export_infotree +cd .. +rm -rf out/export_infotree +./scripts/export_infotree.sh --tactics --original --substantive +cd out +tar cvzf $DATE-export_infotree_min.tgz export_infotree +cd .. + +rm -rf out/training_data +./scripts/training_data.sh +cd out +tar cvzf $DATE-training_data.tgz training_data +cd .. +rm -rf out/training_data +./scripts/training_data.sh --proofstep +cd out +tar cvzf $DATE-proof_step.tgz training_data +cd .. + +rm -rf out/tactic_benchmark +./scripts/tactic_benchmark.sh --aesop diff --git a/scripts/declaration_types.lean b/scripts/declaration_types.lean new file mode 100644 index 0000000..11eecae --- /dev/null +++ b/scripts/declaration_types.lean @@ -0,0 +1,27 @@ +import Mathlib.Lean.CoreM + +open Lean Meta + +def Lean.ConstantInfo.kind : ConstantInfo → String + | .axiomInfo _ => "axiom" + | .defnInfo _ => "def" + | .thmInfo _ => "theorem" + | .opaqueInfo _ => "opaque" + | .quotInfo _ => "quot" -- Not sure what this is! + | .inductInfo _ => "inductive" + | .ctorInfo _ => "constructor" + | .recInfo _ => "recursor" + +def main (args : List String) : IO UInt32 := do + let modules := match args with + | [] => [`Mathlib] + | args => args.map fun s => s.toName + searchPathRef.set compileTimeSearchPath% + CoreM.withImportModules modules do + for (n, c) in (← getEnv).constants.map₁ do + if ! (← n.isBlackListed) then + IO.println "---" + IO.println c.kind + IO.println n + IO.println (← MetaM.run' do ppExpr c.type) + return 0 diff --git a/scripts/export_infotree.lean b/scripts/export_infotree.lean new file mode 100644 index 0000000..6f57271 --- /dev/null +++ b/scripts/export_infotree.lean @@ -0,0 +1,47 @@ +import TrainingData.Frontend +import TrainingData.InfoTree.ToJson +import Mathlib.Tactic.ToExpr -- Upstreamed to std4 as https://github.com/leanprover/std4/pull/221 +import Mathlib.Util.Cli -- Upstreamed to lean4-cli as https://github.com/mhuisi/lean4-cli/pull/15 +import Mathlib.Lean.CoreM + +open Lean Elab IO Meta +open Cli System + +structure InfoTreeExport where + name : Name + trees : List Json +deriving ToJson + +def exportInfoTree (args : Cli.Parsed) : IO UInt32 := do + searchPathRef.set compileTimeSearchPath% + let target := args.positionalArg! "module" |>.as! Name + let mut trees ← moduleInfoTrees target + if args.hasFlag "tactics" then + trees := (trees.map InfoTree.retainTacticInfo).join + if args.hasFlag "original" then + trees := (trees.map InfoTree.retainOriginal).join + if args.hasFlag "substantive" then + trees := (trees.map InfoTree.retainSubstantive).join + let json ← trees.mapM fun t => t.toJson none + IO.println <| toJson <| InfoTreeExport.mk target json + return 0 + +/-- Setting up command line options and help text for `lake exe export_infotree`. -/ +def export_infotree : Cmd := `[Cli| + export_infotree VIA exportInfoTree; ["0.0.1"] + "Export the InfoTrees for a file as JSON." + + FLAGS: + "tactics"; "Only export TacticInfo nodes." + "original"; "Skip nodes with synthetic syntax." + "substantive"; "Skip tactic combinators." + + ARGS: + module : Name; "Lean module to compile and export InfoTrees." +] + +/-- `lake exe export_infotree` -/ +def main (args : List String) : IO UInt32 := + export_infotree.validate args + +-- See `./scripts/export_infotree.sh` to run this against all of Mathlib. diff --git a/scripts/export_infotree.sh b/scripts/export_infotree.sh new file mode 100755 index 0000000..6c09b63 --- /dev/null +++ b/scripts/export_infotree.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +# Run either as `scripts/export_infotree.sh` to run on all of Mathlib, +# or `scripts/export_infotree.sh Mathlib.Logic.Hydra` to run on just one file. +# Results will go in `build/export_infotree/Mathlib.Logic.Hydra.json`. + +# To pass the flags `--tactics --original --substantive` you have to modify the script below. +# TODO: proper command line arguments. + + +FLAGS=() +ARGS=() + +for arg in "$@"; do + if [[ $arg == --* ]]; then + FLAGS+=("$arg") + else + ARGS+=("$arg") + fi +done + +if [ ${#ARGS[@]} -eq 0 ]; then + rm -f ./build/lake.lock + lake build export_infotree + parallel -j32 ./scripts/export_infotree.sh ${FLAGS[@]} -- `cat lake-packages/mathlib/Mathlib.lean | sed -e 's/import //'` +else + DIR=out/export_infotree + mkdir -p $DIR + mod=${ARGS[0]} + if [ ! -f $DIR/$mod.json ]; then + echo $mod + if [ ! -f build/bin/export_infotree ]; then + lake build export_infotree + fi + build/bin/export_infotree ${FLAGS[@]} $mod > $DIR/$mod._json && mv $DIR/$mod._json $DIR/$mod.json + # build/bin/export_infotree $mod > $DIR/$mod._json && mv $DIR/$mod._json $DIR/$mod.json + fi +fi diff --git a/scripts/premises.lean b/scripts/premises.lean new file mode 100644 index 0000000..b1f0ae1 --- /dev/null +++ b/scripts/premises.lean @@ -0,0 +1,130 @@ +import Mathlib.Lean.CoreM +import Mathlib.Control.Basic + +/-! +Generate declaration dependencies up to a target file (defaulting to all of Mathlib). + +* Declarations are separated by `---`. +* In each block the first declaration is the theorem or definition we are analyzing, +* Subsequent indented declarations are those used in its proof or definition. +* Declarations prefixed with a `* ` appear in explicit arguments. + (This approximates "is visible in the pretty printed form".) +* Declarations prefixed with a `s ` are used by the simplifier. + +-/ + +open Lean Meta + +def isAuxLemma : Name → Bool +| .num (.str _ "_auxLemma") _ => true +| _ => false + +partial def Lean.ConstantInfo.getUsedConstants' (c : ConstantInfo) + (constantsMap : HashMap Name ConstantInfo) + (unfolding : Name → Bool := isAuxLemma) : NameSet × NameSet := Id.run do + let mut direct : NameSet := ∅ + let mut unfolded : NameSet := ∅ + for n in c.getUsedConstants do + if unfolding n then + if let some c := constantsMap.find? n then + unfolded := unfolded ++ c.getUsedConstants + else + direct := direct.insert n + return (direct, unfolded) + +/-- +Traverse all constants from imported files, +collecting the constants which are used in either the type or value of the theorem or definition. + +A predicate `unfolding` picks out a class of constants which should not be returned, +but instead replaced with the set of constants appearing in their type or value. +The typical use case is to unfold `_auxLemma`s generated dynamically by the simplifier. +-/ +def allUsedConstants (unfolding : Name → Bool := isAuxLemma) : + CoreM (NameMap (NameSet × NameSet)) := do + let constantsMap := (← getEnv).constants.map₁ + let mut result : NameMap (NameSet × NameSet) := ∅ + for (n, c) in constantsMap do + -- We omit all internally generated auxiliary statements. + if ! (← n.isBlackListed) then + result := result.insert n (c.getUsedConstants' constantsMap unfolding) + return result + +open Lean in +elab "#printNum " n:ident i:num : command => do + let name := Name.num n.getId i.getNat + let some decl := (← getEnv).find? name | throwError "no such declaration {name}" + logInfo m!"{name} : {decl.type} :=\n{decl.value?.getD (.bvar 0)}" + +/-- +Traverse an expression, collecting `Name`s, +but do not descend into implicit arguments. +-/ +partial def Lean.Expr.explicitConstants : Expr → MetaM NameSet +| .app f x => do + -- We wrap with `try?` here because on tiny fraction of declarations in Mathlib, + -- e.g. `Computation.exists_of_mem_parallel`, this fails with an error like + -- `function expected ?m.88 ?m.93`. + match (← try? (inferType f)) with + | some (.forallE _ _ _ .default) => return (← f.explicitConstants) ++ (← x.explicitConstants) + | _ => f.explicitConstants +| .lam _ t b _ => do b.instantiate1 (← mkFreshExprMVar t) |>.explicitConstants +| .forallE _ t b _ => do b.instantiate1 (← mkFreshExprMVar t) |>.explicitConstants +| .letE n t v b _ => return (← v.explicitConstants) + ++ (← withLetDecl n t v fun fvar => (b.instantiate1 fvar).explicitConstants) +| .const n _ => return NameSet.empty.insert n +| .mdata _ e => e.explicitConstants +| _ => return NameSet.empty + +/-- +Collect all the constants used in the values of theorem or definition, +ignoring implicit arguments of functions. +-/ +def Lean.ConstantInfo.explicitConstants (c : ConstantInfo) : MetaM NameSet := do + match c.value? with + | some e => e.explicitConstants + | none => return ∅ + +/-- +Traverse all constants from imported files, +collecting the constants which are used in the value of the theorem or definition, +ignoring implicit arguments of functions. +-/ +def allExplicitConstants : MetaM (NameMap NameSet) := do + let r := (← getEnv).constants.map₁ + let mut result : NameMap NameSet := ∅ + for (n, c) in r do + -- We omit all internally generated auxiliary statements. + if ! (← n.isBlackListed) then + result := result.insert n (← c.explicitConstants) + return result + +def main (args : List String) : IO UInt32 := do + let options := Options.empty.insert `maxHeartbeats (0 : Nat) + let modules := match args with + | [] => [`Mathlib] + | args => args.map fun s => s.toName + searchPathRef.set compileTimeSearchPath% + CoreM.withImportModules modules (options := options) do + let allConstants ← allUsedConstants + let explicitConstants ← MetaM.run' allExplicitConstants + for (n, (d, u)) in allConstants do + match n.components with + -- Note we keep `Std` as it has many lemmas about numbers and data structures. + | "Lean" :: _ + | "Qq" :: _ + | "Cli" :: _ + | "Aesop" :: _ => continue + | components => if components.contains "Tactic" then continue + let explicit := explicitConstants.find? n |>.getD ∅ + IO.println "---" + IO.println n + for m in d do + if explicit.contains m then + IO.println s!"* {m}" + else + IO.println s!" {m}" + for m in u do + if ! d.contains m then + IO.println s!"s {m}" + return 0 diff --git a/scripts/tactic_benchmark.lean b/scripts/tactic_benchmark.lean new file mode 100644 index 0000000..808662f --- /dev/null +++ b/scripts/tactic_benchmark.lean @@ -0,0 +1,150 @@ +import TrainingData.Frontend +import Mathlib.Control.Basic +import Mathlib.Lean.Expr.Basic +import Mathlib.Tactic.Common +import Mathlib.Tactic.ToExpr +import Mathlib.Util.Cli +import Aesop +import Lean.Util.Trace + +open Lean Core Elab IO Meta Term Tactic -- All the monads! + + +/-- Count the number of heartbeats used during a monadic function. -/ +-- Upstreamed in https://github.com/leanprover/std4/pull/212 +def Lean.withHeartbeats [Monad m] [MonadLiftT BaseIO m] (x : m α) : m (α × Nat) := do + let start ← IO.getNumHeartbeats + let r ← x + let finish ← IO.getNumHeartbeats + return (r, (finish - start) / 1000) + +set_option autoImplicit true + +/-- +Compile the designated module, and run a monadic function with each new `ConstantInfo`, +with the `Environment` as it was *before* the command which created that declaration. + +(Internal declarations according to `Name.isBlackListed` are skipped.) +-/ +def runAtDecls (mod : Name) (tac : ConstantInfo → MetaM (Option α)) : + MLList IO (ConstantInfo × α) := do + let fileName := (← findLean mod).toString + let steps := compileModule' mod + let targets := steps.bind fun c => (MLList.ofList c.diff).map fun i => (c, i) + + targets.filterMapM fun (cmd, ci) => do + for m in cmd.msgs do IO.eprintln (bombEmoji ++ (← m.data.toString)) + unless cmd.msgs.isEmpty do + throw <| IO.userError s!"Unexpected messages in: {mod} during elaboration of {cmd.stx}" + + let options := ({} : KVMap).insert `maxHeartbeats (.ofNat 20000) + let ctx := { fileName, options, fileMap := default } + let state := { env := cmd.before } + -- From `IO` to `CoreM`: + Prod.fst <$> (CoreM.toIO · ctx state) do + if ← ci.name.isBlackListed then + pure none + else + -- From `CoreM` to `MetaM`: + MetaM.run' (ctx := {}) (s := {}) do + match ← tac ci with + | some r => pure (ci, r) + | none => pure none + +inductive ResultType +| failure +| subgoals +| notDefEq +| success +deriving Repr, BEq + +instance : ToString ResultType where + toString := fun + | .failure => "failure" + | .subgoals => "subgoals" + | .notDefEq => "notDefEq" + | .success => "success" + +structure Result where + type : ResultType + seconds : Float + heartbeats : Nat + +/-- +Compile the designated module, select declarations satisfying a predicate, +and run a tactic on the type of each declaration. +-/ +def runTacticAtDecls (mod : Name) (decls : ConstantInfo → CoreM Bool) (tac : TacticM Unit) : + MLList IO (ConstantInfo × Result) := do + runAtDecls mod fun ci => do + if ! (← decls ci) then return none + let g ← mkFreshExprMVar ci.type + -- From `MetaM` to `TermElabM` + let ((gs, seconds), heartbeats) ← withHeartbeats <| withSeconds <| + try? <| TermElabM.run' do + -- From `TermElabM` to `TacticM`! + Tactic.run g.mvarId! tac + let type : ResultType ← match gs with + | none => pure .failure + | some (_ :: _) => pure .subgoals + | some [] => + match ci.value? with + | none => pure .success + | some v => + if ← isProp ci.type then + pure .success + else + match ← try? (isDefEq g v) with + | none + -- In this case we should perhaps return an "uncertain" value. + -- The problem is that `v` may contain constants generated by the simplifier + -- during elaboration of the original proof, + -- and which aren't in the current environment, so we can't really compare `g` and `v` + | some false => pure .notDefEq + | some true => pure .success + return some ⟨type, seconds, heartbeats⟩ + + +def useAesop : TacticM Unit := do evalTactic (← `(tactic| aesop)) +def useExact? : TacticM Unit := do evalTactic (← `(tactic| exact?)) +def useRfl : TacticM Unit := do evalTactic (← `(tactic| intros; rfl)) +def useSimpAll : TacticM Unit := do evalTactic (← `(tactic| intros; simp_all)) + +open Cli System + +def tacticBenchmarkMain (args : Cli.Parsed) : IO UInt32 := do + let module := args.positionalArg! "module" |>.as! Name + searchPathRef.set compileTimeSearchPath% + let tac ← + if args.hasFlag "aesop" then pure useAesop else + if args.hasFlag "exact" then pure useExact? else + if args.hasFlag "rfl" then pure useRfl else + if args.hasFlag "simp_all" then pure useSimpAll else + throw <| IO.userError "Specify a tactic, e.g. `--aesop`" + let result := runTacticAtDecls module (fun _ => pure true) tac + IO.println s!"{module}" + for (ci, ⟨type, seconds, heartbeats⟩) in result do + IO.println <| (if type == .success then checkEmoji else crossEmoji) ++ " " ++ ci.name.toString ++ + s!" ({seconds}s) ({heartbeats} heartbeats)" + return 0 + +/-- Setting up command line options and help text for `lake exe tactic_benchmark`. -/ +def tactic_benchmark : Cmd := `[Cli| + tactic_benchmark VIA tacticBenchmarkMain; ["0.0.1"] + "Run a customisable tactic at all declarations in a file." + + FLAGS: + "aesop"; "Use `aesop`." + "exact"; "Use `exact?`." + "rfl"; "Use `intros; rfl`." + "simp_all"; "Use `intros; simp_all`." + + ARGS: + module : Name; "Lean module to compile and export InfoTrees." +] + +/-- `lake exe tactic_benchmark` -/ +def main (args : List String) : IO UInt32 := + tactic_benchmark.validate args + +-- See `scripts/tactic_benchmark.sh` for a script to run this on all of Mathlib. diff --git a/scripts/tactic_benchmark.sh b/scripts/tactic_benchmark.sh new file mode 100755 index 0000000..321738f --- /dev/null +++ b/scripts/tactic_benchmark.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +# Run either as `scripts/tactic_benchmark.sh --aesop` to run aesop on all of Mathlib, +# or `scripts/tactic_benchmark.sh --simp_all Mathlib.Logic.Hydra` to run on just one file. +# Results will go in `build/tactic_benchmark/simp_all/Mathlib.Logic.Hydra.bench`. + + +if [ "$#" -eq 1 ]; then + rm -f ./build/lake.lock + lake build tactic_benchmark + parallel -j32 ./scripts/tactic_benchmark.sh $1 -- `cat lake-packages/mathlib/Mathlib.lean | sed -e 's/import //'` +else + DIR=out/tactic_benchmark/${1#--} + mkdir -p $DIR + mod=$2 + if [ ! -f $DIR/$mod.bench ]; then + echo $mod + if [ ! -f build/bin/tactic_benchmark ]; then + lake build tactic_benchmark + fi + timeout 5m build/bin/tactic_benchmark $1 $mod > $DIR/$mod._bench && mv $DIR/$mod._bench $DIR/$mod.bench + fi +fi diff --git a/scripts/tactic_benchmark_summary.sh b/scripts/tactic_benchmark_summary.sh new file mode 100755 index 0000000..7547096 --- /dev/null +++ b/scripts/tactic_benchmark_summary.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +## Writes a human readable summary of the tactics tested so far using `tactic_benchmark`. + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +mkdir -p $SCRIPT_DIR/../out/tactic_benchmark +cd $SCRIPT_DIR/../out/tactic_benchmark + +tactics=`ls -d */ | cut -f1 -d'/'` + +for tac in $tactics; do + mkdir -p $tac + cd $tac + grep ✅ * | awk '{print $1 " " $2}' | sort > ../$tac.out + grep ❌ * | awk '{print $1 " " $2}' | sort > ../$tac.failure + grep -h "^\(✅\|❌\)" * | awk '{gsub(/[()s]/,""); s+=$3;}END{printf "%.0f\n",s}' > ../$tac.time + cd .. +done + +for tac in $tactics; do + echo "Solved by $tac: " `cat $tac.out | wc -l` / `cat $tac.failure | wc -l` " in " `cat $tac.time`"s" + for other in $tactics; do + if [ "$tac" != "$other" ]; then + echo " but not $other: " `comm -23 $tac.out $other.out | wc -l` + fi + done +done diff --git a/scripts/training_data.lean b/scripts/training_data.lean new file mode 100644 index 0000000..4063d5e --- /dev/null +++ b/scripts/training_data.lean @@ -0,0 +1,115 @@ +import TrainingData.Frontend +import TrainingData.InfoTree.ToJson +import TrainingData.InfoTree.TacticInvocation.Basic +import Mathlib.Data.String.Defs +import Mathlib.Util.Cli +import Mathlib.Lean.CoreM +import Cli + +open Lean Elab IO Meta +open Cli System + +namespace Lean.Elab.TacticInvocation + +def verboseTrainingData (module : Name) (i : TacticInvocation) : IO String := do + let mut result := "===\n" + result := result ++ s!"{module}\n---\n" + let sourceUpToTactic := Substring.mk (← moduleSource module) 0 (i.info.stx.getPos?.getD 0) + let chunks := sourceUpToTactic.splitOn "\n\n" + let declUpToTactic := chunks.getLast!.toString + let offset := chunks.dropLast.foldl (init := 0) (fun t c => t + (c.toString.count '\n') + 2) + result := result ++ s!"{offset}\n---\n{declUpToTactic}\n---\n" + result := result ++ (Format.joinSep (← i.goalState) "\n").pretty ++ "\n---\n" + let ⟨⟨startLine, startCol⟩, ⟨endLine, endCol⟩⟩ := i.range + result := result ++ s!"{startLine}:{startCol}-{endLine}:{endCol}\n---\n" + result := result ++ (← i.pp).pretty ++ "\n---\n" + result := result ++ (Format.joinSep (← i.goalStateAfter) "\n").pretty ++ "\n---\n" + return result + +def proofStepData (i : TacticInvocation) : IO String := do + let mut result := "[GOAL]\n" ++ (Format.joinSep (← i.goalState) "\n").pretty ++ "\n[PROOFSTEP]\n" ++ (← i.pp).pretty + + return result + +end Lean.Elab.TacticInvocation + +def trainingData (args : Cli.Parsed) : IO UInt32 := do + searchPathRef.set compileTimeSearchPath% + let module := args.positionalArg! "module" |>.as! Name + let mut trees ← moduleInfoTrees module + trees := trees.bind InfoTree.retainTacticInfo + trees := trees.bind InfoTree.retainOriginal + trees := trees.bind InfoTree.retainSubstantive + for t in trees do + for t in t.tactics do + if args.hasFlag "proofstep" then + IO.println (← t.proofStepData) + else + IO.println (← t.verboseTrainingData module) + return 0 + +/-- Setting up command line options and help text for `lake exe training_data`. -/ +def training_data : Cmd := `[Cli| + training_data VIA trainingData; ["0.0.1"] +"Export training data consisting of goal states and tactic invocations from the given file. + +The output consists of blocks of the form: +``` +=== +Mathlib.Logic.Hydra +--- +61 +--- +theorem cutExpand_le_invImage_lex [DecidableEq α] [IsIrrefl α r] : + CutExpand r ≤ InvImage (Finsupp.Lex (rᶜ ⊓ (· ≠ ·)) (· < ·)) toFinsupp := by + +--- +α : Type u_1 +r : α → α → Prop +inst✝¹ : DecidableEq α +inst✝ : IsIrrefl α r +⊢ CutExpand r ≤ InvImage (Finsupp.Lex (rᶜ ⊓ fun x x_1 => x ≠ x_1) fun x x_1 => x < x_1) ↑toFinsupp +--- +64:2-64:27 +--- +rintro s t ⟨u, a, hr, he⟩ +--- +case intro.intro.intro +α : Type u_1 +r : α → α → Prop +inst✝¹ : DecidableEq α +inst✝ : IsIrrefl α r +s t u : Multiset α +a : α +hr : ∀ (a' : α), a' ∈ u → r a' a +he : s + {a} = t + u +⊢ InvImage (Finsupp.Lex (rᶜ ⊓ fun x x_1 => x ≠ x_1) fun x x_1 => x < x_1) (↑toFinsupp) s t +--- +``` +Here: +* `Mathlib.Logic.Hydra` is the name of the module where this goal occurs. +* `61` is the number of lines before the declaration (i.e. the `theorem` statement is on line `62`.) +* `theorem ...` is the *partial* declaration, including a fragment of the tactic proof. +* Next is the goal state at that point. + (Typically just one goal, as we don't report the goal states before structuring commands.) + Note that there is no guarantee that truncating the file at this point will actually cause Lean + to display this goal: the presence of earlier structuring commands could result in Lean showing + an error message instead. +* `64:2-64:27` is the range of positions occupied by the tactic invocation in the original file. + This allows us to look up this goal in a live Lean session, so we can try out alternative tactics. +* `rintro s t ⟨u, a, hr, he⟩` is the tactic used in the library. +* After that is the goal state after running the tactic. + (Often multiple goals.)" + + FLAGS: + "proofstep"; "Use the proofstep format: [GOAL]tactic-state[PROOFSTEP]next-tactic" + + ARGS: + module : Name; "Lean module to compile and export training data." +] + +/-- `lake exe training_data` -/ +def main (args : List String) : IO UInt32 := + training_data.validate args + +-- See `scripts/training_data.sh` for a script to run this on all of Mathlib. diff --git a/scripts/training_data.sh b/scripts/training_data.sh new file mode 100755 index 0000000..a7f25e4 --- /dev/null +++ b/scripts/training_data.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +## Generates pretty-printed descriptions of every goal/tactic pair. + +# See the help text in `lake exe training_data` for a description of the output format. + +# Run either as `scripts/training_data.sh` to run on all of Mathlib (several hours), +# or `scripts/training_data.sh Mathlib.Logic.Hydra` to run on just one file. +# Results will go in `build/training_data/Mathlib.Logic.Hydra.train`. + +FLAGS=() +ARGS=() + +for arg in "$@"; do + if [[ $arg == --* ]]; then + FLAGS+=("$arg") + else + ARGS+=("$arg") + fi +done + +if [ ${#ARGS[@]} -eq 0 ]; then + rm -f ./build/lake.lock + lake build training_data + parallel -j32 ./scripts/training_data.sh ${FLAGS[@]} -- `cat lake-packages/mathlib/Mathlib.lean | sed -e 's/import //'` +else + DIR=out/training_data + mkdir -p $DIR + mod=${ARGS[0]} + if [ ! -f $DIR/$mod.train ]; then + echo $mod + if [ ! -f build/bin/training_data ]; then + lake build training_data + fi + build/bin/training_data ${FLAGS[@]} $mod > $DIR/$mod._train && mv $DIR/$mod._train $DIR/$mod.train + fi +fi