/* self-spec.ebnf */

/**
 * A grammar is a list of one or more rules. At least one rule is required,
 * so input consisting only of comments and whitespace is not a valid grammar.
 */
grammar ::= rule+ ;

/**
 * A rule can be either a grammar or a tokenizer rule. If a rule is defined
 * more than once, each definition will just add to the alternatives.
 *
 * The variants below are told apart by the definition operator (`:==` for
 * tokenizer and whitespace rules, `::=` for grammar production rules) and,
 * for `:==` rules, by whether the name is a symbol or the special name `_`.
 */
rule

  /**
   * A tokenizer rule, of the form `symbol :== pattern ;`. The tokenizer (if
   * any) is expected to tokenize the input as follows:
   *
   *  - try matching the _ token rule and all named token rules that are
   *    *directly* referred to from grammar production rules on the remainder
   *    of the input;
   *  - emit the token corresponding to the longest match to the parser, or
   *    drop it if said longest match is the _ token;
   *     - if there is a tie in match lengths, emit the token that was defined
   *       earlier in the grammar (the first definition is what counts, if
   *       there are multiple);
   *  - if no token matched, emit an unrecognized character error and (maybe)
   *    try to recover.
   *
   * Parsers that do not have a tokenizer (i.e. their tokens are simply the
   * characters on the input) can treat these as normal rules.
   *
   * Tokenizer rules may not directly or indirectly refer to themselves, i.e.
   * they may not be recursive. This allows them to be mapped onto regular
   * expressions.
   *
   * Tokenizer rules that are not directly used within grammar rules are not
   * treated as actual tokens; rather, they are treated as reusable bits of a
   * regular expression. Borrowing ANTLR terminology, they are called
   * fragments (but unlike in ANTLR, they are detected automatically). This is
   * just a readability thing; it makes no difference for the matched language.
   */
  ::= doc? symbol ":==" alternation ";" -> token-rule

  /**
   * A whitespace rule, of the form `_ :== pattern ;`. Works like a normal
   * tokenizer rule, but the token is not emitted to the parser. Parsers that
   * do not have a tokenizer (i.e. their tokens are simply the characters on
   * the input) must implicitly match _* at the start of the grammar and after
   * every symbol in grammar production rules (i.e. `::=` rules).
   *
   * This grammar itself defines `_` twice (once for comments and once for
   * plain whitespace); as with any rule that is defined more than once, the
   * definitions are combined as alternatives.
   */
    | doc? "_" ":==" alternation ";" -> whitespace-rule

  /**
   * A grammar production rule, of the form `symbol ::= pattern ;`. The syntax
   * here is a bit convoluted because the toplevel alternations can be given
   * docstrings and can be named, but the syntax is otherwise the same as for
   * token rules.
   *
   * Whenever literal patterns are used in a grammar rule, they are implicitly
   * converted to a token rule. Be mindful of this when writing the grammar:
   * they can cause tokenizer conflicts just like any other token!
   */
    | doc? symbol first-alter subseq-alter* ";" -> grammar-rule
    ;

/**
 * All symbol names are written in kebab-case. GrammarSpec will convert the
 * names to the appropriate case conventions for the target language.
 *
 * A symbol starts with a lowercase letter, followed by any number of
 * lowercase letters, digits and hyphens. The hyphen is written as `#-` in the
 * character set because an unescaped `-` has special meaning there.
 *
 * NOTE(review): the pattern itself does not reject trailing or consecutive
 * hyphens; presumably such names are simply not used. Confirm if this matters.
 */
symbol :== [a-z] [a-z0-9#-]* ;

/**
 * `a | b | ...` preferentially matches `a`, matches `b` if `a` does not
 * match, and so on. A single concatenation without any `|` is also a valid
 * alternation.
 */
alternation ::= concatenation ("|" concatenation)* ;

/**
 * The alternatives at the root of a grammar production rule can be annotated
 * with a variant name (if not specified it is auto-generated when needed) at
 * the end and can be given a docstring before the ::= or | that starts the
 * variant. They are functionally indistinguishable from a normal, nested
 * alternation, though the parse tree may be generated in a different way.
 *
 * `first-alter` is the first such alternative, introduced by `::=`.
 */
first-alter ::= doc? "::=" concatenation alter-name? ;

/**
 * Any further toplevel alternative of a grammar production rule, introduced
 * by `|`. Apart from that leading token it has the same shape as
 * `first-alter`: an optional docstring, a concatenation, and an optional
 * variant name.
 */
subseq-alter ::= doc? "|" concatenation alter-name? ;

/**
 * A toplevel alternative can be named by appending `-> name`. The names do not
 * have semantic meaning (they cannot be referred to within the grammar),
 * but are used to attach names to variants in the generated ASTs. The name
 * follows the same syntax as any other symbol.
 */
alter-name ::= "->" symbol ;

/**
 * `a b ...` matches `a` followed by `b` and so on. At least one element is
 * required, so an empty concatenation cannot be written.
 */
concatenation ::= repetition+ ;

/**
 * The usual ?*+ characters can be used to specify repetition. An operator
 * applies to the single `singular` element directly in front of it, and at
 * most one operator can follow an element; use parentheses to repeat a
 * larger pattern or to stack operators.
 */
repetition

  /**
   * `a?` greedily matches `a` zero times or once.
   */
  ::= singular "?" -> maybe

  /**
   * `a*` greedily matches `a` zero or more times.
   */
    | singular "*" -> any

  /**
   * `a+` greedily matches `a` one or more times.
   */
    | singular "+" -> many

  /**
   * `a` matches `a` exactly once. Note that this one must come last in the
   * list of alternatives, because otherwise the other alternatives would
   * never match (because alternatives pick the *first* match, not the longest
   * one).
   */
    | singular -> one
    ;

/**
 * A single literal pattern or referenced rule to match, or a parenthesized
 * expression. In order, the alternatives are:
 *
 *  - a parenthesized alternation (the only variant with an explicit name,
 *    `nested`);
 *  - a reference to another rule by its symbol;
 *  - a string literal;
 *  - a character set;
 *  - a character literal, i.e. a bare escape sequence.
 */
singular
  ::= "(" alternation ")" -> nested
    | symbol
    | string-literal
    | character-set
    | character-literal
    ;

/**
 * `'...'` and `"..."` match `...` literally. Empty string literals are not
 * supported; instead use `?`.
 *
 * Inside a literal, the delimiting quote character and `#` cannot appear
 * unescaped. A `#` together with the single character that follows it is
 * consumed as one unit, which is how `#'` and `#"` avoid ending the literal.
 * The quote character that is not the delimiter needs no escaping.
 */
string-literal
  :== "'" ( [^'##] | ## [^] )+ "'"
    | '"' ( [^"##] | ## [^] )+ '"'
    ;

/**
 * `[...]` matches a single character within the specified set. `a-z` may be
 * used to select a whole range of code points, and a `^` at the front of the
 * `...` inverts the set.
 *
 * The list of members may be empty. The inverted empty set `[^]` is used
 * elsewhere in this grammar to match any single character.
 */
character-set :== "[" "^"? ( character-set-char ( "-" character-set-char )? )* "]" ;

/**
 * Within a character set, the characters `^-]#` have special meaning. Prefix
 * them with a `#` to match them literally.
 *
 * A set member is therefore either any single character other than those
 * four, or an escape sequence.
 */
character-set-char :== [^#^#-#]##] | escape-sequence ;

/**
 * Hash escape sequences for characters can also be matched outside the context
 * of a string or character range. This grammar does so itself, for example
 * where a bare `##` is used to match a literal `#`.
 */
character-literal :== escape-sequence ;

/**
 * A hash escape sequence for a special character:
 *
 *  - `#t`: short for `#x09` (tab).
 *  - `#n`: short for `#x0A` (LF/newline).
 *  - `#r`: short for `#x0D` (CR/carriage return).
 *  - `##`: matches `#` literally.
 *  - `#'`: matches `'` literally (for use within `'`-delimited strings).
 *  - `#"`: matches `"` literally (for use within `"`-delimited strings).
 *  - `#-`: matches `-` literally (for use within character sets).
 *  - `#^`: matches `^` literally (for use within character sets).
 *  - `#]`: matches `]` literally (for use within character sets).
 *  - `#xHH`: matches the given 8-bit code point, expressed with two hex
 *    digits.
 *  - `#uHHHH`: matches the given 16-bit code point, expressed with four hex
 *    digits.
 *  - `#UHHHHHHHH`: matches the given 32-bit code point, expressed with eight
 *    hex digits.
 *
 * The `x`, `u` and `U` forms only match when followed by the full two, four
 * or eight hex digits respectively.
 *
 * NOTE(review): the pattern below is more permissive than the list above: a
 * `#` followed by any single character other than `x`, `u` or `U` is
 * accepted. Presumably escapes that are not listed are rejected or given a
 * meaning elsewhere; confirm.
 */
escape-sequence :== ## (
  [^xuU] |
  "x" hex hex |
  "u" hex hex hex hex |
  "U" hex hex hex hex hex hex hex hex
) ;

/**
 * A hex digit. Both uppercase and lowercase letters are accepted.
 */
hex :== [0-9a-fA-F] ;

/**
 * /** comments are treated as docstrings. They may only appear immediately in
 * front of a rule or toplevel alternative in a grammar production rule.
 *
 * The text between the delimiters is matched by `comment-body`, so a
 * docstring cannot contain the comment end sequence.
 */
doc :== "/**" comment-body "*/" ;

/**
 * Normal /* comments can appear anywhere.
 *
 * The character directly after the opening delimiter must not be an
 * asterisk; that is what distinguishes a normal comment from a docstring.
 * A comment with nothing at all between its delimiters is also accepted.
 */
_ :== "/*" ( [^*] comment-body )? "*/" ;

/**
 * The body of a comment. Some magic is needed here to ensure that the body
 * doesn't match the comment end sequence anywhere in it; we can't just match
 * any character because the start of the first comment in the file would
 * greedily match right up to the end of the last comment in the file.
 *
 * The body is a sequence of units, each of which is either a single
 * character other than an asterisk, or a run of one or more asterisks
 * followed by a character that is neither an asterisk nor a slash. Any
 * asterisks left directly in front of the end sequence are matched by the
 * trailing part of the pattern.
 */
comment-body :== ( [^*] | "*"+ [^*/] )* "*"* ;

/**
 * Whitespace can appear anywhere. A whitespace token is a run of one or more
 * spaces, tabs, line feeds and carriage returns.
 *
 * This is a second definition of `_`; since repeated definitions of a rule
 * add to its alternatives, `_` matches either a normal comment or whitespace.
 */
_ :== [ #t#n#r]+ ;