-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathself-spec.ebnf
220 lines (195 loc) · 7.19 KB
/
self-spec.ebnf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
/**
* A grammar is a list of one or more rules; at least one rule is required.
*/
grammar ::= rule+ ;
/**
* A rule can be either a grammar rule or a tokenizer rule. If a rule is
* defined more than once, each definition will just add to the alternatives.
*
* Three forms exist, tried in the order listed: a tokenizer rule (`:==`), the
* whitespace rule (`_ :==`), and a grammar production rule (`::=`).
*/
rule
/**
* A tokenizer rule, of the form `symbol :== pattern ;`. The tokenizer (if
* any) is expected to tokenize the input as follows:
*
* - try matching the _ token rule and all named token rules that are
* *directly* referred to from grammar production rules on the remainder
* of the input;
* - emit the token corresponding to the longest match to the parser, or
* drop it if said longest match is the _ token;
* - if there is a tie in match lengths, emit the token that was defined
* earlier in the grammar (if a token has multiple definitions, its first
* definition is the one that counts);
* - if no token matched, emit an unrecognized character error and (maybe)
* try to recover.
*
* Parsers that do not have a tokenizer (i.e. their tokens are simply the
* characters on the input) can treat these as normal rules.
*
* Tokenizer rules may not directly or indirectly refer to themselves, i.e.
* they may not be recursive. This allows them to be mapped onto regular
* expressions.
*
* Tokenizer rules that are not directly used within grammar rules are not
* treated as actual tokens; rather, they are treated as reusable pieces of a
* regular expression. Borrowing ANTLR terminology, they are called
* fragments (but unlike in ANTLR, they are detected automatically). This is
* purely a readability aid; it makes no difference for the matched language.
*/
::= doc? symbol ":==" alternation ";" -> token-rule
/**
* A whitespace rule, of the form `_ :== pattern ;`. Works like a normal
* tokenizer rule, but the token is not emitted to the parser. Parsers that
* do not have a tokenizer (i.e. their tokens are simply the characters on
* the input) must implicitly match _* at the start of the grammar and after
* every symbol in grammar production rules (i.e. `::=` rules).
*/
| doc? "_" ":==" alternation ";" -> whitespace-rule
/**
* A grammar production rule, of the form `symbol ::= pattern ;`. The syntax
* here is a bit convoluted because the toplevel alternatives can be given
* docstrings and can be named, but the syntax is otherwise the same as for
* token rules.
*
* Whenever literal patterns are used in a grammar rule, they are implicitly
* converted to a token rule. Be mindful of this when writing the grammar:
* they can cause tokenizer conflicts just like any other token!
*/
| doc? symbol first-alter subseq-alter* ";" -> grammar-rule
;
/**
* All symbol names are written in kebab-case: a lowercase letter, followed
* by any number of lowercase letters, digits, and dashes. GrammarSpec will
* convert the names to the appropriate case conventions for the target
* language.
*/
symbol :== [a-z] [a-z0-9#-]* ;
/**
* `a | b | ...` preferentially matches `a`, matches `b` if `a` does not
* match, and so on. A single alternative without any `|` is also a valid
* alternation.
*/
alternation ::= concatenation ("|" concatenation)* ;
/**
* The alternatives at the root of a grammar production rule can be annotated
* with a variant name at the end (if not specified, it is auto-generated when
* needed) and can be given a docstring before the `::=` or `|` that starts
* the variant. They are functionally indistinguishable from a normal, nested
* alternation, though the parse tree may be generated in a different way.
*
* `first-alter` is the alternative introduced by `::=`; `subseq-alter` is
* each following alternative introduced by `|`.
*/
first-alter ::= doc? "::=" concatenation alter-name? ;
subseq-alter ::= doc? "|" concatenation alter-name? ;
/**
* A toplevel alternative can be named by appending `-> name`. The names have
* no semantic meaning (they cannot be referred to within the grammar), but
* are used to attach names to variants in the generated ASTs.
*/
alter-name ::= "->" symbol ;
/**
* `a b ...` matches `a` followed by `b` and so on. At least one element is
* required, so a concatenation can never be empty.
*/
concatenation ::= repetition+ ;
/**
* The usual ?*+ characters can be used as a suffix to specify repetition. An
* element without a suffix is matched exactly once.
*/
repetition
/**
* `a?` greedily matches `a` zero times or once.
*/
::= singular "?" -> maybe
/**
* `a*` greedily matches `a` zero or more times.
*/
| singular "*" -> any
/**
* `a+` greedily matches `a` one or more times.
*/
| singular "+" -> many
/**
* `a` matches `a` exactly once. Note that this one must come last in the
* list of alternatives, because otherwise the other alternatives would
* never match (alternatives pick the *first* match, not the longest one).
*/
| singular -> one
;
/**
* A single element of a pattern: a parenthesized expression, a reference to
* another rule by its symbol, or a literal pattern (string literal, character
* set, or character literal).
*/
singular
::= "(" alternation ")" -> nested
| symbol
| string-literal
| character-set
| character-literal
;
/**
* `'...'` and `"..."` match `...` literally. Empty string literals are not
* supported; instead use `?`.
*
* Within a literal, `#` always consumes the character that follows it, so an
* escaped quote does not terminate the literal. An unescaped `#` or an
* unescaped quote of the delimiting kind cannot appear in the body.
*/
string-literal
:== "'" ( [^'##] | ## [^] )+ "'"
| '"' ( [^"##] | ## [^] )+ '"'
;
/**
* `[...]` matches a single character within the specified set. `a-z` may be
* used to select a whole range of code points, and a `^` at the front of the
* `...` inverts the set.
*
* The list of characters may be empty; this grammar itself writes `[^]`
* (an inverted empty set) where any character is accepted.
*/
character-set :== "[" "^"? ( character-set-char ( "-" character-set-char )? )* "]" ;
/**
* A single character inside a character set. The characters `^-]#` have
* special meaning there and cannot appear bare; prefix them with a `#` to
* match them literally. Any other character stands for itself, and every
* escape sequence is accepted as well.
*/
character-set-char :== [^#^#-#]##] | escape-sequence ;
/**
* Hash escape sequences for characters can also be used on their own as a
* pattern, outside the context of a string literal or character set.
*/
character-literal :== escape-sequence ;
/**
* A hash escape sequence for a special character:
*
* - `#t`: short for `#x09` (tab).
* - `#n`: short for `#x0A` (LF/newline).
* - `#r`: short for `#x0D` (CR/carriage return).
* - `##`: matches `#` literally.
* - `#'`: matches `'` literally (for use within `'`-delimited strings).
* - `#"`: matches `"` literally (for use within `"`-delimited strings).
* - `#-`: matches `-` literally (for use within character sets).
* - `#^`: matches `^` literally (for use within character sets).
* - `#]`: matches `]` literally (for use within character sets).
* - `#xHH`: matches the given 8-bit code point, expressed with two hex
* digits.
* - `#uHHHH`: matches the given 16-bit code point, expressed with four hex
* digits.
* - `#UHHHHHHHH`: matches the given 32-bit code point, expressed with eight
* hex digits.
*
* Note that the token pattern itself is more permissive than this list: `#`
* followed by any single character other than `x`, `u`, or `U` is accepted
* lexically. The meaning of sequences that are not listed above is not
* defined here.
*/
escape-sequence :== ## (
[^xuU] |
"x" hex hex |
"u" hex hex hex hex |
"U" hex hex hex hex hex hex hex hex
) ;
/**
* A single hexadecimal digit; both lowercase and uppercase letters are
* accepted.
*/
hex :== [0-9a-fA-F] ;
/**
* /** comments are treated as docstrings. They may only appear immediately in
* front of a rule or toplevel alternative in a grammar production rule.
* Unlike normal comments, they are emitted to the parser as a token.
*/
doc :== "/**" comment-body "*/" ;
/**
* Normal /* comments can appear anywhere. The first character after the
* opening sequence must not be `*`, which keeps them distinct from
* docstrings; the group holding the body is optional, so a comment with
* nothing at all between its delimiters is matched by this rule as well.
*/
_ :== "/*" ( [^*] comment-body )? "*/" ;
/**
* The body of a comment. Some magic is needed here to ensure that the body
* doesn't match the comment end sequence anywhere in it; we can't just match
* any character because the start of the first comment in the file would
* greedily match right up to the end of the last comment in the file.
*
* Concretely: the body is a sequence of non-`*` characters and of runs of
* `*` that are followed by a character other than `*` or `/`. The final
* `"*"*` allows additional `*` characters directly in front of the end
* sequence.
*/
comment-body :== ( [^*] | "*"+ [^*/] )* "*"* ;
/**
* Whitespace can appear anywhere: any non-empty run of spaces, tabs, line
* feeds, and carriage returns. This is an additional definition of `_`, so
* it adds an alternative to the whitespace rule rather than replacing the
* comment form.
*/
_ :== [ #t#n#r]+ ;