diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc index 14b6242..3c78775 100644 --- a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc @@ -58,14 +58,17 @@ alias RscGrammar = Grammar; } @description{ - The conversion consists of two stages: + The conversion consists of three stages: + - preprocessing (function `preprocess`); - analysis (function `analyze`); - transformation (function `transform`). - The aim of the analysis stage is to select those productions of the Rascal - grammar that are "suitable for conversion" to TextMate rules. The aim of the - transformation stage is to subsequently convert those productions and - produce a TextMate grammar. + The aim of the preprocessing stage is to slightly massage the Rascal grammar + to make analysis and transformation easier (e.g., replace singleton ranges + with just the corresponding literal). The aim of the analysis stage is to + select those productions of the Rascal grammar that are "suitable for + conversion" to TextMate rules. The aim of the transformation stage is to + subsequently convert those productions and produce a TextMate grammar. To be able to cleanly separate analysis and transformation, productions selected during the analysis stage are wrapped into *conversion units* that @@ -161,8 +164,6 @@ private RscGrammar replaceLegacySemanticTokenTypes(RscGrammar rsc) Each production in the list (including the synthetic ones) is *suitable for conversion* to a TextMate rule. A production is "suitable for conversion" when it satisfies each of the following conditions: - - it is non-recursive; - - it does not match newlines; - it does not match the empty word; - it has a `@category` tag. 
@@ -171,36 +172,42 @@ private RscGrammar replaceLegacySemanticTokenTypes(RscGrammar rsc) @description{ The analysis consists of three stages: - 1. selection of user-defined productions; - 2. creation of synthetic delimiters production; - 3. creation of synthetic keywords production. - - In stage 1, a dependency graph among all productions that occur in `rsc` - (specifically: `prod` constructors) is created. This dependency graph is - subsequently pruned to keep only the suitable-for-conversion productions: - - first, productions with a cyclic dependency on themselves are removed; - - next, productions that only involve single-line matching are retained; - - next, productions that only involve non-empty word matching are retained; - - next, productions that have a `@category` tag are retained. + 1. selection of user-defined productions; + 2. creation of synthetic delimiters production; + 3. creation of synthetic keywords production; + 4. wrapping of productions inside conversion units. + + In stage 1, each user-defined production (specifically: `prod` constructor) + that occurs in `rsc` is selected for conversion when it fulfils the + following requirements: + - it has a unique `@category` tag; + - it doesn't match the empty word. In stage 2, the set of all delimiters that occur in `rsc` is created. This set is subsequently reduced by removing: - strict prefixes of delimiters; - - delimiters that enclose user-defined productions; - - delimiters that occur at the beginning of user-defined productions. + - delimiters that also occur as outer delimiters of + suitable-for-conversion productions; + - delimiters that also occur as inner delimiters of + suitable-for-conversion productions. In stage 3, the set of all keywords that occur in `rsc` is created. + + In stage 4, each suitable-for-conversion production is wrapped in a + conversion unit with additional metadata (e.g., the inner/outer delimiters + of the production). 
The list of conversion units is subsequently reduced + by removing strict prefixes, and sorted. } list[ConversionUnit] analyze(RscGrammar rsc, str name) { str jobLabel = "Analyzing)">"; jobStart(jobLabel, work = 6); - // Analyze productions + // Stage 1: Analyze productions jobStep(jobLabel, "Analyzing productions"); list[Production] prods = [p | /p: prod(_, _, _) <- rsc]; - // Analyze categories + // Stage 1: Analyze categories jobStep(jobLabel, "Analyzing categories"); prods = for (p <- prods) { @@ -221,11 +228,11 @@ list[ConversionUnit] analyze(RscGrammar rsc, str name) { append p; } - // Analyze emptiness + // Stage 1: Analyze emptiness jobStep(jobLabel, "Analyzing emptiness"); prods = [p | p <- prods, !tryParse(rsc, delabel(p.def), "")]; - // Analyze delimiters + // Stage 2: Analyze delimiters jobStep(jobLabel, "Analyzing delimiters"); set[Symbol] delimiters = {s | /Symbol s := rsc, isDelimiter(delabel(s))}; delimiters &= removeStrictPrefixes(delimiters); @@ -233,12 +240,12 @@ list[ConversionUnit] analyze(RscGrammar rsc, str name) { delimiters -= {s | p <- prods, /just(s) := getInnerDelimiterPair(rsc, p, getOnlyFirst = true)}; list[Production] prodsDelimiters = [prod(lex(DELIMITERS_PRODUCTION_NAME), [\alt(delimiters)], {})]; - // Analyze keywords + // Stage 3: Analyze keywords jobStep(jobLabel, "Analyzing keywords"); set[Symbol] keywords = {s | /Symbol s := rsc, isKeyword(delabel(s))}; list[Production] prodsKeywords = [prod(lex(KEYWORDS_PRODUCTION_NAME), [\alt(keywords)], {\tag("category"("keyword.control"))})]; - // Prepare units + // Stage 4: Prepare units jobStep(jobLabel, "Preparing units"); bool isEmptyProd(prod(_, [\alt(alternatives)], _)) = alternatives == {}; @@ -260,8 +267,13 @@ list[ConversionUnit] analyze(RscGrammar rsc, str name) { @description{ The transformation consists of two stages: - 1. creation of TextMate rules; - 2. composition of TextMate rules into a TextMate grammar. + 1. creation of TextMate rules; + 2. 
composition of TextMate rules into a TextMate grammar. + + Stage 1 is organized as a pipeline that, step-by-step, adds names and rules + to the conversion units. First, it adds unique names. Next, it adds "inner + rules". Last, it adds "outer rules". See module + `lang::textmate::ConversionUnit` for an explanation of inner/outer rules. } TmGrammar transform(list[ConversionUnit] units, str name, NameGeneration nameGeneration = long()) { diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/NameGeneration.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/NameGeneration.rsc index 73c6a85..b3a3c30 100644 --- a/rascal-textmate-core/src/main/rascal/lang/textmate/NameGeneration.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/textmate/NameGeneration.rsc @@ -49,9 +49,6 @@ alias NameGenerator = str(Production); NameGenerator newNameGenerator(list[Production] prods, short()) { - // Define auxiliary functions to compute names for symbols - - // Define auxiliary function to count the number of occurrences of a name int count(str name) = (0 | it + 1 | p <- prods, toName(p.def) == name); diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Walkthrough.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Walkthrough.rsc index 27ccfd9..bf42a36 100644 --- a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Walkthrough.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Walkthrough.rsc @@ -24,42 +24,45 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. } -// # Walkthrough -// -// This module consists of a walkthrough to explain the main ideas behind the -// conversion algorithm, from Rascal grammars to TextMate grammars. -// -// The walkthrough is split into five parts. 
The initial part explains basic -// conversion. The subsequent four parts present complications and demonstrate -// extensions of the conversion algorithm to address them. -// -// The toy language considered, is a simple data language consisting of: -// - base: maps, numbers, strings; -// - extension 1: layout (including comments); -// - extension 2: regular expressions; -// - extension 3: locations; -// - extension 4: booleans. -// -// Working familiarity with TextMate grammar is assumed. To summarize: -// -// - Syntax: -// - Each TextMate grammar consists of a list of TextMate rules (ordered). -// - Each TextMate rule is either a *match pattern* (consisting of one -// regular expression) or a *begin/end pattern* (consisting of two -// regular expressions and a list of nested TextMate rules). -// -// - Semantics: A tokenization engine reads a document (line by line, top to -// bottom, left to right), while iteratively trying to apply TextMate rules -// by matching text against the regular expressions. -// -// Further reading: -// - https://macromates.com/manual/en/language_grammars -// - https://www.apeth.com/nonblog/stories/textmatebundle.html -// - https://code.visualstudio.com/api/language-extensions/syntax-highlight-guide +@description{ +This module consists of a walkthrough to explain the main ideas behind the +conversion algorithm, from Rascal grammars to TextMate grammars. The walkthrough +is not intended to be comprehensive, meaning that not all corner cases are +discussed and explained. The aim of the walkthrough is to convey just the +general approach. + +The walkthrough consists of the following sections: + - Preface + - Basic conversion: single-line, non-recursive + - Advanced conversion: multi-line, non-recursive + - Advanced conversion: single/multi-line, recursive + - Advanced conversion: context detection + +Throughout, a simple data language is used to demonstrate the main ideas. It is +defined incrementally, by need. 
+ +Working familiarity with TextMate grammar is assumed. To summarize: + + - Syntax: + - Each TextMate grammar consists of a list of TextMate rules (ordered). + - Each TextMate rule is either a *match pattern* (consisting of one + regular expression) or a *begin/end pattern* (consisting of two + regular expressions and a list of nested TextMate rules). + + - Semantics: A TextMate tokenizer reads a document (line by line, top to + bottom, left to right), while iteratively trying to apply TextMate rules by + matching text against the regular expressions. + +Further reading: + - https://macromates.com/manual/en/language_grammars + - https://www.apeth.com/nonblog/stories/textmatebundle.html + - https://code.visualstudio.com/api/language-extensions/syntax-highlight-guide +} module lang::textmate::conversiontests::Walkthrough import Grammar; +import IO; import ParseTree; import util::Maybe; @@ -67,261 +70,409 @@ import lang::textmate::Conversion; import lang::textmate::ConversionConstants; import lang::textmate::ConversionTests; import lang::textmate::ConversionUnit; +import lang::textmate::NameGeneration; +import lang::textmate::Grammar; import lang::rascal::grammar::analyze::Delimiters; -// ## Basics +// ---------- +// ## Preface // -// The base fragment of the grammar looks as follows: - -lexical Alnum = [a-z A-Z 0-9]; -lexical Digit = [0-9]; -lexical Blank = [\ \t]; -lexical Space = [\ \t\n]; +// The following lexicals will be used as terminals (in addition to literals): -start syntax Value - = Map - | Number - | String - | RegExp - | Location - | Boolean - ; +lexical Alnum = [0-9 A-Z a-z] ; +lexical Digit = [0-9] ; +lexical Blank = [\ \t] ; +lexical Space = [\ \t\n] ; +lexical Print = [0-9 A-Z a-z \ \t\n]; -syntax Map = "{" {(Key ":" Value) ","}* "}"; +// ----------------------------------------------- +// ## Basic conversion: single-line, non-recursive +// +// ### User-defined productions +// +// Basically, the conversion algorithm analyzes the Rascal grammar to 
find each +// user-defined Rascal production that is *suitable for conversion* to a +// TextMate rule. Roughly, a production is said to have that property when: +// - it has a category; +// - it cannot produce the empty word. +// +// For instance: -lexical Key = Alnum+ !>> [a-z A-Z 0-9]; -lexical Number = @category="constant.numeric" Digit+ !>> [0-9]; -lexical String = @category="string.quoted.double" "\"" Alnum* "\""; +lexical Identifier = Alnum+ !>> [0-9 A-Z a-z] ; +lexical Chars = @category="string" Print* ; +lexical Number = @category="constant.numeric" Digit+ !>> [0-9] ; -// Basically, the conversion algorithm converts each Rascal non-terminal that is -// *suitable for conversion* to a TextMate rule. For instance, `Number` is -// converted to the following match pattern (in JSON): +// The Rascal productions of `Identifier` (does not have a category) and `Chars` +// (can produce the empty word) are not suitable-for-conversion. In contrast, +// the Rascal production of `Number` is suitable-for-conversion. The following +// TextMate rule is generated: // // ``` // { -// "match": "([\\u0030-\\u0039]+?(?![\\u0030-\\u0039]))", -// "name": "prod(lex(\"Number\"),[conditional(iter(lex(\"Digit\")),{\\not-follow(\\char-class([range(48,57)]))})],{tag(\"category\"(\"constant.numeric\"))})", -// "captures": { -// "1": { -// "name": "constant.numeric" -// } -// } +// "name": "/inner/single/number", +// "match": "([0-9]+?(?![0-9]))", +// "captures": { "1": { "name": "constant.numeric" } } // } // ``` // -// Note: The regular expression (`match` property) is written in Oniguruma -// format (following the TextMate grammar specification), using code units -// instead of alphanumericals. +// Note: Property `name` could be anything, but to simplify debugging, the +// conversion algorithm uses a description of the Rascal production. 
// -// Note: The name (`name` property) could be anything, but to simplify -// debugging, the conversion algorithm uses (part of) the internal -// representation of the Rascal non-terminal as the name. +// Note: The `match` property is a regular expression written in *Oniguruma* +// format (as required by the TextMate grammar specification). + +// ### Keywords // -// In general, a Rascal non-terminal is "suitable for conversion" when it -// satisfies each of the following conditions: +// Sometimes, literals that qualify as *keywords* do not have a corresponding +// category. For instance: + +syntax BooleanExpr + = "true" + | "false" + | "if" BooleanExpr "then" BooleanExpr "else" BooleanExpr + | "(" BooleanExpr ")" + ; + +// Despite not having a category, though, these literals should be highlighted. // -// 1. It is non-recursive. (Recursion is prohibitively hard to faithfully -// convert; tail-recursion could be workable, but currently out of scope.) +// Thus, the conversion algorithm: +// - analyzes the Rascal grammar to find each literal that qualifies as a +// keyword (according to function `isKeyword` in module +// `lang::rascal::grammar::analyze::Delimiters`); +// - collects these literals in a synthetic Rascal production of the form +// `lit1 | lit2 | ...` (suitable-for-conversion by construction); +// - converts that production to a TextMate rule. // -// 2. It does not match newlines. (TextMate rules that involve matching against -// newlines are problematic because the tokenization engine operates line by -// line.) +// For instance, the literals that qualify as keywords in the Rascal productions +// of `BooleanExpr` are "true", "false", "if", "then", and "else". The following +// TextMate rule is generated: // -// 3. It does not match the empty word. (TextMate rules that involve matching -// against the empty word are problematic because they match at every -// position.) 
+// ``` +// { +// "name": "/inner/single/$keywords", +// "match": "((?:\\btrue\\b)|(?:\\bfalse\\b)|(?:\\belse\\b)|(?:\\bthen\\b)|(?:\\bif\\b))", +// "captures": { "1": { "name": "keyword.control" } } +// } +// ``` // -// 4. It has a `@category` tag. +// ### Delimiters // -// For instance, `Number` and `String` are suitable for conversion, but `Value` -// (violation of conditions 1 and 4), `Map` (violation of condition 1), and -// `Key` (violation of condition 4) are not suitable. - +// Sometimes, literals that qualify as *delimiters* might confuse the TextMate +// tokenizer. For instance: +lexical LineComment = @category="comment" "//" (Alnum | Space)* $ ; +lexical Location = "|" Alnum+ "://" Alnum+ "|"; -// ## Extension 1: Conversion of productions instead of non-terminals +// The Rascal production of `LineComment` is suitable-for-conversion, while the +// Rascal production of `Location` (does not have a category) is not. However, +// if only the former were to be converted to a TextMate rule, then substring +// "//Desktop" of input string "|home://Desktop|" would be mistakenly tokenized +// as a comment. +// +// Thus, the conversion algorithm: +// - analyzes the Rascal grammar to find each literal that qualifies as a +// delimiter (according to function `isDelimiter` in module +// `lang::rascal::grammar::analyze::Delimiters`); +// - collects these literals in a synthetic Rascal production of the form +// `lit1 | lit2 | ...` (suitable-for-conversion by construction); +// - converts that production to a TextMate rule. +// +// For instance, the literals that qualify as delimiters in the Rascal +// productions of `LineComment` and `Location` are "//", "://", and "|". The +// following TextMate rule is generated: +// +// ``` +// { +// "name": "/inner/single/$delimiters", +// "match": "(?:\\/\\/)|(?:\\|)|(?:\\:\\/\\/)", +// "captures": {} +// } +// ``` +// +// Note: The intent of this TextMate rule is *not* to assign a scope. This is +// why the `captures` property is empty. 
The only purpose of this TextMate rule +// is to force the TextMate tokenizer to consume highlighting-neutral delimiters +// before they are accidentally tokenized and mistakenly highlighted. // -// The first extension (layout) of the grammar looks like this: +// Note: To ensure that each delimiter is matched by at most one TextMate rule, +// each delimiter literal needs to fulfil a number of additional requirements to +// be included in the synthetic Rascal production (e.g., it must not be the +// prefix of any other delimiter literal). -lexical Comment - = @category="comment.line.double-slash" line: "//" (Alnum | Blank)* $ - | @category="comment.block" block: "/*" (Alnum | Space)* "*/" - ; +// ------------------------------------------------- +// ## Advanced conversion: multi-line, non-recursive +// +// ### Approach +// +// To convert user-defined Rascal productions of multi-line strings, more +// advanced machinery than TextMate rules with match patterns of single-line +// strings is needed (i.e., newlines cannot be matched by individual regular +// expressions in a TextMate grammar). That is, TextMate rules with begin/end +// patterns need to be used. The approach is roughly as follows: +// +// - First, the conversion algorithm analyzes the Rascal grammar to find each +// user-defined Rascal production that is suitable-for-conversion. +// +// - Next, the conversion algorithm optimistically converts each of those +// productions -- *including* those of multi-line strings -- to a TextMate +// rule with a match pattern of single-line strings. The rationale is that +// single-line strings are a significant special case of multi-line strings, +// so this TextMate rule can already be quite effective. +// +// - Next, the conversion algorithm checks for each of those productions of +// multi-line strings if it is *delimited*, *semi-delimited*, or +// *non-delimited*: +// +// - If it begins and ends with a delimiter, then it is *delimited*. 
In +// this case, the production can be converted to a TextMate rule with a +// begin/end pattern in a relatively simple way. +// +// - If it begins with a delimiter, but it does not end with a delimiter, +// then it is *semi-delimited*. In this case, the production can be +// converted to a TextMate rule with a begin/end pattern in a relatively +// complex way. +// +// - If it does not begin with a delimiter, then it is *non-delimited*. In +// this case, the production cannot be converted to a TextMate rule with +// a begin/end pattern. +// +// ### Delimited conversion, when the begin-delimiter is unique +// +// For instance: -layout Layout = (Comment | Space)* !>> "//" !>> [\ \t\n]; +lexical BlockComment = @category="comment" "/*" Print* "*/" ; -// `Comment` is *not* suitable for conversion, as it violates condition 2: the -// corresponding TextMate rule would involve matching against newlines. However, -// the matching against newlines is needed only for production `block`; not for -// production `line`. Thus, conversion at the granularity of Rascal -// non-terminals is actually too coarse. -// -// To solve this issue, the conversion algorithm works at the granularity of -// individual productions (specifically, `prod` constructors). For instance, -// production `line` of `Comment` is individually converted to the following -// match pattern, independently of production `block` (which is ignored): +// The Rascal production of `BlockComment` is suitable-for-conversion, +// multi-line (because `Print` can produce a newline), and delimited (by "/*" +// and "*/"). Moreover, the begin-delimiter is unique: there is no other Rascal +// production in the Rascal grammar that begins with "/*". 
The following +// TextMate rule is generated: // // ``` // { -// "match": "((?:\\u002F\\u002F)(?:(?:(?:[\\u0009-\\u0009]|[\\u0020-\\u0020])|(?:[\\u0030-\\u0039]|[\\u0041-\\u005A]|[\\u0061-\\u007A]))*?(?:$)))", -// "name": "prod(label(\"line\",lex(\"Comment\")),[lit(\"//\"),conditional(\\iter-star(alt({lex(\"Blank\"),lex(\"Alnum\")})),{\\end-of-line()})],{tag(\"category\"(\"comment.line.double-slash\"))})", -// "captures": { -// "1": { -// "name": "comment.line.double-slash" +// "name": "/inner/multi/blockcomment", +// "begin": "(\\/\\*)", +// "end": "(\\*\\/)", +// "beginCaptures": { "1": { "name": "comment" } }, +// "endCaptures": { "1": { "name": "comment" } }, +// "patterns": [ +// { +// "match": "([\\t-\\n\\x{20}0-9A-Za-z])", +// "captures": { "1": { "name": "comment" } } +// }, +// { +// "match": "([\\x{01}-\\x{10FFFF}])", +// "captures": { "1": { "name": "comment" } } // } -// } +// ] // } // ``` - - - -// ## Extension 2: Delimiter-sensitive conversion // -// The second extension (regular expressions; illustrative fragment) looks as -// follows: - -syntax RegExp = "/" RegExpBody "/"; +// Note: The purpose of the nested match patterns is to force the TextMate +// tokenizer to explicitly consume all input between the begin/end delimiters. +// The first nested match pattern is derived from the Rascal production of +// `Print`. The second nested match pattern is a default fallback. +// +// ### Delimited conversion, when the begin-delimiter is *not* unique +// +// For instance: -lexical RegExpBody - = @category="markup.italic" alnum: Alnum+ !>> [a-z A-Z 0-9] - | RegExpBody "?" - | RegExpBody "+" - | RegExpBody "|" RegExpBody +lexical String + = StringLeftRight // Without interpolation + | StringLeft (Identifier StringMid)* Identifier StringRight // With interpolation ; -// Production `alnum` of `RegExpBody` is suitable for conversion. However, -// except for the `@category` tag, it has exactly the same definition as the -// production of `Key` (above). 
Thus, if the conversion algorithm were to -// naively convert `alnum` to a TextMate rule, keys in maps would be tokenized -// accidentally as regular expressions (and mistakenly typeset in italics). -// -// To solve this issue, the conversion algorithm first heuristically checks for -// each suitable-for-conversion production if it is *enclosed by delimiters*. If -// so, instead of converting the production to a top-level match pattern, it is -// converted to a top-level begin/end pattern (for the enclosing delimiters) -// with a nested match pattern (for the production itself). As a result, the -// nested match pattern will be used for tokenization only between matches of -// the enclosing delimiters. For instance, production `alnum` is enclosed by an -// opening `/` and a closing `/`, so it is converted to the following top-level -// begin/end pattern with a nested match pattern: +lexical StringLeftRight = @category="string" "\"" Print* "\"" ; +lexical StringLeft = @category="string" "\"" Print* "\<" ; +lexical StringMid = @category="string" "\>" Print* "\<" ; +lexical StringRight = @category="string" "\>" Print* "\"" ; + +// The Rascal production of `StringMid` is suitable-for-conversion, multi-line +// (because `Print` can produce a newline), and delimited (by ">" and "<"). +// However, the begin-delimiter is not unique: there is another Rascal +// production in the Rascal grammar that begins with ">", namely the one of +// `StringRight`. 
The following *single* TextMate rule is generated that covers +// *both* Rascal productions: // // ``` // { -// "begin": "(?:\\u002F)", -// "end": "(?:\\u002F)", +// "name": "/inner/multi/stringmid,stringright", +// "begin": "(\\>)", +// "end": "((?:\\\")|(?:\\<))", +// "beginCaptures": { "1": { "name": "string" } }, +// "endCaptures": { "1": { "name": "string" } }, // "patterns": [ // { -// "match": "((?:[\\u0030-\\u0039]|[\\u0041-\\u005A]|[\\u0061-\\u007A])+?(?!(?:[\\u0030-\\u0039]|[\\u0041-\\u005A]|[\\u0061-\\u007A])))", -// "name": "prod(label(\"alnum\",lex(\"RegExpBody\")),[conditional(iter(lex(\"Alnum\")),{\\not-follow(\\char-class([range(48,57),range(65,90),range(97,122)]))})],{tag(\"category\"(\"markup.italic\"))})", -// "captures": { -// "1": { -// "name": "markup.italic" -// } -// } +// "match": "([\\t-\\n\\x{20}0-9A-Za-z])", +// "captures": { "1": { "name": "string" } } +// }, +// { +// "match": "([\\x{01}-\\x{10FFFF}])", +// "captures": { "1": { "name": "string" } } // } // ] // } // ``` // -// Note: If N suitable-for-conversion productions are enclosed by the same -// delimiters, then the conversion algorithm converts them into one top-level -// begin/end pattern with N nested match patterns (one for each production). - - - -// ## Extension 3: Delimiter conversion +// Note: Similarly, the Rascal productions of `StringLeftRight` and `StringLeft` +// are suitable-for-conversion, multi-line, and delimited. Moreover, their begin +// delimiter is "\"", while there is no other Rascal production in the Rascal +// grammar that begins with "\"". However, "\"" *does* occur as a non-begin +// delimiter elsewhere in the Rascal grammar: it is the end-delimiter of the +// Rascal production of `StringLeftRight`. Consequently, "\"" does *not* +// unmistakably indicate the beginning of `StringLeftRight` or `StringLeft`. To +// avoid multi-line tokenization mistakes, the Rascal productions of +// `StringLeftRight` and `StringLeft` are not converted to a TextMate rule. 
+// +// ### Semi-delimited conversion // -// The third extension (locations; illustrative fragment) looks as follows: +// For instance: + +syntax Tag + = @category="comment" "@" Alnum+ "=" Alnum+ + | @category="comment" "@" Alnum+ "{" Print* "}" + ; -syntax Location = "|" Segment "://" {Segment "/"}+ "|"; -lexical Segment = Alnum+ !>> [a-z A-Z 0-9]; +layout Layout = Space* !>> [\ \t\n]; -// The productions of `Location` and `Segment` are *not* suitable for -// conversion, as they violate condition 4. However, accidentally, the TextMate -// rule for production `line` of `Comment` (above) will actually be applicable -// to suffixes of locations (e.g., it matches `//bar/baz` in `|foo://bar/baz|`). -// Thus, suffixes of locations will mistakenly be highlighted as comments. -// -// To solve this issue, the conversion algorithm creates a synthetic production -// of the form `lit1 | lit2 | ...`, where each `lit` is a literal that occurs -// in the Rascal grammar, and: -// - it does not match `/^\w+$/` (i.e., it is a *delimiter literal*; e.g., -// `(`, `://`, and `,` are delimiter literals); -// - it is not a prefix of any other delimiter literal; -// - it does not occur at the start of a suitable-for-conversion production; -// - it does not enclose a suitable-for-conversion production. -// -// The synthetic production is converted to a TextMate rule (match pattern). The -// previous requirements for each `lit` are intended to ensure that only a -// single TextMate rule is applicable to each delimiter. For instance, the -// synthetic production in the example grammar is converted to the following -// match pattern: +// The Rascal productions of `Tag` are suitable-for-conversion and multi-line +// (because `Layout` can produce newlines). However, only the second production +// is delimited. This requires special care. 
The following TextMate rule, with +// several nested patterns, is generated: // // ``` // { -// "match": "(?:\\u002C)|(?:\\u002B)|(?:\\u002A\\u002F)|(?:\\u007D)|(?:\\u007C)|(?:\\u003F)|(?:\\u003A\\u002F\\u002F)|(?:\\u002F\\u002A)|(?:\\u007B)", -// "name": "prod(lex(\"delimiters\"),[alt({lit(\",\"),lit(\"+\"),lit(\"*/\"),lit(\"}\"),lit(\"|\"),lit(\"?\"),lit(\"://\"),lit(\"/*\"),lit(\"{\")})],{})" +// "name": "/inner/multi/tag.2,tag.1", +// "begin": "((?:\\@)(?:[\\t-\\n\\x{20}]*?(?![\\t-\\n\\x{20}]))(?:[0-9A-Za-z](?:(?:[\\t-\\n\\x{20}]*?(?![\\t-\\n\\x{20}]))[0-9A-Za-z])*?)(?:[\\t-\\n\\x{20}]*?(?![\\t-\\n\\x{20}])))", +// "end": "(?=.)", +// "beginCaptures": { "1": { "name": "comment" } }, +// "endCaptures": {}, +// "applyEndPatternLast": true, +// "patterns": [ +// { +// "begin": "(\\{)", +// "end": "(\\})", +// "beginCaptures": { "1": { "name": "comment" } }, +// "endCaptures": { "1": { "name": "comment" } }, +// "patterns": [ +// { +// "match": "([\\t-\\n\\x{20}])", +// "captures": { "1": { "name": "comment" } } +// }, +// { +// "match": "([\\x{01}-\\x{10FFFF}])", +// "captures": { "1": { "name": "comment" } } +// } +// ] +// }, +// { +// "match": "(\\=)", +// "captures": { "1": { "name": "comment" } } +// } +// ] // } // ``` // -// Note: The intent of this match pattern is *not* to assign a category. The -// only purpose is to force the tokenization engine to consume -// "highlighting-insignificant" delimiters before they are accidentally -// tokenized and mistakenly highlighted. - +// Note: The begin pattern matches the common *prefix* of the two Rascal +// productions. The two nested patterns correspond to the different *suffixes*. +// ---------------------------------------------------- +// ## Advanced conversion: single/multi-line, recursive +// +// Semi-delimited conversion (explained above) has limited support for +// user-defined Rascal productions that are recursive. Other than that, +// recursion is not yet supported. 
-// ## Extension 4: Keyword coversion +// ----------------------------------------- +// ## Advanced conversion: context detection // -// The fourth extension (booleans) of the grammar looks as follows: +// Sometimes, highlighting depends on the context in which the tokenization +// input occurs. For instance: -lexical Boolean - = "true" - | "false" +lexical RegExp = "/" RegExpBody "/"; + +lexical RegExpBody + = @category="string" alnum: Alnum+ !>> [0-9 A-Z a-z] + | RegExpBody "?" + | RegExpBody "+" + | RegExpBody "|" RegExpBody ; -// The productions of `Boolean` are *not* suitable for conversion, as they -// violate condition 4. However, by default, literals like these should be -// highlighted as keywords. +// Rascal production `alnum` of `RegExpBody` is suitable-for-conversion. +// However, except for the `@category` tag, it has exactly the same definition +// as the production of `Identifier` (above). If the conversion algorithm were +// to naively convert `alnum` to a TextMate rule, identifiers would be +// mistakenly highlighted as strings. +// +// Thus, the conversion algorithm first heuristically checks for each Rascal +// production that is suitable-for-conversion if it is *enclosed by delimiters*. +// If so, it is converted to an *outer* TextMate rule with a begin/end pattern +// to match the enclosing delimiters (i.e., context detection) and include +// patterns to toggle *inner* TextMate rules. That is, the inner TextMate rules +// are used for tokenization only between matches of the enclosing delimiters +// (i.e., in the right context). // -// To solve this issue, the conversion algorithm creates a synthetic production -// of the form `lit1 | lit2 | ...`, where each `lit` is a literal that occurs -// in the input grammar, and `lit` matches `/^\w+$/` (i.e., it is a *keyword -// literal*; e.g., `true` and `false`). The synthetic production is converted to -// a TextMate rule (match pattern). 
For instance, the synthetic production in -// the example grammar is converted to the following match pattern: +// For instance, production `alnum` is enclosed by an opening `/` and a closing +// `/`. The following outer TextMate rule is generated: // // ``` // { -// "match": "((?:\\b\\u0074\\u0072\\u0075\\u0065\\b)|(?:\\b\\u0066\\u0061\\u006C\\u0073\\u0065\\b))", -// "name": "prod(lex(\"keywords\"),[alt({lit(\"true\"),lit(\"false\")})],{tag(\"category\"(\"keyword.control\"))})", -// "captures": { -// "1": { -// "name": "keyword.control" -// } -// } +// "name": "/outer//", +// "begin": "(?:\\/)", +// "end": "(?:\\/)", +// "beginCaptures": {}, +// "endCaptures": {}, +// "patterns": [ +// { "include": "#/inner/single/$delimiters" }, +// { "include": "#/inner/single/regexpbody.alnum" }, +// { "include": "#/inner/single/$keywords" } +// ] // } // ``` - - +// +// Note: If N Rascal productions are enclosed by the same delimiters, then the +// conversion algorithm converts them into one outer TextMate rule and N inner +// TextMate rules. // ## Tests // // The following code tests the conversion algorithm on input of the grammar // defined above. 
-Grammar rsc = preprocess(grammar(#Value)); +start syntax Start + = Identifier + | Chars + | Number + | BooleanExpr + | LineComment + | Location + | BlockComment + | String + | Tag + | RegExp ; + +Grammar rsc = preprocess(grammar(#Start)); list[ConversionUnit] units = [ - unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit(","),lit("+"),lit("}"),lit("|"),lit("?"),lit("{"),lit("://")})],{}), false, false, , ), - unit(rsc, prod(label("line",lex("Comment")),[lit("//"),conditional(\iter-star(alt({lex("Blank"),lex("Alnum")})),{\end-of-line()})],{\tag("category"("comment.line.double-slash"))}), false, false, , ), - unit(rsc, prod(label("block",lex("Comment")),[lit("/*"),\iter-star(alt({lex("Alnum"),lex("Space")})),lit("*/")],{\tag("category"("comment.block"))}), false, true, , ), - unit(rsc, prod(label("alnum",lex("RegExpBody")),[conditional(iter(lex("Alnum")),{\not-follow(\char-class([range(48,57),range(65,90),range(97,122)]))})],{\tag("category"("markup.italic"))}), false, false, , ), - unit(rsc, prod(lex("String"),[lit("\""),\iter-star(lex("Alnum")),lit("\"")],{\tag("category"("string.quoted.double"))}), false, false, , ), + unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit(")"),lit("("),lit("+"),lit("="),lit("|"),lit("?"),lit("{"),lit("://")})],{}), false, false, , ), + unit(rsc, prod(sort("Tag"),[lit("@"),layouts("Layout"),\iter-seps(lex("Alnum"),[layouts("Layout")]),layouts("Layout"),lit("="),layouts("Layout"),\iter-seps(lex("Alnum"),[layouts("Layout")])],{\tag("category"("comment"))}), false, true, , ), + unit(rsc, prod(sort("Tag"),[lit("@"),layouts("Layout"),\iter-seps(lex("Alnum"),[layouts("Layout")]),layouts("Layout"),lit("{"),layouts("Layout"),\iter-star-seps(lex("Print"),[layouts("Layout")]),layouts("Layout"),lit("}")],{\tag("category"("comment"))}), false, true, , ), + unit(rsc, prod(lex("StringMid"),[lit("\>"),\iter-star(lex("Print")),lit("\<")],{\tag("category"("string"))}), false, true, "))>, ")),just(lit("\<"))>), + unit(rsc, 
prod(lex("StringRight"),[lit("\>"),\iter-star(lex("Print")),lit("\"")],{\tag("category"("string"))}), false, true, , ")),just(lit("\""))>), + unit(rsc, prod(lex("LineComment"),[lit("//"),conditional(\iter-star(alt({lex("Alnum"),lex("Space")})),{\end-of-line()})],{\tag("category"("comment"))}), false, true, , ), + unit(rsc, prod(lex("BlockComment"),[lit("/*"),\iter-star(lex("Print")),lit("*/")],{\tag("category"("comment"))}), false, true, , ), + unit(rsc, prod(label("alnum",lex("RegExpBody")),[conditional(iter(lex("Alnum")),{\not-follow(\char-class([range(48,57),range(65,90),range(97,122)]))})],{\tag("category"("string"))}), false, false, , ), + unit(rsc, prod(lex("StringLeft"),[lit("\""),\iter-star(lex("Print")),lit("\<")],{\tag("category"("string"))}), false, true, "))>, ), + unit(rsc, prod(lex("StringLeftRight"),[lit("\""),\iter-star(lex("Print")),lit("\"")],{\tag("category"("string"))}), false, true, , ), unit(rsc, prod(lex("Number"),[conditional(iter(lex("Digit")),{\not-follow(\char-class([range(48,57)]))})],{\tag("category"("constant.numeric"))}), false, false, , ), - unit(rsc, prod(lex(KEYWORDS_PRODUCTION_NAME),[alt({lit("true"),lit("false")})],{\tag("category"("keyword.control"))}), false, false, , ) + unit(rsc, prod(lex(KEYWORDS_PRODUCTION_NAME),[alt({lit("true"),lit("false"),lit("else"),lit("then"),lit("if")})],{\tag("category"("keyword.control"))}), false, false, , ) ]; test bool analyzeTest() = doAnalyzeTest(rsc, units, name = "Walkthrough"); -test bool transformTest() = doTransformTest(units, <7, 2, 0>, name = "Walkthrough"); \ No newline at end of file +test bool transformTest() = doTransformTest(units, <12, 6, 0>, name = "Walkthrough"); + +bool convertAndPrint() { + println(toJSON(toTmGrammar(grammar(#Start), "Walkthrough", nameGeneration = short()))); + return true; +}