From 2e6f8b27ad5bed3c689db206669e4e17782ff9c6 Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Wed, 25 Sep 2024 14:52:42 +0200 Subject: [PATCH 1/5] Update documentation in module `lang::textmate::Conversion` --- .../main/rascal/lang/textmate/Conversion.rsc | 40 +++++++++++++------ 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc index ab36aa2..a219af5 100644 --- a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc @@ -31,14 +31,17 @@ alias RscGrammar = Grammar; } @description{ - The conversion consists of two stages: + The conversion consists of three stages: + - preprocessing (function `preprocess`); - analysis (function `analyze`); - transformation (function `transform`). - The aim of the analysis stage is to select those productions of the Rascal - grammar that are "suitable for conversion" to TextMate rules. The aim of the - transformation stage is to subsequently convert those productions and - produce a TextMate grammar. + The aim of the preprocessing stage is to slightly massage the Rascal grammar + to make analysis and transformation easier (e.g., replace singleton ranges + with just the corresponding literal). The aim of the analysis stage is to + select those productions of the Rascal grammar that are "suitable for + conversion" to TextMate rules. The aim of the transformation stage is to + subsequently convert those productions and produce a TextMate grammar. To be able to cleanly separate analysis and transformation, productions selected during the analysis stage are wrapped into *conversion units* that @@ -72,8 +75,6 @@ RscGrammar preprocess(RscGrammar rsc) { Each production in the list (including the synthetic ones) is *suitable for conversion* to a TextMate rule. 
A production is "suitable for conversion" when it satisfies each of the following conditions: - - it is non-recursive; - - it does not match newlines; - it does not match the empty word; - it has a `@category` tag. @@ -84,23 +85,31 @@ RscGrammar preprocess(RscGrammar rsc) { The analysis consists of three stages: 1. selection of user-defined productions; 2. creation of synthetic delimiters production; - 3. creation of synthetic keywords production. + 3. creation of synthetic keywords production; + 4. wrapping of productions inside conversion units. In stage 1, a dependency graph among all productions that occur in `rsc` (specifically: `prod` constructors) is created. This dependency graph is subsequently pruned to keep only the suitable-for-conversion productions: - - first, productions with a cyclic dependency on themselves are removed; - - next, productions that only involve single-line matching are retained; - - next, productions that only involve non-empty word matching are retained; + - first, productions that involve non-empty word matching are retained; - next, productions that have a `@category` tag are retained. + The resulting list of productions is split into lists of recursive + productions and non-recursive productions. In stage 2, the set of all delimiters that occur in `rsc` is created. This set is subsequently reduced by removing: - strict prefixes of delimiters; - - delimiters that enclose user-defined productions; - - delimiters that occur at the beginning of user-defined productions. + - delimiters that also occur as outer delimiters of + suitable-for-conversion productions; + - delimiters that also occur as inner delimiters of + suitable-for-conversion productions. In stage 3, the set of all keywords that occur in `rsc` is created. + + In stage 4, each suitable-for-conversion production is wrapped in a + conversion unit with additional metadata (e.g., the inner/outer delimiters + of the production). 
The list of conversion units is subsequently reduced + by removing strict prefixes, and sorted. } list[ConversionUnit] analyze(RscGrammar rsc) { @@ -154,6 +163,11 @@ list[ConversionUnit] analyze(RscGrammar rsc) { The transformation consists of two stages: 1. creation of TextMate rules; 2. composition of TextMate rules into a TextMate grammar. + + Stage 1 is organizes as a pipeline that, step-by-step, adds names and rules + to the conversion units. First, it adds unique names. Next, it adds "inner + rules". Last, it adds "outer rules". See module + `lang::textmate::ConversionUnit` for an explanation of inner/outer rules. } TmGrammar transform(list[ConversionUnit] units, NameGeneration nameGeneration = long()) { From 3f6cc633eddfaedbe6616bc7d20aeb8f7738bff7 Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Fri, 15 Nov 2024 12:30:22 +0100 Subject: [PATCH 2/5] Update documentation in `lang::textmate::Conversion` --- .../main/rascal/lang/textmate/Conversion.rsc | 42 +++++++++---------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc index 9358254..3c78775 100644 --- a/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc @@ -172,18 +172,16 @@ private RscGrammar replaceLegacySemanticTokenTypes(RscGrammar rsc) @description{ The analysis consists of three stages: - 1. selection of user-defined productions; - 2. creation of synthetic delimiters production; - 3. creation of synthetic keywords production; - 4. wrapping of productions inside conversion units. - - In stage 1, a dependency graph among all productions that occur in `rsc` - (specifically: `prod` constructors) is created. 
This dependency graph is - subsequently pruned to keep only the suitable-for-conversion productions: - - first, productions that involve non-empty word matching are retained; - - next, productions that have a `@category` tag are retained. - The resulting list of productions is split into lists of recursive - productions and non-recursive productions. + 1. selection of user-defined productions; + 2. creation of synthetic delimiters production; + 3. creation of synthetic keywords production; + 4. wrapping of productions inside conversion units. + + In stage 1, each user-defined production (specifically: `prod` constructor) + that occurs in `rsc` is selected for conversion when it fulfils the + following requirements: + - it has a unique `@category` tag; + - it doesn't match the empty word. In stage 2, the set of all delimiters that occur in `rsc` is created. This set is subsequently reduced by removing: @@ -205,11 +203,11 @@ list[ConversionUnit] analyze(RscGrammar rsc, str name) { str jobLabel = "Analyzing)">"; jobStart(jobLabel, work = 6); - // Analyze productions + // Stage 1: Analyze productions jobStep(jobLabel, "Analyzing productions"); list[Production] prods = [p | /p: prod(_, _, _) <- rsc]; - // Analyze categories + // Stage 1: Analyze categories jobStep(jobLabel, "Analyzing categories"); prods = for (p <- prods) { @@ -230,11 +228,11 @@ list[ConversionUnit] analyze(RscGrammar rsc, str name) { append p; } - // Analyze emptiness + // Stage 1: Analyze emptiness jobStep(jobLabel, "Analyzing emptiness"); prods = [p | p <- prods, !tryParse(rsc, delabel(p.def), "")]; - // Analyze delimiters + // Stage 2: Analyze delimiters jobStep(jobLabel, "Analyzing delimiters"); set[Symbol] delimiters = {s | /Symbol s := rsc, isDelimiter(delabel(s))}; delimiters &= removeStrictPrefixes(delimiters); @@ -242,12 +240,12 @@ list[ConversionUnit] analyze(RscGrammar rsc, str name) { delimiters -= {s | p <- prods, /just(s) := getInnerDelimiterPair(rsc, p, getOnlyFirst = true)}; 
list[Production] prodsDelimiters = [prod(lex(DELIMITERS_PRODUCTION_NAME), [\alt(delimiters)], {})]; - // Analyze keywords + // Stage 3: Analyze keywords jobStep(jobLabel, "Analyzing keywords"); set[Symbol] keywords = {s | /Symbol s := rsc, isKeyword(delabel(s))}; list[Production] prodsKeywords = [prod(lex(KEYWORDS_PRODUCTION_NAME), [\alt(keywords)], {\tag("category"("keyword.control"))})]; - // Prepare units + // Stage 4: Prepare units jobStep(jobLabel, "Preparing units"); bool isEmptyProd(prod(_, [\alt(alternatives)], _)) = alternatives == {}; @@ -269,10 +267,10 @@ list[ConversionUnit] analyze(RscGrammar rsc, str name) { @description{ The transformation consists of two stages: - 1. creation of TextMate rules; - 2. composition of TextMate rules into a TextMate grammar. - - Stage 1 is organizes as a pipeline that, step-by-step, adds names and rules + 1. creation of TextMate rules; + 2. composition of TextMate rules into a TextMate grammar. + + Stage 1 is organized as a pipeline that, step-by-step, adds names and rules to the conversion units. First, it adds unique names. Next, it adds "inner rules". Last, it adds "outer rules". See module `lang::textmate::ConversionUnit` for an explanation of inner/outer rules. 
From a3cd944b09413a684004648623e9c13dec7e5342 Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Fri, 15 Nov 2024 12:30:57 +0100 Subject: [PATCH 3/5] Remove leftover comment in `lang::textmate::NameGeneration` --- .../src/main/rascal/lang/textmate/NameGeneration.rsc | 3 --- 1 file changed, 3 deletions(-) diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/NameGeneration.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/NameGeneration.rsc index 73c6a85..b3a3c30 100644 --- a/rascal-textmate-core/src/main/rascal/lang/textmate/NameGeneration.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/textmate/NameGeneration.rsc @@ -49,9 +49,6 @@ alias NameGenerator = str(Production); NameGenerator newNameGenerator(list[Production] prods, short()) { - // Define auxiliary functions to compute names for symbols - - // Define auxiliary function to count the number of occurrences of a name int count(str name) = (0 | it + 1 | p <- prods, toName(p.def) == name); From 852d0982424c3abcde6085062daa0e7e0e1f8688 Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Fri, 15 Nov 2024 18:03:33 +0100 Subject: [PATCH 4/5] Update walkthrough --- .../textmate/conversiontests/Walkthrough.rsc | 633 ++++++++++++------ 1 file changed, 411 insertions(+), 222 deletions(-) diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Walkthrough.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Walkthrough.rsc index 27ccfd9..52cf7d7 100644 --- a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Walkthrough.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Walkthrough.rsc @@ -24,42 +24,43 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
} -// # Walkthrough -// -// This module consists of a walkthrough to explain the main ideas behind the -// conversion algorithm, from Rascal grammars to TextMate grammars. -// -// The walkthrough is split into five parts. The initial part explains basic -// conversion. The subsequent four parts present complications and demonstrate -// extensions of the conversion algorithm to address them. -// -// The toy language considered, is a simple data language consisting of: -// - base: maps, numbers, strings; -// - extension 1: layout (including comments); -// - extension 2: regular expressions; -// - extension 3: locations; -// - extension 4: booleans. -// -// Working familiarity with TextMate grammar is assumed. To summarize: -// -// - Syntax: -// - Each TextMate grammar consists of a list of TextMate rules (ordered). -// - Each TextMate rule is either a *match pattern* (consisting of one -// regular expression) or a *begin/end pattern* (consisting of two -// regular expressions and a list of nested TextMate rules). -// -// - Semantics: A tokenization engine reads a document (line by line, top to -// bottom, left to right), while iteratively trying to apply TextMate rules -// by matching text against the regular expressions. -// -// Further reading: -// - https://macromates.com/manual/en/language_grammars -// - https://www.apeth.com/nonblog/stories/textmatebundle.html -// - https://code.visualstudio.com/api/language-extensions/syntax-highlight-guide +@description{ +This module consists of a walkthrough to explain the main ideas behind the +conversion algorithm, from Rascal grammars to TextMate grammars. + +The walkthrough is split into five parts. The initial part explains basic +conversion. The subsequent four parts present complications and demonstrate +extensions of the conversion algorithm to address them. 
+ +The toy language considered, is a simple data language consisting of: + - base: maps, numbers, strings; + - extension 1: layout (including comments); + - extension 2: regular expressions; + - extension 3: locations; + - extension 4: booleans. + +Working familiarity with TextMate grammar is assumed. To summarize: + + - Syntax: + - Each TextMate grammar consists of a list of TextMate rules (ordered). + - Each TextMate rule is either a *match pattern* (consisting of one + regular expression) or a *begin/end pattern* (consisting of two + regular expressions and a list of nested TextMate rules). + + - Semantics: A tokenization engine reads a document (line by line, top to + bottom, left to right), while iteratively trying to apply TextMate rules + by matching text against the regular expressions. + +Further reading: + - https://macromates.com/manual/en/language_grammars + - https://www.apeth.com/nonblog/stories/textmatebundle.html + - https://code.visualstudio.com/api/language-extensions/syntax-highlight-guide +} module lang::textmate::conversiontests::Walkthrough import Grammar; +import IO; import ParseTree; import util::Maybe; @@ -67,261 +68,449 @@ import lang::textmate::Conversion; import lang::textmate::ConversionConstants; import lang::textmate::ConversionTests; import lang::textmate::ConversionUnit; +import lang::textmate::NameGeneration; +import lang::textmate::Grammar; import lang::rascal::grammar::analyze::Delimiters; -// ## Basics +// ---------- +// ## Preface // -// The base fragment of the grammar looks as follows: +// The following lexicals will be used as terminals (in addition to literals): -lexical Alnum = [a-z A-Z 0-9]; -lexical Digit = [0-9]; -lexical Blank = [\ \t]; -lexical Space = [\ \t\n]; +lexical Alnum = [0-9 A-Z a-z] ; +lexical Digit = [0-9] ; +lexical Blank = [\ \t] ; +lexical Space = [\ \t\n] ; +lexical Print = [0-9 A-Z a-z \ \t\n]; -start syntax Value - = Map - | Number - | String - | RegExp - | Location - | Boolean - ; - -syntax Map = "{" 
{(Key ":" Value) ","}* "}"; +// ----------------------------------------------- +// ## Basic conversion: single-line, non-recursive +// +// ### User-defined productions +// +// Basically, the conversion algorithm analyzes the Rascal grammar to find each +// user-defined Rascal production that is *suitable for conversion* to a +// TextMate rule. Roughly, a production is said to have that property when: +// - it has a category; +// - it does not produce the empty word. +// +// For instance: -lexical Key = Alnum+ !>> [a-z A-Z 0-9]; -lexical Number = @category="constant.numeric" Digit+ !>> [0-9]; -lexical String = @category="string.quoted.double" "\"" Alnum* "\""; +lexical Identifier = Alnum+ !>> [0-9 A-Z a-z] ; +lexical Chars = @category="string" Alnum* ; +lexical Number = @category="constant.numeric" Digit+ !>> [0-9] ; -// Basically, the conversion algorithm converts each Rascal non-terminal that is -// *suitable for conversion* to a TextMate rule. For instance, `Number` is -// converted to the following match pattern (in JSON): +// The Rascal productions of `Identifier` (does not have a category) and `Chars` +// (produces the empty word) are not suitable-for-conversion. In contrast, the +// Rascal production of `Number` is suitable-for-conversion. The following +// TextMate rule is generated: // // ``` // { -// "match": "([\\u0030-\\u0039]+?(?![\\u0030-\\u0039]))", -// "name": "prod(lex(\"Number\"),[conditional(iter(lex(\"Digit\")),{\\not-follow(\\char-class([range(48,57)]))})],{tag(\"category\"(\"constant.numeric\"))})", -// "captures": { -// "1": { -// "name": "constant.numeric" -// } -// } +// "name": "/inner/single/number", +// "match": "([0-9]+?(?![0-9]))", +// "captures": { "1": { "name": "constant.numeric" } } // } // ``` // -// Note: The regular expression (`match` property) is written in Oniguruma -// format (following the TextMate grammar specification), using code units -// instead of alphanumericals. 
-// // Note: The name (`name` property) could be anything, but to simplify -// debugging, the conversion algorithm uses (part of) the internal -// representation of the Rascal non-terminal as the name. +// debugging, the conversion algorithm uses a description of the Rascal +// production. // -// In general, a Rascal non-terminal is "suitable for conversion" when it -// satisfies each of the following conditions: +// Note: The regular expression (`match` property) is written in *Oniguruma* +// format (following the TextMate grammar specification). + +// ### Keywords // -// 1. It is non-recursive. (Recursion is prohibitively hard to faithfully -// convert; tail-recursion could be workable, but currently out of scope.) +// Sometimes, literals that qualify as *keywords* do not have a corresponding +// category and would not be highlighted. For instance: + +syntax BooleanExpr + = "true" + | "false" + | "if" BooleanExpr "then" BooleanExpr "else" BooleanExpr + | "(" BooleanExpr ")" + ; + +// Despite not having a category, though, these literals should be highlighted. // -// 2. It does not match newlines. (TextMate rules that involve matching against -// newlines are problematic because the tokenization engine operates line by -// line.) +// Thus, the conversion algorithm: +// - analyzes the Rascal grammar to find each literal that qualifies as a +// keyword (according to function `isKeyword` in module +// `lang::rascal::grammar:analyze::Delimiters`); +// - collects these literals in a synthetic Rascal production of the form +// `lit1 | lit2 | ...` (suitable-for-conversion by construction); +// - converts that production to a TextMate rule. // -// 3. It does not match the empty word. (TextMate rules that involve matching -// against the empty word are problematic because they match at every -// position.) +// For instance, the literals that qualify as keywords in the Rascal productions +// of `BooleanExpr` are "true", "false", "if", "then", and "else". 
The following
// TextMate rule is generated:
//
// ```
// {
//   "name": "/inner/single/$keywords",
//   "match": "((?:\\btrue\\b)|(?:\\bfalse\\b)|(?:\\belse\\b)|(?:\\bthen\\b)|(?:\\bif\\b))",
//   "captures": { "1": { "name": "keyword.control" } }
// }
// ```

// ### Delimiters
//
// Sometimes, literals that qualify as *delimiters* might confuse the TextMate
// tokenizer. For instance:

lexical LineComment = @category="comment" "//" (Alnum | Space)* $ ;
lexical Location = "|" Alnum+ "://" Alnum+ "|";

// The Rascal production of `LineComment` is suitable-for-conversion, while the
// Rascal production of `Location` (does not have a category) is not.
If only
// the `LineComment` Rascal production were to be converted to a TextMate rule,
// then substring "//Desktop" of input string "|home://Desktop|" would be
// mistakenly tokenized as a comment.
//
// Thus, the conversion algorithm:
//   - analyzes the Rascal grammar to find each literal that qualifies as a
//     delimiter (according to function `isDelimiter` in module
//     `lang::rascal::grammar::analyze::Delimiters`);
//   - collects these literals in a synthetic Rascal production of the form
//     `lit1 | lit2 | ...` (suitable-for-conversion by construction);
//   - converts that production to a TextMate rule.
//
// For instance, the literals that qualify as delimiters in the Rascal
// productions of `LineComment` and `Location` are "//", "://", and "|". The
// following TextMate rule is generated:
//
// ```
// {
//   "name": "/inner/single/$delimiters",
//   "match": "(?:\\/\\/)|(?:\\|)|(?:\\:\\/\\/)",
//   "captures": {}
// }
// ```
//
// Note: The intent of this TextMate rule is *not* to assign a scope. This is
// why the `captures` property is empty. The only purpose of this TextMate rule
// is to force the TextMate tokenizer to consume highlighting-neutral delimiters
// before they are accidentally tokenized and mistakenly highlighted.
+// +// Note: To ensure that each delimiter is matched by at most one TextMate rule, +// each delimiter literal needs to fulfil a number of additional requirements to +// be included in the synthetic Rascal production (e.g., it must not be the +// prefix of any other delimiter literal). -syntax RegExp = "/" RegExpBody "/"; +// ------------------------------------------------- +// ## Advanced conversion: multi-line, non-recursive +// +// ### Approach +// +// To convert user-defined Rascal productions of strings that potentially span +// multiple lines, more advanced machinery is needed. This is because individual +// TextMate rules with match patterns cannot be used to match strings that span +// multiple lines (i.e., newlines cannot be matched by individual regular +// expressions in a TextMate grammar). Instead, TextMate rule with begin/end +// patterns needs to be used. The approach is roughly as follows: +// +// - First, the conversion algorithm analyzes the Rascal grammar to find each +// user-defined suitable-for-conversion Rascal production. +// +// - Next, the conversion algorithm optimistically converts each production -- +// *including* those of strings that potentially span multiple lines -- to a +// TextMate rule with a match pattern. The rationale is that single-line is +// an important special case of multi-line, so a TextMate rule with a match +// pattern can already be quite effective (even if it does not cover strings +// that span multiple lines). +// +// - Next, the conversion algorithm checks for each production if it is +// *delimited*, *semi-delimited*, or *non-delimited*: +// +// - If it begins and ends with a delimiter, then it is *delimited*. In +// this case, the production can be converted to a TextMate rule with a +// begin/end pattern in a relatively *simple* way. +// +// - If it begins with a delimiter, but it does not end with a delimiter, +// then it is *semi-delimited*. 
In this case, the production can be +// converted to a TextMate rule with a begin/end pattern in a relatively +// *complex* way. +// +// - If it does not begin with a delimiter, then it is *non-delimited*. In +// this case, the production cannot be converted to a TextMate rule with +// a begin/end pattern. +// +// ### Delimited conversion when the begin-delimiter is unique +// +// For instance: -lexical RegExpBody - = @category="markup.italic" alnum: Alnum+ !>> [a-z A-Z 0-9] - | RegExpBody "?" - | RegExpBody "+" - | RegExpBody "|" RegExpBody - ; +lexical BlockComment = @category="comment" "/*" Print* "*/" ; -// Production `alnum` of `RegExpBody` is suitable for conversion. However, -// except for the `@category` tag, it has exactly the same definition as the -// production of `Key` (above). Thus, if the conversion algorithm were to -// naively convert `alnum` to a TextMate rule, keys in maps would be tokenized -// accidentally as regular expressions (and mistakenly typeset in italics). -// -// To solve this issue, the conversion algorithm first heuristically checks for -// each suitable-for-conversion production if it is *enclosed by delimiters*. If -// so, instead of converting the production to a top-level match pattern, it is -// converted to a top-level begin/end pattern (for the enclosing delimiters) -// with a nested match pattern (for the production itself). As a result, the -// nested match pattern will be used for tokenization only between matches of -// the enclosing delimiters. For instance, production `alnum` is enclosed by an -// opening `/` and a closing `/`, so it is converted to the following top-level -// begin/end pattern with a nested match pattern: +// The Rascal production of `BlockComment` is suitable-for-conversion, +// multi-line (because `Print` can produce a newline), and delimited (by "/*" +// and "*/"). Moreover, the begin-delimiter is unique: there is no other Rascal +// production that begins with "/*". 
The following TextMate rule is generated: // // ``` // { -// "begin": "(?:\\u002F)", -// "end": "(?:\\u002F)", +// "name": "/inner/multi/blockcomment", +// "begin": "(\\/\\*)", +// "end": "(\\*\\/)", +// "beginCaptures": { "1": { "name": "comment" } }, +// "endCaptures": { "1": { "name": "comment" } }, // "patterns": [ // { -// "match": "((?:[\\u0030-\\u0039]|[\\u0041-\\u005A]|[\\u0061-\\u007A])+?(?!(?:[\\u0030-\\u0039]|[\\u0041-\\u005A]|[\\u0061-\\u007A])))", -// "name": "prod(label(\"alnum\",lex(\"RegExpBody\")),[conditional(iter(lex(\"Alnum\")),{\\not-follow(\\char-class([range(48,57),range(65,90),range(97,122)]))})],{tag(\"category\"(\"markup.italic\"))})", -// "captures": { -// "1": { -// "name": "markup.italic" -// } -// } +// "match": "([\\t-\\n\\x{20}0-9A-Za-z])", +// "captures": { "1": { "name": "comment" } } +// }, +// { +// "match": "([\\x{01}-\\x{10FFFF}])", +// "captures": { "1": { "name": "comment" } } // } // ] // } // ``` // -// Note: If N suitable-for-conversion productions are enclosed by the same -// delimiters, then the conversion algorithm converts them into one top-level -// begin/end pattern with N nested match patterns (one for each production). - - - -// ## Extension 3: Delimiter conversion +// Note: The purpose of the nested match patterns is to force the TextMate +// tokenizer to consume input between the begin/end delimiters. The first nested +// match pattern is derived from the Rascal production of `Print`. The second +// nested match pattern is a default fallback. 
+// +// ### Delimited conversion when the begin-delimiter is *not* unique // -// The third extension (locations; illustrative fragment) looks as follows: +// For instance: -syntax Location = "|" Segment "://" {Segment "/"}+ "|"; -lexical Segment = Alnum+ !>> [a-z A-Z 0-9]; +lexical String + = StringLeftRight // Without interpolation + | StringLeft (Identifier StringMid)* Identifier StringRight // With interpolation + ; -// The productions of `Location` and `Segment` are *not* suitable for -// conversion, as they violate condition 4. However, accidentally, the TextMate -// rule for production `line` of `Comment` (above) will actually be applicable -// to suffixes of locations (e.g., it matches `//bar/baz` in `|foo://bar/baz|`). -// Thus, suffixes of locations will mistakenly be highlighted as comments. -// -// To solve this issue, the conversion algorithm creates a synthetic production -// of the form `lit1 | lit2 | ...`, where each `lit` is a literal that occurs -// in the Rascal grammar, and: -// - it does not match `/^\w+$/` (i.e., it is a *delimiter literal*; e.g., -// `(`, `://`, and `,` are delimiter literals); -// - it is not a prefix of any other delimiter literal; -// - it does not occur at the start of a suitable-for-conversion production; -// - it does not enclose a suitable-for-conversion production. -// -// The synthetic production is converted to a TextMate rule (match pattern). The -// previous requirements for each `lit` are intended to ensure that only a -// single TextMate rule is applicable to each delimiter. 
For instance, the -// synthetic production in the example grammar is converted to the following -// match pattern: +lexical StringLeftRight = @category="string" "\"" Print* "\"" ; +lexical StringLeft = @category="string" "\"" Print* "\<" ; +lexical StringMid = @category="string" "\>" Print* "\<" ; +lexical StringRight = @category="string" "\>" Print* "\"" ; + +// The Rascal production of `StringMid` is suitable-for-conversion, multi-line +// (because `Print` can produce a newline), and delimited (by ">" and "<"). +// However, the begin-delimiter is not unique: there is another Rascal +// production that begins with ">", namely the one of `StringRight`. The +// following *single* TextMate rule is generated for *both* Rascal productions: // // ``` // { -// "match": "(?:\\u002C)|(?:\\u002B)|(?:\\u002A\\u002F)|(?:\\u007D)|(?:\\u007C)|(?:\\u003F)|(?:\\u003A\\u002F\\u002F)|(?:\\u002F\\u002A)|(?:\\u007B)", -// "name": "prod(lex(\"delimiters\"),[alt({lit(\",\"),lit(\"+\"),lit(\"*/\"),lit(\"}\"),lit(\"|\"),lit(\"?\"),lit(\"://\"),lit(\"/*\"),lit(\"{\")})],{})" +// "name": "/inner/multi/stringmid,stringright", +// "begin": "(\\>)", +// "end": "((?:\\\")|(?:\\<))", +// "beginCaptures": { "1": { "name": "string" } }, +// "endCaptures": { "1": { "name": "string" } }, +// "patterns": [ +// { +// "match": "([\\t-\\n\\x{20}0-9A-Za-z])", +// "captures": { "1": { "name": "string" } } +// }, +// { +// "match": "([\\x{01}-\\x{10FFFF}])", +// "captures": { "1": { "name": "string" } } +// } +// ] // } // ``` // -// Note: The intent of this match pattern is *not* to assign a category. The -// only purpose is to force the tokenization engine to consume -// "highlighting-insignificant" delimiters before they are accidentally -// tokenized and mistakenly highlighted. - - - -// ## Extension 4: Keyword coversion +// Note: Similarly, the Rascal productions of `StringLeftRight` and `StringLeft` +// are suitable-for-conversion, multi-line, and delimited. 
Moreover, their begin +// delimiter is "\"", while there is no other Rascal production that begins with +// "\"". However, "\"" *does* occur as a non-begin delimiter elsewhere in the +// Rascal grammar: it is the end delimiter of the Rascal production of +// `StringLeftRight` itself. Consequently, "\"" does *not* unmistakenly indicate +// the beginning of a string. To avoid multi-line tokenization mistakes, the +// Rascal productions of `StringLeftRight` and `StringLeft` are not converted to +// a TextMate rule. // -// The fourth extension (booleans) of the grammar looks as follows: +// ### Semi-delimited conversion +// +// For instance: -lexical Boolean - = "true" - | "false" +syntax Tag + = @category="comment" "@" Alnum+ "=" Alnum+ + | @category="comment" "@" Alnum+ "{" Print* "}" ; -// The productions of `Boolean` are *not* suitable for conversion, as they -// violate condition 4. However, by default, literals like these should be -// highlighted as keywords. -// -// To solve this issue, the conversion algorithm creates a synthetic production -// of the form `lit1 | lit2 | ...`, where each `lit` is a literal that occurs -// in the input grammar, and `lit` matches `/^\w+$/` (i.e., it is a *keyword -// literal*; e.g., `true` and `false`). The synthetic production is converted to -// a TextMate rule (match pattern). For instance, the synthetic production in -// the example grammar is converted to the following match pattern: +layout Layout = Space* !>> [\ \t\n]; + +// The Rascal productions of `Tag` are suitable-for-conversion and multi-line +// (because `Layout` can produce newlines). However, only the second production +// is delimited. This requires special care. 
The following TextMate rule, with +// several nested patterns, is generated: // // ``` // { -// "match": "((?:\\b\\u0074\\u0072\\u0075\\u0065\\b)|(?:\\b\\u0066\\u0061\\u006C\\u0073\\u0065\\b))", -// "name": "prod(lex(\"keywords\"),[alt({lit(\"true\"),lit(\"false\")})],{tag(\"category\"(\"keyword.control\"))})", -// "captures": { -// "1": { -// "name": "keyword.control" +// "name": "/inner/multi/tag.2,tag.1", +// "begin": "((?:\\@)(?:[\\t-\\n\\x{20}]*?(?![\\t-\\n\\x{20}]))(?:[0-9A-Za-z](?:(?:[\\t-\\n\\x{20}]*?(?![\\t-\\n\\x{20}]))[0-9A-Za-z])*?)(?:[\\t-\\n\\x{20}]*?(?![\\t-\\n\\x{20}])))", +// "end": "(?=.)", +// "beginCaptures": { "1": { "name": "comment" } }, +// "endCaptures": {}, +// "applyEndPatternLast": true, +// "patterns": [ +// { +// "begin": "(\\{)", +// "end": "(\\})", +// "beginCaptures": { "1": { "name": "comment" } }, +// "endCaptures": { "1": { "name": "comment" } }, +// "patterns": [ +// { +// "match": "([\\t-\\n\\x{20}])", +// "captures": { "1": { "name": "comment" } } +// }, +// { +// "match": "([\\x{01}-\\x{10FFFF}])", +// "captures": { "1": { "name": "comment" } } +// } +// ], +// }, +// { +// "match": "(\\=)", +// "captures": { "1": { "name": "comment" } } // } -// } +// ] // } // ``` +// +// Note: The begin pattern matches the common *prefix* of the two Rascal +// productions. The two nested patterns each correspond to the two different +// *suffixes*. +// ---------------------------------------------------- +// ## Advanced conversion: single/multi-line, recursive +// +// Semi-delimited conversion (explained above) has limited support for +// user-defined Rascal productions that are recursive. Other than that, +// recursion is not yet supported. - -// ## Tests +// ----------------------------------------- +// ## Advanced conversion: context detection // -// The following code tests the conversion algorithm on input of the grammar -// defined above. 
+// TODO + +start syntax Start = Tag ; + +test bool conversion() { + println(toJSON(toTmGrammar(grammar(#Start), "Walkthrough", nameGeneration = short()))); + return true; +} + + + + + + + + + + + + + +// layout Layout = (Comment | Space)* !>> "//" !>> [\ \t\n]; + +// start syntax Value +// = Map +// | Number +// | String +// | RegExp +// | Location +// | Boolean +// ; + +// syntax Map = "{" {(Key ":" Value) ","}* "}"; + +// lexical Key = Alnum+ !>> [a-z A-Z 0-9]; +// lexical Number = @category="constant.numeric" Digit+ !>> [0-9]; +// lexical String = @category="string.quoted.double" "\"" Alnum* "\""; + +// // ## Extension 2: Delimiter-sensitive conversion +// // +// // The second extension (regular expressions; illustrative fragment) looks as +// // follows: + +// syntax RegExp = "/" RegExpBody "/"; + +// lexical RegExpBody +// = @category="markup.italic" alnum: Alnum+ !>> [a-z A-Z 0-9] +// | RegExpBody "?" +// | RegExpBody "+" +// | RegExpBody "|" RegExpBody +// ; + +// // Production `alnum` of `RegExpBody` is suitable for conversion. However, +// // except for the `@category` tag, it has exactly the same definition as the +// // production of `Key` (above). Thus, if the conversion algorithm were to +// // naively convert `alnum` to a TextMate rule, keys in maps would be tokenized +// // accidentally as regular expressions (and mistakenly typeset in italics). +// // +// // To solve this issue, the conversion algorithm first heuristically checks for +// // each suitable-for-conversion production if it is *enclosed by delimiters*. If +// // so, instead of converting the production to a top-level match pattern, it is +// // converted to a top-level begin/end pattern (for the enclosing delimiters) +// // with a nested match pattern (for the production itself). As a result, the +// // nested match pattern will be used for tokenization only between matches of +// // the enclosing delimiters. 
For instance, production `alnum` is enclosed by an +// // opening `/` and a closing `/`, so it is converted to the following top-level +// // begin/end pattern with a nested match pattern: +// // +// // ``` +// // { +// // "begin": "(?:\\u002F)", +// // "end": "(?:\\u002F)", +// // "patterns": [ +// // { +// // "match": "((?:[\\u0030-\\u0039]|[\\u0041-\\u005A]|[\\u0061-\\u007A])+?(?!(?:[\\u0030-\\u0039]|[\\u0041-\\u005A]|[\\u0061-\\u007A])))", +// // "name": "prod(label(\"alnum\",lex(\"RegExpBody\")),[conditional(iter(lex(\"Alnum\")),{\\not-follow(\\char-class([range(48,57),range(65,90),range(97,122)]))})],{tag(\"category\"(\"markup.italic\"))})", +// // "captures": { +// // "1": { +// // "name": "markup.italic" +// // } +// // } +// // } +// // ] +// // } +// // ``` +// // +// // Note: If N suitable-for-conversion productions are enclosed by the same +// // delimiters, then the conversion algorithm converts them into one top-level +// // begin/end pattern with N nested match patterns (one for each production). + + + + + + + + + + + + + + + + + + + + +// // ## Tests +// // +// // The following code tests the conversion algorithm on input of the grammar +// // defined above. 
-Grammar rsc = preprocess(grammar(#Value)); +// Grammar rsc = preprocess(grammar(#Value)); -list[ConversionUnit] units = [ - unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit(","),lit("+"),lit("}"),lit("|"),lit("?"),lit("{"),lit("://")})],{}), false, false, , ), - unit(rsc, prod(label("line",lex("Comment")),[lit("//"),conditional(\iter-star(alt({lex("Blank"),lex("Alnum")})),{\end-of-line()})],{\tag("category"("comment.line.double-slash"))}), false, false, , ), - unit(rsc, prod(label("block",lex("Comment")),[lit("/*"),\iter-star(alt({lex("Alnum"),lex("Space")})),lit("*/")],{\tag("category"("comment.block"))}), false, true, , ), - unit(rsc, prod(label("alnum",lex("RegExpBody")),[conditional(iter(lex("Alnum")),{\not-follow(\char-class([range(48,57),range(65,90),range(97,122)]))})],{\tag("category"("markup.italic"))}), false, false, , ), - unit(rsc, prod(lex("String"),[lit("\""),\iter-star(lex("Alnum")),lit("\"")],{\tag("category"("string.quoted.double"))}), false, false, , ), - unit(rsc, prod(lex("Number"),[conditional(iter(lex("Digit")),{\not-follow(\char-class([range(48,57)]))})],{\tag("category"("constant.numeric"))}), false, false, , ), - unit(rsc, prod(lex(KEYWORDS_PRODUCTION_NAME),[alt({lit("true"),lit("false")})],{\tag("category"("keyword.control"))}), false, false, , ) -]; +// list[ConversionUnit] units = [ +// unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit(","),lit("+"),lit("}"),lit("|"),lit("?"),lit("{"),lit("://")})],{}), false, false, , ), +// unit(rsc, prod(label("line",lex("Comment")),[lit("//"),conditional(\iter-star(alt({lex("Blank"),lex("Alnum")})),{\end-of-line()})],{\tag("category"("comment.line.double-slash"))}), false, false, , ), +// unit(rsc, prod(label("block",lex("Comment")),[lit("/*"),\iter-star(alt({lex("Alnum"),lex("Space")})),lit("*/")],{\tag("category"("comment.block"))}), false, true, , ), +// unit(rsc, 
prod(label("alnum",lex("RegExpBody")),[conditional(iter(lex("Alnum")),{\not-follow(\char-class([range(48,57),range(65,90),range(97,122)]))})],{\tag("category"("markup.italic"))}), false, false, , ), +// unit(rsc, prod(lex("String"),[lit("\""),\iter-star(lex("Alnum")),lit("\"")],{\tag("category"("string.quoted.double"))}), false, false, , ), +// unit(rsc, prod(lex("Number"),[conditional(iter(lex("Digit")),{\not-follow(\char-class([range(48,57)]))})],{\tag("category"("constant.numeric"))}), false, false, , ), +// unit(rsc, prod(lex(KEYWORDS_PRODUCTION_NAME),[alt({lit("true"),lit("false")})],{\tag("category"("keyword.control"))}), false, false, , ) +// ]; -test bool analyzeTest() = doAnalyzeTest(rsc, units, name = "Walkthrough"); -test bool transformTest() = doTransformTest(units, <7, 2, 0>, name = "Walkthrough"); \ No newline at end of file +// test bool analyzeTest() = doAnalyzeTest(rsc, units, name = "Walkthrough"); +// test bool transformTest() = doTransformTest(units, <7, 2, 0>, name = "Walkthrough"); From 46b16a662149b0dde9f39eac537eac641ff1c12f Mon Sep 17 00:00:00 2001 From: Sung-Shik Jongmans Date: Sun, 17 Nov 2024 13:23:18 +0100 Subject: [PATCH 5/5] Update walkthrough --- .../textmate/conversiontests/Walkthrough.rsc | 330 ++++++++---------- 1 file changed, 146 insertions(+), 184 deletions(-) diff --git a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Walkthrough.rsc b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Walkthrough.rsc index 52cf7d7..bf42a36 100644 --- a/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Walkthrough.rsc +++ b/rascal-textmate-core/src/main/rascal/lang/textmate/conversiontests/Walkthrough.rsc @@ -26,18 +26,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. } @description{ This module consists of a walkthrough to explain the main ideas behind the -conversion algorithm, from Rascal grammars to TextMate grammars. 
+conversion algorithm, from Rascal grammars to TextMate grammars. The walkthrough +is not intended to be comprehensive, meaning that not all corner cases are +discussed and explained. The aim of the walkthrough is to convey just the +general approach. -The walkthrough is split into five parts. The initial part explains basic -conversion. The subsequent four parts present complications and demonstrate -extensions of the conversion algorithm to address them. +The walkthrough consists of the following sections: + - Preface + - Basic conversion: single-line, non-recursive + - Advanced conversion: multi-line, non-recursive + - Advanced conversion: single/multi-line, recursive + - Advanced conversion: context detection -The toy language considered, is a simple data language consisting of: - - base: maps, numbers, strings; - - extension 1: layout (including comments); - - extension 2: regular expressions; - - extension 3: locations; - - extension 4: booleans. +Throughout, a simple data language is used to demonstrate the main ideas. It is +defined incrementally, by need. Working familiarity with TextMate grammar is assumed. To summarize: @@ -47,9 +49,9 @@ Working familiarity with TextMate grammar is assumed. To summarize: regular expression) or a *begin/end pattern* (consisting of two regular expressions and a list of nested TextMate rules). - - Semantics: A tokenization engine reads a document (line by line, top to - bottom, left to right), while iteratively trying to apply TextMate rules - by matching text against the regular expressions. + - Semantics: A TextMate tokenizer reads a document (line by line, top to + bottom, left to right), while iteratively trying to apply TextMate rules by + matching text against the regular expressions. Further reading: - https://macromates.com/manual/en/language_grammars @@ -93,17 +95,17 @@ lexical Print = [0-9 A-Z a-z \ \t\n]; // user-defined Rascal production that is *suitable for conversion* to a // TextMate rule. 
Roughly, a production is said to have that property when: // - it has a category; -// - it does not produce the empty word. +// - it cannot produce the empty word. // // For instance: lexical Identifier = Alnum+ !>> [0-9 A-Z a-z] ; -lexical Chars = @category="string" Alnum* ; +lexical Chars = @category="string" Print* ; lexical Number = @category="constant.numeric" Digit+ !>> [0-9] ; // The Rascal productions of `Identifier` (does not have a category) and `Chars` -// (produces the empty word) are not suitable-for-conversion. In contrast, the -// Rascal production of `Number` is suitable-for-conversion. The following +// (can produce the empty word) are not suitable-for-conversion. In contrast, +// the Rascal production of `Number` is suitable-for-conversion. The following // TextMate rule is generated: // // ``` @@ -114,17 +116,16 @@ lexical Number = @category="constant.numeric" Digit+ !>> [0-9] ; // } // ``` // -// Note: The name (`name` property) could be anything, but to simplify -// debugging, the conversion algorithm uses a description of the Rascal -// production. +// Note: Property `name` could be anything, but to simplify debugging, the +// conversion algorithm uses a description of the Rascal production. // -// Note: The regular expression (`match` property) is written in *Oniguruma* -// format (following the TextMate grammar specification). +// Note: The `match` property is a regular expression written in *Oniguruma* +// format (as required by the TextMate grammar specification). // ### Keywords // // Sometimes, literals that qualify as *keywords* do not have a corresponding -// category and would not be highlighted. For instance: +// category. 
For instance: syntax BooleanExpr = "true" @@ -164,10 +165,10 @@ lexical LineComment = @category="comment" "//" (Alnum | Space)* $ ; lexical Location = "|" Alnum+ "://" Alnum+ "|"; // The Rascal production of `Comment` is suitable-for-conversion, while the -// Rascal production of `Location` (does not have a category) is not. If only -// the `Comment` Rascal production were to be converted to a TextMate rule, then -// substring "//Desktop" of input string "|home://Desktop|" would be mistakenly -// tokenized as a comment. +// Rascal production of `Location` (does not have a category) is not. However, +// if only the former were to be converted to a TextMate rule, then substring +// "//Desktop" of input string "|home://Desktop|" would be mistakenly tokenized +// as a comment. // // Thus, the conversion algorithm: // - analyzes the Rascal grammar to find each literal that qualifies as a @@ -204,40 +205,39 @@ lexical Location = "|" Alnum+ "://" Alnum+ "|"; // // ### Approach // -// To convert user-defined Rascal productions of strings that potentially span -// multiple lines, more advanced machinery is needed. This is because individual -// TextMate rules with match patterns cannot be used to match strings that span -// multiple lines (i.e., newlines cannot be matched by individual regular -// expressions in a TextMate grammar). Instead, TextMate rule with begin/end -// patterns needs to be used. The approach is roughly as follows: +// To convert user-defined Rascal productions of multi-line strings, more +// advanced machinery than TextMate rules with match patterns of single-line +// strings is needed (i.e., newlines cannot be matched by individual regular +// expressions in a TextMate grammar). That is, TextMate rules with begin/end +// patterns need to be used. The approach is roughly as follows: // // - First, the conversion algorithm analyzes the Rascal grammar to find each -// user-defined suitable-for-conversion Rascal production. 
+// user-defined Rascal production that is suitable-for-conversion. // -// - Next, the conversion algorithm optimistically converts each production -- -// *including* those of strings that potentially span multiple lines -- to a -// TextMate rule with a match pattern. The rationale is that single-line is -// an important special case of multi-line, so a TextMate rule with a match -// pattern can already be quite effective (even if it does not cover strings -// that span multiple lines). +// - Next, the conversion algorithm optimistically converts each of those +// productions -- *including* those of multi-line strings -- to a TextMate +// rule with a match pattern of single-line strings. The rationale is that +// single-line strings are a significant special case of multi-line strings, +// so this TextMate rule can already be quite effective. // -// - Next, the conversion algorithm checks for each production if it is -// *delimited*, *semi-delimited*, or *non-delimited*: +// - Next, the conversion algorithm checks for each of those productions of +// multi-line strings if it is *delimited*, *semi-delimited*, or +// *non-delimited*: // // - If it begins and ends with a delimiter, then it is *delimited*. In // this case, the production can be converted to a TextMate rule with a -// begin/end pattern in a relatively *simple* way. +// begin/end pattern in a relatively simple way. // // - If it begins with a delimiter, but it does not end with a delimiter, // then it is *semi-delimited*. In this case, the production can be // converted to a TextMate rule with a begin/end pattern in a relatively -// *complex* way. +// complex way. // // - If it does not begin with a delimiter, then it is *non-delimited*. In // this case, the production cannot be converted to a TextMate rule with // a begin/end pattern. 
// -// ### Delimited conversion when the begin-delimiter is unique +// ### Delimited conversion, when the begin-delimiter is unique // // For instance: @@ -246,7 +246,8 @@ lexical BlockComment = @category="comment" "/*" Print* "*/" ; // The Rascal production of `BlockComment` is suitable-for-conversion, // multi-line (because `Print` can produce a newline), and delimited (by "/*" // and "*/"). Moreover, the begin-delimiter is unique: there is no other Rascal -// production that begins with "/*". The following TextMate rule is generated: +// production in the Rascal grammar that begins with "/*". The following +// TextMate rule is generated: // // ``` // { @@ -269,11 +270,11 @@ lexical BlockComment = @category="comment" "/*" Print* "*/" ; // ``` // // Note: The purpose of the nested match patterns is to force the TextMate -// tokenizer to consume input between the begin/end delimiters. The first nested -// match pattern is derived from the Rascal production of `Print`. The second -// nested match pattern is a default fallback. +// tokenizer to explicitly consume all input between the begin/end delimiters. +// The first nested match pattern is derived from the Rascal production of +// `Print`. The second nested match pattern is a default fallback. // -// ### Delimited conversion when the begin-delimiter is *not* unique +// ### Delimited conversion, when the begin-delimiter is *not* unique // // For instance: @@ -290,8 +291,9 @@ lexical StringRight = @category="string" "\>" Print* "\"" ; // The Rascal production of `StringMid` is suitable-for-conversion, multi-line // (because `Print` can produce a newline), and delimited (by ">" and "<"). // However, the begin-delimiter is not unique: there is another Rascal -// production that begins with ">", namely the one of `StringRight`. The -// following *single* TextMate rule is generated for *both* Rascal productions: +// production in the Rascal grammar that begins with ">", namely the one of +// `StringRight`. 
The following *single* TextMate rule is generated that covers
+// *both* Rascal productions:
 //
 // ```
 // {
@@ -315,13 +317,13 @@ lexical StringRight = @category="string" "\>" Print* "\"" ;
 //
 // Note: Similarly, the Rascal productions of `StringLeftRight` and `StringLeft`
 // are suitable-for-conversion, multi-line, and delimited. Moreover, their begin
-// delimiter is "\"", while there is no other Rascal production that begins with
-// "\"". However, "\"" *does* occur as a non-begin delimiter elsewhere in the
-// Rascal grammar: it is the end delimiter of the Rascal production of
-// `StringLeftRight` itself. Consequently, "\"" does *not* unmistakenly indicate
-// the beginning of a string. To avoid multi-line tokenization mistakes, the
-// Rascal productions of `StringLeftRight` and `StringLeft` are not converted to
-// a TextMate rule.
+// delimiter is "\"", while there is no other Rascal production in the Rascal
+// grammar that begins with "\"". However, "\"" *does* occur as a non-begin
+// delimiter elsewhere in the Rascal grammar: it is the end-delimiter of the
+// Rascal production of `StringLeftRight`. Consequently, "\"" does *not*
+// unmistakably indicate the beginning of `StringLeftRight` or `StringLeft`. To
+// avoid multi-line tokenization mistakes, the Rascal productions of
+// `StringLeftRight` and `StringLeft` are not converted to a TextMate rule.
 //
 // ### Semi-delimited conversion
 //
@@ -373,8 +375,7 @@ layout Layout = Space* !>> [\ \t\n];
 // ```
 //
 // Note: The begin pattern matches the common *prefix* of the two Rascal
-// productions. The two nested patterns each correspond to the two different
-// *suffixes*.
+// productions. The two nested patterns correspond to the different *suffixes*.
// ---------------------------------------------------- // ## Advanced conversion: single/multi-line, recursive @@ -386,131 +387,92 @@ layout Layout = Space* !>> [\ \t\n]; // ----------------------------------------- // ## Advanced conversion: context detection // -// TODO +// Sometimes, highlighting depends on the context in which the tokenization +// input occurs. For instance: + +lexical RegExp = "/" RegExpBody "/"; -start syntax Start = Tag ; +lexical RegExpBody + = @category="string" alnum: Alnum+ !>> [0-9 A-Z a-z] + | RegExpBody "?" + | RegExpBody "+" + | RegExpBody "|" RegExpBody + ; -test bool conversion() { +// Rascal production `alnum` of `RegExpBody` is suitable-for-conversion. +// However, except for the `@category` tag, it has exactly the same definition +// as the production of `Identifier` (above). If the conversion algorithm were +// to naively convert `alnum` to a TextMate rule, identifiers would be +// mistakenly highlighted as strings. +// +// Thus, the conversion algorithm first heuristically checks for each Rascal +// production that is suitable-for-conversion if it is *enclosed by delimiters*. +// If so, it is converted to an *outer* TextMate rule with a begin/end pattern +// to match the enclosing delimiters (i.e., context detection) and include +// patterns to toggle *inner* TextMate rules. That is, the inner TextMate rules +// are used for tokenization only between matches of the enclosing delimiters +// (i.e., in the right context). +// +// For instance, production `alnum` is enclosed by an opening `/` and a closing +// `/`. 
The following outer TextMate rule is generated: +// +// ``` +// { +// "name": "/outer//", +// "begin": "(?:\\/)", +// "end": "(?:\\/)", +// "beginCaptures": {}, +// "endCaptures": {}, +// "patterns": [ +// { "include": "#/inner/single/$delimiters" }, +// { "include": "#/inner/single/regexpbody.alnum" }, +// { "include": "#/inner/single/$keywords" } +// ] +// } +// ``` +// +// Note: If N Rascal productions are enclosed by the same delimiters, then the +// conversion algorithm converts them into one outer TextMate rule and N inner +// TextMate rules. + +// ## Tests +// +// The following code tests the conversion algorithm on input of the grammar +// defined above. + +start syntax Start + = Identifier + | Chars + | Number + | BooleanExpr + | LineComment + | Location + | BlockComment + | String + | Tag + | RegExp ; + +Grammar rsc = preprocess(grammar(#Start)); + +list[ConversionUnit] units = [ + unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit(")"),lit("("),lit("+"),lit("="),lit("|"),lit("?"),lit("{"),lit("://")})],{}), false, false, , ), + unit(rsc, prod(sort("Tag"),[lit("@"),layouts("Layout"),\iter-seps(lex("Alnum"),[layouts("Layout")]),layouts("Layout"),lit("="),layouts("Layout"),\iter-seps(lex("Alnum"),[layouts("Layout")])],{\tag("category"("comment"))}), false, true, , ), + unit(rsc, prod(sort("Tag"),[lit("@"),layouts("Layout"),\iter-seps(lex("Alnum"),[layouts("Layout")]),layouts("Layout"),lit("{"),layouts("Layout"),\iter-star-seps(lex("Print"),[layouts("Layout")]),layouts("Layout"),lit("}")],{\tag("category"("comment"))}), false, true, , ), + unit(rsc, prod(lex("StringMid"),[lit("\>"),\iter-star(lex("Print")),lit("\<")],{\tag("category"("string"))}), false, true, "))>, ")),just(lit("\<"))>), + unit(rsc, prod(lex("StringRight"),[lit("\>"),\iter-star(lex("Print")),lit("\"")],{\tag("category"("string"))}), false, true, , ")),just(lit("\""))>), + unit(rsc, 
prod(lex("LineComment"),[lit("//"),conditional(\iter-star(alt({lex("Alnum"),lex("Space")})),{\end-of-line()})],{\tag("category"("comment"))}), false, true, , ), + unit(rsc, prod(lex("BlockComment"),[lit("/*"),\iter-star(lex("Print")),lit("*/")],{\tag("category"("comment"))}), false, true, , ), + unit(rsc, prod(label("alnum",lex("RegExpBody")),[conditional(iter(lex("Alnum")),{\not-follow(\char-class([range(48,57),range(65,90),range(97,122)]))})],{\tag("category"("string"))}), false, false, , ), + unit(rsc, prod(lex("StringLeft"),[lit("\""),\iter-star(lex("Print")),lit("\<")],{\tag("category"("string"))}), false, true, "))>, ), + unit(rsc, prod(lex("StringLeftRight"),[lit("\""),\iter-star(lex("Print")),lit("\"")],{\tag("category"("string"))}), false, true, , ), + unit(rsc, prod(lex("Number"),[conditional(iter(lex("Digit")),{\not-follow(\char-class([range(48,57)]))})],{\tag("category"("constant.numeric"))}), false, false, , ), + unit(rsc, prod(lex(KEYWORDS_PRODUCTION_NAME),[alt({lit("true"),lit("false"),lit("else"),lit("then"),lit("if")})],{\tag("category"("keyword.control"))}), false, false, , ) +]; + +test bool analyzeTest() = doAnalyzeTest(rsc, units, name = "Walkthrough"); +test bool transformTest() = doTransformTest(units, <12, 6, 0>, name = "Walkthrough"); + +bool convertAndPrint() { println(toJSON(toTmGrammar(grammar(#Start), "Walkthrough", nameGeneration = short()))); return true; } - - - - - - - - - - - - - -// layout Layout = (Comment | Space)* !>> "//" !>> [\ \t\n]; - -// start syntax Value -// = Map -// | Number -// | String -// | RegExp -// | Location -// | Boolean -// ; - -// syntax Map = "{" {(Key ":" Value) ","}* "}"; - -// lexical Key = Alnum+ !>> [a-z A-Z 0-9]; -// lexical Number = @category="constant.numeric" Digit+ !>> [0-9]; -// lexical String = @category="string.quoted.double" "\"" Alnum* "\""; - -// // ## Extension 2: Delimiter-sensitive conversion -// // -// // The second extension (regular expressions; illustrative fragment) looks as -// // 
follows: - -// syntax RegExp = "/" RegExpBody "/"; - -// lexical RegExpBody -// = @category="markup.italic" alnum: Alnum+ !>> [a-z A-Z 0-9] -// | RegExpBody "?" -// | RegExpBody "+" -// | RegExpBody "|" RegExpBody -// ; - -// // Production `alnum` of `RegExpBody` is suitable for conversion. However, -// // except for the `@category` tag, it has exactly the same definition as the -// // production of `Key` (above). Thus, if the conversion algorithm were to -// // naively convert `alnum` to a TextMate rule, keys in maps would be tokenized -// // accidentally as regular expressions (and mistakenly typeset in italics). -// // -// // To solve this issue, the conversion algorithm first heuristically checks for -// // each suitable-for-conversion production if it is *enclosed by delimiters*. If -// // so, instead of converting the production to a top-level match pattern, it is -// // converted to a top-level begin/end pattern (for the enclosing delimiters) -// // with a nested match pattern (for the production itself). As a result, the -// // nested match pattern will be used for tokenization only between matches of -// // the enclosing delimiters. 
For instance, production `alnum` is enclosed by an -// // opening `/` and a closing `/`, so it is converted to the following top-level -// // begin/end pattern with a nested match pattern: -// // -// // ``` -// // { -// // "begin": "(?:\\u002F)", -// // "end": "(?:\\u002F)", -// // "patterns": [ -// // { -// // "match": "((?:[\\u0030-\\u0039]|[\\u0041-\\u005A]|[\\u0061-\\u007A])+?(?!(?:[\\u0030-\\u0039]|[\\u0041-\\u005A]|[\\u0061-\\u007A])))", -// // "name": "prod(label(\"alnum\",lex(\"RegExpBody\")),[conditional(iter(lex(\"Alnum\")),{\\not-follow(\\char-class([range(48,57),range(65,90),range(97,122)]))})],{tag(\"category\"(\"markup.italic\"))})", -// // "captures": { -// // "1": { -// // "name": "markup.italic" -// // } -// // } -// // } -// // ] -// // } -// // ``` -// // -// // Note: If N suitable-for-conversion productions are enclosed by the same -// // delimiters, then the conversion algorithm converts them into one top-level -// // begin/end pattern with N nested match patterns (one for each production). - - - - - - - - - - - - - - - - - - - - -// // ## Tests -// // -// // The following code tests the conversion algorithm on input of the grammar -// // defined above. 
- -// Grammar rsc = preprocess(grammar(#Value)); - -// list[ConversionUnit] units = [ -// unit(rsc, prod(lex(DELIMITERS_PRODUCTION_NAME),[alt({lit(","),lit("+"),lit("}"),lit("|"),lit("?"),lit("{"),lit("://")})],{}), false, false, , ), -// unit(rsc, prod(label("line",lex("Comment")),[lit("//"),conditional(\iter-star(alt({lex("Blank"),lex("Alnum")})),{\end-of-line()})],{\tag("category"("comment.line.double-slash"))}), false, false, , ), -// unit(rsc, prod(label("block",lex("Comment")),[lit("/*"),\iter-star(alt({lex("Alnum"),lex("Space")})),lit("*/")],{\tag("category"("comment.block"))}), false, true, , ), -// unit(rsc, prod(label("alnum",lex("RegExpBody")),[conditional(iter(lex("Alnum")),{\not-follow(\char-class([range(48,57),range(65,90),range(97,122)]))})],{\tag("category"("markup.italic"))}), false, false, , ), -// unit(rsc, prod(lex("String"),[lit("\""),\iter-star(lex("Alnum")),lit("\"")],{\tag("category"("string.quoted.double"))}), false, false, , ), -// unit(rsc, prod(lex("Number"),[conditional(iter(lex("Digit")),{\not-follow(\char-class([range(48,57)]))})],{\tag("category"("constant.numeric"))}), false, false, , ), -// unit(rsc, prod(lex(KEYWORDS_PRODUCTION_NAME),[alt({lit("true"),lit("false")})],{\tag("category"("keyword.control"))}), false, false, , ) -// ]; - -// test bool analyzeTest() = doAnalyzeTest(rsc, units, name = "Walkthrough"); -// test bool transformTest() = doTransformTest(units, <7, 2, 0>, name = "Walkthrough");