
Recursive multiline highlighting #17

Merged 19 commits on Sep 9, 2024 (diff below shows changes from 16 commits)

Commits:
- 2238588 Add `applyEndPatternLast` parameter to `beginEnd` constructor (sungshik, Aug 23, 2024)
- 3b11353 Add utility function to check if a symbol is recursive (sungshik, Aug 23, 2024)
- f6a2803 Add utility function to remove the conditional from a symbol (sungshik, Aug 23, 2024)
- 667ae4b Add overloaded version of `getInnerDelimiterPair` for symbols (sungshik, Aug 23, 2024)
- 22ea045 Add `\conditional` to `destar` (sungshik, Aug 23, 2024)
- 217252d Fix typo (sungshik, Sep 2, 2024)
- 4d59dac Add `recursive` parameter to `ConversionUnit` (sungshik, Sep 2, 2024)
- bd14940 Add function to remove prefix conversion units from a list (sungshik, Sep 2, 2024)
- 1dcd56e Widen applicability of function `getInnerDelimiterPair` for symbols t… (sungshik, Sep 3, 2024)
- 044d10c Add function to decompose lists of units into prefix/suffixes (sungshik, Sep 3, 2024)
- 6ba991d Add support for recursive productions to the converter (sungshik, Sep 3, 2024)
- 5e1f917 Update tests to support recursive productions (sungshik, Sep 6, 2024)
- 8d7d635 Merge branch 'identify-newline-separated-segments' into recursive-mul… (sungshik, Sep 6, 2024)
- c310743 Add new test module (`RascalTag`) to test support for recursive produ… (sungshik, Sep 6, 2024)
- 9070cdd Merge branch 'main' into recursive-multiline-highlighting2 (sungshik, Sep 6, 2024)
- 7fa2df6 Add a few more comments (sungshik, Sep 9, 2024)
- df909e5 Add a few more comments (sungshik, Sep 9, 2024)
- 546bb13 Update generated TextMate grammar for Rascal/Pico (sungshik, Sep 9, 2024)
- e8a887c Simplify a few expressions to improve readability (sungshik, Sep 9, 2024)
@@ -31,6 +31,22 @@ bool tryParse(Grammar g, Symbol s, str input, bool allowAmbiguity = false) {
return false;
}

@synopsis{
Checks if symbol `s` is recursive in grammar `g`
}

bool isRecursive(Grammar g, Symbol s) {
set[Symbol] getChildren(Symbol s)
= {s | p <- lookup(g, s), /Symbol s := p.symbols};

bool check(set[Symbol] checking, Symbol s)
= s in checking
? true
: any(child <- getChildren(s), check(checking + s, child));

return check({}, s);
}
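// Usage sketch (hypothetical grammar, for illustration only): given
//   syntax Exp = Exp "+" Exp | Id;
//   syntax Id  = [a-z]+;
// `Exp` occurs in productions reachable from itself, while nothing reachable
// from `Id` mentions `Id` again, so:
//   isRecursive(g, sort("Exp")) == true
//   isRecursive(g, sort("Id"))  == false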

@synopsis{
Looks up a list of productions for symbol `s` in grammar `g`, replacing
formal parameters with actual parameters when needed
@@ -96,10 +112,20 @@ Symbol destar(\seq([symbol]))
Symbol destar(\alt({symbol}))
= \alt({destar(symbol)});

Symbol destar(\conditional(symbol, conditions))
= \conditional(destar(symbol), conditions);

default Symbol destar(Symbol s) = s;

@synopsis{
Removes the conditional from symbol `s`, if any
}

Symbol decond(\conditional(Symbol s, _)) = decond(s);
default Symbol decond(Symbol s) = s;
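// For instance (illustrative values):
//   decond(\conditional(lit("do"), {\not-follow(lit("-"))})) == lit("do")
//   decond(lit("do")) == lit("do")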

@synopsis{
Retains from set `symbols` each symbol that is a strict prefix of any other
symbol in `symbols`
}

@@ -26,6 +26,28 @@ data Direction // Traverse lists of symbols (in productions)...
list[&T] reorder(list[&T] l, forward()) = l;
list[&T] reorder(list[&T] l, backward()) = reverse(l);

@synopsis{
Gets the unique leftmost delimiter (`begin`) and the unique rightmost
delimiter (`end`), if any, that occur **inside** productions of symbol `s`
(when `s` is a non-terminal) or `s` itself (when `s` is a delimiter). If
`getOnlyFirst` is `true` (default: `false`), then only the first (resp.
last) symbol of the productions can be considered as leftmost (resp.
rightmost).
}

DelimiterPair getInnerDelimiterPair(Grammar g, Symbol s, bool getOnlyFirst = false) {
s = delabel(s);
if (isDelimiter(s)) {
return <just(s), just(s)>;
} else if (isNonTerminalType(s)) {
Maybe[Symbol] begin = getInnerDelimiterBySymbol(g, forward(), getOnlyFirst = getOnlyFirst)[s];
Maybe[Symbol] end = getInnerDelimiterBySymbol(g, backward(), getOnlyFirst = getOnlyFirst)[s];
return <begin, end>;
} else {
return <nothing(), nothing()>;
}
}
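// For instance (hypothetical grammar): given  syntax Block = "{" Stat* "}";
//   getInnerDelimiterPair(g, sort("Block")) == <just(lit("{")), just(lit("}"))>
// and, when `s` is itself a delimiter:
//   getInnerDelimiterPair(g, lit("{")) == <just(lit("{")), just(lit("{"))>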

@synopsis{
Gets the unique leftmost delimiter (`begin`) and the unique rightmost
delimiter (`end`), if any, that occur **inside** production `p` in grammar
@@ -60,7 +82,7 @@ list[&T] reorder(list[&T] l, backward()) = reverse(l);
}

DelimiterPair getInnerDelimiterPair(Grammar g, Production p, bool getOnlyFirst = false) {
Maybe[Symbol] begin = getInnerDelimiterByProduction(g, forward(), getOnlyFirst = getOnlyFirst)[p];
Maybe[Symbol] end = getInnerDelimiterByProduction(g, backward(), getOnlyFirst = getOnlyFirst)[p];
return <begin, end>;
}
@@ -79,6 +101,7 @@ private map[Production, Maybe[Symbol]] getInnerDelimiterByProduction(Grammar g,
for (p <- ret, ret[p] == nothing()) {
for (s <- reorder(p.symbols, direction)) {
s = delabel(s);
s = decond(s);
if (isDelimiter(s)) {
ret[p] = just(s);
break;
125 changes: 100 additions & 25 deletions rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc

@@ -113,12 +113,10 @@ list[ConversionUnit] analyze(RscGrammar rsc) {

// Analyze dependencies among productions
println("[LOG] Analyzing dependencies among productions");
Graph[Production] graph = toGraph(rsc);
list[Production] prods = deps(graph).retainProds(isNonEmpty).retainProds(hasCategory).getProds();
list[Production] prodsNonRecursive = prods & deps(graph).removeProds(isCyclic, true).getProds(); // `true` means "also remove ancestors"
list[Production] prodsRecursive = prods - prodsNonRecursive;

// Analyze delimiters
println("[LOG] Analyzing delimiters");
@@ -135,12 +133,13 @@ list[ConversionUnit] analyze(RscGrammar rsc) {

// Return
bool isEmptyProd(prod(_, [\alt(alternatives)], _)) = alternatives == {};
set[ConversionUnit] units
= {unit(rsc, p, false, hasNewline(rsc, p), getOuterDelimiterPair(rsc, p), getInnerDelimiterPair(rsc, p, getOnlyFirst = true)) | p <- prodsNonRecursive}
+ {unit(rsc, p, true, hasNewline(rsc, p), getOuterDelimiterPair(rsc, p), getInnerDelimiterPair(rsc, p, getOnlyFirst = true)) | p <- prodsRecursive}
+ {unit(rsc, p, false, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsDelimiters, !isEmptyProd(p)}
+ {unit(rsc, p, false, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsKeywords, !isEmptyProd(p)};

return sort([*removeStrictPrefixes(units)]);
}

@synopsis{
@@ -196,7 +195,7 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {

// Convert all units in the group to match patterns (including,
// optimistically, multi-line units as-if they are single-line)
for (u <- group, !u.recursive) {
TmRule r = toTmRule(toRegExp(u.rsc, u.prod, guard = true))
[name = "/inner/single/<u.name>"];

@@ -216,32 +215,98 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
// Simple case: each unit does have an `end` inner delimiter
if (_ <- group && all(u <- group, just(_) := u.innerDelimiters.end)) {

// Compute a set of segments that need to be consumed between
// the `begin` delimiter and the `end` delimiters. Each of these
// segments will be converted to a match pattern.
set[Segment] segs = {*getSegments(rsc, u.prod) | u <- group};
segs = {removeBeginEnd(seg, begins, ends) | seg <- segs};

TmRule r = toTmRule(
toRegExp(rsc, [begin], {t}),
toRegExp(rsc, [\alt(ends)], {t}),
[toTmRule(toRegExp(rsc, [s], {t})) | s <- toTerminals(segs)])
[name = "/inner/multi/<intercalate(",", [u.name | u <- group])>"];

rules = insertIn(rules, (u: r | u <- group));
}

// Complex case: some unit doesn't have an `end` inner delimiter.
// This requires (substantial) extra care, as there is no obvious
// marker to close the begin/end pattern with.
else {
Decomposition decomposition = decompose([*group]);

// TODO: The following condition can be true (even though there
// has to be a `begin` delimiter) because `decompose` doesn't
// expand non-terminals. Consider whether it should, to maybe improve
// accuracy.
if ([] == decomposition.prefix) {
continue;
}

RegExp reBegin = toRegExp(rsc, decomposition.prefix, {t});
RegExp reEnd = regExp("(?=.)", []);

patterns = for (suffix <- decomposition.suffixes) {
if (just(Symbol begin) := getInnerDelimiterPair(rsc, suffix[0], getOnlyFirst = true).begin) {
if (just(Symbol end) := getInnerDelimiterPair(rsc, suffix[-1], getOnlyFirst = true).end) {
set[Segment] segs = getSegments(rsc, suffix);
segs = {removeBeginEnd(seg, {begin}, {end}) | seg <- segs};

append toTmRule(
toRegExp(rsc, [begin], {t}),
toRegExp(rsc, [end], {t}),
[toTmRule(toRegExp(rsc, [s], {t})) | s <- toTerminals(segs)]);
}

else {
append toTmRule(toRegExp(rsc, [begin], {t}));
}
}
}

TmRule r = toTmRule(reBegin, reEnd, patterns);
r = r[name = "/inner/multi/<intercalate(",", [u.name | u <- group])>"];
r = r[applyEndPatternLast = true];

rules = insertIn(rules, (u: r | u <- group));

// TODO: The current approach produces "partially"
// newline-sensitive rules, in the sense that newlines are
// accepted between the prefix and the suffixes, but not between
// symbols in the prefix. This approach could be improved to
// produce "totally" newline-sensitive rules (at the cost of
// much more complicated rule generation and generated rules) by
// adopting an approach in which the rules for each symbol in
// the prefix look something like the following three:
//
// ```
// "foo": {
// "name": "foo",
// "begin": "(\\@)",
// "end": "(?!\\G)|(?:(?!$)(?![a-z]+))",
// "patterns": [{ "include": "#foo.$" }, { "match": "[a-z]+" }],
// "contentName": "comment",
// "beginCaptures": { "1": { "name": "comment" } }
// },
// "foo.$": {
// "begin": "$",
// "end": "(?<=^.+)|(?:(?!$)(?![a-z]+))",
// "name": "foo.$",
// "patterns": [ { "include": "#foo.^" }]
// },
// "foo.^": {
// "begin": "^",
// "end": "(?!\\G)|(?:(?!$)(?![a-z]+))",
// "name": "foo.^",
// "patterns": [{ "include": "#foo.$" }, { "match": "[a-z]+" }]
// }
// ```
//
// Note: This alternative approach would likely render the
// present distinction between the "simple case" and the
// "complex case" unneeded, so in that sense, rule generation
// would actually become simpler.
}
}
}
@@ -302,10 +367,20 @@ private Segment removeBeginEnd(Segment seg, set[Symbol] begins, set[Symbol] ends
if (seg.final, _ <- symbols, symbols[-1] in ends) {
symbols = symbols[..-1];
}

return seg[symbols = symbols];
}
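// For instance (illustrative values): for a final segment whose symbols are
// [sort("Stat"), lit("}")], with `ends == {lit("}")}`, the result keeps only
// [sort("Stat")]; the delimiter itself is matched by the begin/end pattern.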

private list[Symbol] toTerminals(set[Segment] segs) {
list[Symbol] terminals = [\seq(seg.symbols) | seg <- segs];
terminals = [s | s <- terminals, [] != s.symbols];
terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
terminals = dup(terminals);
terminals = sortByMinimumLength(terminals); // Small symbols first
terminals = reverse(terminals); // Large symbols first
terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback)
return terminals;
}
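// For instance (illustrative values): segments with symbols [lit("a")] and
// [lit("a"), lit("b")] yield [\seq([lit("a"), lit("b")]), \seq([lit("a")]),
// \char-class([range(1,0x10FFFF)])] -- larger symbols first, any-char last.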

// TODO: This function could be moved to a separate, generic module
private list[&T] dupLast(list[&T] l)
= reverse(dup(reverse(l))); // TODO: Optimize/avoid `reverse`-ing?
@@ -38,7 +38,7 @@ bool doAnalyzeTest(RscGrammar rsc, list[ConversionUnit] expect, bool printActual
println();
for (i <- [0..size(actual)]) {
ConversionUnit u = actual[i];
println(" unit(rsc, <toStr(u.prod)>, <u.recursive>, <u.multiLine>, <u.outerDelimiters>, <u.innerDelimiters>)<i < size(actual) - 1 ? "," : "">");
}
println();
}
@@ -7,8 +7,10 @@ module lang::textmate::ConversionUnit

import Grammar;
import ParseTree;
import util::Math;
import util::Maybe;

import lang::rascal::grammar::Util;
import lang::rascal::grammar::analyze::Delimiters;
import lang::textmate::ConversionConstants;
import lang::textmate::Grammar;
@@ -43,7 +45,7 @@ data ConversionUnit = unit(
// The following parameters are set when a unit is created during analysis:
Grammar rsc,
Production prod,
bool recursive,
bool multiLine,
DelimiterPair outerDelimiters,
DelimiterPair innerDelimiters,
@@ -137,4 +139,95 @@ private list[tuple[Keygen, Compare]] sorters = [

// Sort by stringified production
<getStringifiedProduction, bool(str s1, str s2) { return s1 < s2; }>
];

@synopsis{
Retains from set `units` each unit whose production's list of symbols is a
strict prefix of that of any other unit in `units`
}

set[ConversionUnit] retainStrictPrefixes(set[ConversionUnit] units)
= {u1 | u1 <- units, any(u2 <- units, u1 != u2, isStrictPrefix(u1, u2))};

@synopsis{
Removes from set `units` each unit whose production's list of symbols is a
strict prefix of that of any other unit in `units`
}

set[ConversionUnit] removeStrictPrefixes(set[ConversionUnit] units)
= units - retainStrictPrefixes(units);

@synopsis{
Checks if unit `u1` is a strict prefix of unit `u2`
}

bool isStrictPrefix(ConversionUnit u1, ConversionUnit u2)
= isStrictPrefix(u1.prod.symbols, u2.prod.symbols);

// TODO: This function could be moved to a separate, generic module
private bool isStrictPrefix([], [])
= false;
private bool isStrictPrefix([], [_, *_])
= true;
private bool isStrictPrefix([_, *_], [])
= false;
private bool isStrictPrefix([head1, *tail1], [head2, *tail2])
= head1 == head2 && isStrictPrefix(tail1, tail2);
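// For instance:
//   isStrictPrefix([lit("a")], [lit("a"), lit("b")]) == true
//   isStrictPrefix([lit("a"), lit("b")], [lit("a"), lit("b")]) == false (equal, so not strict)
//   isStrictPrefix([lit("b")], [lit("a"), lit("b")]) == false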

@synopsis{
Representation of a decomposition of a list of units (i.e., the lists of
symbols of their productions) into their maximally common prefix (which
contains no recursive symbols) and their minimally disjoint suffixes. See
also function `decompose`.
}

@description{
For instance, consider the following lists of symbols:
- `[lit("foo"), lit("bar"), lit("baz")]`;
- `[lit("foo"), lit("bar"), lit("qux"), lit("quux")]`.

The maximally common prefix is `[lit("foo"), lit("bar")]`. The minimally
disjoint suffixes are `[lit("baz")]` and `[lit("qux"), lit("quux")]`.
}

alias Decomposition = tuple[
list[Symbol] prefix,
list[list[Symbol]] suffixes
];

@synopsis{
Decomposes list `units`. See also type `Decomposition`.
}

Decomposition decompose(list[ConversionUnit] units) {
list[Symbol] prefix = [];
list[list[Symbol]] suffixes = [];

list[Production] prods = [u.prod | u <- units];
set[Grammar] grammars = {u.rsc | u <- units};

if (_ <- prods && {rsc} := grammars) {
list[int] sizes = [size(p.symbols) | p <- prods];
int n = (sizes[0] | min(it, size) | size <- sizes[1..]);

// Compute prefix (at most of size `n`)
prefix = for (i <- [0..n]) {
set[Symbol] iths = {p.symbols[i] | p <- prods};
if ({ith} := iths && !isRecursive(rsc, delabel(ith))) {
append ith;
} else {
break;
}
}

// Compute suffixes
suffixes = for (p <- prods) {
list[Symbol] suffix = p.symbols[size(prefix)..];
if (_ <- suffix) {
append suffix;
}
}
}

return <prefix, suffixes>;
}
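// Usage sketch (hypothetical units, following the example in the description
// of `Decomposition` above): if
//   u1.prod.symbols == [lit("foo"), lit("bar"), lit("baz")]
//   u2.prod.symbols == [lit("foo"), lit("bar"), lit("qux"), lit("quux")]
// and neither lit("foo") nor lit("bar") is recursive in the shared grammar, then
//   decompose([u1, u2]) == <[lit("foo"), lit("bar")], [[lit("baz")], [lit("qux"), lit("quux")]]>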