Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Recursive multiline highlighting #17

Merged
merged 19 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
2238588
Add `applyEndPatternlast` parameter to `beginEnd` constructor
sungshik Aug 23, 2024
3b11353
Add utility function to check if a symbol is recursive
sungshik Aug 23, 2024
f6a2803
Add utility function to remove the conditional from a symbol
sungshik Aug 23, 2024
667ae4b
Add overloaded version of `getInnerDelimiterPair` for symbols
sungshik Aug 23, 2024
22ea045
Add `\conditional` to `destar`
sungshik Aug 23, 2024
217252d
Fix typo
sungshik Sep 2, 2024
4d59dac
Add `recursive` parameter to `ConversionUnit`
sungshik Sep 2, 2024
bd14940
Add function to remove prefix conversion units from a list
sungshik Sep 2, 2024
1dcd56e
Widen applicability of function `getInnerDelimiterPair` for symbols t…
sungshik Sep 3, 2024
044d10c
Add function to decompose lists of units into prefix/suffixes
sungshik Sep 3, 2024
6ba991d
Add support for recursive productions to the converter
sungshik Sep 3, 2024
5e1f917
Update tests to support recursive productions
sungshik Sep 6, 2024
8d7d635
Merge branch 'identify-newline-separated-segments' into recursive-mul…
sungshik Sep 6, 2024
c310743
Add new test module (`RascalTag`) to test support for recursive produ…
sungshik Sep 6, 2024
9070cdd
Merge branch 'main' into recursive-multiline-highlighting2
sungshik Sep 6, 2024
7fa2df6
Add a few more comments
sungshik Sep 9, 2024
df909e5
Add a few more comments
sungshik Sep 9, 2024
546bb13
Update generated TextMate grammar for Rascal/Pico
sungshik Sep 9, 2024
e8a887c
Simplify a few expressions to improve readability
sungshik Sep 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,22 @@ bool tryParse(Grammar g, Symbol s, str input, bool allowAmbiguity = false) {
return false;
}

@synopsis{
Checks if symbol `s` is recursive in grammar `g`
}

bool isRecursive(Grammar g, Symbol s) {
// The children of a symbol are all symbols that occur (at any depth, via
// the descendant pattern `/Symbol`) in the bodies of its productions.
// NOTE(review): the pattern variable `s` shadows both the parameter of
// `getChildren` and the outer `s` — consider renaming for readability.
set[Symbol] getChildren(Symbol s)
= {s | p <- lookup(g, s), /Symbol s := p.symbols};

// Depth-first search. `checking` holds the symbols on the current search
// path; revisiting one of them means a cycle is reachable from the start
// symbol, so it is considered recursive. `any` over zero children is
// `false`, so symbols without children terminate the search.
bool check(set[Symbol] checking, Symbol s)
= s in checking
? true
: any(child <- getChildren(s), check(checking + s, child));

return check({}, s);
}

@synopsis{
Looks up a list of productions for symbol `s` in grammar `g`, replacing
formal parameters with actual parameters when needed
Expand Down Expand Up @@ -96,10 +112,20 @@ Symbol destar(\seq([symbol]))
Symbol destar(\alt({symbol}))
= \alt({destar(symbol)});

Symbol destar(\conditional(symbol, conditions))
= \conditional(destar(symbol), conditions);

default Symbol destar(Symbol s) = s;

@synopsis{
Retain from set `symbols` each symbol that is a strict prefix of any other
Removes the conditional from symbol `s`, if any
}

// Strips the `\conditional` wrapper (if any) from a symbol, recursing in
// case conditionals are nested; any other symbol is returned unchanged
Symbol decond(\conditional(Symbol sym, _)) = decond(sym);
default Symbol decond(Symbol sym) = sym;

@synopsis{
Retains from set `symbols` each symbol that is a strict prefix of any other
symbol in `symbols`
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,28 @@ data Direction // Traverse lists of symbols (in productions)...
// Applies a traversal direction to a list: `forward()` keeps the original
// order, while `backward()` reverses it
list[&T] reorder(list[&T] l, forward()) = l;
list[&T] reorder(list[&T] l, backward()) = reverse(l);

@synopsis{
Gets the unique leftmost delimiter (`begin`) and the unique rightmost
delimiter (`end`), if any, that occur **inside** productions of symbol `s`
(when `s` is a non-terminal) or `s` itself (when `s` is a delimiter). If
`getOnlyFirst` is `true` (default: `false`), then only the first (resp.
last) symbol of the productions can be considered as leftmost (resp.
rightmost).
}

DelimiterPair getInnerDelimiterPair(Grammar g, Symbol s, bool getOnlyFirst = false) {
// Discard any label before classifying the symbol
s = delabel(s);
if (isDelimiter(s)) {
// A delimiter is trivially its own `begin` and `end` delimiter
return <just(s), just(s)>;
} else if (isNonTerminalType(s)) {
// Consult the per-symbol maps once per traversal direction:
// forward for the leftmost delimiter, backward for the rightmost
Maybe[Symbol] begin = getInnerDelimiterBySymbol(g, forward(), getOnlyFirst = getOnlyFirst)[s];
Maybe[Symbol] end = getInnerDelimiterBySymbol(g, backward(), getOnlyFirst = getOnlyFirst)[s];
return <begin, end>;
} else {
// Neither a delimiter nor a non-terminal: no inner delimiters exist
return <nothing(), nothing()>;
}
}

@synopsis{
Gets the unique leftmost delimiter (`begin`) and the unique rightmost
delimiter (`end`), if any, that occur **inside** production `p` in grammar
Expand Down Expand Up @@ -60,7 +82,7 @@ list[&T] reorder(list[&T] l, backward()) = reverse(l);
}

DelimiterPair getInnerDelimiterPair(Grammar g, Production p, bool getOnlyFirst = false) {
Maybe[Symbol] begin = getInnerDelimiterByProduction(g, forward() , getOnlyFirst = getOnlyFirst)[p];
Maybe[Symbol] begin = getInnerDelimiterByProduction(g, forward(), getOnlyFirst = getOnlyFirst)[p];
Maybe[Symbol] end = getInnerDelimiterByProduction(g, backward(), getOnlyFirst = getOnlyFirst)[p];
return <begin, end>;
}
Expand All @@ -79,6 +101,7 @@ private map[Production, Maybe[Symbol]] getInnerDelimiterByProduction(Grammar g,
for (p <- ret, ret[p] == nothing()) {
for (s <- reorder(p.symbols, direction)) {
s = delabel(s);
s = decond(s);
if (isDelimiter(s)) {
ret[p] = just(s);
break;
Expand Down
150 changes: 122 additions & 28 deletions rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ RscGrammar preprocess(RscGrammar rsc) {
// Replace occurrences of singleton ranges with just the corresponding
// literal. This makes it easier to identify delimiters.
return visit (rsc) {
case s: \char-class([range(char, char)]) => d
case \char-class([range(char, char)]) => d
when d := \lit("<stringChar(char)>"), isDelimiter(d)
}
}
Expand Down Expand Up @@ -113,12 +113,10 @@ list[ConversionUnit] analyze(RscGrammar rsc) {

// Analyze dependencies among productions
println("[LOG] Analyzing dependencies among productions");
Dependencies dependencies = deps(toGraph(rsc));
list[Production] prods = dependencies
.removeProds(isCyclic, true) // `true` means "also remove ancestors"
.retainProds(isNonEmpty)
.retainProds(hasCategory)
.getProds();
Graph[Production] graph = toGraph(rsc);
list[Production] prods = deps(graph).retainProds(isNonEmpty).retainProds(hasCategory).getProds();
list[Production] prodsNonRecursive = prods & deps(graph).removeProds(isCyclic, true).getProds();
list[Production] prodsRecursive = prods - prodsNonRecursive;

// Analyze delimiters
println("[LOG] Analyzing delimiters");
Expand All @@ -134,13 +132,15 @@ list[ConversionUnit] analyze(RscGrammar rsc) {
list[Production] prodsKeywords = [prod(lex(KEYWORDS_PRODUCTION_NAME), [\alt(keywords)], {\tag("category"("keyword.control"))})];

// Return
bool isEmptyProd(prod(_, [\alt(alternatives)], _)) = alternatives == {};
list[ConversionUnit] units
= [unit(rsc, p, hasNewline(rsc, p), getOuterDelimiterPair(rsc, p), getInnerDelimiterPair(rsc, p, getOnlyFirst = true)) | p <- prods]
+ [unit(rsc, p, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsDelimiters, !isEmptyProd(p)]
+ [unit(rsc, p, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsKeywords, !isEmptyProd(p)];

return sort(units);
bool isRecursive(Production p)
= p in prodsRecursive;
bool isEmptyProd(prod(_, [\alt(alternatives)], _))
= alternatives == {};

set[ConversionUnit] units = {};
units += {unit(rsc, p, isRecursive(p), hasNewline(rsc, p), getOuterDelimiterPair(rsc, p), getInnerDelimiterPair(rsc, p, getOnlyFirst = true)) | p <- prods};
units += {unit(rsc, p, false, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsDelimiters + prodsKeywords, !isEmptyProd(p)};
return sort([*removeStrictPrefixes(units)]);
}

@synopsis{
Expand Down Expand Up @@ -196,7 +196,7 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {

// Convert all units in the group to match patterns (including,
// optimistically, multi-line units as-if they are single-line)
for (u <- group) {
for (u <- group, !u.recursive) {
TmRule r = toTmRule(toRegExp(u.rsc, u.prod, guard = true))
[name = "/inner/single/<u.name>"];

Expand All @@ -216,32 +216,116 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
// Simple case: each unit does have an `end` inner delimiter
if (_ <- group && all(u <- group, just(_) := u.innerDelimiters.end)) {

// Compute a list of segments that need to be consumed between
// Compute a set of segments that need to be consumed between
// the `begin` delimiter and the `end` delimiters. Each of these
// segments will be converted to a match pattern.
set[Segment] segs = {*getSegments(rsc, u.prod) | u <- group};
segs = {removeBeginEnd(seg, begins, ends) | seg <- segs};

list[Symbol] terminals = [\seq(seg.symbols) | seg <- segs];
terminals = [s | s <- terminals, [] != s.symbols];
terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
terminals = dup(terminals);
terminals = sortByMinimumLength(terminals); // Small symbols first
terminals = reverse(terminals); // Large symbols first
terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback)
sungshik marked this conversation as resolved.
Show resolved Hide resolved

TmRule r = toTmRule(
toRegExp(rsc, [begin], {t}),
toRegExp(rsc, [\alt(ends)], {t}),
[toTmRule(toRegExp(rsc, [s], {t})) | s <- terminals])
[toTmRule(toRegExp(rsc, [s], {t})) | s <- toTerminals(segs)])
[name = "/inner/multi/<intercalate(",", [u.name | u <- group])>"];

rules = insertIn(rules, (u: r | u <- group));
}

// Complex case: some unit doesn't have an `end` inner delimiter
// Complex case: some unit doesn't have an `end` inner delimiter.
// This requires (substantial) extra care, as there is no obvious
// marker to close the begin/end pattern with.
else {
; // TODO (part of future support for *recursive* multi-line units)
Decomposition decomposition = decompose([*group]);
sungshik marked this conversation as resolved.
Show resolved Hide resolved

// TODO: The following condition can be true (even though there
// has to be a `begin` delimiter) because `decompose` doesn't
// expand non-terminals. Consider if it should, to maybe improve
// accuracy.
if ([] == decomposition.prefix) {
continue;
}

RegExp reBegin = toRegExp(rsc, decomposition.prefix, {t});
RegExp reEnd = regExp("(?=.)", []);

patterns = for (suffix <- decomposition.suffixes) {
if (just(Symbol begin) := getInnerDelimiterPair(rsc, suffix[0], getOnlyFirst = true).begin) {
if (just(Symbol end) := getInnerDelimiterPair(rsc, suffix[-1], getOnlyFirst = true).end) {
// If the suffix has both a `begin` delimiter
// and an `end` delimiter, then generate a
// begin/end pattern to highlight these delimiters
// and all content in between.

set[Segment] segs = getSegments(rsc, suffix);
segs = {removeBeginEnd(seg, {begin}, {end}) | seg <- segs};

append toTmRule(
toRegExp(rsc, [begin], {t}),
toRegExp(rsc, [end], {t}),
[toTmRule(toRegExp(rsc, [s], {t})) | s <- toTerminals(segs)]);
}

else {
// If the suffix has a `begin` delimiter, but not
// an `end` delimiter, then generate a match pattern
// just to highlight that `begin` delimiter. Ignore
// the remainder of the suffix (because it's
// recursive, so no regular expression can be
// generated for it).
append toTmRule(toRegExp(rsc, [begin], {t}));
}
}

else {
// If the suffix doesn't have a `begin` delimiter, then
// ignore it (because it's recursive, so no regular
// expression can be generated for it).
;
}
}

TmRule r = toTmRule(reBegin, reEnd, patterns);
r = r[name = "/inner/multi/<intercalate(",", [u.name | u <- group])>"];
r = r[applyEndPatternLast = true];

rules = insertIn(rules, (u: r | u <- group));

// TODO: The current approach produces "partially"
// newline-sensitive rules, in the sense that newlines are
// accepted between the prefix and the suffixes, but not between
// symbols in the prefix. This approach could be improved to
// produce "totally" newline-sensitive rules (at the cost of
// much more complicated rule generation and generated rules) by
// adopting an approach in which the rules for each symbol in
// the prefix looks something like the following three:
//
// ```
// "foo": {
// "name": "foo",
// "begin": "(\\@)",
// "end": "(?!\\G)|(?:(?!$)(?![a-z]+))",
// "patterns": [{ "include": "#foo.$" }, { "match": "[a-z]+" }],
// "contentName": "comment",
// "beginCaptures": { "1": { "name": "comment" } }
// },
// "foo.$": {
// "begin": "$",
// "end": "(?<=^.+)|(?:(?!$)(?![a-z]+))",
// "name": "foo.$",
// "patterns": [ { "include": "#foo.^" }]
// },
// "foo.^": {
// "begin": "^",
// "end": "(?!\\G)|(?:(?!$)(?![a-z]+))",
// "name": "foo.^",
// "patterns": [{ "include": "#foo.$" }, { "match": "[a-z]+" }]
// }
// ```
//
// Note: This alternative approach would likely render the
// present distinction between the "simple case" and the
// "complex case" unneeded, so in that sense, rule generation
// would actually become simpler.
}
}
}
Expand Down Expand Up @@ -302,10 +386,20 @@ private Segment removeBeginEnd(Segment seg, set[Symbol] begins, set[Symbol] ends
if (seg.final, _ <- symbols, symbols[-1] in ends) {
symbols = symbols[..-1];
}

return seg[symbols = symbols];
}

// Converts a set of segments to a list of terminal symbols, ordered from
// large to small, ending with an any-character fallback
private list[Symbol] toTerminals(set[Segment] segs) {
// One candidate terminal per segment, skipping empty segments
list[Symbol] candidates = [\seq(seg.symbols) | seg <- segs, [] != seg.symbols];

// The tokenization engine always tries to apply rules repeatedly, so
// repetition can be stripped; deduplicate afterwards
candidates = dup([destar(s) | s <- candidates]);

// Sort by minimum length (small symbols first), then flip so that large
// symbols come first
candidates = reverse(sortByMinimumLength(candidates));

// Append any char (as a fallback)
return candidates + \char-class([range(1,0x10FFFF)]);
}
sungshik marked this conversation as resolved.
Show resolved Hide resolved

// TODO: This function could be moved to a separate, generic module
// Deduplicates list `l`, keeping the *last* occurrence of each element
// (whereas `dup` keeps the first occurrence)
private list[&T] dupLast(list[&T] l)
= reverse(dup(reverse(l))); // TODO: Optimize/avoid `reverse`-ing?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ bool doAnalyzeTest(RscGrammar rsc, list[ConversionUnit] expect, bool printActual
println();
for (i <- [0..size(actual)]) {
ConversionUnit u = actual[i];
println(" unit(rsc, <toStr(u.prod)>, <u.multiLine>, <u.outerDelimiters>, <u.innerDelimiters>)<i < size(actual) - 1 ? "," : "">");
println(" unit(rsc, <toStr(u.prod)>, <u.recursive>, <u.multiLine>, <u.outerDelimiters>, <u.innerDelimiters>)<i < size(actual) - 1 ? "," : "">");
}
println();
}
Expand Down
Loading