
Recursive multiline highlighting #17

Merged 19 commits on Sep 9, 2024 (diff below shows changes from 16 commits)

Commits:
- 2238588 Add `applyEndPatternLast` parameter to `beginEnd` constructor (sungshik, Aug 23, 2024)
- 3b11353 Add utility function to check if a symbol is recursive (sungshik, Aug 23, 2024)
- f6a2803 Add utility function to remove the conditional from a symbol (sungshik, Aug 23, 2024)
- 667ae4b Add overloaded version of `getInnerDelimiterPair` for symbols (sungshik, Aug 23, 2024)
- 22ea045 Add `\conditional` to `destar` (sungshik, Aug 23, 2024)
- 217252d Fix typo (sungshik, Sep 2, 2024)
- 4d59dac Add `recursive` parameter to `ConversionUnit` (sungshik, Sep 2, 2024)
- bd14940 Add function to remove prefix conversion units from a list (sungshik, Sep 2, 2024)
- 1dcd56e Widen applicability of function `getInnerDelimiterPair` for symbols t… (sungshik, Sep 3, 2024)
- 044d10c Add function to decompose lists of units into prefix/suffixes (sungshik, Sep 3, 2024)
- 6ba991d Add support for recursive productions to the converter (sungshik, Sep 3, 2024)
- 5e1f917 Update tests to support recursive productions (sungshik, Sep 6, 2024)
- 8d7d635 Merge branch 'identify-newline-separated-segments' into recursive-mul… (sungshik, Sep 6, 2024)
- c310743 Add new test module (`RascalTag`) to test support for recursive produ… (sungshik, Sep 6, 2024)
- 9070cdd Merge branch 'main' into recursive-multiline-highlighting2 (sungshik, Sep 6, 2024)
- 7fa2df6 Add a few more comments (sungshik, Sep 9, 2024)
- df909e5 Add a few more comments (sungshik, Sep 9, 2024)
- 546bb13 Update generated TextMate grammar for Rascal/Pico (sungshik, Sep 9, 2024)
- e8a887c Simplify a few expressions to improve readability (sungshik, Sep 9, 2024)
@@ -31,6 +31,22 @@ bool tryParse(Grammar g, Symbol s, str input, bool allowAmbiguity = false) {
return false;
}

@synopsis{
Checks if symbol `s` is recursive in grammar `g`
}

bool isRecursive(Grammar g, Symbol s) {
set[Symbol] getChildren(Symbol s)
= {s | p <- lookup(g, s), /Symbol s := p.symbols};

bool check(set[Symbol] checking, Symbol s)
= s in checking
? true
: any(child <- getChildren(s), check(checking + s, child));

return check({}, s);
}
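// Usage sketch (hypothetical grammar, for illustration only): given
//   syntax Exp = Exp "+" Exp | Id;
//   syntax Id  = [a-z]+;
// `Exp` occurs in productions reachable from itself, while nothing reachable
// from `Id` mentions `Id` again, so:
//   isRecursive(g, sort("Exp")) == true
//   isRecursive(g, sort("Id"))  == false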

@synopsis{
Looks up a list of productions for symbol `s` in grammar `g`, replacing
formal parameters with actual parameters when needed
@@ -96,10 +112,20 @@ Symbol destar(\seq([symbol]))
Symbol destar(\alt({symbol}))
= \alt({destar(symbol)});

Symbol destar(\conditional(symbol, conditions))
= \conditional(destar(symbol), conditions);

default Symbol destar(Symbol s) = s;

@synopsis{
Removes the conditional from symbol `s`, if any
}

Symbol decond(\conditional(Symbol s, _)) = decond(s);
default Symbol decond(Symbol s) = s;
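// For instance (illustrative values):
//   decond(\conditional(lit("do"), {\not-follow(lit("-"))})) == lit("do")
//   decond(lit("do")) == lit("do")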

@synopsis{
Retains from set `symbols` each symbol that is a strict prefix of any other
symbol in `symbols`
}

@@ -26,6 +26,28 @@ data Direction // Traverse lists of symbols (in productions)...
list[&T] reorder(list[&T] l, forward()) = l;
list[&T] reorder(list[&T] l, backward()) = reverse(l);

@synopsis{
Gets the unique leftmost delimiter (`begin`) and the unique rightmost
delimiter (`end`), if any, that occur **inside** productions of symbol `s`
(when `s` is a non-terminal) or `s` itself (when `s` is a delimiter). If
`getOnlyFirst` is `true` (default: `false`), then only the first (resp.
last) symbol of the productions can be considered as leftmost (resp.
rightmost).
}

DelimiterPair getInnerDelimiterPair(Grammar g, Symbol s, bool getOnlyFirst = false) {
s = delabel(s);
if (isDelimiter(s)) {
return <just(s), just(s)>;
} else if (isNonTerminalType(s)) {
Maybe[Symbol] begin = getInnerDelimiterBySymbol(g, forward(), getOnlyFirst = getOnlyFirst)[s];
Maybe[Symbol] end = getInnerDelimiterBySymbol(g, backward(), getOnlyFirst = getOnlyFirst)[s];
return <begin, end>;
} else {
return <nothing(), nothing()>;
}
}
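// For instance (hypothetical grammar): given  syntax Block = "{" Stat* "}";
//   getInnerDelimiterPair(g, sort("Block")) == <just(lit("{")), just(lit("}"))>
// and, when `s` is itself a delimiter:
//   getInnerDelimiterPair(g, lit("{")) == <just(lit("{")), just(lit("{"))>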

@synopsis{
Gets the unique leftmost delimiter (`begin`) and the unique rightmost
delimiter (`end`), if any, that occur **inside** production `p` in grammar
@@ -60,7 +82,7 @@ list[&T] reorder(list[&T] l, backward()) = reverse(l);
}

DelimiterPair getInnerDelimiterPair(Grammar g, Production p, bool getOnlyFirst = false) {
Maybe[Symbol] begin = getInnerDelimiterByProduction(g, forward(), getOnlyFirst = getOnlyFirst)[p];
Maybe[Symbol] end = getInnerDelimiterByProduction(g, backward(), getOnlyFirst = getOnlyFirst)[p];
return <begin, end>;
}
@@ -79,6 +101,7 @@ private map[Production, Maybe[Symbol]] getInnerDelimiterByProduction(Grammar g,
for (p <- ret, ret[p] == nothing()) {
for (s <- reorder(p.symbols, direction)) {
s = delabel(s);
s = decond(s);
if (isDelimiter(s)) {
ret[p] = just(s);
break;
125 changes: 100 additions & 25 deletions rascal-textmate-core/src/main/rascal/lang/textmate/Conversion.rsc

@@ -113,12 +113,10 @@ list[ConversionUnit] analyze(RscGrammar rsc) {

// Analyze dependencies among productions
println("[LOG] Analyzing dependencies among productions");
Graph[Production] graph = toGraph(rsc);
list[Production] prods = deps(graph).retainProds(isNonEmpty).retainProds(hasCategory).getProds();
list[Production] prodsNonRecursive = prods & deps(graph).removeProds(isCyclic, true).getProds(); // `true` means "also remove ancestors"
list[Production] prodsRecursive = prods - prodsNonRecursive;

// Analyze delimiters
println("[LOG] Analyzing delimiters");
@@ -135,12 +133,13 @@ list[ConversionUnit] analyze(RscGrammar rsc) {

// Return
bool isEmptyProd(prod(_, [\alt(alternatives)], _)) = alternatives == {};
set[ConversionUnit] units
= {unit(rsc, p, false, hasNewline(rsc, p), getOuterDelimiterPair(rsc, p), getInnerDelimiterPair(rsc, p, getOnlyFirst = true)) | p <- prodsNonRecursive}
+ {unit(rsc, p, true, hasNewline(rsc, p), getOuterDelimiterPair(rsc, p), getInnerDelimiterPair(rsc, p, getOnlyFirst = true)) | p <- prodsRecursive}
+ {unit(rsc, p, false, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsDelimiters, !isEmptyProd(p)}
+ {unit(rsc, p, false, false, <nothing(), nothing()>, <nothing(), nothing()>) | p <- prodsKeywords, !isEmptyProd(p)};

return sort([*removeStrictPrefixes(units)]);
}

@synopsis{
@@ -196,7 +195,7 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {

// Convert all units in the group to match patterns (including,
// optimistically, multi-line units as-if they are single-line)
for (u <- group, !u.recursive) {
TmRule r = toTmRule(toRegExp(u.rsc, u.prod, guard = true))
[name = "/inner/single/<u.name>"];

@@ -216,32 +215,98 @@ private list[ConversionUnit] addInnerRules(list[ConversionUnit] units) {
// Simple case: each unit does have an `end` inner delimiter
if (_ <- group && all(u <- group, just(_) := u.innerDelimiters.end)) {

// Compute a set of segments that need to be consumed between
// the `begin` delimiter and the `end` delimiters. Each of these
// segments will be converted to a match pattern.
set[Segment] segs = {*getSegments(rsc, u.prod) | u <- group};
segs = {removeBeginEnd(seg, begins, ends) | seg <- segs};

TmRule r = toTmRule(
toRegExp(rsc, [begin], {t}),
toRegExp(rsc, [\alt(ends)], {t}),
[toTmRule(toRegExp(rsc, [s], {t})) | s <- toTerminals(segs)])
[name = "/inner/multi/<intercalate(",", [u.name | u <- group])>"];

rules = insertIn(rules, (u: r | u <- group));
}

// Complex case: some unit doesn't have an `end` inner delimiter.
// This requires (substantial) extra care, as there is no obvious
// marker to close the begin/end pattern with.
else {
Decomposition decomposition = decompose([*group]);

// TODO: The following condition can be true (even though there
// has to be a `begin` delimiter) because `decompose` doesn't
// expand non-terminals. Consider whether it should, to maybe improve
// accuracy.
if ([] == decomposition.prefix) {
continue;
}

RegExp reBegin = toRegExp(rsc, decomposition.prefix, {t});
RegExp reEnd = regExp("(?=.)", []);

patterns = for (suffix <- decomposition.suffixes) {
if (just(Symbol begin) := getInnerDelimiterPair(rsc, suffix[0], getOnlyFirst = true).begin) {
if (just(Symbol end) := getInnerDelimiterPair(rsc, suffix[-1], getOnlyFirst = true).end) {
set[Segment] segs = getSegments(rsc, suffix);
segs = {removeBeginEnd(seg, {begin}, {end}) | seg <- segs};

append toTmRule(
toRegExp(rsc, [begin], {t}),
toRegExp(rsc, [end], {t}),
[toTmRule(toRegExp(rsc, [s], {t})) | s <- toTerminals(segs)]);
}

else {
append toTmRule(toRegExp(rsc, [begin], {t}));
}
}
}

TmRule r = toTmRule(reBegin, reEnd, patterns);
r = r[name = "/inner/multi/<intercalate(",", [u.name | u <- group])>"];
r = r[applyEndPatternLast = true];

rules = insertIn(rules, (u: r | u <- group));

// TODO: The current approach produces "partially"
// newline-sensitive rules, in the sense that newlines are
// accepted between the prefix and the suffixes, but not between
// symbols in the prefix. This approach could be improved to
// produce "totally" newline-sensitive rules (at the cost of
// much more complicated rule generation and generated rules) by
// adopting an approach in which the rules for each symbol in
// the prefix look something like the following three:
//
// ```
// "foo": {
// "name": "foo",
// "begin": "(\\@)",
// "end": "(?!\\G)|(?:(?!$)(?![a-z]+))",
// "patterns": [{ "include": "#foo.$" }, { "match": "[a-z]+" }],
// "contentName": "comment",
// "beginCaptures": { "1": { "name": "comment" } }
// },
// "foo.$": {
// "begin": "$",
// "end": "(?<=^.+)|(?:(?!$)(?![a-z]+))",
// "name": "foo.$",
// "patterns": [ { "include": "#foo.^" }]
// },
// "foo.^": {
// "begin": "^",
// "end": "(?!\\G)|(?:(?!$)(?![a-z]+))",
// "name": "foo.^",
// "patterns": [{ "include": "#foo.$" }, { "match": "[a-z]+" }]
// }
// ```
//
// Note: This alternative approach would likely render the
// present distinction between the "simple case" and the
// "complex case" unneeded, so in that sense, rule generation
// would actually become simpler.
}
}
}
@@ -302,10 +367,20 @@ private Segment removeBeginEnd(Segment seg, set[Symbol] begins, set[Symbol] ends
if (seg.final, _ <- symbols, symbols[-1] in ends) {
symbols = symbols[..-1];
}

return seg[symbols = symbols];
}
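// For instance (illustrative values): for a final segment whose symbols are
// [sort("Stat"), lit("}")], with `ends == {lit("}")}`, the result keeps only
// [sort("Stat")]; the delimiter itself is matched by the begin/end pattern.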

private list[Symbol] toTerminals(set[Segment] segs) {
list[Symbol] terminals = [\seq(seg.symbols) | seg <- segs];
terminals = [s | s <- terminals, [] != s.symbols];
terminals = [destar(s) | s <- terminals]; // The tokenization engine always tries to apply rules repeatedly
terminals = dup(terminals);
terminals = sortByMinimumLength(terminals); // Small symbols first
terminals = reverse(terminals); // Large symbols first
terminals = terminals + \char-class([range(1,0x10FFFF)]); // Any char (as a fallback)
return terminals;
}
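// For instance (illustrative values): segments with symbols [lit("a")] and
// [lit("a"), lit("b")] yield [\seq([lit("a"), lit("b")]), \seq([lit("a")]),
// \char-class([range(1,0x10FFFF)])] -- larger symbols first, any-char last.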

// TODO: This function could be moved to a separate, generic module
private list[&T] dupLast(list[&T] l)
= reverse(dup(reverse(l))); // TODO: Optimize/avoid `reverse`-ing?
@@ -38,7 +38,7 @@ bool doAnalyzeTest(RscGrammar rsc, list[ConversionUnit] expect, bool printActual
println();
for (i <- [0..size(actual)]) {
ConversionUnit u = actual[i];
println(" unit(rsc, <toStr(u.prod)>, <u.recursive>, <u.multiLine>, <u.outerDelimiters>, <u.innerDelimiters>)<i < size(actual) - 1 ? "," : "">");
}
println();
}
@@ -7,8 +7,10 @@ module lang::textmate::ConversionUnit

import Grammar;
import ParseTree;
import util::Math;
import util::Maybe;

import lang::rascal::grammar::Util;
import lang::rascal::grammar::analyze::Delimiters;
import lang::textmate::ConversionConstants;
import lang::textmate::Grammar;
@@ -43,7 +45,7 @@ data ConversionUnit = unit(
// The following parameters are set when a unit is created during analysis:
Grammar rsc,
Production prod,
bool recursive,
bool multiLine,
DelimiterPair outerDelimiters,
DelimiterPair innerDelimiters,
@@ -137,4 +139,95 @@ private list[tuple[Keygen, Compare]] sorters = [

// Sort by stringified production
<getStringifiedProduction, bool(str s1, str s2) { return s1 < s2; }>
];

@synopsis{
Retains from set `units` each unit whose production's list of symbols is a
strict prefix of that of any other unit in `units`
}

set[ConversionUnit] retainStrictPrefixes(set[ConversionUnit] units)
= {u1 | u1 <- units, any(u2 <- units, u1 != u2, isStrictPrefix(u1, u2))};

@synopsis{
Removes from set `units` each unit whose production's list of symbols is a
strict prefix of that of any other unit in `units`
}

set[ConversionUnit] removeStrictPrefixes(set[ConversionUnit] units)
= units - retainStrictPrefixes(units);

@synopsis{
Checks if unit `u1` is a strict prefix of unit `u2`
}

bool isStrictPrefix(ConversionUnit u1, ConversionUnit u2)
= isStrictPrefix(u1.prod.symbols, u2.prod.symbols);

// TODO: This function could be moved to a separate, generic module
private bool isStrictPrefix([], [])
= false;
private bool isStrictPrefix([], [_, *_])
= true;
private bool isStrictPrefix([_, *_], [])
= false;
private bool isStrictPrefix([head1, *tail1], [head2, *tail2])
= head1 == head2 && isStrictPrefix(tail1, tail2);
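// For instance:
//   isStrictPrefix([lit("a")], [lit("a"), lit("b")]) == true
//   isStrictPrefix([lit("a"), lit("b")], [lit("a"), lit("b")]) == false (equal, so not strict)
//   isStrictPrefix([lit("b")], [lit("a"), lit("b")]) == false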

@synopsis{
Representation of a decomposition of a list of units (i.e., the lists of
symbols of their productions) into their maximally common prefix (which
contains no recursive symbols) and their minimally disjoint suffixes. See
also function `decompose`.
}

@description{
For instance, consider the following lists of symbols:
- `[lit("foo"), lit("bar"), lit("baz")]`;
- `[lit("foo"), lit("bar"), lit("qux"), lit("quux")]`.

The maximally common prefix is `[lit("foo"), lit("bar")]`. The minimally
disjoint suffixes are `[lit("baz")]` and `[lit("qux"), lit("quux")]`.
}

alias Decomposition = tuple[
list[Symbol] prefix,
list[list[Symbol]] suffixes
];

@synopsis{
Decomposes list `units`. See also type `Decomposition`.
}

Decomposition decompose(list[ConversionUnit] units) {
list[Symbol] prefix = [];
list[list[Symbol]] suffixes = [];

list[Production] prods = [u.prod | u <- units];
set[Grammar] grammars = {u.rsc | u <- units};

if (_ <- prods && {rsc} := grammars) {
list[int] sizes = [size(p.symbols) | p <- prods];
int n = (sizes[0] | min(it, size) | size <- sizes[1..]);

// Compute prefix (at most of size `n`)
prefix = for (i <- [0..n]) {
set[Symbol] iths = {p.symbols[i] | p <- prods};
if ({ith} := iths && !isRecursive(rsc, delabel(ith))) {
append ith;
} else {
break;
}
}

// Compute suffixes
suffixes = for (p <- prods) {
list[Symbol] suffix = p.symbols[size(prefix)..];
if (_ <- suffix) {
append suffix;
}
}
}

return <prefix, suffixes>;
}
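// Usage sketch (hypothetical units, following the example in the description
// of `Decomposition` above): if
//   u1.prod.symbols == [lit("foo"), lit("bar"), lit("baz")]
//   u2.prod.symbols == [lit("foo"), lit("bar"), lit("qux"), lit("quux")]
// and neither lit("foo") nor lit("bar") is recursive in the shared grammar, then
//   decompose([u1, u2]) == <[lit("foo"), lit("bar")], [[lit("baz")], [lit("qux"), lit("quux")]]>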