Skip to content

Commit

Permalink
Use lexer to help detect indentation of strings
Browse files Browse the repository at this point in the history
  • Loading branch information
JojOatXGME committed Jul 15, 2024
1 parent 7d2f79b commit f47a403
Show file tree
Hide file tree
Showing 7 changed files with 197 additions and 35 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,15 @@ public class NixSyntaxHighlighter extends SyntaxHighlighterBase {
entry(NixTypes.URI, NixTextAttributes.URI),
// String literals
entry(NixTypes.STR, NixTextAttributes.STRING),
entry(NixTypes.STR_ESCAPE, NixTextAttributes.STRING_ESCAPE),
entry(NixTypes.STRING_CLOSE, NixTextAttributes.STRING),
entry(NixTypes.STRING_OPEN, NixTextAttributes.STRING),
entry(NixTypes.IND_STR, NixTextAttributes.STRING),
entry(NixTypes.IND_STR_LF, NixTextAttributes.STRING),
entry(NixTypes.IND_STR_INDENT, NixTextAttributes.STRING),
entry(NixTypes.IND_STR_ESCAPE, NixTextAttributes.STRING_ESCAPE),
entry(NixTypes.IND_STRING_CLOSE, NixTextAttributes.STRING),
entry(NixTypes.IND_STRING_OPEN, NixTextAttributes.STRING),
entry(NixTypes.STR_ESCAPE, NixTextAttributes.STRING_ESCAPE),
entry(NixTypes.IND_STR_ESCAPE, NixTextAttributes.STRING_ESCAPE),
// Other
entry(NixTypes.SCOMMENT, NixTextAttributes.LINE_COMMENT),
entry(NixTypes.MCOMMENT, NixTextAttributes.BLOCK_COMMENT),
Expand Down
78 changes: 67 additions & 11 deletions src/main/java/org/nixos/idea/util/NixStringUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@
import com.intellij.psi.tree.IElementType;
import org.jetbrains.annotations.Contract;
import org.jetbrains.annotations.NotNull;
import org.nixos.idea.psi.NixAntiquotation;
import org.nixos.idea.psi.NixIndString;
import org.nixos.idea.psi.NixStdString;
import org.nixos.idea.psi.NixString;
import org.nixos.idea.psi.NixStringPart;
import org.nixos.idea.psi.NixStringText;
import org.nixos.idea.psi.NixTypes;

/**
* Utilities for strings in the Nix Expression Language.
* Utilities for encoding and decoding strings in the Nix Expression Language.
*/
public final class NixStringUtil {

Expand Down Expand Up @@ -87,6 +91,43 @@ public static void escape(@NotNull StringBuilder builder, @NotNull CharSequence
}
}

/**
 * Determines how many leading characters are at most stripped from each line of the given string.
 * For a standard string, this is always {@code 0}.
 * For an indented string, this is the smallest indentation among the lines which have content.
 * If no line of an indented string has any content, {@link Integer#MAX_VALUE} is returned.
 *
 * @param string the string from which to get the indentation
 * @return the detected indentation, or {@link Integer#MAX_VALUE}
 * @throws IllegalStateException if the given string is neither a {@link NixStdString} nor a {@link NixIndString}
 */
public static int detectMaxIndent(@NotNull NixString string) {
    if (string instanceof NixStdString) {
        return 0;
    }
    if (!(string instanceof NixIndString)) {
        throw new IllegalStateException("Unexpected subclass of NixString: " + string.getClass());
    }

    // Smallest indentation among all lines with content seen so far
    int minIndent = Integer.MAX_VALUE;
    // Candidate for the current line; only becomes effective once the line turns out to have content
    int lineIndent = 0;
    for (NixStringPart part : string.getStringParts()) {
        if (!(part instanceof NixStringText textNode)) {
            // An interpolation counts as content of the current line
            assert part instanceof NixAntiquotation : part.getClass();
            minIndent = lineIndent;
            continue;
        }
        ASTNode token = textNode.getNode().getFirstChildNode();
        while (token != null) {
            IElementType type = token.getElementType();
            if (type == NixTypes.IND_STR_LF) {
                // A new line starts; without an indent token, its indentation is zero
                lineIndent = 0;
            } else if (type == NixTypes.IND_STR_INDENT) {
                lineIndent = Math.min(minIndent, token.getTextLength());
            } else {
                assert type == NixTypes.IND_STR || type == NixTypes.IND_STR_ESCAPE : type;
                minIndent = lineIndent;
            }
            token = token.getTreeNext();
        }
    }
    return minIndent;
}

/**
* Returns the content of the given part of a string in the Nix Expression Language.
* All escape sequences are resolved.
Expand All @@ -95,31 +136,46 @@ public static void escape(@NotNull StringBuilder builder, @NotNull CharSequence
* @return The resulting string after resolving all escape sequences.
*/
public static @NotNull String parse(@NotNull NixStringText textNode) {
int maxIndent = detectMaxIndent((NixString) textNode.getParent());
StringBuilder builder = new StringBuilder();
for (ASTNode child = textNode.getNode().getFirstChildNode(); child != null; child = child.getTreeNext()) {
parse(builder, child);
parse(builder, child, maxIndent);
}
return builder.toString();
}

private static void parse(@NotNull StringBuilder builder, @NotNull ASTNode token) {
private static void parse(@NotNull StringBuilder builder, @NotNull ASTNode token, int maxIndent) {
CharSequence text = token.getChars();
IElementType type = token.getElementType();
if (type == NixTypes.STR || type == NixTypes.IND_STR) {
if (type == NixTypes.STR || type == NixTypes.IND_STR || type == NixTypes.IND_STR_LF) {
builder.append(text);
} else if (type == NixTypes.IND_STR_INDENT) {
int end = text.length();
if (end > maxIndent) {
CharSequence remain = text.subSequence(maxIndent, end);
builder.append(remain);
}
} else if (type == NixTypes.STR_ESCAPE) {
assert text.length() == 2 && text.charAt(0) == '\\' : text;
char c = text.charAt(1);
builder.append(unescape(c));
} else if (type == NixTypes.IND_STR_ESCAPE) {
assert text.length() == 3 && ("''$".contentEquals(text) || "'''".contentEquals(text)) ||
text.length() == 4 && "''\\".contentEquals(text.subSequence(0, 3)) : text;
if ("'''".contentEquals(text)){
builder.append("''");
return;
switch (text.charAt(2)) {
case '$' -> {
assert "''$".contentEquals(text) : text;
builder.append("$");
}
case '\'' -> {
assert "'''".contentEquals(text) : text;
builder.append("''");
}
case '\\' -> {
assert text.length() == 4 && "''\\".contentEquals(text.subSequence(0, 3)) : text;
char c = text.charAt(3);
builder.append(unescape(c));
}
default -> throw new IllegalStateException("Unknown escape sequence: " + text);
}
char c = text.charAt(text.length() - 1);
builder.append(unescape(c));
} else {
throw new IllegalStateException("Unexpected token in string: " + token);
}
Expand Down
2 changes: 1 addition & 1 deletion src/main/lang/Nix.bnf
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ string_text ::= string_token+
antiquotation ::= DOLLAR LCURLY expr recover_antiquotation RCURLY { pin=1 }
private recover_antiquotation ::= { recoverWhile=curly_recover }
private string_part_recover ::= !(DOLLAR | STRING_CLOSE | IND_STRING_CLOSE | string_token)
private string_token ::= STR | IND_STR | STR_ESCAPE | IND_STR_ESCAPE
private string_token ::= STR | STR_ESCAPE | IND_STR | IND_STR_INDENT | IND_STR_ESCAPE | IND_STR_LF

;{ extends("bind_attr|bind_inherit")=bind }
bind ::= bind_attr | bind_inherit
Expand Down
46 changes: 30 additions & 16 deletions src/main/lang/Nix.flex
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,27 @@ import static org.nixos.idea.psi.NixTypes.*;
private final AbstractIntList states = new IntArrayList();

private void pushState(int newState) {
if (newState == YYINITIAL){
throw new IllegalStateException("Pusing YYINITIAL is not supported");
}
assert newState != YYINITIAL : "Pusing YYINITIAL is not supported";
// store current state on the stack to allow restoring it in popState(...)
states.push(yystate());
yybegin(newState);
}

private void popState(int expectedState) {
if (states.isEmpty()){
throw new IllegalStateException("Popping an empty stack of states. Expected: " + expectedState);
}
assert !states.isEmpty() : "Popping an empty stack of states. Expected: " + expectedState;
// safe-guard, because we always know which state we're currently in in the rules below
if (yystate() != expectedState) {
throw new IllegalStateException(String.format("Unexpected state. Current: %d, expected: %d", yystate(), expectedState));
}
assert yystate() == expectedState : String.format("Unexpected state. Current: %d, expected: %d", yystate(), expectedState);
// start the lexer with the previous state, which was stored by pushState(...)
yybegin(states.popInt());
}

/**
 * Switches the lexer from {@code expectedState} to {@code newState}.
 * In contrast to {@code pushState(int)} and {@code popState(int)}, the stack of stored states is left untouched.
 *
 * @param expectedState the state the lexer is expected to be in when this method is called
 * @param newState the state to switch to, must not be {@code YYINITIAL}
 */
private void replaceState(int expectedState, int newState) {
    assert newState != YYINITIAL : "Replacing the current state with YYINITIAL is not supported";
    // safe-guard, because we always know which state we're currently in in the rules below
    assert yystate() == expectedState : String.format("Unexpected state. Current: %d, expected: %d", yystate(), expectedState);
    yybegin(newState);
}

/**
 * Discards all states stored by {@code pushState(int)}, leaving the stack of states empty.
 * NOTE(review): presumably invoked when the lexer is reset to a new input; the caller is not visible here.
 */
protected void onReset() {
states.clear();
}
Expand All @@ -44,7 +45,9 @@ import static org.nixos.idea.psi.NixTypes.*;
%function advance
%type IElementType
%unicode
%state BLOCK STRING IND_STRING ANTIQUOTATION_START ANTIQUOTATION PATH
%state BLOCK STRING IND_STRING ANTIQUOTATION_START ANTIQUOTATION
%xstate IND_STRING_START IND_STRING_INDENT PATH
%suppress empty-match

ANY=[^]
ID=[a-zA-Z_][a-zA-Z0-9_'-]*
Expand All @@ -71,8 +74,19 @@ MCOMMENT=\/\*([^*]|\*[^\/])*\*\/
\" { popState(STRING); return STRING_CLOSE; }
}

<IND_STRING_START> {
// The first line is ignored in case it is empty
// (it is returned as WHITE_SPACE instead of a string token, and lexing continues in IND_STRING_INDENT)
[\ ]*\n { replaceState(IND_STRING_START, IND_STRING_INDENT); return com.intellij.psi.TokenType.WHITE_SPACE; }
}

<IND_STRING_START, IND_STRING_INDENT> {
// Leading spaces of a line become a separate IND_STR_INDENT token; the rest of the line is lexed in IND_STRING
[\ ]+ { replaceState(yystate(), IND_STRING); return IND_STR_INDENT; }
// No leading spaces: empty match which only switches to IND_STRING (the action returns no token)
"" { replaceState(yystate(), IND_STRING); }
}

<IND_STRING> {
[^\$\']+ { return IND_STR; }
\n { replaceState(IND_STRING, IND_STRING_INDENT); return IND_STR_LF; }
[^\$\'\n]+ { return IND_STR; }
"$"|"$$"|"'" { return IND_STR; }
"''$"|"'''" { return IND_STR_ESCAPE; }
"''"\\{ANY} { return IND_STR_ESCAPE; }
Expand All @@ -83,7 +97,7 @@ MCOMMENT=\/\*([^*]|\*[^\/])*\*\/
<ANTIQUOTATION_START> {
// '$' and '{' must be two separate tokens to make NixBraceMatcher work
// correctly with Grammar-Kit.
"{" { popState(ANTIQUOTATION_START); pushState(ANTIQUOTATION); return LCURLY; }
"{" { replaceState(ANTIQUOTATION_START, ANTIQUOTATION); return LCURLY; }
}

<ANTIQUOTATION> {
Expand All @@ -98,10 +112,10 @@ MCOMMENT=\/\*([^*]|\*[^\/])*\*\/
"$"/"{" { pushState(ANTIQUOTATION_START); return DOLLAR; }
{PATH_SEG} { return PATH_SEGMENT; }
{PATH_CHAR}+ { return PATH_SEGMENT; }
// anything else, e.g. whitespace, stops lexing of a PATH
// anything else, e.g. a whitespace, stops lexing of a PATH
// we're delegating back to the parent state
// PATH_END is an empty-length token to signal the end of the path
[^] { popState(PATH); yypushback(yylength()); return PATH_END; }
"" { popState(PATH); return PATH_END; }
}

<YYINITIAL, BLOCK, ANTIQUOTATION> {
Expand Down Expand Up @@ -152,7 +166,7 @@ MCOMMENT=\/\*([^*]|\*[^\/])*\*\/
"->" { return IMPL; }

\" { pushState(STRING); return STRING_OPEN; }
\'\' { pushState(IND_STRING); return IND_STRING_OPEN; }
\'\' { pushState(IND_STRING_START); return IND_STRING_OPEN; }

// Note that `true`, `false` and `null` are built-in variables but not
// keywords. Therefore, they are not listed here.
Expand All @@ -171,5 +185,5 @@ MCOMMENT=\/\*([^*]|\*[^\/])*\*\/
{WHITE_SPACE} { return com.intellij.psi.TokenType.WHITE_SPACE; }
}

// matched by all %state states
// matched by inclusive states (%state), but not by exclusive states (%xstate)
[^] { return com.intellij.psi.TokenType.BAD_CHARACTER; }
76 changes: 74 additions & 2 deletions src/test/java/org/nixos/idea/util/NixStringUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import static org.junit.jupiter.api.Assertions.assertEquals;

@SuppressWarnings("UnnecessaryStringEscape")
final class NixStringUtilTest {
@ParameterizedTest(name = "[{index}] {0} -> {1}")
@CsvSource(textBlock = """
Expand Down Expand Up @@ -54,6 +55,51 @@ void escape(String unescaped, String expectedResult) {
assertEquals(expectedResult, stringBuilder.toString());
}

@ParameterizedTest(name = "[{index}] {0} -> {1}")
@CsvSource(quoteCharacter = '|', textBlock = """
# Non-indented strings always have an indentation of 0
|""| , 0
|" a"| , 0
|" a\n b"| , 0
# When there are only spaces, we return Integer.MAX_VALUE
|''''| , 2147483647
|'' ''| , 2147483647
|''\n \n ''| , 2147483647
# The smallest indentation counts
|''\n a\n b''| , 1
|''\n a\n b''| , 1
|''\n a\n b''| , 2
|''\n a\n ${b}''| , 1
|''\n a\n ''\\b''| , 1
# First line counts
|''a\n b''| , 0
|''${a}\n b''| , 0
|''''\\a\n b''| , 0
# But only the first token in a line counts
|'' a${b}''| , 2
|'' a''\\b''| , 2
|'' ${a}b''| , 2
|'' ${a}${b}''| , 2
|'' ${a}''\\b''| , 2
|'' ''\\ab''| , 2
|'' ''\\a${b}''| , 2
|'' ''\\a''\\b''| , 2
# Tab and CR are treated as normal characters, not as spaces
# See NixOS/nix#2911 and NixOS/nix#3759
|''\t''| , 0
|''\n \t''| , 2
|''\r\n''| , 0
|''\n \r\n''| , 2
# Indentation within interpolations is ignored
|'' ${\n"a"}''| , 2
|'' ${\n''a''}''| , 2
""")
@WithIdeaPlatform.OnEdt
void detectMaxIndent(String code, int expectedResult, Project project) {
NixString string = NixElementFactory.createString(project, code);
assertEquals(expectedResult, NixStringUtil.detectMaxIndent(string));
}

@ParameterizedTest(name = "[{index}] {0} -> {1}")
@CsvSource(quoteCharacter = '|', textBlock = """
"" , ||
Expand All @@ -62,7 +108,9 @@ void escape(String unescaped, String expectedResult) {
"\\"" , "
"\\\\" , \\
"\\\\x" , \\x
''"'' , "
''\\"'' , \\"
''\\x'' , \\x
''\\\\'' , \\\\
''\\\\x'' , \\\\x
''''\\"'' , "
Expand All @@ -78,12 +126,36 @@ void escape(String unescaped, String expectedResult) {
|"\n"| , |\n|
|"\r"| , |\r|
|"\t"| , |\t|
|"\\n"| , |\n|
|"\\r"| , |\r|
|"\\t"| , |\t|
|''_\n''| , |_\n|
|''\r''| , |\r|
|''\t''| , |\t|
|''''\\n''| , |\n|
|''''\\r''| , |\r|
|''''\\t''| , |\t|
# supplementary character, i.e. a character from a supplementary plane,
# which needs a surrogate pair to be represented in UTF-16
"\uD83C\uDF09" , \uD83C\uDF09
''\uD83C\uDF09'', \uD83C\uDF09
# TODO implement indentation (the one below fails)
# '' a '' , |a |
# Remove common indentation in indented strings
|'' a ''| , |a |
|'' a ''| , |a |
|'' a\n b\n''| , |a\nb\n|
|'' a\n b\n''| , |a\n b\n|
# But don't remove indentation when there is one line without it
|'' a\nb\n c''| , | a\nb\n c|
|''a\n b\n c''| , |a\n b\n c|
|'' a\n\tb''|, | a\n\tb|
|''\ta\n b''|, |\ta\n b|
# Even when the line is blank
|'' a\n ''| , |a\n |
# Ignore indentation of empty lines
|'' a\n\n b\n''|, |a\n\nb\n|
# Remove initial line break in indented strings
|''\n a''| , |a|
|'' \n a''| , |a|
""")
@WithIdeaPlatform.OnEdt
void parse(String code, String expectedResult, Project project) {
Expand Down
11 changes: 10 additions & 1 deletion src/test/testData/ParsingTest/StringWithMultipleLines.lexer.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,16 @@ STR ('\n first\n second\n third\n')
STRING_CLOSE ('"')
WHITE_SPACE ('\n')
IND_STRING_OPEN ('''')
IND_STR ('\n first\n second\n third\n')
WHITE_SPACE ('\n')
IND_STR_INDENT (' ')
IND_STR ('first')
IND_STR_LF ('\n')
IND_STR_INDENT (' ')
IND_STR ('second')
IND_STR_LF ('\n')
IND_STR_INDENT (' ')
IND_STR ('third')
IND_STR_LF ('\n')
IND_STRING_CLOSE ('''')
WHITE_SPACE ('\n')
] (']')
13 changes: 11 additions & 2 deletions src/test/testData/ParsingTest/StringWithMultipleLines.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,17 @@ Nix File(0,63)
PsiWhiteSpace('\n')(30,31)
NixIndStringImpl(IND_STRING)(31,61)
PsiElement(IND_STRING_OPEN)('''')(31,33)
NixStringTextImpl(STRING_TEXT)(33,59)
PsiElement(IND_STR)('\n first\n second\n third\n')(33,59)
PsiWhiteSpace('\n')(33,34)
NixStringTextImpl(STRING_TEXT)(34,59)
PsiElement(IND_STR_INDENT)(' ')(34,36)
PsiElement(IND_STR)('first')(36,41)
PsiElement(IND_STR_LF)('\n')(41,42)
PsiElement(IND_STR_INDENT)(' ')(42,44)
PsiElement(IND_STR)('second')(44,50)
PsiElement(IND_STR_LF)('\n')(50,51)
PsiElement(IND_STR_INDENT)(' ')(51,53)
PsiElement(IND_STR)('third')(53,58)
PsiElement(IND_STR_LF)('\n')(58,59)
PsiElement(IND_STRING_CLOSE)('''')(59,61)
PsiWhiteSpace('\n')(61,62)
PsiElement(])(']')(62,63)

0 comments on commit f47a403

Please sign in to comment.