Use lexer to help detect indentation of strings

NixOS · Jul 15, 2024 · f47a403 · f47a403
1 parent 7d2f79b
commit f47a403
Show file tree

Hide file tree

Showing 7 changed files with 197 additions and 35 deletions.
diff --git a/src/main/java/org/nixos/idea/lang/highlighter/NixSyntaxHighlighter.java b/src/main/java/org/nixos/idea/lang/highlighter/NixSyntaxHighlighter.java
@@ -71,13 +71,15 @@ public class NixSyntaxHighlighter extends SyntaxHighlighterBase {
             entry(NixTypes.URI, NixTextAttributes.URI),
             // String literals
             entry(NixTypes.STR, NixTextAttributes.STRING),
+            entry(NixTypes.STR_ESCAPE, NixTextAttributes.STRING_ESCAPE),
             entry(NixTypes.STRING_CLOSE, NixTextAttributes.STRING),
             entry(NixTypes.STRING_OPEN, NixTextAttributes.STRING),
             entry(NixTypes.IND_STR, NixTextAttributes.STRING),
+            entry(NixTypes.IND_STR_LF, NixTextAttributes.STRING),
+            entry(NixTypes.IND_STR_INDENT, NixTextAttributes.STRING),
+            entry(NixTypes.IND_STR_ESCAPE, NixTextAttributes.STRING_ESCAPE),
             entry(NixTypes.IND_STRING_CLOSE, NixTextAttributes.STRING),
             entry(NixTypes.IND_STRING_OPEN, NixTextAttributes.STRING),
-            entry(NixTypes.STR_ESCAPE, NixTextAttributes.STRING_ESCAPE),
-            entry(NixTypes.IND_STR_ESCAPE, NixTextAttributes.STRING_ESCAPE),
             // Other
             entry(NixTypes.SCOMMENT, NixTextAttributes.LINE_COMMENT),
             entry(NixTypes.MCOMMENT, NixTextAttributes.BLOCK_COMMENT),

diff --git a/src/main/java/org/nixos/idea/util/NixStringUtil.java b/src/main/java/org/nixos/idea/util/NixStringUtil.java
@@ -4,12 +4,16 @@
 import com.intellij.psi.tree.IElementType;
 import org.jetbrains.annotations.Contract;
 import org.jetbrains.annotations.NotNull;
+import org.nixos.idea.psi.NixAntiquotation;
+import org.nixos.idea.psi.NixIndString;
+import org.nixos.idea.psi.NixStdString;
+import org.nixos.idea.psi.NixString;
 import org.nixos.idea.psi.NixStringPart;
 import org.nixos.idea.psi.NixStringText;
 import org.nixos.idea.psi.NixTypes;
 
 /**
- * Utilities for strings in the Nix Expression Language.
+ * Utilities for encoding and decoding strings in the Nix Expression Language.
  */
 public final class NixStringUtil {
 
@@ -87,6 +91,43 @@ public static void escape(@NotNull StringBuilder builder, @NotNull CharSequence
         }
     }
 
+    /**
+     * Detects the maximal amount of characters removed from the start of the lines.
+     * May return {@link Integer#MAX_VALUE} if the content of the string is blank.
+     * <p>
+     * For a {@link NixStdString} the result is always {@code 0}.
+     * For a {@link NixIndString} the result is the smallest indentation among all lines
+     * which contain something besides indentation (text, an escape sequence, or an antiquotation).
+     * A line with such content but without an {@code IND_STR_INDENT} token counts as indentation {@code 0}.
+     * Lines which consist only of indentation do not influence the result.
+     *
+     * @param string the string from which to get the indentation
+     * @return the detected indentation, or {@link Integer#MAX_VALUE}
+     * @throws IllegalStateException if {@code string} is neither a {@link NixStdString} nor a {@link NixIndString}
+     */
+    public static int detectMaxIndent(@NotNull NixString string) {
+        if (string instanceof NixStdString) {
+            return 0;
+        } else if (string instanceof NixIndString) {
+            // Smallest indentation confirmed so far; stays MAX_VALUE until a line with content is seen.
+            int result = Integer.MAX_VALUE;
+            // Candidate for the current line; only committed to result once the line has content.
+            // Starts at 0 because content may follow the opening quotes without any indentation.
+            int preliminary = 0;
+            for (NixStringPart part : string.getStringParts()) {
+                if (part instanceof NixStringText textNode) {
+                    for (ASTNode token = textNode.getNode().getFirstChildNode(); token != null; token = token.getTreeNext()) {
+                        IElementType type = token.getElementType();
+                        if (type == NixTypes.IND_STR_INDENT) {
+                            // Taking the minimum with result ensures that committing below never increases result.
+                            preliminary = Math.min(result, token.getTextLength());
+                        } else if (type == NixTypes.IND_STR_LF) {
+                            // A new line starts; unless an indent token follows, its indentation is 0.
+                            preliminary = 0;
+                        } else {
+                            // Text or escape sequence: the current line has content, so its indentation counts.
+                            assert type == NixTypes.IND_STR || type == NixTypes.IND_STR_ESCAPE : type;
+                            result = preliminary;
+                        }
+                    }
+                } else {
+                    // An antiquotation counts as content of the current line as well.
+                    assert part instanceof NixAntiquotation : part.getClass();
+                    result = preliminary;
+                }
+            }
+            return result;
+        } else {
+            throw new IllegalStateException("Unexpected subclass of NixString: " + string.getClass());
+        }
+    }
+
     /**
      * Returns the content of the given part of a string in the Nix Expression Language.
      * All escape sequences are resolved.
@@ -95,31 +136,46 @@ public static void escape(@NotNull StringBuilder builder, @NotNull CharSequence
      * @return The resulting string after resolving all escape sequences.
      */
     public static @NotNull String parse(@NotNull NixStringText textNode) {
+        // The indentation to strip is computed over all parts of the enclosing string, not only this text node.
+        // NOTE(review): assumes the parent of a string text is always a NixString; verify against the grammar.
+        int maxIndent = detectMaxIndent((NixString) textNode.getParent());
         StringBuilder builder = new StringBuilder();
         for (ASTNode child = textNode.getNode().getFirstChildNode(); child != null; child = child.getTreeNext()) {
-            parse(builder, child);
+            parse(builder, child, maxIndent);
         }
         return builder.toString();
     }
 
-    private static void parse(@NotNull StringBuilder builder, @NotNull ASTNode token) {
+    private static void parse(@NotNull StringBuilder builder, @NotNull ASTNode token, int maxIndent) {
         CharSequence text = token.getChars();
         IElementType type = token.getElementType();
-        if (type == NixTypes.STR || type == NixTypes.IND_STR) {
+        if (type == NixTypes.STR || type == NixTypes.IND_STR || type == NixTypes.IND_STR_LF) {
             builder.append(text);
+        } else if (type == NixTypes.IND_STR_INDENT) {
+            int end = text.length();
+            if (end > maxIndent) {
+                CharSequence remain = text.subSequence(maxIndent, end);
+                builder.append(remain);
+            }
         } else if (type == NixTypes.STR_ESCAPE) {
             assert text.length() == 2 && text.charAt(0) == '\\' : text;
             char c = text.charAt(1);
             builder.append(unescape(c));
         } else if (type == NixTypes.IND_STR_ESCAPE) {
-            assert text.length() == 3 && ("''$".contentEquals(text) || "'''".contentEquals(text)) ||
-                    text.length() == 4 && "''\\".contentEquals(text.subSequence(0, 3)) : text;
-            if ("'''".contentEquals(text)){
-                builder.append("''");
-                return;
+            switch (text.charAt(2)) {
+                case '$' -> {
+                    assert "''$".contentEquals(text) : text;
+                    builder.append("$");
+                }
+                case '\'' -> {
+                    assert "'''".contentEquals(text) : text;
+                    builder.append("''");
+                }
+                case '\\' -> {
+                    assert text.length() == 4 && "''\\".contentEquals(text.subSequence(0, 3)) : text;
+                    char c = text.charAt(3);
+                    builder.append(unescape(c));
+                }
+                default -> throw new IllegalStateException("Unknown escape sequence: " + text);
             }
-            char c = text.charAt(text.length() - 1);
-            builder.append(unescape(c));
         } else {
             throw new IllegalStateException("Unexpected token in string: " + token);
         }

diff --git a/src/main/lang/Nix.bnf b/src/main/lang/Nix.bnf
@@ -211,7 +211,7 @@ string_text ::= string_token+
 antiquotation ::= DOLLAR LCURLY expr recover_antiquotation RCURLY { pin=1 }
 private recover_antiquotation ::= { recoverWhile=curly_recover }
 private string_part_recover ::= !(DOLLAR | STRING_CLOSE | IND_STRING_CLOSE | string_token)
-private string_token ::= STR | IND_STR | STR_ESCAPE | IND_STR_ESCAPE
+private string_token ::= STR | STR_ESCAPE | IND_STR | IND_STR_INDENT | IND_STR_ESCAPE | IND_STR_LF
 
 ;{ extends("bind_attr|bind_inherit")=bind }
 bind ::= bind_attr | bind_inherit

diff --git a/src/main/lang/Nix.flex b/src/main/lang/Nix.flex
@@ -13,26 +13,27 @@ import static org.nixos.idea.psi.NixTypes.*;
   private final AbstractIntList states = new IntArrayList();
 
   private void pushState(int newState) {
-      if (newState == YYINITIAL){
-          throw new IllegalStateException("Pusing YYINITIAL is not supported");
-      }
+      // This check is an assertion, so it is only evaluated when assertions are enabled (-ea).
+      assert newState != YYINITIAL : "Pushing YYINITIAL is not supported";
       // store current state on the stack to allow restoring it in popState(...)
       states.push(yystate());
       yybegin(newState);
   }
 
   private void popState(int expectedState) {
-    if (states.isEmpty()){
-      throw new IllegalStateException("Popping an empty stack of states. Expected: " + expectedState);
-    }
+    // Both checks below are assertions, so they are only evaluated when assertions are enabled (-ea).
+    assert !states.isEmpty() : "Popping an empty stack of states. Expected: " + expectedState;
     // safe-guard, because we always know which state we're currently in in the rules below
-    if (yystate() != expectedState) {
-        throw new IllegalStateException(String.format("Unexpected state. Current: %d, expected: %d", yystate(), expectedState));
-    }
+    assert yystate() == expectedState : String.format("Unexpected state. Current: %d, expected: %d", yystate(), expectedState);
     // start the lexer with the previous state, which was stored by pushState(...)
     yybegin(states.popInt());
   }
 
+  private void replaceState(int expectedState, int newState) {
+      // Switches from expectedState to newState without touching the state stack, so a later
+      // popState(...) still restores the state which was stored by the matching pushState(...).
+      assert newState != YYINITIAL : "Replacing the current state with YYINITIAL is not supported";
+      // safe-guard, because we always know which state we're currently in in the rules below
+      assert yystate() == expectedState : String.format("Unexpected state. Current: %d, expected: %d", yystate(), expectedState);
+      yybegin(newState);
+  }
+
   protected void onReset() {
       states.clear();
   }
@@ -44,7 +45,9 @@ import static org.nixos.idea.psi.NixTypes.*;
 %function advance
 %type IElementType
 %unicode
-%state BLOCK STRING IND_STRING ANTIQUOTATION_START ANTIQUOTATION PATH
+%state BLOCK STRING IND_STRING ANTIQUOTATION_START ANTIQUOTATION
+%xstate IND_STRING_START IND_STRING_INDENT PATH
+%suppress empty-match
 
 ANY=[^]
 ID=[a-zA-Z_][a-zA-Z0-9_'-]*
@@ -71,8 +74,19 @@ MCOMMENT=\/\*([^*]|\*[^\/])*\*\/
   \"                    { popState(STRING); return STRING_CLOSE; }
 }
 
+<IND_STRING_START> {
+  // The first line is ignored if it is empty or consists only of spaces
+  [\ ]*\n               { replaceState(IND_STRING_START, IND_STRING_INDENT); return com.intellij.psi.TokenType.WHITE_SPACE; }
+}
+
+<IND_STRING_START, IND_STRING_INDENT> {
+  [\ ]+                 { replaceState(yystate(), IND_STRING); return IND_STR_INDENT; }
+  ""                    { replaceState(yystate(), IND_STRING); }
+}
+
 <IND_STRING> {
-  [^\$\']+              { return IND_STR; }
+  \n                    { replaceState(IND_STRING, IND_STRING_INDENT); return IND_STR_LF; }
+  [^\$\'\n]+            { return IND_STR; }
   "$"|"$$"|"'"          { return IND_STR; }
   "''$"|"'''"           { return IND_STR_ESCAPE; }
   "''"\\{ANY}           { return IND_STR_ESCAPE; }
@@ -83,7 +97,7 @@ MCOMMENT=\/\*([^*]|\*[^\/])*\*\/
 <ANTIQUOTATION_START> {
   // '$' and '{' must be two separate tokens to make NixBraceMatcher work
   // correctly with Grammar-Kit.
-  "{"                   { popState(ANTIQUOTATION_START); pushState(ANTIQUOTATION); return LCURLY; }
+  "{"                   { replaceState(ANTIQUOTATION_START, ANTIQUOTATION); return LCURLY; }
 }
 
 <ANTIQUOTATION> {
@@ -98,10 +112,10 @@ MCOMMENT=\/\*([^*]|\*[^\/])*\*\/
   "$"/"{"               { pushState(ANTIQUOTATION_START); return DOLLAR; }
   {PATH_SEG}            { return PATH_SEGMENT; }
   {PATH_CHAR}+          { return PATH_SEGMENT; }
-  // anything else, e.g. whitespace, stops lexing of a PATH
+  // anything else, e.g. a whitespace character, stops lexing of a PATH
   // we're delegating back to the parent state
   // PATH_END is an empty-length token to signal the end of the path
-  [^]                   { popState(PATH); yypushback(yylength()); return PATH_END; }
+  ""                    { popState(PATH); return PATH_END; }
 }
 
 <YYINITIAL, BLOCK, ANTIQUOTATION> {
@@ -152,7 +166,7 @@ MCOMMENT=\/\*([^*]|\*[^\/])*\*\/
   "->"                  { return IMPL; }
 
   \"                    { pushState(STRING); return STRING_OPEN; }
-  \'\'                  { pushState(IND_STRING); return IND_STRING_OPEN; }
+  \'\'                  { pushState(IND_STRING_START); return IND_STRING_OPEN; }
 
   // Note that `true`, `false` and `null` are built-in variables but not
   // keywords. Therefore, they are not listed here.
@@ -171,5 +185,5 @@ MCOMMENT=\/\*([^*]|\*[^\/])*\*\/
   {WHITE_SPACE}         { return com.intellij.psi.TokenType.WHITE_SPACE; }
 }
 
-// matched by all %state states
+// matched by inclusive states (%state), but not by exclusive states (%xstate)
 [^]                     { return com.intellij.psi.TokenType.BAD_CHARACTER; }
diff --git a/src/test/java/org/nixos/idea/util/NixStringUtilTest.java b/src/test/java/org/nixos/idea/util/NixStringUtilTest.java
@@ -13,6 +13,7 @@
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
+@SuppressWarnings("UnnecessaryStringEscape")
 final class NixStringUtilTest {
     @ParameterizedTest(name = "[{index}] {0} -> {1}")
     @CsvSource(textBlock = """
@@ -54,6 +55,51 @@ void escape(String unescaped, String expectedResult) {
         assertEquals(expectedResult, stringBuilder.toString());
     }
 
+    @ParameterizedTest(name = "[{index}] {0} -> {1}")
+    @CsvSource(quoteCharacter = '|', textBlock = """
+            # Non-indented strings always have an indentation of 0
+            |""|                , 0
+            |"    a"|           , 0
+            |"  a\n  b"|        , 0
+            # When there are only spaces, we return Integer.MAX_VALUE
+            |''''|              , 2147483647
+            |''    ''|          , 2147483647
+            |''\n  \n  ''|      , 2147483647
+            # The smallest indentation counts
+            |''\n  a\n b''|     , 1
+            |''\n a\n  b''|     , 1
+            |''\n  a\n  b''|    , 2
+            |''\n  a\n ${b}''|  , 1
+            |''\n  a\n ''\\b''| , 1
+            # First line counts
+            |''a\n  b''|        , 0
+            |''${a}\n  b''|     , 0
+            |''''\\a\n  b''|    , 0
+            # But only the first token in a line counts
+            |''  a${b}''|       , 2
+            |''  a''\\b''|      , 2
+            |''  ${a}b''|       , 2
+            |''  ${a}${b}''|    , 2
+            |''  ${a}''\\b''|   , 2
+            |''  ''\\ab''|      , 2
+            |''  ''\\a${b}''|   , 2
+            |''  ''\\a''\\b''|  , 2
+            # Tab and CR are treated as normal characters, not as spaces
+            # See NixOS/nix#2911 and NixOS/nix#3759
+            |''\t''|            , 0
+            |''\n  \t''|        , 2
+            |''\r\n''|          , 0
+            |''\n  \r\n''|      , 2
+            # Indentation within interpolations is ignored
+            |''  ${\n"a"}''|    , 2
+            |''  ${\n''a''}''|  , 2
+            """)
+    @WithIdeaPlatform.OnEdt
+    void detectMaxIndent(String code, int expectedResult, Project project) {
+        NixString string = NixElementFactory.createString(project, code);
+        assertEquals(expectedResult, NixStringUtil.detectMaxIndent(string));
+    }
+
     @ParameterizedTest(name = "[{index}] {0} -> {1}")
     @CsvSource(quoteCharacter = '|', textBlock = """
             ""              , ||
@@ -62,7 +108,9 @@ void escape(String unescaped, String expectedResult) {
             "\\""           , "
             "\\\\"          , \\
             "\\\\x"         , \\x
+            ''"''           , "
             ''\\"''         , \\"
+            ''\\x''         , \\x
             ''\\\\''        , \\\\
             ''\\\\x''       , \\\\x
             ''''\\"''       , "
@@ -78,12 +126,36 @@ void escape(String unescaped, String expectedResult) {
             |"\n"|          , |\n|
             |"\r"|          , |\r|
             |"\t"|          , |\t|
+            |"\\n"|         , |\n|
+            |"\\r"|         , |\r|
+            |"\\t"|         , |\t|
+            |''_\n''|       , |_\n|
+            |''\r''|        , |\r|
+            |''\t''|        , |\t|
+            |''''\\n''|     , |\n|
+            |''''\\r''|     , |\r|
+            |''''\\t''|     , |\t|
             # supplementary character, i.e. character form a supplementary plane,
             # which needs a surrogate pair to be represented in UTF-16
             "\uD83C\uDF09"  , \uD83C\uDF09
             ''\uD83C\uDF09'', \uD83C\uDF09
-            # TODO implement indentation (the one below fails)
-            # '' a ''          , |a |
+            # Remove common indentation in indented strings
+            |'' a ''|       , |a |
+            |''    a    ''| , |a    |
+            |'' a\n b\n''|  , |a\nb\n|
+            |'' a\n  b\n''| , |a\n b\n|
+            # But don't remove indentation when there is one line without it
+            |'' a\nb\n c''| , | a\nb\n c|
+            |''a\n b\n c''| , |a\n b\n c|
+            |''    a\n\tb''|, |    a\n\tb|
+            |''\ta\n    b''|, |\ta\n    b|
+            # Even when the line is blank
+            |'' a\n  ''|    , |a\n |
+            # Ignore indentation of empty lines
+            |'' a\n\n b\n''|, |a\n\nb\n|
+            # Remove initial line break in indented strings
+            |''\n    a''|   , |a|
+            |''  \n  a''|   , |a|
             """)
     @WithIdeaPlatform.OnEdt
     void parse(String code, String expectedResult, Project project) {

diff --git a/src/test/testData/ParsingTest/StringWithMultipleLines.lexer.txt b/src/test/testData/ParsingTest/StringWithMultipleLines.lexer.txt
@@ -5,7 +5,16 @@ STR ('\n  first\n  second\n  third\n')
 STRING_CLOSE ('"')
 WHITE_SPACE ('\n')
 IND_STRING_OPEN ('''')
-IND_STR ('\n  first\n  second\n  third\n')
+WHITE_SPACE ('\n')
+IND_STR_INDENT ('  ')
+IND_STR ('first')
+IND_STR_LF ('\n')
+IND_STR_INDENT ('  ')
+IND_STR ('second')
+IND_STR_LF ('\n')
+IND_STR_INDENT ('  ')
+IND_STR ('third')
+IND_STR_LF ('\n')
 IND_STRING_CLOSE ('''')
 WHITE_SPACE ('\n')
 ] (']')
diff --git a/src/test/testData/ParsingTest/StringWithMultipleLines.txt b/src/test/testData/ParsingTest/StringWithMultipleLines.txt
@@ -10,8 +10,17 @@ Nix File(0,63)
     PsiWhiteSpace('\n')(30,31)
     NixIndStringImpl(IND_STRING)(31,61)
       PsiElement(IND_STRING_OPEN)('''')(31,33)
-      NixStringTextImpl(STRING_TEXT)(33,59)
-        PsiElement(IND_STR)('\n  first\n  second\n  third\n')(33,59)
+      PsiWhiteSpace('\n')(33,34)
+      NixStringTextImpl(STRING_TEXT)(34,59)
+        PsiElement(IND_STR_INDENT)('  ')(34,36)
+        PsiElement(IND_STR)('first')(36,41)
+        PsiElement(IND_STR_LF)('\n')(41,42)
+        PsiElement(IND_STR_INDENT)('  ')(42,44)
+        PsiElement(IND_STR)('second')(44,50)
+        PsiElement(IND_STR_LF)('\n')(50,51)
+        PsiElement(IND_STR_INDENT)('  ')(51,53)
+        PsiElement(IND_STR)('third')(53,58)
+        PsiElement(IND_STR_LF)('\n')(58,59)
       PsiElement(IND_STRING_CLOSE)('''')(59,61)
     PsiWhiteSpace('\n')(61,62)
     PsiElement(])(']')(62,63)