luau-lang · JohnnyMorganz · Dec 16, 2024 · Dec 16, 2024 · Dec 16, 2024 · Dec 16, 2024
diff --git a/Ast/include/Luau/Lexer.h b/Ast/include/Luau/Lexer.h
@@ -53,6 +53,7 @@ struct Lexeme
 
         Comment,
         BlockComment,
+        Whitespace,
 
         Attribute,
 
@@ -100,7 +101,7 @@ struct Lexeme
 public:
     union
     {
-        const char* data;       // String, Number, Comment
+        const char* data;       // String, Number, Comment, Whitespace
         const char* name;       // Name
         unsigned int codepoint; // BrokenUnicode
     };
@@ -155,7 +156,7 @@ class Lexer
 public:
     Lexer(const char* buffer, std::size_t bufferSize, AstNameTable& names, Position startPosition = {0, 0});
 
-    void setSkipComments(bool skip);
+    void setSkipTrivia(bool skip);
     void setReadNames(bool read);
 
     const Location& previousLocation() const
@@ -164,7 +165,7 @@ class Lexer
     }
 
     const Lexeme& next();
-    const Lexeme& next(bool skipComments, bool updatePrevLocation);
+    const Lexeme& next(bool skipTrivia, bool updatePrevLocation);
     void nextline();
 
     Lexeme lookahead();
@@ -227,7 +228,7 @@ class Lexer
 
     AstNameTable& names;
 
-    bool skipComments;
+    bool skipTrivia;
     bool readNames;
 
     enum class BraceType

diff --git a/Ast/src/Lexer.cpp b/Ast/src/Lexer.cpp
@@ -9,6 +9,7 @@
 #include <limits.h>
 
 LUAU_FASTFLAGVARIABLE(LexerResumesFromPosition2)
+LUAU_FASTFLAGVARIABLE(LuauLexerTokenizesWhitespace)
 namespace Luau
 {
 
@@ -36,7 +37,7 @@ Lexeme::Lexeme(const Location& location, Type type, const char* data, size_t siz
 {
     LUAU_ASSERT(
         type == RawString || type == QuotedString || type == InterpStringBegin || type == InterpStringMid || type == InterpStringEnd ||
-        type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment
+        type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment || type == Whitespace
     );
 }
 
@@ -53,7 +54,7 @@ unsigned int Lexeme::getLength() const
 {
     LUAU_ASSERT(
         type == RawString || type == QuotedString || type == InterpStringBegin || type == InterpStringMid || type == InterpStringEnd ||
-        type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment
+        type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment || type == Whitespace
     );
 
     return length;
@@ -315,14 +316,14 @@ Lexer::Lexer(const char* buffer, size_t bufferSize, AstNameTable& names, Positio
           Lexeme::Eof
       )
     , names(names)
-    , skipComments(false)
+    , skipTrivia(false)
     , readNames(true)
 {
 }
 
-void Lexer::setSkipComments(bool skip)
+void Lexer::setSkipTrivia(bool skip)
 {
-    skipComments = skip;
+    skipTrivia = skip; // stored setting that the zero-argument next() passes along as its skipTrivia argument
 }
 
 void Lexer::setReadNames(bool read)
@@ -332,24 +333,27 @@ void Lexer::setReadNames(bool read)
 
 const Lexeme& Lexer::next()
 {
-    return next(this->skipComments, true);
+    return next(this->skipTrivia, true); // use the stored skipTrivia setting; true = updatePrevLocation
 }
 
-const Lexeme& Lexer::next(bool skipComments, bool updatePrevLocation)
+const Lexeme& Lexer::next(bool skipTrivia, bool updatePrevLocation)
 {
-    // in skipComments mode we reject valid comments
+    // in skipTrivia mode we reject valid comments (Comment, BlockComment) and Whitespace lexemes; any other lexeme type is returned
     do
     {
-        // consume whitespace before the token
-        while (isSpace(peekch()))
-            consumeAny();
+        if (!FFlag::LuauLexerTokenizesWhitespace)
+        {
+            // flag off: consume whitespace before the token here, so it is discarded rather than tokenized
+            while (isSpace(peekch()))
+                consumeAny();
+        }
 
         if (updatePrevLocation)
             prevLocation = lexeme.location;
 
         lexeme = readNext();
         updatePrevLocation = false;
-    } while (skipComments && (lexeme.type == Lexeme::Comment || lexeme.type == Lexeme::BlockComment));
+    } while (skipTrivia && (lexeme.type == Lexeme::Comment || lexeme.type == Lexeme::BlockComment || lexeme.type == Lexeme::Whitespace));
 
     return lexeme;
 }
@@ -967,6 +971,15 @@ Lexeme Lexer::readNext()
 
             return Lexeme(Location(start, position()), name.second, name.first.value);
         }
+        else if (FFlag::LuauLexerTokenizesWhitespace && isSpace(peekch()))
+        {
+            size_t startOffset = offset;
+
+            while (isSpace(peekch()))
+                consumeAny();
+
+            return Lexeme(Location(start, position()), Lexeme::Whitespace, &buffer[startOffset], offset - startOffset);
+        }
         else if (peekch() & 0x80)
         {
             return readUtf8Error();

diff --git a/Ast/src/Parser.cpp b/Ast/src/Parser.cpp
@@ -208,7 +208,7 @@ Parser::Parser(const char* buffer, size_t bufferSize, AstNameTable& names, Alloc
     matchRecoveryStopOnToken[Lexeme::Type::Eof] = 1;
 
     // required for lookahead() to work across a comment boundary and for nextLexeme() to work when captureComments is false
-    lexer.setSkipComments(true);
+    lexer.setSkipTrivia(true);
 
     // read first lexeme (any hot comments get .header = true)
     LUAU_ASSERT(hotcommentHeader);
@@ -3572,13 +3572,13 @@ AstTypeError* Parser::reportMissingTypeError(const Location& parseErrorLocation,
 
 void Parser::nextLexeme()
 {
-    Lexeme::Type type = lexer.next(/* skipComments= */ false, true).type;
+    Lexeme::Type type = lexer.next(/* skipTrivia= */ false, true).type;
 
-    while (type == Lexeme::BrokenComment || type == Lexeme::Comment || type == Lexeme::BlockComment)
+    while (type == Lexeme::BrokenComment || type == Lexeme::Comment || type == Lexeme::BlockComment || type == Lexeme::Whitespace)
     {
         const Lexeme& lexeme = lexer.current();
 
-        if (options.captureComments)
+        if (options.captureComments && type != Lexeme::Whitespace)
             commentLocations.push_back(Comment{lexeme.type, lexeme.location});
 
         // Subtlety: Broken comments are weird because we record them as comments AND pass them to the parser as a lexeme.
@@ -3598,7 +3598,7 @@ void Parser::nextLexeme()
             hotcomments.push_back({hotcommentHeader, lexeme.location, std::string(text + 1, text + end)});
         }
 
-        type = lexer.next(/* skipComments= */ false, /* updatePrevLocation= */ false).type;
+        type = lexer.next(/* skipTrivia= */ false, /* updatePrevLocation= */ false).type;
     }
 }
 

diff --git a/tests/Lexer.test.cpp b/tests/Lexer.test.cpp
@@ -8,6 +8,8 @@
 
 using namespace Luau;
 
+LUAU_FASTFLAG(LuauLexerTokenizesWhitespace)
+
 TEST_SUITE_BEGIN("LexerTests");
 
 TEST_CASE("broken_string_works")
@@ -38,7 +40,7 @@ TEST_CASE("broken_comment_kept")
     Luau::Allocator alloc;
     AstNameTable table(alloc);
     Lexer lexer(testInput.c_str(), testInput.size(), table);
-    lexer.setSkipComments(true);
+    lexer.setSkipTrivia(true);
     CHECK_EQ(lexer.next().type, Lexeme::Type::BrokenComment);
 }
 
@@ -48,7 +50,7 @@ TEST_CASE("comment_skipped")
     Luau::Allocator alloc;
     AstNameTable table(alloc);
     Lexer lexer(testInput.c_str(), testInput.size(), table);
-    lexer.setSkipComments(true);
+    lexer.setSkipTrivia(true);
     CHECK_EQ(lexer.next().type, Lexeme::Type::Eof);
 }
 
@@ -103,7 +105,7 @@ TEST_CASE("lookahead")
     Luau::Allocator alloc;
     AstNameTable table(alloc);
     Lexer lexer(testInput.c_str(), testInput.size(), table);
-    lexer.setSkipComments(true);
+    lexer.setSkipTrivia(true);
     lexer.next(); // must call next() before reading data from lexer at least once
 
     CHECK_EQ(lexer.current().type, Lexeme::Name);
@@ -242,4 +244,64 @@ TEST_CASE("string_interpolation_with_unicode_escape")
     CHECK_EQ(lexer.next().type, Lexeme::Eof);
 }
 
+TEST_CASE("lexer_tokenizes_whitespace")
+{
+    ScopedFastFlag sff{FFlag::LuauLexerTokenizesWhitespace, true}; // Whitespace lexemes are only produced while this flag is on
+
+    const std::string testInput = "local x = 1";
+    Luau::Allocator alloc;
+    AstNameTable table(alloc);
+    Lexer lexer(testInput.c_str(), testInput.size(), table); // skipTrivia is left at its constructor default (false), so next() returns whitespace
+
+    CHECK_EQ(lexer.next().type, Lexeme::ReservedLocal);
+    CHECK_EQ(lexer.next().type, Lexeme::Whitespace);
+    CHECK_EQ(lexer.next().type, Lexeme::Name);
+
+    auto space = lexer.next(); // copy: next() returns a reference to the lexer's current lexeme, which the following next() overwrites
+    CHECK_EQ(space.type, Lexeme::Whitespace);
+    CHECK_EQ(std::string(space.data, space.getLength()), std::string(" ")); // data points into the input buffer, so bound it with getLength()
+
+    CHECK_EQ(lexer.next().type, '=');
+
+    auto space2 = lexer.next(); // the single space between '=' and '1'
+    CHECK_EQ(space2.type, Lexeme::Whitespace);
+    CHECK_EQ(std::string(space2.data, space2.getLength()), std::string(" "));
+
+    CHECK_EQ(lexer.next().type, Lexeme::Number);
+    CHECK_EQ(lexer.next().type, Lexeme::Eof); // no trailing whitespace in the input, so Eof follows the number directly
+}
+
+TEST_CASE("lexer_tokenizes_multiline_whitespace")
+{
+    ScopedFastFlag sff{FFlag::LuauLexerTokenizesWhitespace, true}; // Whitespace lexemes are only produced while this flag is on
+
+    const std::string testInput = R"(local x
+
+    y = 2
+    )";
+    Luau::Allocator alloc;
+    AstNameTable table(alloc);
+    Lexer lexer(testInput.c_str(), testInput.size(), table); // skipTrivia is left at its constructor default (false), so next() returns whitespace
+
+    CHECK_EQ(lexer.next().type, Lexeme::ReservedLocal);
+    CHECK_EQ(lexer.next().type, Lexeme::Whitespace);
+    CHECK_EQ(lexer.next().type, Lexeme::Name);
+
+    auto multilineSpace = lexer.next(); // copy: the following next() call overwrites the lexer's current lexeme
+    CHECK_EQ(multilineSpace.type, Lexeme::Whitespace);
+    CHECK_EQ(std::string(multilineSpace.data, multilineSpace.getLength()), std::string("\n\n    ")); // two newlines plus the indent before 'y' form one lexeme
+
+    CHECK_EQ(lexer.next().type, Lexeme::Name);
+    CHECK_EQ(lexer.next().type, Lexeme::Whitespace);
+    CHECK_EQ(lexer.next().type, '=');
+    CHECK_EQ(lexer.next().type, Lexeme::Whitespace);
+    CHECK_EQ(lexer.next().type, Lexeme::Number);
+
+    auto multilineSpace2 = lexer.next(); // trailing whitespace between '2' and the end of the input
+    CHECK_EQ(multilineSpace2.type, Lexeme::Whitespace);
+    CHECK_EQ(std::string(multilineSpace2.data, multilineSpace2.getLength()), std::string("\n    ")); // newline plus the indent before the raw string's closing delimiter
+
+    CHECK_EQ(lexer.next().type, Lexeme::Eof);
+}
+
 TEST_SUITE_END();