Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tokenize whitespace #1570

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions Ast/include/Luau/Lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ struct Lexeme

Comment,
BlockComment,
Whitespace,

Attribute,

Expand Down Expand Up @@ -100,7 +101,7 @@ struct Lexeme
public:
union
{
const char* data; // String, Number, Comment
const char* data; // String, Number, Comment, Whitespace
const char* name; // Name
unsigned int codepoint; // BrokenUnicode
};
Expand Down Expand Up @@ -155,7 +156,7 @@ class Lexer
public:
Lexer(const char* buffer, std::size_t bufferSize, AstNameTable& names, Position startPosition = {0, 0});

void setSkipComments(bool skip);
void setSkipTrivia(bool skip);
void setReadNames(bool read);

const Location& previousLocation() const
Expand All @@ -164,7 +165,7 @@ class Lexer
}

const Lexeme& next();
const Lexeme& next(bool skipComments, bool updatePrevLocation);
const Lexeme& next(bool skipTrivia, bool updatePrevLocation);
void nextline();

Lexeme lookahead();
Expand Down Expand Up @@ -227,7 +228,7 @@ class Lexer

AstNameTable& names;

bool skipComments;
bool skipTrivia;
bool readNames;

enum class BraceType
Expand Down
37 changes: 25 additions & 12 deletions Ast/src/Lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <limits.h>

LUAU_FASTFLAGVARIABLE(LexerResumesFromPosition2)
LUAU_FASTFLAGVARIABLE(LuauLexerTokenizesWhitespace)
namespace Luau
{

Expand Down Expand Up @@ -36,7 +37,7 @@ Lexeme::Lexeme(const Location& location, Type type, const char* data, size_t siz
{
LUAU_ASSERT(
type == RawString || type == QuotedString || type == InterpStringBegin || type == InterpStringMid || type == InterpStringEnd ||
type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment
type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment || type == Whitespace
);
}

Expand All @@ -53,7 +54,7 @@ unsigned int Lexeme::getLength() const
{
LUAU_ASSERT(
type == RawString || type == QuotedString || type == InterpStringBegin || type == InterpStringMid || type == InterpStringEnd ||
type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment
type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment || type == Whitespace
);

return length;
Expand Down Expand Up @@ -315,14 +316,14 @@ Lexer::Lexer(const char* buffer, size_t bufferSize, AstNameTable& names, Positio
Lexeme::Eof
)
, names(names)
, skipComments(false)
, skipTrivia(false)
, readNames(true)
{
}

void Lexer::setSkipComments(bool skip)
void Lexer::setSkipTrivia(bool skip)
{
skipComments = skip;
skipTrivia = skip;
}

void Lexer::setReadNames(bool read)
Expand All @@ -332,24 +333,27 @@ void Lexer::setReadNames(bool read)

const Lexeme& Lexer::next()
{
return next(this->skipComments, true);
return next(this->skipTrivia, true);
}

const Lexeme& Lexer::next(bool skipComments, bool updatePrevLocation)
const Lexeme& Lexer::next(bool skipTrivia, bool updatePrevLocation)
{
// in skipComments mode we reject valid comments
// in skipTrivia mode we reject valid comments and whitespace
do
{
// consume whitespace before the token
while (isSpace(peekch()))
consumeAny();
if (!FFlag::LuauLexerTokenizesWhitespace)
{
// consume whitespace before the token
while (isSpace(peekch()))
consumeAny();
}

if (updatePrevLocation)
prevLocation = lexeme.location;

lexeme = readNext();
updatePrevLocation = false;
} while (skipComments && (lexeme.type == Lexeme::Comment || lexeme.type == Lexeme::BlockComment));
} while (skipTrivia && (lexeme.type == Lexeme::Comment || lexeme.type == Lexeme::BlockComment || lexeme.type == Lexeme::Whitespace));
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I decided to fold skipComments into a single skipTrivia toggle that covers both comments and whitespace.

However, I'm also open to splitting them into separate toggles (e.g. a dedicated skipWhitespace) if that makes more sense.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, given the test failures and the invasiveness of the change, I ended up splitting whitespace out into a separate skipWhitespace toggle.

But if the opposite (a single merged toggle) actually makes more sense, we can change it :)


return lexeme;
}
Expand Down Expand Up @@ -967,6 +971,15 @@ Lexeme Lexer::readNext()

return Lexeme(Location(start, position()), name.second, name.first.value);
}
else if (FFlag::LuauLexerTokenizesWhitespace && isSpace(peekch()))
{
size_t startOffset = offset;

while (isSpace(peekch()))
consumeAny();

return Lexeme(Location(start, position()), Lexeme::Whitespace, &buffer[startOffset], offset - startOffset);
}
else if (peekch() & 0x80)
{
return readUtf8Error();
Expand Down
10 changes: 5 additions & 5 deletions Ast/src/Parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ Parser::Parser(const char* buffer, size_t bufferSize, AstNameTable& names, Alloc
matchRecoveryStopOnToken[Lexeme::Type::Eof] = 1;

// required for lookahead() to work across a comment boundary and for nextLexeme() to work when captureComments is false
lexer.setSkipComments(true);
lexer.setSkipTrivia(true);

// read first lexeme (any hot comments get .header = true)
LUAU_ASSERT(hotcommentHeader);
Expand Down Expand Up @@ -3572,13 +3572,13 @@ AstTypeError* Parser::reportMissingTypeError(const Location& parseErrorLocation,

void Parser::nextLexeme()
{
Lexeme::Type type = lexer.next(/* skipComments= */ false, true).type;
Lexeme::Type type = lexer.next(/* skipTrivia= */ false, true).type;

while (type == Lexeme::BrokenComment || type == Lexeme::Comment || type == Lexeme::BlockComment)
while (type == Lexeme::BrokenComment || type == Lexeme::Comment || type == Lexeme::BlockComment || type == Lexeme::Whitespace)
{
const Lexeme& lexeme = lexer.current();

if (options.captureComments)
if (options.captureComments && type != Lexeme::Whitespace)
commentLocations.push_back(Comment{lexeme.type, lexeme.location});

// Subtlety: Broken comments are weird because we record them as comments AND pass them to the parser as a lexeme.
Expand All @@ -3598,7 +3598,7 @@ void Parser::nextLexeme()
hotcomments.push_back({hotcommentHeader, lexeme.location, std::string(text + 1, text + end)});
}

type = lexer.next(/* skipComments= */ false, /* updatePrevLocation= */ false).type;
type = lexer.next(/* skipTrivia= */ false, /* updatePrevLocation= */ false).type;
}
}

Expand Down
68 changes: 65 additions & 3 deletions tests/Lexer.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

using namespace Luau;

LUAU_FASTFLAG(LuauLexerTokenizesWhitespace)

TEST_SUITE_BEGIN("LexerTests");

TEST_CASE("broken_string_works")
Expand Down Expand Up @@ -38,7 +40,7 @@ TEST_CASE("broken_comment_kept")
Luau::Allocator alloc;
AstNameTable table(alloc);
Lexer lexer(testInput.c_str(), testInput.size(), table);
lexer.setSkipComments(true);
lexer.setSkipTrivia(true);
CHECK_EQ(lexer.next().type, Lexeme::Type::BrokenComment);
}

Expand All @@ -48,7 +50,7 @@ TEST_CASE("comment_skipped")
Luau::Allocator alloc;
AstNameTable table(alloc);
Lexer lexer(testInput.c_str(), testInput.size(), table);
lexer.setSkipComments(true);
lexer.setSkipTrivia(true);
CHECK_EQ(lexer.next().type, Lexeme::Type::Eof);
}

Expand Down Expand Up @@ -103,7 +105,7 @@ TEST_CASE("lookahead")
Luau::Allocator alloc;
AstNameTable table(alloc);
Lexer lexer(testInput.c_str(), testInput.size(), table);
lexer.setSkipComments(true);
lexer.setSkipTrivia(true);
lexer.next(); // must call next() before reading data from lexer at least once

CHECK_EQ(lexer.current().type, Lexeme::Name);
Expand Down Expand Up @@ -242,4 +244,64 @@ TEST_CASE("string_interpolation_with_unicode_escape")
CHECK_EQ(lexer.next().type, Lexeme::Eof);
}

// With LuauLexerTokenizesWhitespace enabled, every run of whitespace between
// tokens is surfaced as its own Lexeme::Whitespace carrying the exact source text.
TEST_CASE("lexer_tokenizes_whitespace")
{
    ScopedFastFlag sff{FFlag::LuauLexerTokenizesWhitespace, true};

    const std::string testInput = "local x = 1";
    Luau::Allocator allocator;
    AstNameTable nameTable(allocator);
    Lexer lexer(testInput.c_str(), testInput.size(), nameTable);

    // Copies the raw source text covered by a data-carrying lexeme.
    const auto textOf = [](const Lexeme& lx)
    {
        return std::string(lx.data, lx.getLength());
    };

    // `local`, the gap after it, then the identifier `x`
    CHECK_EQ(lexer.next().type, Lexeme::ReservedLocal);
    CHECK_EQ(lexer.next().type, Lexeme::Whitespace);
    CHECK_EQ(lexer.next().type, Lexeme::Name);

    // next() returns a reference to the lexer's current lexeme, so take a copy
    // before inspecting several fields.
    const Lexeme gapBeforeEquals = lexer.next();
    CHECK_EQ(gapBeforeEquals.type, Lexeme::Whitespace);
    CHECK_EQ(textOf(gapBeforeEquals), std::string(" "));

    CHECK_EQ(lexer.next().type, '=');

    const Lexeme gapAfterEquals = lexer.next();
    CHECK_EQ(gapAfterEquals.type, Lexeme::Whitespace);
    CHECK_EQ(textOf(gapAfterEquals), std::string(" "));

    CHECK_EQ(lexer.next().type, Lexeme::Number);
    CHECK_EQ(lexer.next().type, Lexeme::Eof);
}

// With LuauLexerTokenizesWhitespace enabled, a whitespace run that spans several
// lines (newlines plus the following indentation) is emitted as a single
// Lexeme::Whitespace whose text is the exact run from the source buffer.
TEST_CASE("lexer_tokenizes_multiline_whitespace")
{
    ScopedFastFlag sff{FFlag::LuauLexerTokenizesWhitespace, true};

    // The whitespace in this input is what the test asserts on, so it is spelled
    // out with explicit escapes instead of a raw string literal: indentation
    // inside a raw string is invisible in review and is easily altered by
    // formatters or editors, which would silently desynchronise the input from
    // the expected lexeme text checked below.
    const std::string testInput = "local x\n\n y = 2\n ";
    Luau::Allocator alloc;
    AstNameTable table(alloc);
    Lexer lexer(testInput.c_str(), testInput.size(), table);

    CHECK_EQ(lexer.next().type, Lexeme::ReservedLocal);
    CHECK_EQ(lexer.next().type, Lexeme::Whitespace);
    CHECK_EQ(lexer.next().type, Lexeme::Name);

    // Two newlines followed by the indentation of the next statement form one lexeme.
    auto multilineSpace = lexer.next();
    CHECK_EQ(multilineSpace.type, Lexeme::Whitespace);
    CHECK_EQ(std::string(multilineSpace.data, multilineSpace.getLength()), std::string("\n\n "));

    CHECK_EQ(lexer.next().type, Lexeme::Name);
    CHECK_EQ(lexer.next().type, Lexeme::Whitespace);
    CHECK_EQ(lexer.next().type, '=');
    CHECK_EQ(lexer.next().type, Lexeme::Whitespace);
    CHECK_EQ(lexer.next().type, Lexeme::Number);

    // Trailing whitespace at the end of the buffer is still reported before Eof.
    auto multilineSpace2 = lexer.next();
    CHECK_EQ(multilineSpace2.type, Lexeme::Whitespace);
    CHECK_EQ(std::string(multilineSpace2.data, multilineSpace2.getLength()), std::string("\n "));

    CHECK_EQ(lexer.next().type, Lexeme::Eof);
}

TEST_SUITE_END();
Loading