From a10ea21cc2c744b4e533c75df0d606831341bc91 Mon Sep 17 00:00:00 2001 From: howsohazard <143410553+howsohazard@users.noreply.github.com> Date: Mon, 3 Jun 2024 11:04:23 -0400 Subject: [PATCH] 20457: Fixes bug where utf-8 spaces and certain language characters could cause labels not to be stored and loaded properly (#145) --- src/Amalgam/Parser.cpp | 22 +++++++++++++++------- src/Amalgam/Parser.h | 22 +++++++++++++++------- src/Amalgam/string/StringManipulation.h | 2 +- 3 files changed, 31 insertions(+), 15 deletions(-) diff --git a/src/Amalgam/Parser.cpp b/src/Amalgam/Parser.cpp index a90cb0d4..83340d5b 100644 --- a/src/Amalgam/Parser.cpp +++ b/src/Amalgam/Parser.cpp @@ -388,10 +388,19 @@ void Parser::SkipToEndOfIdentifier(bool allow_leading_label_marks) //eat all characters until one that indicates end of identifier while(pos < code->size()) { + if(StringManipulation::IsUtf8Whitespace(*code, pos)) + break; + auto cur_char = (*code)[pos]; - if(cur_char == '\t' || cur_char == '\n' || cur_char == '\v' || cur_char == '\f' - || cur_char == '\r' || cur_char == ' ' - || cur_char == '#' + + if(cur_char == '\\' && pos + 1 < code->size()) + { + pos += 2; + continue; + } + + //check language characters + if(cur_char == '#' || cur_char == '(' || cur_char == ')' || cur_char == ';') break; @@ -653,11 +662,10 @@ void Parser::AppendComments(EvaluableNode *n, size_t indentation_depth, bool pre //if the string contains a character that needs to be escaped for labels, then will convert std::string ConvertLabelToQuotedStringIfNecessary(const std::string &s) { - bool needs_escape = false; + if(s.empty()) + return s; - //check for any characters that need to be escaped - if(s.find_first_of(" \t\"\n\r") != std::string::npos) - needs_escape = true; + bool needs_escape = Parser::HasCharactersBeyondIdentifier(s, true); if(!needs_escape) { diff --git a/src/Amalgam/Parser.h b/src/Amalgam/Parser.h index 893178ab..7d08d8b7 100644 --- a/src/Amalgam/Parser.h +++ b/src/Amalgam/Parser.h @@ -38,19 
+38,27 @@ class Parser } //returns true if the string needs to be backslashified, has spaces, or has special characters - inline static bool HasCharactersBeyondIdentifier(const std::string &s) + inline static bool HasCharactersBeyondIdentifier(const std::string &s, bool label = false) { - for(auto c : s) + bool in_label_initial_hashes = label; + for(size_t i = 0; i < s.size(); i++) { - switch(c) + //can ignore any #'s up front + if(in_label_initial_hashes) + { + if(s[i] == '#') + continue; + in_label_initial_hashes = false; + } + + if(StringManipulation::IsUtf8Whitespace(s, i)) + return true; + + switch(s[i]) { case '\0': case '\\': case '"': - case '\t': - case '\n': - case '\r': - case ' ': case '(': case ')': case '.': diff --git a/src/Amalgam/string/StringManipulation.h b/src/Amalgam/string/StringManipulation.h index 0332cd8e..6ccfbd46 100644 --- a/src/Amalgam/string/StringManipulation.h +++ b/src/Amalgam/string/StringManipulation.h @@ -26,7 +26,7 @@ namespace StringManipulation //returns the number of bytes wide the character in position of string s is if it is whitespace, // 0 if it is not a newline - inline size_t IsUtf8Whitespace(std::string &s, size_t position) + inline size_t IsUtf8Whitespace(const std::string &s, size_t position) { auto cur_char = s[position]; if(cur_char == '\t' || cur_char == '\n' || cur_char == '\v' || cur_char == '\f'