From d08ad1dad5dc3ec4f96c4e94cbd284b749d54a17 Mon Sep 17 00:00:00 2001 From: Florian Schanda Date: Mon, 23 Oct 2023 10:28:33 +0200 Subject: [PATCH] #43 Improve performance Replace the char classification functions with more efficient, but equivalent, implementations. This reduces token() runtime from 18.2s to 15.1s, which is a 17% improvement. --- CHANGELOG.md | 2 ++ tests-unit/test_lexer_base.py | 62 +++++++++++++++++++++++++++++++++++ trlc/lexer.py | 12 ++----- 3 files changed, 67 insertions(+), 9 deletions(-) create mode 100644 tests-unit/test_lexer_base.py diff --git a/CHANGELOG.md b/CHANGELOG.md index aef2ff9d..37dad867 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,8 @@ generated in the following situations: ### 1.2.3-dev +* [TRLC] Various performance improvements when parsing large files. + * [TRLC] Add `--version` flag that can be used to figure out the installed TRLC version. diff --git a/tests-unit/test_lexer_base.py b/tests-unit/test_lexer_base.py new file mode 100644 index 00000000..0b1cc94c --- /dev/null +++ b/tests-unit/test_lexer_base.py @@ -0,0 +1,62 @@ +import unittest +import re + +from trlc.errors import Message_Handler +from trlc.lexer import Lexer_Base + + +class Potato(Lexer_Base): + def file_location(self): + pass + + def token(self): + pass + + +class Test_Lexer_Base(unittest.TestCase): + def setUp(self): + self.lexer = Potato(mh = Message_Handler(), + content = "") + self.test_range = 0xffff + + def tearDown(self): + pass + + @staticmethod + def reference_is_alpha(char): + assert isinstance(char, str) and len(char) == 1 + return ord('a') <= ord(char) <= ord('z') or \ + ord('A') <= ord(char) <= ord('Z') + + @staticmethod + def reference_is_numeric(char): + assert isinstance(char, str) and len(char) == 1 + return ord('0') <= ord(char) <= ord('9') + + @staticmethod + def reference_is_alnum(char): + assert isinstance(char, str) and len(char) == 1 + return ord('a') <= ord(char) <= ord('z') or \ + ord('A') <= ord(char) <= ord('Z') or \ 
ord('0') <= ord(char) <= ord('9') + + def testIsAlpha(self): + for i in range(self.test_range): + c = chr(i) + self.assertEqual(self.reference_is_alpha(c), + self.lexer.is_alpha(c), + "mismatch for codepoint %u (%s)" % (i, repr(c))) + + def testIsDigit(self): + for i in range(self.test_range): + c = chr(i) + self.assertEqual(self.reference_is_numeric(c), + self.lexer.is_numeric(c), + "mismatch for codepoint %u (%s)" % (i, repr(c))) + + def testIsAlnum(self): + for i in range(self.test_range): + c = chr(i) + self.assertEqual(self.reference_is_alnum(c), + self.lexer.is_alnum(c), + "mismatch for codepoint %u (%s)" % (i, repr(c))) diff --git a/trlc/lexer.py b/trlc/lexer.py index 23f2e495..55efda21 100644 --- a/trlc/lexer.py +++ b/trlc/lexer.py @@ -201,25 +201,19 @@ def __init__(self, mh, content): def is_alpha(char): # lobster-trace: LRM.Identifier # lobster-trace: LRM.Builtin_Identifier - assert isinstance(char, str) and len(char) == 1 - return ord('a') <= ord(char) <= ord('z') or \ - ord('A') <= ord(char) <= ord('Z') + return char.isascii() and char.isalpha() @staticmethod def is_numeric(char): # lobster-trace: LRM.Integers # lobster-trace: LRM.Decimals - assert isinstance(char, str) and len(char) == 1 - return ord('0') <= ord(char) <= ord('9') + return char.isascii() and char.isdigit() @staticmethod def is_alnum(char): # lobster-trace: LRM.Identifier # lobster-trace: LRM.Builtin_Identifier - assert isinstance(char, str) and len(char) == 1 - return ord('a') <= ord(char) <= ord('z') or \ - ord('A') <= ord(char) <= ord('Z') or \ - ord('0') <= ord(char) <= ord('9') + return char.isascii() and char.isalnum() @abstractmethod def file_location(self):