From d08ad1dad5dc3ec4f96c4e94cbd284b749d54a17 Mon Sep 17 00:00:00 2001
From: Florian Schanda <florian.schanda@bmw.de>
Date: Mon, 23 Oct 2023 10:28:33 +0200
Subject: [PATCH] #43 Improve performance

Replace the char classification functions with more efficient
implementations that are equivalent for single-character input. The
argument-checking assertions are dropped as part of this change.

This reduces token() runtime from 18.2s to 15.1s, which is a 17%
improvement.
---
 CHANGELOG.md                  |  2 ++
 tests-unit/test_lexer_base.py | 62 +++++++++++++++++++++++++++++++++++
 trlc/lexer.py                 | 12 ++-----
 3 files changed, 67 insertions(+), 9 deletions(-)
 create mode 100644 tests-unit/test_lexer_base.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index aef2ff9d..37dad867 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,6 +27,8 @@ generated in the following situations:
 
 ### 1.2.3-dev
 
+* [TRLC] Various performance improvements when parsing large files.
+
 * [TRLC] Add `--version` flag that can be used to figure out the
   installed TRLC version.
 
diff --git a/tests-unit/test_lexer_base.py b/tests-unit/test_lexer_base.py
new file mode 100644
index 00000000..0b1cc94c
--- /dev/null
+++ b/tests-unit/test_lexer_base.py
@@ -0,0 +1,62 @@
+import unittest
+import re
+
+from trlc.errors import Message_Handler
+from trlc.lexer import Lexer_Base
+
+
+class Potato(Lexer_Base):
+    def file_location(self):
+        pass
+
+    def token(self):
+        pass
+
+
+class Test_Lexer_Base(unittest.TestCase):
+    def setUp(self):
+        self.lexer = Potato(mh      = Message_Handler(),
+                            content = "")
+        self.test_range = 0xffff
+
+    def tearDown(self):
+        pass
+
+    @staticmethod
+    def reference_is_alpha(char):
+        assert isinstance(char, str) and len(char) == 1
+        return ord('a') <= ord(char) <= ord('z') or \
+            ord('A') <= ord(char) <= ord('Z')
+
+    @staticmethod
+    def reference_is_numeric(char):
+        assert isinstance(char, str) and len(char) == 1
+        return ord('0') <= ord(char) <= ord('9')
+
+    @staticmethod
+    def reference_is_alnum(char):
+        assert isinstance(char, str) and len(char) == 1
+        return ord('a') <= ord(char) <= ord('z') or \
+            ord('A') <= ord(char) <= ord('Z') or \
+            ord('0') <= ord(char) <= ord('9')
+
+    def testIsAlpha(self):
+        for i in range(self.test_range):
+            c = chr(i)
+            self.assertEqual(self.reference_is_alpha(c),
+                             self.lexer.is_alpha(c),
+                             "mismatch for codepoint %u (%s)" % (i, repr(c)))
+
+    def testIsDigit(self):
+        for i in range(self.test_range):
+            c = chr(i)
+            self.assertEqual(self.reference_is_numeric(c),
+                             self.lexer.is_numeric(c),
+                             "mismatch for codepoint %u (%s)" % (i, repr(c)))
+
+    def testIsAlnum(self):
+        for i in range(self.test_range):
+            c = chr(i)
+            self.assertEqual(self.reference_is_alnum(c),
+                             self.lexer.is_alnum(c),
+                             "mismatch for codepoint %u (%s)" % (i, repr(c)))
diff --git a/trlc/lexer.py b/trlc/lexer.py
index 23f2e495..55efda21 100644
--- a/trlc/lexer.py
+++ b/trlc/lexer.py
@@ -201,25 +201,19 @@ def __init__(self, mh, content):
     def is_alpha(char):
         # lobster-trace: LRM.Identifier
         # lobster-trace: LRM.Builtin_Identifier
-        assert isinstance(char, str) and len(char) == 1
-        return ord('a') <= ord(char) <= ord('z') or \
-            ord('A') <= ord(char) <= ord('Z')
+        return char.isascii() and char.isalpha()
 
     @staticmethod
     def is_numeric(char):
         # lobster-trace: LRM.Integers
         # lobster-trace: LRM.Decimals
-        assert isinstance(char, str) and len(char) == 1
-        return ord('0') <= ord(char) <= ord('9')
+        return char.isascii() and char.isdigit()
 
     @staticmethod
     def is_alnum(char):
         # lobster-trace: LRM.Identifier
         # lobster-trace: LRM.Builtin_Identifier
-        assert isinstance(char, str) and len(char) == 1
-        return ord('a') <= ord(char) <= ord('z') or \
-            ord('A') <= ord(char) <= ord('Z') or \
-            ord('0') <= ord(char) <= ord('9')
+        return char.isascii() and char.isalnum()
 
     @abstractmethod
     def file_location(self):