Skip to content

Commit

Permalink
Merge pull request #506 from kamil-tekiela/UtfString
Browse files Browse the repository at this point in the history
Optimize offsetGet
  • Loading branch information
MauricioFauth authored Sep 16, 2023
2 parents 6fd2c59 + be2ca97 commit d57481d
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 228 deletions.
1 change: 0 additions & 1 deletion src/Tools/CustomJsonSerializer.php
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ class CustomJsonSerializer extends JsonSerializer
'viewOptions',
'eventOptions',
'userOptions',
'asciiMap',
];

/**
Expand Down
186 changes: 15 additions & 171 deletions src/UtfString.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@

use function mb_check_encoding;
use function mb_strlen;
use function mb_substr;
use function ord;
use function strlen;
use function substr;

/**
* Implementation for UTF-8 strings.
Expand Down Expand Up @@ -68,119 +71,6 @@ class UtfString implements ArrayAccess, Stringable
*/
public $charLen = 0;

/**
* A map of single-byte (ASCII) characters to their ASCII code.
*
* getCharLength() looks a byte up here instead of calling ord($byte) each
* time, which is intended as a performance shortcut. Bytes that are not
* listed (the remaining control characters and every byte >= 0x80) are
* added by getCharLength() on first use, so the map grows at runtime.
*
* Note: PHP casts the numeric-string keys '0'..'9' to integer keys, hence
* the int|string key type below.
*
* Source: https://www.freecodecamp.org/news/ascii-table-hex-to-ascii-value-character-code-chart-2/
*
* @var array<int|string,int>
*/
protected static $asciiMap = [
"\0" => 0, // (00000000) NUL Null
"\t" => 9, // (00001001) HT Horizontal Tab
"\n" => 10, // (00001010) LF Newline / Line Feed
"\v" => 11, // (00001011) VT Vertical Tab
"\f" => 12, // (00001100) FF Form Feed
"\r" => 13, // (00001101) CR Carriage Return
' ' => 32, // (00100000) SP Space
'!' => 33, // (00100001) ! Exclamation mark
'"' => 34, // (00100010) " Double quote
'#' => 35, // (00100011) # Number
'$' => 36, // (00100100) $ Dollar
'%' => 37, // (00100101) % Percent
'&' => 38, // (00100110) & Ampersand
'\'' => 39, // (00100111) ' Single quote
'(' => 40, // (00101000) ( Left parenthesis
')' => 41, // (00101001) ) Right parenthesis
'*' => 42, // (00101010) * Asterisk
'+' => 43, // (00101011) + Plus
',' => 44, // (00101100) , Comma
'-' => 45, // (00101101) - Minus
'.' => 46, // (00101110) . Period
'/' => 47, // (00101111) / Slash
'0' => 48, // (00110000) 0 Zero
'1' => 49, // (00110001) 1 One
'2' => 50, // (00110010) 2 Two
'3' => 51, // (00110011) 3 Three
'4' => 52, // (00110100) 4 Four
'5' => 53, // (00110101) 5 Five
'6' => 54, // (00110110) 6 Six
'7' => 55, // (00110111) 7 Seven
'8' => 56, // (00111000) 8 Eight
'9' => 57, // (00111001) 9 Nine
':' => 58, // (00111010) : Colon
';' => 59, // (00111011) ; Semicolon
'<' => 60, // (00111100) < Less than
'=' => 61, // (00111101) = Equal sign
'>' => 62, // (00111110) > Greater than
'?' => 63, // (00111111) ? Question mark
'@' => 64, // (01000000) @ At sign
'A' => 65, // (01000001) A Uppercase A
'B' => 66, // (01000010) B Uppercase B
'C' => 67, // (01000011) C Uppercase C
'D' => 68, // (01000100) D Uppercase D
'E' => 69, // (01000101) E Uppercase E
'F' => 70, // (01000110) F Uppercase F
'G' => 71, // (01000111) G Uppercase G
'H' => 72, // (01001000) H Uppercase H
'I' => 73, // (01001001) I Uppercase I
'J' => 74, // (01001010) J Uppercase J
'K' => 75, // (01001011) K Uppercase K
'L' => 76, // (01001100) L Uppercase L
'M' => 77, // (01001101) M Uppercase M
'N' => 78, // (01001110) N Uppercase N
'O' => 79, // (01001111) O Uppercase O
'P' => 80, // (01010000) P Uppercase P
'Q' => 81, // (01010001) Q Uppercase Q
'R' => 82, // (01010010) R Uppercase R
'S' => 83, // (01010011) S Uppercase S
'T' => 84, // (01010100) T Uppercase T
'U' => 85, // (01010101) U Uppercase U
'V' => 86, // (01010110) V Uppercase V
'W' => 87, // (01010111) W Uppercase W
'X' => 88, // (01011000) X Uppercase X
'Y' => 89, // (01011001) Y Uppercase Y
'Z' => 90, // (01011010) Z Uppercase Z
'[' => 91, // (01011011) [ Left square bracket
'\\' => 92, // (01011100) \ backslash
']' => 93, // (01011101) ] Right square bracket
'^' => 94, // (01011110) ^ Caret / circumflex
'_' => 95, // (01011111) _ Underscore
'`' => 96, // (01100000) ` Grave / accent
'a' => 97, // (01100001) a Lowercase a
'b' => 98, // (01100010) b Lowercase b
'c' => 99, // (01100011) c Lowercase c
'd' => 100, // (01100100) d Lowercase d
'e' => 101, // (01100101) e Lowercase e
'f' => 102, // (01100110) f Lowercase f
'g' => 103, // (01100111) g Lowercase g
'h' => 104, // (01101000) h Lowercase h
'i' => 105, // (01101001) i Lowercase i
'j' => 106, // (01101010) j Lowercase j
'k' => 107, // (01101011) k Lowercase k
'l' => 108, // (01101100) l Lowercase l
'm' => 109, // (01101101) m Lowercase m
'n' => 110, // (01101110) n Lowercase n
'o' => 111, // (01101111) o Lowercase o
'p' => 112, // (01110000) p Lowercase p
'q' => 113, // (01110001) q Lowercase q
'r' => 114, // (01110010) r Lowercase r
's' => 115, // (01110011) s Lowercase s
't' => 116, // (01110100) t Lowercase t
'u' => 117, // (01110101) u Lowercase u
'v' => 118, // (01110110) v Lowercase v
'w' => 119, // (01110111) w Lowercase w
'x' => 120, // (01111000) x Lowercase x
'y' => 121, // (01111001) y Lowercase y
'z' => 122, // (01111010) z Lowercase z
'{' => 123, // (01111011) { Left curly bracket
'|' => 124, // (01111100) | Vertical bar
'}' => 125, // (01111101) } Right curly bracket
'~' => 126, // (01111110) ~ Tilde
"\x7f" => 127, // (01111111) DEL Delete
];

/**
* @param string $str the string
*/
Expand Down Expand Up @@ -212,6 +102,12 @@ public function offsetExists($offset): bool
*/
public function offsetGet($offset): string|null
{
// This function moves the internal byte and character pointer to the requested offset.
// This function is part of hot code so the aim is to do the following
// operations as efficiently as possible.
// UTF-8 character encoding is a variable length encoding that encodes Unicode
// characters in 1-4 bytes. Thus we fetch 4 bytes from the current offset and then use mb_substr
// to get the first UTF-8 character in it. We then use strlen to get the character's size in bytes.
if (($offset < 0) || ($offset >= $this->charLen)) {
return null;
}
Expand All @@ -220,13 +116,13 @@ public function offsetGet($offset): string|null

if ($delta > 0) {
// Fast forwarding.
while ($delta-- > 0) {
$this->byteIdx += static::getCharLength($this->str[$this->byteIdx]);
++$this->charIdx;
}
$this->byteIdx += strlen(mb_substr(substr($this->str, $this->byteIdx, 4 * $delta), 0, $delta));
$this->charIdx += $delta;
} elseif ($delta < 0) {
// Rewinding.
while ($delta++ < 0) {
// We rewind byte by byte and only count characters that are not continuation bytes,
// i.e. ASCII characters and first octets of multibyte characters
do {
$byte = ord($this->str[--$this->byteIdx]);
} while (($byte >= 128) && ($byte < 192));
Expand All @@ -235,14 +131,8 @@ public function offsetGet($offset): string|null
}
}

$bytesCount = static::getCharLength($this->str[$this->byteIdx]);

$ret = '';
for ($i = 0; $bytesCount-- > 0; ++$i) {
$ret .= $this->str[$this->byteIdx + $i];
}

return $ret;
// Fetch the first Unicode character within the next 4 bytes in the string.
return mb_substr(substr($this->str, $this->byteIdx, 4), 0, 1);
}

/**
Expand Down Expand Up @@ -270,52 +160,6 @@ public function offsetUnset($offset): void
throw new Exception('Not implemented.');
}

/**
 * Returns how many bytes a UTF-8 character occupies, judging by its first byte.
 *
 * RFC 3629 limits a UTF-8 character to 4 bytes; the 5- and 6-byte forms are
 * nevertheless still recognised here.
 *
 * The byte is not validated as a lead byte: every value from 128 to 223,
 * continuation bytes included, yields 2.
 *
 * @see https://tools.ietf.org/html/rfc3629
 *
 * @param string $byte the byte to be analyzed
 */
public static function getCharLength($byte): int
{
    // Read the byte's code from the shared map instead of calling ord() on
    // every lookup; a byte missing from the map is added to it on first use.
    $code = static::$asciiMap[$byte] ??= ord($byte);

    return match (true) {
        $code < 128 => 1,
        $code < 224 => 2,
        $code < 240 => 3,
        $code < 248 => 4,
        $code < 252 => 5, // unofficial
        default => 6, // unofficial
    };
}

/**
* Returns the length in characters of the string.
*/
Expand Down
30 changes: 6 additions & 24 deletions tests/Misc/UtfStringTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
use PHPUnit\Framework\Attributes\DataProvider;
use Throwable;

use function chr;

class UtfStringTest extends TestCase
{
/**
Expand Down Expand Up @@ -55,27 +53,6 @@ public function testUnset(): void
unset($str[0]);
}

/**
 * Checks getCharLength() against the first and last byte value of every
 * lead-byte bit pattern, from single-byte up to the unofficial 6-byte form.
 */
public function testGetCharLength(): void
{
    // expected length => [lowest byte, highest byte] tested for that bit pattern
    $boundaries = [
        1 => [0x00, 0x7F], // 00000000 - 01111111
        2 => [0xC0, 0xDF], // 11000000 - 11011111
        3 => [0xE0, 0xEF], // 11100000 - 11101111
        4 => [0xF0, 0xF7], // 11110000 - 11110111
        5 => [0xF8, 0xFB], // 11111000 - 11111011
        6 => [0xFC, 0xFD], // 11111100 - 11111101
    ];

    foreach ($boundaries as $expectedLength => $codes) {
        foreach ($codes as $code) {
            $this->assertEquals($expectedLength, UtfString::getCharLength(chr($code)));
        }
    }
}

public function testToString(): void
{
$str = new UtfString(self::TEST_PHRASE);
Expand Down Expand Up @@ -112,7 +89,7 @@ public static function utf8StringsProvider(): array
'č',
],
'emoji' => [
'😂😄😃😀😊😉😍😘😚😗😂👿😮😨😱😠😡😤😖😆😋👯',
'🦋😄😃😀😊😉😍😘😚😗😂👿😮😨😱😠😡😤😖😆😋👯',
'😂',
'😋',
],
Expand All @@ -121,6 +98,11 @@ public static function utf8StringsProvider(): array
null,
null,
],
'random' => [
'xℤⅿↈⅬ⅀ↆℜℝ⅗ℾ℧ⅰℓⅯⅵⅣ⅒21⅞',
'',
'',
],
];
}
}
54 changes: 22 additions & 32 deletions tests/benchmarks/UtfStringBench.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

use PhpMyAdmin\SqlParser\UtfString;

use function chr;
use function file_get_contents;

class UtfStringBench
Expand All @@ -19,8 +18,7 @@ class UtfStringBench
* @Iterations(20)
* @Revs(4)
* @OutputTimeUnit("milliseconds")
* @Assert("mode(variant.time.avg) < 100 milliseconds +/- 10%")
* @Assert("mode(variant.time.avg) > 30 milliseconds +/- 10%")
* @Assert("mode(variant.time.avg) < 40 milliseconds +/- 10%")
*/
public function benchBuildUtfString(): void
{
Expand All @@ -30,38 +28,30 @@ public function benchBuildUtfString(): void
}
}

/**
* Benchmarks UtfString::getCharLength() on the first and last byte value of
* each lead-byte bit pattern (0xxxxxxx, 110xxxxx, ... 1111110x); the bit
* pattern of every byte is shown in the trailing comment of its call.
*
* The two timing assertions below bound the measured time from both sides:
* below 800 microseconds and above 100 microseconds, with the stated tolerances.
*
* NOTE(review): setUp() runs before this method and loads LICENSE.txt, but the
* body never reads $this->testContents. Confirm whether the hook is needed here.
*
* @BeforeMethods("setUp")
* @Iterations(2)
* @Revs(2)
* @OutputTimeUnit("microseconds")
* @Assert("mode(variant.time.avg) < 800 microseconds +/- 20%")
* @Assert("mode(variant.time.avg) > 100 microseconds +/- 10%")
*/
public function benchGetCharLength(): void
{
// Single-byte (ASCII) range.
UtfString::getCharLength(chr(0x00)); // 00000000
UtfString::getCharLength(chr(0x7F)); // 01111111

// 2-byte lead bytes.
UtfString::getCharLength(chr(0xC0)); // 11000000
UtfString::getCharLength(chr(0xDF)); // 11011111

// 3-byte lead bytes.
UtfString::getCharLength(chr(0xE0)); // 11100000
UtfString::getCharLength(chr(0xEF)); // 11101111

// 4-byte lead bytes.
UtfString::getCharLength(chr(0xF0)); // 11110000
UtfString::getCharLength(chr(0xF7)); // 11110111

// 5-byte lead bytes (unofficial form).
UtfString::getCharLength(chr(0xF8)); // 11111000
UtfString::getCharLength(chr(0xFB)); // 11111011

// 6-byte lead bytes (unofficial form).
UtfString::getCharLength(chr(0xFC)); // 11111100
UtfString::getCharLength(chr(0xFD)); // 11111101
}

/**
 * Reads the LICENSE.txt file located two directories above this file into
 * $this->testContents, for benchmarks that register this method as a before-hook.
 *
 * A failed read is not reported here: the false returned by
 * file_get_contents() is cast to an empty string.
 */
public function setUp(): void
{
    $this->testContents = (string) file_get_contents(__DIR__ . '/../../LICENSE.txt');
}

/**
* Benchmarks UtfString::offsetGet() on a text that mixes ASCII letters,
* accented Latin letters (2 bytes each), emoji (4 bytes each) and
* letter-like/number-form symbols (3 bytes each).
*
* The offsets are requested out of order (10, 100, 20, 0), so the calls
* cover both the fast-forward and the rewind branch of offsetGet().
*
* @Iterations(20)
* @Revs(4)
* @OutputTimeUnit("microseconds")
* @Assert("mode(variant.time.avg) < 120 microseconds +/- 10%")
*/
public function benchUtfStringRandomAccessWithUnicode(): void
{
// The string is single-quoted, so the \x.. sequences on its fourth line are
// kept as literal backslash text and are not converted to raw bytes.
// NOTE(review): confirm that this is intended.
$text = 'abcdefghijklmnopqrstuvwxyz
áéíóúýěřťǔǐǒǎšďȟǰǩľžčǚň
🦋😄😃😀😊😉😍😘😚😗😂👿😮😨😱😠😡😤😖😆😋👯
P\xf8\xed\xb9ern\xec \xbelu\xbbou\xe8k\xfd k\xf3d \xfap\xecl \xef\xe1belsk\xe9 k\xf3dy
xℤⅿↈⅬ⅀ↆℜℝ⅗ℾ℧ⅰℓⅯⅵⅣ⅒21⅞';

$str1 = new UtfString($text);
// Forward to 10, forward to 100, back to 20, back to the start.
$str1->offsetGet(10);
$str1->offsetGet(100);
$str1->offsetGet(20);
$str1->offsetGet(0);
}
}

0 comments on commit d57481d

Please sign in to comment.