diff --git a/java/com/google/re2j/Inst.java b/java/com/google/re2j/Inst.java index 8a76b349..11b77362 100644 --- a/java/com/google/re2j/Inst.java +++ b/java/com/google/re2j/Inst.java @@ -57,25 +57,38 @@ boolean matchRune(int r) { // Special case: single-rune slice is from literal string, not char // class. if (runes.length == 1) { - int r0 = runes[0]; - if (r == r0) { - return true; - } - if ((arg & RE2.FOLD_CASE) != 0) { - for (int r1 = Unicode.simpleFold(r0); - r1 != r0; - r1 = Unicode.simpleFold(r1)) { - if (r == r1) { - return true; - } + return singleMatchRune(r); + } + + return multiMatchRune(r); + } + + private boolean singleMatchRune(int r) { // Literal case: compares r with runes[0], honoring RE2.FOLD_CASE. + int r0 = runes[0]; + if (r == r0) { + return true; + } + + if ((arg & RE2.FOLD_CASE) != 0) { + int[] folds = Unicode.optimizedFoldOrbit(r0); + + if (folds == null) { // No precomputed orbit for r0: compare through toUpper/toLower instead. + return Unicode.areEqualsCaseInsensitive(r, r0); + } else { + for(int i = 0; i < folds.length; i++) { + if (folds[i] == r) return true; } } - return false; } + return false; + } + - // Peek at the first few pairs. + private boolean multiMatchRune(int r) { + // Peek at the first 5 pairs. // Should handle ASCII well. - for (int j = 0; j < runes.length && j <= 8; j += 2) { + int length = Math.min(runes.length, 10); + for (int j = 0; j < length; j += 2) { if (r < runes[j]) { return false; } @@ -84,7 +97,8 @@ boolean matchRune(int r) { } } - // Otherwise binary search. + // Otherwise binary search + // lo, hi and m count pairs, not array slots: pair m starts at runes[2 * m]. for (int lo = 0, hi = runes.length / 2; lo < hi; ) { int m = lo + (hi - lo) / 2; int c = runes[2 * m]; diff --git a/java/com/google/re2j/Unicode.java b/java/com/google/re2j/Unicode.java index 2f12a8ee..d114a278 100644 --- a/java/com/google/re2j/Unicode.java +++ b/java/com/google/re2j/Unicode.java @@ -7,6 +7,8 @@ package com.google.re2j; +import java.util.Arrays; + /** * Utilities for dealing with Unicode better than Java does.
* @@ -227,6 +229,101 @@ static int simpleFold(int r) { return toUpper(r); } + static int[] optimizedFoldOrbit(int r) { // Returns the runes reached from r by repeated simpleFold (r excluded), or null when r has no FOLD_MAP entry. + if (r >= ORBIT_MIN_VALUE) { + return FOLD_MAP.get(r); + } + return null; + } + + private static final FoldMap FOLD_MAP = new FoldMap(); + private static final int ORBIT_MIN_VALUE = UnicodeTables.CASE_ORBIT[0][0]; + + static { // For each CASE_ORBIT row, store under orbit[0] the runes reached from it by repeated simpleFold. + for (int[] orbit : UnicodeTables.CASE_ORBIT) { + int r0 = orbit[0]; + int[] folds = new int[3]; + int foldsSize = 0; + int r1 = r0; + while ((r1 = simpleFold(r1)) != r0) { + if (foldsSize >= folds.length) { + folds = Arrays.copyOf(folds, folds.length * 2); // copyOf returns a new array; keep it or the write below overflows + } + folds[foldsSize] = r1; + foldsSize++; + } + FOLD_MAP.put(r0, Arrays.copyOf(folds, foldsSize)); + } + } + + public static boolean areEqualsCaseInsensitive(int r0, int r1) { + return r0 == toUpper(r1) || r0 == toLower(r1); + } + + /* + A FoldMap maps a rune to an array of runes that are equivalent to it in a case-insensitive pattern. + */ + private static class FoldMap { + private static final int MAX_LINEAR_PROBING = 4; + private final int[] keys; + private final int[][] values; + private final int mask; + + FoldMap() { + int s = findNextPositivePowerOfTwo(UnicodeTables.CASE_ORBIT.length * 4); // sized for a load factor of 0.25, aiming to keep probe runs within MAX_LINEAR_PROBING + keys = new int[s]; + Arrays.fill(keys, -1); + values = new int[s][]; + mask = s - 1; + } + + void put(int k, int[] fold) { + if (k == -1) throw new IllegalArgumentException("-1 is the empty marker"); + + int index = hashCode(k); + int maxIndex = index + MAX_LINEAR_PROBING; + + do { + int slot = index & mask; + if (keys[slot] == -1) { // empty slot + values[slot] = fold; + keys[slot] = k; + return; + } + index++; + } while (index < maxIndex); + + throw new IllegalStateException("Map is full"); + } + + int[] get(int k) { + int index = hashCode(k); + int maxIndex = index + MAX_LINEAR_PROBING; + do { + int slot = index & mask; + int key = keys[slot]; + if (key ==
k) { + return values[slot]; + } + index++; + } while (index < maxIndex); + + return null; + } + + private int hashCode(int k) { + final int h = k * 0x9E3779B9; + return (h ^ (h >> 16)) & mask; // masked: same slots are probed, but index + MAX_LINEAR_PROBING can no longer overflow int + } + + private int findNextPositivePowerOfTwo(final int value) { + return 1 << (32 - Integer.numberOfLeadingZeros(value - 1)); + } + } + private Unicode() {} // uninstantiable } diff --git a/javatests/com/google/re2j/RunesTest.java b/javatests/com/google/re2j/RunesTest.java new file mode 100644 index 00000000..2d75eb90 --- /dev/null +++ b/javatests/com/google/re2j/RunesTest.java @@ -0,0 +1,48 @@ +package com.google.re2j; + +import org.junit.Test; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class RunesTest { + + @Test + public void testRunes() { + RE2 compile = RE2.compile("[0-13-46-78-9b-ce-fh-i]"); + assertTrue(compile.match("0")); + assertTrue(compile.match("1")); + assertTrue(compile.match("3")); + assertTrue(compile.match("4")); + assertTrue(compile.match("6")); + assertTrue(compile.match("7")); + assertTrue(compile.match("8")); + assertTrue(compile.match("9")); + assertTrue(compile.match("b")); + assertTrue(compile.match("c")); + assertTrue(compile.match("e")); + assertTrue(compile.match("f")); + assertTrue(compile.match("h")); + assertTrue(compile.match("i")); + + assertFalse(compile.match("2")); + assertFalse(compile.match("5")); + assertFalse(compile.match("a")); + assertFalse(compile.match("d")); + assertFalse(compile.match("g")); + assertFalse(compile.match("j")); + } + @Test + public void testRunesWithFold() { + Pattern pattern = Pattern.compile("ak", Pattern.CASE_INSENSITIVE); + String upperCase = "AK"; + String withKelvin = "A\u212A"; + assertFalse(withKelvin.equals(upperCase)); // second char must be U+212A KELVIN SIGN, not ASCII 'K' + + assertTrue(pattern.matches("ak")); + assertTrue(pattern.matches(upperCase)); + assertTrue(pattern.matches("aK")); + assertTrue(pattern.matches("Ak")); + assertTrue(pattern.matches(withKelvin)); + } +} diff
--git a/javatests/com/google/re2j/UnicodeTest.java b/javatests/com/google/re2j/UnicodeTest.java index 0ae11b29..e043b1bd 100644 --- a/javatests/com/google/re2j/UnicodeTest.java +++ b/javatests/com/google/re2j/UnicodeTest.java @@ -4,6 +4,8 @@ package com.google.re2j; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import org.junit.Test; @@ -38,4 +40,36 @@ public void testFoldConstants() { // int toLower(int r); // int simpleFold(int r); + @Test + public void testOptimizedFold() { + // Check that the new optimized fold algorithm gives the same result as using simple fold. + for (int i = 0; i <= Unicode.MAX_RUNE; i++) { + int[] orbit = Unicode.optimizedFoldOrbit(i); + if (orbit != null) { + testOptimizedFoldOrbitCase(i, orbit); + } else { + testOptimizedFoldNonOrbitCase(i); + } + } + } + + private void testOptimizedFoldNonOrbitCase(int i) { + int r = Unicode.simpleFold(i); + assertEquals(i, Unicode.simpleFold(r)); // a second fold always goes back to the first rune + assertTrue(Unicode.areEqualsCaseInsensitive(i, i)); + assertTrue(Unicode.areEqualsCaseInsensitive(i, r)); + assertTrue(Unicode.areEqualsCaseInsensitive(r, i)); + assertTrue(Unicode.areEqualsCaseInsensitive(r, r)); + } + + private void testOptimizedFoldOrbitCase(int i, int[] orbit) { + int r = Unicode.simpleFold(i); + int j = 0; + while (r != i) { + assertEquals(r, orbit[j]); + j++; + r = Unicode.simpleFold(r); + } + assertEquals(j, orbit.length); // the orbit must hold exactly the simpleFold cycle, with no extra entries + } }