Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use an open-addressing map to store folding paths #36

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 29 additions & 15 deletions java/com/google/re2j/Inst.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,25 +57,38 @@ boolean matchRune(int r) {
// Special case: single-rune slice is from literal string, not char
// class.
if (runes.length == 1) {
int r0 = runes[0];
if (r == r0) {
return true;
}
if ((arg & RE2.FOLD_CASE) != 0) {
for (int r1 = Unicode.simpleFold(r0);
r1 != r0;
r1 = Unicode.simpleFold(r1)) {
if (r == r1) {
return true;
}
return singleMatchRune(r);
}

return multiMatchRune(r);
}

/**
 * Matches {@code r} against the single rune stored in {@code runes[0]}.
 *
 * <p>An exact match always succeeds. When the {@code RE2.FOLD_CASE} bit is set
 * in {@code arg}, {@code r} also matches if it appears in the fold orbit
 * returned by {@code Unicode.optimizedFoldOrbit}, or, when no orbit is
 * returned, if {@code Unicode.areEqualsCaseInsensitive} accepts the pair.
 *
 * @param r the rune to test
 * @return whether {@code r} matches the stored rune
 */
private boolean singleMatchRune(int r) {
  final int literal = runes[0];
  if (r == literal) {
    return true;
  }

  final boolean foldCase = (arg & RE2.FOLD_CASE) != 0;
  if (!foldCase) {
    return false;
  }

  final int[] orbit = Unicode.optimizedFoldOrbit(literal);
  if (orbit == null) {
    // No precomputed orbit for this rune: fall back to the upper/lower check.
    return Unicode.areEqualsCaseInsensitive(r, literal);
  }

  for (int folded : orbit) {
    if (folded == r) {
      return true;
    }
  }
  return false;
}


// Peek at the first few pairs.
private boolean multiMatchRune(int r) {
// Peek at the first 5 pairs.
// Should handle ASCII well.
for (int j = 0; j < runes.length && j <= 8; j += 2) {
int length = Math.min(runes.length, 10);
for (int j = 0; j < length; j += 2) {
if (r < runes[j]) {
return false;
}
Expand All @@ -84,7 +97,8 @@ boolean matchRune(int r) {
}
}

// Otherwise binary search.
// Otherwise binary search
// Invariant: lo, hi, m are even.
for (int lo = 0, hi = runes.length / 2; lo < hi; ) {
int m = lo + (hi - lo) / 2;
int c = runes[2 * m];
Expand Down
97 changes: 97 additions & 0 deletions java/com/google/re2j/Unicode.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

package com.google.re2j;

import java.util.Arrays;

/**
* Utilities for dealing with Unicode better than Java does.
*
Expand Down Expand Up @@ -227,6 +229,101 @@ static int simpleFold(int r) {
return toUpper(r);
}

/**
 * Looks up the precomputed fold orbit of {@code r}.
 *
 * @param r the rune to look up
 * @return the array stored in {@code FOLD_MAP} for {@code r}, or {@code null}
 *     when {@code r} is below {@code ORBIT_MIN_VALUE} or has no entry
 */
static int[] optimizedFoldOrbit(int r) {
  // Runes below the first orbit key are never looked up in the map.
  return r < ORBIT_MIN_VALUE ? null : FOLD_MAP.get(r);
}

/** Maps each first rune of a {@code CASE_ORBIT} entry to the other runes of its fold orbit. */
private static final FoldMap FOLD_MAP = new FoldMap();

/**
 * First key of {@code UnicodeTables.CASE_ORBIT}. Presumably the table is
 * sorted ascending, making this the smallest rune that has an orbit entry;
 * confirm against the table generator.
 */
private static final int ORBIT_MIN_VALUE = UnicodeTables.CASE_ORBIT[0][0];

static {
  // For every orbit entry, walk simpleFold() until it cycles back to the
  // starting rune and record every rune visited on the way.
  for (int[] orbit : UnicodeTables.CASE_ORBIT) {
    final int r0 = orbit[0];
    int[] folds = new int[3];
    int foldsSize = 0;
    for (int r1 = simpleFold(r0); r1 != r0; r1 = simpleFold(r1)) {
      if (foldsSize == folds.length) {
        // Arrays.copyOf returns a NEW array; it must be assigned back,
        // otherwise the buffer never grows and the store below overflows.
        folds = Arrays.copyOf(folds, folds.length * 2);
      }
      folds[foldsSize++] = r1;
    }
    // Trim the buffer to the number of runes actually collected.
    FOLD_MAP.put(r0, Arrays.copyOf(folds, foldsSize));
  }
}

/**
 * Reports whether {@code r0} equals the upper-case or the lower-case mapping
 * of {@code r1}.
 *
 * @param r0 the rune compared against the mappings
 * @param r1 the rune whose upper- and lower-case mappings are computed
 * @return {@code true} if {@code r0 == toUpper(r1)} or {@code r0 == toLower(r1)}
 */
public static boolean areEqualsCaseInsensitive(int r0, int r1) {
  if (r0 == toUpper(r1)) {
    return true;
  }
  return r0 == toLower(r1);
}

/**
 * A FoldMap maps a rune to an array of runes that are equivalent to it in a
 * case-insensitive pattern.
 *
 * <p>It is a fixed-size open-addressing table over primitive {@code int} keys.
 * A lookup or insertion inspects at most {@link #MAX_LINEAR_PROBING}
 * consecutive slots, wrapping around the end of the table. The key
 * {@code -1} is reserved as the empty-slot marker.
 */
private static class FoldMap {
  /** Maximum number of consecutive slots inspected by {@code put} and {@code get}. */
  private static final int MAX_LINEAR_PROBING = 4;

  /** Sentinel stored in {@code keys} for a slot that holds no entry. */
  private static final int EMPTY = -1;

  private final int[] keys;
  private final int[][] values;
  /** {@code table length - 1}; the length is a power of two, so this is a bit mask. */
  private final int mask;

  FoldMap() {
    // Table is at least 4x the number of entries (load factor of at most
    // 0.25), intended to keep probe sequences within MAX_LINEAR_PROBING.
    final int s = findNextPositivePowerOfTwo(UnicodeTables.CASE_ORBIT.length * 4);
    keys = new int[s];
    Arrays.fill(keys, EMPTY);
    values = new int[s][];
    mask = s - 1;
  }

  /**
   * Stores {@code fold} under key {@code k} in the first empty slot of the
   * probe sequence of {@code k}.
   *
   * @param k the key; must not be {@code -1}
   * @param fold the value to associate with {@code k}
   * @throws IllegalArgumentException if {@code k} is {@code -1}
   * @throws IllegalStateException if none of the probed slots is empty
   */
  void put(int k, int[] fold) {
    if (k == EMPTY) {
      throw new IllegalArgumentException("-1 is the empty marker");
    }

    // Probes are counted, rather than compared against "hash + limit",
    // because that sum overflows int for hashes near Integer.MAX_VALUE.
    int slot = hash(k) & mask;
    for (int probe = 0; probe < MAX_LINEAR_PROBING; probe++) {
      if (keys[slot] == EMPTY) {
        values[slot] = fold;
        keys[slot] = k;
        return;
      }
      slot = (slot + 1) & mask;
    }

    throw new IllegalStateException("Map is full");
  }

  /**
   * Returns the value stored under {@code k}.
   *
   * @param k the key to look up
   * @return the stored array, or {@code null} if an empty slot is reached or
   *     the probe limit is exhausted without finding {@code k}
   */
  int[] get(int k) {
    int slot = hash(k) & mask;
    for (int probe = 0; probe < MAX_LINEAR_PROBING; probe++) {
      final int key = keys[slot];
      if (key == EMPTY) {
        return null; // empty slot: k was never inserted
      }
      if (key == k) {
        return values[slot];
      }
      slot = (slot + 1) & mask;
    }
    return null;
  }

  /** Scrambles {@code k} with a multiplicative hash and folds the high bits into the low ones. */
  private static int hash(int k) {
    final int h = k * 0x9E3779B9;
    return h ^ (h >> 16);
  }

  /** Returns the smallest power of two that is {@code >= value} (1 for values below 2). */
  private static int findNextPositivePowerOfTwo(final int value) {
    return 1 << (32 - Integer.numberOfLeadingZeros(value - 1));
  }
}

/** Private constructor: prevents instantiation of this class. */
private Unicode() {} // uninstantiable

}
48 changes: 48 additions & 0 deletions javatests/com/google/re2j/RunesTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package com.google.re2j;

import org.junit.Test;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

/**
 * Tests rune matching for character classes with many ranges and for
 * case-insensitive literals whose fold orbit contains more than two runes.
 */
public class RunesTest {

  /** A class with seven ranges: every listed rune must match, the gaps must not. */
  @Test
  public void testRunes() {
    RE2 compile = RE2.compile("[0-13-46-78-9b-ce-fh-i]");

    String[] matching = {
      "0", "1", "3", "4", "6", "7", "8", "9", "b", "c", "e", "f", "h", "i"
    };
    for (String s : matching) {
      assertTrue(s, compile.match(s));
    }

    String[] nonMatching = {"2", "5", "a", "d", "g", "j"};
    for (String s : nonMatching) {
      assertFalse(s, compile.match(s));
    }
  }

  /** 'k' folds to 'K' and to U+212A KELVIN SIGN; all three must match case-insensitively. */
  @Test
  public void testRunesWithFold() {
    Pattern pattern = Pattern.compile("ak", Pattern.CASE_INSENSITIVE);
    String upperCase = "AK";
    // Written as an escape so the KELVIN SIGN cannot be silently normalized
    // to a plain 'K' by an editor or an encoding round-trip.
    String withKelvin = "A\u212A";
    assertFalse(withKelvin.equals(upperCase)); // check we use the kelvin sign

    assertTrue(pattern.matches("ak"));
    assertTrue(pattern.matches(upperCase));
    assertTrue(pattern.matches("aK"));
    assertTrue(pattern.matches("Ak"));
    assertTrue(pattern.matches(withKelvin));
  }
}
33 changes: 33 additions & 0 deletions javatests/com/google/re2j/UnicodeTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

package com.google.re2j;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import org.junit.Test;
Expand Down Expand Up @@ -38,4 +40,35 @@ public void testFoldConstants() {
// int toLower(int r);
// int simpleFold(int r);

/**
 * Checks, for every rune up to {@code Unicode.MAX_RUNE}, that the optimized
 * fold lookup agrees with repeatedly applying {@code Unicode.simpleFold}.
 */
@Test
public void testOptimizedFold() {
  for (int rune = 0; rune <= Unicode.MAX_RUNE; rune++) {
    final int[] orbit = Unicode.optimizedFoldOrbit(rune);
    if (orbit == null) {
      // No precomputed orbit: the upper/lower comparison must suffice.
      testOptimizedFoldNonOrbitCase(rune);
      continue;
    }
    testOptimizedFoldOrbitCase(rune, orbit);
  }
}

/**
 * Verifies a rune without a precomputed orbit: folding twice returns to the
 * rune, and {@code areEqualsCaseInsensitive} accepts every pairing of the
 * rune and its fold.
 */
private void testOptimizedFoldNonOrbitCase(int i) {
  final int folded = Unicode.simpleFold(i);
  // The second fold always goes back to the first rune.
  assertEquals(i, Unicode.simpleFold(folded));

  assertTrue(Unicode.areEqualsCaseInsensitive(i, i));
  assertTrue(Unicode.areEqualsCaseInsensitive(i, folded));
  assertTrue(Unicode.areEqualsCaseInsensitive(folded, i));
  assertTrue(Unicode.areEqualsCaseInsensitive(folded, folded));
}

/**
 * Verifies that {@code orbit} lists exactly the runes produced by repeatedly
 * applying {@code Unicode.simpleFold} to {@code i}, in order, up to but not
 * including the return to {@code i}.
 */
private void testOptimizedFoldOrbitCase(int i, int[] orbit) {
  int j = 0;
  for (int r = Unicode.simpleFold(i); r != i; r = Unicode.simpleFold(r)) {
    // Fail with an assertion, not an ArrayIndexOutOfBoundsException,
    // when the precomputed orbit is shorter than the simpleFold cycle.
    assertTrue("orbit of " + i + " is too short", j < orbit.length);
    assertEquals(r, orbit[j]);
    j++;
  }
  // The precomputed orbit must not contain extra trailing runes either.
  assertEquals(orbit.length, j);
}
}