Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

nondeterministic char_substitutes #4

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 25 additions & 20 deletions dawg_python/dawgs.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,15 @@ def _similar_keys(self, current_prefix, key, index, replace_chars):
b_step = key[word_pos].encode('utf8')

if b_step in replace_chars:
next_index = index
b_replace_char, u_replace_char = replace_chars[b_step]
for (b_replace_char, u_replace_char) in replace_chars[b_step]:
next_index = index

next_index = self.dct.follow_bytes(b_replace_char, next_index)
next_index = self.dct.follow_bytes(b_replace_char, next_index)

if next_index is not None:
prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
extra_keys = self._similar_keys(prefix, key, next_index, replace_chars)
res += extra_keys
if next_index:
prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
extra_keys = self._similar_keys(prefix, key, next_index, replace_chars)
res += extra_keys

index = self.dct.follow_bytes(b_step, index)
if index is None:
Expand All @@ -69,7 +69,7 @@ def similar_keys(self, key, replaces):

``replaces`` is an object obtained from
``DAWG.compile_replaces(mapping)`` where mapping is a dict
that maps single-char unicode sitrings to another single-char
that maps single-char unicode strings to (one or more) single-char
unicode strings.

This may be useful e.g. for handling single-character umlauts.
Expand All @@ -80,13 +80,17 @@ def similar_keys(self, key, replaces):
def compile_replaces(cls, replaces):

for k,v in replaces.items():
if len(k) != 1 or len(v) != 1:
raise ValueError("Keys and values must be single-char unicode strings.")
if len(k) != 1:
raise ValueError("Keys must be single-char unicode strings.")
if (isinstance(v, str) and len(v) != 1):
raise ValueError("Values must be single-char unicode strings or non-empty lists of such.")
if isinstance(v, list) and (any(len(v_entry) != 1 for v_entry in v) or len(v) < 1):
raise ValueError("Values must be single-char unicode strings or non-empty lists of such.")

return dict(
(
k.encode('utf8'),
(v.encode('utf8'), v)
[(v_entry.encode('utf8'), v_entry) for v_entry in v]
)
for k, v in replaces.items()
)
Expand Down Expand Up @@ -333,14 +337,15 @@ def _similar_items(self, current_prefix, key, index, replace_chars):
b_step = key[word_pos].encode('utf8')

if b_step in replace_chars:
next_index = index
b_replace_char, u_replace_char = replace_chars[b_step]
for (b_replace_char, u_replace_char) in replace_chars[b_step]:
next_index = index

next_index = self.dct.follow_bytes(b_replace_char, next_index)
if next_index:
prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
extra_items = self._similar_items(prefix, key, next_index, replace_chars)
res += extra_items
next_index = self.dct.follow_bytes(b_replace_char, next_index)

if next_index:
prefix = current_prefix + key[start_pos:word_pos] + u_replace_char
extra_items = self._similar_items(prefix, key, next_index, replace_chars)
res += extra_items

index = self.dct.follow_bytes(b_step, index)
if not index:
Expand All @@ -363,7 +368,7 @@ def similar_items(self, key, replaces):

``replaces`` is an object obtained from
``DAWG.compile_replaces(mapping)`` where mapping is a dict
that maps single-char unicode sitrings to another single-char
that maps single-char unicode strings to (one or more) single-char
unicode strings.
"""
return self._similar_items("", key, self.dct.ROOT, replaces)
Expand Down Expand Up @@ -406,7 +411,7 @@ def similar_item_values(self, key, replaces):

``replaces`` is an object obtained from
``DAWG.compile_replaces(mapping)`` where mapping is a dict
that maps single-char unicode sitrings to another single-char
that maps single-char unicode strings to (one or more) single-char
unicode strings.
"""
return self._similar_item_values(0, key, self.dct.ROOT, replaces)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

setup(
name="DAWG-Python",
version="0.7.2",
version="0.7.3",
description="Pure-python reader for DAWGs (DAFSAs) created by dawgdic C++ library or DAWG Python extension.",
long_description = open('README.rst').read() + "\n\n"+ open('CHANGES.rst').read(),
author='Mikhail Korobov',
Expand Down