Commit: version v0.3
DevRoss committed Nov 17, 2021
1 parent 84a467c commit a4449e5
Showing 6 changed files with 321 additions and 55 deletions.
5 changes: 5 additions & 0 deletions .travis.yml
@@ -3,6 +3,11 @@ python:
- "2.7"
- "3.5"
- "3.6"
- "3.7"
- "3.8"
- "3.9"
- "3.10"

install:
- pip install six
- pip install .
32 changes: 24 additions & 8 deletions README.md
@@ -1,6 +1,6 @@
# bert_slot_tokenizer

Version 0.2
Version 0.3

![Travis (.org)](https://img.shields.io/travis/DevRoss/bert-slot-tokenizer) ![GitHub](https://img.shields.io/github/license/devross/bert-slot-tokenizer)

@@ -19,22 +19,38 @@ pip install bert-slot-tokenizer

## Supported formats:

- [IOB format](https://en.wikipedia.org/wiki/Inside–outside–beginning_(tagging))
- IOB format
- IOBS format
- BMES format
- SPAN format

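A quick hedged comparison (using a hypothetical three-token entity X Y Z of type T; the tag patterns below follow the conversion code in this commit, and the library's real outputs appear in the usage example that follows):

```python
# One hypothetical three-token entity ['X', 'Y', 'Z'] of type 'T':
#   IOB  -> ['B-T', 'I-T', 'I-T']
#   IOBS -> ['B-T', 'I-T', 'I-T']   # 'S-T' is used only for single-token entities
#   BMES -> ['B-T', 'M-T', 'E-T']   # 'S-T' is used for single-token entities
#   SPAN -> [begin_token_index, end_token_index, 'T']   # end index is inclusive
```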
## Usage:

```python
from bert_slot_tokenizer import SlotConverter
vacab_path = 'tests/test_data/example_vocab.txt'
vocab_path = 'tests/test_data/example_vocab.txt'
# you can find an example here --> https://github.com/DevRoss/bert-slot-tokenizer/blob/master/tests/test_data/example_vocab.txt
sc = SlotConverter(vocab_path, do_lower_case=True)
text = 'Too YOUNG, too simple, sometimes naive! 蛤蛤+1s'
slot = {'name': '蛤蛤', 'time': '+1s'}
output_text, iob_slot = sc.convert2iob(text, slot)
text = 'Too YOUNG, too simple, sometimes naive! 蛤蛤+1s蛤蛤蛤嗝'
slot = {'蛤蛤': 'name', '+1s': 'time', '嗝': '语气'}
output_text, iob_slot = sc.convert(text, slot, fmt='IOB')
output_text, iobs_slot = sc.convert(text, slot, fmt='IOBS')
output_text, bmes_slot = sc.convert(text, slot, fmt='BMES')
output_text, span_slot = sc.convert(text, slot, fmt='SPAN')
print(output_text)
# ['too', 'young', ',', 'too', 'simple', ',', 'some', '##times', 'na', '##ive', '!', '蛤', '蛤', '+', '1', '##s']
# ['too', 'young', ',', 'too', 'simple', ',', 'some', '##times', 'na', '##ive', '!', '蛤', '蛤', '+', '1', '##s', '蛤', '蛤', '蛤', '嗝']

print(iob_slot)
# ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-name', 'I-name', 'B-time', 'I-time', 'I-time']
# ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-name', 'I-name', 'B-time', 'I-time', 'I-time', 'B-name', 'I-name', 'O', 'B-语气']

print(iobs_slot)
# ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-name', 'I-name', 'B-time', 'I-time', 'I-time', 'B-name', 'I-name', 'O', 'S-语气']

print(bmes_slot)
# ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-name', 'E-name', 'B-time', 'M-time', 'E-time', 'B-name', 'E-name', 'O', 'S-语气']

print(span_slot)
# [[11, 12, 'name'], [13, 15, 'time'], [16, 17, 'name'], [19, 19, '语气']]
```

## Closing remarks:
136 changes: 112 additions & 24 deletions bert_slot_tokenizer/__init__.py
@@ -4,6 +4,7 @@
import json
import io
from bert_slot_tokenizer.bert_tokenizer import tokenization
import logging


class SlotConverter:
@@ -45,22 +46,27 @@ def parse_json(file)
def kmp(main_str, pattern):
"""
KMP algorithm to find the begin index of the pattern tokens inside the main token sequence.
return: List[int], a list of begin indices, one per match (empty if nothing matches)
"""
nex = SlotConverter.get_next(pattern)
i = 0 # the pointer of main_str
j = 0 # the pointer of pattern
while i < len(main_str) and j < len(pattern):
if j == -1 or main_str[i] == pattern[j]:
results = []
while i < len(main_str):
while i < len(main_str) and j < len(pattern):
if j == -1 or main_str[i] == pattern[j]:
i += 1
j += 1
else:
j = nex[j]

if j == len(pattern): # matched
results.append(i - j)
i += 1
j += 1
j = 0
else:
j = nex[j]

if j == len(pattern): # matched
return i - j
else:
return -1
break
return results

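Because this flattened diff interleaves the removed single-match lines with the added multi-match lines, the new control flow of `kmp` is hard to follow here. Below is a minimal, self-contained sketch of the multi-match logic implied by the added lines and by the README example output (it collects the start index of every non-overlapping occurrence); it is an approximation and not guaranteed to match the committed code line for line:

```python
def get_next(pattern):
    """Standard KMP failure array with nex[0] == -1, matching the 'j == -1' checks above."""
    nex = [-1] * len(pattern)
    i, j = 0, -1
    while i < len(pattern) - 1:
        if j == -1 or pattern[i] == pattern[j]:
            i += 1
            j += 1
            nex[i] = j
        else:
            j = nex[j]
    return nex


def kmp(main_str, pattern):
    """Return the start index of every non-overlapping occurrence of pattern in main_str."""
    nex = get_next(pattern)
    i = 0  # pointer into main_str
    j = 0  # pointer into pattern
    results = []
    while i < len(main_str):
        # standard KMP scan for the next occurrence
        while i < len(main_str) and j < len(pattern):
            if j == -1 or main_str[i] == pattern[j]:
                i += 1
                j += 1
            else:
                j = nex[j]
        if j == len(pattern):   # matched: record the start and keep scanning
            results.append(i - j)
            j = 0
        else:                   # main_str exhausted without another match
            break
    return results


# e.g. kmp(list('abcabcab'), list('abc')) -> [0, 3]
```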
@staticmethod
def get_next(pattern):
@@ -81,13 +87,13 @@ def get_next(pattern):
return nex

@classmethod
def tokenize(cls, begin_index, end_index, slot_key, ret_slot):
def fill_iob(cls, begin_index, end_index, slot_type, ret_slot):
"""
Convert to IOB format tags given the slot's begin/end token indices.
This works in place: ret_slot is modified by calling this function.
:param begin_index: slot begin index, where the slot begins.
:param end_index: slot end index, where the slot ends.
:param slot_key: the label of the slot, such as 'AppName'
:param end_index: slot end index, where the slot ends (exclusive).
:param slot_type: the label of the slot, such as 'AppName'
:param ret_slot: the tag list that is filled in place
:return:
"""
@@ -97,26 +103,108 @@ def tokenize(cls, begin_index, end_index, slot_key, ret_slot):
if ret_slot[i] != unicode_O:
break
if i == begin_index:
ret_slot[i] = tokenization.convert_to_unicode('B-' + slot_key)
ret_slot[i] = tokenization.convert_to_unicode('B-' + slot_type)
else:
ret_slot[i] = tokenization.convert_to_unicode('I-' + slot_key)
ret_slot[i] = tokenization.convert_to_unicode('I-' + slot_type)
@classmethod
def fill_iobs(cls, begin_index, end_index, slot_type, ret_slot):
"""
Convert to IOBS format tags given the slot's begin/end token indices
(S- is used when the slot covers a single token).
This works in place: ret_slot is modified by calling this function.
:param begin_index: slot begin index, where the slot begins.
:param end_index: slot end index, where the slot ends (exclusive).
:param slot_type: the label of the slot, such as 'AppName'
:param ret_slot: the tag list that is filled in place
:return:
"""
# operates on token indices in [begin_index, end_index)
unicode_O = tokenization.convert_to_unicode('O')
for i in range(begin_index, end_index):
if ret_slot[i] != unicode_O:
break
if end_index - begin_index == 1:
ret_slot[i] = tokenization.convert_to_unicode('S-' + slot_type)
continue
if i == begin_index:
ret_slot[i] = tokenization.convert_to_unicode('B-' + slot_type)
else:
ret_slot[i] = tokenization.convert_to_unicode('I-' + slot_type)

def convert2iob(self, text, slot):
@classmethod
def fill_bmes(cls, begin_index, end_index, slot_type, ret_slot):
"""
Convert to BMES format tags given the slot's begin/end token indices
(B-/M-/E- for multi-token slots, S- for a single-token slot).
This works in place: ret_slot is modified by calling this function.
:param begin_index: slot begin index, where the slot begins.
:param end_index: slot end index, where the slot ends (exclusive).
:param slot_type: the label of the slot, such as 'AppName'
:param ret_slot: the tag list that is filled in place
:return:
"""
# operates on token indices in [begin_index, end_index)
unicode_O = tokenization.convert_to_unicode('O')
for i in range(begin_index, end_index):
if ret_slot[i] != unicode_O:
break
if end_index - begin_index == 1:
ret_slot[i] = tokenization.convert_to_unicode('S-' + slot_type)
continue
if i == begin_index:
ret_slot[i] = tokenization.convert_to_unicode('B-' + slot_type)
elif i == end_index - 1:
ret_slot[i] = tokenization.convert_to_unicode('E-' + slot_type)
else:
ret_slot[i] = tokenization.convert_to_unicode('M-' + slot_type)

@classmethod
def tag_span(cls, begin_index, end_index, slot_type, ret_slot):
"""
Record a SPAN format tag given the slot's begin/end token indices.
This works in place: ret_slot is extended by calling this function.
:param begin_index: slot begin index, where the slot begins.
:param end_index: slot end index, where the slot ends (exclusive).
:param slot_type: the label of the slot, such as 'AppName'
:param ret_slot: a list of [begin_index, end_index - 1, slot_type] entries
:return:
"""
# operates on token indices in [begin_index, end_index)
ret_slot.append([begin_index, end_index - 1, slot_type])

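To make the in-place contract shared by these helpers concrete, here is a small hedged sketch (the indices and the 'name' label are invented for illustration) showing how `fill_bmes` rewrites a tag list; `fill_iob`, `fill_iobs` and `tag_span` behave analogously with their own output shapes:

```python
from bert_slot_tokenizer import SlotConverter

tags = ['O'] * 5
# tag tokens 1..3 (end index exclusive) as one three-token 'name' entity
SlotConverter.fill_bmes(1, 4, 'name', tags)
print(tags)
# ['O', 'B-name', 'M-name', 'E-name', 'O']
```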
def convert(self, text, slot, fmt='IOB'):
"""
convert a dict slot to the tagging format selected by fmt
:param text: text with type str
:param slot: slot with type dict, mapping entity text to entity type
:param fmt: one of 'IOB', 'IOBS', 'BMES', 'SPAN'
:return: a tuple (text_tokens, tag_result)
"""
if fmt == 'IOB':
convert_func = SlotConverter.fill_iob
elif fmt == 'IOBS':
convert_func = SlotConverter.fill_iobs
elif fmt == 'BMES':
convert_func = SlotConverter.fill_bmes
elif fmt == 'SPAN':
convert_func = SlotConverter.tag_span
else:
raise NotImplementedError('Unsupported format {fmt}, please use one of [IOB, IOBS, BMES, SPAN]'.format(fmt=fmt))

text_tokens = self.bert_tokenizer.tokenize(text)
iob_slot = list(tokenization.convert_to_unicode('O') * len(text_tokens))

for k, v in slot.items():
slot_tokens = self.bert_tokenizer.tokenize(v)
begin_index = SlotConverter.kmp(text_tokens, slot_tokens)
SlotConverter.tokenize(begin_index, begin_index + len(slot_tokens), k, iob_slot)
assert len(iob_slot) == len(text_tokens)
return text_tokens, iob_slot
if fmt in ['IOB', 'IOBS', 'BMES']:
tag_result = list(tokenization.convert_to_unicode('O') * len(text_tokens))
else:
tag_result = [] # span

for entity_name, entity_type in slot.items():
slot_tokens = self.bert_tokenizer.tokenize(entity_name)
begin_index_list = SlotConverter.kmp(text_tokens, slot_tokens)
for begin_index in begin_index_list:
convert_func(begin_index, begin_index + len(slot_tokens), entity_type, tag_result)
# check the length for IOB, IOBS, BMES format
if fmt in ['IOB', 'IOBS', 'BMES']:
assert len(tag_result) == len(text_tokens)
elif fmt == 'SPAN':
tag_result.sort()
return text_tokens, tag_result


if __name__ == '__main__':
122 changes: 122 additions & 0 deletions bert_slot_tokenizer/bert_tokenizer/tokenization.py
@@ -409,3 +409,125 @@ def _is_punctuation(char):
if cat.startswith("P"):
return True
return False


class SeqWordpieceTokenizer(WordpieceTokenizer):

def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
super().__init__(vocab, unk_token, max_input_chars_per_word)

def tokenize(self, orig_text, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
orig_text: The original (e.g. case-preserved) counterpart of `text`,
whitespace-aligned with it token by token.
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A tuple (original wordpiece tokens, wordpiece tokens), aligned element by element.
"""

text = convert_to_unicode(text)
orig_text = convert_to_unicode(orig_text)

output_tokens = []
orig_output_tokens = []
for orig_token, token in zip(whitespace_tokenize(orig_text), whitespace_tokenize(text)):
chars = list(token)
orig_chars = list(orig_token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
orig_output_tokens.append(orig_token)
continue

is_bad = False
start = 0
sub_tokens = []
orig_sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
orig_cur_substr = None
while start < end:
substr = "".join(chars[start:end])
orig_substr = "".join(orig_chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
orig_cur_substr = orig_substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
orig_sub_tokens.append(orig_cur_substr)
start = end

if is_bad:
output_tokens.append(self.unk_token)
orig_output_tokens.append(orig_token)
else:
output_tokens.extend(sub_tokens)
orig_output_tokens.extend(orig_sub_tokens)
return orig_output_tokens, output_tokens


class SeqBasicTokenizer(BasicTokenizer):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

def __init__(self, do_lower_case=True):
super().__init__(do_lower_case)

def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
orig_split_tokens = []
for token in orig_tokens:
orig_token = token
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
orig_token = self._run_strip_accents(orig_token)
split_tokens.extend(self._run_split_on_punc(token))
orig_split_tokens.extend(self._run_split_on_punc(orig_token))

output_tokens = whitespace_tokenize(" ".join(split_tokens))
orig_output_tokens = whitespace_tokenize(" ".join(orig_split_tokens))
return orig_output_tokens, output_tokens


class SeqFullTokenizer(FullTokenizer):

def __init__(self, vocab_file, do_lower_case=True):
super().__init__(vocab_file, do_lower_case)
self.vocab = load_vocab(vocab_file)
self.basic_tokenizer = SeqBasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = SeqWordpieceTokenizer(vocab=self.vocab)

def tokenize(self, text):
split_tokens = []
orig_split_tokens = []
for orig_token, token in zip(*self.basic_tokenizer.tokenize(text)):
for orig_sub_token, sub_token in zip(*self.wordpiece_tokenizer.tokenize(orig_token, token)):
split_tokens.append(sub_token)
orig_split_tokens.append(orig_sub_token)
return orig_split_tokens, split_tokens
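A minimal usage sketch of the new `Seq*` tokenizers (the vocab path below is the example vocab shipped with the tests; any BERT-style vocab.txt should work), showing that they return the original-cased tokens aligned with the normalized wordpieces:

```python
from bert_slot_tokenizer.bert_tokenizer import tokenization

# assumed vocab path; replace with any BERT-style vocab file
tokenizer = tokenization.SeqFullTokenizer('tests/test_data/example_vocab.txt',
                                          do_lower_case=True)
orig_tokens, tokens = tokenizer.tokenize('Too YOUNG, too simple')
# tokens holds the lower-cased wordpieces, orig_tokens the matching
# original-cased pieces; the two lists stay index-aligned
print(list(zip(orig_tokens, tokens)))
```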
2 changes: 1 addition & 1 deletion bert_slot_tokenizer/main.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__version__ = "0.2.1"
__version__ = "0.3.0"


def main():