Commit: version v0.3
DevRoss committed Nov 17, 2021
1 parent 84a467c commit a4449e5
Showing 6 changed files with 321 additions and 55 deletions.
5 changes: 5 additions & 0 deletions .travis.yml
@@ -3,6 +3,11 @@ python:
- "2.7"
- "3.5"
- "3.6"
- "3.7"
- "3.8"
- "3.9"
- "3.10"

install:
- pip install six
- pip install .
32 changes: 24 additions & 8 deletions README.md
@@ -1,6 +1,6 @@
# bert_slot_tokenizer

Version 0.2
Version 0.3

![Travis (.org)](https://img.shields.io/travis/DevRoss/bert-slot-tokenizer) ![GitHub](https://img.shields.io/github/license/devross/bert-slot-tokenizer)

@@ -19,22 +19,38 @@ pip install bert-slot-tokenizer

## Supported formats:

- [IOB format](https://en.wikipedia.org/wiki/Inside–outside–beginning_(tagging))
- IOB format
- IOBS format
- BMES format
- SPAN format

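A quick hedged comparison (using a hypothetical three-token entity X Y Z of type T; the tag patterns below follow the conversion code in this commit, and the library's real outputs appear in the usage example that follows):

```python
# One hypothetical three-token entity ['X', 'Y', 'Z'] of type 'T':
#   IOB  -> ['B-T', 'I-T', 'I-T']
#   IOBS -> ['B-T', 'I-T', 'I-T']   # 'S-T' is used only for single-token entities
#   BMES -> ['B-T', 'M-T', 'E-T']   # 'S-T' is used for single-token entities
#   SPAN -> [begin_token_index, end_token_index, 'T']   # end index is inclusive
```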
## Usage:

```python
from bert_slot_tokenizer import SlotConverter
vacab_path = 'tests/test_data/example_vocab.txt'
vocab_path = 'tests/test_data/example_vocab.txt'
# you can find an example here --> https://github.com/DevRoss/bert-slot-tokenizer/blob/master/tests/test_data/example_vocab.txt
sc = SlotConverter(vocab_path, do_lower_case=True)
text = 'Too YOUNG, too simple, sometimes naive! 蛤蛤+1s'
slot = {'name': '蛤蛤', 'time': '+1s'}
output_text, iob_slot = sc.convert2iob(text, slot)
text = 'Too YOUNG, too simple, sometimes naive! 蛤蛤+1s蛤蛤蛤嗝'
slot = {'蛤蛤': 'name', '+1s': 'time', '嗝': '语气'}
output_text, iob_slot = sc.convert(text, slot, fmt='IOB')
output_text, iobs_slot = sc.convert(text, slot, fmt='IOBS')
output_text, bmes_slot = sc.convert(text, slot, fmt='BMES')
output_text, span_slot = sc.convert(text, slot, fmt='SPAN')
print(output_text)
# ['too', 'young', ',', 'too', 'simple', ',', 'some', '##times', 'na', '##ive', '!', '蛤', '蛤', '+', '1', '##s']
# ['too', 'young', ',', 'too', 'simple', ',', 'some', '##times', 'na', '##ive', '!', '蛤', '蛤', '+', '1', '##s', '蛤', '蛤', '蛤', '嗝']

print(iob_slot)
# ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-name', 'I-name', 'B-time', 'I-time', 'I-time']
# ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-name', 'I-name', 'B-time', 'I-time', 'I-time', 'B-name', 'I-name', 'O', 'B-语气']

print(iobs_slot)
# ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-name', 'I-name', 'B-time', 'I-time', 'I-time', 'B-name', 'I-name', 'O', 'S-语气']

print(bmes_slot)
# ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-name', 'E-name', 'B-time', 'M-time', 'E-time', 'B-name', 'E-name', 'O', 'S-语气']

print(span_slot)
# [[11, 12, 'name'], [13, 15, 'time'], [16, 17, 'name'], [19, 19, '语气']]
```

## Closing remarks:
136 changes: 112 additions & 24 deletions bert_slot_tokenizer/__init__.py
@@ -4,6 +4,7 @@
import json
import io
from bert_slot_tokenizer.bert_tokenizer import tokenization
import logging


class SlotConverter:
@@ -45,22 +46,27 @@ def parse_json(file)
def kmp(main_str, pattern):
"""
KMP algorithm to find the begin index of the pattern tokens inside the main token sequence.
return: List[int], a list of begin indices, one per match (empty if nothing matches)
"""
nex = SlotConverter.get_next(pattern)
i = 0 # the pointer of main_str
j = 0 # the pointer of pattern
while i < len(main_str) and j < len(pattern):
if j == -1 or main_str[i] == pattern[j]:
results = []
while i < len(main_str):
while i < len(main_str) and j < len(pattern):
if j == -1 or main_str[i] == pattern[j]:
i += 1
j += 1
else:
j = nex[j]

if j == len(pattern): # matched
results.append(i - j)
i += 1
j += 1
j = 0
else:
j = nex[j]

if j == len(pattern): # matched
return i - j
else:
return -1
break
return results

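Because this flattened diff interleaves the removed single-match lines with the added multi-match lines, the new control flow of `kmp` is hard to follow here. Below is a minimal, self-contained sketch of the multi-match logic implied by the added lines and by the README example output (it collects the start index of every non-overlapping occurrence); it is an approximation and not guaranteed to match the committed code line for line:

```python
def get_next(pattern):
    """Standard KMP failure array with nex[0] == -1, matching the 'j == -1' checks above."""
    nex = [-1] * len(pattern)
    i, j = 0, -1
    while i < len(pattern) - 1:
        if j == -1 or pattern[i] == pattern[j]:
            i += 1
            j += 1
            nex[i] = j
        else:
            j = nex[j]
    return nex


def kmp(main_str, pattern):
    """Return the start index of every non-overlapping occurrence of pattern in main_str."""
    nex = get_next(pattern)
    i = 0  # pointer into main_str
    j = 0  # pointer into pattern
    results = []
    while i < len(main_str):
        # standard KMP scan for the next occurrence
        while i < len(main_str) and j < len(pattern):
            if j == -1 or main_str[i] == pattern[j]:
                i += 1
                j += 1
            else:
                j = nex[j]
        if j == len(pattern):   # matched: record the start and keep scanning
            results.append(i - j)
            j = 0
        else:                   # main_str exhausted without another match
            break
    return results


# e.g. kmp(list('abcabcab'), list('abc')) -> [0, 3]
```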
@staticmethod
def get_next(pattern):
@@ -81,13 +87,13 @@ def get_next(pattern):
return nex

@classmethod
def tokenize(cls, begin_index, end_index, slot_key, ret_slot):
def fill_iob(cls, begin_index, end_index, slot_type, ret_slot):
"""
Convert to IOB format tags given the slot's begin/end token indices.
This works in place: ret_slot is modified by calling this function.
:param begin_index: slot begin index, where the slot begins.
:param end_index: slot end index, where the slot ends.
:param slot_key: the label of the slot, such as 'AppName'
:param end_index: slot end index, where the slot ends (exclusive).
:param slot_type: the label of the slot, such as 'AppName'
:param ret_slot: the tag list that is filled in place
:return:
"""
@@ -97,26 +103,108 @@ def tokenize(cls, begin_index, end_index, slot_key, ret_slot):
if ret_slot[i] != unicode_O:
break
if i == begin_index:
ret_slot[i] = tokenization.convert_to_unicode('B-' + slot_key)
ret_slot[i] = tokenization.convert_to_unicode('B-' + slot_type)
else:
ret_slot[i] = tokenization.convert_to_unicode('I-' + slot_key)
ret_slot[i] = tokenization.convert_to_unicode('I-' + slot_type)
@classmethod
def fill_iobs(cls, begin_index, end_index, slot_type, ret_slot):
"""
Convert to IOBS format tags given the slot's begin/end token indices
(S- is used when the slot covers a single token).
This works in place: ret_slot is modified by calling this function.
:param begin_index: slot begin index, where the slot begins.
:param end_index: slot end index, where the slot ends (exclusive).
:param slot_type: the label of the slot, such as 'AppName'
:param ret_slot: the tag list that is filled in place
:return:
"""
# operates on token indices in [begin_index, end_index)
unicode_O = tokenization.convert_to_unicode('O')
for i in range(begin_index, end_index):
if ret_slot[i] != unicode_O:
break
if end_index - begin_index == 1:
ret_slot[i] = tokenization.convert_to_unicode('S-' + slot_type)
continue
if i == begin_index:
ret_slot[i] = tokenization.convert_to_unicode('B-' + slot_type)
else:
ret_slot[i] = tokenization.convert_to_unicode('I-' + slot_type)

def convert2iob(self, text, slot):
@classmethod
def fill_bmes(cls, begin_index, end_index, slot_type, ret_slot):
"""
Convert to BMES format tags given the slot's begin/end token indices
(B-/M-/E- for multi-token slots, S- for a single-token slot).
This works in place: ret_slot is modified by calling this function.
:param begin_index: slot begin index, where the slot begins.
:param end_index: slot end index, where the slot ends (exclusive).
:param slot_type: the label of the slot, such as 'AppName'
:param ret_slot: the tag list that is filled in place
:return:
"""
# operates on token indices in [begin_index, end_index)
unicode_O = tokenization.convert_to_unicode('O')
for i in range(begin_index, end_index):
if ret_slot[i] != unicode_O:
break
if end_index - begin_index == 1:
ret_slot[i] = tokenization.convert_to_unicode('S-' + slot_type)
continue
if i == begin_index:
ret_slot[i] = tokenization.convert_to_unicode('B-' + slot_type)
elif i == end_index - 1:
ret_slot[i] = tokenization.convert_to_unicode('E-' + slot_type)
else:
ret_slot[i] = tokenization.convert_to_unicode('M-' + slot_type)

@classmethod
def tag_span(cls, begin_index, end_index, slot_type, ret_slot):
"""
Record a SPAN format tag given the slot's begin/end token indices.
This works in place: ret_slot is extended by calling this function.
:param begin_index: slot begin index, where the slot begins.
:param end_index: slot end index, where the slot ends (exclusive).
:param slot_type: the label of the slot, such as 'AppName'
:param ret_slot: a list of [begin_index, end_index - 1, slot_type] entries
:return:
"""
# operates on token indices in [begin_index, end_index)
ret_slot.append([begin_index, end_index - 1, slot_type])

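To make the in-place contract shared by these helpers concrete, here is a small hedged sketch (the indices and the 'name' label are invented for illustration) showing how `fill_bmes` rewrites a tag list; `fill_iob`, `fill_iobs` and `tag_span` behave analogously with their own output shapes:

```python
from bert_slot_tokenizer import SlotConverter

tags = ['O'] * 5
# tag tokens 1..3 (end index exclusive) as one three-token 'name' entity
SlotConverter.fill_bmes(1, 4, 'name', tags)
print(tags)
# ['O', 'B-name', 'M-name', 'E-name', 'O']
```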
def convert(self, text, slot, fmt='IOB'):
"""
convert a dict slot to the tagging format selected by fmt
:param text: text with type str
:param slot: slot with type dict, mapping entity text to entity type
:param fmt: one of 'IOB', 'IOBS', 'BMES', 'SPAN'
:return: a tuple (text_tokens, tag_result)
"""
if fmt == 'IOB':
convert_func = SlotConverter.fill_iob
elif fmt == 'IOBS':
convert_func = SlotConverter.fill_iobs
elif fmt == 'BMES':
convert_func = SlotConverter.fill_bmes
elif fmt == 'SPAN':
convert_func = SlotConverter.tag_span
else:
raise NotImplementedError('Unsupported format {fmt}, please use one of [IOB, IOBS, BMES, SPAN]'.format(fmt=fmt))

text_tokens = self.bert_tokenizer.tokenize(text)
iob_slot = list(tokenization.convert_to_unicode('O') * len(text_tokens))

for k, v in slot.items():
slot_tokens = self.bert_tokenizer.tokenize(v)
begin_index = SlotConverter.kmp(text_tokens, slot_tokens)
SlotConverter.tokenize(begin_index, begin_index + len(slot_tokens), k, iob_slot)
assert len(iob_slot) == len(text_tokens)
return text_tokens, iob_slot
if fmt in ['IOB', 'IOBS', 'BMES']:
tag_result = list(tokenization.convert_to_unicode('O') * len(text_tokens))
else:
tag_result = [] # span

for entity_name, entity_type in slot.items():
slot_tokens = self.bert_tokenizer.tokenize(entity_name)
begin_index_list = SlotConverter.kmp(text_tokens, slot_tokens)
for begin_index in begin_index_list:
convert_func(begin_index, begin_index + len(slot_tokens), entity_type, tag_result)
# check the length for IOB, IOBS, BMES format
if fmt in ['IOB', 'IOBS', 'BMES']:
assert len(tag_result) == len(text_tokens)
elif fmt == 'SPAN':
tag_result.sort()
return text_tokens, tag_result


if __name__ == '__main__':
122 changes: 122 additions & 0 deletions bert_slot_tokenizer/bert_tokenizer/tokenization.py
@@ -409,3 +409,125 @@ def _is_punctuation(char):
if cat.startswith("P"):
return True
return False


class SeqWordpieceTokenizer(WordpieceTokenizer):

def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
super().__init__(vocab, unk_token, max_input_chars_per_word)

def tokenize(self, orig_text, text):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
orig_text: The original (e.g. case-preserved) counterpart of `text`,
whitespace-aligned with it token by token.
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.
Returns:
A tuple (original wordpiece tokens, wordpiece tokens), aligned element by element.
"""

text = convert_to_unicode(text)
orig_text = convert_to_unicode(orig_text)

output_tokens = []
orig_output_tokens = []
for orig_token, token in zip(whitespace_tokenize(orig_text), whitespace_tokenize(text)):
chars = list(token)
orig_chars = list(orig_token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
orig_output_tokens.append(orig_token)
continue

is_bad = False
start = 0
sub_tokens = []
orig_sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
orig_cur_substr = None
while start < end:
substr = "".join(chars[start:end])
orig_substr = "".join(orig_chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
orig_cur_substr = orig_substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
orig_sub_tokens.append(orig_cur_substr)
start = end

if is_bad:
output_tokens.append(self.unk_token)
orig_output_tokens.append(orig_token)
else:
output_tokens.extend(sub_tokens)
orig_output_tokens.extend(orig_sub_tokens)
return orig_output_tokens, output_tokens


class SeqBasicTokenizer(BasicTokenizer):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

def __init__(self, do_lower_case=True):
super().__init__(do_lower_case)

def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)
orig_tokens = whitespace_tokenize(text)
split_tokens = []
orig_split_tokens = []
for token in orig_tokens:
orig_token = token
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
orig_token = self._run_strip_accents(orig_token)
split_tokens.extend(self._run_split_on_punc(token))
orig_split_tokens.extend(self._run_split_on_punc(orig_token))

output_tokens = whitespace_tokenize(" ".join(split_tokens))
orig_output_tokens = whitespace_tokenize(" ".join(orig_split_tokens))
return orig_output_tokens, output_tokens


class SeqFullTokenizer(FullTokenizer):

def __init__(self, vocab_file, do_lower_case=True):
super().__init__(vocab_file, do_lower_case)
self.vocab = load_vocab(vocab_file)
self.basic_tokenizer = SeqBasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = SeqWordpieceTokenizer(vocab=self.vocab)

def tokenize(self, text):
split_tokens = []
orig_split_tokens = []
for orig_token, token in zip(*self.basic_tokenizer.tokenize(text)):
for orig_sub_token, sub_token in zip(*self.wordpiece_tokenizer.tokenize(orig_token, token)):
split_tokens.append(sub_token)
orig_split_tokens.append(orig_sub_token)
return orig_split_tokens, split_tokens
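A minimal usage sketch of the new `Seq*` tokenizers (the vocab path below is the example vocab shipped with the tests; any BERT-style vocab.txt should work), showing that they return the original-cased tokens aligned with the normalized wordpieces:

```python
from bert_slot_tokenizer.bert_tokenizer import tokenization

# assumed vocab path; replace with any BERT-style vocab file
tokenizer = tokenization.SeqFullTokenizer('tests/test_data/example_vocab.txt',
                                          do_lower_case=True)
orig_tokens, tokens = tokenizer.tokenize('Too YOUNG, too simple')
# tokens holds the lower-cased wordpieces, orig_tokens the matching
# original-cased pieces; the two lists stay index-aligned
print(list(zip(orig_tokens, tokens)))
```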
2 changes: 1 addition & 1 deletion bert_slot_tokenizer/main.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__version__ = "0.2.1"
__version__ = "0.3.0"


def main():