update

FDgithub123 · May 13, 2018 · 6c88b6f · 6c88b6f
1 parent 3d6cdc4
commit 6c88b6f
Show file tree

Hide file tree

Showing 32 changed files with 46,370 additions and 0 deletions.
diff --git a/INSTALL.md b/INSTALL.md
@@ -0,0 +1,26 @@
+
+## Install dependencies
+
+#### python3
+Install or upgrade to Python 3.
+
+#### Install the Chinese version of Rasa NLU
+```
+git clone https://github.com/crownpku/Rasa_NLU_Chi.git
+cd Rasa_NLU_Chi
+pip install -r requirements.txt
+python setup.py install
+```
+
+#### install sklearn and MITIE
+
+```
+pip install -U scikit-learn sklearn-crfsuite
+pip install git+https://github.com/mit-nlp/MITIE.git
+```
+
+#### install rasa_core
+
+```
+pip install rasa_core
+```
diff --git a/__init__.py b/__init__.py
diff --git a/bot.py b/bot.py
@@ -0,0 +1,170 @@
+# -*- coding: UTF-8 -*-
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import logging
+import warnings
+
+from rasa_core.actions import Action
+from rasa_core.agent import Agent
+from rasa_core.channels.console import ConsoleInputChannel
+from rasa_core.events import SlotSet
+from rasa_core.interpreter import RasaNLUInterpreter
+from rasa_core.policies.keras_policy import KerasPolicy
+from rasa_core.policies.memoization import MemoizationPolicy
+
+logger = logging.getLogger(__name__)
+
+support_search = ["消费", "流量"]
+
+
+def extract_item(item):
+    """
+    Map a raw "item" slot value onto one of the supported query types.
+
+    A containment check is used rather than equality, so a value with extra
+    characters around the keyword still matches. This func exists only to
+    compensate for the lack of train data.
+
+    :param item: item in track, eg: "流量"、"查流量"; may be None
+    :return: the first entry of ``support_search`` contained in ``item``,
+        or None when ``item`` is None or nothing matches
+    """
+    if item is not None:
+        matches = (keyword for keyword in support_search if keyword in item)
+        return next(matches, None)
+    return None
+
+
+class ActionSearchConsume(Action):
+    """Custom action that replies to a consumption or data-usage query."""
+
+    def name(self):
+        """Return the identifier string of this action."""
+        return 'action_search_consume'
+
+    def run(self, dispatcher, tracker, domain):
+        """
+        Reply according to the "item" and "time" slots of the tracker.
+
+        When the item is unsupported, or the time slot is empty, a prompt is
+        sent instead of a result. The result itself is a canned sentence.
+
+        :param dispatcher: used to send messages via ``utter_message``
+        :param tracker: conversation state; slots "item" and "time" are read
+        :param domain: not used by this action
+        :return: always an empty list (no events)
+        """
+        query_item = extract_item(tracker.get_slot("item"))
+        if query_item is None:
+            for hint in ("您好，我现在只会查话费和流量",
+                         "你可以这样问我：“帮我查话费”"):
+                dispatcher.utter_message(hint)
+            return []
+
+        period = tracker.get_slot("time")
+        if period is None:
+            dispatcher.utter_message("您想查询哪个月的消费？")
+            return []
+
+        # A real implementation would query a database here, keyed by item
+        # and time (the time format may need normalizing first).
+        dispatcher.utter_message("好，请稍等")
+        if query_item == "流量":
+            reply = "您好，您{}共使用{}二百八十兆，剩余三十兆。".format(period, query_item)
+        else:
+            reply = "您好，您{}共消费二十八元。".format(period)
+        dispatcher.utter_message(reply)
+        return []
+
+
+class MobilePolicy(KerasPolicy):
+    """Keras policy that uses a small single-layer LSTM network."""
+
+    def model_architecture(self, num_features, num_actions, max_history_len):
+        """Build a Keras model and return a compiled model.
+
+        :param num_features: last dimension of the model's input batch shape
+        :param num_actions: number of units of the softmax output layer
+        :param max_history_len: time-step dimension of the input batch shape
+        :return: the compiled ``Sequential`` model
+        """
+        from keras.layers import LSTM, Activation, Masking, Dense
+        from keras.models import Sequential
+
+        n_hidden = 32  # size of hidden layer in LSTM
+        # Build Model
+        batch_shape = (None, max_history_len, num_features)
+
+        model = Sequential()
+        # Time steps whose value is -1 are masked out
+        # (presumably padding of short histories - confirm with the featurizer)
+        model.add(Masking(-1, batch_input_shape=batch_shape))
+        model.add(LSTM(n_hidden, batch_input_shape=batch_shape))
+        # ``units`` is the Keras 2 keyword; the input size is inferred from
+        # the preceding LSTM layer, so the legacy input_dim/output_dim pair
+        # is not needed.
+        model.add(Dense(units=num_actions))
+        model.add(Activation("softmax"))
+
+        model.compile(loss="categorical_crossentropy",
+                      optimizer="adam",
+                      metrics=["accuracy"])
+
+        # summary() prints and returns None, so logging its return value only
+        # logged "None"; route the summary lines through the logger instead.
+        model.summary(print_fn=logger.debug)
+        return model
+
+
+def train_dialogue(domain_file="mobile_domain.yml",
+                   model_path="models/dialogue",
+                   training_data_file="data/mobile_story.md"):
+    """
+    Train the dialogue model from stories and persist it.
+
+    :param domain_file: path of the domain definition passed to ``Agent``
+    :param model_path: directory the trained agent is persisted to
+    :param training_data_file: path of the story file used for training
+    :return: the trained ``Agent``
+    """
+    policies = [MemoizationPolicy(), MobilePolicy()]
+    agent = Agent(domain_file, policies=policies)
+
+    # Training hyper-parameters, forwarded unchanged to Agent.train
+    training_options = dict(
+        max_history=2,
+        epochs=200,
+        batch_size=16,
+        augmentation_factor=50,
+        validation_split=0.2,
+    )
+    agent.train(training_data_file, **training_options)
+
+    agent.persist(model_path)
+    return agent
+
+
+def train_nlu():
+    """
+    Train the NLU model and persist it under ``models/``.
+
+    Training data is read from "data/mobile_nlu_data.json" and the pipeline
+    configuration from "mobile_nlu_model_config.json".
+
+    :return: the value returned by ``Trainer.persist`` (the model directory)
+    """
+    from rasa_nlu.converters import load_data
+    from rasa_nlu.config import RasaNLUConfig
+    from rasa_nlu.model import Trainer
+
+    examples = load_data("data/mobile_nlu_data.json")
+    nlu_trainer = Trainer(RasaNLUConfig("mobile_nlu_model_config.json"))
+    nlu_trainer.train(examples)
+
+    # Persisted with project "ivr" and fixed model name "demo"
+    return nlu_trainer.persist("models/", project_name="ivr", fixed_model_name="demo")
+
+
+def run_ivrbot_online(input_channel=None,
+                      interpreter=None,
+                      domain_file="mobile_domain.yml",
+                      training_data_file="data/mobile_story.md"):
+    """
+    Train the dialogue model interactively via ``Agent.train_online``.
+
+    :param input_channel: channel used for the online training session;
+        a new ``ConsoleInputChannel`` when None
+    :param interpreter: NLU interpreter given to the agent;
+        ``RasaNLUInterpreter("models/ivr/demo")`` when None
+    :param domain_file: path of the domain definition passed to ``Agent``
+    :param training_data_file: path of the story file used for training
+    :return: the ``Agent`` after online training
+    """
+    # Build the defaults lazily. As default-argument expressions they were
+    # evaluated once, when this module was imported, so the interpreter for
+    # "models/ivr/demo" was constructed for every task - including
+    # "train-nlu", which presumably runs before that model exists.
+    if input_channel is None:
+        input_channel = ConsoleInputChannel()
+    if interpreter is None:
+        interpreter = RasaNLUInterpreter("models/ivr/demo")
+
+    # NOTE(review): this uses the stock KerasPolicy, not MobilePolicy as in
+    # train_dialogue - confirm that this is intentional.
+    agent = Agent(domain_file,
+                  policies=[MemoizationPolicy(), KerasPolicy()],
+                  interpreter=interpreter)
+
+    agent.train_online(training_data_file,
+                       input_channel=input_channel,
+                       max_history=2,
+                       batch_size=50,
+                       epochs=200,
+                       max_training_samples=300)
+
+    return agent
+
+
+def run(serve_forever=True):
+    """
+    Load the persisted dialogue model and optionally serve it on the console.
+
+    :param serve_forever: when true, handle a ``ConsoleInputChannel`` before
+        returning; when false, only load and return the agent
+    :return: the loaded ``Agent``
+    """
+    nlu_interpreter = RasaNLUInterpreter("models/ivr/demo")
+    agent = Agent.load("models/dialogue", interpreter=nlu_interpreter)
+
+    if not serve_forever:
+        return agent
+
+    agent.handle_channel(ConsoleInputChannel())
+    return agent
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level="INFO")
+
+    # Single table of task name -> handler. The argparse choices are derived
+    # from it, so the accepted names and the dispatch cannot drift apart.
+    tasks = (
+        ("train-nlu", train_nlu),
+        ("train-dialogue", train_dialogue),
+        ("run", run),
+        ("online_train", run_ivrbot_online),
+    )
+
+    parser = argparse.ArgumentParser(
+        description="starts the bot")
+
+    parser.add_argument(
+        "task",
+        choices=[task_name for task_name, _ in tasks],
+        help="what the bot should do - e.g. run or train?")
+    task = parser.parse_args().task
+
+    # decide what to do based on first parameter of the script.
+    # argparse has already rejected (with a usage error) any value outside
+    # ``choices``, so no fallback branch is needed here.
+    dict(tasks)[task]()
diff --git a/chat_detection/corpus.py b/chat_detection/corpus.py
@@ -0,0 +1,117 @@
+import os
+
+
+# Presumably the maximum number of characters allowed in one dialog
+# statement; it is not referenced by any code visible in this file -
+# TODO confirm before relying on it.
+DIALOG_MAXIMUM_CHARACTER_LENGTH = 400
+
+
+class CorpusObject(list):
+    """
+    A list subclass that can carry extra attributes.
+
+    It is used for the collections of data returned by the corpus
+    reader, and adds a ``categories`` attribute to the plain list.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """
+        Behave like ``list``: an optional first value seeds the content.
+
+        Any further positional arguments and all keyword arguments are
+        accepted but ignored.
+        """
+        # args[:1] is empty or holds just the seed value
+        seed = args[:1]
+        super(CorpusObject, self).__init__(*seed)
+
+        self.categories = []
+
+
+class Corpus(object):
+    """
+    Locate corpus data files and load the conversations stored in them.
+    """
+
+    def __init__(self):
+        # Default corpus root: the "data" directory next to this module
+        current_directory = os.path.dirname(os.path.abspath(__file__))
+        self.data_directory = os.path.join(current_directory, 'data')
+
+    def get_file_path(self, dotted_path, extension='json'):
+        """
+        Reads a dotted file path and returns the file path.
+
+        A value that already contains a path separator is returned as is.
+        Otherwise the dots are turned into path separators; when the first
+        component is 'chatterbot' it is dropped and the component after it
+        is replaced by ``self.data_directory``. The extension is appended
+        only if the resulting file exists.
+
+        :param dotted_path: dotted corpus name or a regular file path
+        :param extension: file extension (without dot) to try appending
+        :return: the resolved path
+        """
+
+        # If the operating system's file path separator character is in the string
+        if os.sep in dotted_path or '/' in dotted_path:
+            # Assume the path is a valid file path
+            return dotted_path
+
+        parts = dotted_path.split('.')
+        if parts[0] == 'chatterbot':
+            parts.pop(0)
+            parts[0] = self.data_directory
+
+        corpus_path = os.path.join(*parts)
+
+        if os.path.exists(corpus_path + '.{}'.format(extension)):
+            corpus_path += '.{}'.format(extension)
+
+        return corpus_path
+
+    def read_corpus(self, file_name):
+        """
+        Read and return the data from a corpus YAML file.
+
+        :param file_name: path of the UTF-8 encoded YAML file
+        :return: the parsed document (None for an empty file)
+        """
+        import io
+        import yaml
+
+        with io.open(file_name, encoding='utf-8') as data_file:
+            # safe_load instead of load: yaml.load without an explicit Loader
+            # can construct arbitrary Python objects and is rejected by
+            # recent PyYAML releases. Corpus files hold plain data only.
+            data = yaml.safe_load(data_file)
+        return data
+
+    def list_corpus_files(self, dotted_path):
+        """
+        Return a list of file paths to each data file in
+        the specified corpus.
+
+        A directory is walked recursively and every ``.yml`` file in it is
+        listed; anything else is returned as a single-entry list.
+
+        :param dotted_path: dotted corpus name or a regular path
+        :return: sorted list of file paths
+        """
+        CORPUS_EXTENSION = 'yml'
+        # Compare against ".yml" so that names which merely end in the
+        # letters "yml" are not mistaken for corpus files
+        suffix = '.' + CORPUS_EXTENSION
+
+        corpus_path = self.get_file_path(dotted_path, extension=CORPUS_EXTENSION)
+        paths = []
+
+        if os.path.isdir(corpus_path):
+            for dirname, _, filenames in os.walk(corpus_path):
+                for datafile in filenames:
+                    if datafile.endswith(suffix):
+                        paths.append(os.path.join(dirname, datafile))
+        else:
+            paths.append(corpus_path)
+
+        paths.sort()
+        return paths
+
+    def load_corpus(self, dotted_path):
+        """
+        Return the data contained within a specified corpus.
+
+        :param dotted_path: dotted corpus name or a regular path
+        :return: list with one ``CorpusObject`` per data file, holding the
+            file's 'conversations' and exposing its 'categories'
+        """
+        data_file_paths = self.list_corpus_files(dotted_path)
+
+        corpora = []
+
+        for file_path in data_file_paths:
+            corpus = CorpusObject()
+            # An empty YAML file parses to None; treat it as an empty corpus
+            corpus_data = self.read_corpus(file_path) or {}
+
+            conversations = corpus_data.get('conversations', [])
+            corpus.categories = corpus_data.get('categories', [])
+            corpus.extend(conversations)
+
+            corpora.append(corpus)
+
+        return corpora
+
+# Export every utterance of the Chinese dialog corpus as a "non_task" sample.
+# NOTE: this runs at import time; the paths are relative to the current
+# working directory.
+c = Corpus()
+res = c.load_corpus("../data/chinese_dialogs")
+texts = []  # not used by the visible code
+# Explicit UTF-8: the corpus text is Chinese, and the platform default
+# encoding is not guaranteed to be able to encode it.
+with open("../data/chat_detection_non_task.txt", "w", encoding="utf-8") as f:
+    # NOTE(review): the header is comma-separated while the rows below use
+    # "|" - confirm which delimiter the consumer of this file expects.
+    f.write("text,intent\n")
+    for corpus in res:  # one CorpusObject per corpus file
+        for dialog in corpus:
+            for text in dialog:
+                f.write(text+"|non_task\n")
+
+
+