From df45607e49fd4fd0fc701f794c3f5edbc819ea08 Mon Sep 17 00:00:00 2001
From: Exajobs <95390472+exajobs@users.noreply.github.com>
Date: Thu, 16 Dec 2021 07:16:49 -0500
Subject: [PATCH] Add files via upload

---
 machine-learning-scripts/LICENSE                  |  21 ++
 machine-learning-scripts/README.md                |   3 +
 machine-learning-scripts/requirements.txt         |   5 +
 machine-learning-scripts/valohai/dvc_urls.txt     |   4 +
 .../valohai/keras-20ng-cnn.py                     | 228 +++++++++++++
 .../valohai/keras-20ng-rnn.py                     | 221 +++++++++++++
 .../valohai/keras-dvc-cnn-predict.py              |  81 +++++
 .../valohai/keras-dvc-cnn-pretrained.py           | 245 ++++++++++++++
 .../valohai/keras-dvc-cnn-simple.py               | 206 ++++++++++++
 .../valohai/keras-sfnet-cnn.py                    | 299 ++++++++++++++++++
 .../valohai/keras-sfnet-lstm.py                   | 284 +++++++++++++++++
 .../valohai/prediction_server.py                  |  80 +++++
 .../valohai/prediction_server_text.py             | 111 +++++++
 .../valohai/pytorch_dvc_cnn.py                    | 171 ++++++++++
 .../valohai/pytorch_dvc_cnn_simple.py             |  99 ++++++
 .../valohai/test_prediction_server.py             |  38 +++
 .../valohai/test_prediction_server_text.py        |  26 ++
 17 files changed, 2122 insertions(+)
 create mode 100644 machine-learning-scripts/LICENSE
 create mode 100644 machine-learning-scripts/README.md
 create mode 100644 machine-learning-scripts/requirements.txt
 create mode 100644 machine-learning-scripts/valohai/dvc_urls.txt
 create mode 100644 machine-learning-scripts/valohai/keras-20ng-cnn.py
 create mode 100644 machine-learning-scripts/valohai/keras-20ng-rnn.py
 create mode 100644 machine-learning-scripts/valohai/keras-dvc-cnn-predict.py
 create mode 100644 machine-learning-scripts/valohai/keras-dvc-cnn-pretrained.py
 create mode 100644 machine-learning-scripts/valohai/keras-dvc-cnn-simple.py
 create mode 100644 machine-learning-scripts/valohai/keras-sfnet-cnn.py
 create mode 100644 machine-learning-scripts/valohai/keras-sfnet-lstm.py
 create mode 100644 machine-learning-scripts/valohai/prediction_server.py
 create mode 100644 machine-learning-scripts/valohai/prediction_server_text.py
 create mode 100644 machine-learning-scripts/valohai/pytorch_dvc_cnn.py
 create mode 100644 machine-learning-scripts/valohai/pytorch_dvc_cnn_simple.py
 create mode 100644 machine-learning-scripts/valohai/test_prediction_server.py
 create mode 100644 machine-learning-scripts/valohai/test_prediction_server_text.py

diff --git a/machine-learning-scripts/LICENSE b/machine-learning-scripts/LICENSE
new file mode 100644
index 0000000..d90cc61
--- /dev/null
+++ b/machine-learning-scripts/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 CSC - IT Center for Science Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

diff --git a/machine-learning-scripts/README.md b/machine-learning-scripts/README.md
new file mode 100644
index 0000000..1a8d888
--- /dev/null
+++ b/machine-learning-scripts/README.md
@@ -0,0 +1,3 @@
+# machine-learning-scripts
+
+This repository is a miscellaneous collection of scripts and tools related to machine learning. It currently consists of Jupyter notebooks in the `notebooks` subdirectory and example SLURM scripts in `slurm`. Materials related to specific courses can be found in the `courses` subdirectory.

diff --git a/machine-learning-scripts/requirements.txt b/machine-learning-scripts/requirements.txt
new file mode 100644
index 0000000..5c1b004
--- /dev/null
+++ b/machine-learning-scripts/requirements.txt
@@ -0,0 +1,5 @@
+# dependencies for Valohai inference deployment
+pillow>=6.2.0
+requests==2.21.0
+scikit-image==0.14.2
+werkzeug>=0.15.3

diff --git a/machine-learning-scripts/valohai/dvc_urls.txt b/machine-learning-scripts/valohai/dvc_urls.txt
new file mode 100644
index 0000000..51aece9
--- /dev/null
+++ b/machine-learning-scripts/valohai/dvc_urls.txt
@@ -0,0 +1,4 @@
+https://www.catster.com/wp-content/uploads/2017/08/A-fluffy-cat-looking-funny-surprised-or-concerned.jpg
+https://i.ytimg.com/vi/lrvqjdMcjjQ/hqdefault.jpg
+https://dynaimage.cdn.cnn.com/cnn/w_768,h_1024,c_scale/https%3A%2F%2Fdynaimage.cdn.cnn.com%2Fcnn%2Fx_1229%2Cy_0%2Cw_2712%2Ch_3616%2Cc_crop%2Fhttps%253A%252F%252Fstamp.static.cnn.io%252F5b7ac48b4db3d70020c01c13%252Fshutterstock_1081879181.jpg
+https://static-cdn.jtvnw.net/jtv_user_pictures/dogdog-profile_image-5550ade194780dfc-300x300.jpeg

diff --git a/machine-learning-scripts/valohai/keras-20ng-cnn.py b/machine-learning-scripts/valohai/keras-20ng-cnn.py
new file mode 100644
index 0000000..2a07df7
--- /dev/null
+++ b/machine-learning-scripts/valohai/keras-20ng-cnn.py
@@ -0,0 +1,228 @@
+
+# coding: utf-8
+
+# # 20 Newsgroups text classification with pre-trained word embeddings
+#
+# In this script, we'll use pre-trained [GloVe word embeddings]
+# (http://nlp.stanford.edu/projects/glove/) for text classification
+# using Keras (version $\ge$ 2 is required). This script is largely
+# based on the blog post [Using pre-trained word embeddings in a Keras
+# model]
+# (https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html)
+# by François Chollet.
+#
+# **Note that using a GPU with this script is highly recommended.**
+#
+# First, the needed imports. Keras tells us which backend (Theano,
+# Tensorflow, CNTK) it will be using.
+ +from keras.preprocessing import sequence, text +from keras.models import Sequential +from keras.layers import Dense, Dropout +from keras.layers import Embedding +from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D +from keras.layers import LSTM, CuDNNLSTM +from keras.utils import to_categorical +from keras.callbacks import LambdaCallback + +from distutils.version import LooseVersion as LV +from keras import __version__ +from keras import backend as K + +from sklearn.model_selection import train_test_split + +import argparse +import json + +import os +import sys + +import numpy as np + +print('Using Keras version:', __version__, 'backend:', K.backend()) +assert(LV(__version__) >= LV("2.0.0")) + +def main(settings): + + # ## GloVe word embeddings + # + # Let's begin by loading a datafile containing pre-trained word + # embeddings. The datafile contains 100-dimensional embeddings for + # 400,000 English words. + + GLOVE_DIR = "/valohai/inputs/dataset/" + + print('Indexing word vectors.') + + embeddings_index = {} + with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f: + for line in f: + values = line.split() + word = values[0] + coefs = np.asarray(values[1:], dtype='float32') + embeddings_index[word] = coefs + + print('Found %s word vectors.' % len(embeddings_index)) + + # ## 20 Newsgroups data set + # + # Next we'll load the [20 Newsgroups] + # (http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html) + # data set. + # + # The dataset contains 20000 messages collected from 20 different + # Usenet newsgroups (1000 messages from each group): + # + # alt.atheism | soc.religion.christian | comp.windows.x | sci.crypt + # talk.politics.guns | comp.sys.ibm.pc.hardware | rec.autos | sci.electronics + # talk.politics.mideast | comp.graphics | rec.motorcycles | sci.space + # talk.politics.misc | comp.os.ms-windows.misc | rec.sport.baseball | sci.med + # talk.religion.misc | comp.sys.mac.hardware | rec.sport.hockey | misc.forsale + + TEXT_DATA_DIR = "/valohai/inputs/dataset/20_newsgroup" + + print('Processing text dataset') + + texts = [] # list of text samples + labels_index = {} # dictionary mapping label name to numeric id + labels = [] # list of label ids + for name in sorted(os.listdir(TEXT_DATA_DIR)): + path = os.path.join(TEXT_DATA_DIR, name) + if os.path.isdir(path): + label_id = len(labels_index) + labels_index[name] = label_id + for fname in sorted(os.listdir(path)): + if fname.isdigit(): + fpath = os.path.join(path, fname) + args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'} + with open(fpath, **args) as f: + t = f.read() + i = t.find('\n\n') # skip header + if 0 < i: + t = t[i:] + texts.append(t) + labels.append(label_id) + + print('Found %s texts.' % len(texts)) + + # Vectorize the text samples into a 2D integer tensor. + + MAX_NUM_WORDS = 10000 + MAX_SEQUENCE_LENGTH = 1000 + + tokenizer = text.Tokenizer(num_words=MAX_NUM_WORDS) + tokenizer.fit_on_texts(texts) + sequences = tokenizer.texts_to_sequences(texts) + + word_index = tokenizer.word_index + print('Found %s unique tokens.' 
% len(word_index)) + + data = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) + + labels = to_categorical(np.asarray(labels)) + print('Shape of data tensor:', data.shape) + print('Shape of label tensor:', labels.shape) + + # Split the data into a training set and a validation set + + VALIDATION_SET, TEST_SET = 1000, 4000 + + x_train, x_test, y_train, y_test = train_test_split(data, labels, + test_size=TEST_SET, + shuffle=True, + random_state=42) + + x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, + test_size=VALIDATION_SET, + shuffle=False) + + print('Shape of training data tensor:', x_train.shape) + print('Shape of training label tensor:', y_train.shape) + print('Shape of validation data tensor:', x_val.shape) + print('Shape of validation label tensor:', y_val.shape) + print('Shape of test data tensor:', x_test.shape) + print('Shape of test label tensor:', y_test.shape) + + # Prepare the embedding matrix: + + print('Preparing embedding matrix.') + + num_words = min(MAX_NUM_WORDS, len(word_index) + 1) + embedding_dim = 100 + + embedding_matrix = np.zeros((num_words, embedding_dim)) + for word, i in word_index.items(): + if i >= MAX_NUM_WORDS: + continue + embedding_vector = embeddings_index.get(word) + if embedding_vector is not None: + # words not found in embedding index will be all-zeros. + embedding_matrix[i] = embedding_vector + + print('Shape of embedding matrix:', embedding_matrix.shape) + + # ### Initialization + + print('Build model...') + model = Sequential() + + model.add(Embedding(num_words, + embedding_dim, + weights=[embedding_matrix], + input_length=MAX_SEQUENCE_LENGTH, + trainable=False)) + #model.add(Dropout(0.2)) + + model.add(Conv1D(128, 5, activation='relu')) + model.add(MaxPooling1D(5)) + model.add(Conv1D(128, 5, activation='relu')) + model.add(MaxPooling1D(5)) + model.add(Conv1D(128, 5, activation='relu')) + model.add(GlobalMaxPooling1D()) + + model.add(Dense(128, activation='relu')) + model.add(Dense(20, activation='softmax')) + + model.compile(loss='categorical_crossentropy', + optimizer='rmsprop', + metrics=['accuracy']) + + print(model.summary()) + + # ### Learning + + epochs = settings.epochs + batch_size=128 + + json_logging_callback = LambdaCallback( + on_epoch_end=lambda epoch, logs: print(json.dumps({ + "epoch": epoch, + "loss": logs["loss"], + "acc": logs["acc"], + "val_loss": logs["val_loss"], + "val_acc": logs["val_acc"], + })), + ) + + history = model.fit(x_train, y_train, + batch_size=batch_size, + epochs=epochs, + validation_data=(x_val, y_val), + verbose=2, + callbacks=[json_logging_callback]) + + model.save('/valohai/outputs/20ng-cnn.h5') + + # ### Inference + + if settings.inference: + print('Evaluating model...') + scores = model.evaluate(x_test, y_test, verbose=2) + print("Test set %s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--epochs', type=int, default=20) + parser.add_argument('--inference', type=int, default=1) + settings = parser.parse_args() + main(settings) diff --git a/machine-learning-scripts/valohai/keras-20ng-rnn.py b/machine-learning-scripts/valohai/keras-20ng-rnn.py new file mode 100644 index 0000000..dbecd29 --- /dev/null +++ b/machine-learning-scripts/valohai/keras-20ng-rnn.py @@ -0,0 +1,221 @@ + +# coding: utf-8 + +# # 20 Newsgroups text classification with pre-trained word embeddings +# +# In this script, we'll use pre-trained [GloVe word embeddings] +# (http://nlp.stanford.edu/projects/glove/) for 
text classification +# using Keras (version $\ge$ 2 is required). This script is largely +# based on the blog post [Using pre-trained word embeddings in a Keras +# model] +# (https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html) +# by François Chollet. +# +# **Note that using a GPU with this script is highly recommended.** +# +# First, the needed imports. Keras tells us which backend (Theano, +# Tensorflow, CNTK) it will be using. + +from keras.preprocessing import sequence, text +from keras.models import Sequential +from keras.layers import Dense, Dropout +from keras.layers import Embedding +from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D +from keras.layers import LSTM, CuDNNLSTM +from keras.utils import to_categorical +from keras.callbacks import LambdaCallback + +from distutils.version import LooseVersion as LV +from keras import __version__ +from keras import backend as K + +from sklearn.model_selection import train_test_split + +import argparse +import json + +import os +import sys + +import numpy as np + +print('Using Keras version:', __version__, 'backend:', K.backend()) +assert(LV(__version__) >= LV("2.0.0")) + +def main(settings): + + # ## GloVe word embeddings + # + # Let's begin by loading a datafile containing pre-trained word + # embeddings. The datafile contains 100-dimensional embeddings for + # 400,000 English words. + + GLOVE_DIR = "/valohai/inputs/dataset/" + + print('Indexing word vectors.') + + embeddings_index = {} + with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f: + for line in f: + values = line.split() + word = values[0] + coefs = np.asarray(values[1:], dtype='float32') + embeddings_index[word] = coefs + + print('Found %s word vectors.' % len(embeddings_index)) + + # ## 20 Newsgroups data set + # + # Next we'll load the [20 Newsgroups] + # (http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html) + # data set. + # + # The dataset contains 20000 messages collected from 20 different + # Usenet newsgroups (1000 messages from each group): + # + # alt.atheism | soc.religion.christian | comp.windows.x | sci.crypt + # talk.politics.guns | comp.sys.ibm.pc.hardware | rec.autos | sci.electronics + # talk.politics.mideast | comp.graphics | rec.motorcycles | sci.space + # talk.politics.misc | comp.os.ms-windows.misc | rec.sport.baseball | sci.med + # talk.religion.misc | comp.sys.mac.hardware | rec.sport.hockey | misc.forsale + + TEXT_DATA_DIR = "/valohai/inputs/dataset/20_newsgroup" + + print('Processing text dataset') + + texts = [] # list of text samples + labels_index = {} # dictionary mapping label name to numeric id + labels = [] # list of label ids + for name in sorted(os.listdir(TEXT_DATA_DIR)): + path = os.path.join(TEXT_DATA_DIR, name) + if os.path.isdir(path): + label_id = len(labels_index) + labels_index[name] = label_id + for fname in sorted(os.listdir(path)): + if fname.isdigit(): + fpath = os.path.join(path, fname) + args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'} + with open(fpath, **args) as f: + t = f.read() + i = t.find('\n\n') # skip header + if 0 < i: + t = t[i:] + texts.append(t) + labels.append(label_id) + + print('Found %s texts.' % len(texts)) + + # Vectorize the text samples into a 2D integer tensor. + + MAX_NUM_WORDS = 10000 + MAX_SEQUENCE_LENGTH = 1000 + + tokenizer = text.Tokenizer(num_words=MAX_NUM_WORDS) + tokenizer.fit_on_texts(texts) + sequences = tokenizer.texts_to_sequences(texts) + + word_index = tokenizer.word_index + print('Found %s unique tokens.' 
% len(word_index)) + + data = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) + + labels = to_categorical(np.asarray(labels)) + print('Shape of data tensor:', data.shape) + print('Shape of label tensor:', labels.shape) + + # Split the data into a training set and a validation set + + VALIDATION_SET, TEST_SET = 1000, 4000 + + x_train, x_test, y_train, y_test = train_test_split(data, labels, + test_size=TEST_SET, + shuffle=True, random_state=42) + + x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, + test_size=VALIDATION_SET, + shuffle=False) + + print('Shape of training data tensor:', x_train.shape) + print('Shape of training label tensor:', y_train.shape) + print('Shape of validation data tensor:', x_val.shape) + print('Shape of validation label tensor:', y_val.shape) + print('Shape of test data tensor:', x_test.shape) + print('Shape of test label tensor:', y_test.shape) + + # Prepare the embedding matrix: + + print('Preparing embedding matrix.') + + num_words = min(MAX_NUM_WORDS, len(word_index) + 1) + embedding_dim = 100 + + embedding_matrix = np.zeros((num_words, embedding_dim)) + for word, i in word_index.items(): + if i >= MAX_NUM_WORDS: + continue + embedding_vector = embeddings_index.get(word) + if embedding_vector is not None: + # words not found in embedding index will be all-zeros. + embedding_matrix[i] = embedding_vector + + print('Shape of embedding matrix:', embedding_matrix.shape) + + # ### Initialization + + print('Build model...') + model = Sequential() + + model.add(Embedding(num_words, + embedding_dim, + weights=[embedding_matrix], + input_length=MAX_SEQUENCE_LENGTH, + trainable=False)) + #model.add(Dropout(0.2)) + + model.add(CuDNNLSTM(128)) + + model.add(Dense(128, activation='relu')) + model.add(Dense(20, activation='softmax')) + + model.compile(loss='categorical_crossentropy', + optimizer='rmsprop', + metrics=['accuracy']) + + print(model.summary()) + + # ### Learning + + epochs = settings.epochs + batch_size=128 + + json_logging_callback = LambdaCallback( + on_epoch_end=lambda epoch, logs: print(json.dumps({ + "epoch": epoch, + "loss": logs["loss"], + "acc": logs["acc"], + "val_loss": logs["val_loss"], + "val_acc": logs["val_acc"], + })), + ) + + history = model.fit(x_train, y_train, + batch_size=batch_size, + epochs=epochs, + validation_data=(x_val, y_val), + verbose=2, callbacks=[json_logging_callback]) + + model.save('/valohai/outputs/20ng-rnn.h5') + + # ### Inference + + if settings.inference: + print('Evaluating model...') + scores = model.evaluate(x_test, y_test, verbose=2) + print("Test set %s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--epochs', type=int, default=20) + parser.add_argument('--inference', type=int, default=1) + settings = parser.parse_args() + main(settings) diff --git a/machine-learning-scripts/valohai/keras-dvc-cnn-predict.py b/machine-learning-scripts/valohai/keras-dvc-cnn-predict.py new file mode 100644 index 0000000..2c4fd2a --- /dev/null +++ b/machine-learning-scripts/valohai/keras-dvc-cnn-predict.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 + +import argparse +import json +import os +import requests +import tempfile + +import keras +from keras.models import load_model +from keras.preprocessing.image import ImageDataGenerator + +tmp_dir = '/tmp' + + +def download_url(url, target_dir): + suffix = os.path.splitext(url)[1] + fp = tempfile.NamedTemporaryFile(dir=target_dir, suffix=suffix, delete=False) + r = 
requests.get(url, allow_redirects=True) + fp.write(r.content) + fp.close() + return fp.name + + +def main(args): + model = load_model(args.model) + # print(model.summary()) + print('Loaded model [{}] with {} layers.\n'.format(args.model, len(model.layers))) + + td = tempfile.TemporaryDirectory(dir=tmp_dir) + image_dir = td.name + target_dir = os.path.join(image_dir, 'a') + os.makedirs(target_dir) + + print('Downloading images from [{}] to [{}].'.format(args.urls_file, target_dir)) + file_to_url = {} + with open(args.urls_file, 'r') as fp: + for url in fp: + url = url.rstrip() + fn = os.path.basename(download_url(url, target_dir)) + file_to_url[fn] = url + # print('*', url) + + print() + input_image_size = (150, 150) + noopgen = ImageDataGenerator(rescale=1./255) + batch_size = 25 + + test_generator = noopgen.flow_from_directory( + image_dir, + target_size=input_image_size, + batch_size=batch_size, + class_mode=None, + shuffle=False) + + preds = model.predict_generator(test_generator, + steps=len(file_to_url) // batch_size + 1, + use_multiprocessing=False, + workers=4, + verbose=1) + + print() + filenames = test_generator.filenames + for i, p in enumerate(preds): + pn = p[0] + url = file_to_url[os.path.basename(filenames[i])] + cls = 'cat' if pn < 0.5 else 'dog' + print(json.dumps({'url': url, 'value': float(pn), 'class': cls})) + + td.cleanup() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('urls_file', type=str) + parser.add_argument('--model', type=str, default='dvc-vgg16-finetune.h5') + args = parser.parse_args() + + print('Using Keras version:', keras.__version__) + print() + main(args) diff --git a/machine-learning-scripts/valohai/keras-dvc-cnn-pretrained.py b/machine-learning-scripts/valohai/keras-dvc-cnn-pretrained.py new file mode 100644 index 0000000..b577334 --- /dev/null +++ b/machine-learning-scripts/valohai/keras-dvc-cnn-pretrained.py @@ -0,0 +1,245 @@ + +# coding: utf-8 + +# # Dogs-vs-cats classification with CNNs +# +# In this script, we'll train a convolutional neural network (CNN, +# ConvNet) to classify images of dogs from images of cats using Keras +# (version $\ge$ 2 is required). This script is largely based on the +# blog post [Building powerful image classification models using very +# little data] +# (https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html) +# by François Chollet. +# +# **Note that using a GPU with this script is highly recommended.** +# +# First, the needed imports. Keras tells us which backend (Theano, +# Tensorflow, CNTK) it will be using. + +from keras.models import Sequential +from keras.layers import Dense, Activation, Dropout, Flatten, MaxPooling2D +from keras.layers import InputLayer +# from keras.layers.convolutional import Conv2D +# from keras.preprocessing.image import (ImageDataGenerator, array_to_img, +# img_to_array, load_img) +from keras.preprocessing.image import ImageDataGenerator +from keras import applications, optimizers + +# from keras.utils import np_utils +from keras import backend as K + +from distutils.version import LooseVersion as LV +from keras import __version__ + +import argparse + + +def main(settings): + print('Using Keras version:', __version__, 'backend:', K.backend()) + assert(LV(__version__) >= LV("2.0.0")) + + # If we are using TensorFlow as the backend, we can use TensorBoard to + # visualize our progress during training. 
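As a rough sketch of how the TensorBoard logging described here could be switched back on (it is left commented out below, with callbacks set to None), assuming the TensorFlow backend is in use and a local logs/ directory is writable, something along these lines would mirror the commented-out code:

import datetime
import os
from keras.callbacks import TensorBoard

# Create a per-run log directory and pass a TensorBoard callback to training.
logdir = os.path.join(os.getcwd(), "logs",
                      "dvc-pretrained-" + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
os.makedirs(logdir)
callbacks = [TensorBoard(log_dir=logdir)]  # later handed to fit_generator(..., callbacks=callbacks)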
+ + # if K.backend() == "tensorflow": + # import tensorflow as tf + # from keras.callbacks import TensorBoard + # import os, datetime + # logdir = os.path.join(os.getcwd(), "logs", + # "dvc-pretrained-"+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + # print('TensorBoard log directory:', logdir) + # os.makedirs(logdir) + # callbacks = [TensorBoard(log_dir=logdir)] + # else: + callbacks = None + + # ## Data + # + # The training dataset consists of 2000 images of dogs and cats, split + # in half. In addition, the validation set consists of 1000 images, + # and the test set of 22000 images. + + datapath = '/valohai/inputs/dataset/dogs-vs-cats/train-2000' + #datapath = "/wrk/makoskel/dogs-vs-cats/train-2000" + (nimages_train, nimages_validation, nimages_test) = (2000, 1000, 22000) + + # ### Data augmentation + # + # First, we'll resize all training and validation images to a fized size. + # + # Then, to make the most of our limited number of training examples, + # we'll apply random transformations to them each time we are looping + # over them. This way, we "augment" our training dataset to contain + # more data. There are various transformations readily available in + # Keras, see [ImageDataGenerator] + # (https://keras.io/preprocessing/image/) for more information. + + input_image_size = (150, 150) + + datagen = ImageDataGenerator( + rescale=1./255, + shear_range=0.2, + zoom_range=0.2, + #rotation_range=40, + #width_shift_range=0.2, + #height_shift_range=0.2, + horizontal_flip=True) + + noopgen = ImageDataGenerator(rescale=1./255) + + # Let's put a couple of training images with the augmentation to a + # TensorBoard event file. + + # augm_generator = datagen.flow_from_directory( + # datapath+'/train', + # target_size=input_image_size, + # batch_size=10) + + # for batch, _ in augm_generator: + # break + + # if K.backend() == "tensorflow": + # imgs = tf.convert_to_tensor(batch) + # summary_op = tf.summary.image("augmented", imgs, max_outputs=10) + # with tf.Session() as sess: + # summary = sess.run(summary_op) + # writer = tf.summary.FileWriter(logdir) + # writer.add_summary(summary) + # writer.close() + + # ### Data loaders + # + # Let's now define our real data loaders for training and validation data. + + batch_size = 25 + + print('Train: ', end="") + train_generator = datagen.flow_from_directory( + datapath+'/train', + target_size=input_image_size, + batch_size=batch_size, + class_mode='binary') + + print('Validation: ', end="") + validation_generator = noopgen.flow_from_directory( + datapath+'/validation', + target_size=input_image_size, + batch_size=batch_size, + class_mode='binary') + + print('Test: ', end="") + test_generator = noopgen.flow_from_directory( + datapath+'/test', + target_size=input_image_size, + batch_size=batch_size, + class_mode='binary') + + # We now reuse a pretrained network. Here we'll use the + # [VGG16](https://keras.io/applications/#vgg16) network architecture + # with weights learned using Imagenet. We remove the top layers and + # freeze the pre-trained weights. + # + # ### Initialization + + model = Sequential() + model.add(InputLayer(input_shape=input_image_size+(3,))) # possibly needed due to a bug in Keras + + vgg_model = applications.VGG16(weights='imagenet', + include_top=False, + input_shape=input_image_size+(3,)) + for layer in vgg_model.layers: + model.add(layer) + + for layer in model.layers: + layer.trainable = False + + print(model.summary()) + + # We then stack our own, randomly initialized layers on top of the + # VGG16 network. 
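Before the new layers are added (the code continues right below), a quick sanity check that the reused VGG16 weights really are frozen can be done with a small sketch like this:

# List each layer and whether its weights will be updated during training.
for layer in model.layers:
    print(layer.name, 'trainable:', layer.trainable)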
+ + model.add(Flatten()) + model.add(Dense(64, activation='relu')) + #model.add(Dropout(0.5)) + model.add(Dense(1, activation='sigmoid')) + + model.compile(loss='binary_crossentropy', + optimizer='rmsprop', + metrics=['accuracy']) + + print(model.summary()) + + # ### Learning 1: New layers + + epochs = settings.epochs + workers = 4 + use_multiprocessing = False + + print('Training for', epochs, 'epochs with', workers, + 'workers, use_multiprocessing is', use_multiprocessing) + + history = model.fit_generator(train_generator, + steps_per_epoch=nimages_train // batch_size, + epochs=epochs, + validation_data=validation_generator, + validation_steps=nimages_validation // batch_size, + verbose=2, callbacks=callbacks, + use_multiprocessing=use_multiprocessing, + workers=workers) + + fname = "/valohai/outputs/dvc-vgg16-reuse.h5" + print('Saving model to', fname) + model.save(fname) + + # ### Learning 2: Fine-tuning + # + # Once the top layers have learned some reasonable weights, we can + # continue training by unfreezing the last convolution block of VGG16 + # (`block5`) so that it may adapt to our data. The learning rate + # should be smaller than usual. + + for layer in model.layers[15:]: + layer.trainable = True + print(layer.name, "now trainable") + + model.compile(loss='binary_crossentropy', + optimizer=optimizers.RMSprop(lr=1e-5), + metrics=['accuracy']) + + print(model.summary()) + + # Note that before continuing the training, we create a separate + # TensorBoard log directory: + + epochs_ft = 10 + + # if K.backend() == "tensorflow": + # logdir_ft = logdir + "-ft" + # os.makedirs(logdir_ft) + # callbacks_ft = [TensorBoard(log_dir=logdir_ft)] + # else: + callbacks_ft = None + + print('Finetuning for', epochs_ft, 'epochs with', workers, + 'workers, use_multiprocessing is', use_multiprocessing) + + history = model.fit_generator(train_generator, + steps_per_epoch=nimages_train // batch_size, + epochs=epochs_ft, + validation_data=validation_generator, + validation_steps=nimages_validation // batch_size, + verbose=2, callbacks=callbacks_ft, + use_multiprocessing=use_multiprocessing, + workers=workers) + + fname_ft = "/valohai/outputs/dvc-vgg16-finetune.h5" + print('Saving finetuned model to', fname_ft) + model.save(fname_ft) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--epochs', type=int, default=10) + parser.add_argument('--inference', type=bool, default=False) + settings = parser.parse_args() + main(settings) diff --git a/machine-learning-scripts/valohai/keras-dvc-cnn-simple.py b/machine-learning-scripts/valohai/keras-dvc-cnn-simple.py new file mode 100644 index 0000000..efec1e8 --- /dev/null +++ b/machine-learning-scripts/valohai/keras-dvc-cnn-simple.py @@ -0,0 +1,206 @@ +# coding: utf-8 + +# # Dogs-vs-cats classification with CNNs +# +# In this script, we'll train a convolutional neural network (CNN, +# ConvNet) to classify images of dogs from images of cats using Keras +# (version $\ge$ 2 is required). This script is largely based on the +# blog post [Building powerful image classification models using very +# little data] +# (https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html) +# by François Chollet. +# +# **Note that using a GPU with this script is highly recommended.** +# +# First, the needed imports. Keras tells us which backend (Theano, +# Tensorflow, CNTK) it will be using. 
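Since the comment above strongly recommends a GPU, a minimal sketch for checking that one is actually visible, assuming the TensorFlow backend (device_lib ships with TensorFlow itself), is:

# Optional sanity check: list the GPUs TensorFlow can see.
from tensorflow.python.client import device_lib

gpus = [d.name for d in device_lib.list_local_devices() if d.device_type == 'GPU']
print('Visible GPUs:', gpus if gpus else 'none')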
+import argparse +import json +import os + +from keras.callbacks import LambdaCallback +from keras.models import Sequential +from keras.layers import Dense, Activation, Dropout, Flatten, MaxPooling2D +from keras.layers.convolutional import Conv2D +from keras.preprocessing.image import (ImageDataGenerator, array_to_img, + img_to_array, load_img) +from keras import applications, optimizers + +from keras.utils import np_utils +from keras import backend as K + +from distutils.version import LooseVersion as LV +from keras import __version__ + +import numpy as np + + +def main(settings): + print('Using Keras version:', __version__, 'backend:', K.backend()) + assert (LV(__version__) >= LV("2.0.0")) + + # If we are using TensorFlow as the backend, we can use TensorBoard to + # visualize our progress during training. + + # if K.backend() == "tensorflow": + # import tensorflow as tf + # from keras.callbacks import TensorBoard + # import os, datetime + # logdir = os.path.join(os.getcwd(), "logs", + # "dvc-simple-"+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + # print('TensorBoard log directory:', logdir) + # os.makedirs(logdir) + # callbacks = [TensorBoard(log_dir=logdir)] + # else: + # callbacks = None + + # ## Data + # + # The training dataset consists of 2000 images of dogs and cats, split + # in half. In addition, the validation set consists of 1000 images, + # and the test set of 22000 images. + + datapath = '/valohai/inputs/dataset/dogs-vs-cats/train-2000' + (nimages_train, nimages_validation, nimages_test) = (2000, 1000, 22000) + + # ### Data augmentation + # + # First, we'll resize all training and validation images to a fized size. + # + # Then, to make the most of our limited number of training examples, + # we'll apply random transformations to them each time we are looping + # over them. This way, we "augment" our training dataset to contain + # more data. There are various transformations readily available in + # Keras, see [ImageDataGenerator] + # (https://keras.io/preprocessing/image/) for more information. + + input_image_size = (150, 150) + + datagen = ImageDataGenerator( + rescale=1. / 255, + shear_range=0.2, + zoom_range=0.2, + # rotation_range=40, + # width_shift_range=0.2, + # height_shift_range=0.2, + horizontal_flip=True) + + noopgen = ImageDataGenerator(rescale=1. / 255) + + # Let's put a couple of training images with the augmentation to a + # TensorBoard event file. + + augm_generator = datagen.flow_from_directory( + datapath + '/train', + target_size=input_image_size, + batch_size=10) + + for batch, _ in augm_generator: + break + + # if K.backend() == "tensorflow": + # imgs = tf.convert_to_tensor(batch) + # summary_op = tf.summary.image("augmented", imgs, max_outputs=10) + # with tf.Session() as sess: + # summary = sess.run(summary_op) + # writer = tf.summary.FileWriter(logdir) + # writer.add_summary(summary) + # writer.close() + + # ### Data loaders + # + # Let's now define our real data loaders for training and validation data. 
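Before the data loaders are defined below: since the tf.summary-based image logging above is commented out, a simpler way to eyeball the augmented sample batch produced above is to dump it to image files. A rough sketch, using array_to_img which this script already imports:

# Illustrative only: write the augmented batch to PNG files for a visual check.
for i, img_array in enumerate(batch):
    array_to_img(img_array).save('augmented_{:02d}.png'.format(i))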
+ + batch_size = 25 + + print('Train: ', end="") + train_generator = datagen.flow_from_directory( + datapath + '/train', + target_size=input_image_size, + batch_size=batch_size, + class_mode='binary') + + print('Validation: ', end="") + validation_generator = noopgen.flow_from_directory( + datapath + '/validation', + target_size=input_image_size, + batch_size=batch_size, + class_mode='binary') + + print('Test: ', end="") + test_generator = noopgen.flow_from_directory( + datapath + '/test', + target_size=input_image_size, + batch_size=batch_size, + class_mode='binary') + + # Similarly as with MNIST digits, we can start from scratch and train + # a CNN for the classification task. However, due to the small number + # of training images, a large network will easily overfit, regardless + # of the data augmentation. + # + # ### Initialization + + model = Sequential() + + model.add(Conv2D(32, (3, 3), input_shape=input_image_size + (3,), activation='relu')) + model.add(MaxPooling2D(pool_size=(2, 2))) + + model.add(Conv2D(32, (3, 3), activation='relu')) + model.add(MaxPooling2D(pool_size=(2, 2))) + + model.add(Conv2D(64, (3, 3), activation='relu')) + model.add(MaxPooling2D(pool_size=(2, 2))) + + model.add(Flatten()) + model.add(Dense(64, activation='relu')) + model.add(Dropout(0.5)) + model.add(Dense(1, activation='sigmoid')) + + model.compile(loss='binary_crossentropy', + optimizer='rmsprop', + metrics=['accuracy']) + + print(model.summary()) + + json_logging_callback = LambdaCallback( + on_epoch_end=lambda epoch, logs: print(json.dumps({ + "epoch": epoch, + "loss": logs["loss"], + "acc": logs["acc"], + "val_loss": logs["val_loss"], + "val_acc": logs["val_acc"], + })), + ) + + # ### Learning + + history = model.fit_generator(train_generator, + steps_per_epoch=nimages_train // batch_size, + epochs=settings.epochs, + validation_data=validation_generator, + validation_steps=nimages_validation // batch_size, + verbose=2, + callbacks=[json_logging_callback], + use_multiprocessing=True, + workers=4) + + model.save('/valohai/outputs/dvc-small-cnn.h5') + + # ### Inference + + if settings.inference: + print('Evaluating model...') + scores = model.evaluate_generator(test_generator, + steps=nimages_test // batch_size, + use_multiprocessing=True, + workers=4) + print("Test set %s: %.2f%%" % (model.metrics_names[1], scores[1] * 100)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--epochs', type=int, default=20) + parser.add_argument('--inference', type=int, default=0) + settings = parser.parse_args() + main(settings) diff --git a/machine-learning-scripts/valohai/keras-sfnet-cnn.py b/machine-learning-scripts/valohai/keras-sfnet-cnn.py new file mode 100644 index 0000000..16ea596 --- /dev/null +++ b/machine-learning-scripts/valohai/keras-sfnet-cnn.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 + +from keras.preprocessing import sequence, text +from keras.models import Sequential +from keras.layers import Dense # Dropout +from keras.layers import Embedding +from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D +# from keras.layers import CuDNNLSTM +from keras.utils import to_categorical + +from distutils.version import LooseVersion as LV +from keras import __version__ +from keras import backend as K + +# from IPython.display import SVG +# from keras.utils.vis_utils import model_to_dot + +from sklearn.model_selection import train_test_split +from sklearn.metrics import confusion_matrix + +from tqdm import tqdm + +import os +import gzip +import re +import 
pickle + +import numpy as np +# import matplotlib.pyplot as plt +# import seaborn as sns + +print('Using Keras version:', __version__, 'backend:', K.backend()) +assert(LV(__version__) >= LV("2.0.0")) + +FASTTEXT_FILE = "/valohai/inputs/embedding/cc.fi.300.vec.gz" +TEXT_DATA_DIR = "/valohai/inputs/dataset/sfnet2007-2008/raw_texts/" + + +# if K.backend() == "tensorflow": +# # import tensorflow as tf +# from keras.callbacks import TensorBoard +# import datetime +# logdir = os.path.join(os.getcwd(), "logs", +# "sfnet-"+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) +# print('TensorBoard log directory:', logdir) +# os.makedirs(logdir) +# callbacks = [TensorBoard(log_dir=logdir)] +# else: +callbacks = None + + +# Finnish word embeddings +# +# TODO: try also +# http://bionlp.utu.fi/finnish-internet-parsebank.html ? +# + +pickle_name = 'fasttext.cc.fi.300.pickle' + +if os.path.isfile(pickle_name): + with open(pickle_name, 'rb') as f: + embeddings_index = pickle.load(f) + print('Loaded word vectors from {}.'.format(pickle_name)) +else: + print('Indexing word vectors.') + + embeddings_index = {} + + with gzip.open(FASTTEXT_FILE, 'rt', encoding='utf-8') as f: + num_lines, dim = (int(x) for x in f.readline().rstrip().split()) + print('{} has {} words with {}-dimensional embeddings.'.format( + os.path.basename(FASTTEXT_FILE), num_lines, dim)) + + for line in tqdm(f, total=num_lines): + values = line.split() + word = values[0] + coefs = np.asarray(values[1:], dtype='float32') + assert coefs.shape[0] == dim + embeddings_index[word] = coefs + + assert len(embeddings_index) == num_lines + + # with open(pickle_name, 'wb') as f: + # # Pickle the 'data' dictionary using the highest protocol available. + # pickle.dump(embeddings_index, f, pickle.HIGHEST_PROTOCOL) + +# FASTTEXT_FILE = "/media/data/yle-embeddings/fasttext_fin.csv.gz" + +# with gzip.open(FASTTEXT_FILE, 'rt', encoding='utf-8') as f: +# f.readline() +# # num_lines, dim = (int(x) for x in f.readline().rstrip().split()) +# # print('{} has {} words with {}-dimensional embeddings.'.format( +# # os.path.basename(FASTTEXT_FILE), num_lines, dim)) + +# for line in tqdm(f, total=880327): +# values = line.split(',') +# word = values[0] +# coefs = np.asarray(values[1:], dtype='float32') +# assert coefs.shape[0] == 100 +# embeddings_index[word] = coefs + +# # assert len(embeddings_index) == num_lines + +# print('Loaded {} embeddings'.format(len(embeddings_index))) +# print('Examples of embeddings:') +# for w in ['jotain', 'satunnaisia', 'sanoja']: +# print(w, embeddings_index[w]) + +print('Examples of embeddings:') +for w in ['jotain', 'satunnaisia', 'sanoja']: + print(w, embeddings_index[w]) + +# SFNet data set + +print('Processing text dataset') + +texts = [] # list of text samples +labels_index = {} # dictionary mapping label name to numeric id +labels = [] # list of label ids +for name in sorted(os.listdir(TEXT_DATA_DIR)): + path = os.path.join(TEXT_DATA_DIR, name) + if os.path.isdir(path): + label_id = len(labels_index) + labels_index[name] = label_id + print(name, label_id) + for fname in sorted(os.listdir(path)): + print('*', fname) + if fname.endswith('.gz'): + fpath = os.path.join(path, fname) + with gzip.open(fpath, 'rt', encoding='latin-1') as f: + header = True # keep track if we are in header area, or in message + t = '' # accumulate current message into t + prev_line = None + for line in f: + m = re.match(r'^([a-zA-Z]+): (.*)$', line) + if m and m.group(1) in ['Path', 'Subject', 'From', 'Newsgroups']: + # yes, we are definitely inside a 
header now... + header = True + if t != '': # if we have accumulated text, we save it + texts.append(t) + labels.append(label_id) + t = '' + continue + # empty line indicates end of headers + if line == '\n' and header: + header = False + continue + + # if not a header, accumulate line to text in t + if not header: + t += line + + prev_line = line + + if t != '': # store also the last message + texts.append(t) + labels.append(label_id) + +print('Found %s texts.' % len(texts)) + +# First message and its label: + +print(texts[0]) +print('label:', labels[0], labels_index) + +# Vectorize the text samples into a 2D integer tensor. + +MAX_NUM_WORDS = 10000 +MAX_SEQUENCE_LENGTH = 1000 + +tokenizer = text.Tokenizer(num_words=MAX_NUM_WORDS) +tokenizer.fit_on_texts(texts) +sequences = tokenizer.texts_to_sequences(texts) + +word_index = tokenizer.word_index +print('Found %s unique tokens.' % len(word_index)) + +#with gzip.open('/valohai/outputs/tokenizer_sfnet.json.gz', 'wt', encoding='utf-8') as f: +# f.write(tokenizer.to_json()) +with open('/valohai/outputs/tokenizer_sfnet.pkl', 'wb') as f: + pickle.dump(tokenizer, f, pickle.HIGHEST_PROTOCOL) + + +data = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) + +labels = to_categorical(np.asarray(labels)) +print('Shape of data tensor:', data.shape) +print('Shape of label tensor:', labels.shape) + + +# Split the data into a training set and a validation set + +VALIDATION_SET, TEST_SET = 1000, 4000 + +x_train, x_test, y_train, y_test = train_test_split(data, labels, + test_size=TEST_SET, + shuffle=True, random_state=42) + +x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, + test_size=VALIDATION_SET, + shuffle=False) + +print('Shape of training data tensor:', x_train.shape) +print('Shape of training label tensor:', y_train.shape) +print('Shape of validation data tensor:', x_val.shape) +print('Shape of validation label tensor:', y_val.shape) +print('Shape of test data tensor:', x_test.shape) +print('Shape of test label tensor:', y_test.shape) + + +# Prepare the embedding matrix: + +print('Preparing embedding matrix.') + +num_words = min(MAX_NUM_WORDS, len(word_index) + 1) +embedding_dim = 300 +not_found = 0 + +embedding_matrix = np.zeros((num_words, embedding_dim)) +for word, i in word_index.items(): + if i >= MAX_NUM_WORDS: + continue + embedding_vector = embeddings_index.get(word) + if embedding_vector is not None: + # words not found in embedding index will be all-zeros. 
+ embedding_matrix[i] = embedding_vector + else: + not_found += 1 + +print('Shape of embedding matrix:', embedding_matrix.shape) +print('Number of words not found in embedding index:', not_found) + + +# 1-D CNN + +print('Build model...') +model = Sequential() + +model.add(Embedding(num_words, + embedding_dim, + weights=[embedding_matrix], + input_length=MAX_SEQUENCE_LENGTH, + trainable=False)) +# model.add(Dropout(0.2)) + +model.add(Conv1D(128, 5, activation='relu')) +model.add(MaxPooling1D(5)) +model.add(Conv1D(128, 5, activation='relu')) +model.add(MaxPooling1D(5)) +model.add(Conv1D(128, 5, activation='relu')) +model.add(GlobalMaxPooling1D()) + +model.add(Dense(128, activation='relu')) +model.add(Dense(9, activation='softmax')) + +model.compile(loss='categorical_crossentropy', + optimizer='rmsprop', + metrics=['accuracy']) + +print(model.summary()) + +# Learning + +epochs = 15 +batch_size = 128 + +history = model.fit(x_train, y_train, + batch_size=batch_size, + epochs=epochs, + validation_data=(x_val, y_val), + verbose=2, callbacks=callbacks) +model.save('/valohai/outputs/sfnet-cnn-fasttext_cc300-epochs{}.h5'.format(epochs)) + + +# Inference +scores = model.evaluate(x_test, y_test, verbose=2) +print("Test set %s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) + +# We can also look at classification accuracies separately for each +# newsgroup, and compute a confusion matrix to see which newsgroups +# get mixed the most: + +predictions = model.predict(x_test) + +cm = confusion_matrix(np.argmax(y_test, axis=1), np.argmax(predictions, axis=1), + labels=list(range(9))) + +print('Classification accuracy for each newsgroup:') +print() +labels = [l[0] for l in sorted(labels_index.items(), key=lambda x: x[1])] +for i, j in enumerate(cm.diagonal()/cm.sum(axis=1)): + print("%s: %.4f" % (labels[i].ljust(26), j)) +print() + +print('Confusion matrix (rows: true newsgroup; columns: predicted newsgroup):') +print() +np.set_printoptions(linewidth=9999) +print(cm) +print() diff --git a/machine-learning-scripts/valohai/keras-sfnet-lstm.py b/machine-learning-scripts/valohai/keras-sfnet-lstm.py new file mode 100644 index 0000000..d60be90 --- /dev/null +++ b/machine-learning-scripts/valohai/keras-sfnet-lstm.py @@ -0,0 +1,284 @@ +#!/usr/bin/env python3 + +from keras.preprocessing import sequence, text +from keras.models import Sequential +from keras.layers import Dense, Dropout +from keras.layers import Embedding +# from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D +from keras.layers import CuDNNLSTM +from keras.utils import to_categorical + +from distutils.version import LooseVersion as LV +from keras import __version__ +from keras import backend as K + +# from IPython.display import SVG +# from keras.utils.vis_utils import model_to_dot + +from sklearn.model_selection import train_test_split +from sklearn.metrics import confusion_matrix + +from tqdm import tqdm + +import os +import gzip +import re +import pickle + +import numpy as np +# import matplotlib.pyplot as plt +# import seaborn as sns + +print('Using Keras version:', __version__, 'backend:', K.backend()) +assert(LV(__version__) >= LV("2.0.0")) + +FASTTEXT_FILE = "/valohai/inputs/embedding/cc.fi.300.vec.gz" +TEXT_DATA_DIR = "/valohai/inputs/dataset/sfnet2007-2008/raw_texts/" + + +# if K.backend() == "tensorflow": +# # import tensorflow as tf +# from keras.callbacks import TensorBoard +# import datetime +# logdir = os.path.join(os.getcwd(), "logs", +# "sfnet-lstm-"+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) +# 
print('TensorBoard log directory:', logdir) +# os.makedirs(logdir) +# callbacks = [TensorBoard(log_dir=logdir)] +# else: +callbacks = None + + +# Finnish word embeddings +# +# TODO: try also +# http://bionlp.utu.fi/finnish-internet-parsebank.html ? +# + +pickle_name = 'fasttext.cc.fi.300.pickle' + +if os.path.isfile(pickle_name): + with open(pickle_name, 'rb') as f: + embeddings_index = pickle.load(f) + print('Loaded word vectors from {}.'.format(pickle_name)) +else: + print('Indexing word vectors.') + + embeddings_index = {} + + with gzip.open(FASTTEXT_FILE, 'rt', encoding='utf-8') as f: + num_lines, dim = (int(x) for x in f.readline().rstrip().split()) + print('{} has {} words with {}-dimensional embeddings.'.format( + os.path.basename(FASTTEXT_FILE), num_lines, dim)) + + for line in tqdm(f, total=num_lines): + values = line.split() + word = values[0] + coefs = np.asarray(values[1:], dtype='float32') + assert coefs.shape[0] == dim + embeddings_index[word] = coefs + + assert len(embeddings_index) == num_lines + + # with open(pickle_name, 'wb') as f: + # # Pickle the 'data' dictionary using the highest protocol available. + # pickle.dump(embeddings_index, f, pickle.HIGHEST_PROTOCOL) + +# FASTTEXT_FILE = "/media/data/yle-embeddings/fasttext_fin.csv.gz" + +# with gzip.open(FASTTEXT_FILE, 'rt', encoding='utf-8') as f: +# f.readline() +# # num_lines, dim = (int(x) for x in f.readline().rstrip().split()) +# # print('{} has {} words with {}-dimensional embeddings.'.format( +# # os.path.basename(FASTTEXT_FILE), num_lines, dim)) + +# for line in tqdm(f, total=880327): +# values = line.split(',') +# word = values[0] +# coefs = np.asarray(values[1:], dtype='float32') +# assert coefs.shape[0] == 100 +# embeddings_index[word] = coefs + +# # assert len(embeddings_index) == num_lines + +# print('Loaded {} embeddings'.format(len(embeddings_index))) +# print('Examples of embeddings:') +# for w in ['jotain', 'satunnaisia', 'sanoja']: +# print(w, embeddings_index[w]) + +print('Examples of embeddings:') +for w in ['jotain', 'satunnaisia', 'sanoja']: + print(w, embeddings_index[w]) + +# SFNet data set + +print('Processing text dataset') + +texts = [] # list of text samples +labels_index = {} # dictionary mapping label name to numeric id +labels = [] # list of label ids +for name in sorted(os.listdir(TEXT_DATA_DIR)): + path = os.path.join(TEXT_DATA_DIR, name) + if os.path.isdir(path): + label_id = len(labels_index) + labels_index[name] = label_id + print(name, label_id) + for fname in sorted(os.listdir(path)): + print('*', fname) + if fname.endswith('.gz'): + fpath = os.path.join(path, fname) + with gzip.open(fpath, 'rt', encoding='latin-1') as f: + header = True # keep track if we are in header area, or in message + t = '' # accumulate current message into t + prev_line = None + for line in f: + m = re.match(r'^([a-zA-Z]+): (.*)$', line) + if m and m.group(1) in ['Path', 'Subject', 'From', 'Newsgroups']: + # yes, we are definitely inside a header now... + header = True + if t != '': # if we have accumulated text, we save it + texts.append(t) + labels.append(label_id) + t = '' + continue + # empty line indicates end of headers + if line == '\n' and header: + header = False + continue + + # if not a header, accumulate line to text in t + if not header: + t += line + + prev_line = line + + if t != '': # store also the last message + texts.append(t) + labels.append(label_id) + +print('Found %s texts.' 
% len(texts)) + +# First message and its label: + +print(texts[0]) +print('label:', labels[0], labels_index) + +# Vectorize the text samples into a 2D integer tensor. + +MAX_NUM_WORDS = 10000 +MAX_SEQUENCE_LENGTH = 1000 + +tokenizer = text.Tokenizer(num_words=MAX_NUM_WORDS) +tokenizer.fit_on_texts(texts) +sequences = tokenizer.texts_to_sequences(texts) + +word_index = tokenizer.word_index +print('Found %s unique tokens.' % len(word_index)) + +data = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) + +labels = to_categorical(np.asarray(labels)) +print('Shape of data tensor:', data.shape) +print('Shape of label tensor:', labels.shape) + + +# Split the data into a training set and a validation set + +VALIDATION_SET, TEST_SET = 1000, 4000 + +x_train, x_test, y_train, y_test = train_test_split(data, labels, + test_size=TEST_SET, + shuffle=True, random_state=42) + +x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, + test_size=VALIDATION_SET, + shuffle=False) + +print('Shape of training data tensor:', x_train.shape) +print('Shape of training label tensor:', y_train.shape) +print('Shape of validation data tensor:', x_val.shape) +print('Shape of validation label tensor:', y_val.shape) +print('Shape of test data tensor:', x_test.shape) +print('Shape of test label tensor:', y_test.shape) + + +# Prepare the embedding matrix: + +print('Preparing embedding matrix.') + +num_words = min(MAX_NUM_WORDS, len(word_index) + 1) +embedding_dim = 300 +not_found = 0 + +embedding_matrix = np.zeros((num_words, embedding_dim)) +for word, i in word_index.items(): + if i >= MAX_NUM_WORDS: + continue + embedding_vector = embeddings_index.get(word) + if embedding_vector is not None: + # words not found in embedding index will be all-zeros. + embedding_matrix[i] = embedding_vector + else: + not_found += 1 + +print('Shape of embedding matrix:', embedding_matrix.shape) +print('Number of words not found in embedding index:', not_found) + +print('Build model...') +model = Sequential() + +model.add(Embedding(num_words, + embedding_dim, + weights=[embedding_matrix], + input_length=MAX_SEQUENCE_LENGTH, + trainable=False)) +model.add(Dropout(0.5)) + +model.add(CuDNNLSTM(128, return_sequences=True)) +model.add(CuDNNLSTM(128)) + +model.add(Dense(128, activation='relu')) +model.add(Dense(9, activation='softmax')) + +model.compile(loss='categorical_crossentropy', + optimizer='rmsprop', + metrics=['accuracy']) + +print(model.summary()) + + +# Learning + +epochs = 20 +batch_size = 128 + +history = model.fit(x_train, y_train, + batch_size=batch_size, + epochs=epochs, + validation_data=(x_val, y_val), + verbose=2, callbacks=callbacks) +model.save('/valohai/outputs/sfnet-lstm-fasttext_cc300-epochs{}.h5'.format(epochs)) + +# Inference +scores = model.evaluate(x_test, y_test, verbose=2) +print("Test set %s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) + + +predictions = model.predict(x_test) + +cm = confusion_matrix(np.argmax(y_test, axis=1), np.argmax(predictions, axis=1), + labels=list(range(9))) +cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + +print('Classification accuracy for each newsgroup:') +print() +labels = [l[0] for l in sorted(labels_index.items(), key=lambda x: x[1])] +for i, j in enumerate(cm.diagonal()/cm.sum(axis=1)): + print("%s: %.4f" % (labels[i].ljust(26), j)) +print() + +print('Confusion matrix (rows: true newsgroup; columns: predicted newsgroup):') +print() +np.set_printoptions(linewidth=9999) +print(cm) +print() diff --git 
a/machine-learning-scripts/valohai/prediction_server.py b/machine-learning-scripts/valohai/prediction_server.py new file mode 100644 index 0000000..e767bcc --- /dev/null +++ b/machine-learning-scripts/valohai/prediction_server.py @@ -0,0 +1,80 @@ +import glob +import json +import os + +os.environ['KERAS_BACKEND'] = 'tensorflow' +from keras.engine.saving import load_model +from PIL import Image +from skimage import transform +from werkzeug.debug import DebuggedApplication +from werkzeug.wrappers import Response, Request +import numpy as np + +""" +Development Usage: + $ python valohai/prediction_server.py + - it assumes that you have *.h5 model file in the current working directory + - then you can POST images to the local URL e.g. with test_prediction_server.py +""" + +model = None + + +def read_image_from_request(request): + # Reads the first file in the request and tries to load it as an image. + if not request.files: + return None + file_key = list(request.files.keys())[0] + file = request.files.get(file_key) + img = Image.open(file.stream) + img.load() + return img + + +def create_response(content, status_code): + return Response(json.dumps(content), status_code, mimetype='application/json') + + +def predict_wsgi(environ, start_response): + request = Request(environ) + image = read_image_from_request(request) + if not image: + result = {'error': 'No images in the request, include sample image in the request.'} + response = create_response(result, 400) + return response(environ, start_response) + + # Pre-processing a single image + # TODO: notice that this is not 100% the same preprocessing than in training, will create some skew + image = np.array(image).astype('float32') / 255 + image = transform.resize(image, (150, 150, 3), mode='constant', anti_aliasing=False) + image = np.expand_dims(image, axis=0) + + # Load model as global object so it stays in the memory making responses fast. + global model + if not model: + # Try to find HDF5 files on the current directory to load as the model. + local_hdf5_files = glob.glob('*.h5') + if not local_hdf5_files: + result = {'error': 'Could not find predictive model to load, contact support.'} + response = create_response(result, 400) + return response(environ, start_response) + model_path = os.path.join(os.getcwd(), local_hdf5_files[0]) + model = load_model(model_path) + + # Give prediction on the image. + predictions = model.predict(image) + prediction = predictions[0] + + # Report results. 
+ cls = 'cat' if prediction < 0.5 else 'dog' + result = {'class': cls, 'value': float(prediction)} + response = create_response(result, 200) + return response(environ, start_response) + + +predict_wsgi = DebuggedApplication(predict_wsgi) + +if __name__ == '__main__': + from werkzeug.serving import run_simple + + run_simple('localhost', 8000, predict_wsgi) diff --git a/machine-learning-scripts/valohai/prediction_server_text.py b/machine-learning-scripts/valohai/prediction_server_text.py new file mode 100644 index 0000000..9bdcae5 --- /dev/null +++ b/machine-learning-scripts/valohai/prediction_server_text.py @@ -0,0 +1,111 @@ +import glob +import json +import os + +os.environ['KERAS_BACKEND'] = 'tensorflow' +from keras.preprocessing import sequence, text +from keras.engine.saving import load_model +from werkzeug.debug import DebuggedApplication +from werkzeug.wrappers import Response, Request +import gzip +import pickle + +""" +Development Usage: + $ python valohai/prediction_server_text.py + - it assumes that you have *.h5 model file in the current working directory + - submit texts with GET parameter text to the local URL e.g. with test_prediction_server.py +""" + +MAX_SEQUENCE_LENGTH = 1000 + +model = None +tokenizer = None + +groups = { + 'atk': 0, + 'harrastus': 1, + 'keskustelu': 2, + 'misc': 3, + 'tiede': 4, + 'tietoliikenne': 5, + 'tori': 6, + 'urheilu': 7, + 'viestinta': 8 +} + + +def create_response(content, status_code): + return Response(json.dumps(content), status_code, mimetype='application/json') + + +def predict_wsgi(environ, start_response): + global model, tokenizer + + request = Request(environ) + get_text = request.form.get('text') + if get_text is None: + get_text = request.args.get('text') + if get_text is None: + result = {'error': 'No text given in the request.'} + response = create_response(result, 400) + return response(environ, start_response) + + texts = [get_text] + + # Load model as global object so it stays in the memory making responses fast. + if model is None: + # Try to find HDF5 files on the current directory to load as the model. + local_hdf5_files = glob.glob('*.h5') + if not local_hdf5_files: + result = {'error': 'Could not find predictive model to load, contact support.'} + response = create_response(result, 400) + return response(environ, start_response) + model_path = os.path.join(os.getcwd(), local_hdf5_files[0]) + model = load_model(model_path) + + if tokenizer is None: + # with gzip.open('tokenizer_sfnet.json.gz', 'rt', encoding='utf-8') as f: + # tokenizer = text.text.tokenizer_from_json(f.read()) + local_pkl_files = glob.glob('tokenizer*.pkl') + if not local_pkl_files: + result = {'error': 'Could not find pickled Tokenizer to load, contact support.'} + response = create_response(result, 400) + return response(environ, start_response) + pkl_path = os.path.join(os.getcwd(), local_pkl_files[0]) + with open(pkl_path, 'rb') as f: + tokenizer = pickle.load(f) + + # print(texts) + sequences = tokenizer.texts_to_sequences(texts) + # print(sequences) + data = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) + + # Give prediction + predictions = model.predict(data) + prediction = predictions[0] + + # Report results. + result = {} + result['predict'] = {g: float(prediction[i]) for g, i in groups.items()} + + # Get details about the deployed model if possible. 
+    metadata_path = 'valohai-metadata.json'
+    if os.path.isfile(metadata_path):
+        with open(metadata_path) as f:
+            try:
+                deployment_metadata = json.load(f)
+                result['deployment'] = deployment_metadata
+            except json.JSONDecodeError:
+                # Could not read the deployment metadata, ignore it
+                pass
+
+    response = create_response(result, 200)
+    return response(environ, start_response)
+
+
+predict_wsgi = DebuggedApplication(predict_wsgi)
+
+if __name__ == '__main__':
+    from werkzeug.serving import run_simple
+    run_simple('localhost', 8000, predict_wsgi)
diff --git a/machine-learning-scripts/valohai/pytorch_dvc_cnn.py b/machine-learning-scripts/valohai/pytorch_dvc_cnn.py
new file mode 100644
index 0000000..ce6e681
--- /dev/null
+++ b/machine-learning-scripts/valohai/pytorch_dvc_cnn.py
@@ -0,0 +1,171 @@
+# coding: utf-8
+
+# Dogs-vs-cats classification with CNNs
+#
+# In this script, we'll train a convolutional neural network (CNN,
+# ConvNet) to classify images of dogs from images of cats using
+# PyTorch. This script is largely based on the blog post [Building
+# powerful image classification models using very little
+# data](https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html)
+# by François Chollet.
+#
+# **Note that using a GPU with this script is highly recommended.**
+
+import torch
+from torch.utils.data import DataLoader
+from torchvision import datasets, transforms
+from distutils.version import LooseVersion as LV
+import os
+
+torch.manual_seed(42)
+if torch.cuda.is_available():
+    device = torch.device('cuda')
+else:
+    device = torch.device('cpu')
+print('Using PyTorch version:', torch.__version__, ' Device:', device)
+assert(LV(torch.__version__) >= LV("1.0.0"))
+
+datapath = None
+subpath = 'dogs-vs-cats/train-2000'
+
+slurm_job_id = os.environ.get('SLURM_JOB_ID')
+if slurm_job_id is not None:
+    datapath = os.path.join(os.environ.get('TMPDIR'), os.environ.get('SLURM_JOB_ID'),
+                            subpath)
+if datapath is None or not os.path.isdir(datapath):
+    datapath = '/wrk/makoskel/' + subpath
+if not os.path.isdir(datapath):
+    datapath = '/media/data/' + subpath
+if not os.path.isdir(datapath):
+    datapath = '/valohai/inputs/dataset/' + subpath
+print('Reading data from path:', datapath)
+
+(nimages_train, nimages_validation, nimages_test) = (2000, 1000, 22000)
+
+
+def get_tensorboard(log_name):
+    return None
+    # try:
+    #     import tensorboardX
+    #     import os
+    #     import datetime
+    #     logdir = os.path.join(os.getcwd(), "logs",
+    #                           "dvc-" + log_name + "-" +
+    #                           datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
+    #     print('Logging TensorBoard to:', logdir)
+    #     os.makedirs(logdir)
+    #     return tensorboardX.SummaryWriter(logdir)
+    # except ImportError:
+    #     return None
+
+
+def train(model, loader, criterion, optimizer, epoch, log=None):
+    # Set model to training mode
+    model.train()
+    epoch_loss = 0.
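+    # Running sum of per-batch losses; divided by the dataset size at the end
+    # of the epoch to report an average training loss.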
+
+    # Loop over each batch from the training set
+    for batch_idx, (data, target) in enumerate(loader):
+        # Copy data to GPU if needed
+        data = data.to(device)
+        target = target.to(device)
+
+        # Zero gradient buffers
+        optimizer.zero_grad()
+
+        # Pass data through the network
+        output = model(data)
+        output = torch.squeeze(output)
+
+        # Calculate loss
+        loss = criterion(output, target.to(torch.float32))
+        epoch_loss += loss.data.item()
+
+        # Backpropagate
+        loss.backward()
+
+        # Update weights
+        optimizer.step()
+
+    epoch_loss /= len(loader.dataset)
+    print('Train Epoch: {}, Loss: {:.4f}'.format(epoch, epoch_loss))
+
+    if log is not None:
+        log.add_scalar('loss', epoch_loss, epoch-1)
+
+
+def evaluate(model, loader, criterion=None, epoch=None, log=None):
+    model.eval()
+    loss, correct = 0, 0
+    for data, target in loader:
+        data = data.to(device)
+        target = target.to(device)
+
+        output = torch.squeeze(model(data))
+
+        if criterion is not None:
+            loss += criterion(output, target.to(torch.float32)).data.item()
+
+        pred = output > 0.5
+        pred = pred.to(torch.int64)
+        correct += pred.eq(target.data).cpu().sum()
+
+    if criterion is not None:
+        loss /= len(loader.dataset)
+
+    accuracy = 100. * correct.to(torch.float32) / len(loader.dataset)
+
+    print('Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
+        loss, correct, len(loader.dataset), accuracy))
+
+    if log is not None and epoch is not None:
+        log.add_scalar('val_loss', loss, epoch-1)
+        log.add_scalar('val_acc', accuracy, epoch-1)
+
+
+input_image_size = (150, 150)
+
+data_transform = transforms.Compose([
+        transforms.Resize(input_image_size),
+        transforms.RandomAffine(degrees=0, translate=None,
+                                scale=(0.8, 1.2), shear=0.2),
+        transforms.RandomHorizontalFlip(),
+        transforms.ToTensor()
+    ])
+
+noop_transform = transforms.Compose([
+        transforms.Resize(input_image_size),
+        transforms.ToTensor()
+    ])
+
+
+def get_train_loader(batch_size=25):
+    print('Train: ', end="")
+    train_dataset = datasets.ImageFolder(root=datapath+'/train',
+                                         transform=data_transform)
+    train_loader = DataLoader(train_dataset, batch_size=batch_size,
+                              shuffle=True, num_workers=4)
+    print('Found', len(train_dataset), 'images belonging to',
+          len(train_dataset.classes), 'classes')
+    return train_loader
+
+
+def get_validation_loader(batch_size=25):
+    print('Validation: ', end="")
+    validation_dataset = datasets.ImageFolder(root=datapath+'/validation',
+                                              transform=noop_transform)
+    validation_loader = DataLoader(validation_dataset, batch_size=batch_size,
+                                   shuffle=False, num_workers=4)
+    print('Found', len(validation_dataset), 'images belonging to',
+          len(validation_dataset.classes), 'classes')
+    return validation_loader
+
+
+def get_test_loader(batch_size=25):
+    print('Test: ', end="")
+    test_dataset = datasets.ImageFolder(root=datapath+'/test',
+                                        transform=noop_transform)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size,
+                             shuffle=False, num_workers=4)
+    print('Found', len(test_dataset), 'images belonging to',
+          len(test_dataset.classes), 'classes')
+    return test_loader
diff --git a/machine-learning-scripts/valohai/pytorch_dvc_cnn_simple.py b/machine-learning-scripts/valohai/pytorch_dvc_cnn_simple.py
new file mode 100644
index 0000000..b04fc83
--- /dev/null
+++ b/machine-learning-scripts/valohai/pytorch_dvc_cnn_simple.py
@@ -0,0 +1,99 @@
+# coding: utf-8
+
+# Dogs-vs-cats classification with CNNs
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from datetime import datetime
+
+from pytorch_dvc_cnn import get_train_loader, get_validation_loader, get_test_loader
+from pytorch_dvc_cnn import device, train, evaluate, get_tensorboard
+
+model_file = '/valohai/outputs/dvc_simple_cnn.pt'
+
+
+# Option 1: Train a small CNN from scratch
+
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.conv1 = nn.Conv2d(3, 32, (3, 3))
+        self.pool1 = nn.MaxPool2d((2, 2))
+        self.conv2 = nn.Conv2d(32, 32, (3, 3))
+        self.pool2 = nn.MaxPool2d((2, 2))
+        self.conv3 = nn.Conv2d(32, 64, (3, 3))
+        self.pool3 = nn.MaxPool2d((2, 2))
+        # After three 3x3 convolutions (no padding) and three 2x2 max-poolings,
+        # a 150x150 input is reduced to 17x17 spatial size with 64 channels.
+        self.fc1 = nn.Linear(17*17*64, 64)
+        self.fc1_drop = nn.Dropout(0.5)
+        self.fc2 = nn.Linear(64, 1)
+
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = self.pool1(x)
+        x = F.relu(self.conv2(x))
+        x = self.pool2(x)
+        x = F.relu(self.conv3(x))
+        x = self.pool3(x)
+
+        # "flatten" 2D to 1D
+        x = x.view(-1, 17*17*64)
+        x = F.relu(self.fc1(x))
+        x = self.fc1_drop(x)
+        return torch.sigmoid(self.fc2(x))
+
+
+def train_main():
+    model = Net().to(device)
+    optimizer = optim.SGD(model.parameters(), lr=0.05)
+    criterion = nn.BCELoss()
+
+    print(model)
+
+    batch_size = 25
+    train_loader = get_train_loader(batch_size)
+    validation_loader = get_validation_loader(batch_size)
+
+    log = get_tensorboard('simple')
+    epochs = 50
+
+    start_time = datetime.now()
+    for epoch in range(1, epochs + 1):
+        train(model, train_loader, criterion, optimizer, epoch, log)
+
+        with torch.no_grad():
+            print('\nValidation:')
+            evaluate(model, validation_loader, criterion, epoch, log)
+
+    end_time = datetime.now()
+    print('Total training time: {}.'.format(end_time - start_time))
+
+    torch.save(model.state_dict(), model_file)
+    print('Wrote model to', model_file)
+
+
+def test_main():
+    model = Net()
+    model.load_state_dict(torch.load(model_file))
+    model.to(device)
+
+    test_loader = get_test_loader(25)
+
+    print('=========')
+    print('Test set:')
+    with torch.no_grad():
+        evaluate(model, test_loader)
+
+
+if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--test', action='store_true')
+    args = parser.parse_args()
+
+    if args.test:
+        test_main()
+    else:
+        train_main()
diff --git a/machine-learning-scripts/valohai/test_prediction_server.py b/machine-learning-scripts/valohai/test_prediction_server.py
new file mode 100644
index 0000000..6e1487f
--- /dev/null
+++ b/machine-learning-scripts/valohai/test_prediction_server.py
@@ -0,0 +1,38 @@
+import os
+import argparse
+
+import requests
+
+"""
+Simple test script that sends one or more image files to the local prediction
+server endpoint and prints the response status code and content.
+
+If developing the prediction server, remember to start it first.
+
+Usage:
+    $ python valohai/test_prediction_server.py inputs/cat.jpg,inputs/dog.jpg
+"""
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('image_path', type=str)
+    args = parser.parse_args()
+    image_path = args.image_path
+
+    # Support multiple files using a comma separator
+    if ',' in image_path:
+        parts = image_path.split(',')
+        full_image_paths = [path for path in parts if len(path) > 0]
+    else:
+        full_image_paths = [os.path.join(os.getcwd(), args.image_path)]
+
+    for fip in full_image_paths:
+        if not os.path.isfile(fip):
+            print(f'Could not find a file to send at {fip}')
+            exit(1)
+
+    for fip in full_image_paths:
+        files = {'media': open(fip, 'rb')}
+        response = requests.post('http://localhost:8000', files=files)
+        print(f'Target: {fip}')
+        print(f'Result: {response.status_code} => {response.content}')
diff --git a/machine-learning-scripts/valohai/test_prediction_server_text.py b/machine-learning-scripts/valohai/test_prediction_server_text.py
new file mode 100644
index 0000000..7181656
--- /dev/null
+++ b/machine-learning-scripts/valohai/test_prediction_server_text.py
@@ -0,0 +1,26 @@
+import os
+import argparse
+
+import requests
+
+"""
+Simple test script that sends the given text to the prediction server endpoint
+and prints the response status code and content.
+
+If testing a locally running prediction server, remember to start it first and
+switch target_url to the commented-out localhost address.
+
+Usage:
+    $ python valohai/test_prediction_server_text.py "Hei maailma"
+"""
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('text', type=str)
+    args = parser.parse_args()
+
+    target_url = "https://valohai.cloud/msjoberg/pdl-test/sfnet/current/predict-nyyssi"
+    # target_url = "http://localhost:8000"
+
+    params = {'text': args.text}
+    response = requests.get(target_url, params=params)
+    print(f'Result: {response.status_code} => {response.content}')
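For reference, the JSON returned by prediction_server_text.py maps each SFNet group name to a float score under the 'predict' key (plus optional 'deployment' metadata). The snippet below is a minimal sketch, not part of the patch, of how a client might pick the most likely group from that response; it assumes the text prediction server is running locally on port 8000, as started by `python valohai/prediction_server_text.py`.

import requests

# Query the locally running text prediction server (see prediction_server_text.py).
resp = requests.get('http://localhost:8000', params={'text': 'Hei maailma'})
resp.raise_for_status()

result = resp.json()
scores = result['predict']                 # e.g. {'atk': 0.01, ..., 'urheilu': 0.85, ...}
best_group = max(scores, key=scores.get)   # group with the highest predicted score
print('Most likely group:', best_group, 'score:', scores[best_group])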