From df45607e49fd4fd0fc701f794c3f5edbc819ea08 Mon Sep 17 00:00:00 2001
From: Exajobs <95390472+exajobs@users.noreply.github.com>
Date: Thu, 16 Dec 2021 07:16:49 -0500
Subject: [PATCH] Add files via upload

---
 machine-learning-scripts/LICENSE                  |  21 ++
 machine-learning-scripts/README.md                |   3 +
 machine-learning-scripts/requirements.txt         |   5 +
 machine-learning-scripts/valohai/dvc_urls.txt     |   4 +
 .../valohai/keras-20ng-cnn.py                     | 228 +++++++++++++
 .../valohai/keras-20ng-rnn.py                     | 221 +++++++++++++
 .../valohai/keras-dvc-cnn-predict.py              |  81 +++++
 .../valohai/keras-dvc-cnn-pretrained.py           | 245 ++++++++++++++
 .../valohai/keras-dvc-cnn-simple.py               | 206 ++++++++++++
 .../valohai/keras-sfnet-cnn.py                    | 299 ++++++++++++++++++
 .../valohai/keras-sfnet-lstm.py                   | 284 +++++++++++++++++
 .../valohai/prediction_server.py                  |  80 +++++
 .../valohai/prediction_server_text.py             | 111 +++++++
 .../valohai/pytorch_dvc_cnn.py                    | 171 ++++++++++
 .../valohai/pytorch_dvc_cnn_simple.py             |  99 ++++++
 .../valohai/test_prediction_server.py             |  38 +++
 .../valohai/test_prediction_server_text.py        |  26 ++
 17 files changed, 2122 insertions(+)
 create mode 100644 machine-learning-scripts/LICENSE
 create mode 100644 machine-learning-scripts/README.md
 create mode 100644 machine-learning-scripts/requirements.txt
 create mode 100644 machine-learning-scripts/valohai/dvc_urls.txt
 create mode 100644 machine-learning-scripts/valohai/keras-20ng-cnn.py
 create mode 100644 machine-learning-scripts/valohai/keras-20ng-rnn.py
 create mode 100644 machine-learning-scripts/valohai/keras-dvc-cnn-predict.py
 create mode 100644 machine-learning-scripts/valohai/keras-dvc-cnn-pretrained.py
 create mode 100644 machine-learning-scripts/valohai/keras-dvc-cnn-simple.py
 create mode 100644 machine-learning-scripts/valohai/keras-sfnet-cnn.py
 create mode 100644 machine-learning-scripts/valohai/keras-sfnet-lstm.py
 create mode 100644 machine-learning-scripts/valohai/prediction_server.py
 create mode 100644 machine-learning-scripts/valohai/prediction_server_text.py
 create mode 100644 machine-learning-scripts/valohai/pytorch_dvc_cnn.py
 create mode 100644 machine-learning-scripts/valohai/pytorch_dvc_cnn_simple.py
 create mode 100644 machine-learning-scripts/valohai/test_prediction_server.py
 create mode 100644 machine-learning-scripts/valohai/test_prediction_server_text.py

diff --git a/machine-learning-scripts/LICENSE b/machine-learning-scripts/LICENSE
new file mode 100644
index 0000000..d90cc61
--- /dev/null
+++ b/machine-learning-scripts/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 CSC - IT Center for Science Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

diff --git a/machine-learning-scripts/README.md b/machine-learning-scripts/README.md
new file mode 100644
index 0000000..1a8d888
--- /dev/null
+++ b/machine-learning-scripts/README.md
@@ -0,0 +1,3 @@
+# machine-learning-scripts
+
+This repository is a miscellaneous collection of scripts and tools related to machine learning. It currently consists of Jupyter notebooks in the `notebooks` subdirectory and example SLURM scripts in `slurm`. Materials related to specific courses can be found in the `courses` subdirectory.

diff --git a/machine-learning-scripts/requirements.txt b/machine-learning-scripts/requirements.txt
new file mode 100644
index 0000000..5c1b004
--- /dev/null
+++ b/machine-learning-scripts/requirements.txt
@@ -0,0 +1,5 @@
+# dependencies for Valohai inference deployment
+pillow>=6.2.0
+requests==2.21.0
+scikit-image==0.14.2
+werkzeug>=0.15.3

diff --git a/machine-learning-scripts/valohai/dvc_urls.txt b/machine-learning-scripts/valohai/dvc_urls.txt
new file mode 100644
index 0000000..51aece9
--- /dev/null
+++ b/machine-learning-scripts/valohai/dvc_urls.txt
@@ -0,0 +1,4 @@
+https://www.catster.com/wp-content/uploads/2017/08/A-fluffy-cat-looking-funny-surprised-or-concerned.jpg
+https://i.ytimg.com/vi/lrvqjdMcjjQ/hqdefault.jpg
+https://dynaimage.cdn.cnn.com/cnn/w_768,h_1024,c_scale/https%3A%2F%2Fdynaimage.cdn.cnn.com%2Fcnn%2Fx_1229%2Cy_0%2Cw_2712%2Ch_3616%2Cc_crop%2Fhttps%253A%252F%252Fstamp.static.cnn.io%252F5b7ac48b4db3d70020c01c13%252Fshutterstock_1081879181.jpg
+https://static-cdn.jtvnw.net/jtv_user_pictures/dogdog-profile_image-5550ade194780dfc-300x300.jpeg

diff --git a/machine-learning-scripts/valohai/keras-20ng-cnn.py b/machine-learning-scripts/valohai/keras-20ng-cnn.py
new file mode 100644
index 0000000..2a07df7
--- /dev/null
+++ b/machine-learning-scripts/valohai/keras-20ng-cnn.py
@@ -0,0 +1,228 @@
+
+# coding: utf-8
+
+# # 20 Newsgroups text classification with pre-trained word embeddings
+#
+# In this script, we'll use pre-trained [GloVe word embeddings]
+# (http://nlp.stanford.edu/projects/glove/) for text classification
+# using Keras (version $\ge$ 2 is required). This script is largely
+# based on the blog post [Using pre-trained word embeddings in a Keras
+# model]
+# (https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html)
+# by François Chollet.
+#
+# **Note that using a GPU with this script is highly recommended.**
+#
+# First, the needed imports. Keras tells us which backend (Theano,
+# Tensorflow, CNTK) it will be using.
+ +from keras.preprocessing import sequence, text +from keras.models import Sequential +from keras.layers import Dense, Dropout +from keras.layers import Embedding +from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D +from keras.layers import LSTM, CuDNNLSTM +from keras.utils import to_categorical +from keras.callbacks import LambdaCallback + +from distutils.version import LooseVersion as LV +from keras import __version__ +from keras import backend as K + +from sklearn.model_selection import train_test_split + +import argparse +import json + +import os +import sys + +import numpy as np + +print('Using Keras version:', __version__, 'backend:', K.backend()) +assert(LV(__version__) >= LV("2.0.0")) + +def main(settings): + + # ## GloVe word embeddings + # + # Let's begin by loading a datafile containing pre-trained word + # embeddings. The datafile contains 100-dimensional embeddings for + # 400,000 English words. + + GLOVE_DIR = "/valohai/inputs/dataset/" + + print('Indexing word vectors.') + + embeddings_index = {} + with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f: + for line in f: + values = line.split() + word = values[0] + coefs = np.asarray(values[1:], dtype='float32') + embeddings_index[word] = coefs + + print('Found %s word vectors.' % len(embeddings_index)) + + # ## 20 Newsgroups data set + # + # Next we'll load the [20 Newsgroups] + # (http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html) + # data set. + # + # The dataset contains 20000 messages collected from 20 different + # Usenet newsgroups (1000 messages from each group): + # + # alt.atheism | soc.religion.christian | comp.windows.x | sci.crypt + # talk.politics.guns | comp.sys.ibm.pc.hardware | rec.autos | sci.electronics + # talk.politics.mideast | comp.graphics | rec.motorcycles | sci.space + # talk.politics.misc | comp.os.ms-windows.misc | rec.sport.baseball | sci.med + # talk.religion.misc | comp.sys.mac.hardware | rec.sport.hockey | misc.forsale + + TEXT_DATA_DIR = "/valohai/inputs/dataset/20_newsgroup" + + print('Processing text dataset') + + texts = [] # list of text samples + labels_index = {} # dictionary mapping label name to numeric id + labels = [] # list of label ids + for name in sorted(os.listdir(TEXT_DATA_DIR)): + path = os.path.join(TEXT_DATA_DIR, name) + if os.path.isdir(path): + label_id = len(labels_index) + labels_index[name] = label_id + for fname in sorted(os.listdir(path)): + if fname.isdigit(): + fpath = os.path.join(path, fname) + args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'} + with open(fpath, **args) as f: + t = f.read() + i = t.find('\n\n') # skip header + if 0 < i: + t = t[i:] + texts.append(t) + labels.append(label_id) + + print('Found %s texts.' % len(texts)) + + # Vectorize the text samples into a 2D integer tensor. + + MAX_NUM_WORDS = 10000 + MAX_SEQUENCE_LENGTH = 1000 + + tokenizer = text.Tokenizer(num_words=MAX_NUM_WORDS) + tokenizer.fit_on_texts(texts) + sequences = tokenizer.texts_to_sequences(texts) + + word_index = tokenizer.word_index + print('Found %s unique tokens.' 
% len(word_index)) + + data = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) + + labels = to_categorical(np.asarray(labels)) + print('Shape of data tensor:', data.shape) + print('Shape of label tensor:', labels.shape) + + # Split the data into a training set and a validation set + + VALIDATION_SET, TEST_SET = 1000, 4000 + + x_train, x_test, y_train, y_test = train_test_split(data, labels, + test_size=TEST_SET, + shuffle=True, + random_state=42) + + x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, + test_size=VALIDATION_SET, + shuffle=False) + + print('Shape of training data tensor:', x_train.shape) + print('Shape of training label tensor:', y_train.shape) + print('Shape of validation data tensor:', x_val.shape) + print('Shape of validation label tensor:', y_val.shape) + print('Shape of test data tensor:', x_test.shape) + print('Shape of test label tensor:', y_test.shape) + + # Prepare the embedding matrix: + + print('Preparing embedding matrix.') + + num_words = min(MAX_NUM_WORDS, len(word_index) + 1) + embedding_dim = 100 + + embedding_matrix = np.zeros((num_words, embedding_dim)) + for word, i in word_index.items(): + if i >= MAX_NUM_WORDS: + continue + embedding_vector = embeddings_index.get(word) + if embedding_vector is not None: + # words not found in embedding index will be all-zeros. + embedding_matrix[i] = embedding_vector + + print('Shape of embedding matrix:', embedding_matrix.shape) + + # ### Initialization + + print('Build model...') + model = Sequential() + + model.add(Embedding(num_words, + embedding_dim, + weights=[embedding_matrix], + input_length=MAX_SEQUENCE_LENGTH, + trainable=False)) + #model.add(Dropout(0.2)) + + model.add(Conv1D(128, 5, activation='relu')) + model.add(MaxPooling1D(5)) + model.add(Conv1D(128, 5, activation='relu')) + model.add(MaxPooling1D(5)) + model.add(Conv1D(128, 5, activation='relu')) + model.add(GlobalMaxPooling1D()) + + model.add(Dense(128, activation='relu')) + model.add(Dense(20, activation='softmax')) + + model.compile(loss='categorical_crossentropy', + optimizer='rmsprop', + metrics=['accuracy']) + + print(model.summary()) + + # ### Learning + + epochs = settings.epochs + batch_size=128 + + json_logging_callback = LambdaCallback( + on_epoch_end=lambda epoch, logs: print(json.dumps({ + "epoch": epoch, + "loss": logs["loss"], + "acc": logs["acc"], + "val_loss": logs["val_loss"], + "val_acc": logs["val_acc"], + })), + ) + + history = model.fit(x_train, y_train, + batch_size=batch_size, + epochs=epochs, + validation_data=(x_val, y_val), + verbose=2, + callbacks=[json_logging_callback]) + + model.save('/valohai/outputs/20ng-cnn.h5') + + # ### Inference + + if settings.inference: + print('Evaluating model...') + scores = model.evaluate(x_test, y_test, verbose=2) + print("Test set %s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--epochs', type=int, default=20) + parser.add_argument('--inference', type=int, default=1) + settings = parser.parse_args() + main(settings) diff --git a/machine-learning-scripts/valohai/keras-20ng-rnn.py b/machine-learning-scripts/valohai/keras-20ng-rnn.py new file mode 100644 index 0000000..dbecd29 --- /dev/null +++ b/machine-learning-scripts/valohai/keras-20ng-rnn.py @@ -0,0 +1,221 @@ + +# coding: utf-8 + +# # 20 Newsgroups text classification with pre-trained word embeddings +# +# In this script, we'll use pre-trained [GloVe word embeddings] +# (http://nlp.stanford.edu/projects/glove/) for 
text classification +# using Keras (version $\ge$ 2 is required). This script is largely +# based on the blog post [Using pre-trained word embeddings in a Keras +# model] +# (https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html) +# by François Chollet. +# +# **Note that using a GPU with this script is highly recommended.** +# +# First, the needed imports. Keras tells us which backend (Theano, +# Tensorflow, CNTK) it will be using. + +from keras.preprocessing import sequence, text +from keras.models import Sequential +from keras.layers import Dense, Dropout +from keras.layers import Embedding +from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D +from keras.layers import LSTM, CuDNNLSTM +from keras.utils import to_categorical +from keras.callbacks import LambdaCallback + +from distutils.version import LooseVersion as LV +from keras import __version__ +from keras import backend as K + +from sklearn.model_selection import train_test_split + +import argparse +import json + +import os +import sys + +import numpy as np + +print('Using Keras version:', __version__, 'backend:', K.backend()) +assert(LV(__version__) >= LV("2.0.0")) + +def main(settings): + + # ## GloVe word embeddings + # + # Let's begin by loading a datafile containing pre-trained word + # embeddings. The datafile contains 100-dimensional embeddings for + # 400,000 English words. + + GLOVE_DIR = "/valohai/inputs/dataset/" + + print('Indexing word vectors.') + + embeddings_index = {} + with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f: + for line in f: + values = line.split() + word = values[0] + coefs = np.asarray(values[1:], dtype='float32') + embeddings_index[word] = coefs + + print('Found %s word vectors.' % len(embeddings_index)) + + # ## 20 Newsgroups data set + # + # Next we'll load the [20 Newsgroups] + # (http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html) + # data set. + # + # The dataset contains 20000 messages collected from 20 different + # Usenet newsgroups (1000 messages from each group): + # + # alt.atheism | soc.religion.christian | comp.windows.x | sci.crypt + # talk.politics.guns | comp.sys.ibm.pc.hardware | rec.autos | sci.electronics + # talk.politics.mideast | comp.graphics | rec.motorcycles | sci.space + # talk.politics.misc | comp.os.ms-windows.misc | rec.sport.baseball | sci.med + # talk.religion.misc | comp.sys.mac.hardware | rec.sport.hockey | misc.forsale + + TEXT_DATA_DIR = "/valohai/inputs/dataset/20_newsgroup" + + print('Processing text dataset') + + texts = [] # list of text samples + labels_index = {} # dictionary mapping label name to numeric id + labels = [] # list of label ids + for name in sorted(os.listdir(TEXT_DATA_DIR)): + path = os.path.join(TEXT_DATA_DIR, name) + if os.path.isdir(path): + label_id = len(labels_index) + labels_index[name] = label_id + for fname in sorted(os.listdir(path)): + if fname.isdigit(): + fpath = os.path.join(path, fname) + args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'} + with open(fpath, **args) as f: + t = f.read() + i = t.find('\n\n') # skip header + if 0 < i: + t = t[i:] + texts.append(t) + labels.append(label_id) + + print('Found %s texts.' % len(texts)) + + # Vectorize the text samples into a 2D integer tensor. + + MAX_NUM_WORDS = 10000 + MAX_SEQUENCE_LENGTH = 1000 + + tokenizer = text.Tokenizer(num_words=MAX_NUM_WORDS) + tokenizer.fit_on_texts(texts) + sequences = tokenizer.texts_to_sequences(texts) + + word_index = tokenizer.word_index + print('Found %s unique tokens.' 
% len(word_index)) + + data = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) + + labels = to_categorical(np.asarray(labels)) + print('Shape of data tensor:', data.shape) + print('Shape of label tensor:', labels.shape) + + # Split the data into a training set and a validation set + + VALIDATION_SET, TEST_SET = 1000, 4000 + + x_train, x_test, y_train, y_test = train_test_split(data, labels, + test_size=TEST_SET, + shuffle=True, random_state=42) + + x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, + test_size=VALIDATION_SET, + shuffle=False) + + print('Shape of training data tensor:', x_train.shape) + print('Shape of training label tensor:', y_train.shape) + print('Shape of validation data tensor:', x_val.shape) + print('Shape of validation label tensor:', y_val.shape) + print('Shape of test data tensor:', x_test.shape) + print('Shape of test label tensor:', y_test.shape) + + # Prepare the embedding matrix: + + print('Preparing embedding matrix.') + + num_words = min(MAX_NUM_WORDS, len(word_index) + 1) + embedding_dim = 100 + + embedding_matrix = np.zeros((num_words, embedding_dim)) + for word, i in word_index.items(): + if i >= MAX_NUM_WORDS: + continue + embedding_vector = embeddings_index.get(word) + if embedding_vector is not None: + # words not found in embedding index will be all-zeros. + embedding_matrix[i] = embedding_vector + + print('Shape of embedding matrix:', embedding_matrix.shape) + + # ### Initialization + + print('Build model...') + model = Sequential() + + model.add(Embedding(num_words, + embedding_dim, + weights=[embedding_matrix], + input_length=MAX_SEQUENCE_LENGTH, + trainable=False)) + #model.add(Dropout(0.2)) + + model.add(CuDNNLSTM(128)) + + model.add(Dense(128, activation='relu')) + model.add(Dense(20, activation='softmax')) + + model.compile(loss='categorical_crossentropy', + optimizer='rmsprop', + metrics=['accuracy']) + + print(model.summary()) + + # ### Learning + + epochs = settings.epochs + batch_size=128 + + json_logging_callback = LambdaCallback( + on_epoch_end=lambda epoch, logs: print(json.dumps({ + "epoch": epoch, + "loss": logs["loss"], + "acc": logs["acc"], + "val_loss": logs["val_loss"], + "val_acc": logs["val_acc"], + })), + ) + + history = model.fit(x_train, y_train, + batch_size=batch_size, + epochs=epochs, + validation_data=(x_val, y_val), + verbose=2, callbacks=[json_logging_callback]) + + model.save('/valohai/outputs/20ng-rnn.h5') + + # ### Inference + + if settings.inference: + print('Evaluating model...') + scores = model.evaluate(x_test, y_test, verbose=2) + print("Test set %s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--epochs', type=int, default=20) + parser.add_argument('--inference', type=int, default=1) + settings = parser.parse_args() + main(settings) diff --git a/machine-learning-scripts/valohai/keras-dvc-cnn-predict.py b/machine-learning-scripts/valohai/keras-dvc-cnn-predict.py new file mode 100644 index 0000000..2c4fd2a --- /dev/null +++ b/machine-learning-scripts/valohai/keras-dvc-cnn-predict.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 + +import argparse +import json +import os +import requests +import tempfile + +import keras +from keras.models import load_model +from keras.preprocessing.image import ImageDataGenerator + +tmp_dir = '/tmp' + + +def download_url(url, target_dir): + suffix = os.path.splitext(url)[1] + fp = tempfile.NamedTemporaryFile(dir=target_dir, suffix=suffix, delete=False) + r = 
requests.get(url, allow_redirects=True) + fp.write(r.content) + fp.close() + return fp.name + + +def main(args): + model = load_model(args.model) + # print(model.summary()) + print('Loaded model [{}] with {} layers.\n'.format(args.model, len(model.layers))) + + td = tempfile.TemporaryDirectory(dir=tmp_dir) + image_dir = td.name + target_dir = os.path.join(image_dir, 'a') + os.makedirs(target_dir) + + print('Downloading images from [{}] to [{}].'.format(args.urls_file, target_dir)) + file_to_url = {} + with open(args.urls_file, 'r') as fp: + for url in fp: + url = url.rstrip() + fn = os.path.basename(download_url(url, target_dir)) + file_to_url[fn] = url + # print('*', url) + + print() + input_image_size = (150, 150) + noopgen = ImageDataGenerator(rescale=1./255) + batch_size = 25 + + test_generator = noopgen.flow_from_directory( + image_dir, + target_size=input_image_size, + batch_size=batch_size, + class_mode=None, + shuffle=False) + + preds = model.predict_generator(test_generator, + steps=len(file_to_url) // batch_size + 1, + use_multiprocessing=False, + workers=4, + verbose=1) + + print() + filenames = test_generator.filenames + for i, p in enumerate(preds): + pn = p[0] + url = file_to_url[os.path.basename(filenames[i])] + cls = 'cat' if pn < 0.5 else 'dog' + print(json.dumps({'url': url, 'value': float(pn), 'class': cls})) + + td.cleanup() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('urls_file', type=str) + parser.add_argument('--model', type=str, default='dvc-vgg16-finetune.h5') + args = parser.parse_args() + + print('Using Keras version:', keras.__version__) + print() + main(args) diff --git a/machine-learning-scripts/valohai/keras-dvc-cnn-pretrained.py b/machine-learning-scripts/valohai/keras-dvc-cnn-pretrained.py new file mode 100644 index 0000000..b577334 --- /dev/null +++ b/machine-learning-scripts/valohai/keras-dvc-cnn-pretrained.py @@ -0,0 +1,245 @@ + +# coding: utf-8 + +# # Dogs-vs-cats classification with CNNs +# +# In this script, we'll train a convolutional neural network (CNN, +# ConvNet) to classify images of dogs from images of cats using Keras +# (version $\ge$ 2 is required). This script is largely based on the +# blog post [Building powerful image classification models using very +# little data] +# (https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html) +# by François Chollet. +# +# **Note that using a GPU with this script is highly recommended.** +# +# First, the needed imports. Keras tells us which backend (Theano, +# Tensorflow, CNTK) it will be using. + +from keras.models import Sequential +from keras.layers import Dense, Activation, Dropout, Flatten, MaxPooling2D +from keras.layers import InputLayer +# from keras.layers.convolutional import Conv2D +# from keras.preprocessing.image import (ImageDataGenerator, array_to_img, +# img_to_array, load_img) +from keras.preprocessing.image import ImageDataGenerator +from keras import applications, optimizers + +# from keras.utils import np_utils +from keras import backend as K + +from distutils.version import LooseVersion as LV +from keras import __version__ + +import argparse + + +def main(settings): + print('Using Keras version:', __version__, 'backend:', K.backend()) + assert(LV(__version__) >= LV("2.0.0")) + + # If we are using TensorFlow as the backend, we can use TensorBoard to + # visualize our progress during training. 
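As a rough sketch of how the TensorBoard logging described here could be switched back on (it is left commented out below, with callbacks set to None), assuming the TensorFlow backend is in use and a local logs/ directory is writable, something along these lines would mirror the commented-out code:

import datetime
import os
from keras.callbacks import TensorBoard

# Create a per-run log directory and pass a TensorBoard callback to training.
logdir = os.path.join(os.getcwd(), "logs",
                      "dvc-pretrained-" + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
os.makedirs(logdir)
callbacks = [TensorBoard(log_dir=logdir)]  # later handed to fit_generator(..., callbacks=callbacks)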
+ + # if K.backend() == "tensorflow": + # import tensorflow as tf + # from keras.callbacks import TensorBoard + # import os, datetime + # logdir = os.path.join(os.getcwd(), "logs", + # "dvc-pretrained-"+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + # print('TensorBoard log directory:', logdir) + # os.makedirs(logdir) + # callbacks = [TensorBoard(log_dir=logdir)] + # else: + callbacks = None + + # ## Data + # + # The training dataset consists of 2000 images of dogs and cats, split + # in half. In addition, the validation set consists of 1000 images, + # and the test set of 22000 images. + + datapath = '/valohai/inputs/dataset/dogs-vs-cats/train-2000' + #datapath = "/wrk/makoskel/dogs-vs-cats/train-2000" + (nimages_train, nimages_validation, nimages_test) = (2000, 1000, 22000) + + # ### Data augmentation + # + # First, we'll resize all training and validation images to a fized size. + # + # Then, to make the most of our limited number of training examples, + # we'll apply random transformations to them each time we are looping + # over them. This way, we "augment" our training dataset to contain + # more data. There are various transformations readily available in + # Keras, see [ImageDataGenerator] + # (https://keras.io/preprocessing/image/) for more information. + + input_image_size = (150, 150) + + datagen = ImageDataGenerator( + rescale=1./255, + shear_range=0.2, + zoom_range=0.2, + #rotation_range=40, + #width_shift_range=0.2, + #height_shift_range=0.2, + horizontal_flip=True) + + noopgen = ImageDataGenerator(rescale=1./255) + + # Let's put a couple of training images with the augmentation to a + # TensorBoard event file. + + # augm_generator = datagen.flow_from_directory( + # datapath+'/train', + # target_size=input_image_size, + # batch_size=10) + + # for batch, _ in augm_generator: + # break + + # if K.backend() == "tensorflow": + # imgs = tf.convert_to_tensor(batch) + # summary_op = tf.summary.image("augmented", imgs, max_outputs=10) + # with tf.Session() as sess: + # summary = sess.run(summary_op) + # writer = tf.summary.FileWriter(logdir) + # writer.add_summary(summary) + # writer.close() + + # ### Data loaders + # + # Let's now define our real data loaders for training and validation data. + + batch_size = 25 + + print('Train: ', end="") + train_generator = datagen.flow_from_directory( + datapath+'/train', + target_size=input_image_size, + batch_size=batch_size, + class_mode='binary') + + print('Validation: ', end="") + validation_generator = noopgen.flow_from_directory( + datapath+'/validation', + target_size=input_image_size, + batch_size=batch_size, + class_mode='binary') + + print('Test: ', end="") + test_generator = noopgen.flow_from_directory( + datapath+'/test', + target_size=input_image_size, + batch_size=batch_size, + class_mode='binary') + + # We now reuse a pretrained network. Here we'll use the + # [VGG16](https://keras.io/applications/#vgg16) network architecture + # with weights learned using Imagenet. We remove the top layers and + # freeze the pre-trained weights. + # + # ### Initialization + + model = Sequential() + model.add(InputLayer(input_shape=input_image_size+(3,))) # possibly needed due to a bug in Keras + + vgg_model = applications.VGG16(weights='imagenet', + include_top=False, + input_shape=input_image_size+(3,)) + for layer in vgg_model.layers: + model.add(layer) + + for layer in model.layers: + layer.trainable = False + + print(model.summary()) + + # We then stack our own, randomly initialized layers on top of the + # VGG16 network. 
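Before the new layers are added (the code continues right below), a quick sanity check that the reused VGG16 weights really are frozen can be done with a small sketch like this:

# List each layer and whether its weights will be updated during training.
for layer in model.layers:
    print(layer.name, 'trainable:', layer.trainable)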
+ + model.add(Flatten()) + model.add(Dense(64, activation='relu')) + #model.add(Dropout(0.5)) + model.add(Dense(1, activation='sigmoid')) + + model.compile(loss='binary_crossentropy', + optimizer='rmsprop', + metrics=['accuracy']) + + print(model.summary()) + + # ### Learning 1: New layers + + epochs = settings.epochs + workers = 4 + use_multiprocessing = False + + print('Training for', epochs, 'epochs with', workers, + 'workers, use_multiprocessing is', use_multiprocessing) + + history = model.fit_generator(train_generator, + steps_per_epoch=nimages_train // batch_size, + epochs=epochs, + validation_data=validation_generator, + validation_steps=nimages_validation // batch_size, + verbose=2, callbacks=callbacks, + use_multiprocessing=use_multiprocessing, + workers=workers) + + fname = "/valohai/outputs/dvc-vgg16-reuse.h5" + print('Saving model to', fname) + model.save(fname) + + # ### Learning 2: Fine-tuning + # + # Once the top layers have learned some reasonable weights, we can + # continue training by unfreezing the last convolution block of VGG16 + # (`block5`) so that it may adapt to our data. The learning rate + # should be smaller than usual. + + for layer in model.layers[15:]: + layer.trainable = True + print(layer.name, "now trainable") + + model.compile(loss='binary_crossentropy', + optimizer=optimizers.RMSprop(lr=1e-5), + metrics=['accuracy']) + + print(model.summary()) + + # Note that before continuing the training, we create a separate + # TensorBoard log directory: + + epochs_ft = 10 + + # if K.backend() == "tensorflow": + # logdir_ft = logdir + "-ft" + # os.makedirs(logdir_ft) + # callbacks_ft = [TensorBoard(log_dir=logdir_ft)] + # else: + callbacks_ft = None + + print('Finetuning for', epochs_ft, 'epochs with', workers, + 'workers, use_multiprocessing is', use_multiprocessing) + + history = model.fit_generator(train_generator, + steps_per_epoch=nimages_train // batch_size, + epochs=epochs_ft, + validation_data=validation_generator, + validation_steps=nimages_validation // batch_size, + verbose=2, callbacks=callbacks_ft, + use_multiprocessing=use_multiprocessing, + workers=workers) + + fname_ft = "/valohai/outputs/dvc-vgg16-finetune.h5" + print('Saving finetuned model to', fname_ft) + model.save(fname_ft) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--epochs', type=int, default=10) + parser.add_argument('--inference', type=bool, default=False) + settings = parser.parse_args() + main(settings) diff --git a/machine-learning-scripts/valohai/keras-dvc-cnn-simple.py b/machine-learning-scripts/valohai/keras-dvc-cnn-simple.py new file mode 100644 index 0000000..efec1e8 --- /dev/null +++ b/machine-learning-scripts/valohai/keras-dvc-cnn-simple.py @@ -0,0 +1,206 @@ +# coding: utf-8 + +# # Dogs-vs-cats classification with CNNs +# +# In this script, we'll train a convolutional neural network (CNN, +# ConvNet) to classify images of dogs from images of cats using Keras +# (version $\ge$ 2 is required). This script is largely based on the +# blog post [Building powerful image classification models using very +# little data] +# (https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html) +# by François Chollet. +# +# **Note that using a GPU with this script is highly recommended.** +# +# First, the needed imports. Keras tells us which backend (Theano, +# Tensorflow, CNTK) it will be using. 
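Since the comment above strongly recommends a GPU, a minimal sketch for checking that one is actually visible, assuming the TensorFlow backend (device_lib ships with TensorFlow itself), is:

# Optional sanity check: list the GPUs TensorFlow can see.
from tensorflow.python.client import device_lib

gpus = [d.name for d in device_lib.list_local_devices() if d.device_type == 'GPU']
print('Visible GPUs:', gpus if gpus else 'none')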
+import argparse +import json +import os + +from keras.callbacks import LambdaCallback +from keras.models import Sequential +from keras.layers import Dense, Activation, Dropout, Flatten, MaxPooling2D +from keras.layers.convolutional import Conv2D +from keras.preprocessing.image import (ImageDataGenerator, array_to_img, + img_to_array, load_img) +from keras import applications, optimizers + +from keras.utils import np_utils +from keras import backend as K + +from distutils.version import LooseVersion as LV +from keras import __version__ + +import numpy as np + + +def main(settings): + print('Using Keras version:', __version__, 'backend:', K.backend()) + assert (LV(__version__) >= LV("2.0.0")) + + # If we are using TensorFlow as the backend, we can use TensorBoard to + # visualize our progress during training. + + # if K.backend() == "tensorflow": + # import tensorflow as tf + # from keras.callbacks import TensorBoard + # import os, datetime + # logdir = os.path.join(os.getcwd(), "logs", + # "dvc-simple-"+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) + # print('TensorBoard log directory:', logdir) + # os.makedirs(logdir) + # callbacks = [TensorBoard(log_dir=logdir)] + # else: + # callbacks = None + + # ## Data + # + # The training dataset consists of 2000 images of dogs and cats, split + # in half. In addition, the validation set consists of 1000 images, + # and the test set of 22000 images. + + datapath = '/valohai/inputs/dataset/dogs-vs-cats/train-2000' + (nimages_train, nimages_validation, nimages_test) = (2000, 1000, 22000) + + # ### Data augmentation + # + # First, we'll resize all training and validation images to a fized size. + # + # Then, to make the most of our limited number of training examples, + # we'll apply random transformations to them each time we are looping + # over them. This way, we "augment" our training dataset to contain + # more data. There are various transformations readily available in + # Keras, see [ImageDataGenerator] + # (https://keras.io/preprocessing/image/) for more information. + + input_image_size = (150, 150) + + datagen = ImageDataGenerator( + rescale=1. / 255, + shear_range=0.2, + zoom_range=0.2, + # rotation_range=40, + # width_shift_range=0.2, + # height_shift_range=0.2, + horizontal_flip=True) + + noopgen = ImageDataGenerator(rescale=1. / 255) + + # Let's put a couple of training images with the augmentation to a + # TensorBoard event file. + + augm_generator = datagen.flow_from_directory( + datapath + '/train', + target_size=input_image_size, + batch_size=10) + + for batch, _ in augm_generator: + break + + # if K.backend() == "tensorflow": + # imgs = tf.convert_to_tensor(batch) + # summary_op = tf.summary.image("augmented", imgs, max_outputs=10) + # with tf.Session() as sess: + # summary = sess.run(summary_op) + # writer = tf.summary.FileWriter(logdir) + # writer.add_summary(summary) + # writer.close() + + # ### Data loaders + # + # Let's now define our real data loaders for training and validation data. 
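Before the data loaders are defined below: since the tf.summary-based image logging above is commented out, a simpler way to eyeball the augmented sample batch produced above is to dump it to image files. A rough sketch, using array_to_img which this script already imports:

# Illustrative only: write the augmented batch to PNG files for a visual check.
for i, img_array in enumerate(batch):
    array_to_img(img_array).save('augmented_{:02d}.png'.format(i))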
+ + batch_size = 25 + + print('Train: ', end="") + train_generator = datagen.flow_from_directory( + datapath + '/train', + target_size=input_image_size, + batch_size=batch_size, + class_mode='binary') + + print('Validation: ', end="") + validation_generator = noopgen.flow_from_directory( + datapath + '/validation', + target_size=input_image_size, + batch_size=batch_size, + class_mode='binary') + + print('Test: ', end="") + test_generator = noopgen.flow_from_directory( + datapath + '/test', + target_size=input_image_size, + batch_size=batch_size, + class_mode='binary') + + # Similarly as with MNIST digits, we can start from scratch and train + # a CNN for the classification task. However, due to the small number + # of training images, a large network will easily overfit, regardless + # of the data augmentation. + # + # ### Initialization + + model = Sequential() + + model.add(Conv2D(32, (3, 3), input_shape=input_image_size + (3,), activation='relu')) + model.add(MaxPooling2D(pool_size=(2, 2))) + + model.add(Conv2D(32, (3, 3), activation='relu')) + model.add(MaxPooling2D(pool_size=(2, 2))) + + model.add(Conv2D(64, (3, 3), activation='relu')) + model.add(MaxPooling2D(pool_size=(2, 2))) + + model.add(Flatten()) + model.add(Dense(64, activation='relu')) + model.add(Dropout(0.5)) + model.add(Dense(1, activation='sigmoid')) + + model.compile(loss='binary_crossentropy', + optimizer='rmsprop', + metrics=['accuracy']) + + print(model.summary()) + + json_logging_callback = LambdaCallback( + on_epoch_end=lambda epoch, logs: print(json.dumps({ + "epoch": epoch, + "loss": logs["loss"], + "acc": logs["acc"], + "val_loss": logs["val_loss"], + "val_acc": logs["val_acc"], + })), + ) + + # ### Learning + + history = model.fit_generator(train_generator, + steps_per_epoch=nimages_train // batch_size, + epochs=settings.epochs, + validation_data=validation_generator, + validation_steps=nimages_validation // batch_size, + verbose=2, + callbacks=[json_logging_callback], + use_multiprocessing=True, + workers=4) + + model.save('/valohai/outputs/dvc-small-cnn.h5') + + # ### Inference + + if settings.inference: + print('Evaluating model...') + scores = model.evaluate_generator(test_generator, + steps=nimages_test // batch_size, + use_multiprocessing=True, + workers=4) + print("Test set %s: %.2f%%" % (model.metrics_names[1], scores[1] * 100)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--epochs', type=int, default=20) + parser.add_argument('--inference', type=int, default=0) + settings = parser.parse_args() + main(settings) diff --git a/machine-learning-scripts/valohai/keras-sfnet-cnn.py b/machine-learning-scripts/valohai/keras-sfnet-cnn.py new file mode 100644 index 0000000..16ea596 --- /dev/null +++ b/machine-learning-scripts/valohai/keras-sfnet-cnn.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 + +from keras.preprocessing import sequence, text +from keras.models import Sequential +from keras.layers import Dense # Dropout +from keras.layers import Embedding +from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D +# from keras.layers import CuDNNLSTM +from keras.utils import to_categorical + +from distutils.version import LooseVersion as LV +from keras import __version__ +from keras import backend as K + +# from IPython.display import SVG +# from keras.utils.vis_utils import model_to_dot + +from sklearn.model_selection import train_test_split +from sklearn.metrics import confusion_matrix + +from tqdm import tqdm + +import os +import gzip +import re +import 
pickle + +import numpy as np +# import matplotlib.pyplot as plt +# import seaborn as sns + +print('Using Keras version:', __version__, 'backend:', K.backend()) +assert(LV(__version__) >= LV("2.0.0")) + +FASTTEXT_FILE = "/valohai/inputs/embedding/cc.fi.300.vec.gz" +TEXT_DATA_DIR = "/valohai/inputs/dataset/sfnet2007-2008/raw_texts/" + + +# if K.backend() == "tensorflow": +# # import tensorflow as tf +# from keras.callbacks import TensorBoard +# import datetime +# logdir = os.path.join(os.getcwd(), "logs", +# "sfnet-"+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) +# print('TensorBoard log directory:', logdir) +# os.makedirs(logdir) +# callbacks = [TensorBoard(log_dir=logdir)] +# else: +callbacks = None + + +# Finnish word embeddings +# +# TODO: try also +# http://bionlp.utu.fi/finnish-internet-parsebank.html ? +# + +pickle_name = 'fasttext.cc.fi.300.pickle' + +if os.path.isfile(pickle_name): + with open(pickle_name, 'rb') as f: + embeddings_index = pickle.load(f) + print('Loaded word vectors from {}.'.format(pickle_name)) +else: + print('Indexing word vectors.') + + embeddings_index = {} + + with gzip.open(FASTTEXT_FILE, 'rt', encoding='utf-8') as f: + num_lines, dim = (int(x) for x in f.readline().rstrip().split()) + print('{} has {} words with {}-dimensional embeddings.'.format( + os.path.basename(FASTTEXT_FILE), num_lines, dim)) + + for line in tqdm(f, total=num_lines): + values = line.split() + word = values[0] + coefs = np.asarray(values[1:], dtype='float32') + assert coefs.shape[0] == dim + embeddings_index[word] = coefs + + assert len(embeddings_index) == num_lines + + # with open(pickle_name, 'wb') as f: + # # Pickle the 'data' dictionary using the highest protocol available. + # pickle.dump(embeddings_index, f, pickle.HIGHEST_PROTOCOL) + +# FASTTEXT_FILE = "/media/data/yle-embeddings/fasttext_fin.csv.gz" + +# with gzip.open(FASTTEXT_FILE, 'rt', encoding='utf-8') as f: +# f.readline() +# # num_lines, dim = (int(x) for x in f.readline().rstrip().split()) +# # print('{} has {} words with {}-dimensional embeddings.'.format( +# # os.path.basename(FASTTEXT_FILE), num_lines, dim)) + +# for line in tqdm(f, total=880327): +# values = line.split(',') +# word = values[0] +# coefs = np.asarray(values[1:], dtype='float32') +# assert coefs.shape[0] == 100 +# embeddings_index[word] = coefs + +# # assert len(embeddings_index) == num_lines + +# print('Loaded {} embeddings'.format(len(embeddings_index))) +# print('Examples of embeddings:') +# for w in ['jotain', 'satunnaisia', 'sanoja']: +# print(w, embeddings_index[w]) + +print('Examples of embeddings:') +for w in ['jotain', 'satunnaisia', 'sanoja']: + print(w, embeddings_index[w]) + +# SFNet data set + +print('Processing text dataset') + +texts = [] # list of text samples +labels_index = {} # dictionary mapping label name to numeric id +labels = [] # list of label ids +for name in sorted(os.listdir(TEXT_DATA_DIR)): + path = os.path.join(TEXT_DATA_DIR, name) + if os.path.isdir(path): + label_id = len(labels_index) + labels_index[name] = label_id + print(name, label_id) + for fname in sorted(os.listdir(path)): + print('*', fname) + if fname.endswith('.gz'): + fpath = os.path.join(path, fname) + with gzip.open(fpath, 'rt', encoding='latin-1') as f: + header = True # keep track if we are in header area, or in message + t = '' # accumulate current message into t + prev_line = None + for line in f: + m = re.match(r'^([a-zA-Z]+): (.*)$', line) + if m and m.group(1) in ['Path', 'Subject', 'From', 'Newsgroups']: + # yes, we are definitely inside a 
header now... + header = True + if t != '': # if we have accumulated text, we save it + texts.append(t) + labels.append(label_id) + t = '' + continue + # empty line indicates end of headers + if line == '\n' and header: + header = False + continue + + # if not a header, accumulate line to text in t + if not header: + t += line + + prev_line = line + + if t != '': # store also the last message + texts.append(t) + labels.append(label_id) + +print('Found %s texts.' % len(texts)) + +# First message and its label: + +print(texts[0]) +print('label:', labels[0], labels_index) + +# Vectorize the text samples into a 2D integer tensor. + +MAX_NUM_WORDS = 10000 +MAX_SEQUENCE_LENGTH = 1000 + +tokenizer = text.Tokenizer(num_words=MAX_NUM_WORDS) +tokenizer.fit_on_texts(texts) +sequences = tokenizer.texts_to_sequences(texts) + +word_index = tokenizer.word_index +print('Found %s unique tokens.' % len(word_index)) + +#with gzip.open('/valohai/outputs/tokenizer_sfnet.json.gz', 'wt', encoding='utf-8') as f: +# f.write(tokenizer.to_json()) +with open('/valohai/outputs/tokenizer_sfnet.pkl', 'wb') as f: + pickle.dump(tokenizer, f, pickle.HIGHEST_PROTOCOL) + + +data = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) + +labels = to_categorical(np.asarray(labels)) +print('Shape of data tensor:', data.shape) +print('Shape of label tensor:', labels.shape) + + +# Split the data into a training set and a validation set + +VALIDATION_SET, TEST_SET = 1000, 4000 + +x_train, x_test, y_train, y_test = train_test_split(data, labels, + test_size=TEST_SET, + shuffle=True, random_state=42) + +x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, + test_size=VALIDATION_SET, + shuffle=False) + +print('Shape of training data tensor:', x_train.shape) +print('Shape of training label tensor:', y_train.shape) +print('Shape of validation data tensor:', x_val.shape) +print('Shape of validation label tensor:', y_val.shape) +print('Shape of test data tensor:', x_test.shape) +print('Shape of test label tensor:', y_test.shape) + + +# Prepare the embedding matrix: + +print('Preparing embedding matrix.') + +num_words = min(MAX_NUM_WORDS, len(word_index) + 1) +embedding_dim = 300 +not_found = 0 + +embedding_matrix = np.zeros((num_words, embedding_dim)) +for word, i in word_index.items(): + if i >= MAX_NUM_WORDS: + continue + embedding_vector = embeddings_index.get(word) + if embedding_vector is not None: + # words not found in embedding index will be all-zeros. 
+ embedding_matrix[i] = embedding_vector + else: + not_found += 1 + +print('Shape of embedding matrix:', embedding_matrix.shape) +print('Number of words not found in embedding index:', not_found) + + +# 1-D CNN + +print('Build model...') +model = Sequential() + +model.add(Embedding(num_words, + embedding_dim, + weights=[embedding_matrix], + input_length=MAX_SEQUENCE_LENGTH, + trainable=False)) +# model.add(Dropout(0.2)) + +model.add(Conv1D(128, 5, activation='relu')) +model.add(MaxPooling1D(5)) +model.add(Conv1D(128, 5, activation='relu')) +model.add(MaxPooling1D(5)) +model.add(Conv1D(128, 5, activation='relu')) +model.add(GlobalMaxPooling1D()) + +model.add(Dense(128, activation='relu')) +model.add(Dense(9, activation='softmax')) + +model.compile(loss='categorical_crossentropy', + optimizer='rmsprop', + metrics=['accuracy']) + +print(model.summary()) + +# Learning + +epochs = 15 +batch_size = 128 + +history = model.fit(x_train, y_train, + batch_size=batch_size, + epochs=epochs, + validation_data=(x_val, y_val), + verbose=2, callbacks=callbacks) +model.save('/valohai/outputs/sfnet-cnn-fasttext_cc300-epochs{}.h5'.format(epochs)) + + +# Inference +scores = model.evaluate(x_test, y_test, verbose=2) +print("Test set %s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) + +# We can also look at classification accuracies separately for each +# newsgroup, and compute a confusion matrix to see which newsgroups +# get mixed the most: + +predictions = model.predict(x_test) + +cm = confusion_matrix(np.argmax(y_test, axis=1), np.argmax(predictions, axis=1), + labels=list(range(9))) + +print('Classification accuracy for each newsgroup:') +print() +labels = [l[0] for l in sorted(labels_index.items(), key=lambda x: x[1])] +for i, j in enumerate(cm.diagonal()/cm.sum(axis=1)): + print("%s: %.4f" % (labels[i].ljust(26), j)) +print() + +print('Confusion matrix (rows: true newsgroup; columns: predicted newsgroup):') +print() +np.set_printoptions(linewidth=9999) +print(cm) +print() diff --git a/machine-learning-scripts/valohai/keras-sfnet-lstm.py b/machine-learning-scripts/valohai/keras-sfnet-lstm.py new file mode 100644 index 0000000..d60be90 --- /dev/null +++ b/machine-learning-scripts/valohai/keras-sfnet-lstm.py @@ -0,0 +1,284 @@ +#!/usr/bin/env python3 + +from keras.preprocessing import sequence, text +from keras.models import Sequential +from keras.layers import Dense, Dropout +from keras.layers import Embedding +# from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D +from keras.layers import CuDNNLSTM +from keras.utils import to_categorical + +from distutils.version import LooseVersion as LV +from keras import __version__ +from keras import backend as K + +# from IPython.display import SVG +# from keras.utils.vis_utils import model_to_dot + +from sklearn.model_selection import train_test_split +from sklearn.metrics import confusion_matrix + +from tqdm import tqdm + +import os +import gzip +import re +import pickle + +import numpy as np +# import matplotlib.pyplot as plt +# import seaborn as sns + +print('Using Keras version:', __version__, 'backend:', K.backend()) +assert(LV(__version__) >= LV("2.0.0")) + +FASTTEXT_FILE = "/valohai/inputs/embedding/cc.fi.300.vec.gz" +TEXT_DATA_DIR = "/valohai/inputs/dataset/sfnet2007-2008/raw_texts/" + + +# if K.backend() == "tensorflow": +# # import tensorflow as tf +# from keras.callbacks import TensorBoard +# import datetime +# logdir = os.path.join(os.getcwd(), "logs", +# "sfnet-lstm-"+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) +# 
print('TensorBoard log directory:', logdir) +# os.makedirs(logdir) +# callbacks = [TensorBoard(log_dir=logdir)] +# else: +callbacks = None + + +# Finnish word embeddings +# +# TODO: try also +# http://bionlp.utu.fi/finnish-internet-parsebank.html ? +# + +pickle_name = 'fasttext.cc.fi.300.pickle' + +if os.path.isfile(pickle_name): + with open(pickle_name, 'rb') as f: + embeddings_index = pickle.load(f) + print('Loaded word vectors from {}.'.format(pickle_name)) +else: + print('Indexing word vectors.') + + embeddings_index = {} + + with gzip.open(FASTTEXT_FILE, 'rt', encoding='utf-8') as f: + num_lines, dim = (int(x) for x in f.readline().rstrip().split()) + print('{} has {} words with {}-dimensional embeddings.'.format( + os.path.basename(FASTTEXT_FILE), num_lines, dim)) + + for line in tqdm(f, total=num_lines): + values = line.split() + word = values[0] + coefs = np.asarray(values[1:], dtype='float32') + assert coefs.shape[0] == dim + embeddings_index[word] = coefs + + assert len(embeddings_index) == num_lines + + # with open(pickle_name, 'wb') as f: + # # Pickle the 'data' dictionary using the highest protocol available. + # pickle.dump(embeddings_index, f, pickle.HIGHEST_PROTOCOL) + +# FASTTEXT_FILE = "/media/data/yle-embeddings/fasttext_fin.csv.gz" + +# with gzip.open(FASTTEXT_FILE, 'rt', encoding='utf-8') as f: +# f.readline() +# # num_lines, dim = (int(x) for x in f.readline().rstrip().split()) +# # print('{} has {} words with {}-dimensional embeddings.'.format( +# # os.path.basename(FASTTEXT_FILE), num_lines, dim)) + +# for line in tqdm(f, total=880327): +# values = line.split(',') +# word = values[0] +# coefs = np.asarray(values[1:], dtype='float32') +# assert coefs.shape[0] == 100 +# embeddings_index[word] = coefs + +# # assert len(embeddings_index) == num_lines + +# print('Loaded {} embeddings'.format(len(embeddings_index))) +# print('Examples of embeddings:') +# for w in ['jotain', 'satunnaisia', 'sanoja']: +# print(w, embeddings_index[w]) + +print('Examples of embeddings:') +for w in ['jotain', 'satunnaisia', 'sanoja']: + print(w, embeddings_index[w]) + +# SFNet data set + +print('Processing text dataset') + +texts = [] # list of text samples +labels_index = {} # dictionary mapping label name to numeric id +labels = [] # list of label ids +for name in sorted(os.listdir(TEXT_DATA_DIR)): + path = os.path.join(TEXT_DATA_DIR, name) + if os.path.isdir(path): + label_id = len(labels_index) + labels_index[name] = label_id + print(name, label_id) + for fname in sorted(os.listdir(path)): + print('*', fname) + if fname.endswith('.gz'): + fpath = os.path.join(path, fname) + with gzip.open(fpath, 'rt', encoding='latin-1') as f: + header = True # keep track if we are in header area, or in message + t = '' # accumulate current message into t + prev_line = None + for line in f: + m = re.match(r'^([a-zA-Z]+): (.*)$', line) + if m and m.group(1) in ['Path', 'Subject', 'From', 'Newsgroups']: + # yes, we are definitely inside a header now... + header = True + if t != '': # if we have accumulated text, we save it + texts.append(t) + labels.append(label_id) + t = '' + continue + # empty line indicates end of headers + if line == '\n' and header: + header = False + continue + + # if not a header, accumulate line to text in t + if not header: + t += line + + prev_line = line + + if t != '': # store also the last message + texts.append(t) + labels.append(label_id) + +print('Found %s texts.' 
% len(texts)) + +# First message and its label: + +print(texts[0]) +print('label:', labels[0], labels_index) + +# Vectorize the text samples into a 2D integer tensor. + +MAX_NUM_WORDS = 10000 +MAX_SEQUENCE_LENGTH = 1000 + +tokenizer = text.Tokenizer(num_words=MAX_NUM_WORDS) +tokenizer.fit_on_texts(texts) +sequences = tokenizer.texts_to_sequences(texts) + +word_index = tokenizer.word_index +print('Found %s unique tokens.' % len(word_index)) + +data = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) + +labels = to_categorical(np.asarray(labels)) +print('Shape of data tensor:', data.shape) +print('Shape of label tensor:', labels.shape) + + +# Split the data into a training set and a validation set + +VALIDATION_SET, TEST_SET = 1000, 4000 + +x_train, x_test, y_train, y_test = train_test_split(data, labels, + test_size=TEST_SET, + shuffle=True, random_state=42) + +x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, + test_size=VALIDATION_SET, + shuffle=False) + +print('Shape of training data tensor:', x_train.shape) +print('Shape of training label tensor:', y_train.shape) +print('Shape of validation data tensor:', x_val.shape) +print('Shape of validation label tensor:', y_val.shape) +print('Shape of test data tensor:', x_test.shape) +print('Shape of test label tensor:', y_test.shape) + + +# Prepare the embedding matrix: + +print('Preparing embedding matrix.') + +num_words = min(MAX_NUM_WORDS, len(word_index) + 1) +embedding_dim = 300 +not_found = 0 + +embedding_matrix = np.zeros((num_words, embedding_dim)) +for word, i in word_index.items(): + if i >= MAX_NUM_WORDS: + continue + embedding_vector = embeddings_index.get(word) + if embedding_vector is not None: + # words not found in embedding index will be all-zeros. + embedding_matrix[i] = embedding_vector + else: + not_found += 1 + +print('Shape of embedding matrix:', embedding_matrix.shape) +print('Number of words not found in embedding index:', not_found) + +print('Build model...') +model = Sequential() + +model.add(Embedding(num_words, + embedding_dim, + weights=[embedding_matrix], + input_length=MAX_SEQUENCE_LENGTH, + trainable=False)) +model.add(Dropout(0.5)) + +model.add(CuDNNLSTM(128, return_sequences=True)) +model.add(CuDNNLSTM(128)) + +model.add(Dense(128, activation='relu')) +model.add(Dense(9, activation='softmax')) + +model.compile(loss='categorical_crossentropy', + optimizer='rmsprop', + metrics=['accuracy']) + +print(model.summary()) + + +# Learning + +epochs = 20 +batch_size = 128 + +history = model.fit(x_train, y_train, + batch_size=batch_size, + epochs=epochs, + validation_data=(x_val, y_val), + verbose=2, callbacks=callbacks) +model.save('/valohai/outputs/sfnet-lstm-fasttext_cc300-epochs{}.h5'.format(epochs)) + +# Inference +scores = model.evaluate(x_test, y_test, verbose=2) +print("Test set %s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) + + +predictions = model.predict(x_test) + +cm = confusion_matrix(np.argmax(y_test, axis=1), np.argmax(predictions, axis=1), + labels=list(range(9))) +cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] + +print('Classification accuracy for each newsgroup:') +print() +labels = [l[0] for l in sorted(labels_index.items(), key=lambda x: x[1])] +for i, j in enumerate(cm.diagonal()/cm.sum(axis=1)): + print("%s: %.4f" % (labels[i].ljust(26), j)) +print() + +print('Confusion matrix (rows: true newsgroup; columns: predicted newsgroup):') +print() +np.set_printoptions(linewidth=9999) +print(cm) +print() diff --git 
a/machine-learning-scripts/valohai/prediction_server.py b/machine-learning-scripts/valohai/prediction_server.py new file mode 100644 index 0000000..e767bcc --- /dev/null +++ b/machine-learning-scripts/valohai/prediction_server.py @@ -0,0 +1,80 @@ +import glob +import json +import os + +os.environ['KERAS_BACKEND'] = 'tensorflow' +from keras.engine.saving import load_model +from PIL import Image +from skimage import transform +from werkzeug.debug import DebuggedApplication +from werkzeug.wrappers import Response, Request +import numpy as np + +""" +Development Usage: + $ python valohai/prediction_server.py + - it assumes that you have *.h5 model file in the current working directory + - then you can POST images to the local URL e.g. with test_prediction_server.py +""" + +model = None + + +def read_image_from_request(request): + # Reads the first file in the request and tries to load it as an image. + if not request.files: + return None + file_key = list(request.files.keys())[0] + file = request.files.get(file_key) + img = Image.open(file.stream) + img.load() + return img + + +def create_response(content, status_code): + return Response(json.dumps(content), status_code, mimetype='application/json') + + +def predict_wsgi(environ, start_response): + request = Request(environ) + image = read_image_from_request(request) + if not image: + result = {'error': 'No images in the request, include sample image in the request.'} + response = create_response(result, 400) + return response(environ, start_response) + + # Pre-processing a single image + # TODO: notice that this is not 100% the same preprocessing than in training, will create some skew + image = np.array(image).astype('float32') / 255 + image = transform.resize(image, (150, 150, 3), mode='constant', anti_aliasing=False) + image = np.expand_dims(image, axis=0) + + # Load model as global object so it stays in the memory making responses fast. + global model + if not model: + # Try to find HDF5 files on the current directory to load as the model. + local_hdf5_files = glob.glob('*.h5') + if not local_hdf5_files: + result = {'error': 'Could not find predictive model to load, contact support.'} + response = create_response(result, 400) + return response(environ, start_response) + model_path = os.path.join(os.getcwd(), local_hdf5_files[0]) + model = load_model(model_path) + + # Give prediction on the image. + predictions = model.predict(image) + prediction = predictions[0] + + # Report results. 
+ cls = 'cat' if prediction < 0.5 else 'dog' + result = {'class': cls, 'value': float(prediction)} + response = create_response(result, 200) + return response(environ, start_response) + + +predict_wsgi = DebuggedApplication(predict_wsgi) + +if __name__ == '__main__': + from werkzeug.serving import run_simple + + run_simple('localhost', 8000, predict_wsgi) diff --git a/machine-learning-scripts/valohai/prediction_server_text.py b/machine-learning-scripts/valohai/prediction_server_text.py new file mode 100644 index 0000000..9bdcae5 --- /dev/null +++ b/machine-learning-scripts/valohai/prediction_server_text.py @@ -0,0 +1,111 @@ +import glob +import json +import os + +os.environ['KERAS_BACKEND'] = 'tensorflow' +from keras.preprocessing import sequence, text +from keras.engine.saving import load_model +from werkzeug.debug import DebuggedApplication +from werkzeug.wrappers import Response, Request +import gzip +import pickle + +""" +Development Usage: + $ python valohai/prediction_server_text.py + - it assumes that you have *.h5 model file in the current working directory + - submit texts with GET parameter text to the local URL e.g. with test_prediction_server.py +""" + +MAX_SEQUENCE_LENGTH = 1000 + +model = None +tokenizer = None + +groups = { + 'atk': 0, + 'harrastus': 1, + 'keskustelu': 2, + 'misc': 3, + 'tiede': 4, + 'tietoliikenne': 5, + 'tori': 6, + 'urheilu': 7, + 'viestinta': 8 +} + + +def create_response(content, status_code): + return Response(json.dumps(content), status_code, mimetype='application/json') + + +def predict_wsgi(environ, start_response): + global model, tokenizer + + request = Request(environ) + get_text = request.form.get('text') + if get_text is None: + get_text = request.args.get('text') + if get_text is None: + result = {'error': 'No text given in the request.'} + response = create_response(result, 400) + return response(environ, start_response) + + texts = [get_text] + + # Load model as global object so it stays in the memory making responses fast. + if model is None: + # Try to find HDF5 files on the current directory to load as the model. + local_hdf5_files = glob.glob('*.h5') + if not local_hdf5_files: + result = {'error': 'Could not find predictive model to load, contact support.'} + response = create_response(result, 400) + return response(environ, start_response) + model_path = os.path.join(os.getcwd(), local_hdf5_files[0]) + model = load_model(model_path) + + if tokenizer is None: + # with gzip.open('tokenizer_sfnet.json.gz', 'rt', encoding='utf-8') as f: + # tokenizer = text.text.tokenizer_from_json(f.read()) + local_pkl_files = glob.glob('tokenizer*.pkl') + if not local_pkl_files: + result = {'error': 'Could not find pickled Tokenizer to load, contact support.'} + response = create_response(result, 400) + return response(environ, start_response) + pkl_path = os.path.join(os.getcwd(), local_pkl_files[0]) + with open(pkl_path, 'rb') as f: + tokenizer = pickle.load(f) + + # print(texts) + sequences = tokenizer.texts_to_sequences(texts) + # print(sequences) + data = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH) + + # Give prediction + predictions = model.predict(data) + prediction = predictions[0] + + # Report results. + result = {} + result['predict'] = {g: float(prediction[i]) for g, i in groups.items()} + + # Get details about the deployed model if possible. 
+    metadata_path = 'valohai-metadata.json'
+    if os.path.isfile(metadata_path):
+        with open(metadata_path) as f:
+            try:
+                deployment_metadata = json.load(f)
+                result['deployment'] = deployment_metadata
+            except json.JSONDecodeError:
+                # Could not read the deployment metadata, ignore it
+                pass
+
+    response = create_response(result, 200)
+    return response(environ, start_response)
+
+
+predict_wsgi = DebuggedApplication(predict_wsgi)
+
+if __name__ == '__main__':
+    from werkzeug.serving import run_simple
+    run_simple('localhost', 8000, predict_wsgi)
diff --git a/machine-learning-scripts/valohai/pytorch_dvc_cnn.py b/machine-learning-scripts/valohai/pytorch_dvc_cnn.py
new file mode 100644
index 0000000..ce6e681
--- /dev/null
+++ b/machine-learning-scripts/valohai/pytorch_dvc_cnn.py
@@ -0,0 +1,171 @@
+# coding: utf-8
+
+# Dogs-vs-cats classification with CNNs
+#
+# In this script, we'll train a convolutional neural network (CNN,
+# ConvNet) to classify images of dogs from images of cats using
+# PyTorch. This script is largely based on the blog post [Building
+# powerful image classification models using very little
+# data](https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html)
+# by François Chollet.
+#
+# **Note that using a GPU with this script is highly recommended.**
+
+import torch
+from torch.utils.data import DataLoader
+from torchvision import datasets, transforms
+from distutils.version import LooseVersion as LV
+import os
+
+torch.manual_seed(42)
+if torch.cuda.is_available():
+    device = torch.device('cuda')
+else:
+    device = torch.device('cpu')
+print('Using PyTorch version:', torch.__version__, ' Device:', device)
+assert(LV(torch.__version__) >= LV("1.0.0"))
+
+datapath = None
+subpath = 'dogs-vs-cats/train-2000'
+
+slurm_job_id = os.environ.get('SLURM_JOB_ID')
+if slurm_job_id is not None:
+    datapath = os.path.join(os.environ.get('TMPDIR'), os.environ.get('SLURM_JOB_ID'),
+                            subpath)
+if datapath is None or not os.path.isdir(datapath):
+    datapath = '/wrk/makoskel/' + subpath
+if not os.path.isdir(datapath):
+    datapath = '/media/data/' + subpath
+if not os.path.isdir(datapath):
+    datapath = '/valohai/inputs/dataset/' + subpath
+print('Reading data from path:', datapath)
+
+(nimages_train, nimages_validation, nimages_test) = (2000, 1000, 22000)
+
+
+def get_tensorboard(log_name):
+    return None
+    # try:
+    #     import tensorboardX
+    #     import os
+    #     import datetime
+    #     logdir = os.path.join(os.getcwd(), "logs",
+    #                           "dvc-" + log_name + "-" +
+    #                           datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
+    #     print('Logging TensorBoard to:', logdir)
+    #     os.makedirs(logdir)
+    #     return tensorboardX.SummaryWriter(logdir)
+    # except ImportError:
+    #     return None
+
+
+def train(model, loader, criterion, optimizer, epoch, log=None):
+    # Set model to training mode
+    model.train()
+    epoch_loss = 0.
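+    # Running sum of per-batch losses; divided by the dataset size at the end
+    # of the epoch to report an average training loss.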
+
+    # Loop over each batch from the training set
+    for batch_idx, (data, target) in enumerate(loader):
+        # Copy data to GPU if needed
+        data = data.to(device)
+        target = target.to(device)
+
+        # Zero gradient buffers
+        optimizer.zero_grad()
+
+        # Pass data through the network
+        output = model(data)
+        output = torch.squeeze(output)
+
+        # Calculate loss
+        loss = criterion(output, target.to(torch.float32))
+        epoch_loss += loss.data.item()
+
+        # Backpropagate
+        loss.backward()
+
+        # Update weights
+        optimizer.step()
+
+    epoch_loss /= len(loader.dataset)
+    print('Train Epoch: {}, Loss: {:.4f}'.format(epoch, epoch_loss))
+
+    if log is not None:
+        log.add_scalar('loss', epoch_loss, epoch-1)
+
+
+def evaluate(model, loader, criterion=None, epoch=None, log=None):
+    model.eval()
+    loss, correct = 0, 0
+    for data, target in loader:
+        data = data.to(device)
+        target = target.to(device)
+
+        output = torch.squeeze(model(data))
+
+        if criterion is not None:
+            loss += criterion(output, target.to(torch.float32)).data.item()
+
+        pred = output > 0.5
+        pred = pred.to(torch.int64)
+        correct += pred.eq(target.data).cpu().sum()
+
+    if criterion is not None:
+        loss /= len(loader.dataset)
+
+    accuracy = 100. * correct.to(torch.float32) / len(loader.dataset)
+
+    print('Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
+        loss, correct, len(loader.dataset), accuracy))
+
+    if log is not None and epoch is not None:
+        log.add_scalar('val_loss', loss, epoch-1)
+        log.add_scalar('val_acc', accuracy, epoch-1)
+
+
+input_image_size = (150, 150)
+
+data_transform = transforms.Compose([
+        transforms.Resize(input_image_size),
+        transforms.RandomAffine(degrees=0, translate=None,
+                                scale=(0.8, 1.2), shear=0.2),
+        transforms.RandomHorizontalFlip(),
+        transforms.ToTensor()
+    ])
+
+noop_transform = transforms.Compose([
+        transforms.Resize(input_image_size),
+        transforms.ToTensor()
+    ])
+
+
+def get_train_loader(batch_size=25):
+    print('Train: ', end="")
+    train_dataset = datasets.ImageFolder(root=datapath+'/train',
+                                         transform=data_transform)
+    train_loader = DataLoader(train_dataset, batch_size=batch_size,
+                              shuffle=True, num_workers=4)
+    print('Found', len(train_dataset), 'images belonging to',
+          len(train_dataset.classes), 'classes')
+    return train_loader
+
+
+def get_validation_loader(batch_size=25):
+    print('Validation: ', end="")
+    validation_dataset = datasets.ImageFolder(root=datapath+'/validation',
+                                              transform=noop_transform)
+    validation_loader = DataLoader(validation_dataset, batch_size=batch_size,
+                                   shuffle=False, num_workers=4)
+    print('Found', len(validation_dataset), 'images belonging to',
+          len(validation_dataset.classes), 'classes')
+    return validation_loader
+
+
+def get_test_loader(batch_size=25):
+    print('Test: ', end="")
+    test_dataset = datasets.ImageFolder(root=datapath+'/test',
+                                        transform=noop_transform)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size,
+                             shuffle=False, num_workers=4)
+    print('Found', len(test_dataset), 'images belonging to',
+          len(test_dataset.classes), 'classes')
+    return test_loader
diff --git a/machine-learning-scripts/valohai/pytorch_dvc_cnn_simple.py b/machine-learning-scripts/valohai/pytorch_dvc_cnn_simple.py
new file mode 100644
index 0000000..b04fc83
--- /dev/null
+++ b/machine-learning-scripts/valohai/pytorch_dvc_cnn_simple.py
@@ -0,0 +1,99 @@
+# coding: utf-8
+
+# Dogs-vs-cats classification with CNNs
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from datetime import datetime
+
+from pytorch_dvc_cnn import get_train_loader, get_validation_loader, get_test_loader
+from pytorch_dvc_cnn import device, train, evaluate, get_tensorboard
+
+model_file = '/valohai/outputs/dvc_simple_cnn.pt'
+
+
+# Option 1: Train a small CNN from scratch
+
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.conv1 = nn.Conv2d(3, 32, (3, 3))
+        self.pool1 = nn.MaxPool2d((2, 2))
+        self.conv2 = nn.Conv2d(32, 32, (3, 3))
+        self.pool2 = nn.MaxPool2d((2, 2))
+        self.conv3 = nn.Conv2d(32, 64, (3, 3))
+        self.pool3 = nn.MaxPool2d((2, 2))
+        # After three 3x3 convolutions (no padding) and three 2x2 max-poolings,
+        # a 150x150 input is reduced to 17x17 spatial size with 64 channels.
+        self.fc1 = nn.Linear(17*17*64, 64)
+        self.fc1_drop = nn.Dropout(0.5)
+        self.fc2 = nn.Linear(64, 1)
+
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = self.pool1(x)
+        x = F.relu(self.conv2(x))
+        x = self.pool2(x)
+        x = F.relu(self.conv3(x))
+        x = self.pool3(x)
+
+        # "flatten" 2D to 1D
+        x = x.view(-1, 17*17*64)
+        x = F.relu(self.fc1(x))
+        x = self.fc1_drop(x)
+        return torch.sigmoid(self.fc2(x))
+
+
+def train_main():
+    model = Net().to(device)
+    optimizer = optim.SGD(model.parameters(), lr=0.05)
+    criterion = nn.BCELoss()
+
+    print(model)
+
+    batch_size = 25
+    train_loader = get_train_loader(batch_size)
+    validation_loader = get_validation_loader(batch_size)
+
+    log = get_tensorboard('simple')
+    epochs = 50
+
+    start_time = datetime.now()
+    for epoch in range(1, epochs + 1):
+        train(model, train_loader, criterion, optimizer, epoch, log)
+
+        with torch.no_grad():
+            print('\nValidation:')
+            evaluate(model, validation_loader, criterion, epoch, log)
+
+    end_time = datetime.now()
+    print('Total training time: {}.'.format(end_time - start_time))
+
+    torch.save(model.state_dict(), model_file)
+    print('Wrote model to', model_file)
+
+
+def test_main():
+    model = Net()
+    model.load_state_dict(torch.load(model_file))
+    model.to(device)
+
+    test_loader = get_test_loader(25)
+
+    print('=========')
+    print('Test set:')
+    with torch.no_grad():
+        evaluate(model, test_loader)
+
+
+if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--test', action='store_true')
+    args = parser.parse_args()
+
+    if args.test:
+        test_main()
+    else:
+        train_main()
diff --git a/machine-learning-scripts/valohai/test_prediction_server.py b/machine-learning-scripts/valohai/test_prediction_server.py
new file mode 100644
index 0000000..6e1487f
--- /dev/null
+++ b/machine-learning-scripts/valohai/test_prediction_server.py
@@ -0,0 +1,38 @@
+import os
+import argparse
+
+import requests
+
+"""
+Simple test script that sends one or more image files to the local prediction
+server endpoint and prints the response status code and content.
+
+If developing the prediction server, remember to start it first.
+
+Usage:
+    $ python valohai/test_prediction_server.py inputs/cat.jpg,inputs/dog.jpg
+"""
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('image_path', type=str)
+    args = parser.parse_args()
+    image_path = args.image_path
+
+    # Support multiple files using a comma separator
+    if ',' in image_path:
+        parts = image_path.split(',')
+        full_image_paths = [path for path in parts if len(path) > 0]
+    else:
+        full_image_paths = [os.path.join(os.getcwd(), args.image_path)]
+
+    for fip in full_image_paths:
+        if not os.path.isfile(fip):
+            print(f'Could not find a file to send at {fip}')
+            exit(1)
+
+    for fip in full_image_paths:
+        files = {'media': open(fip, 'rb')}
+        response = requests.post('http://localhost:8000', files=files)
+        print(f'Target: {fip}')
+        print(f'Result: {response.status_code} => {response.content}')
diff --git a/machine-learning-scripts/valohai/test_prediction_server_text.py b/machine-learning-scripts/valohai/test_prediction_server_text.py
new file mode 100644
index 0000000..7181656
--- /dev/null
+++ b/machine-learning-scripts/valohai/test_prediction_server_text.py
@@ -0,0 +1,26 @@
+import os
+import argparse
+
+import requests
+
+"""
+Simple test script that sends the given text to the prediction server endpoint
+and prints the response status code and content.
+
+If testing a locally running prediction server, remember to start it first and
+switch target_url to the commented-out localhost address.
+
+Usage:
+    $ python valohai/test_prediction_server_text.py "Hei maailma"
+"""
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('text', type=str)
+    args = parser.parse_args()
+
+    target_url = "https://valohai.cloud/msjoberg/pdl-test/sfnet/current/predict-nyyssi"
+    # target_url = "http://localhost:8000"
+
+    params = {'text': args.text}
+    response = requests.get(target_url, params=params)
+    print(f'Result: {response.status_code} => {response.content}')
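For reference, the JSON returned by prediction_server_text.py maps each SFNet group name to a float score under the 'predict' key (plus optional 'deployment' metadata). The snippet below is a minimal sketch, not part of the patch, of how a client might pick the most likely group from that response; it assumes the text prediction server is running locally on port 8000, as started by `python valohai/prediction_server_text.py`.

import requests

# Query the locally running text prediction server (see prediction_server_text.py).
resp = requests.get('http://localhost:8000', params={'text': 'Hei maailma'})
resp.raise_for_status()

result = resp.json()
scores = result['predict']                 # e.g. {'atk': 0.01, ..., 'urheilu': 0.85, ...}
best_group = max(scores, key=scores.get)   # group with the highest predicted score
print('Most likely group:', best_group, 'score:', scores[best_group])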