Commit

Add files via upload
exajobs authored Dec 16, 2021
1 parent 1522962 commit df45607
Showing 17 changed files with 2,122 additions and 0 deletions.
21 changes: 21 additions & 0 deletions machine-learning-scripts/LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2019 CSC - IT Center for Science Ltd.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
3 changes: 3 additions & 0 deletions machine-learning-scripts/README.md
@@ -0,0 +1,3 @@
# machine-learning-scripts

This repository is a miscellaneous collection of scripts and tools related to machine learning. It currently consists of Jupyter notebooks in the `notebooks` subdirectory and example SLURM scripts in `slurm`. Materials related to specific courses can be found in the `courses` subdirectory.
5 changes: 5 additions & 0 deletions machine-learning-scripts/requirements.txt
@@ -0,0 +1,5 @@
# dependencies for Valohai inference deployment
pillow>=6.2.0
requests==2.21.0
scikit-image==0.14.2
werkzeug>=0.15.3
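The werkzeug pin suggests these packages back a small WSGI inference endpoint. A minimal sketch of such an endpoint, assuming nothing beyond the pinned dependencies (the route and handler below are illustrative, not the deployment code from this commit):

from werkzeug.wrappers import Request, Response
from werkzeug.serving import run_simple

@Request.application
def application(request):
    # A real deployment would decode the uploaded image and run the model;
    # this stub only acknowledges the payload.
    return Response('received %d bytes\n' % len(request.get_data()),
                    mimetype='text/plain')

if __name__ == '__main__':
    run_simple('localhost', 8000, application)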
4 changes: 4 additions & 0 deletions machine-learning-scripts/valohai/dvc_urls.txt
@@ -0,0 +1,4 @@
https://www.catster.com/wp-content/uploads/2017/08/A-fluffy-cat-looking-funny-surprised-or-concerned.jpg
https://i.ytimg.com/vi/lrvqjdMcjjQ/hqdefault.jpg
https://dynaimage.cdn.cnn.com/cnn/w_768,h_1024,c_scale/https%3A%2F%2Fdynaimage.cdn.cnn.com%2Fcnn%2Fx_1229%2Cy_0%2Cw_2712%2Ch_3616%2Cc_crop%2Fhttps%253A%252F%252Fstamp.static.cnn.io%252F5b7ac48b4db3d70020c01c13%252Fshutterstock_1081879181.jpg
https://static-cdn.jtvnw.net/jtv_user_pictures/dogdog-profile_image-5550ade194780dfc-300x300.jpeg
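These appear to be sample inputs for testing the deployed model. A quick sketch of fetching one of them, assuming only the `requests` and `pillow` packages pinned in requirements.txt:

import io

import requests
from PIL import Image

# Illustrative: download the second URL above and inspect its dimensions.
url = 'https://i.ytimg.com/vi/lrvqjdMcjjQ/hqdefault.jpg'
img = Image.open(io.BytesIO(requests.get(url, timeout=10).content))
print(img.size)  # YouTube 'hqdefault' thumbnails are typically (480, 360)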
228 changes: 228 additions & 0 deletions machine-learning-scripts/valohai/keras-20ng-cnn.py
@@ -0,0 +1,228 @@

# coding: utf-8

# # 20 Newsgroups text classification with pre-trained word embeddings
#
# In this script, we'll use pre-trained [GloVe word embeddings]
# (http://nlp.stanford.edu/projects/glove/) for text classification
# using Keras (version $\ge$ 2 is required). This script is largely
# based on the blog post [Using pre-trained word embeddings in a Keras
# model]
# (https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html)
# by François Chollet.
#
# **Note that using a GPU with this script is highly recommended.**
#
# First, the needed imports. Keras tells us which backend (Theano,
# Tensorflow, CNTK) it will be using.

from keras.preprocessing import sequence, text
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers import LSTM, CuDNNLSTM
from keras.utils import to_categorical
from keras.callbacks import LambdaCallback

from distutils.version import LooseVersion as LV
from keras import __version__
from keras import backend as K

from sklearn.model_selection import train_test_split

import argparse
import json

import os
import sys

import numpy as np

print('Using Keras version:', __version__, 'backend:', K.backend())
assert(LV(__version__) >= LV("2.0.0"))

def main(settings):

    # ## GloVe word embeddings
    #
    # Let's begin by loading a datafile containing pre-trained word
    # embeddings. The datafile contains 100-dimensional embeddings for
    # 400,000 English words.

    GLOVE_DIR = "/valohai/inputs/dataset/"

    print('Indexing word vectors.')

    embeddings_index = {}
    with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    print('Found %s word vectors.' % len(embeddings_index))
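    # Each entry is a 100-dimensional float32 vector, e.g.
    # embeddings_index['cat'].shape == (100,); words that occur in
    # similar contexts end up with nearby vectors.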

    # ## 20 Newsgroups data set
    #
    # Next we'll load the [20 Newsgroups]
    # (http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html)
    # data set.
    #
    # The dataset contains 20000 messages collected from 20 different
    # Usenet newsgroups (1000 messages from each group):
    #
    # alt.atheism           | soc.religion.christian   | comp.windows.x     | sci.crypt
    # talk.politics.guns    | comp.sys.ibm.pc.hardware | rec.autos          | sci.electronics
    # talk.politics.mideast | comp.graphics            | rec.motorcycles    | sci.space
    # talk.politics.misc    | comp.os.ms-windows.misc  | rec.sport.baseball | sci.med
    # talk.religion.misc    | comp.sys.mac.hardware    | rec.sport.hockey   | misc.forsale

    TEXT_DATA_DIR = "/valohai/inputs/dataset/20_newsgroup"

    print('Processing text dataset')

    texts = []         # list of text samples
    labels_index = {}  # dictionary mapping label name to numeric id
    labels = []        # list of label ids
    for name in sorted(os.listdir(TEXT_DATA_DIR)):
        path = os.path.join(TEXT_DATA_DIR, name)
        if os.path.isdir(path):
            label_id = len(labels_index)
            labels_index[name] = label_id
            for fname in sorted(os.listdir(path)):
                if fname.isdigit():
                    fpath = os.path.join(path, fname)
                    args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}
                    with open(fpath, **args) as f:
                        t = f.read()
                        i = t.find('\n\n')  # skip header
                        if i > 0:
                            t = t[i:]
                        texts.append(t)
                        labels.append(label_id)

    print('Found %s texts.' % len(texts))

    # Vectorize the text samples into a 2D integer tensor.

    MAX_NUM_WORDS = 10000
    MAX_SEQUENCE_LENGTH = 1000

    tokenizer = text.Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)
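    # pad_sequences pads and truncates at the front by default, so each
    # row of data is exactly MAX_SEQUENCE_LENGTH word ids and data has
    # shape (num_texts, 1000); to_categorical yields one-hot labels of
    # shape (num_texts, 20).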

    # Split the data into a training set and a validation set

    VALIDATION_SET, TEST_SET = 1000, 4000

    x_train, x_test, y_train, y_test = train_test_split(data, labels,
                                                        test_size=TEST_SET,
                                                        shuffle=True,
                                                        random_state=42)

    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,
                                                      test_size=VALIDATION_SET,
                                                      shuffle=False)

    print('Shape of training data tensor:', x_train.shape)
    print('Shape of training label tensor:', y_train.shape)
    print('Shape of validation data tensor:', x_val.shape)
    print('Shape of validation label tensor:', y_val.shape)
    print('Shape of test data tensor:', x_test.shape)
    print('Shape of test label tensor:', y_test.shape)
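    # Since test_size is given as an integer, train_test_split holds out
    # exactly 4000 messages for testing, then 1000 of the remainder for
    # validation; the remaining ~15000 messages are used for training.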

    # Prepare the embedding matrix:

    print('Preparing embedding matrix.')

    num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
    embedding_dim = 100

    embedding_matrix = np.zeros((num_words, embedding_dim))
    for word, i in word_index.items():
        if i >= MAX_NUM_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    print('Shape of embedding matrix:', embedding_matrix.shape)
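    # Row i of embedding_matrix now holds the GloVe vector for the word
    # with Tokenizer index i. Row 0 stays all-zeros (the Tokenizer numbers
    # words from 1), as do rows for words missing from GloVe.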

    # ### Initialization

    print('Build model...')
    model = Sequential()

    model.add(Embedding(num_words,
                        embedding_dim,
                        weights=[embedding_matrix],
                        input_length=MAX_SEQUENCE_LENGTH,
                        trainable=False))
    #model.add(Dropout(0.2))

    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())

    model.add(Dense(128, activation='relu'))
    model.add(Dense(20, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    print(model.summary())
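    # The frozen Embedding layer maps each 1000-token message to a
    # 1000x100 tensor. With 'valid' convolutions (kernel size 5) and
    # pooling (size 5), the time dimension shrinks
    # 1000 -> 996 -> 199 -> 195 -> 39 -> 35, and GlobalMaxPooling1D then
    # reduces it to a single 128-dimensional feature vector.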

    # ### Learning

    epochs = settings.epochs
    batch_size = 128

    json_logging_callback = LambdaCallback(
        on_epoch_end=lambda epoch, logs: print(json.dumps({
            "epoch": epoch,
            "loss": logs["loss"],
            "acc": logs["acc"],
            "val_loss": logs["val_loss"],
            "val_acc": logs["val_acc"],
        })),
    )

    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_data=(x_val, y_val),
                        verbose=2,
                        callbacks=[json_logging_callback])

    model.save('/valohai/outputs/20ng-cnn.h5')
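    # The JSON lines printed by the callback are what Valohai collects as
    # execution metadata for its metrics view. The saved model can later
    # be reloaded with keras.models.load_model, e.g.:
    #   from keras.models import load_model
    #   model = load_model('/valohai/outputs/20ng-cnn.h5')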

    # ### Inference

    if settings.inference:
        print('Evaluating model...')
        scores = model.evaluate(x_test, y_test, verbose=2)
        print("Test set %s: %.2f%%" % (model.metrics_names[1], scores[1]*100))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--inference', type=int, default=1)
    settings = parser.parse_args()
    main(settings)