Commit

Add files via upload
exajobs authored Dec 16, 2021
1 parent 1522962 commit df45607
Showing 17 changed files with 2,122 additions and 0 deletions.
21 changes: 21 additions & 0 deletions machine-learning-scripts/LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2019 CSC - IT Center for Science Ltd.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
3 changes: 3 additions & 0 deletions machine-learning-scripts/README.md
@@ -0,0 +1,3 @@
# machine-learning-scripts

This repository is a miscellaneous collection of scripts and tools related to machine learning. It currently consists of Jupyter notebooks in the `notebooks` subdirectory and example SLURM scripts in `slurm`. Materials related to specific courses can be found in the `courses` subdirectory.
5 changes: 5 additions & 0 deletions machine-learning-scripts/requirements.txt
@@ -0,0 +1,5 @@
# dependencies for Valohai inference deployment
pillow>=6.2.0
requests==2.21.0
scikit-image==0.14.2
werkzeug>=0.15.3
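The werkzeug pin suggests these packages back a small WSGI inference endpoint. A minimal sketch of such an endpoint, assuming nothing beyond the pinned dependencies (the route and handler below are illustrative, not the deployment code from this commit):

from werkzeug.wrappers import Request, Response
from werkzeug.serving import run_simple

@Request.application
def application(request):
    # A real deployment would decode the uploaded image and run the model;
    # this stub only acknowledges the payload.
    return Response('received %d bytes\n' % len(request.get_data()),
                    mimetype='text/plain')

if __name__ == '__main__':
    run_simple('localhost', 8000, application)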
4 changes: 4 additions & 0 deletions machine-learning-scripts/valohai/dvc_urls.txt
@@ -0,0 +1,4 @@
https://www.catster.com/wp-content/uploads/2017/08/A-fluffy-cat-looking-funny-surprised-or-concerned.jpg
https://i.ytimg.com/vi/lrvqjdMcjjQ/hqdefault.jpg
https://dynaimage.cdn.cnn.com/cnn/w_768,h_1024,c_scale/https%3A%2F%2Fdynaimage.cdn.cnn.com%2Fcnn%2Fx_1229%2Cy_0%2Cw_2712%2Ch_3616%2Cc_crop%2Fhttps%253A%252F%252Fstamp.static.cnn.io%252F5b7ac48b4db3d70020c01c13%252Fshutterstock_1081879181.jpg
https://static-cdn.jtvnw.net/jtv_user_pictures/dogdog-profile_image-5550ade194780dfc-300x300.jpeg
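These appear to be sample inputs for testing the deployed model. A quick sketch of fetching one of them, assuming only the `requests` and `pillow` packages pinned in requirements.txt:

import io

import requests
from PIL import Image

# Illustrative: download the second URL above and inspect its dimensions.
url = 'https://i.ytimg.com/vi/lrvqjdMcjjQ/hqdefault.jpg'
img = Image.open(io.BytesIO(requests.get(url, timeout=10).content))
print(img.size)  # YouTube 'hqdefault' thumbnails are typically (480, 360)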
228 changes: 228 additions & 0 deletions machine-learning-scripts/valohai/keras-20ng-cnn.py
@@ -0,0 +1,228 @@

# coding: utf-8

# # 20 Newsgroups text classification with pre-trained word embeddings
#
# In this script, we'll use pre-trained [GloVe word embeddings]
# (http://nlp.stanford.edu/projects/glove/) for text classification
# using Keras (version $\ge$ 2 is required). This script is largely
# based on the blog post [Using pre-trained word embeddings in a Keras
# model]
# (https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html)
# by François Chollet.
#
# **Note that using a GPU with this script is highly recommended.**
#
# First, the needed imports. Keras tells us which backend (Theano,
# Tensorflow, CNTK) it will be using.

from keras.preprocessing import sequence, text
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers import LSTM, CuDNNLSTM
from keras.utils import to_categorical
from keras.callbacks import LambdaCallback

from distutils.version import LooseVersion as LV
from keras import __version__
from keras import backend as K

from sklearn.model_selection import train_test_split

import argparse
import json

import os
import sys

import numpy as np

print('Using Keras version:', __version__, 'backend:', K.backend())
assert(LV(__version__) >= LV("2.0.0"))

def main(settings):

    # ## GloVe word embeddings
    #
    # Let's begin by loading a datafile containing pre-trained word
    # embeddings. The datafile contains 100-dimensional embeddings for
    # 400,000 English words.

    GLOVE_DIR = "/valohai/inputs/dataset/"

    print('Indexing word vectors.')

    embeddings_index = {}
    with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    print('Found %s word vectors.' % len(embeddings_index))
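    # Each entry is a 100-dimensional float32 vector, e.g.
    # embeddings_index['cat'].shape == (100,); words that occur in
    # similar contexts end up with nearby vectors.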

    # ## 20 Newsgroups data set
    #
    # Next we'll load the [20 Newsgroups]
    # (http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html)
    # data set.
    #
    # The dataset contains 20000 messages collected from 20 different
    # Usenet newsgroups (1000 messages from each group):
    #
    # alt.atheism           | soc.religion.christian   | comp.windows.x     | sci.crypt
    # talk.politics.guns    | comp.sys.ibm.pc.hardware | rec.autos          | sci.electronics
    # talk.politics.mideast | comp.graphics            | rec.motorcycles    | sci.space
    # talk.politics.misc    | comp.os.ms-windows.misc  | rec.sport.baseball | sci.med
    # talk.religion.misc    | comp.sys.mac.hardware    | rec.sport.hockey   | misc.forsale

    TEXT_DATA_DIR = "/valohai/inputs/dataset/20_newsgroup"

    print('Processing text dataset')

    texts = []         # list of text samples
    labels_index = {}  # dictionary mapping label name to numeric id
    labels = []        # list of label ids
    for name in sorted(os.listdir(TEXT_DATA_DIR)):
        path = os.path.join(TEXT_DATA_DIR, name)
        if os.path.isdir(path):
            label_id = len(labels_index)
            labels_index[name] = label_id
            for fname in sorted(os.listdir(path)):
                if fname.isdigit():
                    fpath = os.path.join(path, fname)
                    args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}
                    with open(fpath, **args) as f:
                        t = f.read()
                        i = t.find('\n\n')  # skip header
                        if i > 0:
                            t = t[i:]
                        texts.append(t)
                        labels.append(label_id)

    print('Found %s texts.' % len(texts))

    # Vectorize the text samples into a 2D integer tensor.

    MAX_NUM_WORDS = 10000
    MAX_SEQUENCE_LENGTH = 1000

    tokenizer = text.Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)
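    # pad_sequences pads and truncates at the front by default, so each
    # row of data is exactly MAX_SEQUENCE_LENGTH word ids and data has
    # shape (num_texts, 1000); to_categorical yields one-hot labels of
    # shape (num_texts, 20).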

    # Split the data into a training set and a validation set

    VALIDATION_SET, TEST_SET = 1000, 4000

    x_train, x_test, y_train, y_test = train_test_split(data, labels,
                                                        test_size=TEST_SET,
                                                        shuffle=True,
                                                        random_state=42)

    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,
                                                      test_size=VALIDATION_SET,
                                                      shuffle=False)

    print('Shape of training data tensor:', x_train.shape)
    print('Shape of training label tensor:', y_train.shape)
    print('Shape of validation data tensor:', x_val.shape)
    print('Shape of validation label tensor:', y_val.shape)
    print('Shape of test data tensor:', x_test.shape)
    print('Shape of test label tensor:', y_test.shape)
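    # Since test_size is given as an integer, train_test_split holds out
    # exactly 4000 messages for testing, then 1000 of the remainder for
    # validation; the remaining ~15000 messages are used for training.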

    # Prepare the embedding matrix:

    print('Preparing embedding matrix.')

    num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
    embedding_dim = 100

    embedding_matrix = np.zeros((num_words, embedding_dim))
    for word, i in word_index.items():
        if i >= MAX_NUM_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    print('Shape of embedding matrix:', embedding_matrix.shape)
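    # Row i of embedding_matrix now holds the GloVe vector for the word
    # with Tokenizer index i. Row 0 stays all-zeros (the Tokenizer numbers
    # words from 1), as do rows for words missing from GloVe.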

    # ### Initialization

    print('Build model...')
    model = Sequential()

    model.add(Embedding(num_words,
                        embedding_dim,
                        weights=[embedding_matrix],
                        input_length=MAX_SEQUENCE_LENGTH,
                        trainable=False))
    #model.add(Dropout(0.2))

    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())

    model.add(Dense(128, activation='relu'))
    model.add(Dense(20, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    print(model.summary())
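    # The frozen Embedding layer maps each 1000-token message to a
    # 1000x100 tensor. With 'valid' convolutions (kernel size 5) and
    # pooling (size 5), the time dimension shrinks
    # 1000 -> 996 -> 199 -> 195 -> 39 -> 35, and GlobalMaxPooling1D then
    # reduces it to a single 128-dimensional feature vector.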

    # ### Learning

    epochs = settings.epochs
    batch_size = 128

    json_logging_callback = LambdaCallback(
        on_epoch_end=lambda epoch, logs: print(json.dumps({
            "epoch": epoch,
            "loss": logs["loss"],
            "acc": logs["acc"],
            "val_loss": logs["val_loss"],
            "val_acc": logs["val_acc"],
        })),
    )

    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_data=(x_val, y_val),
                        verbose=2,
                        callbacks=[json_logging_callback])

    model.save('/valohai/outputs/20ng-cnn.h5')
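    # The JSON lines printed by the callback are what Valohai collects as
    # execution metadata for its metrics view. The saved model can later
    # be reloaded with keras.models.load_model, e.g.:
    #   from keras.models import load_model
    #   model = load_model('/valohai/outputs/20ng-cnn.h5')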

    # ### Inference

    if settings.inference:
        print('Evaluating model...')
        scores = model.evaluate(x_test, y_test, verbose=2)
        print("Test set %s: %.2f%%" % (model.metrics_names[1], scores[1]*100))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--inference', type=int, default=1)
    settings = parser.parse_args()
    main(settings)