Showing 17 changed files with 2,122 additions and 0 deletions.
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2019 CSC - IT Center for Science Ltd.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1,3 @@ | ||
# machine-learning-scripts | ||
|
||
This repository is a miscellaneous collection of scripts and tools related to machine learning. Currently consists of Jupyter notebooks in the `notebooks` subdirectory, and example SLURM scripts in `slurm`. Materials related to specific courses can be found in the `courses` subdirectory. |
@@ -0,0 +1,5 @@
# dependencies for Valohai inference deployment
pillow>=6.2.0
requests==2.21.0
scikit-image==0.14.2
werkzeug>=0.15.3
@@ -0,0 +1,4 @@
https://www.catster.com/wp-content/uploads/2017/08/A-fluffy-cat-looking-funny-surprised-or-concerned.jpg
https://i.ytimg.com/vi/lrvqjdMcjjQ/hqdefault.jpg
https://dynaimage.cdn.cnn.com/cnn/w_768,h_1024,c_scale/https%3A%2F%2Fdynaimage.cdn.cnn.com%2Fcnn%2Fx_1229%2Cy_0%2Cw_2712%2Ch_3616%2Cc_crop%2Fhttps%253A%252F%252Fstamp.static.cnn.io%252F5b7ac48b4db3d70020c01c13%252Fshutterstock_1081879181.jpg
https://static-cdn.jtvnw.net/jtv_user_pictures/dogdog-profile_image-5550ade194780dfc-300x300.jpeg
@@ -0,0 +1,228 @@

# coding: utf-8

# # 20 Newsgroups text classification with pre-trained word embeddings
#
# In this script, we'll use pre-trained [GloVe word
# embeddings](http://nlp.stanford.edu/projects/glove/) for text
# classification using Keras (version 2 or later is required). This
# script is largely based on the blog post [Using pre-trained word
# embeddings in a Keras
# model](https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html)
# by François Chollet.
#
# **Note that using a GPU with this script is highly recommended.**
#
# First, the needed imports. Keras tells us which backend (Theano,
# TensorFlow, CNTK) it will be using.

from keras.preprocessing import sequence, text
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers import LSTM, CuDNNLSTM
from keras.utils import to_categorical
from keras.callbacks import LambdaCallback

from distutils.version import LooseVersion as LV
from keras import __version__
from keras import backend as K

from sklearn.model_selection import train_test_split

import argparse
import json

import os
import sys

import numpy as np

print('Using Keras version:', __version__, 'backend:', K.backend())
assert LV(__version__) >= LV("2.0.0")

def main(settings):

    # ## GloVe word embeddings
    #
    # Let's begin by loading a datafile containing pre-trained word
    # embeddings. The datafile contains 100-dimensional embeddings for
    # 400,000 English words.

    GLOVE_DIR = "/valohai/inputs/dataset/"

    print('Indexing word vectors.')

    embeddings_index = {}
    with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    print('Found %s word vectors.' % len(embeddings_index))

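    # Each line of glove.6B.100d.txt is whitespace-separated: a token
    # followed by the 100 components of its vector, e.g. (values
    # illustrative):
    #
    #   the -0.038 -0.245 0.728 ... (100 floats)
    #
    # so embeddings_index['the'] is a 100-element float32 array.
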
    # ## 20 Newsgroups data set
    #
    # Next we'll load the [20
    # Newsgroups](http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html)
    # data set.
    #
    # The dataset contains 20,000 messages collected from 20 different
    # Usenet newsgroups (1,000 messages from each group):
    #
    # alt.atheism           | soc.religion.christian   | comp.windows.x     | sci.crypt
    # talk.politics.guns    | comp.sys.ibm.pc.hardware | rec.autos          | sci.electronics
    # talk.politics.mideast | comp.graphics            | rec.motorcycles    | sci.space
    # talk.politics.misc    | comp.os.ms-windows.misc  | rec.sport.baseball | sci.med
    # talk.religion.misc    | comp.sys.mac.hardware    | rec.sport.hockey   | misc.forsale

    TEXT_DATA_DIR = "/valohai/inputs/dataset/20_newsgroup"

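    # The loop below assumes the standard on-disk layout: one
    # subdirectory per newsgroup, with each message in a file named by
    # its numeric id, e.g. (names illustrative):
    #
    #   20_newsgroup/sci.space/61167
    #   20_newsgroup/rec.autos/103117
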
    print('Processing text dataset')

    texts = []         # list of text samples
    labels_index = {}  # dictionary mapping label name to numeric id
    labels = []        # list of label ids
    for name in sorted(os.listdir(TEXT_DATA_DIR)):
        path = os.path.join(TEXT_DATA_DIR, name)
        if os.path.isdir(path):
            label_id = len(labels_index)
            labels_index[name] = label_id
            for fname in sorted(os.listdir(path)):
                if fname.isdigit():
                    fpath = os.path.join(path, fname)
                    args = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}
                    with open(fpath, **args) as f:
                        t = f.read()
                        i = t.find('\n\n')  # skip the message header
                        if i > 0:
                            t = t[i:]
                    texts.append(t)
                    labels.append(label_id)

    print('Found %s texts.' % len(texts))

    # Vectorize the text samples into a 2D integer tensor.

    MAX_NUM_WORDS = 10000
    MAX_SEQUENCE_LENGTH = 1000

    tokenizer = text.Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
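    # Note: pad_sequences() pads and truncates at the start of each
    # sequence by default, so every row of `data` is exactly
    # MAX_SEQUENCE_LENGTH entries long.
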
    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    # Split the data into training, validation, and test sets: first
    # set aside 4,000 messages for testing, then 1,000 of the remainder
    # for validation.

    VALIDATION_SET, TEST_SET = 1000, 4000

    x_train, x_test, y_train, y_test = train_test_split(data, labels,
                                                        test_size=TEST_SET,
                                                        shuffle=True,
                                                        random_state=42)

    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,
                                                      test_size=VALIDATION_SET,
                                                      shuffle=False)

    print('Shape of training data tensor:', x_train.shape)
    print('Shape of training label tensor:', y_train.shape)
    print('Shape of validation data tensor:', x_val.shape)
    print('Shape of validation label tensor:', y_val.shape)
    print('Shape of test data tensor:', x_test.shape)
    print('Shape of test label tensor:', y_test.shape)

    # Prepare the embedding matrix:

    print('Preparing embedding matrix.')

    num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
    embedding_dim = 100

    embedding_matrix = np.zeros((num_words, embedding_dim))
    for word, i in word_index.items():
        if i >= MAX_NUM_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in the embedding index will be all-zeros
            embedding_matrix[i] = embedding_vector

    print('Shape of embedding matrix:', embedding_matrix.shape)

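    # Row 0 of the matrix also stays all-zeros: Keras' Tokenizer
    # numbers words from 1, reserving index 0 for padding.
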
    # ### Initialization

    print('Build model...')
    model = Sequential()

    model.add(Embedding(num_words,
                        embedding_dim,
                        weights=[embedding_matrix],
                        input_length=MAX_SEQUENCE_LENGTH,
                        trainable=False))
    # model.add(Dropout(0.2))

    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(5))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())

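    # With the default 'valid' padding, the sequence length shrinks as
    # 1000 -> 996 -> 199 -> 195 -> 39 -> 35 through the Conv1D and
    # MaxPooling1D layers; GlobalMaxPooling1D then reduces each sample
    # to a single 128-dimensional feature vector.
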
    model.add(Dense(128, activation='relu'))
    model.add(Dense(20, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    print(model.summary())

    # ### Learning

    epochs = settings.epochs
    batch_size = 128

    # Print one JSON object per epoch so that the metrics are easy to
    # pick up from stdout.
    json_logging_callback = LambdaCallback(
        on_epoch_end=lambda epoch, logs: print(json.dumps({
            "epoch": epoch,
            "loss": logs["loss"],
            "acc": logs["acc"],
            "val_loss": logs["val_loss"],
            "val_acc": logs["val_acc"],
        })),
    )

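    # (The log keys above match Keras 2.0-2.2; from Keras 2.3 onwards
    # the corresponding keys are 'accuracy' and 'val_accuracy'.)
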
    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_data=(x_val, y_val),
                        verbose=2,
                        callbacks=[json_logging_callback])

    model.save('/valohai/outputs/20ng-cnn.h5')

    # ### Inference

    if settings.inference:
        print('Evaluating model...')
        scores = model.evaluate(x_test, y_test, verbose=2)
        print("Test set %s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--inference', type=int, default=1)
    settings = parser.parse_args()
    main(settings)
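
# Example invocation (the script's filename is not shown in this diff;
# 'train.py' is a placeholder):
#
#   python train.py --epochs 20 --inference 1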