
Commit a43208e: Merge branch 'master'
PC-2016.3.2 <[email protected]>
cahya-wirawan committed Mar 17, 2017
2 parents: 103a460 + 8a83ffb
Showing 6 changed files with 90 additions and 39 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -21,6 +21,8 @@ optional arguments:
-h, --help show this help message and exit
--embedding_dim EMBEDDING_DIM
Dimensionality of character embedding (default: 128)
--enable_word_embeddings
Enable/disable the word embeddings (default: True)
--filter_sizes FILTER_SIZES
Comma-separated filter sizes (default: '3,4,5')
--num_filters NUM_FILTERS
24 changes: 22 additions & 2 deletions config.yml
@@ -1,4 +1,6 @@
word_embeddings:
# Two word embedding algorithms (word2vec and glove) are supported.
# Set default to an empty string to disable word embeddings
default: word2vec
word2vec:
path: ../../data/GoogleNews-vectors-negative300.bin
@@ -10,6 +12,7 @@ word_embeddings:
length: 400000

datasets:
# Currently supports 3 datasets: mrpolarity, 20newsgroup and localdata
default: 20newsgroup
mrpolarity:
positive_data_file:
@@ -19,7 +22,7 @@ datasets:
path: "data/rt-polaritydata/rt-polarity.neg"
info: "Data source for the negative data"
20newsgroup:
# The dataset includes following newsgroup:
# The dataset includes the following 20 newsgroups:
# alt.atheism, comp.windows.x, rec.sport.hockey, soc.religion.christian
# comp.graphics, misc.forsale, sci.crypt, talk.politics.guns
# comp.os.ms-windows.misc, rec.autos, sci.electronics, talk.politics.mideast
@@ -32,5 +35,22 @@ datasets:
- soc.religion.christian
shuffle: True
random_state: 42

localdata:
# Load text files with categories as subfolder names.
# Individual samples are assumed to be files stored in
# a two-level folder structure such as the following:
# container_folder/
# category_1_folder/
# file_1.txt file_2.txt ... file_42.txt
# category_2_folder/
# file_43.txt file_44.txt ...
#
# As an example, a SentenceCorpus dataset from
# https://archive.ics.uci.edu/ml/datasets/Sentence+Classification
# has been used. The dataset includes the following 3 domains:
# arxiv, jdm and plos
container_path: ../../data/SentenceCorpus
categories:
shuffle: True
random_state: 42

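For reference, a minimal sketch (not part of the commit) of how one could build the folder layout the localdata loader expects; the container name, category names, and sample texts below are invented for illustration:

# Illustration only: create a tiny container/category/file layout matching the
# structure described in the localdata comments above.
import os

samples = {
    "sports": ["the team won the final", "a great match last night"],
    "politics": ["the senate passed the bill", "election results are in"],
}

container = "my_corpus"  # hypothetical container_path to point config.yml at
for category, texts in samples.items():
    category_dir = os.path.join(container, category)
    os.makedirs(category_dir, exist_ok=True)
    for i, text in enumerate(texts):
        with open(os.path.join(category_dir, "file_{}.txt".format(i)), "w") as f:
            f.write(text)
# load_files on "my_corpus" would then see two categories with two documents each.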
27 changes: 22 additions & 5 deletions data_helpers.py
@@ -1,11 +1,8 @@
import numpy as np
import re
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import load_files

subset="train"
categories=['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
shuffle=True
random_state=42

def clean_str(string):
"""
@@ -48,10 +45,13 @@ def batch_iter(data, batch_size, num_epochs, shuffle=True):
yield shuffled_data[start_index:end_index]


def get_datasets_20newsgroup(subset=subset, categories=categories, shuffle=shuffle, random_state=random_state):
def get_datasets_20newsgroup(subset='train', categories=None, shuffle=True, random_state=42):
"""
Retrieve data from 20 newsgroups
:param subset: train, test or all
:param categories: List of newsgroup names
:param shuffle: shuffle the list or not
:param random_state: seed integer to shuffle the dataset
:return: data and labels of the newsgroup
"""
datasets = fetch_20newsgroups(subset=subset, categories=categories, shuffle=shuffle, random_state=random_state)
@@ -77,6 +77,23 @@ def get_datasets_mrpolarity(positive_data_file, negative_data_file):
return datasets


def get_datasets_localdata(container_path=None, categories=None, load_content=True,
encoding='utf-8', shuffle=True, random_state=42):
"""
Load text files with categories as subfolder names.
Individual samples are assumed to be files stored in a two-level folder structure.
:param container_path: The path of the container
:param categories: List of classes to choose; all classes are chosen by default (if empty or omitted)
:param shuffle: shuffle the list or not
:param random_state: seed integer to shuffle the dataset
:return: data and labels of the dataset
"""
datasets = load_files(container_path=container_path, categories=categories,
load_content=load_content, shuffle=shuffle, encoding=encoding,
random_state=random_state)
return datasets


def load_data_labels(datasets):
"""
Load data and labels
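A possible calling pattern for the new helper, sketched as a standalone snippet; the container path reuses the SentenceCorpus example from config.yml, and the way the pieces are combined here is an assumption, not code from the commit:

# Usage sketch (not part of the commit).
import data_helpers

datasets = data_helpers.get_datasets_localdata(container_path="../../data/SentenceCorpus")
# sklearn's load_files returns a Bunch with 'data', 'target' and 'target_names',
# the same fields fetch_20newsgroups provides, so the rest of the pipeline is unchanged.
print(datasets["target_names"])

x_text, y = data_helpers.load_data_labels(datasets)
print(len(x_text), len(y))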
14 changes: 8 additions & 6 deletions eval.py
@@ -54,11 +54,13 @@
y_test = np.argmax(y_test, axis=1)
print("Total number of test examples: {}".format(len(y_test)))
else:
x_raw = ["The number of reported cases of gonorrhea in Colorado increased",
"I am in the market for a 24-bit graphics card for a PC"]
y_test = [2, 1]
# x_raw = ["a masterpiece four years in the making", "everything is off."]
# y_test = [1, 0]
if dataset_name == "mrpolarity":
x_raw = ["a masterpiece four years in the making", "everything is off."]
y_test = [1, 0]
else:
x_raw = ["The number of reported cases of gonorrhea in Colorado increased",
"I am in the market for a 24-bit graphics card for a PC"]
y_test = [2, 1]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
@@ -104,7 +106,7 @@
correct_predictions = float(sum(all_predictions == y_test))
print("Total number of test examples: {}".format(len(y_test)))
print("Accuracy: {:g}".format(correct_predictions/float(len(y_test))))
print(metrics.classification_report(y_test, all_predictions, target_names=datasets.target_names))
print(metrics.classification_report(y_test, all_predictions, target_names=datasets['target_names']))
print(metrics.confusion_matrix(y_test, all_predictions))

# Save the evaluation to a csv
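The switch from datasets.target_names to datasets['target_names'] matters because sklearn's Bunch supports both attribute and key access, while a plain dict (which get_datasets_mrpolarity appears to build) supports only key access, so indexing works for every dataset type. A small sketch of the distinction, with the dict layout assumed for illustration:

from sklearn.utils import Bunch

bunch = Bunch(data=["doc"], target=[0], target_names=["pos"])       # shape returned by load_files / fetch_20newsgroups
plain = {"data": ["doc"], "target": [0], "target_names": ["pos"]}   # assumed shape of the mrpolarity dataset dict

print(bunch["target_names"])   # works
print(bunch.target_names)      # also works on a Bunch
print(plain["target_names"])   # works
# plain.target_names would raise AttributeError, hence the key-style access in eval.py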
4 changes: 2 additions & 2 deletions text_cnn.py
@@ -54,7 +54,7 @@ def __init__(

# Combine all the pooled features
num_filters_total = num_filters * len(filter_sizes)
self.h_pool = tf.concat(3, pooled_outputs)
self.h_pool = tf.concat(pooled_outputs, 3)
self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

# Add dropout
@@ -75,7 +75,7 @@ def __init__(

# Calculate mean cross-entropy loss
with tf.name_scope("loss"):
losses = tf.nn.softmax_cross_entropy_with_logits(self.scores, self.input_y)
losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

# Accuracy
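Both changes in text_cnn.py track the TensorFlow 1.0 API: tf.concat now takes the tensor list first and the axis second, and softmax_cross_entropy_with_logits must be called with the logits and labels keywords. A minimal sketch of the two signatures, assuming a TF 1.x installation (the era of this commit):

import numpy as np
import tensorflow as tf

a = tf.constant(np.ones((2, 3), dtype=np.float32))
b = tf.constant(np.zeros((2, 3), dtype=np.float32))
joined = tf.concat([a, b], 1)   # TF >= 1.0: values first, axis second

logits = tf.constant([[2.0, 0.5], [0.1, 1.5]])
labels = tf.constant([[1.0, 0.0], [0.0, 1.0]])
losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)

with tf.Session() as sess:
    print(sess.run(joined).shape)             # (2, 6)
    print(sess.run(tf.reduce_mean(losses)))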
58 changes: 34 additions & 24 deletions train.py
@@ -10,20 +10,15 @@
from tensorflow.contrib import learn
import yaml

with open("config.yml", 'r') as ymlfile:
cfg = yaml.load(ymlfile)

embedding_name = cfg['word_embeddings']['default']
embedding_dimension = cfg['word_embeddings'][embedding_name]['dimension']
dataset_name = cfg["datasets"]["default"]

# Parameters
# ==================================================

# Data loading params
tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")

# Model Hyperparameters
tf.flags.DEFINE_boolean("enable_word_embeddings", True, "Enable/disable the word embedding (default: True)")
tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
@@ -46,6 +41,15 @@
print("{}={}".format(attr.upper(), value))
print("")

with open("config.yml", 'r') as ymlfile:
cfg = yaml.load(ymlfile)

dataset_name = cfg["datasets"]["default"]
if FLAGS.enable_word_embeddings and cfg['word_embeddings']['default'] is not None:
embedding_name = cfg['word_embeddings']['default']
embedding_dimension = cfg['word_embeddings'][embedding_name]['dimension']
else:
embedding_dimension = FLAGS.embedding_dim

# Data Preparation
# ==================================================
@@ -61,6 +65,11 @@
categories=cfg["datasets"][dataset_name]["categories"],
shuffle=cfg["datasets"][dataset_name]["shuffle"],
random_state=cfg["datasets"][dataset_name]["random_state"])
elif dataset_name == "localdata":
datasets = data_helpers.get_datasets_localdata(container_path=cfg["datasets"][dataset_name]["container_path"],
categories=cfg["datasets"][dataset_name]["categories"],
shuffle=cfg["datasets"][dataset_name]["shuffle"],
random_state=cfg["datasets"][dataset_name]["random_state"])
x_text, y = data_helpers.load_data_labels(datasets)

# Build vocabulary
@@ -148,23 +157,24 @@

# Initialize all variables
sess.run(tf.global_variables_initializer())
vocabulary = vocab_processor.vocabulary_
initW = None
if embedding_name == 'word2vec':
# load embedding vectors from the word2vec
print("Load word2vec file {}".format(cfg['word_embeddings']['word2vec']['path']))
initW = data_helpers.load_embedding_vectors_word2vec(vocabulary,
cfg['word_embeddings']['word2vec']['path'],
cfg['word_embeddings']['word2vec']['binary'])
print("word2vec file has been loaded")
elif embedding_name == 'glove':
# load embedding vectors from the glove
print("Load glove file {}".format(cfg['word_embeddings']['glove']['path']))
initW = data_helpers.load_embedding_vectors_glove(vocabulary,
cfg['word_embeddings']['glove']['path'],
embedding_dimension)
print("glove file has been loaded\n")
sess.run(cnn.W.assign(initW))
if FLAGS.enable_word_embeddings and cfg['word_embeddings']['default'] is not None:
vocabulary = vocab_processor.vocabulary_
initW = None
if embedding_name == 'word2vec':
# load embedding vectors from the word2vec
print("Load word2vec file {}".format(cfg['word_embeddings']['word2vec']['path']))
initW = data_helpers.load_embedding_vectors_word2vec(vocabulary,
cfg['word_embeddings']['word2vec']['path'],
cfg['word_embeddings']['word2vec']['binary'])
print("word2vec file has been loaded")
elif embedding_name == 'glove':
# load embedding vectors from the glove
print("Load glove file {}".format(cfg['word_embeddings']['glove']['path']))
initW = data_helpers.load_embedding_vectors_glove(vocabulary,
cfg['word_embeddings']['glove']['path'],
embedding_dimension)
print("glove file has been loaded\n")
sess.run(cnn.W.assign(initW))

def train_step(x_batch, y_batch):
"""
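The embedding initialisation is now wrapped in the same enable_word_embeddings / config check used earlier, so purely random embeddings are kept when no pre-trained vectors are configured. For orientation, a sketch of what a GloVe loader along the lines of data_helpers.load_embedding_vectors_glove might do; the actual implementation is not shown in this diff, so treat this as an assumption (vocabulary is taken to be a plain word-to-index mapping):

import numpy as np

def load_embedding_vectors_glove_sketch(vocabulary, filename, vector_size):
    # Words absent from the GloVe file keep a small random vector.
    embeddings = np.random.uniform(-0.25, 0.25, (len(vocabulary), vector_size)).astype(np.float32)
    with open(filename, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            word, values = parts[0], parts[1:]
            idx = vocabulary.get(word)
            if idx is not None:
                embeddings[idx] = np.asarray(values, dtype=np.float32)
    return embeddings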
