Training word2vec with gensim

This demo uses only part of the data. Word vectors pre-trained on the full dataset can be downloaded here:

Link: https://pan.baidu.com/s/1ewlck3zwXVQuAzraZ26Euw (extraction code: qbpr)

An introduction to word2vec

word2vec was first proposed by Tomas Mikolov and colleagues at Google; for details, see the two original papers:

  • Efficient Estimation of Word Representations in Vector Space
  • Contribution: builds on prior work to propose a leaner language-model framework and uses it to produce word vectors; that framework is word2vec
  • Distributed Representations of Words and Phrases and their Compositionality
  • Contribution: focuses on two tricks for training word2vec: hierarchical softmax and negative sampling (a gensim sketch follows below)

Pros: these are the papers that launched word2vec, and both are worth reading. Cons: they show the trees but neither the forest nor the leaves, so the essentials are hard to grasp from them alone. Here the "forest" is the theoretical basis of word2vec, namely a language model expressed as a neural network; the "leaves" are the concrete network architecture, the theoretical derivations, the implementation details of hierarchical softmax, and so on.

For more detail, see this article: https://zhuanlan.zhihu.com/p/26306795
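Both tricks are exposed directly as parameters of gensim's Word2Vec. A minimal sketch on a toy corpus (assuming gensim 3.x, where the dimensionality parameter is size; on gensim >= 4.0 it is vector_size):

from gensim.models.word2vec import Word2Vec

# toy corpus: a list of tokenized sentences
sentences = [['hello', 'world'], ['machine', 'learning', 'is', 'fun']]

# hierarchical softmax: hs=1 (negative=0 turns off negative sampling)
model_hs = Word2Vec(sentences, size=100, min_count=1, hs=1, negative=0)

# negative sampling (the default): hs=0, with `negative` noise words drawn per positive pair
model_ns = Word2Vec(sentences, size=100, min_count=1, hs=0, negative=5)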

Setting random seeds and torch configuration

import logging
import random

import numpy as np
import torch

logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')

# set random seeds for reproducibility
seed = 666
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)  # no-op when CUDA is unavailable
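Seeding alone does not make GPU runs fully deterministic; if that matters, cuDNN's nondeterministic kernel selection can additionally be pinned down. An optional sketch:

# optional: trade some speed for deterministic cuDNN kernels
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False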

Reading the data

# split data into 10 folds
import pandas as pd
data_file = 'datalab/73157/train_set.csv'

Below we split the data into 10 folds. sklearn already ships utilities for this, KFold (K-fold cross-validation) and StratifiedKFold (stratified K-fold cross-validation), but the code below implements the split by hand: each label class is distributed across the 10 folds separately, and the examples inside each fold are then shuffled.
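For comparison, a minimal sketch of the same stratified split done with sklearn's StratifiedKFold (assuming the same tab-separated train_set.csv with 'text' and 'label' columns; sk_fold_data is a name introduced here):

from sklearn.model_selection import StratifiedKFold

f = pd.read_csv(data_file, sep='\t', encoding='UTF-8')
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
sk_fold_data = []
for _, fold_index in skf.split(f['text'], f['label']):
    sk_fold_data.append({'label': f['label'].iloc[fold_index].tolist(),
                         'text': f['text'].iloc[fold_index].tolist()})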

# split into 10 folds
fold_num = 10

The original poster's code contained a subtle indexing bug in how each label's examples are assigned to folds; it is fixed below. Can you spot what was wrong?

# num is the total number of examples to split
def all_data2fold(fold_num, num=10000):
	fold_data = []
	f = pd.read_csv(data_file, sep='\t', encoding='UTF-8')
	texts = f['text'].tolist()[:num]
	labels = f['label'].tolist()[:num]

	total = len(labels)

	index = list(range(total))
	np.random.shuffle(index)

	all_texts = []
	all_labels = []
	for i in index:
		all_texts.append(texts[i])
		all_labels.append(labels[i])

	# note the use of label2id: it maps each label to the list of indices carrying that label
	# these indices are later used to pull out text and label together, keeping them aligned
	label2id = {}
	for i in range(total):
		label = str(all_labels[i])
		if label not in label2id:
			label2id[label] = [i]
		else:
			label2id[label].append(i)

	all_index = [[] for _ in range(fold_num)]
	for label, data in label2id.items():
		# print(label, len(data))
		batch_size = int(len(data) / fold_num)
		other = len(data) - batch_size * fold_num
		for i in range(fold_num):
			# the first `other` folds get one extra example each
			cur_batch_size = batch_size + 1 if i < other else batch_size
			# offset by min(i, other) so folds neither overlap nor skip data
			start = i * batch_size + min(i, other)
			batch_data = [data[start + b] for b in range(cur_batch_size)]
			all_index[i].extend(batch_data)

	batch_size = int(total / fold_num)
	other_texts = []
	other_labels = []
	other_num = 0
	start = 0
	for fold in range(fold_num):
		num = len(all_index[fold])
		texts = [all_texts[i] for i in all_index[fold]]
		labels = [all_labels[i] for i in all_index[fold]]

		if num > batch_size:
			fold_texts = texts[:batch_size]
			other_texts.extend(texts[batch_size:])
			fold_labels = labels[:batch_size]
			other_labels.extend(labels[batch_size:])
			other_num += num - batch_size
		elif num < batch_size:
			end = start + batch_size - num
			fold_texts = texts + other_texts[start: end]
			fold_labels = labels + other_labels[start: end]
			start = end
		else:
			fold_texts = texts
			fold_labels = labels

		assert batch_size == len(fold_labels)

		# shuffle
		# shuffle texts and labels together while keeping them aligned
		index = list(range(batch_size))
		np.random.shuffle(index)

		shuffle_fold_texts = []
		shuffle_fold_labels = []
		for i in index:
			shuffle_fold_texts.append(fold_texts[i])
			shuffle_fold_labels.append(fold_labels[i])

		data = {'label': shuffle_fold_labels, 'text': shuffle_fold_texts}
		fold_data.append(data)

	logging.info("Fold lens %s", str([len(data['label']) for data in fold_data]))

	return fold_data


fold_data = all_data2fold(10)
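As a quick sanity check (not part of the original code), every fold should come out with a near-identical label distribution:

from collections import Counter

for i, fold in enumerate(fold_data):
    logging.info('Fold %d label counts: %s', i, sorted(Counter(fold['label']).items()))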

Build the training data for word2vec

fold_id = 9

# folds 0 .. fold_id-1 are used for training; fold `fold_id` is held out
train_texts = []
for i in range(0, fold_id):
	data = fold_data[i]
	train_texts.extend(data['text'])

logging.info('Total %d docs.' % len(train_texts))
logging.info('Start training...')
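The held-out fold is not used to train the embeddings; it can serve, for example, as a dev set for a downstream classifier (dev_data is a name introduced here):

dev_data = fold_data[fold_id]  # fold 9, untouched by word2vec training
logging.info('Dev set: %d docs.', len(dev_data['text']))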



from gensim.models.word2vec import Word2Vec

num_features = 100    # word vector dimensionality
num_workers = 8       # number of worker threads to run in parallel

# word2vec expects an iterable of tokenized documents (lists of string tokens)
train_texts = list(map(lambda x: x.split(), train_texts))
# note: this targets gensim 3.x; on gensim >= 4.0 `size` was renamed to `vector_size`
model = Word2Vec(train_texts, workers=num_workers, size=num_features)
model.init_sims(replace=True)  # L2-normalize vectors in place (deprecated in gensim >= 4.0)

# save model
model.save("/home/tianchi/myspace/nlptask/word2vec.bin")

# load model
model = Word2Vec.load("/home/tianchi/myspace/nlptask/word2vec.bin")

# convert format
model.wv.save_word2vec_format('/home/tianchi/myspace/nlptask/word2vec.txt', binary=False)
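To consume the exported text-format vectors elsewhere, they can be loaded back as KeyedVectors. A minimal sketch (the token '3750' is only an example and assumes it occurs in the training vocabulary):

from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format('/home/tianchi/myspace/nlptask/word2vec.txt', binary=False)
vector = wv['3750']                     # the 100-dim vector for one token
print(wv.most_similar('3750', topn=5))  # nearest neighbours by cosine similarity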