-
Notifications
You must be signed in to change notification settings - Fork 0
/
word_cloud
57 lines (48 loc) · 1.76 KB
/
word_cloud
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import os
import pandas as pd
from pandas import DataFrame
import re
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Matches every character outside the Chinese range \u4e00-\u9fa5.
# Compiled once at import time instead of on every translate() call.
_NON_CHINESE_RE = re.compile(r"[^\u4e00-\u9fa5]")


def translate(str):
    """Return the input with every non-Chinese character removed.

    Only characters in the range U+4E00..U+9FA5 are kept; whitespace,
    ASCII letters, digits and punctuation are all dropped, so the kept
    characters end up concatenated without separators.

    Args:
        str: The text to filter. The parameter name shadows the builtin
            and is kept only so existing keyword callers keep working.

    Returns:
        A string made up solely of the Chinese characters of the input,
        in their original order (possibly empty).

    Raises:
        AttributeError: If the argument has no ``strip`` method, i.e. it
            is not a string.
    """
    # ``.strip()`` is kept on purpose: it is what makes a non-string
    # argument raise AttributeError, which callers may rely on to skip
    # such values.
    text = str.strip()
    return _NON_CHINESE_RE.sub("", text)
def _load_comment_text(csv_path):
    """Return the cleaned ``content`` column of ``csv_path`` as one string."""
    frame = pd.read_csv(csv_path, engine='python', encoding='utf-8')
    cleaned = []
    for raw in frame['content']:
        try:
            text = translate(raw)
        except AttributeError:
            # translate() raises AttributeError for values without
            # ``.strip()`` (non-string cells); such rows are skipped.
            continue
        else:
            cleaned.append(text)
    # Join the actual text rather than taking ``str()`` of the list, which
    # would feed brackets, quotes and commas of the list repr to the tokenizer.
    return " ".join(cleaned)


def _load_stopwords(stopwords_path):
    """Return wordcloud's STOPWORDS extended with one word per line of the file."""
    stopwords = STOPWORDS.copy()
    with open(stopwords_path, 'r', encoding='utf-8') as handle:
        for line in handle:
            word = line.strip()
            # Blank lines carry no stopword; surrounding whitespace would
            # keep an entry from ever matching a token.
            if word:
                stopwords.add(word)
    return stopwords


def word_cloud(csv_file, stopwords_path, pic_path,
               font_path="/Library/Fonts/悟空大字库.ttf"):
    """Render, display and save a word cloud built from a CSV of comments.

    The ``content`` column of ``<cwd>/<csv_file>.csv`` is filtered through
    ``translate``, segmented with jieba, and drawn with ``WordCloud`` using
    the image at ``pic_path`` both as the mask and as the colour source.
    The figure is shown with matplotlib and then written to
    ``<csv_file>_词云图.png``.

    Args:
        csv_file: CSV base name without the ``.csv`` extension, resolved
            against the current working directory.
        stopwords_path: UTF-8 text file with one stopword per line.
        pic_path: Image file used as mask and colour template.
        font_path: Font file passed to ``WordCloud``. Defaults to the
            path that was previously hard-coded.

    Returns:
        None.
    """
    pic_name = csv_file + "_词云图.png"
    csv_path = os.path.join(os.path.abspath(os.curdir), csv_file + ".csv")

    comment_text = _load_comment_text(csv_path)
    segmented_text = " ".join(jieba.cut(comment_text, cut_all=False))

    background_image = plt.imread(pic_path)
    stopwords = _load_stopwords(stopwords_path)

    wc = WordCloud(width=1024, height=768, background_color='white',
                   mask=background_image, font_path=font_path,
                   stopwords=stopwords, max_font_size=300,
                   random_state=50)
    wc.generate_from_text(segmented_text)

    # Recolour the words using the colours of the mask image.
    wc.recolor(color_func=ImageColorGenerator(background_image))

    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file(pic_name)
# Script entry: build the word cloud for the comment CSV named below.
# This call sits at module top level, so it runs whenever the file is
# executed or imported.
word_cloud(
    csv_file="复仇者联盟",
    stopwords_path="stopwords.txt",
    pic_path="复仇者联盟.jpg",
)