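"""Visualization utilities for classified tweet data.

Given a CSV with a "Classification" column ("pos"/"neg") and a "tweet"
column, this script produces a word cloud, a word-frequency bar chart,
a Scattertext pos-vs-neg explorer, and a pos/neg pie chart.
"""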
import webbrowser
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import scattertext as st
import spacy
from IPython.display import IFrame
from IPython.core.display import display, HTML
from wordcloud import WordCloud
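
# NOTE: nltk.word_tokenize and the English stopword list need the 'punkt'
# and 'stopwords' NLTK data packages; download them once with
# nltk.download('punkt') and nltk.download('stopwords') if they are missing.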
# Widen the notebook container (only has an effect inside Jupyter).
display(HTML("<style>.container { width:98% !important; }</style>"))


class Data_visualization:
    """Builds visualizations from a CSV of classified tweets."""

    def __init__(self, file_name):
        self.str = ""          # all tweet text concatenated into one string
        self.word_array = []   # de-duplicated list of tweet strings
        self.file_name = file_name
    def wordcloud(self):
        """Render a word cloud of the stopword-filtered tweet text."""
        stop_words = nltk.corpus.stopwords.words("english")
        word_tokens = nltk.word_tokenize(self.str)
        # Drop English stopwords.
        word_tokens_filtered = [w for w in word_tokens if w not in stop_words]
        plt.figure(figsize=(12, 12))
        # Join the tokens back into one string; passing str(list) to
        # generate() would leak brackets and quotes into the cloud.
        wordcloud = WordCloud(width=1800, height=1400).generate(" ".join(word_tokens_filtered))
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.savefig("WORD_CLOUD_pos_After_Classification_NY_6.png")
        plt.close()
    def word_count(self):
        """Plot the 50 most frequent non-stopword tokens as a bar chart."""
        stop_words = nltk.corpus.stopwords.words("english")
        word_tokens = nltk.word_tokenize(self.str)
        # Drop English stopwords, then count the remaining tokens.
        word_tokens_filtered = [w for w in word_tokens if w not in stop_words]
        word_dict = dict(Counter(word_tokens_filtered))
        data = pd.DataFrame(list(word_dict.items()), columns=['word', 'count'])
        data.sort_values("count", axis=0, ascending=False, inplace=True, na_position='last')
        # data.to_csv("word_count" + self.file_name)
        fig, ax = plt.subplots(figsize=(30, 30))
        # Plot a horizontal bar graph of the top 50 words.
        data[:50].plot.barh(x='word', y='count', ax=ax, color="purple")
        ax.set_title("Common Words Found in Tweets (Including All Words)")
        plt.savefig("WORD_COUNT_pos_After_Classification_NY_6.png")
        plt.close()
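
    # Note: unlike the other methods, scattertext_function reads a
    # hard-coded CSV rather than self.file_name.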
    def scattertext_function(self):
        """Build a Scattertext pos-vs-neg term explorer and save it as HTML."""
        nlp = spacy.load('en_core_web_sm')
        convention_df = pd.read_csv("After_Classification/After_Classification_NY_6.csv")
        convention_df['parsed'] = convention_df.tweet.apply(nlp)
        # Columns: ['Unnamed: 0', 'Date', 'name', 'tweet', 'death', 'Classification']
        # print("Document Count")
        # print(convention_df.groupby('Classification')['tweet'].count())
        # print("Word Count")
        # print(convention_df.groupby('Classification').apply(lambda x: x.tweet.apply(lambda x: len(x.split())).sum()))
        # Convert the DataFrame into a Scattertext corpus.
        corpus = st.CorpusFromParsedDocuments(convention_df, category_col='Classification', parsed_col='parsed').build()
        html = st.produce_scattertext_explorer(corpus,
                                               category='pos',
                                               category_name='POS',
                                               not_category_name='NEG',
                                               minimum_term_frequency=5,
                                               width_in_pixels=1000,
                                               transform=st.Scalers.log_scale_standardize)
        file_name_1 = 'After_Classification_NY_6.html'
        with open(file_name_1, 'wb') as f:
            f.write(html.encode('utf-8'))
        # IFrame only renders inside a notebook; here it just prints its repr.
        print(IFrame(src=file_name_1, width=1200, height=700))
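
    # The explorer HTML written above can be viewed with open_html() below,
    # which currently points at a different, hard-coded file.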
    def open_html(self):
        """Open a saved Scattertext explorer in the browser."""
        url = "Scattertext_HTML/After_Classification_NY_1.html"
        webbrowser.open(url, new=2)  # new=2: open in a new tab if possible
    def test2(self):
        """Plot a pie chart of the pos/neg split in the classified CSV."""
        df = pd.read_csv(self.file_name)
        pos_count = len(df.loc[df["Classification"] == "pos"])
        # Everything not classified "pos" counts as "neg" (df.size would
        # count cells, not rows, so use len() here).
        neg_count = len(df) - pos_count
        tasks = [pos_count, neg_count]
        my_labels = 'POS', 'NEG'
        plt.pie(tasks, labels=my_labels, autopct='%1.1f%%')
        plt.title('Trump')
        plt.axis('equal')
        plt.savefig("PI_Trump_D12.png")
        # plt.show()
    def start(self):
        self.test2()
        # exit() here stops after the pie chart; remove it to also build the
        # word cloud and word-count chart below.
        exit()
        # self.scattertext_function()
        df = pd.read_csv(self.file_name)
        pos_df = df.loc[df["Classification"] == "pos"]
        text = pos_df['tweet'].tolist()
        for words in text:
            self.word_array.append(str(words))
            self.str = self.str + str(words)
        self.word_array = list(set(self.word_array))
        self.wordcloud()
        self.word_count()
name="After_Classification/After_Classification_Trump_D12.csv"
data=Data_visualization(name)
data.start()
##corpus = st.CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parsed').build()
# df=pd.read_csv("After_Classification/After_Classification_Trump_D1.csv")
# print(df["Date"].head())