### import the required modules and functions
from math import ceil
import numpy as np
from config import *
from ModelEstimation import LDAModel
from reddit_parser import Parser
from Utils import *
import ast
### Create directory for storing the output if it does not already exist
if not os.path.exists(output_path):
    print("Creating directory to store the output")
    os.makedirs(output_path)
### Write hyperparameters and performance to file
Write_Performance()
### call the parsing function
# NOTE: If NN = False, will pre-process data for LDA.
# NOTE: If write_original = True, the original text of a relevant comment -
# without preprocessing - will be saved to a separate file
# NOTE: If clean_raw = True, the compressed data files will be removed from disk
# after processing
# NOTE: Relevance filters can be changed from defaults.py
# NOTE: If there is a partial record on file, e.g. one that includes some months
# in the desired range but not all, manually delete the aggregated text files
# resulting from previous parsing before running the function again. These files
# can be identified by not having a certain month in their filenames and,
# depending on parameter settings, can include: author, sentiments, lda_prep,
# nn_prep, original_comm, original_indices, Random_Count_Dict, Random_Count_List,
# random_indices, RC_Count_Dict, RC_Count_List, total_count and votes
theparser = Parser()
# create the relevant subfolders for storing dataset attributes
theparser.safe_dir_create()
theparser.Parse_Rel_RC_Comments()
if Neural_Relevance_Filtering:
    # Use a transformer-based neural network trained on human ratings to prune
    # irrelevant posts from the dataset. Path will default to the Human_Ratings
    # folder
    theparser.Neural_Relevance_Screen()
    # Needs results from Neural_Relevance_Screen
    theparser.Neural_Relevance_Clean()
# Filter the dataset based on whether posts are in English (uses Google's
# language detection)
# NOTE: Requires original text of comments to be available on disk
# NOTE: Should be separately run for LDA and NN, as their preprocessed comments
# are stored in separate files
# NOTE: Performance is significantly worse for shorter posts. By default,
# the filtering is only performed on posts that contain at least 20 words
theparser.lang_filtering()
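## Illustrative sketch (not part of the pipeline, left commented out): a
# standalone example of the kind of per-post language check lang_filtering()
# performs, using the langdetect package (a Python port of Google's
# language-detection library). The package choice and the names below are
# assumptions for illustration; the actual filtering logic lives in
# reddit_parser.py
# from langdetect import detect, DetectorFactory
# DetectorFactory.seed = 0  # make detection deterministic across runs
# example_post = ("This comment is written in plain English and it is easily "
#                 "long enough to be classified reliably by the language "
#                 "detector in this short illustrative example.")
# if len(example_post.split()) >= 20 and detect(example_post) == "en":
#     print("post would be kept")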
# TODO: Run the function for alternative sentiment estimates after this
## TextBlob sentiment analysis is integrated into parsing. If alternative
# sentiment measures are requested and not already available on disk, the
# following function retrieves them (from NLTK's Vader and CoreNLP)
# NOTE: Make sure that Stanford CoreNLP's Python package is unzipped to the
# same directory as this file and CoreNLP_server.py is also available before
# running this function.
# if add_sentiment:
#     theparser.add_sentiment()
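## Illustrative sketch (not part of the pipeline, left commented out): how
# single-comment TextBlob and Vader scores can be obtained. The CoreNLP measure
# is omitted because it needs a running server; the variable names here are
# hypothetical and the actual per-comment logic lives in reddit_parser.py
# from textblob import TextBlob
# from nltk.sentiment.vader import SentimentIntensityAnalyzer  # requires the
# # vader_lexicon resource: nltk.download('vader_lexicon')
# example_comment = "This community has honestly been really helpful to me"
# textblob_polarity = TextBlob(example_comment).sentiment.polarity  # in [-1, 1]
# vader_compound = SentimentIntensityAnalyzer().polarity_scores(
#     example_comment)["compound"]  # in [-1, 1]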
## call the function for calculating the percentage of relevant comments
if calculate_perc_rel:
    theparser.Perc_Rel_RC_Comment()
### create training and evaluation sets
if not ENTIRE_CORPUS:
    theparser.select_random_comments()
## Determine the comments that will comprise each set
# NOTE: If NN = False, will create sets for LDA.
ldam = LDAModel()
ldam.Define_Sets()
## read the data and create the vocabulary and the term-document matrix
# NOTE: Needs loaded sets. Use Define_Sets() before running this function even
# if prepared sets exist on file
ldam.LDA_Corpus_Processing()
### Train and Test the LDA Model ###
ldam.get_model()
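## Illustrative sketch (not part of the pipeline, left commented out): the core
# gensim steps that a bag-of-words LDA workflow of this kind rests on, shown on
# toy data. Dictionary filtering, hyperparameters and caching for the actual
# model are assumed to be handled inside ModelEstimation.py and config.py
# from gensim.corpora import Dictionary
# from gensim.models import LdaModel
# toy_texts = [["forum", "post", "about", "dating"],
#              ["post", "about", "work", "and", "dating"]]
# toy_dictionary = Dictionary(toy_texts)
# toy_corpus = [toy_dictionary.doc2bow(text) for text in toy_texts]
# toy_lda = LdaModel(corpus=toy_corpus, id2word=toy_dictionary, num_topics=2)
# print(toy_lda.show_topic(0, topn=3))  # (word, probability) pairs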
### calculate a lower bound on per-word perplexity for training and evaluation sets
# NOTE: This function writes the estimates after calculation to the file "perf"
# NOTE: This is a slow, serial function that does not check for previously
# computed estimates. Check the file named Performance in the output folder
# manually and comment this block out if estimates already exist
if calculate_perplexity:
    train_per_word_perplex, eval_per_word_perplex = ldam.Get_Perplexity()
### calculate UMass coherence to allow for interpretability comparison between
# models with different [num_topics]
if calculate_coherence:
    ldam.Get_Coherence()
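## Illustrative sketch (not part of the pipeline, left commented out): how the
# two diagnostics above can be obtained directly from gensim on toy data.
# Get_Perplexity() and Get_Coherence() are assumed to wrap calculations of this
# kind for the project's training and evaluation corpora
# from gensim.corpora import Dictionary
# from gensim.models import CoherenceModel, LdaModel
# toy_texts = [["forum", "post", "dating"], ["post", "work", "dating"]]
# toy_dictionary = Dictionary(toy_texts)
# toy_corpus = [toy_dictionary.doc2bow(text) for text in toy_texts]
# toy_lda = LdaModel(corpus=toy_corpus, id2word=toy_dictionary, num_topics=2)
# bound = toy_lda.log_perplexity(toy_corpus)  # per-word likelihood bound
# perplexity = 2 ** (-bound)                  # lower-bound per-word perplexity
# umass = CoherenceModel(model=toy_lda, corpus=toy_corpus,
#                        coherence='u_mass').get_coherence()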
### Determine Top Topics Based on Contribution to the Model ###
# NOTE: There is a strict dependency hierarchy between the functions that come
# in this section and the next. They should be run in the order presented
### go through the corpus and calculate the contribution of each topic to
# comment content in each year
## Technical comments
# NOTE: The contribution is calculated over the entire dataset, not just the
# training set, but words not in the dictionary are ignored
# NOTE: Percentages of contribution are relative to the parts of the corpus for
# which there WAS a reasonable prediction based on the model
# NOTE: For the LDA to give reasonable output, the number of topics given to
# this function should not be changed from what it was during model training
# NOTE: Serial, threaded and multicore (default) versions of this function are
# available (see Utils.py)
# NOTE: Even with multiprocessing, this function can be slow in proportion to
# the number of top topics and the size of the dataset
## Load or calculate topic distributions and create an enhanced version of the entire dataset
ldam.Get_Topic_Contribution()
ldam.get_top_topics()
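## Illustrative sketch (not part of the pipeline, left commented out): the
# per-document topic distribution that these contributions aggregate can be read
# off any bag-of-words vector with gensim's get_document_topics(), shown here on
# toy data. The multicore aggregation over the real dataset lives in Utils.py
# from gensim.corpora import Dictionary
# from gensim.models import LdaModel
# toy_texts = [["forum", "post", "dating"], ["post", "work", "dating"]]
# toy_dictionary = Dictionary(toy_texts)
# toy_corpus = [toy_dictionary.doc2bow(text) for text in toy_texts]
# toy_lda = LdaModel(corpus=toy_corpus, id2word=toy_dictionary, num_topics=2)
# print(toy_lda.get_document_topics(toy_corpus[0], minimum_probability=0))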
## Plot the temporal trends in the top topics and save it to the output path
ldam.Plotter("{}/Temporal_Trend-{}-{}-{}".format(
    ldam.output_path,
    "1hot" if one_hot_topic_contributions else "MLE", str(num_topics),
    "idf" if topic_idf else "f"))
## Find the top words associated with top topics and write them to file
with open("{}/top_words-{}".format(
output_path, "idf" if topic_idf else "f"), 'w') as f: # create a file for storing
# the high-probability words
for topic in ldam.top_topics:
print(topic, file=f)
output = ldam.ldamodel.show_topic(topic, topn=topn)
print(output, file=f)
### Find the most Representative Comments for the Top Topics ###
### Retrieve the probability assigned to top topics for comments in the dataset
# NOTE: This function only outputs the probabilities for comments of length at
# least [min_comm_length] with non-zero probability assigned to at least one
# top topic
ldam.Get_Top_Topic_Theta()
## call the function for sampling the most impactful comments
if num_pop is not None:
    ldam.sample_pop()
### for the top topics, choose the [sample_comments] comments that reflect the
# greatest contribution of those topics and write them to file
# NOTE: If write_original was set to False during the initial parsing, this
# function will require all the original compressed data files (and will be much
# slower). If not in the same directory as this file, change the "path" argument
# NOTE: This function ignores comments in the incels subreddit, as no vote info
# for this banned subreddit exists in the source dataset
ldam.Get_Top_Comments()
## find top words associated with EVERY topic and write them to file
top_words_all = {key: [] for key in range(num_topics)}
with open(output_path + '/top_words_all_' + str(num_topics), 'a+') as f:
    # create a file for storing the high-probability words
    for topic in top_words_all.keys():
        print(topic, file=f)
        output = ldam.ldamodel.show_topic(topic, topn=topn)
        print(output, file=f)
        top_words_all[topic] = output