-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path03_Processing.R
79 lines (61 loc) · 1.97 KB
/
03_Processing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#-------------Packages-------------
library(abjutils)
library(tidytext)
library(tidyverse)
library(magrittr)
library(stm)
library(tm)
library(ggridges)
library(formattable)
#------------- General Data frame -------------
# Label every row of df.channel1 with its channel, then stack the four
# per-channel data frames into a single one.
# The label vector is sized from the data rather than a hard-coded row count,
# so the script keeps working when df.channel1 gains or loses rows.
channel1 <- rep("channel1", nrow(df.channel1))
df.channel1 <- cbind(df.channel1, Channel = channel1)
# NOTE(review): df.channel2..4 presumably already carry a "Channel" column;
# rbind() needs identical column names across all four -- confirm upstream.
df.final.all <- rbind(df.channel1, df.channel2, df.channel3, df.channel4)
#------------- stopwords list -------------
# Portuguese stop words from tm, with accents stripped via abjutils::rm_accent().
# Defined BEFORE the word list below, which filters on it; it is also passed to
# stm::textProcessor() later in the script.
sw_pt_tm <- tm::stopwords("pt")
sw_pt_tm <- rm_accent(sw_pt_tm)
#------------- General wordlist -------------
# Word frequencies over all captions, most frequent first, rendered with
# formattable().
# NOTE(review): sw_pt_tm is accent-free while the tokens keep their accents, so
# accented stop words may slip through this filter -- confirm whether the tokens
# should also go through rm_accent() before matching.
df.final.all %>%
  unnest_tokens(word, caption) %>%
  # Drop stop words
  filter(!word %in% sw_pt_tm) %>%
  # Count occurrences, largest first
  count(word, sort = TRUE) %>%
  formattable()
#------------- Processing the topics -------------
# Clean and tokenise the captions; the full data frame travels along as
# document metadata.
proc <- stm::textProcessor(
  documents = df.final.all$caption,
  metadata = df.final.all,
  language = "portuguese",
  customstopwords = sw_pt_tm
)

# Build the stm corpus, pruning infrequent terms (lower.thresh = 10).
out <- stm::prepDocuments(
  documents = proc$documents,
  vocab = proc$vocab,
  meta = proc$meta,
  lower.thresh = 10
)

# Diagnostics for candidate topic counts 3 through 15.
storage <- stm::searchK(
  documents = out$documents,
  vocab = out$vocab,
  K = 3:15,
  data = out$meta
)

# Final model with four topics.
fit <- stm::stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 4,
  data = out$meta,
  init.type = "Spectral",
  max.em.its = 75,
  verbose = FALSE
)
#------------- Common words -------------
# Inspect the fitted model interactively.
## Most common words in each topic
stm::labelTopics(model = fit)
# plot(fit, "summary")
# Scatter plot of the document-topic proportion matrix
plot(x = fit$theta, type = "p", col = "blue")
# First rows of the proportions, then the full matrix in the data viewer
head(x = fit$theta)
view(x = fit$theta)
#------------- Topic names -------------
# Labels for the four topics of `fit`; position i names the i-th column of
# fit$theta, which is how they are looked up further down.
Topic.Names <- c("Relationships", "Gender", "Beauty", "Transition")
#------------- Video possibility -------------
# fit$theta has one row per modelled document and one column per topic.
# textProcessor()/prepDocuments() may drop documents (e.g. captions left empty
# after cleaning), so theta is aligned with out$meta, not necessarily with
# df.final.all. Attach the results to out$meta to keep rows matched.
stopifnot(
  length(Topic.Names) == ncol(fit$theta),
  nrow(out$meta) == nrow(fit$theta)
)
# Highest topic proportion of each video
prob <- apply(fit$theta, 1, max)
# Name of the topic with that highest proportion
Videos.Topic <- Topic.Names[apply(fit$theta, 1, which.max)]
# Join
df_topics <- out$meta %>%
  mutate(
    best_prob = prob,
    topic = Videos.Topic
  )