# student_tm.R
# Read the whole student text file into a single string
fileName <- '//cifsp02/UD101P01/Home/HKCPU/DATA/My Documents/Data Science/Projects/Student Text Mining/student1_txt.txt'
txt <- readChar(fileName, file.info(fileName)$size)
# number of characters
nchar(txt)
# Generic corpus-cleaning helper (defined here but not called later in this script;
# the extra stopwords "mug" and "coffee" look like leftover example terms)
clean_corpus <- function(corpus){
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, c(stopwords("en"), "mug", "coffee"))
return(corpus)
}
# Cleaning functions
# Replace umlauts and accented vowels with ASCII equivalents
UmlautCleaner <- function(text_vector){
text_vector <- gsub("[ä]", "ae", text_vector)
text_vector <- gsub("[ö]", "oe", text_vector)
text_vector <- gsub("[ü]", "ue", text_vector)
text_vector <- gsub("[è]", "e", text_vector)
text_vector <- gsub("[é]", "e", text_vector)
text_vector <- gsub("[à]", "a", text_vector)
return(text_vector)
}
TextPreprocessing <- function(text_vector){
# Keep only ASCII letters (this also removes punctuation and numbers)
text_vector <- gsub("[^A-Za-z]", " ", text_vector)
# Drop very short words (1 to 3 letters)
text_vector <- gsub(" *\\b[[:alpha:]]{1,3}\\b *", " ", text_vector)
return(text_vector)
}
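# Illustrative check of the two helpers on a made-up sentence (the sample text
# below is an assumption for demonstration only, not project data): umlauts
# become "ue"/"ae", digits and punctuation disappear, and short words are dropped
TextPreprocessing(UmlautCleaner("Grüne Dächer speichern 30% des Niederschlags"))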
# Corpus-level cleaning: lowercase, remove German stopwords plus project-specific noise terms, collapse whitespace
CorpusCleaner <- function(corpus){
corpus_tmp <- tm_map(corpus, content_transformer(tolower))
corpus_tmp <- tm_map(corpus_tmp, removeWords, c(stopwords("de"), "https", "intranet", "sitepages", "aspx", "portal", "fuer", "dass", "wurde", "dachbegruenung", "abbildung", "abbildungsverzeichnis"))
corpus_tmp <- tm_map(corpus_tmp, removeWords, c("schritte", "erwartetes", "ergebnis", "tatsaechliches"))
corpus_tmp <- tm_map(corpus_tmp, stripWhitespace)
return(corpus_tmp)
}
# Strip the intranet prefix and query/anchor fragments from URLs
# (helper is defined here but not called later in this script)
URLCleaner <- function(URL){
URL <- gsub("https://intranet.hel.kko.ch/", "", URL)
URL <- gsub("\\=.*", "", URL, perl = TRUE)
URL <- gsub("\\#.*", "", URL, perl = TRUE)
URL <- gsub(" ", "", URL)
return(URL)
}
# Cleaning the text
txt_clean <- UmlautCleaner(txt)
txt_clean <- TextPreprocessing(txt_clean)
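# Spot-check the cleaned text: new character count and the first few hundred characters
# (the 300-character window below is an arbitrary choice)
nchar(txt_clean)
substr(txt_clean, 1, 300)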
############################
# Let's start with Text Mining
library("tm")
library("ggplot2")
txt_source <- VectorSource(txt_clean)
txt_corpus <- VCorpus(txt_source)
txt_corpus_clean <- CorpusCleaner(txt_corpus)
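# Quick look at the cleaned corpus; it holds a single document,
# because the whole file was read into one string
inspect(txt_corpus_clean)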
# Let's start with a tdm
txt_tdm <- TermDocumentMatrix(txt_corpus_clean)
txt_m <- as.matrix(txt_tdm)
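# One row per term, one column per document
dim(txt_m)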
term_frequency <- rowSums(txt_m)
term_frequency <- sort(term_frequency, decreasing = TRUE)
term_frequency[1:10]
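# An alternative view with tm's findFreqTerms(); the lowfreq cutoff of 5
# is an arbitrary, illustrative threshold
findFreqTerms(txt_tdm, lowfreq = 5)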
#### Some plots
library(dplyr)
tf <- as.data.frame(term_frequency)
tf$words <- row.names(tf)
tf10 <- as.data.frame(tf[1:10,])
# Fix the factor levels in frequency order so ggplot keeps the bars sorted
tf10 <- mutate(tf10, words = factor(words, levels = words))
ggplot(tf10, aes(x = words, y = term_frequency)) +
  geom_bar(stat = "identity", fill = "tan", col = "black") +
  theme_grey() +
  theme(text = element_text(size = 16),
        axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  ylab("Word Frequency")
## Wordclouds
library(wordcloud)
word_freqs <- data.frame(term = names(term_frequency), num = term_frequency)
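# wordcloud() places terms randomly; fixing the seed keeps the layout reproducible
# (the seed value 42 is an arbitrary choice)
set.seed(42)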
wordcloud(word_freqs$term, word_freqs$num, max.words = 100, colors = c("grey80", "darkgoldenrod1", "tomato"))
# Network plot of terms associated with "niederschlag" (qdap)
library(qdap)
word_associate(txt_clean, match.string = c("niederschlag"),
stopwords = c(stopwords("de"), "https", "intranet", "sitepages", "aspx", "portal","fuer", "dass", "wurde", "dachbegruenung", "abbildung", "abbildungsverzeichnis"),
network.plot = TRUE)
# Word associations: hierarchical clustering of frequent terms
txt_tdm1 <- removeSparseTerms(txt_tdm, sparse = 0.9999)
# Keep only the first 20 terms of the reduced TDM
txt_tdm1 <- txt_tdm1[1:20,]
dim(txt_tdm1)
txt_tdm1_m <- as.matrix(txt_tdm1)
txt_tdm1_df <- as.data.frame(txt_tdm1_m)
# Distances between the term profiles of the reduced TDM (not used for the dendrogram below)
txt_dist <- dist(txt_tdm1_df)
# Dendrogram of the 20 most frequent terms, using their raw frequencies as distances
tst <- dist(term_frequency[1:20])
txt_hc <- hclust(tst)
plot(txt_hc)
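# Optionally highlight groups of terms on the dendrogram
# (cutting into k = 4 clusters is an arbitrary, illustrative choice)
rect.hclust(txt_hc, k = 4)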