-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path0610_911.R
69 lines (40 loc) · 1.36 KB
/
0610_911.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# Word cloud of the New York Times "On This Day" page for September 11.
#
# Steps: download the page, pull the text of every <p> element, clean it,
# build a tm corpus, derive a document-term matrix, and plot the most
# frequent (stemmed) terms as a word cloud.

# Dependencies ----
library(httr)
library(XML)
library(tm)
library(wordcloud)
library(RColorBrewer)

# Download and parse ----
url <- "https://archive.nytimes.com/www.nytimes.com/learning/general/onthisday/big/0911.html"
html <- GET(url)
# Fail early with a clear error if the request did not succeed.
stop_for_status(html)
# Parse the response body explicitly as text rather than handing the whole
# httr response object to the parser.
html.parsed <- htmlParse(content(html, as = "text"), asText = TRUE)

# Text of every paragraph node, dropping empty paragraphs.
text <- xpathSApply(html.parsed, "//p", xmlValue)
str(text)
text <- text[text != ""]
text

# Merge all paragraphs into a single string.
text <- paste(text, collapse = " ")
text

# Collapse every run of whitespace (including newlines) into ONE space.
# Replacing with "" would glue neighbouring words together.
text <- trimws(gsub("\\s+", " ", text))
text

# Corpus construction ----
doc <- VCorpus(VectorSource(text))
doc
inspect(doc)

# Text preprocessing ----
# Pass the function `tolower` itself; `tolower()` would be evaluated
# immediately with no argument and raise an error.
doc <- tm_map(doc, content_transformer(tolower))
inspect(doc)

# Standard English stop words plus a few extra high-frequency fillers.
mystopwords <- c(
  stopwords("english"),
  c("also", "among", "but", "even", "four", "get", "one", "said", "the",
    "there", "two", "three")
)
doc <- tm_map(doc, removeWords, mystopwords)
doc <- tm_map(doc, removePunctuation)
doc <- tm_map(doc, removeNumbers)
doc <- tm_map(doc, stripWhitespace)
doc <- tm_map(doc, stemDocument)

# Document-term matrix and term frequencies ----
dtm <- DocumentTermMatrix(doc)
dtm
# Show at most the first 10 terms; guards against a matrix with fewer columns.
inspect(dtm[, seq_len(min(10L, ncol(dtm)))])

term.freq <- colSums(as.matrix(dtm))
head(term.freq)
# Ten most frequent terms (fewer if the vocabulary is smaller than 10).
head(term.freq[order(term.freq, decreasing = TRUE)], 10L)

# Word cloud ----
set.seed(123)  # fixed seed so the layout is reproducible
# dev.new() opens the platform's default graphics device; windows() exists
# only on Windows.
dev.new(width = 6.5, height = 6.5)
wordcloud(
  words = names(term.freq),
  freq = term.freq,
  scale = c(4, 0.4),
  min.freq = 3,
  rot.per = 0,
  random.order = FALSE,
  random.color = FALSE,
  colors = brewer.pal(5, "Set1")
)

# Open the RColorBrewer help page (palette reference).
?RColorBrewer