-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path0523_dtm.R
93 lines (62 loc) · 2.47 KB
/
0523_dtm.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# corpus ver.
text <- c('Crash dieting is not the best way to lose weight. http://bbc.in/1G0J4Agg',
          'A vegetarian diet excludes all animal flesh (meat, poultry, seafood).',
          'Economists surveyed by Refinitiv expect the economy added 160,000 jobs.')

# preprocessing workflow: build a volatile corpus, then clean it step by step.
# NOTE: the tm_map order matters -- URLs are removed BEFORE removePunctuation,
# since stripping punctuation first would break the URL pattern match.
library(tm)
corpus.docs <- VCorpus(VectorSource(text))

# custom transformer: delete every match of `pattern` (used for URLs below)
myRemove <- content_transformer(function(x, pattern) {
  gsub(pattern, '', x)
})

corpus.docs <- tm_map(corpus.docs, content_transformer(tolower))
corpus.docs <- tm_map(corpus.docs, removeWords, stopwords('english'))
corpus.docs <- tm_map(corpus.docs, myRemove, '(f|ht)tp\\S+\\s*')
corpus.docs <- tm_map(corpus.docs, removePunctuation)
corpus.docs <- tm_map(corpus.docs, removeNumbers)
corpus.docs <- tm_map(corpus.docs, stripWhitespace)
corpus.docs <- tm_map(corpus.docs, content_transformer(trimws))
corpus.docs <- tm_map(corpus.docs, stemDocument)
# manual stem repair: collapse 'economist' onto the same stem as 'economy'
corpus.docs <- tm_map(corpus.docs, content_transformer(gsub),
                      pattern='economist', replacement='economi')
corpus.docs
# Build the document-term matrix (rows = documents, columns = terms).
# wordLengths=c(2, Inf) keeps two-letter terms that the package default
# minimum length would otherwise drop.
# ?DocumentTermMatrix   # interactive help lookup; commented out so that
#                       # source()-ing this script does not open a help page
corpus.dtm <- DocumentTermMatrix(corpus.docs, control=list(wordLengths=c(2, Inf)))
corpus.dtm
nTerms(corpus.dtm)   # number of distinct terms
Terms(corpus.dtm)    # the terms themselves
nDocs(corpus.dtm)    # number of documents
Docs(corpus.dtm)     # document identifiers
# modify document title: replace the numeric document ids with source names
rownames(corpus.dtm)                # current document ids
doc.labels <- c('BBC', 'CNN', 'FOX')
rownames(corpus.dtm) <- doc.labels
Docs(corpus.dtm)                    # ids now carry the source names
inspect(corpus.dtm)                 # summary view of the whole matrix
inspect(corpus.dtm[1:2, 10:15])     # small slice: first 2 docs, terms 10-15
library(tidytext)
tidy(corpus.dtm)                    # long format: one row per (doc, term, count)
# tidy ver. -- rebuild the same document-term matrix with tidytext
text <- c('Crash dieting is not the best way to lose weight. http://bbc.in/1G0J4Agg',
          'A vegetarian diet excludes all animal flesh (meat, poultry, seafood).',
          'Economists surveyed by Refinitiv expect the economy added 160,000 jobs.')
source <- c('BBC', 'CNN', 'FOX')
library(dplyr)
library(SnowballC)
library(tidytext)

# strip URLs and digits up front, then tokenize, drop stopwords, and stem
text.df <- tibble(source=source, text=text) %>%
  mutate(text = gsub('(f|ht)tp\\S+\\s*', '', text),
         text = gsub('\\d+', '', text))
tidy.docs <- text.df %>%
  unnest_tokens(output=word, input=text) %>%
  anti_join(stop_words, by='word') %>%
  mutate(word = wordStem(word),
         word = gsub('\\s+', '', word),
         word = gsub('economist', 'economi', word))
tidy.docs %>% print(n=Inf)

# per-document term counts, then cast the counts into a DocumentTermMatrix
tidy.docs %>%
  count(source, word)
tidy.dtm <- tidy.docs %>%
  count(source, word) %>%
  cast_dtm(document=source, term=word, value=n)
tidy.dtm
Terms(tidy.dtm)
Docs(tidy.dtm)
inspect(tidy.dtm)