-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_processing.R
82 lines (64 loc) · 3.25 KB
/
data_processing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# Build a volatile corpus from the UTF-8 text files in one input directory
# using the tm package. Exactly one DirSource line should be active at a
# time; the others are kept as commented-out alternatives for different
# sample sizes / the held-out test set.
library(tm)
#corp <- VCorpus(DirSource("./sample/en_US/",encoding = "UTF-8"))
#corp <- VCorpus(DirSource("./sample2/en_US/",encoding = "UTF-8"))
corp <- VCorpus(DirSource("./sample3/en_US/",encoding = "UTF-8"))
#corp <- VCorpus(DirSource("./test_set/",encoding = "UTF-8"))
# Profanity list (one word per line) used to censor the corpus.
# FALSE/TRUE spelled out instead of F/T: F and T are ordinary variables
# that can be reassigned, so relying on them is fragile.
profanities<-read.table("profanities.txt",stringsAsFactors = FALSE,strip.white = TRUE,sep = "\n")[,1] #source: "https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/en"
# Remove all profane words from a document's text.
skipWords <- function(x) removeWords(x,profanities)
# Remove punctuation including Unicode punctuation classes
remPunct <- function(x) removePunctuation(x,ucp=TRUE)# (when ucp == False, ” are not removed)
# Drop whole lines that contain characters not representable in the
# Western (WINDOWS-1252) encoding: iconv() yields NA for elements that
# cannot be converted from UTF-8, and those lines are filtered out.
removeNonWesternCharacters <- content_transformer(function(x) {
  keep <- !is.na(iconv(x, from = "UTF-8", to = "WINDOWS-1252"))
  # Report progress via message() (stderr) rather than print(): status
  # output then stays out of captured results and can be suppressed.
  message("removed ", length(x) - sum(keep), " lines")
  x[keep]
}) # removal of lines with non-Western characters
# Cleaning pipeline: the listed transformations are composed by tm_reduce
# inside tm_map.
# NOTE(review): tm_reduce folds tmFuns from the right, which would apply
# the functions last-to-first (removeNonWesternCharacters before
# lowercasing and profanity removal) -- confirm against the tm docs.
funs <- list(skipWords,
stripWhitespace,
remPunct,
removeNumbers,
content_transformer(tolower),
removeNonWesternCharacters)
corp <- tm_map(corp,FUN=tm_reduce,tmFuns=funs)
# Flatten the corpus into one character vector by concatenating the
# contents of its documents (assumes exactly three documents, e.g.
# blogs/news/twitter -- TODO confirm the source directory layout).
flatcorp <- c(corp[[1]][[1]],corp[[2]][[1]],corp[[3]][[1]])#make just a char vector by appending document contents
library(quanteda)
library(RWeka)
library(stringr)
library(dplyr)
#Clustering, tokenisation, handling of unknown words
#---------------------------------------------------
# Unigram frequency table, restricted to words seen more than once.
single_words <- textstat_frequency(dfm(tokens(flatcorp,ngrams=1))) %>% filter(frequency>1)
# "Unknown" candidates: words whose unigram frequency is exactly 2
# (the combined filters frequency>1 and frequency<3 leave only 2).
unks <- (single_words %>% filter(frequency<3,frequency>1) %>% select(feature))$feature
#unique words do not need to be checked because only tokens with frequency >1 have been filtered
#Handling unknown words
#Replace all words that occur as unigram rarely by <unk>
# Mask rare words in "_"-separated n-gram features: every word listed in
# the global `unks` vector is replaced by the token "<unk>", and the
# n-grams are returned re-joined with single spaces (trimmed at the ends).
replace_unknowns <- function(features){
  words <- str_split(features, pattern = "_", simplify = TRUE)
  words[words %in% unks] <- "<unk>"
  rejoined <- apply(words, 1, function(row) paste(row, collapse = " "))
  str_trim(rejoined)
}
# Frequency table of all 2- to 5-grams seen more than once.
freqs <- textstat_frequency(dfm(tokens(flatcorp,ngrams=2:5))) %>% filter(frequency>1)
ngramdb <- tibble(feature=freqs$feature,frequency=as.integer(freqs$frequency))
# Split each "_"-joined n-gram into its order (grams), the last word
# (prediction) and the preceding words (predictor). Note the columns are
# computed sequentially inside transmute: the second `predictor=` line
# reuses the value assigned on the line above.
ngramdb <- ngramdb %>% transmute(
grams=as.integer(1+str_count(feature,pattern="_")),
# last "_"-separated word of the n-gram
prediction=str_extract(feature,pattern="[^_]+$"),
# everything before the last word; `feature` binds to sub()'s x argument
# because pattern and replacement are passed by name
predictor=sub(feature,pattern = "_[^_]+$",replacement =""),
# mask rare words in the predictor with <unk>
predictor=replace_unknowns(predictor),
frequency)
# Calculate the conditional probability of the prediction given its
# predictor: count(predictor + prediction) / total count of predictor.
# (Previous comment had the conditioning direction reversed.)
totals <- ngramdb %>% group_by(predictor) %>% summarise(tot=sum(frequency))
# Join explicitly on "predictor" rather than relying on the implicit
# natural join, so the join key is documented and dplyr stays silent.
ngramdb <- ngramdb %>% left_join(totals, by="predictor") %>% mutate(condprob=frequency/tot) %>% select(-tot)
# Prune low-frequency 2-grams (<5) and 3-grams (<3) to keep the model small.
ngramdb <- filter(ngramdb,!(grams==2 & frequency<5),!(grams==3 & frequency<3))
#for performance reasons
# Vocabulary of all words appearing in predictors, used to detect
# out-of-vocabulary words cheaply at prediction time. TRUE spelled out
# instead of the reassignable shorthand T.
# NOTE(review): str_split(..., simplify = TRUE) pads ragged rows with "",
# so knowns may contain an empty string -- confirm downstream handling.
knowns <- unique(as.character(str_split(ngramdb$predictor," ",simplify = TRUE)))
# Persist the model. Exactly one pair of saveRDS() lines should be
# active, matching the input directory chosen at the top of the script.
# (Fixed: the commented-out "big" variant saved ngramdb to the knowns
# file; it now saves knowns.)
#saveRDS(ngramdb,"./training_set/ngramdb_big.rds")
#saveRDS(knowns,"./training_set/knowns_big.rds")
saveRDS(ngramdb,"./training_set/ngramdb_medium.rds")
saveRDS(knowns,"./training_set/knowns_medium.rds")
#saveRDS(ngramdb,"./training_set/ngramdb.rds")
#saveRDS(knowns,"./training_set/knowns.rds")
#saveRDS(ngramdb,"./test_set/ngramdb.rds")
#x <- table(flatten(gr3)) #Distribution of occurrence counts
#y <- as.numeric(names(x))*x #Weighting
#y_dist <- ecdf(y)
#plot(y_dist)