# 02_solve_for_all_function.R
# source of pdf files:
# https://intelligence.house.gov/social-media-content/social-media-advertisements.htm
library(pdftools)
library(tidyverse)
library(janitor)
library(lubridate)
# function to extract the ad metadata fields from the first page of a pdf file
extractmydata <- function(myfile) {
  text <- pdf_text(myfile)[1]  # only the first page carries the metadata labels
  df <- tibble(
    document_name = myfile,
    # ad_id = str_trim(gsub(".*Ad ID\\s*|Ad Text.*", "", text)),
    ad_id = str_trim(str_remove(str_extract(text, "Ad ID.*"), "Ad ID")),
    ad_text = if_else(
      str_detect(text, "Ad Text"),
      str_trim(gsub(".*Ad Text\\s*|Ad Landing.*", "", text)),
      ""),
    ad_landing_page = str_trim(gsub(".*Ad Landing Page\\s*|Ad Targeting.*", "", text)),
    ad_impressions = if_else(
      str_detect(text, "Ad Impressions"),
      str_trim(gsub(".*Ad Impressions\\s*|Ad Clicks.*", "", text)),
      ""),
    ad_clicks = if_else(
      str_detect(text, "Ad Clicks"),
      str_trim(gsub(".*Ad Clicks\\s*|Ad Spend.*", "", text)),
      ""),
    ad_spend = if_else(
      str_detect(text, "Ad Spend"),
      str_trim(gsub(".*Ad Spend\\s*|Ad Creation.*", "", text)),
      ""),
    ad_creation_date = str_squish(gsub(".*Ad Creation Date\\s*|P.*", "", text)), # stop at the P in PST/PDT
    ad_end_date = if_else(
      str_detect(text, "Ad End Date"),
      str_squish(gsub(".*Ad End Date\\s*|P.*", "", text)), # stop at the P in PST/PDT
      ""),
    target_age = str_trim(str_remove(str_extract(text, "Age.*"), "Age:")),
    target_location = str_trim(str_remove(str_extract(text, "Location.*"), "Location -")),
    target_language = str_trim(str_remove(str_extract(text, "Language.*"), "Language:")),
    target_pplwhomatch = if_else(
      str_detect(text, "People Who Match"),
      str_trim(gsub(".*People Who Match\\s*|Ad Impressions.*", "", text)),
      ""),
    ad_targeting_fulltext = str_squish(gsub(".*Ad Targeting\\s*|Ad Impressions.*", "", text))
  )
  print(myfile)  # shows which file is being processed when looping over many pdfs
  return(df)
}
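# A quick illustration (on a made-up snippet, not a real PDF page) of how the
# paired-alternation gsub pattern works: the first alternative strips everything
# up to and including the starting label, the second strips everything from the
# next label onward, leaving just the field value in between.
sample_page <- "Ad ID 123 Ad Text Hello world Ad Landing Page example.com"
str_trim(gsub(".*Ad Text\\s*|Ad Landing.*", "", sample_page))
# [1] "Hello world"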
# run the function on a single pdf
extractmydata("pdfs/P(1)0000001.pdf")
# success!
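# If a field ever comes back empty or garbled, printing the raw first-page text
# is a handy way to check how the labels actually appear in that PDF:
# cat(pdf_text("pdfs/P(1)0000001.pdf")[1])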
#### Now let's do this for ALL the files ####
# get a list of all the files in the pdfs directory
allfiles <- list.files("./pdfs", full.names = TRUE)
allfiles
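# Note: list.files() as written returns everything in the folder. If the
# directory might contain non-PDF files, restricting it to .pdf would be safer
# (optional, only needed if stray files are possible):
# allfiles <- list.files("./pdfs", pattern = "\\.pdf$", full.names = TRUE)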
# run a purrr map function (i.e., looping) to apply our processing to every file
# map_df(allfiles, extractmydata)
# save results into new dataframe
myresults <- map_df(allfiles, extractmydata)
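# Optional safeguard: if any single PDF failed to parse, map_df() would stop
# partway through. Wrapping the extractor with purrr::possibly() (assuming it's
# acceptable to skip a bad file) returns an empty tibble for failures instead:
# safe_extract <- possibly(extractmydata, otherwise = tibble())
# myresults <- map_df(allfiles, safe_extract)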
# write to file
write_csv(myresults, "myresults.csv")
#### now we'll clean up some formatting of the results data ####
glimpse(myresults)
# convert impressions and clicks to numeric, dates to date format
# also pull out year, month, day from dates and trim document_name to remove the path itself
myresults_formatted <- myresults %>%
  mutate(
    ad_impressions = parse_number(ad_impressions), # use readr's parse_number to handle commas in text
    ad_clicks = parse_number(ad_clicks),
    ad_creation_date = mdy_hms(ad_creation_date),
    ad_creation_year = year(ad_creation_date),
    ad_creation_month = month(ad_creation_date),
    ad_creation_day = day(ad_creation_date),
    ad_end_date = mdy_hms(ad_end_date),
    document_name = str_sub(document_name, 8, 50), # drop the leading "./pdfs/" path
    target_pplwhomatch = str_remove(target_pplwhomatch, ": ")
  )
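# A quick (optional) sanity check on how many values failed to parse into
# numbers or dates; NAs here usually mean an unexpected format in the PDFs:
myresults_formatted %>%
  summarise(
    missing_impressions = sum(is.na(ad_impressions)),
    missing_clicks = sum(is.na(ad_clicks)),
    missing_creation_date = sum(is.na(ad_creation_date))
  )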
# handle occasional "None" in ad spend instead of a zero/blank
myresults_formatted <- myresults_formatted %>%
  mutate(
    ad_spend = str_replace(ad_spend, "None", "")
  )
# pull the currency label out of ad_spend, then convert the spend to a number
currency_vector <- str_extract(myresults_formatted$ad_spend, "RUB")
myresults_formatted <- myresults_formatted %>%
  mutate(
    currency = currency_vector,
    ad_spend = str_replace(ad_spend, "RUB", ""),
    ad_spend = parse_number(ad_spend)
  )
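# Quick look at what the currency extraction found; NA means the ad_spend
# string contained no "RUB" (e.g. it was blank or "None"):
count(myresults_formatted, currency)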
#### isolate STATES and CITIES in location ####
# by joining with a hand-rolled lookup table
location_lookup <- read_csv("location_lookup.csv",
                            col_types = cols(rank = col_skip()))
#join
joined <- left_join(myresults_formatted, location_lookup)
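# left_join() above matches on whichever column names the two data frames share.
# If the lookup table keys on target_location (an assumption; check the actual
# column names in location_lookup.csv), spelling the key out makes the join explicit:
# joined <- left_join(myresults_formatted, location_lookup, by = "target_location")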
# move currency column next to ad spend, year, month and day next to date
myresults_formatted <- joined %>%
  select(1:7,
         currency,
         ad_creation_date,
         ad_creation_year,
         ad_creation_month,
         ad_creation_day,
         ad_end_date,
         target_age,
         target_location,
         location_country,
         location_state1,
         location_state2,
         location_state3,
         location_state4,
         location_state5,
         location_city1,
         location_city2,
         location_city3,
         location_city4,
         location_city5,
         everything())
# write to file
write_csv(myresults_formatted, "myresults_formatted.csv")
# write to RDS file
saveRDS(myresults_formatted, "myresults_formatted.rds")
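# Unlike the CSV, the RDS file keeps column types (dates, numerics) intact;
# it can be reloaded later with:
# myresults_formatted <- readRDS("myresults_formatted.rds")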