# Prepare wide and long forms of data for video and language analysis
# author = Sabyasachee
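# Outputs: data/long_video.csv, data/wide_lang.csv, data/long_lang_religion.csv, data/long_lang_gender.csv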
import re
import collections
import pandas as pd
import numpy as np
def full_form_language(lang):
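    """Map a two-letter language code to the language's full name in uppercase."""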
    return dict(bn="BENGALI", hi="HINDI", kn="KANNADA", ta="TAMIL", te="TELUGU")[lang]
def parse_duration(x):
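    """Convert an "H:MM:SS" duration string to total seconds."""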
    h, m, s = x.split(":")
    h, m, s = int(h), int(m), int(s)
    duration = 3600*h + 60*m + s
    return duration
def drop_duplicates(df, key):
"drop all rows from df if they share same value on key column, but retain rows with NaN on key column"
notna_df = df[df[key].notna()].drop_duplicates(key, keep=False)
na_df = df[df[key].isna()]
return pd.concat((na_df, notna_df))
def prepare_data():
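    """Build wide- and long-form dataframes from the video and language analysis CSVs and write them under data/."""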
    # set filenames
    video_analysis_file = "data/[MUSE India] [RP Outputs] - Muse_India_Study_yt_local.csv.csv"
    language_analysis_files = [
        "data/[MUSE India] [Final] Language Analysis Results - bn.csv",
        "data/[MUSE India] [Final] Language Analysis Results - hi.csv",
        "data/[MUSE India] [Final] Language Analysis Results - kn.csv",
        "data/[MUSE India] [Final] Language Analysis Results - ta.csv",
        "data/[MUSE India] [Final] Language Analysis Results - te.csv"
    ]
    # read the video analysis and language analysis dataframes
    video_analysis_df = pd.read_csv(video_analysis_file, index_col=None)
    language_analysis_dfs = [pd.read_csv(language_analysis_file, index_col=None)
                             for language_analysis_file in language_analysis_files]
    language_analysis_df = pd.concat(language_analysis_dfs)
    # lower-case the program name
    video_analysis_df["Program name"] = video_analysis_df["Program name"].str.lower()
    # change language abbreviations to full form
    language_analysis_df["Language"] = language_analysis_df["Language"].apply(full_form_language)
    # create key columns: strip the .mp4/.mov extension from the video ID and rename the key columns
    language_analysis_df["Video ID"] = language_analysis_df["Video ID"].str.replace(r"\.m((p4)|(ov))$", "", regex=True)
    video_analysis_df.rename(columns={"video_key": "youtube_key", "Cat No.": "file_key"}, inplace=True)
    language_analysis_df.rename(columns={"Video ID": "key"}, inplace=True)
    # remove samples whose keys are shared by multiple rows (ambiguous keys)
    video_analysis_df = drop_duplicates(video_analysis_df, "file_key")
    video_analysis_df = drop_duplicates(video_analysis_df, "youtube_key")
    language_analysis_df.drop_duplicates("key", keep=False, inplace=True)
    # parse the duration columns and compute Ats(viewer) as a percentage of program duration
    video_analysis_df["Ats(viewer)(sec)"] = video_analysis_df["Ats(viewer)"].apply(parse_duration)
    video_analysis_df["Program duration(sec)"] = video_analysis_df["Program duration"].apply(parse_duration)
    video_analysis_df["Ats(viewer)%"] = (
        100 * video_analysis_df["Ats(viewer)(sec)"] / video_analysis_df["Program duration(sec)"])
    # create metadata df
    metadata_df = video_analysis_df[["file_key", "youtube_key", "Year", "Program name", "Programme Language",
                                     "Program Theme", "Channel", "Ats(viewer)%", "rat%/AP", "Daily Avg Rch%"]].copy()
    metadata_df.columns = ["file_key", "youtube_key", "year", "program", "lang", "genre", "channel", "ats",
                           "rating", "reach"]
    # create faces data
    # n_faces_arr[i, j, k, l] = number of faces of gender j, age k, and skintone l in video i
    gender_cats = ["male", "female"]
    age_cats = ["[0, 18)", "[18, 33)", "[33, 60)", "[60, inf)"]
    skintone_cats = ["[-inf, 1.1)", "[1.1, 2.1)", "[2.1, 3.1)", "[3.1, 4.1)", "[4.1, 5.1)", "[5.1, 6.1)",
                     "[6.1, 7.1)", "[7.1, 8.1)", "[8.1, 9.1)", "[9.1, 10.1)"]
    n_faces_arr = np.zeros((len(video_analysis_df), len(gender_cats), len(age_cats), len(skintone_cats)), dtype=int)
    for index, (_, row) in enumerate(video_analysis_df.iterrows()):
        for i, gender_cat in enumerate(gender_cats):
            for j, age_cat in enumerate(age_cats):
                for k, skintone_cat in enumerate(skintone_cats):
                    cat = f"({gender_cat}, {age_cat}, {skintone_cat})"
                    if cat in row and pd.notna(row[cat]):
                        n_faces_arr[index, i, j, k] = row[cat]
    # create the video analysis dataframe in wide and long forms
    # each key has 2 genders x 4 ages x 3 skintones = 24 rows in the long form
    long_video_rows = []
    wide_video_rows = []
    named_age_cats = ["young", "adult", "middle_aged", "old"]
    named_skintone_cats = ["light", "medium", "dark"]
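    # collapse the 10 skintone bins into 3 groups: bins 1-3 -> light, 4-6 -> medium, 7-10 -> dark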
    for index, (_, row) in enumerate(metadata_df.iterrows()):
        total_faces = n_faces_arr[index].sum()
        for i, gender_cat in enumerate(gender_cats):
            for j, age_cat in enumerate(named_age_cats):
                for skintone_cat in named_skintone_cats:
                    if skintone_cat == "light":
                        k1, k2 = 0, 3
                    elif skintone_cat == "medium":
                        k1, k2 = 3, 6
                    else:
                        k1, k2 = 6, 10
                    faces = n_faces_arr[index, i, j, k1:k2].sum()
                    long_video_rows.append(row.tolist() + [gender_cat, age_cat, skintone_cat, faces])
        # marginal face counts by gender, age, and skintone for the wide form
        gender_faces = n_faces_arr[index].sum(axis=(1, 2)).tolist()
        age_faces = n_faces_arr[index].sum(axis=(0, 2)).tolist()
        skintone_faces = n_faces_arr[index].sum(axis=(0, 1))
        skintone_faces = [skintone_faces[:3].sum(), skintone_faces[3:6].sum(), skintone_faces[6:].sum()]
        wide_video_rows.append(row.tolist() + gender_faces + age_faces + skintone_faces + [total_faces])
    # save the long-form video dataframe to file
    long_video_df = pd.DataFrame(long_video_rows, columns=metadata_df.columns.tolist()
                                 + ["gender", "age", "skintone", "faces"])
    long_video_df.to_csv("data/long_video.csv", index=False)
    # create the wide-form video dataframe (not saved directly; used below for the merge)
    wide_video_df = pd.DataFrame(wide_video_rows, columns=metadata_df.columns.tolist()
                                 + ["male_faces", "female_faces", "young_faces", "adult_faces", "middle_aged_faces",
                                    "old_faces", "light_faces", "medium_faces", "dark_faces", "faces"])
    print(f"{len(wide_video_df)} video samples")
    # create the language analysis dataframe in wide form
    # NOTE: the column order must match the order of values appended to rows below
    rows = []
    header = ["key", "lang", "derogatory", "controversial", "person_name",
              "hindu_name", "muslim_name", "christian_name",
              "male_name", "female_name",
              "transcript", "non_stopword_transcript"]
    for _, row in language_analysis_df.iterrows():
        key = row["key"]
        lang = row["Language"]
        # derogatory word count: sum the counts from the (word, count) tuples
        derogatory_count_str = row["Derogatory words (dictionary) word count"]
        if derogatory_count_str != "None":
            derogatory_tuples_str = re.findall(r"\([^\)]+\)", derogatory_count_str[1:-1])
            derogatory_count = sum(int(tup[1:-1].split(", ")[1]) for tup in derogatory_tuples_str)
        else:
            derogatory_count = 0
        # controversial topic count
        controversial_count_str = row["Controversial topics (LLM) word count"]
        if controversial_count_str != "None":
            controversial_tuples_str = re.findall(r"\([^\)]+\)", controversial_count_str[1:-1])
            controversial_count = sum(int(tup[1:-1].split(", ")[1]) for tup in controversial_tuples_str)
        else:
            controversial_count = 0
        # person name counts by religion and gender
        religions = []
        genders = []
        person_name_str = row["Person names word count"]
        person_names_list = re.findall(r"\([^\)]+\)", person_name_str)
        for person_name_tuple_str in person_names_list:
            person_name_tuple = person_name_tuple_str[1:-1].split(", ")
            religions.append(person_name_tuple[1].strip("'"))
            genders.append(person_name_tuple[2].strip("'"))
        religion_dict = collections.Counter(religions)
        gender_dict = collections.Counter(genders)
        hindu_count = religion_dict.get("hindu", 0)
        muslim_count = religion_dict.get("muslim", 0)
        christian_count = religion_dict.get("christian", 0)
        male_count = gender_dict.get("male", 0)
        female_count = gender_dict.get("female", 0)
        person_name_count = len(person_names_list)
        # transcript word counts
        transcript_count = row["Transcript word count"]
        nonstopword_transcript_count = row["Transcript non-stopword count"]
        # append the wide-form row (order must match header)
        rows.append([key, lang, derogatory_count, controversial_count, person_name_count,
                     hindu_count, muslim_count, christian_count, male_count, female_count,
                     transcript_count, nonstopword_transcript_count])
    wide_lang_df = pd.DataFrame(rows, columns=header)
    # keep only samples whose transcript contains at least 100 words
    wide_lang_df = wide_lang_df[wide_lang_df["transcript"] >= 100]
    # join the wide forms of the video and language analysis dataframes:
    # a language-analysis key may match either the youtube_key or the file_key of a video sample
    wide_video_lang_df1 = wide_video_df.merge(wide_lang_df, how="inner", left_on=["youtube_key", "lang"],
                                              right_on=["key", "lang"], suffixes=("_vd", "_ln"))
    wide_video_lang_df2 = wide_video_df.merge(wide_lang_df, how="inner", left_on=["file_key", "lang"],
                                              right_on=["key", "lang"], suffixes=("_vd", "_ln"))
    wide_video_lang_df = pd.concat((wide_video_lang_df1, wide_video_lang_df2))
    wide_video_lang_df.to_csv("data/wide_lang.csv", index=False)
    print(f"{len(wide_video_lang_df)} language samples")
    # create the long-form language dataframe for religious person names
    rows = []
    for _, row in wide_video_lang_df.iterrows():
        for religion in ["hindu", "muslim", "christian"]:
            person_name_count = row[f"{religion}_name"]
            rows.append(row.tolist() + [religion, person_name_count])
    long_lang_religion_df = pd.DataFrame(rows, columns=wide_video_lang_df.columns.tolist() + ["religion", "name_count"])
    long_lang_religion_df.drop(columns=["hindu_name", "muslim_name", "christian_name"], inplace=True)
    long_lang_religion_df.to_csv("data/long_lang_religion.csv", index=False)
    # create the long-form language dataframe for gendered person names
    rows = []
    for _, row in wide_video_lang_df.iterrows():
        for gender in ["male", "female"]:
            person_name_count = row[f"{gender}_name"]
            rows.append(row.tolist() + [gender, person_name_count])
    long_lang_gender_df = pd.DataFrame(rows, columns=wide_video_lang_df.columns.tolist() + ["gender", "name_count"])
    long_lang_gender_df.drop(columns=["male_name", "female_name"], inplace=True)
    long_lang_gender_df.to_csv("data/long_lang_gender.csv", index=False)
if __name__ == "__main__":
    prepare_data()