-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfbMsgAnalytics.py
executable file
·52 lines (39 loc) · 1.71 KB
/
fbMsgAnalytics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import sys
import os, glob
import json
import pandas as pd
def main():
    """Command-line entry point.

    Uses the first command-line argument as the path to a directory of
    Facebook message_*.json dump files, e.g.
    facebook-[username].zip/messages/inbox/[convo_name]/
    and passes it to fbMsgJsonDirToStatsCsv. Any further arguments are
    ignored.

    Raises:
        SystemExit: with a usage message (non-zero exit status) when no
            directory argument was supplied.
    """
    # Fail with a readable usage line instead of a bare IndexError.
    if len(sys.argv) < 2:
        sys.exit("Usage: {} <path-to-conversation-dir>".format(sys.argv[0]))

    inputDir = sys.argv[1]
    fbMsgJsonDirToStatsCsv(inputDir)
# Generates participation.csv for msg count per user and total msg count
# and wordCount.csv for number of instances per word
def fbMsgJsonDirToStatsCsv(inputDir, outputDir="."):
    """Write message statistics for one conversation as two CSV files.

    Reads the 'messages' list of every *.json file directly inside
    inputDir, keeps the entries whose 'type' is "Generic", and writes:

    * participation.csv - message count per 'sender_name', plus a final
      'Total' row holding the sum of those counts.
    * wordCount.csv - number of occurrences of each word across all
      message 'content' values, after stripping punctuation and lowercasing.

    Args:
        inputDir: Directory containing the JSON dump files.
        outputDir: Directory the two CSV files are written to; created if
            missing. Defaults to the current working directory.

    Raises:
        ValueError: if no *.json file in inputDir contains any messages.
        KeyError: if the loaded messages lack a 'type', 'sender_name' or
            'content' field altogether.
    """
    frames = []

    # Need to loop through all files since dumps are restricted to 10K msgs per JSON.
    # Sorted so that the result does not depend on filesystem listing order.
    for filename in sorted(glob.glob(os.path.join(inputDir, '*.json'))):
        with open(filename, 'r', encoding='utf-8-sig') as f:
            messages = json.load(f).get('messages')
        # Skip JSON files that carry no messages (missing key or empty list).
        if messages:
            frames.append(pd.DataFrame(messages))

    if not frames:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            "No messages found in any *.json file under {!r}".format(inputDir)
        )

    # Create one df out of multiple dfs
    df = pd.concat(frames, ignore_index=True)

    # Want text messages only
    generic = df[df['type'] == "Generic"]

    os.makedirs(outputDir, exist_ok=True)

    # participation.csv: per-sender message counts plus a grand total row.
    participation = generic['sender_name'].value_counts()
    participation['Total'] = participation.sum()
    participation.to_csv(
        os.path.join(outputDir, 'participation.csv'),
        header=["count"],
        encoding='utf-8-sig',
    )

    # Build the word list on a derived Series instead of mutating a column
    # of a filtered frame in place (a chained in-place update that is not
    # guaranteed to reach the frame).
    words = (
        generic['content']
        .dropna()  # messages without any text content
        # Strip punctuation but keep whitespace (including newlines) so
        # that words on adjacent lines are not fused together.
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.lower()
        # Whitespace split: handles runs of spaces, tabs and newlines and
        # never yields empty-string tokens.
        .str.split()
        .explode()
    )

    # Messages left with no words explode to NaN, which value_counts drops.
    words.value_counts().to_csv(
        os.path.join(outputDir, 'wordCount.csv'),
        header=["count"],
        encoding='utf-8-sig',
    )
# Run the analysis only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()