-
Notifications
You must be signed in to change notification settings - Fork 0
/
data.py
152 lines (125 loc) · 4.85 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from wordcloud import WordCloud
import requests
from bs4 import BeautifulSoup
import logging
import os
# Use the SimHei font so Chinese characters render in matplotlib charts.
mpl.rcParams['font.family'] = 'SimHei'

# Candidate inputs: every "*.csv" file in ./data whose name contains "_verbose".
# Sorted because os.listdir() returns entries in arbitrary order, which would
# otherwise make the menu numbering differ between runs.
csv_files = sorted(
    f for f in os.listdir('./data') if f.endswith('.csv') and '_verbose' in f
)
if not csv_files:
    # Nothing to choose from: stop before prompting for a number.
    raise SystemExit("No '_verbose' CSV files found in ./data")

print("Choose a CSV file to proceed:")
for i, file in enumerate(csv_files):
    print(f"{i+1}. {file}")

# Re-prompt until the answer is a valid menu number. Without the range check,
# "0" would index csv_files[-1] and silently select the last file, and
# non-numeric or too-large answers would crash with a traceback.
while True:
    try:
        choice = int(input("Enter the number of the file you want to use: "))
    except ValueError:
        choice = 0
    if 1 <= choice <= len(csv_files):
        break
    print(f"Please enter a number between 1 and {len(csv_files)}.")
selected_file = csv_files[choice - 1]

df_raw = pd.read_csv(f'./data/{selected_file}')

# NOTE(review): the slice removes the trailing ".csv" and the f-string adds it
# back, so output_path is the same path as the input file and saveCSV() will
# overwrite it. Confirm whether the "_verbose" suffix was meant to be stripped.
output_path = f'./data/{selected_file[:-4]}.csv'
def findAliasName(name):
    """Look up the alias names of an actor on javdb.com.

    Sends an actor search for ``name`` and reads every actor card in the
    result page. Cards missing a link, a ``<strong>`` primary name or a
    ``title`` attribute are skipped.

    Args:
        name: Actor name to search for.

    Returns:
        A dict mapping each primary name found to the list of names taken
        from the card's comma-separated ``title`` attribute (whitespace
        stripped, empty entries dropped).
        If the request fails, the expected page structure is missing, or no
        actor card is found, returns ``{name: name}``. Note that the value
        in this fallback is the bare string, not a list.
    """
    fallback = {name: name}

    try:
        # `params` lets requests escape the query, so names containing
        # characters such as "&" or "#" cannot corrupt the URL. The timeout
        # keeps an unresponsive server from hanging the script forever.
        response = requests.get(
            "https://javdb.com/search",
            params={"q": name, "f": "actor"},
            timeout=10,
        )
    except requests.RequestException:
        logging.warning("Couldn't find alias name")
        return fallback

    soup = BeautifulSoup(response.content, 'html.parser')

    # Walk down the expected page structure, bailing out with the fallback as
    # soon as a level is missing instead of relying on AttributeError.
    body = soup.find('body', {'data-lang': 'zh'})
    section = body.find('section', {'class': 'section'}) if body is not None else None
    if section is None:
        logging.warning("Couldn't find alias name")
        return fallback

    container = section.find('div', {'class': 'container'})
    actors = container.find('div', {'class': 'actors'}) if container is not None else None
    if actors is None:
        return fallback

    name_dict = {}
    for item in actors.find_all('div', {'class': 'box actor-box'}):
        actor = item.find('a')
        strong = actor.find('strong') if actor is not None else None
        title = actor.get('title') if actor is not None else None
        if strong is None or title is None:
            continue  # malformed card: skip it, keep the others
        primary_name = strong.text.strip()
        # Strip each piece so a ", " separator does not leave a leading
        # space that would stop the alias from matching later.
        aliases = [alias.strip() for alias in title.split(',')]
        name_dict[primary_name] = [alias for alias in aliases if alias]

    return name_dict or fallback
def filterJoyu(df, keyword, *, regex=True):
    """Return the rows of ``df`` whose 'j_actors' value contains ``keyword``.

    Rows with a missing (or non-string) 'j_actors' value never match. The
    input frame is not modified.

    Args:
        df: DataFrame with a 'j_actors' column.
        keyword: Text to look for. Interpreted as a regular expression unless
            ``regex`` is False.
        regex: Pass False to match ``keyword`` literally, e.g. for names that
            contain characters such as "(" or "+". Defaults to True, which
            is the historical behaviour.

    Returns:
        A new DataFrame holding the matching rows, re-indexed from 0.
    """
    # na=False turns missing cells into "no match", which replaces the
    # separate dropna() pass and avoids masking with a NaN-containing Series.
    matches = df['j_actors'].str.contains(keyword, regex=regex, na=False)
    return df[matches].reset_index(drop=True)
def filterJoyuEnhanced(df, keyword):
    """Return the rows of ``df`` whose 'j_actors' value mentions an alias of ``keyword``.

    The aliases come from ``findAliasName(keyword)``. Each alias is matched
    as literal text. A row is returned once even if it matches several
    aliases, and rows keep their original relative order.

    Args:
        df: DataFrame with a 'j_actors' column.
        keyword: Actor name whose aliases should be looked up.

    Returns:
        A new DataFrame holding the matching rows, re-indexed from 0.
    """
    name_dict = findAliasName(keyword)

    # Flatten the alias lists into one de-duplicated list. findAliasName's
    # fallback maps the name to a bare string, which must be treated as one
    # alias rather than iterated character by character.
    name_list = []
    for value in name_dict.values():
        aliases = [value] if isinstance(value, str) else value
        for alias in aliases:
            alias = alias.strip()
            if alias and alias not in name_list:
                name_list.append(alias)

    # OR the per-alias matches into a single mask so each row is selected at
    # most once and the column dtypes of ``df`` are preserved.
    actors = df['j_actors']
    mask = pd.Series(False, index=df.index)
    for alias in name_list:
        mask |= actors.str.contains(alias, regex=False, na=False)

    return df[mask].reset_index(drop=True)
def filterTag(df, keyword, *, regex=True):
    """Return the rows of ``df`` whose 'category' value contains ``keyword``.

    Rows with a missing (or non-string) 'category' value never match. The
    input frame is not modified.

    Args:
        df: DataFrame with a 'category' column.
        keyword: Text to look for. Interpreted as a regular expression unless
            ``regex`` is False.
        regex: Pass False to match ``keyword`` literally. Defaults to True,
            which is the historical behaviour.

    Returns:
        A new DataFrame holding the matching rows, re-indexed from 0.
    """
    # na=False turns missing cells into "no match", which replaces the
    # separate dropna() pass and avoids masking with a NaN-containing Series.
    matches = df['category'].str.contains(keyword, regex=regex, na=False)
    return df[matches].reset_index(drop=True)
# Deduplicate on 'bango' and blank out the 'N/A' / 'n/a' placeholders.
def removeDuplicate(df_raw):
    """Return a cleaned copy of ``df_raw``; the input frame is left untouched.

    Keeps the first row for each 'bango' value, replaces the strings 'N/A'
    and 'n/a' with None, records each surviving row's previous index label
    in a new leading 'original_idx' column, and renumbers the index from 0.
    """
    deduped = df_raw.drop_duplicates(subset='bango')
    cleaned = deduped.replace(['N/A', 'n/a'], None)
    # Capture the index labels before they are discarded by reset_index.
    cleaned.insert(0, 'original_idx', cleaned.index)
    return cleaned.reset_index(drop=True)
def saveCSV(df_raw):
    """Write ``df_raw`` to the module-level ``output_path`` as CSV.

    The 'original_idx' helper column, if present, is left out of the file,
    and the index is not written. The input frame is not modified.
    """
    # errors='ignore' makes the drop a no-op when the column is absent.
    exportable = df_raw.drop(columns='original_idx', errors='ignore')
    exportable.to_csv(output_path, index=False)
def plotTag(df, stop_words=None):
    """Show a word cloud of the tags found in the 'category' column of ``df``.

    Each 'category' cell is split on whitespace into tags and the tags are
    counted across all rows. Cells that are missing or not strings are
    ignored. If no tag is left after removing the stop words, a warning is
    logged and nothing is plotted.

    Args:
        df: DataFrame with a 'category' column.
        stop_words: Optional iterable of tags to leave out of the cloud.
            Defaults to the built-in list below.

    Returns:
        None.
    """
    # Local import: collections is not among the file's top-level imports.
    from collections import Counter

    if stop_words is None:
        stop_words = [
            '單體作品',
            '4小時以上作品',
            '介紹影片',
            '數位馬賽克',
            '精選綜合',
        ]
    excluded = set(stop_words)

    tag_counts = Counter()
    for cell in df['category']:
        # Missing categories come through as None/NaN; skip them instead of
        # trying to iterate over a float.
        if isinstance(cell, str):
            tag_counts.update(cell.split())

    tag_counts_clean = {k: v for k, v in tag_counts.items() if k not in excluded}
    if not tag_counts_clean:
        # WordCloud cannot render an empty frequency table.
        logging.warning("No tags to plot")
        return

    # NOTE(review): font_path is given a font name, not a file path;
    # confirm it resolves on the target machine.
    font = 'SimHei'
    wordcloud = WordCloud(
        width=800,
        height=800,
        background_color='white',
        font_path=font,
        min_font_size=10,
    ).generate_from_frequencies(tag_counts_clean)

    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
def printDfWithDots(df):
    """Print the last ten rows of ``df`` (or all rows if there are fewer)."""
    print(df.tail(10))
def process():
    """Clean the selected CSV, print a summary, plot its tags and save it.

    Deduplicates the module-level ``df_raw`` with ``removeDuplicate``, prints
    the frame's info summary and its last ten rows, shows the tag word cloud
    with ``plotTag`` and writes the cleaned frame with ``saveCSV``.
    """
    df = removeDuplicate(df_raw)
    # DataFrame.info() prints its summary itself and returns None, so it is
    # called directly; wrapping it in print() added a stray "None" line.
    df.info()
    print(df.tail(10))
    plotTag(df)
    saveCSV(df)


process()