-
Notifications
You must be signed in to change notification settings - Fork 0
/
dupes.py
199 lines (178 loc) · 8.67 KB
/
dupes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
"""This script is for checking transcripts for duplicate phrases from the captions and removing them.
It is interactive and includes video url links to exact timestamps if it is ambiguous whether a phrase
is duplicated.
It takes up to three arguments:
-ep: REQUIRED. The CxNN code of the episode
-yt: REQUIRED. The URL or YouTube ID for the episode
-page: Optional. Can be inferred from ep so not needed
and one potential flag:
-ignore_existing Optional. Whether to ignore an existing wiki transcript and build from scratch.
Example from top-level pywikibot folder:
>>> python pwb.py dupes -ep:3x37 -yt:bWHYmDFR84I -ignore_existing
This script also runs as part of the regular vod.py "-transcript" option. The process can be skipped
or exited in the middle, with an option to save changes already in progress.
"""
import pywikibot
from pywikibot.bot import (
AutomaticTWSummaryBot,
ConfigParserBot,
ExistingPageBot,
SingleSiteBot,
QuitKeyboardInterrupt,
)
from pywikibot import pagegenerators
from cr_modules.cr import YT, YT_ID_REGEX, get_validated_input
from cr_modules.ep import Ep, EP_REGEX
from cr_modules.transcript import YoutubeTranscript, DEFAULT_LANGUAGE
class DuplicateProcessor:
    '''Interactive tool for deleting duplicate phrases from captions.

    Each suspected duplicate is shown to the user in the context of its
    transcript line, and the user decides whether to delete the phrase or keep
    it (in which case only the surrounding marker is removed).
    '''

    # Markers wrapped around a suspected duplicate phrase in the transcript text.
    _MARKER_OPEN = '<!-- DUPLICATE '
    _MARKER_CLOSE = '-->'
    # Deleting a phrase leaves the spaces on both sides of it behind.
    _DOUBLE_SPACE = ' ' * 2
    # Seconds of lead-in before the flagged caption when linking to the video.
    _VIDEO_LEAD_IN = 4

    @classmethod
    def _strip_markers(cls, text):
        '''Return text without duplicate markers, keeping the phrases themselves.'''
        return text.replace(cls._MARKER_OPEN, '').replace(cls._MARKER_CLOSE, '')

    @classmethod
    def _apply_changes(cls, transcript, line_pairs):
        '''Apply every (marked phrase, replacement) pair to the transcript text.'''
        for marked, replacement in line_pairs:
            transcript = (transcript
                          .replace(marked, replacement)
                          .replace(cls._DOUBLE_SPACE, ' '))
        return transcript

    @classmethod
    def _video_url(cls, base_url, starttime):
        '''Build a link to the video a few seconds before starttime.'''
        # Never ask for a negative offset when the caption is at the very start.
        seconds = max(starttime - cls._VIDEO_LEAD_IN, 0)
        # NOTE(review): the shape of t.yt.url is not visible here; use '&' if
        # the URL already carries a query string so the link stays well-formed.
        separator = '&t=' if '?' in base_url else '?t='
        return f'{base_url}{separator}{seconds}'

    def _review(self, t, transcript, line, starttime):
        '''Ask the user about one marked phrase and return its replacement text.

        Returns an empty string when the phrase should be deleted, otherwise
        the phrase with its duplicate markers removed.
        '''
        # Fall back to the phrase itself if it does not sit within a single
        # physical line of the transcript.
        context = next((x for x in transcript.splitlines() if line in x), line)
        highlighted = (line
                       .replace(self._MARKER_OPEN, '<<yellow>>')
                       .replace(self._MARKER_CLOSE, '<<default>>'))
        # hide other duplicate markers in same text
        display_text = self._strip_markers(context.replace(line, highlighted))
        pywikibot.output(f"\n\n{display_text}\n")

        choice = pywikibot.input_choice('Delete this duplicate?',
                                        [('Yes', 'Y'),
                                         ('No', 'N'),
                                         ('Check YouTube video', 'C')]).lower()
        if choice == 'y':
            delete = True
        elif choice == 'n':
            delete = False
        else:
            url = self._video_url(t.yt.url, starttime)
            pywikibot.output(f'\n\n<<yellow>>{url}<<default>>\n(ctrl or cmd+click to launch)')
            delete = pywikibot.input_yn('Delete this duplicate?')
        return '' if delete else self._strip_markers(line)

    def process_duplicates(self, t, language=DEFAULT_LANGUAGE):
        '''Interactively resolve the duplicate phrases flagged in a transcript.

        Args:
            t: transcript object; this method reads ``t.transcript_dict``,
                ``t.dupe_lines`` and ``t.yt.url``.
            language: key selecting which transcript and duplicate list to use.

        Returns:
            The same object ``t`` with ``t.transcript_dict[language]`` updated.
            If the user quits part-way, the decisions made so far are applied
            only when the user confirms; otherwise the text is left unchanged.
        '''
        line_pairs = []
        transcript = t.transcript_dict[language]
        try:
            for line, starttime in t.dupe_lines[language]:
                if line not in transcript:
                    continue
                new_line = self._review(t, transcript, line, starttime)
                line_pairs.append((line, new_line))
        except QuitKeyboardInterrupt:
            if line_pairs and pywikibot.input_yn('Save changes so far?'):
                transcript = self._apply_changes(transcript, line_pairs)
                pywikibot.output('\nUser did not complete duplicate detection.\nChanges saved.\n')
            else:
                pywikibot.output('\nUser canceled duplicate detection.')
        else:
            transcript = self._apply_changes(transcript, line_pairs)
        t.transcript_dict[language] = transcript
        return t
class DupeDetectionBot(SingleSiteBot, ExistingPageBot):
    '''Interactively remove duplicate caption phrases from a transcript page.

    The transcript text is taken from the wiki page (unless ``ignore_existing``
    is set), run through DuplicateProcessor, and saved back to the page.
    '''
    update_options = {
        'ep': None,  # Ep object
        'yt': None,  # YouTube ID/URL, if known
        'ts': None,  # YoutubeTranscript object
        'transcript_link': None,  # link to transcript wiki page
        'ignore_existing': False,  # whether to ignore existing wiki ts (defaults to using it)
    }

    def get_transcript_info(self):
        '''Copy the episode and YouTube info from a supplied transcript object.'''
        if self.opt.ts:
            self.opt.ep = self.opt.ts.ep
            self.opt.yt = self.opt.ts.yt

    def get_wiki_transcript(self, language=DEFAULT_LANGUAGE):
        '''Point current_page at the transcript page and load its text.

        When there is no current page, or it is the ``Transcript:<code>`` page
        for the episode, the target of the last entry in
        ``ep.transcript_redirects`` is used instead. For a non-default
        language the ``/<language>`` subpage is selected. Unless
        ``ignore_existing`` is set, the page text replaces the transcript text
        for that language on the transcript object (if there is one).
        '''
        ep = self.opt.ep
        page = self.current_page
        if not page or page.title() == f'Transcript:{ep.code}':
            redirect = pywikibot.Page(self.site, ep.transcript_redirects[-1])
            self.current_page = redirect.getRedirectTarget()
        if language != DEFAULT_LANGUAGE:
            title = f"{self.current_page.title()}/{language}"
            self.current_page = pywikibot.Page(self.site, title)
        if not self.opt.ignore_existing and self.opt.ts:
            self.opt.ts.transcript_dict[language] = self.current_page.text

    def get_transcript(self, language=DEFAULT_LANGUAGE):
        '''Return the transcript object, building one if none was supplied.'''
        if not self.opt.ts:
            # Use the page for the requested language, not the default one.
            self.get_wiki_transcript(language=language)
            # NOTE(review): built without ep/yt and assigned from the call's
            # return value, unlike treat_page -- confirm this is intended.
            self.opt.ts = YoutubeTranscript().download_and_build_transcript(language=language)
        if not self.opt.ignore_existing:
            self.opt.ts.transcript_dict[language] = self.current_page.text
        return self.opt.ts

    def process_duplicates(self, language=DEFAULT_LANGUAGE):
        '''Run the interactive duplicate check and save the result to the page.'''
        new_ts = DuplicateProcessor().process_duplicates(self.opt.ts, language=language)
        # remove maintenance category if all duplicates removed
        transcript = new_ts.transcript_dict[language]
        if '<!-- DUPLICATE' not in transcript:
            new_ts.transcript_dict[language] = (
                transcript
                .replace('[[Category:Transcripts with duplicate lines]]', '')
                .replace(f'[[Category:Transcripts with duplicate lines/{language}]]', '')
            )
        self.put_current(new_ts.transcript_dict[language],
                         summary='Fixing duplicate captions (via pywikibot)')

    def treat_page(self, language=DEFAULT_LANGUAGE) -> None:
        '''Load the transcript for the current page and process its duplicates.

        The requested language is passed on to every step, so a non-default
        language is handled consistently.
        '''
        if not self.opt.ts:
            self.opt.ts = YoutubeTranscript(ep=self.opt.ep, yt=self.opt.yt)
            self.opt.ts.download_and_build_transcript(language=language)
        else:
            self.get_transcript_info()
        self.get_wiki_transcript(language=language)
        self.get_transcript(language=language)
        # replace with wiki text if transcript has already been saved
        if self.current_page.text and not self.opt.ignore_existing:
            # NOTE(review): every other step uses transcript_dict[language];
            # confirm that YoutubeTranscript exposes a `transcript` attribute.
            self.opt.ts.transcript = self.current_page.text
        self.process_duplicates(language=language)
def main(*args: str) -> None:
    """Parse command-line arguments and run the duplicate detection bot.

    Recognised arguments are ``-ep:``, ``-yt:`` and ``-page:``; any other
    ``-name`` or ``-name:value`` argument is passed to the bot as an option.
    When no ``-page:Transcript:...`` argument is given, the transcript page is
    derived from the episode code. The user is prompted for a missing episode
    code or YouTube ID unless a ``transcript`` option is supplied.
    """
    local_args = pywikibot.handle_args(args)
    ep_arg = next((x for x in local_args if x.startswith('-ep:')), None)
    page_arg = next((x for x in local_args if x.startswith('-page:')), None)
    # page_arg still carries its '-page:' prefix, so test for the full prefix.
    has_transcript_page = bool(page_arg) and page_arg.startswith('-page:Transcript:')
    if ep_arg and not has_transcript_page:
        local_args.append(ep_arg.replace('-ep:', '-page:Transcript:', 1))
        has_transcript_page = True

    gen_factory = pagegenerators.GeneratorFactory()
    # Process pagegenerators arguments
    local_args = gen_factory.handle_args(local_args)

    options = {}
    for option in local_args:
        arg, _, value = option.partition(':')
        arg = arg[1:]  # drop the leading '-'
        if arg == 'ep':
            options['ep'] = Ep(value)
        elif arg == 'yt':
            options['yt'] = YT(value)
        elif not value:
            # A bare flag such as -ignore_existing
            options[arg] = True
        else:
            options[arg] = value

    if not options.get('ep') and not options.get('transcript'):
        value = get_validated_input(arg='ep', regex=EP_REGEX)
        options['ep'] = Ep(value)
        if not has_transcript_page:
            # The episode was not on the command line, so its transcript page
            # has not been queued yet.
            gen_factory.handle_args([f'-page:Transcript:{value}'])
    if not options.get('yt') and not options.get('transcript'):
        value = get_validated_input(arg='yt', regex=YT_ID_REGEX)
        options['yt'] = YT(value)

    gen = gen_factory.getCombinedGenerator(preload=True)
    dbot = DupeDetectionBot(generator=gen, **options)
    dbot.run()
if __name__ == '__main__':
    # Run the bot only when this file is executed as a script.
    try:
        main()
    except QuitKeyboardInterrupt:
        # Report a user-initiated quit with a short message.
        pywikibot.info('\nUser quit duplicate detection bot run.')