-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp_ft_dytt8.js
312 lines (281 loc) · 11.1 KB
/
app_ft_dytt8.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
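/**
 * This script automatically fetches the 99 most recently published movies from
 * the Dianying Tiantang (dytt) site and, using the template file index_t.html,
 * generates a static movie-poster-wall page named index.html.
 * Install the dependencies from the script directory before running:
 * npm install node-schedule
 * npm install bufferhelper
 * npm install cheerio
 * npm install iconv-lite
 *
 * v2024-12-11
 */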
const { time } = require('console')
const { config } = require('process')
// Module-level state and configuration shared by every function in this script.

// URLs of the movie detail pages collected from the list page (filled by _parseURLListPage).
var urllist = []
// Detail-page URLs that have not been requested yet (consumed by _parseUrlList).
var urllistremain = []
// MovieInfo records that were parsed successfully (filled by _getMovieInfo).
var movieList = []
// Number of detail-page requests currently in flight.
var loadingPoolCount = 0
const loadingPoolMax = 200 // maximum number of pages loaded at the same time
// Template that is read when the final page is generated.
const htmlTemplatePath='index_t.html'
// File the generated page is written to.
const savePath='index.html'
const maxtxtlen=150 // maximum number of synopsis characters written to the page
const maxMovies =100 // maximum number of movies to read
var movielisturl='https://www.dydytt.net/index.htm' // address of the page that lists the 100 movies
var moviepageurl='https://www.dydytt.net/' // URL prefix of the movie detail pages
var movielisturl2='https://www.dydytt.net/index.htm' // address of the page that lists the 100 movies (backup address)
var moviepageurl2='https://www.dydytt.net/' // URL prefix of the movie detail pages (backup address)
// Incremented by _parseURLListPage when the list page yields no links and a retry is started.
var errorTimes=0
// Do not forget to change this! true = crawl once and stop, false = crawl on a schedule.
var debugMode=false
/**
 * Plain record for one movie scraped from a detail page.
 *
 * Every field starts as an empty string and is filled in by _getMovieInfo():
 * `index` receives the sequence number of the page in the crawl, `posturl`
 * the poster image address and `downloadurl` the magnet link; the remaining
 * fields hold the text extracted for the matching label on the page.
 */
class MovieInfo {
    constructor() {
        // Same fields, same order and same defaults for every new record.
        Object.assign(this, {
            index: '',
            title: '',
            year: '',
            country: '',
            language: '',
            type: '',
            stars: '',
            plot: '',
            posturl: '',
            downloadurl: '',
        });
    }
}
/**
 * Removes every occurrence of each given token from a string.
 *
 * Tokens are removed one after another, in array order, so a later token can
 * match text that only became adjacent after an earlier removal.
 *
 * @param {string} str - text to clean.
 * @param {Array<string>} [regArr=[]] - tokens to delete; the callers in this
 *   file pass plain strings.
 * @returns {string} `str` with all tokens removed.
 */
function strRemoveAll(str, regArr = []) {
    let cleaned = str;
    // Iterate over the tokens only. The previous `i <= regArr.length` bound ran
    // one extra pass with `undefined`, which replaceAll() coerces to the text
    // "undefined" and therefore deleted that word from the input.
    for (const token of regArr) {
        cleaned = cleaned.replaceAll(token, "");
    }
    return cleaned;
}
/**
 * Downloads a page with HTTP GET and resolves with its decoded text.
 *
 * @param {string} url - address to request.
 * @param {boolean} [ishttps=false] - use the `https` module instead of `http`.
 * @param {string} [pagecode='UTF8'] - character encoding of the page. 'UTF8' is
 *   decoded with Buffer; any other value is passed to iconv-lite.
 * @param {*} [otherinfo={}] - opaque value handed back unchanged so the caller
 *   can tell which request finished.
 * @returns {Promise<{data: string, other: *}>} resolves with the page text in
 *   `data` and `otherinfo` in `other`; rejects with `{data: error, other: otherinfo}`.
 */
function loadPage(url, ishttps = false, pagecode = 'UTF8', otherinfo = {}) {
    const http = ishttps ? require('https') : require('http');
    return new Promise(function (resolve, reject) {
        console.log('载入:' + url);
        // Single failure path so every rejection has the shape _loadErr expects.
        const fail = function (e) {
            console.log("载入页面出错" + e);
            reject({ 'data': e, 'other': otherinfo });
        };
        try {
            http.get(url, function (res) {
                // Keep the raw chunks and decode once at the end: decoding chunk by
                // chunk corrupts multi-byte characters that straddle a chunk boundary.
                const chunks = [];
                res.on('data', function (d) {
                    chunks.push(d);
                });
                res.on('error', fail);
                res.on('end', function () {
                    try {
                        const body = Buffer.concat(chunks);
                        const text = pagecode == 'UTF8'
                            ? body.toString()
                            : require('iconv-lite').decode(body, pagecode);
                        resolve({ 'data': text, 'other': otherinfo });
                    } catch (e) {
                        // e.g. iconv-lite missing or an unknown encoding name
                        fail(e);
                    }
                });
            }).on('error', fail);
        } catch (e) {
            // http.get throws synchronously for malformed URLs
            fail(e);
        }
    });
}
/*
* 整理出电影下载的URL列表
*/
function _parseURLListPage(d)
{
let h = require('cheerio').load(d.data)
movieList=[]
urllist = []
urllistremain = []
movieList = []
loadingPoolCount = 0
//console.log(h.html())
//console.log(h('.co_area2 .co_content2 ul').html())
let listA=h('.co_area2 .co_content2 ul a')
if(listA.length==0)
{
if(errorTimes==0)
{
console.log('获取电影列表数量为0,页面信息异常:'+d.data)
console.log('尝试第二次读取电影列表。')
errorTimes++
//修改为备用网址
movielisturl=movielisturl2
moviepageurl=moviepageurl2
loadPage(movielisturl,true,'gb2312').then(_parseURLListPage,_loadErr)
return
}else{
console.log('获取电影列表数量为0,页面信息异常:'+d.data)
console.log('结束爬虫程序。')
return //结束爬虫
}
}
//console.log(listA[1])
for( let i=0;i<listA.length;i++)
{
urllist.push(moviepageurl+listA[i].attribs.href)
//console.log('-------------'+i+'--------------')
//console.log(listA[i].children[0].data)
//console.log(listUrl[i])
}
console.log("共获取电影URL信息条数为:"+urllist.length)
//-仅供减少电影列表数量,发布时可去掉下边这一行------------------------------------------------
urllist=urllist.splice(0,maxMovies)
//------------------------------------------------------------------------------------------------
urllistremain=urllist.slice()
_parseUrlList()
}
/**
 * Success handler for a movie detail page: extracts the movie fields from the
 * page HTML and stores the record in movieList when a title was found.
 *
 * Whatever the outcome, one slot of the loading pool is released
 * (loadingPoolCount--) and _parseUrlList() is called to continue the crawl.
 *
 * @param {{data: string, other: *}} s - value resolved by loadPage(); `data` is
 *   the page HTML, `other` is the sequence number passed by _parseUrlList().
 */
function _getMovieInfo(s)
{
// Extract the key fields.
let ept=['-'] // fallback array: when a regex match returns null, [0] of this array ('-') is used instead
let d = require('cheerio').load(s.data)
let info = new MovieInfo()
info.index = s.other
try{
// Title: the text between the 《 》 marks of the page heading.
info.title = (d('.title_all h1 font').text().match(RegExp('《.+》')) || ept)[0]
info.title = strRemoveAll(info.title,['《','》'])
// Each following field is matched from its "◎" label up to the next '<' in the
// #Zoom HTML, then the label, the '<' and the listed whitespace are stripped.
info.year = (d('#Zoom').html().match(RegExp('◎年 代.*?<+')) || ept)[0]
info.year = strRemoveAll(info.year,['◎年 代','<',' ',' ','年'])
info.country = (d('#Zoom').html().match(RegExp('◎产 地.*?<+')) || ept)[0]
info.country = strRemoveAll(info.country,['◎产 地','<',' ',' '])
info.language = (d('#Zoom').html().match(RegExp('◎语 言.*?<+')) || ept)[0]
info.language = strRemoveAll(info.language,['◎语 言','<',' ',' '])
info.type = (d('#Zoom').html().match(RegExp('◎类 别.*?<+')) || ept)[0]// sample source text: ◎类 别 动作 / 奇幻 / 冒险
info.type = strRemoveAll(info.type,['◎类 别','<',' ',' '])
info.stars = (d('#Zoom').html().match(RegExp('◎豆瓣评分.+?/')) || ept)[0] // Douban rating first; pattern ◎豆瓣评分.+?/ (IMDb pattern: ◎IMDb评分.+?/)
if(info.stars == ept[0]){info.stars = (d('#Zoom').html().match(RegExp('◎IMDb评分.+?/')) || ept)[0]} // no Douban rating: try to find the IMDb rating
if(info.stars == ept[0]){info.stars='暂无评分'}
info.stars = strRemoveAll(info.stars, ['◎豆瓣评分', '◎IMDb评分', ' ', '/', ' ',' '])
info.plot = (d('#Zoom').html().match(RegExp('◎简 介.+?<a')) || ept)[0] // pattern: ◎简 介.+?<a
// (An earlier variant also stripped the <br> tag spellings in the next call;
// tags are now removed by the cheerio pass below.)
info.plot = strRemoveAll(info.plot, ['◎简 介', '<a', ' ', ' ', ' '])
// Remove the HTML tags from the synopsis: drop <style> elements, keep the body text.
let tempHtml = require('cheerio').load(info.plot);
tempHtml('style').remove();
info.plot = tempHtml('body').text();
// NOTE(review): presumably undefined when #Zoom contains no <img> — verify.
info.posturl = d('#Zoom img').attr('src')
info.downloadurl = (d('#Zoom').html().match(RegExp('magnet:.*?"')) || ept)[0] // pattern: magnet:.*?"
info.downloadurl = strRemoveAll(info.downloadurl,['"',' '])
}
catch(e){
// Any extraction error is logged together with the page source; the fields
// assigned before the error keep their values.
console.log("在电影下载页识别电影信息时出错:"+e)
console.log(s.data)
}
// Keep the record only when a real title was found ('-' is the fallback value).
if(info.title!=ept[0]&&info.title!='')
{
console.log('+添加数据,序号'+info.index+':'+info.title+'|'+info.year+'年|国家:'+info.country+'|语言:'+info.language+' 评分:'+info.stars)
movieList.push(info);
}
else
{
console.log('-丢弃数据,序号:'+info.index)
}
// Release this request's pool slot and let the queue continue.
loadingPoolCount--;
_parseUrlList()
}
/**
 * Failure handler for loadPage(): logs the error, releases one slot of the
 * loading pool and lets _parseUrlList() continue with the remaining pages.
 *
 * @param {{data: *, other: *}} e - rejection value of loadPage(); `data` is the
 *   underlying error and `other` the context value given to loadPage().
 */
function _loadErr(e) {
    console.log('加载页面错误' + e.data);
    // `??` keeps a sequence number of 0 and any non-numeric context intact; the
    // previous bitwise `|` turned every non-numeric value into 0.
    console.log('附带信息' + (e.other ?? ''));
    loadingPoolCount--;
    _parseUrlList();
}
/**
 * Drives the download queue of movie detail pages.
 *
 * - Queue empty and nothing in flight: every page is done, so
 *   onAllInfoGetted() is called.
 * - Queue empty but requests still in flight: nothing to do yet.
 * - Otherwise: start requests from the front of urllistremain until the
 *   queue is empty or loadingPoolMax requests are in flight.
 *
 * Each request carries the position of its URL in urllist, and its result is
 * handed to _getMovieInfo (success) or _loadErr (failure).
 */
function _parseUrlList() {
    if (urllistremain.length == 0) {
        if (loadingPoolCount == 0) {
            onAllInfoGetted();
        }
        return;
    }

    // Fill the free slots of the pool; one slot is taken per started request.
    for (; loadingPoolCount < loadingPoolMax && urllistremain.length > 0; loadingPoolCount++) {
        const sequence = urllist.length - urllistremain.length;
        loadPage(urllistremain[0], true, 'gb2312', sequence).then(_getMovieInfo, _loadErr);
        urllistremain.shift();
    }
}
/**
 * Called once every detail page has been handled: renders the collected
 * movies into the HTML template and writes the result to `savePath`.
 *
 * The template (`htmlTemplatePath`) must contain the placeholder #更新日期# and
 * a fragment delimited by the 模板片段开始 / 模板片段结束 comment markers. The
 * fragment is repeated once per movie with its #...# placeholders filled in,
 * and synopses longer than `maxtxtlen` characters are cut and suffixed with
 * '...'.
 *
 * NOTE(review): the scraped values are inserted into the page without HTML
 * escaping — confirm whether the template relies on that.
 */
function onAllInfoGetted() {
    console.log(movieList.length+"/"+urllist.length+"加载完毕!---------------------------------------------------------------------")
    // Show the movies in the order they appeared on the list page.
    movieList.sort((a, b) => a.index - b.index);

    const fs = require('fs');
    // Insert `value` literally: a function replacer keeps "$&", "$$" and similar
    // sequences in scraped text from being treated as replacement patterns.
    const fill = (text, placeholder, value) => text.replace(placeholder, () => value);

    // load the template
    fs.readFile(htmlTemplatePath, 'utf-8', (err, data) => {
        if (!data) {
            // Was `path`, an undefined name, which crashed this branch with a ReferenceError.
            console.log('读取文件' + htmlTemplatePath + '出错:' + err);
            return;
        }
        console.log('载入index.html成功');

        const date = new Date();
        const stamp = date.getFullYear() + '年' + (date.getMonth() + 1) + '月' + date.getDate() + '日 '
            + date.getHours() + ':' + date.getMinutes() + ":" + date.getSeconds();
        let html = fill(data, '#更新日期#', stamp);

        // The per-movie fragment is whatever sits between the two marker comments.
        let tmpl = (html.match(/<!---模板片段开始-->.*[\d\D]*.*<!---模板片段结束-->/gm) || [''])[0];
        tmpl = strRemoveAll(tmpl, ['<!---模板片段开始-->', '<!---模板片段结束-->', '\b']);

        let content = '';
        for (const movie of movieList) {
            const plot = movie.plot.length > maxtxtlen
                ? movie.plot.substring(0, maxtxtlen) + '...'
                : movie.plot;
            let div = tmpl;
            div = fill(div, '#电影名#', '<small><small>' + movie.index + '.</small></small>' + movie.title);
            div = fill(div, '#年份#', movie.year);
            div = fill(div, '#国家#', movie.country);
            div = fill(div, '#评分#', movie.stars);
            div = fill(div, '#类型#', movie.type);
            div = fill(div, '#简介#', plot);
            div = fill(div, '#下载地址#', movie.downloadurl);
            div = fill(div, '#海报链接#', movie.posturl);
            content += div;
        }

        // Swap the single template fragment for the rendered movie list.
        html = fill(html, tmpl, content);
        fs.writeFile(savePath, html, (writeErr) => {
            if (writeErr) {
                console.log('保存为index.html失败。');
            } else {
                console.log('保存为index.html成功。');
            }
        });
    });
}
// Entry point of the script.
// One crawl = load the movie list page (gb2312 over https) and hand the result
// to _parseURLListPage, or to _loadErr when the request fails.
const startCrawl = () => {
    loadPage(movielisturl, true, 'gb2312').then(_parseURLListPage, _loadErr);
};

if (!debugMode) {
    // Normal mode: let node-schedule trigger a crawl according to the cron
    // expression below (six fields, the first one being seconds).
    const schedule = require('node-schedule');
    schedule.scheduleJob('0 0/10 * * * *', () => {
        startCrawl();
    });
} else {
    // Debug mode: run a single crawl right away.
    startCrawl();
}