-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.go
182 lines (162 loc) · 4.17 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
package main
import (
// "bufio"
"bytes"
"encoding/base64"
"encoding/json"
"fmt"
"os"
"path/filepath"
"strings"
"time"
"wechatarticles/chrome"
"wechatarticles/http"
"wechatarticles/log"
"wechatarticles/mail"
"wechatarticles/props"
)
// crawl runs one full crawl pass and writes its output files under
// props.Ppt.WorkDir.
//
// Steps, in order:
//  1. Create the summary JSON file, the optional txt file (only when
//     props.Ppt.TxtFN is non-empty) and the temporary line-per-article JSON
//     file.
//  2. Probe the cached cookie/token with a fakeid lookup; if the result does
//     not have the expected length the credentials are treated as expired and
//     re-acquired through chrome.GetAuth (up to three 30-minute waits). On
//     failure a mail is sent and the process exits with status 1.
//  3. Look up and cache a fakeid for every configured name that has none.
//  4. For every configured source and name, fetch the article list for the
//     configured date range, visit each article, optionally filter it by
//     keyword, and record it.
//  5. Write all collected articles as indented JSON to the summary file.
//
// It panics if any of the output files cannot be created.
func crawl() {
	jsonF, err := os.Create(filepath.Join(props.Ppt.WorkDir, props.Ppt.JsonFN))
	if err != nil {
		panic(err)
	}
	defer jsonF.Close()

	var txtF *os.File
	if len(props.Ppt.TxtFN) > 0 {
		txtF, err = os.Create(filepath.Join(props.Ppt.WorkDir, props.Ppt.TxtFN))
		if err != nil {
			panic(err)
		}
		// Close the txt file itself (the original closed jsonF a second time
		// here and leaked txtF).
		defer txtF.Close()
	}

	tmpJF, err := os.Create(filepath.Join(props.Ppt.WorkDir, props.Ppt.TJsonFN))
	if err != nil {
		panic(err)
	}
	defer tmpJF.Close()

	cookie := props.CachePpt.Cookie
	token := props.CachePpt.Token

	// Probe the cached credentials with a lookup for a fixed account name; a
	// result whose length differs from the sample fakeid is treated as
	// "credentials expired".
	fakeid := http.GetFakeid(cookie, token, "逻辑思维")
	if len(fakeid) != len("MjM5NjAxOTU4MA==") {
		log.Info("token, cookie 已过期,将重新获取并更新本地缓存记录")

		// Retry on timeout: at most authAttempts attempts, each given
		// authWait before the next one is started.
		const authAttempts = 3
		const authWait = 30 * time.Minute

		// Every attempt keeps its result in its own locals and hands back a
		// closure that applies it, so concurrent attempts never write
		// token/cookie directly. The buffer holds one slot per attempt so a
		// late finisher can never block forever on the send.
		results := make(chan func(), authAttempts)
		startAuth := func() {
			go func() {
				t, c := chrome.GetAuth()
				results <- func() { token, cookie = t, c }
			}()
		}

		startAuth()
		authorized := false
		for attempt := 1; attempt <= authAttempts && !authorized; attempt++ {
			select {
			case apply := <-results:
				apply()
				authorized = true
			case <-time.After(authWait):
				// Only start another attempt if there is still a wait
				// window left in which its result could be received.
				if attempt < authAttempts {
					startAuth()
				}
			}
		}
		if !authorized {
			mail.SendLog("扫码授权邮件超时失败")
			os.Exit(1)
		}

		props.CachePpt.Cookie = cookie
		props.CachePpt.Token = token
		props.UpdateCacheFile()
	}

	// Fill in the fakeid cache for any configured name that is missing.
	updateCache := false
	for _, src := range props.Ppt.Sources {
		for _, name := range src.Names {
			if len(props.CachePpt.FakeIds[name]) >= 1 {
				continue
			}
			log.Info("新增缓存fakeid记录", name)
			id := http.GetFakeid(cookie, token, name)
			props.CachePpt.FakeIds[name] = id
			props.CachePpt.NameFakeIds = append(props.CachePpt.NameFakeIds, props.NameId{Name: name, FakeId: id})
			updateCache = true
		}
	}
	if updateCache {
		props.UpdateCacheFile()
	}

	log.Info("爬取信息日期范围:", props.Ppt.BeginDay, props.Ppt.EdnDay)
	articles := make([]http.Article, 0, 200)
	for _, src := range props.Ppt.Sources {
		log.Debug("爬取公众号类型:", src.Tag)
		for _, name := range src.Names {
			log.Debug("爬取公众号:", name)
			arts := http.GetArticleList(cookie, token, props.CachePpt.FakeIds[name], props.Ppt.BeginDay, props.Ppt.EdnDay)
			// Pause between article-list requests.
			time.Sleep(time.Second * 10)
			log.Debug("公众号文章数量:", name, len(arts))
			for _, art := range arts {
				log.Debug("爬取文章:", name, art.Title)
				art.Source = name
				art.Tag = src.Tag
				art.Content = chrome.Visit(art.Link)
				// Pause between article page visits.
				time.Sleep(time.Second * 3)
				art.Content_hex = base64.StdEncoding.EncodeToString([]byte(art.Content))

				// When MustMatch is set, only keep articles whose title or
				// content contains at least one of the source's keywords.
				if src.MustMatch {
					match := false
					for _, kw := range src.HighlightMailWords {
						if strings.Contains(art.Title, kw) || strings.Contains(art.Content, kw) {
							match = true
							break
						}
					}
					if !match {
						continue
					}
				}

				// Collect for the summary file.
				articles = append(articles, art)

				// Append one JSON line per article to the temporary file.
				js, err := json.Marshal(art)
				if err != nil {
					log.Error("转换为json失败", err)
				} else {
					fmt.Fprintln(tmpJF, string(js))
				}

				// Append a plain-text rendering to the txt file, if enabled.
				if txtF != nil {
					fmt.Fprintln(txtF, art.Source, art.Title, art.Time)
					fmt.Fprintln(txtF, art.Content)
					fmt.Fprintln(txtF, "")
				}
			}
		}
	}

	// Write the summary file as tab-indented JSON.
	js, err := json.Marshal(articles)
	if err != nil {
		log.Error("转换为json失败", err)
		return
	}
	var bb bytes.Buffer
	if err := json.Indent(&bb, js, "", "\t"); err != nil {
		log.Error("转换为json失败", err)
		return
	}
	if _, err := fmt.Fprintln(jsonF, bb.String()); err != nil {
		log.Error("写汇总文件失败", err)
	}
}
// main is the program entry point.
//
// Unless props.Ppt.OnlyMail is set, it recreates the working directory,
// configures logging, and runs crawl in a goroutine with a three-hour limit;
// if the limit is hit it sends a log mail and exits with status 1.
// Afterwards, when props.Ppt.SupportMail is set, it sends the result mail.
func main() {
	if !props.Ppt.OnlyMail { // not a mail-only resend run
		os.RemoveAll(props.Ppt.WorkDir)
		os.MkdirAll(props.Ppt.WorkDir, os.ModeDir|os.ModePerm)

		// Logging switch.
		logPath := filepath.Join(props.Ppt.WorkDir, props.Ppt.LogFN)
		log.SetDebug(props.Ppt.Debug, logPath)

		// Run the crawl in the background so an overall timeout can be
		// enforced: alarm and exit if it takes too long.
		finished := make(chan bool)
		go func() {
			crawl()
			finished <- true
		}()

		deadline := time.After(3 * time.Hour)
		select {
		case <-finished:
			log.Info("爬取数据完成")
		case <-deadline:
			mail.SendLog("爬取数据超时失败")
			os.Exit(1)
		}
	}

	if props.Ppt.SupportMail {
		mail.SendResult()
	}
}