-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparser.py
128 lines (90 loc) · 3.29 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
__author__ = 'jaeyoung'
from mobile import mTistory, mDaum, mEgloos, mNaver
from core import logger
import sys
import requests
import database
import signal
# -*- Constant -*-
# Mobile-Safari user agent sent with every request; presumably makes the blog
# hosts serve their lightweight mobile markup, which the mobile.* parser
# modules expect — TODO confirm against mTistory/mDaum/mNaver/mEgloos.
UserAgent = "Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/1A542a Safari/419.3"
# Identifier of this crawler instance; overwritten from argv[1] in init().
crawler_id = 0
# Graceful-shutdown flag: set True by the SIGINT handler, polled by main().
isKilled = False
def kill(signum, frame):
    """SIGINT handler: raise the shutdown flag so main() can stop cleanly."""
    # Only flip the module-level flag here; the actual teardown is done by
    # the main loop the next time it checks isKilled.
    global isKilled
    isKilled = True
def init():
    """Parse the crawler id from argv, install the SIGINT handler and
    initialise the database layer.

    Exits the process with status 1 when no crawler id argument is given.
    """
    global crawler_id
    if len(sys.argv) < 2:
        # print() with a single argument is valid in both Python 2 and 3;
        # the original `print "..."` statement is a SyntaxError on Python 3.
        print("Usage: %s [crawler id]" % sys.argv[0])
        sys.exit(1)
    # Translate Ctrl-C into a graceful-shutdown request instead of an
    # abrupt KeyboardInterrupt mid-crawl.
    signal.signal(signal.SIGINT, kill)
    crawler_id = int(sys.argv[1])
    database.init(crawler_id)
def main():
    """Crawl loop: repeatedly fetch one blog's metadata from the database,
    download its new articles and persist them, until SIGINT sets isKilled.

    Errors on a single blog or article are logged and skipped; the blog is
    rolled back so a later run can retry it.
    """
    while True:
        if isKilled:
            break
        # Pre-set so the outer except can log/rollback even when
        # database.get_meta() itself raises before b_id is assigned
        # (the original code hit a NameError in that path).
        b_id = None
        try:
            b_id, host, realm, last_crawl, last_post, succeed = database.get_meta()
            if not succeed:
                logger.log("Getting blog entity is failed")
                return
            try:
                article_list = get_article_list(host, realm, last_post)
            except Exception as e:
                # Best-effort: a broken blog must not stop the whole crawler.
                # (`as e` / str(e) replace the Python-2-only `, e` / e.message.)
                logger.log("##ERROR:get_list", host, str(e))
                database.flag_rollback(b_id)
                continue
            logger.log(host, " [", len(article_list), "]")
            success_count = 0
            for article in article_list:
                try:
                    data = get_article(article, realm)
                except Exception as e:
                    # One bad article must not abort the batch.
                    logger.log("##ERROR:get_article", article, str(e))
                    continue
                if len(data) == 0:
                    # Parser produced nothing usable (e.g. 404) — skip quietly.
                    continue
                if database.save_article(b_id, data):
                    success_count += 1
            logger.log(success_count, "accepted")
            database.flag(b_id, 0)
        except Exception as e:
            # Catch-all boundary: log, roll the blog back and keep looping.
            logger.log("##ERROR:global_error:", b_id, str(e))
            database.flag_rollback(b_id)
def get_article_list(host, realm=None, lp=None):
    """Fetch the blog front page at *host* and return a list of article URLs.

    host  -- blog address, with or without a scheme.
    realm -- explicit platform name ("Tistory", "Daum", "Naver", "Egloos");
             when None the platform is guessed from the host name.
    lp    -- last-post marker forwarded to the platform parser, presumably
             to fetch only newer entries — TODO confirm in the m* modules.

    Returns an empty list for a 404 response or an unrecognised platform.
    Network/timeout errors from requests propagate (handled in main()).
    """
    # Add a scheme only when none is present.  The original test
    # (`"http://" not in host`) prefixed "http://" onto https:// URLs too,
    # producing an invalid "http://https://..." address.
    if not host.startswith(("http://", "https://")):
        host = "http://" + host
    # `resp` instead of `re`, which shadowed the stdlib regex module name.
    resp = requests.get(host, headers={"User-agent": UserAgent}, timeout=5.0)
    article_list = []
    if resp.status_code == 404:
        return article_list
    if realm == "Tistory" or "tistory.com" in host:
        article_list = mTistory.get_article_list(host, lp)
    elif realm == "Daum" or "blog.daum.net" in host:
        article_list = mDaum.get_article_list(host, lp)
    elif realm == "Naver" or "naver.com" in host:
        article_list = mNaver.get_article_list(host, lp)
    elif realm == "Egloos" or "egloos.com" in host:
        article_list = mEgloos.get_article_list(host, lp)
    return article_list
def get_article(url, realm=None):
    """Fetch a single article page and parse it with the platform module.

    url   -- article address, with or without a scheme.
    realm -- explicit platform name; when None the platform is guessed
             from the URL.

    Returns the parsed article dict, or an empty dict for a 404 response
    or an unrecognised platform.  Network/timeout errors propagate
    (handled per-article in main()).
    """
    # Same scheme fix as get_article_list: only prefix "http://" when no
    # scheme is present, so https:// URLs are not mangled.
    if not url.startswith(("http://", "https://")):
        url = "http://" + url
    # `resp` instead of `re`, which shadowed the stdlib regex module name.
    resp = requests.get(url, headers={"User-agent": UserAgent}, timeout=5.0)
    data = {}
    if resp.status_code == 404:
        return data
    if realm == "Tistory" or "tistory.com" in url:
        data = mTistory.get_article(url, resp)
    elif realm == "Daum" or "blog.daum.net" in url:
        data = mDaum.get_article(url, resp)
    elif realm == "Naver" or "naver.com" in url:
        data = mNaver.get_article(url, resp)
    elif realm == "Egloos" or "egloos.com" in url:
        data = mEgloos.get_article(url, resp)
    return data
# Script entry point: configure from argv, then run the crawl loop until
# SIGINT is received.
if __name__ == "__main__":
    init()
    main()