Skip to content

Commit

Permalink
代码可以运行,但没有解决验证码问题 (The code runs, but the CAPTCHA problem is not solved)
Browse files Browse the repository at this point in the history
  • Loading branch information
wen-fei committed Oct 9, 2017
1 parent d2d42fd commit 3d29931
Show file tree
Hide file tree
Showing 14 changed files with 1,635 additions and 139 deletions.
6 changes: 6 additions & 0 deletions CNKI/.idea/encodings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

435 changes: 351 additions & 84 deletions CNKI/.idea/workspace.xml

Large diffs are not rendered by default.

Binary file added CNKI/CNKI/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file added CNKI/CNKI/__pycache__/items.cpython-36.pyc
Binary file not shown.
Binary file added CNKI/CNKI/__pycache__/pipelines.cpython-36.pyc
Binary file not shown.
Binary file added CNKI/CNKI/__pycache__/settings.cpython-36.pyc
Binary file not shown.
51 changes: 51 additions & 0 deletions CNKI/CNKI/cnki_patent_info.csv

Large diffs are not rendered by default.

1,098 changes: 1,098 additions & 0 deletions CNKI/CNKI/dem.html

Large diffs are not rendered by default.

15 changes: 10 additions & 5 deletions CNKI/CNKI/middlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,17 +63,22 @@ def __init__(self, crawler):
self.ua = UserAgent()
self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")


def spider_opened(self, spider):
    """Scrapy signal handler: record that *spider* has started."""
    message = 'Spider opened: %s' % spider.name
    spider.logger.info(message)


@classmethod
def from_crawler(cls, crawler):
    """Scrapy factory hook: build the middleware from *crawler* settings."""
    instance = cls(crawler)
    return instance


def process_request(self, request, spider):
    """Attach a random User-Agent and a fixed CNKI session Cookie to the request.

    Downloader-middleware hook. Uses ``headers.setdefault`` so any header
    already set explicitly on the request is never overwritten.  The original
    had a second, dead ``setdefault("Cookie", ...)`` call (a no-op because the
    key was already set) plus commented-out clones of the live lines — removed.
    """
    def get_ua():
        # Resolve e.g. self.ua.random / self.ua.firefox per RANDOM_UA_TYPE.
        return getattr(self.ua, self.ua_type)

    request.headers.setdefault("User-Agent", get_ua())
    # Proxy support could be enabled here:
    # request.meta['proxy'] = ""
    # NOTE(review): hard-coded session cookie — it expires; this is why the
    # CAPTCHA problem remains unsolved. Replace with a real login flow.
    request.headers.setdefault("Cookie", "Ecp_ClientId=1171009095901465355; Ecp_IpLoginFail=171009112.81.2.110; RsPerPage=50; cnkiUserKey=dd6eca65-a22c-330b-d486-22684afbe7b2; ASP.NET_SessionId=5atsoskm5rxqkirzhct0vjdb; SID_kns=123122; SID_kinfo=125102; SID_klogin=125141; SID_kredis=125142; SID_krsnew=125132")
21 changes: 20 additions & 1 deletion CNKI/CNKI/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,27 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import csv

class CnkiPipeline(object):
    """Write each scraped patent item to ``cnki_patent_info.csv``.

    Opens the file when the spider starts, writes one CSV row per item,
    and closes the file when the spider finishes.
    """

    # CSV column order; must match the CnkiItem field names.
    # (The original list repeated publication_address / patent_inventor /
    # patent_agent / patent_agent_user, producing duplicate CSV columns.)
    colname = ['application_no', 'application_day', 'publication_no',
               'publication_day', 'publication_user', 'publication_address',
               'patent_inventor', 'patent_agent', 'patent_agent_user',
               'patent_summary', 'patent_main_item']

    def open_spider(self, spider):
        # newline='' lets the csv module control line endings (avoids blank
        # rows on Windows); utf-8 because the scraped fields are Chinese text
        # and the platform default encoding may not represent them.
        self.file = open('cnki_patent_info.csv', 'w', newline='', encoding='utf-8')
        self.writer = csv.DictWriter(self.file, self.colname)
        # Write the field names as the header row.
        self.writer.writeheader()

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        self.writer.writerow(item)
        return item
9 changes: 9 additions & 0 deletions CNKI/CNKI/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,12 @@
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Default headers sent with every request (Scrapy DEFAULT_REQUEST_HEADERS).
# The User-Agent and Referer values were truncated ("…") in the checked-in
# text; User-Agent is reconstructed from the identical Firefox 56 string used
# in middlewares.py, Referer from the spider's result-list URL — TODO confirm
# against a live session.
DEFAULT_REQUEST_HEADERS = {
    "Host": "kns.cnki.net",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Referer": "http://kns.cnki.net/kns/brief/result.aspx?dbPrefix=SCPD&PageName=ASP.brief_result_aspx",
    # NOTE(review): hard-coded session cookie — will expire. The original value
    # contained a stray space ("Ecp_ClientId=1 1710...") that corrupted the
    # cookie; fixed to match the value used in middlewares.py.
    "Cookie": "Ecp_ClientId=1171009095901465355; Ecp_IpLoginFail=171009112.81.2.110; RsPerPage=50; cnkiUserKey=dd6eca65-a22c-330b-d486-22684afbe7b2; ASP.NET_SessionId=5atsoskm5rxqkirzhct0vjdb; SID_kns=123122; SID_kinfo=125102; SID_klogin=125141; SID_kredis=125142; SID_krsnew=125132",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}
Binary file not shown.
Binary file added CNKI/CNKI/spiders/__pycache__/cnki.cpython-36.pyc
Binary file not shown.
139 changes: 90 additions & 49 deletions CNKI/CNKI/spiders/cnki.py
Original file line number Diff line number Diff line change
@@ -1,75 +1,116 @@
# Created by Landuy at 2017/10/9
import scrapy
import re
from scrapy import Request
from time import sleep
from urllib import parse
import requests
from CNKI.items import CnkiItem
import re


class cnkiSpider(scrapy.Spider):
    """Crawl CNKI patent search results (SCPD database) and scrape each
    patent's detail page into a CnkiItem.

    NOTE(review): relies on a hard-coded, expiring session cookie and does not
    handle CNKI's CAPTCHA (per the commit message) — requests will fail once
    the session is challenged. The rendered diff interleaved the old and new
    implementations (an unterminated ``params`` dict, a stray ``pass``);
    this is the reconstructed, coherent version.
    """

    name = 'cnkisp'
    # Domains only — a scheme ("http://www.cnki.net") here is invalid for
    # allowed_domains and breaks the offsite filter. Include the result-list
    # and detail-page hosts actually requested below.
    allowed_domains = ["www.cnki.net", "kns.cnki.net", "dbpub.cnki.net"]
    start_urls = ["http://kns.cnki.net/kns/brief/result.aspx?dbPrefix=SCPD"]

    # Session cookies captured from a logged-in browser session — they expire.
    cookies = {
        "ASP.NET_SessionId": "5atsoskm5rxqkirzhct0vjdb",
        "cnkiUserKey": "dd6eca65-a22c-330b-d486-22684afbe7b2",
        "Ecp_ClientId": "1171009095901465355",
        "Ecp_IpLoginFail": "171009112.81.2.110",
        "SID_kns": "123122",
        "SID_kinfo": "125102",
        "SID_klogin": "125141",
        "SID_kredis": "125142",
        "RsPerPage": "20",
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0",
        "Cookie": "Ecp_ClientId=1171009095901465355; Ecp_IpLoginFail=171009112.81.2.110; RsPerPage=50; cnkiUserKey=dd6eca65-a22c-330b-d486-22684afbe7b2; ASP.NET_SessionId=5atsoskm5rxqkirzhct0vjdb; SID_kns=123122; SID_kinfo=125102; SID_klogin=125141; SID_kredis=125142; SID_krsnew=125132",
    }
    # Keep 302 responses (CNKI redirects to a CAPTCHA page) instead of following them.
    meta = {'dont_redirect': True, 'handle_httpstatus_list': [302]}

    def start_requests(self):
        """Issue the first result-list request (page 1, 50 records per page)."""
        start_url = ("http://kns.cnki.net/kns/brief/brief.aspx?"
                     "curpage=1&RecordsPerPage=50"
                     "&QueryID=5"
                     "&ID=&turnpage=1"
                     "&tpagemode=L"
                     "&dbPrefix=SCPD"
                     "&Fields="
                     "&DisplayMode=listmode"
                     "&PageName=ASP.brief_result_aspx#J_ORDER&")
        yield Request(url=start_url, headers=self.headers, cookies=self.cookies)

    def parse(self, response):
        """Extract patent detail links from the result list, then follow the
        next-page link.

        :param response: result-list page (table.GridTableContent).
        """
        rows = response.css("table.GridTableContent tr")
        for row in rows[1:]:  # rows[0] is the header row
            base = ("http://dbpub.cnki.net/grid2008/dbpub/detail.aspx?"
                    "dbcode=SCPD&dbname=SCPD2017&filename=")
            patent_url = row.css("a.fz14::attr(href)").extract_first("")
            # e.g. /kns/detail/detail.aspx?QueryID=5&...&filename=CN103786360A
            match_re = re.match(r".*filename=(\w+)", patent_url)
            if not match_re:
                print("url错误")
                continue
            patent_detail_url = base + match_re.group(1)
            print("专利详情url:", patent_detail_url)
            yield Request(url=patent_detail_url, callback=self.parse_detail,
                          headers=self.headers, cookies=self.cookies,
                          meta=self.meta)

        # Follow the "next page" link (last anchor in the pager cell).
        # BUG FIX: the original passed callback=parse, which inside this method
        # resolves to the urllib.parse *module*, not this method — the request
        # callback must be self.parse.
        next_url = response.css("div.TitleLeftCell a::attr(href)").extract()[-1]
        print("next url is :", parse.urljoin(response.url, next_url))
        yield Request(url=parse.urljoin(response.url, next_url),
                      callback=self.parse, headers=self.headers,
                      cookies=self.cookies, meta=self.meta)

    def parse_detail(self, response):
        """Scrape one patent detail page into a CnkiItem.

        Row/cell positions are hard-coded against the ``table#box`` layout;
        a layout change (or a CAPTCHA interstitial page) will raise IndexError.
        """
        print("详情页提取")
        rows = response.css("table#box tr")

        def cells(i):
            # td texts of row *i*, with non-breaking spaces stripped.
            return [t.replace(u'\xa0', u'') for t in rows[i].css("td::text").extract()]

        row0 = cells(0)
        row1 = cells(1)
        row2 = cells(2)
        row7 = cells(7)

        item = CnkiItem()
        item['application_no'] = row0[1]
        item['application_day'] = row0[3]
        item['publication_no'] = row1[1]
        item['publication_day'] = row1[3]
        item['publication_user'] = row2[1]
        item['publication_address'] = row2[3]
        item['patent_inventor'] = cells(4)[1]
        item['patent_agent'] = row7[1]
        item['patent_agent_user'] = row7[3]
        item['patent_summary'] = cells(10)[1]
        item['patent_main_item'] = cells(11)[1]
        # main_cls_no / patent_cls_no / patent_title: not yet extracted.
        yield item

0 comments on commit 3d29931

Please sign in to comment.