Skip to content

Commit

Permalink
3.3.0 版本提交
Browse files Browse the repository at this point in the history
  • Loading branch information
shengchenyang committed Jun 21, 2023
1 parent f587668 commit 029d545
Show file tree
Hide file tree
Showing 39 changed files with 392 additions and 619 deletions.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,6 @@ pip install ayugespidertools -i https://pypi.org/simple
对以上 `GIF` 中的步骤进行解释:

```shell
# 注:ayugespidertools 的 cli 已剔除,现只能使用 ayuge。

# 查看库版本
ayuge version

Expand Down Expand Up @@ -110,6 +108,7 @@ scrapy crawl <spider_name>
+17).demo_mongo_async: asyncio 版本存储 mongoDB 的 pipelines 示例
+18).demo_mq: 数据存入 rabbitmq 的模板示例
+19).demo_kafka: 数据存入 kafka 的模板示例
+20).demo_file: 下载图片等文件到本地的模板示例
```

注:具体内容及时效性请以 [DemoSpider](https://github.com/shengchenyang/DemoSpider) 项目中描述为准。
Expand Down
5 changes: 2 additions & 3 deletions ayugespidertools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from ayugespidertools.items import MongoDataItem, MysqlDataItem
from ayugespidertools.items import AyuItem
from ayugespidertools.scraper.http.request import AiohttpRequest
from ayugespidertools.scraper.http.request.form import AiohttpFormRequest
from ayugespidertools.scraper.spiders import AyuSpider
Expand All @@ -7,8 +7,7 @@
__all__ = [
"AiohttpRequest",
"AiohttpFormRequest",
"MysqlDataItem",
"MongoDataItem",
"AyuItem",
"AyuSpider",
"AyuCrawlSpider",
]
2 changes: 1 addition & 1 deletion ayugespidertools/common/mongodbpipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def _get_insert_data(
"""
insert_data = ReuseOperation.get_items_except_keys(
dict_conf=item_dict,
key_list=["_table", "_item_mode", "_mongo_update_rule"],
keys=["_table", "_item_mode", "_mongo_update_rule"],
)
judge_item = next(iter(insert_data.values()))
# 是 namedtuple 类型
Expand Down
35 changes: 17 additions & 18 deletions ayugespidertools/common/multiplexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@
import cv2
import numpy as np
import pymysql
from itemadapter import ItemAdapter
from scrapy.settings import Settings
from twisted.internet.defer import Deferred

from ayugespidertools.common.typevars import MysqlConf
from ayugespidertools.config import logger
from ayugespidertools.items import MongoDataItem, MysqlDataItem, ScrapyItem
from ayugespidertools.items import AyuItem, ScrapyItem

__all__ = [
"ReuseOperation",
Expand Down Expand Up @@ -141,8 +142,8 @@ def get_conf_by_settings(vit_dir: str, inner_settings: Settings) -> Settings:

@staticmethod
def item_to_dict(
item: Union[MysqlDataItem, MongoDataItem, ScrapyItem, dict]
) -> dict:
item: Union[AyuItem, ScrapyItem, dict]
) -> Union[ItemAdapter, dict]:
"""
将 item 转换为 dict 类型
将 spider 中的 yield 的 item 转换为 dict 类型,方便后续处理
Expand All @@ -152,9 +153,7 @@ def item_to_dict(
Returns:
1). item 为 AyuItem 时返回 dict 类型的 item,否则返回包装该 item 的 ItemAdapter
"""
if isinstance(item, (MongoDataItem, MysqlDataItem)):
return item.asdict()
return dict(item)
return item.asdict() if isinstance(item, AyuItem) else ItemAdapter(item)

@staticmethod
def is_namedtuple_instance(x: Any) -> bool:
Expand All @@ -176,7 +175,7 @@ def get_files_from_path(path: str) -> list:
path: 需要判断的文件夹路径
Returns:
file_list: path 文件夹下的文件列表
1). path 文件夹下的文件列表
"""
return [f.path for f in os.scandir(path) if f.is_file()]

Expand Down Expand Up @@ -272,36 +271,36 @@ def is_dict_meet_min_limit(cls, dict_conf: dict, key_list: List[str]) -> bool:
def get_items_by_keys(
cls,
dict_conf: dict,
key_list: List[str],
keys: List[str],
) -> dict:
"""
获取 dict_conf 中的含有 key_list 的 key 的字段
获取 dict_conf 中的含有 keys 的 key 的字段
Args:
dict_conf: 需要处理的参数
key_list: 需要取的 key 值列表
keys: 需要取的 key 值列表
Returns:
1). 取值后的 dict;若 dict_conf 不满足最小限定(缺少所需的 key),则返回空字典
"""
# 参数先要满足最小限定,然后再取出限定的参数值;否则返回空字典
return (
{k: dict_conf[k] for k in key_list}
if cls.is_dict_meet_min_limit(dict_conf=dict_conf, key_list=key_list)
{k: dict_conf[k] for k in keys}
if cls.is_dict_meet_min_limit(dict_conf=dict_conf, key_list=keys)
else {}
)

@classmethod
def get_items_except_keys(cls, dict_conf, key_list: List[str]) -> dict:
def get_items_except_keys(cls, dict_conf, keys: List[str]) -> dict:
"""
获取 dict_conf 中的不含有 key_list 的 key 的字段
获取 dict_conf 中的不含有 keys 的 key 的字段
Args:
dict_conf: 需要处理的参数
key_list: 需要排除的 key 值列表
keys: 需要排除的 key 值列表
Returns:
1). dict_conf 排除 key_list 中的键值后的值
1). dict_conf 排除 keys 中的键值后的值
"""
return {k: dict_conf[k] for k in dict_conf if k not in key_list}
return {k: dict_conf[k] for k in dict_conf if k not in keys}

@classmethod
def create_database(cls, mysql_conf: MysqlConf) -> None:
Expand Down Expand Up @@ -388,7 +387,7 @@ def get_consul_conf(cls, settings: Settings) -> dict:
"""
consul_conf_dict = settings.get("CONSUL_CONFIG", {})
return cls.get_items_by_keys(
dict_conf=consul_conf_dict, key_list=["token", "url", "format"]
dict_conf=consul_conf_dict, keys=["token", "url", "format"]
)

@classmethod
Expand Down
6 changes: 2 additions & 4 deletions ayugespidertools/common/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pymysql
from itemadapter import ItemAdapter

from ayugespidertools.items import MongoDataItem, MysqlDataItem, ScrapyClassicItem
from ayugespidertools.items import AyuItem, ScrapyClassicItem

__all__ = [
"Param",
Expand All @@ -31,9 +31,7 @@ class Param:
PymysqlDictCursor = TypeVar("PymysqlDictCursor", bound="pymysql.cursors.DictCursor")
ItemAdapterType = TypeVar("ItemAdapterType", bound="ItemAdapter")
# 此框架中 Item 的类型种类
ScrapyItems = TypeVar(
"ScrapyItems", MysqlDataItem, MongoDataItem, ScrapyClassicItem
)
ScrapyItems = TypeVar("ScrapyItems", AyuItem, ScrapyClassicItem)

# 基本的请求头
base_headers = {
Expand Down
19 changes: 0 additions & 19 deletions ayugespidertools/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import pandas
import requests
import yaml
from itemadapter import ItemAdapter

from ayugespidertools.common.encryption import EncryptOperation
from ayugespidertools.common.multiplexing import ReuseOperation
Expand Down Expand Up @@ -243,24 +242,6 @@ def get_collate_by_charset(mysql_conf: MysqlConf) -> str:
), f"数据库配置出现未知 charset:{mysql_conf.charset},若抛错请查看或手动创建所需数据表!"
return collate

@staticmethod
def convert_items_to_dict(item) -> ItemAdapter:
"""
数据容器对象的包装器,提供了一个通用接口以统一的方式处理不同类型的对象,而不管它们的底层实现如何。
目前支持的类型有:
1. scrapy.item.Item
2. dict
3. dataclass 基础类
4. attrs 基础类
5. pydantic 基础类
Args:
item: 需要转换的项目,请查看支持类型
Returns:
1). 转换的 ItemAdapter 结果,可以通过 obj["params"] 或 obj.get("params") 来取值
"""
return ItemAdapter(item)

@staticmethod
def first_not_none(data_lst: List[Any]) -> Any:
"""
Expand Down
94 changes: 40 additions & 54 deletions ayugespidertools/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,13 @@
# python 3.8 无法优雅地使用 LiteralString,以下用 Literal 代替
MysqlItemModeStr = Literal["Mysql"]
MongoDBItemModeStr = Literal["MongoDB"]
MysqlDataItemTypeVar = TypeVar("MysqlDataItemTypeVar", bound="MysqlDataItem")
MongoDataItemTypeVar = TypeVar("MongoDataItemTypeVar", bound="MongoDataItem")
AyuItemTypeVar = TypeVar("AyuItemTypeVar", bound="AyuItem")

__all__ = [
"DataItem",
"ScrapyItem",
"ScrapyClassicItem",
"MysqlDataItem",
"MongoDataItem",
"AyuItem",
]


Expand Down Expand Up @@ -57,7 +55,7 @@ def __new__(cls, class_name, bases, attrs):

# 动态添加字段方法
def add_field(
self: Union[object, MysqlDataItemTypeVar, MongoDataItemTypeVar],
self: Union[object, AyuItemTypeVar],
key: str = None,
value: Any = None,
) -> None:
Expand All @@ -68,9 +66,11 @@ def add_field(
def _asdict(
self,
) -> Dict[str, Any]:
"""
将 AyuItem 转换为 dict
"""
_item_dict = {key: getattr(self, key) for key in self.fields}
_item_dict["_table"] = self._table
_item_dict["_item_mode"] = self._item_mode
if self._mongo_update_rule:
_item_dict["_mongo_update_rule"] = self._mongo_update_rule
return _item_dict
Expand All @@ -79,10 +79,18 @@ def _asitem(
self: Any,
assignment: bool = True,
) -> ScrapyItem:
"""
将 AyuItem 转换为 ScrapyItem
Args:
assignment: 是否将 AyuItem 中的值赋值给 ScrapyItem,默认为 True
Returns:
new_class: 转换 ScrapyItem 后的实例
"""
item_temp = ScrapyItem()
for k, v in self._asdict().items():
item_temp.fields[k] = scrapy.Field()
if any([assignment, k == "_item_mode"]):
if assignment:
item_temp[k] = v
return item_temp

Expand All @@ -93,67 +101,45 @@ def _asitem(


@dataclass
class MysqlDataItem(metaclass=ItemMeta):
class AyuItem(metaclass=ItemMeta):
"""
这个是 Scrapy item 的 Mysql 的存储结构
"""

_table: str = None
_item_mode: MysqlItemModeStr = "Mysql"

def __init__(self, _table, _item_mode: MysqlItemModeStr = _item_mode, **kwargs):
self._table = _table
self._item_mode = _item_mode
self._mongo_update_rule = None
self.fields = []
for key, value in kwargs.items():
setattr(self, key, value)
self.fields.append(key)

def __getitem__(self, key):
return getattr(self, key)

def __setitem__(self, key, value):
if key not in self.fields:
setattr(self, key, value)
self.fields.append(key)
else:
setattr(self, key, value)
def __delitem__(self, key):
if key in self.fields:
delattr(self, key)
self.fields.remove(key)

def __str__(self: Any):
return f"{self.__class__.__name__}({self._asdict()})"

def asdict(self: Any):
return self._asdict()

def asitem(self, assignment: bool = True):
return self._asitem(assignment)


@dataclass
class MongoDataItem(metaclass=ItemMeta):
"""
这个是 Scrapy item 的 mongoDB 的存储结构
Attributes:
_table (str): 数据库表名。
_mongo_update_rule (Dict[str, Any]): MongoDB 存储场景下可能需要的查重条件,默认为 None。
Examples:
>>> item = AyuItem(
>>> _table="test_table",
>>> title="test_title",
>>> _mongo_update_rule={"title": "test_title"},
>>> )
>>> item._table
'test_table'
>>> item.asdict()
{'title': 'test_title', '_table': 'test_table', '_mongo_update_rule': {'title': 'test_title'}}
>>> type(item.asitem())
<class 'ayugespidertools.items.ScrapyItem'>
"""

_table: str = None
_item_mode: MongoDBItemModeStr = "MongoDB"
_mongo_update_rule: Dict[str, Any] = None

def __init__(
self,
_table,
_item_mode: MongoDBItemModeStr = _item_mode,
_table: str,
_mongo_update_rule: Dict[str, Any] = None,
**kwargs,
):
"""
初始化 AyuItem 实例。
Args:
_table: 数据库表名。
_mongo_update_rule: MongoDB 存储场景下可能需要的查重条件,默认为 None。
"""
self._table = _table
self._item_mode = _item_mode
self._mongo_update_rule = _mongo_update_rule
self.fields = []
for key, value in kwargs.items():
Expand Down
4 changes: 3 additions & 1 deletion ayugespidertools/oss.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,9 @@ def put_oss(
return True, input_file_name

def enumer_file_by_pre(
self, prefix: str, count_by_type: Union[Param.Str_Lstr, Param.NoneType] = None
self,
prefix: str,
count_by_type: Union[Param.Str_Lstr, Param.NoneType, list] = None,
) -> list:
"""
列举 prefix 文件夹下的所有的 count_by_type 类型的文件元素
Expand Down
2 changes: 2 additions & 0 deletions ayugespidertools/pipelines.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from ayugespidertools.scraper.pipelines.download.file import FilesDownloadPipeline
from ayugespidertools.scraper.pipelines.mongo.asynced import AsyncMongoPipeline
from ayugespidertools.scraper.pipelines.mongo.fantasy import AyuFtyMongoPipeline
from ayugespidertools.scraper.pipelines.mongo.twisted import AyuTwistedMongoPipeline
Expand All @@ -18,4 +19,5 @@
"AyuTwistedMongoPipeline",
"AyuMQPipeline",
"AyuKafkaPipeline",
"FilesDownloadPipeline",
]
4 changes: 0 additions & 4 deletions ayugespidertools/scraper/middlewares/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@
from ayugespidertools.scraper.middlewares.proxy.exclusive import (
ExclusiveProxyDownloaderMiddleware,
)
from ayugespidertools.scraper.middlewares.proxy.private import (
PrivateProxyDownloaderMiddleware,
)

__all__ = [
"RandomRequestUaMiddleware",
Expand All @@ -25,5 +22,4 @@
"DynamicProxyDownloaderMiddleware",
"AbuDynamicProxyDownloaderMiddleware",
"ExclusiveProxyDownloaderMiddleware",
"PrivateProxyDownloaderMiddleware",
]
Loading

0 comments on commit 029d545

Please sign in to comment.