From 5d7b572bbc2be9891ab53f51107c744bdb1282dc Mon Sep 17 00:00:00 2001 From: xuke Date: Thu, 9 Mar 2023 13:25:25 +0800 Subject: [PATCH] =?UTF-8?q?=E7=88=AC=E8=99=AB=E5=8A=9F=E8=83=BD=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + .vscode/settings.json | 5 + CMakeLists.txt | 13 + bilibli-api.hpp | 24 + main.cpp | 188 + third-part/CMakeLists.txt | 2 + third-part/argparse/argparse.hpp | 707 + third-part/httplib/CMakeLists.txt | 3 + third-part/httplib/httplib.cc | 6614 ++++++++ third-part/httplib/httplib.h | 1939 +++ third-part/nlohmann/json.hpp | 24596 ++++++++++++++++++++++++++++ 11 files changed, 34092 insertions(+) create mode 100644 .gitignore create mode 100644 .vscode/settings.json create mode 100644 CMakeLists.txt create mode 100644 bilibli-api.hpp create mode 100644 main.cpp create mode 100644 third-part/CMakeLists.txt create mode 100644 third-part/argparse/argparse.hpp create mode 100644 third-part/httplib/CMakeLists.txt create mode 100644 third-part/httplib/httplib.cc create mode 100644 third-part/httplib/httplib.h create mode 100644 third-part/nlohmann/json.hpp diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..95e9d60 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/build/* \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..78fb68a --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "cmake.debugConfig": { + "args": ["1"] + } +} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..71570ac --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,13 @@ +project(bilibili-spider + DESCRIPTION "哔哩哔哩爬虫") + +cmake_minimum_required(VERSION 3.2) + +add_subdirectory(third-part) + +include_directories(third-part) + +add_executable(main main.cpp) + +target_link_libraries(main + http) diff --git a/bilibli-api.hpp 
b/bilibli-api.hpp
new file mode 100644
index 0000000..3f0f765
--- /dev/null
+++ b/bilibli-api.hpp
@@ -0,0 +1,41 @@
+#pragma once
+// NOTE(review): the header names on the #include lines were not visible in
+// this copy of the patch; they are restored from the third-part/ layout and
+// the namespaces used below -- confirm against the original commit.
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <httplib/httplib.h>
+#include <nlohmann/json.hpp>
+using namespace httplib;
+using namespace nlohmann;
+namespace bilibili
+{
+    typedef uint32_t uid_t;
+}
+
+namespace bilibili
+{
+    // Request headers passed to the API call in getUserInfo().
+    const Headers headers{{"User-Agent", "有事请联系xukela@qq.com,我会在看到邮件后处理。"}};
+
+    // Fetch the account info of user `uid` from api.bilibili.com and return
+    // the parsed JSON reply.
+    // Throws std::runtime_error when the HTTP request itself fails; a body
+    // that is not valid JSON makes json::parse throw.
+    inline json getUserInfo(bilibili::uid_t uid)
+    {
+        const char *host = "api.bilibili.com";
+        const char *path = "/x/space/acc/info";
+        const Params params{{std::string("mid"), std::to_string(uid)}};
+        Client cli(host);
+        Result res = cli.Get(path, params, headers);
+        // A failed request yields an empty Result whose response must not be
+        // accessed, so report the failure instead of calling value() on it.
+        if (!res)
+            throw std::runtime_error(std::string("request to ") + host + path +
+                                     " failed, httplib error " +
+                                     std::to_string(static_cast<int>(res.error())));
+        return json::parse(res.value().body);
+    }
+}
\ No newline at end of file
diff --git a/main.cpp b/main.cpp
new file mode 100644
index 0000000..43b4f13
--- /dev/null
+++ b/main.cpp
@@ -0,0 +1,233 @@
+// NOTE(review): the header names on the angle-bracket #include lines were not
+// visible in this copy of the patch; the list below is derived from what this
+// file uses directly -- confirm against the original commit.
+#include <sys/stat.h>
+#include <unistd.h>
+#include <csignal>
+#include <cstdlib>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <argparse/argparse.hpp>
+#include "bilibli-api.hpp"
+using namespace std;
+
+mutex m;                  // write protection: on a manual stop, finish writing the file before exiting
+bool spider_exit = false; // start() returns once this is true
+thread *spider;
+string out_dir;
+string user_name = "user.json";
+string err_name = "err.txt";
+string log_name = "log.txt";
+uint32_t delay;
+bilibili::uid_t suid; // first uid to fetch
+bilibili::uid_t euid; // last uid to fetch (inclusive)
+ofstream *fuser;
+ofstream *ferr;
+ofstream *flog;
+
+void start();
+void destory();
+string getCurrTime();
+void signal_handle(int sign);
+void addLog(const string &content);
+void init(const util::argparser &args);
+void addErr(bilibili::uid_t uid, const json &data);
+void saveFile(bilibili::uid_t uid, const json &data);
+
+// Parse the command line, run the crawler on a worker thread, wait for it to
+// finish, then close and release the output files.
+int main(int argc, char const *argv[])
+{
+    auto args = util::argparser("哔哩哔哩爬虫程序");
+    args.set_program_name("哔哩哔哩爬虫")
+        .add_help_option()
+        .add_argument("uid", "起始UID")
+        .add_option("-e", "--end", "结束UID(包括此UID),默认:0", 0)
+        .add_option("-d", "--delay", "爬虫延迟,默认:5秒", 5)
+        .add_option("-o", "--out-dir", "输出目录,默认:out", "out")
+        .parse(argc, argv);
+    init(args);
+    cout
+        << "爬虫开始\n起始UID:" << suid << endl
+        << "结束UID:" << euid << endl
+        << "输出目录:" << out_dir << endl
+        << "延迟时间:" << delay << " s\n\n"
+        << endl;
+    spider = new thread(start);
+    spider->join();
+    // The thread object is heap-allocated; release it once it has been joined.
+    delete spider;
+    spider = nullptr;
+    destory();
+    exit(EXIT_SUCCESS);
+}
+
+// SIGINT handler: sets spider_exit so that start() returns at its next check.
+// NOTE(review): locking a mutex and writing to iostreams are not
+// async-signal-safe; consider a lock-free atomic flag set from here and
+// reporting from start() instead.
+void signal_handle(int sign)
+{
+    cout << "发现信号: " << sign << endl;
+    m.lock();
+    switch (sign)
+    {
+    case SIGINT:
+        cerr << "手动中断,即将退出" << endl;
+        spider_exit = true;
+        break;
+
+    default:
+        cerr << "未知信号" << endl;
+        break;
+    }
+    m.unlock();
+}
+
+// Return the local time formatted as "YYYY-MM-DD HH:MM:SS" (empty on failure).
+string getCurrTime()
+{
+    const std::time_t now = std::time(nullptr);
+    const std::tm *local_time = std::localtime(&now);
+    // 19 characters of text plus the terminating NUL. Formatting into a real
+    // buffer replaces writing through c_str() with sizeof(std::string) as size.
+    char buf[20] = {};
+    const size_t len = local_time ? std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", local_time) : 0;
+    return string(buf, len);
+}
+
+// Append the "data" member of a successful reply to the user file as one
+// JSON line and echo uid and name to stdout. Exits if the file is not open.
+void saveFile(bilibili::uid_t uid, const json &data)
+{
+    if (!fuser->is_open())
+    {
+        string con = "保存用户信息出错 " + out_dir + user_name + "未打开";
+        addLog(con);
+        cerr << con << endl;
+        exit(EXIT_FAILURE);
+    }
+    string _data = nlohmann::to_string(data["data"]);
+    fuser->write(_data.c_str(), _data.length());
+    fuser->put('\n');
+    fuser->flush();
+    cout << getCurrTime() << " 用户编号:" << data["data"]["mid"] << " 用户名:" << data["data"]["name"] << endl;
+}
+
+// Record a failed lookup: "<uid>\t<message>" goes to stderr and the error file.
+void addErr(bilibili::uid_t uid, const json &data)
+{
+    string message(to_string(uid));
+    message += "\t";
+    message.append(data["message"]);
+    cerr << message << endl;
+    ferr->write(message.c_str(), message.length());
+    ferr->write("\n\n", 2);
+    ferr->flush();
+}
+
+// Append a timestamped entry (time, content, blank line) to the log file.
+void addLog(const string &content)
+{
+    // Named `stamp` rather than `time` so it does not shadow ::time().
+    const string stamp = getCurrTime();
+    flog->write(stamp.c_str(), stamp.length());
+    flog->put('\n');
+    flog->write(content.c_str(), content.length());
+    (*flog) << "\n\n";
+    flog->flush();
+}
+
+// Read the parsed options into the globals, create the output directory,
+// open the three output files and install the SIGINT handler.
+// NOTE(review): any explicit template arguments on the argparse calls are not
+// visible in this copy of the patch; the calls are kept exactly as found.
+void init(const util::argparser &args)
+{
+    // Load the arguments.
+    delay = args.get_option("-d");
+    suid = args.get_argument("uid");
+    euid = args.get_option("-e");
+    out_dir = args.get_option_string("-o");
+    // An empty -o value used to dereference rbegin() of an empty string;
+    // treat it as the current directory.
+    if (out_dir.empty())
+        out_dir = "./";
+    else if (out_dir.back() != '/')
+        out_dir.append("/");
+
+    // Set up the output directory and files.
+    mkdir(out_dir.c_str(), mode_t(0755));
+    fuser = new ofstream(out_dir + user_name);
+    ferr = new ofstream(out_dir + err_name);
+    flog = new ofstream(out_dir + log_name, ios::app);
+
+    signal(SIGINT, signal_handle);
+}
+
+// Log the normal shutdown, then close and release the three output files.
+void destory()
+{
+    string con("正常退出,当前uid: ");
+    con.append(to_string(suid));
+    addLog(con);
+    cout << con << endl;
+    fuser->close();
+    ferr->close();
+    flog->close();
+    // `delete fuser, ferr, flog;` is a comma expression that frees only
+    // fuser, so each stream is deleted on its own.
+    delete fuser;
+    delete ferr;
+    delete flog;
+    fuser = ferr = flog = nullptr;
+}
+
+// Crawl loop run on the worker thread: for each uid after the starting one,
+// fetch the user info, store the result or the error, then sleep `delay`
+// seconds. Returns when suid reaches euid or once spider_exit is set; with
+// euid == 0 the loop condition is always true.
+void start()
+{
+    suid--;
+    while (suid != euid or suid == 0)
+    {
+        try
+        {
+            bool stop;
+            {
+                // Scoped lock: released before any call that can throw, so the
+                // handler below never unlocks a mutex this thread does not hold.
+                lock_guard<mutex> guard(m);
+                stop = spider_exit;
+            }
+            if (stop)
+                return;
+
+            time_t t1 = time(nullptr);
+            suid++;
+            const json info = bilibili::getUserInfo(suid);
+            if (info["code"] == 0)
+                saveFile(suid, info);
+            else
+                addErr(suid, info);
+            time_t t2 = time(nullptr);
+            cout << "IO延迟: " << t2 - t1 << " 秒" << endl;
+            sleep(delay);
+        }
+        catch (const std::exception &e)
+        {
+            addLog(string("出错了:\n") + e.what());
+
+            // Print the timestamp; `time` here named the C function, not a value.
+            std::cerr << "出错了:" << endl
+                      << getCurrTime() << endl
+                      << e.what() << '\n';
+            exit(EXIT_FAILURE);
+        }
+    }
+}
\ No newline at end of file
diff --git a/third-part/CMakeLists.txt b/third-part/CMakeLists.txt
new file mode 100644
index 0000000..c79b715
--- /dev/null
+++ b/third-part/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(httplib)
+
diff --git a/third-part/argparse/argparse.hpp b/third-part/argparse/argparse.hpp
new file mode 100644
index 0000000..355641c
--- /dev/null
+++ b/third-part/argparse/argparse.hpp
@@ -0,0 +1,707 @@
+#pragma once
+#ifndef JSHL_ARGPARSE_HPP
+#define JSHL_ARGPARSE_HPP
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace util
+{
+
+// Although compiler-specific ABI functions could implement this fairly elegantly,
+// different compilers may show odd behavior for some types,
+// and above all std::string would still need a template specialization.
+// So we simply restrict the supported types here to avoid uncontrollable behavior.
+
+// Only bool, int, int64_t, double and std::string are supported.
+// For types of other widths, convert the value yourself after fetching it.
+// Of course, you may also define your own template specialization if you wish.
+
+template
+inline std::string type_string()
+{
+    return "null";
+}
+
+template <>
+inline std::string type_string()
+{
+    return "bool";
+}
+
+template <>
+inline std::string
type_string() +{ + return "int"; +} + +template <> +inline std::string type_string() +{ + return "int64_t"; +} + +template <> +inline std::string type_string() +{ + return "double"; +} + +template <> +inline std::string type_string() +{ + return "string"; +} + +template +std::string to_string(const T &value) +{ + std::ostringstream oss; + oss << value; + return oss.str(); +} + +template +T parse_value(const std::string &value) +{ + std::istringstream iss(value); + T result; + iss >> result; + return result; +} + +struct short_circuit_option +{ + short_circuit_option(std::string sname, std::string lname, std::string help, std::function callback) + : short_name(std::move(sname)), long_name(std::move(lname)), help(std::move(help)), + callback(std::move(callback)) + {} + std::string short_name; + std::string long_name; + std::string help; + std::function callback; +}; + +struct option +{ + option(std::string sname, std::string lname, std::string help, std::string type, std::string value) + : short_name(std::move(sname)), long_name(std::move(lname)), help(std::move(help)), type(std::move(type)), + value(std::move(value)) + {} + + std::string short_name; + std::string long_name; + std::string help; + std::string type; + std::string value; +}; + +struct argument +{ + argument(std::string name, std::string help, std::string type) + : name(std::move(name)), help(std::move(help)), type(std::move(type)) + {} + + std::string name; + std::string help; + std::string type; + std::string value; +}; + +class argparser +{ + private: + std::string description; + std::string program_name; + std::vector short_circuit_options; + std::vector