diff --git a/docs/en/quickstart/concepts/images/modes-flow.png b/docs/en/quickstart/concepts/images/modes-flow.png index c4856e6a5f9..361353de760 100644 Binary files a/docs/en/quickstart/concepts/images/modes-flow.png and b/docs/en/quickstart/concepts/images/modes-flow.png differ diff --git a/docs/en/quickstart/concepts/index.rst b/docs/en/quickstart/concepts/index.rst index d02cca2378f..27542f7f2f7 100644 --- a/docs/en/quickstart/concepts/index.rst +++ b/docs/en/quickstart/concepts/index.rst @@ -5,4 +5,4 @@ Concept .. toctree:: :maxdepth: 1 - workflow + modes diff --git a/docs/en/quickstart/concepts/workflow.md b/docs/en/quickstart/concepts/modes.md similarity index 79% rename from docs/en/quickstart/concepts/workflow.md rename to docs/en/quickstart/concepts/modes.md index 2ce5c58ff19..d27f33ab001 100644 --- a/docs/en/quickstart/concepts/workflow.md +++ b/docs/en/quickstart/concepts/modes.md @@ -6,7 +6,7 @@ OpenMLDB supports different execution modes at different stages of the feature e The following diagram illustrates the typical process of using OpenMLDB for feature engineering development and deployment, as well as the execution modes used in the process: -![image-20220310170024349](https://openmldb.ai/docs/zh/main/_images/modes-flow.png) +![image-20220310170024349](images/modes-flow.png) 1. Offline Data Import: Import offline data for offline feature engineering development and debugging. 2. Offline Feature Development: Develop feature engineering scripts and debug them until satisfactory results are achieved. This step involves joint debugging of machine learning models (such as XGBoost, LightGBM, etc.), but this article mainly focuses on feature engineering development related to OpenMLDB. @@ -16,57 +16,54 @@ The following diagram illustrates the typical process of using OpenMLDB for feat 6. Online Data Preview (optional): Preview and check online data using supported SQL commands. This step is not mandatory. 7. Real-time Feature Calculation: After the feature scheme is deployed and the data is correctly accessed, a real-time feature calculation service that can respond to online requests will be obtained. -## Overview of execution mode +## Overview of Execution Mode -As the data objects for offline and online scenarios are different, their underlying storage and computing nodes are also different. Therefore, OpenMLDB provides several built-in execution modes to support completing the above steps. The following table summarizes the execution modes and development tools used for each step, and three execution modes will be discussed in detail later. +As the data objects for offline and online scenarios are different, their underlying storage and computing nodes are also different. Therefore, OpenMLDB provides several built-in execution modes to support the above steps. The following table summarizes the execution modes and development tools used for each step, and three execution modes will be discussed in detail later. | Steps | Execution Mode | Development Tool | | ------------------------------ | ------------------- | ------------------------------------------------------------ | | 1. Offline Data Import | Offline Mode | OpenMLDB CLI, SDKs | -| Offline Feature Development | Offline Mode | OpenMLDB CLI, SDKs | -| Feature Deployment | Offline Mode | OpenMLDB CLI, SDKs | -| Cold Start Online Data Import | Online Preview Mode | OpenMLDB CLI, SDKs, [Data Import Tool](https://openmldb.ai/docs/zh/main/tutorial/data_import.html) | -| Real-time Data Integration | Online Preview Mode | Connectors, SDKs | -| Online Data Preview (optional) | Online Preview Mode | OpenMLDB CLI, SDKs, [Data Export Tool](https://openmldb.ai/docs/zh/main/tutorial/data_export.html) | -| Real-time Feature Calculation | Online Request Mode | CLI (REST APIs), SDKs | +| 2. Offline Feature Development | Offline Mode | OpenMLDB CLI, SDKs | +| 3. Feature Deployment | Offline Mode | OpenMLDB CLI, SDKs | +| 4. Cold Start Online Data Import | Online Preview Mode | OpenMLDB CLI, SDKs, [Data Import Tool](../../tutorial/data_import.md) | +| 5. Real-time Data Integration | Online Preview Mode | Connectors, SDKs | +| 6. Online Data Preview (optional) | Online Preview Mode | OpenMLDB CLI, SDKs, [Data Export Tool](../../tutorial/data_export.md) | +| 7. Real-time Feature Calculation | Online Request Mode | CLI (REST APIs), SDKs | ### Offline Mode -After starting OpenMLDB CLI, the **default mode is offline mode**. Offline data import, offline feature development, and feature deployment are all executed in offline mode. The purpose of offline mode is to manage and compute offline data. The computing nodes involved are supported by OpenMLDB Spark optimized for feature engineering, and the storage nodes support commonly used storage systems such as HDFS. +After starting OpenMLDB CLI, the **default mode is offline mode**. Offline data import, offline feature development, and feature deployment are all executed in offline mode. The purpose of offline mode is to manage and compute offline data. The computing nodes involved are supported by [OpenMLDB Spark Distribution](../../tutorial/openmldbspark_distribution.md) optimized for feature engineering, and the storage nodes support commonly used storage systems such as HDFS. Offline mode has the following main features: -- The offline mode supports most of the SQL syntax provided by OpenMLDB, including complex SQL syntaxes such as `LAST JOIN` and `WINDOW UNION`, which are optimized for feature engineering. - -- In offline mode, some SQL commands are executed asynchronously, such as `LOAD DATA`, `SELECT`, and `SELECT INTO` commands. Other SQL commands are executed synchronously. - +- The offline mode supports most of the SQL syntax provided by OpenMLDB, including complex SQL syntax such as `LAST JOIN` and `WINDOW UNION`. +- In offline mode, some SQL commands are executed asynchronously, such as `LOAD DATA`, `SELECT`, and `SELECT INTO`. Other SQL commands are executed synchronously. - The asynchronous SQL is managed by the internal TaskManager and can be viewed and managed through commands such as `SHOW JOBS`, `SHOW JOB`, and `STOP JOB`. -```{tip} -::: +:::{tip} Unlike many relational database systems, the `SELECT` command in offline mode is executed asynchronously by default. If you need to set it to synchronous execution, refer to setting the command to run synchronously in offline mode. During offline feature development, if asynchronous execution is used, it is strongly recommended to use the `SELECT INTO` statement for development and debugging, which can export the results to a file for easy viewing. ::: -``` -The `DEPLOY` command for feature deployment is also executed in offline mode. Its specification can refer to the OpenMLDB SQL online specification and requirements. + +The `DEPLOY` command for feature deployment is also executed in offline mode. Its specification can refer to the [OpenMLDB SQL online specification and requirements](../../openmldb_sql/deployment_manage/ONLINE_REQUEST_REQUIREMENTS.md). Offline mode setting command (OpenMLDB CLI): `SET @@execute_mode='offline'`. -### Online preview mode +### Online Preview Mode Cold start online data import, real-time data access, and online data preview are executed in online preview mode. The purpose of the online preview mode is to manage and preview online data. Storage and computation of online data are supported by the tablet component. The main features of the online preview mode are: - `LOAD DATA`, used for online data import, can be done either locally (load_mode='local') or on the cluster (load_mode='cluster'). Local import is synchronous, while cluster import is asynchronous (same as in offline mode). Other operations are synchronous. -- Online preview mode is mainly used for previewing limited data. Selecting and viewing data directly through SELECT in OpenMLDB CLI or SDKs may result in data truncation. If the data volume is large, it is recommended to use an [export tool](https://openmldb.ai/docs/zh/main/tutorial/data_export.html) to view the complete data. -- SELECT statements in online preview mode currently do not support more complex queries such as `LAST JOIN` and `ORDER BY`. Refer to [SELECT](https://openmldb.ai/docs/zh/main/openmldb_sql/dql/SELECT_STATEMENT.html). +- Online preview mode is mainly used for previewing limited data. Selecting and viewing data directly through SELECT in OpenMLDB CLI or SDKs may result in data truncation. If the data volume is large, it is recommended to use an [export tool](../../tutorial/data_export.html) to view the complete data. +- SELECT statements in online preview mode currently do not support more complex queries such as `LAST JOIN` and `ORDER BY`. Refer to [SELECT](../../openmldb_sql/dql/SELECT_STATEMENT.html). - The server in the online preview mode executes SQL statements on a single thread. For large data processing, it may be slow and may trigger a timeout. To increase the timeout period, the `--request_timeout` can be configured on the client. -- To prevent impact on online services, online preview mode limits the maximum number of accessed records and the number of different keys. This can be configured using `--max_traverse_cnt` and `--max_traverse_key_cnt`. Similarly, the maximum result size can be set using `--scan_max_bytes_size`. For detailed configuration, refer to the configuration file. +- To prevent impact on online services, online preview mode limits the maximum number of accessed records and the number of different keys. This can be configured using `--max_traverse_cnt` and `--max_traverse_key_cnt`. Similarly, the maximum result size can be set using `--scan_max_bytes_size`. For detailed configuration, refer to the [configuration file](../../deploy/conf.md). The command for setting online preview mode in OpenMLDB CLI: `SET @@execute_mode='online'` -### Online request mode +### Online Request Mode After deploying feature scripts and accessing online data, the real-time feature computing service is ready to use, and real-time feature extraction can be performed through the online request mode. REST APIs and SDKs support the online request mode. The online request mode is a unique mode in OpenMLDB that supports real-time online computing and is very different from common SQL queries in databases. @@ -78,7 +75,7 @@ The online request mode requires three inputs: Based on the above inputs, for each real-time request row, the online request mode will return a feature extraction result. The computing logic is as follows: The request row is virtually inserted into the correct position of the online data table based on the logic in the SQL script (such as `PARTITION BY`, `ORDER BY`, etc.), and then only the feature aggregation computing is performed on that row, returning the unique corresponding extraction result. The following diagram intuitively explains the operation process of the online request mode. -![modes-request](https://openmldb.ai/docs/zh/main/_images/modes-request.png) +![modes-request](images/modes-request.png) Online request mode is supported in the following ways: diff --git a/docs/poetry.lock b/docs/poetry.lock index fa522bb44da..724b4f19340 100644 --- a/docs/poetry.lock +++ b/docs/poetry.lock @@ -670,13 +670,13 @@ test = ["coverage", "pytest", "pytest-cov"] [[package]] name = "urllib3" -version = "1.26.17" +version = "1.26.18" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ - {file = "urllib3-1.26.17-py2.py3-none-any.whl", hash = "sha256:94a757d178c9be92ef5539b8840d48dc9cf1b2709c9d6b588232a055c524458b"}, - {file = "urllib3-1.26.17.tar.gz", hash = "sha256:24d6a242c28d29af46c3fae832c36db3bbebcc533dd1bb549172cd739c82df21"}, + {file = "urllib3-1.26.18-py2.py3-none-any.whl", hash = "sha256:34b97092d7e0a3a8cf7cd10e386f401b3737364026c45e622aa02903dffe0f07"}, + {file = "urllib3-1.26.18.tar.gz", hash = "sha256:f8ecc1bba5667413457c529ab955bf8c67b45db799d159066261719e328580a0"}, ] [package.extras] diff --git a/docs/zh/maintain/diagnose.md b/docs/zh/maintain/diagnose.md index eef7db5b5a1..cb5d7a30f74 100644 --- a/docs/zh/maintain/diagnose.md +++ b/docs/zh/maintain/diagnose.md @@ -8,14 +8,76 @@ 安装方式与使用: ```bash -pip install openmldb-tool # openmldb-tool[rpc] +pip install openmldb-tool # openmldb-tool[pb] openmldb_tool # 注意下划线 ``` 有以下几个子命令可选择执行: ```bash -usage: openmldb_tool [-h] [--helpfull] {status,inspect,test,static-check} ... +usage: openmldb_tool [-h] [--helpfull] {status,inspect,rpc,test,static-check} ... ``` -只有`static-check`静态检查命令需要指定`--dist_conf`参数,该参数指定OpenMLDB节点分布的配置文件。其他命令只需要`--cluster`参数,格式为`/`,默认为镜像中的OpenMLDB集群地址`127.0.0.1:2181/openmldb`。如果是自行设置的OpenMLDB集群,请配置此参数。 + +注意`-c/--cluster`参数,格式为`/`,默认将访问`127.0.0.1:2181/openmldb`。如果是自行设置的OpenMLDB集群,请配置此参数。其他参数根据子命令不同而不同,可以使用`-h`查看,或查看各个子命令的详细文档。 + +### 一键inspect + +`openmldb_tool inspect [--cluster=0.0.0.0:2181/openmldb]`可以一键查询,得到完整的集群状态报告。如果需要局部视角或额外的诊断功能,才需要其他子命令。 + +报告分为几个板块,其中如果所有表都是健康的,不会展示Ops和Partitions板块。用户首先看报告末尾的总结 summary & hint,如果存在server offline(红色),需先重启server,保证server尤其是TabletServer都在线。server重启后,集群可能会尝试自动修复,自动修复也可能会失败,所以,用户有必要等待一定时间后再次inspect。此时如果仍然有不健康的表,可以检查它们的状态,Fatal表需要尽快修复,它们可能会读写失败,Warn表,用户可以考虑推迟修复。修复方式见报告末尾提供的文档。 + +`inspect`可配置参数除了`--cluster/-c`,还可配置不显示彩色`--nocolor/-noc`方便复制,以及`--table_width/-tw n`配置表格宽度,`--offset_diff_thresh/-od n`配置offset diff的报警阈值。 + +``` +diagnosing cluster xxx + + +Server Detail +{server map} +{server online/offline report} + + +Table Partitions Detail +tablet server order: {tablet ip -> idx} +{partition tables of unhealthy tables} +Example: +{a detailed description of partition table} + + +Ops Detail +> failed ops do not mean cluster is unhealthy, just for reference +last one op(check time): {} +last 10 ops != finished: +{op list} + + + +================== +Summary & Hint +================== +Server: + +{online | offline servers ['[tablet]xxx'], restart them first} + +Table: +{all healthy | unhealthy tables desc} +[]Fatal/Warn table, {read/write may fail or still work}, {repair immediatly or not} +{partition detail: if leader healthy, if has unhealthy replicas, if offset too large, related ops} + + Make sure all servers online, and no ops for the table is running. + Repair table manually, run recoverdata, check https://openmldb.ai/docs/zh/main/maintain/openmldb_ops.html. + Check 'Table Partitions Detail' above for detail. +``` + +### 其他常用命令 + +除了一键inspect,在这样几个场景中,我们推荐使用诊断工具的子命令来帮助用户判断集群状态、简化运维。 + +- 部署好集群后,可以使用`test`测试集群是否能正常工作,不需要用户手动测试。如果发现问题,再使用`inspect`诊断。 +- 组件都在线,但出现超时或错误提示某组件无法连接时,可以使用`status --conn`检查与各组件的连接,会打印出简单访问的耗时。也可以用它来测试客户端主机与集群的连接情况,及时发现网络隔离。 +- 离线job如果出现问题,`SHOW JOBLOG id`可以查看日志,但经验较少的用户可能会被日志中的无关信息干扰,可以使用`inspect job`来提取job日志中的关键信息。 +- 离线job太多时,CLI中的展示会不容易读,可以使用`inspect offline`筛选所有failed的job,或者`inspect job --state `来筛选出特定状态的job。 +- 在一些棘手的问题中,可能需要用户通过RPC来获得一些信息,帮助定位问题。`openmldb_tool rpc`可以帮助用户简单快速地调用RPC,降低运维门槛。 +- 没有Prometheus监控时,可以通过`inspect online --dist`获得数据分布信息。 +- 如果你的操作节点到各个组件的机器是ssh免密的,那么,可以使用`static-check`检查配置文件是否正确,版本是否统一,避免部署失败。还可以一键收集整个集群的日志,方便打包并提供给开发人员分析。 ## 子命令详情 @@ -29,7 +91,8 @@ usage: openmldb_tool status [-h] [--helpfull] [--diff] optional arguments: -h, --help show this help message and exit --helpfull show full help message and exit - --diff check if all endpoints in conf are in cluster. If set, need to set `--conf_file` + --diff check if all endpoints in conf are in cluster. If set, need to set `-f,--conf_file` + --conn check network connection of all servers ``` - 简单查询集群状态: @@ -48,6 +111,11 @@ optional arguments: +-----------------+-------------+---------------+--------+---------+ ``` +- 检查并测试集群链接与版本: + ``` + openmldb_tool status --conn + ``` + #### 检查配置文件与集群状态是否一致 如果指定`--diff`参数,会检查配置文件中的所有节点是否都在已经启动的集群中,如果有节点不在集群中,会输出异常信息。如果集群中有节点不在配置文件中,不会输出异常信息。需要配置`-f,--conf_file`,例如,你可以在镜像里这样检查: @@ -57,7 +125,8 @@ openmldb_tool status --diff -f=/work/openmldb/conf/hosts ### inspect 检查 -`inspect`用于检查集群的在线和离线两个部分是否正常工作,可以选择单独检查`online`或`offline`,不指定则都检查。可以定期执行检查,以便及时发现异常。 +如果是为了检查集群状态,更推荐一键`inspect`获取集群完整检查报告,`inspect`子命令是更具有针对性的检查。 + ``` openmldb_tool inspect -h usage: openmldb_tool inspect [-h] [--helpfull] {online,offline,job} ... @@ -68,19 +137,26 @@ positional arguments: offline only inspect offline jobs. job show jobs by state, show joblog or parse joblog by id. ``` -在线检查会检查集群中的表状态(包括系统表),并输出有异常的表,包括表的状态,分区信息,副本信息等,等价于`SHOW TABLE STATUS`并筛选出有异常的表。如果发现集群表现不正常,请先检查下是否有异常表。例如,`SHOW JOBS`无法正常输出历史任务时,可以`inspect online`检查一下是否是job系统表出现问题。 + +#### online在线检查 + +`inspect online`检查在线表的健康状态,并输出有异常的表,包括表的状态,分区信息,副本信息等,等价于`SHOW TABLE STATUS`并筛选出有异常的表。 ##### 检查在线数据分布 -在线检查中,可以使用`inspect online --dist`检查在线数据分布,默认检查所有数据库,可以使用`--db`指定要检查的数据库。若要查询多个数据库,请使用 ',' 分隔数据库名称。会输出数据库在各个节点上的数据分布情况。 +可以使用`inspect online --dist`检查在线数据分布,默认检查所有数据库,可以使用`--db`指定要检查的数据库。若要查询多个数据库,请使用 ',' 分隔数据库名称。会输出数据库在各个节点上的数据分布情况。 -#### 离线检查 +#### offline离线检查 -离线检查会输出最终状态为失败的任务(不检查“运行中”的任务),等价于`SHOW JOBS`并筛选出失败任务。 +`inspect offline`离线检查会输出最终状态为失败的任务(不检查“运行中”的任务),等价于`SHOW JOBS`并筛选出失败任务。更多功能待补充。 #### JOB 检查 -JOB 检查会检查集群中的离线任务,可以使用`inspect job`或`inspect job --state all`查询所有任务,等价于`SHOW JOBS`并按job_id排序。使用`inspect job --state `可以筛选出特定状态的日志,可以使用 ',' 分隔,同时查询不同状态的日志。例如:`inspect offline` 相当于`inspect job --state failed,killed,lost`即筛选出所有失败的任务。 +JOB 检查是更灵活的离线任务检查命令,可以按条件筛选job,或针对单个job日志进行分析。 + +##### 按state筛选 + +可以使用`inspect job`或`inspect job --state all`查询所有任务,等价于`SHOW JOBS`并按job_id排序。使用`inspect job --state `可以筛选出特定状态的日志,可以使用 ',' 分隔,同时查询不同状态的日志。例如:`inspect offline` 相当于`inspect job --state failed,killed,lost`即筛选出所有失败的任务。 以下是一些常见的state: @@ -93,8 +169,13 @@ JOB 检查会检查集群中的离线任务,可以使用`inspect job`或`inspe 更多state信息详见[Spark State]( https://spark.apache.org/docs/3.2.1/api/java/org/apache/spark/launcher/SparkAppHandle.State.html),[Yarn State](https://hadoop.apache.org/docs/current/api/org/apache/hadoop/yarn/api/records/YarnApplicationState.html) +##### 解析单个JOB日志 -使用`inspect job --id `查询指定任务的log日志,其结果会使用配置文件筛选出主要错误信息。如需更新配置文件,可以添加`--conf-update`,并且可以使用`--conf-url`配置镜像源,例如使用`--conf-url https://openmldb.ai/download/diag/common_err.yml`配置国内镜像。如果需要完整的日志信息,可以添加`--detail`获取详细信息。 +使用`inspect job --id `查询指定任务的log日志,其结果会使用配置文件筛选出主要错误信息。 + +解析依靠配置文件,默认情况会自动下载。如需更新配置文件,可以`--conf-update`,它将会在解析前强制下载一次配置文件。如果默认下载源不合适,可以同时配置`--conf-url`配置镜像源,例如使用`--conf-url https://openmldb.ai/download/diag/common_err.yml`配置国内镜像。 + +如果只需要完整的日志信息而不是解析日志的结果,可以使用`--detail`获取详细信息,不会打印解析结果。 ### test 测试 @@ -185,7 +266,6 @@ nameserver: 如果检查配置文件或日志,将会把收集到的文件保存在`--collect_dir`中,默认为`/tmp/diag_collect`。你也也可以访问此目录查看收集到的配置或日志,进行更多的分析。 - #### 检查示例 在镜像容器中可以这样静态检查: @@ -193,14 +273,15 @@ nameserver: openmldb_tool static-check --conf_file=/work/openmldb/conf/hosts -VCL --local ``` -### rpc +### RPC 接口 + +`openmldb_tool`还提供了一个RPC接口,它可以让我们发送RPC更容易,不需要定位Server的IP,拼接RPC方法URL路径,也可以提示所有RPC方法和RPC方法的输入结构。使用方式是`openmldb_tool rpc`,例如,`openmldb_tool rpc ns ShowTable --field '{"show_all":true}'`可以调用`nameserver`的`ShowTable`接口,获取表的状态信息。 -`openmldb_tool`还提供了一个RPC接口,但它是一个额外组件,需要通过`pip install openmldb-tool[rpc]`安装。使用方式是`openmldb_tool rpc`,例如,`openmldb_tool rpc ns ShowTable --field '{"show_all":true}'`可以调用`nameserver`的`ShowTable`接口,获取表的状态信息。 +其中组件不使用ip,可以直接使用角色名。NameServer与TaskManager只有一个活跃,所以我们用ns和tm来代表这两个组件。而TabletServer有多个,我们用`tablet1`,`tablet2`等来指定某个TabletServer,从1开始,顺序可通过`openmldb_tool rpc`或`openmldb_tool status`来查看。 -NameServer与TaskManager只有一个活跃,所以我们用ns和tm来代表这两个组件。 -而TabletServer有多个,我们用`tablet1`,`tablet2`等来指定某个TabletServer,顺序可通过`openmldb_tool rpc`或`openmldb_tool status`来查看。 +如果对RPC服务的方法或者输入参数不熟悉,可以通过`openmldb_tool rpc [method] --hint`查看帮助信息。但它是一个额外组件,需要通过`pip install openmldb-tool[pb]`安装。hint还需要额外的pb文件,帮助解析输入参数,默认是从`/tmp/diag_cache`中读取,如果不存在则自动下载。如果你已有相应的文件,或者已经手动下载,可以通过`--pbdir`指定该目录。自行编译pb文件,见[openmldb tool开发文档](https://github.com/4paradigm/OpenMLDB/blob/main/python/openmldb_tool/README.md#rpc)。 -如果对RPC服务的方法或者输入参数不熟悉,可以通过`openmldb_tool rpc [method] --hint`查看帮助信息。例如: +例如: ```bash $ openmldb_tool rpc ns ShowTable --hint ... @@ -212,9 +293,7 @@ You should input json like this, ignore round brackets in the key and double quo "(optional)show_all": "bool" }' ``` -hint还需要额外的pb文件,帮助解析输入参数,默认是从`/tmp/diag_cache`中读取,如果不存在则自动下载。如果你已有相应的文件,或者已经手动下载,可以通过`--pbdir`指定该目录。 ## 附加 可使用`openmldb_tool --helpfull`查看所有配置项。例如,`--sdk_log`可以打印sdk的日志(zk,glog),可用于调试。 - \ No newline at end of file diff --git a/docs/zh/quickstart/beginner_must_read.md b/docs/zh/quickstart/beginner_must_read.md index 1bb4e01fe3e..7f0a2adbacc 100644 --- a/docs/zh/quickstart/beginner_must_read.md +++ b/docs/zh/quickstart/beginner_must_read.md @@ -2,7 +2,21 @@ 由于OpenMLDB是分布式系统,多种模式,客户端丰富,初次使用可能会有很多疑问,或者遇到一些运行、使用问题,本文从新手使用的角度,讲解如何进行诊断调试,需要帮助时如何提供有效信息给技术人员等等。 -## OpenMLDB集群管理 +## 错误诊断 + +在使用OpenMLDB的过程中,除了SQL语法错误,其他错误信息可能不够直观,但很可能与集群状态有关。所以,错误诊断需要**先确认集群状态**。在发现错误时,请先使用诊断工具的一键诊断功能。一键诊断可以输出全面直观的诊断报告,如果不能使用此工具,可以手动执行`SHOW COMPONENTS;`和`SHOW TABLE STATUS LIKE '%';`提供部分信息。 + +报告将展示集群的组件、在线表等状态,也会提示用户如何修复,请按照报告内容进行操作,详情见[一键inspect](../maintain/diagnose.md#一键inspect)。 + +``` +openmldb_tool inspect [-c=0.0.0.0:2181/openmldb] +``` + +需要注意,由于离线存储只会在执行离线job时被读取,而离线job也不是一个持续的状态,所以,一键诊断只能展示TaskManager组件状态,不会诊断离线存储,也无法诊断离线job的执行错误,离线job诊断见[离线SQL执行](#离线)。 + +如果诊断报告认为集群健康,但仍然无法解决问题,请提供错误和诊断报告给我们。 + +## 创建OpenMLDB与连接 首先,我们建议不熟悉分布式多进程管理的新手使用docker创建OpenMLDB,方便快速上手。熟悉OpenMLDB各组件之后,再尝试分布式部署。 @@ -24,19 +38,13 @@ docker创建OpenMLDB见[快速上手](./openmldb_quickstart.md),请注意文 如果你需要保留已有的在线表,**不要主动地kill全部Tablet再重启**,保证Tablet只有单台在上下线。`stop-all.sh`和`start-all.sh`脚本是给快速重建集群用的,可能会导致在线表数据恢复失败,**不保证能修复**。 -当你发现进程变化或者操作其变化后,需要使用诊断工具进行诊断,确认集群状态是否正常。最常用的两个命令是: +当你发现进程变化或者主动操作其变化后,需要使用诊断工具进行诊断,确认集群状态是否正常: ```bash -openmldb_tool status # --diff hosts 可检查TaskManager等是否掉线,当然,你也可以手动判断 -openmldb_tool inspect online +openmldb_tool inspect # 主要命令 +openmldb_tool status --diff hosts # 可检查TaskManager等是否掉线,当然,你也可以手动判断 ``` -如果诊断出server offline,或是TaskManager等掉线,需要先尝试启动回来。如果启动失败,请查看对应日志,提供错误信息。 - -如果server都在线,但inspect online发现online表不正常。需要从以下几点排查: -- 是否是手动操作了stop all & start all,日志中是否包含`recovering data`信息? - - 如果过程中已经尝试过`recovering data`,但inspect结果仍然不正常,手动恢复的可能性较小 TODO - - 如果没有尝试过`recovering data`,参考下一步。 -- 尝试`recovering data`,命令参考[OpenMLDB运维工具](../maintain/openmldb_ops.md)。如果仍然不正常,请提供日志。 +如果诊断出server offline,或是TaskManager等掉线,需要先启动回来。如果启动失败,请查看对应日志,提供错误信息。如果诊断结果提示需要recoverdata,请参考[OpenMLDB运维工具](../maintain/openmldb_ops.md)执行recoverdata。如果recoverdata脚本提示recover失败,或recover成功后再次inpsect的结果仍然不正常,请提供日志给我们。 ## 源数据 @@ -125,12 +133,14 @@ create table t1(c1 int; 如果是集群离线命令,默认异步模式下,发送命令会得到job id的返回。可使用`show job `来查询job执行情况。 -离线job如果是异步SELECT(并不INTO保存结果),也不会将结果打印在客户端(同步SELECT将会打印结果)。可以通过`show joblog `来获得结果,结果中包含stdout和stderr两部分,stdout为查询结果,stderr为job运行日志。如果发现job failed或者其他状态,不符合你的预期,请仔细查看job运行日志。 +离线job如果是异步SELECT(并不INTO保存结果),也不会将结果打印在客户端,而同步SELECT将会打印结果到控制台。可以通过`show joblog `来获得结果,结果中包含stdout和stderr两部分,stdout为查询结果,stderr为job运行日志。如果发现job failed或者其他状态,不符合你的预期,请仔细查看job运行日志。 -```{note} -日志地址由taskmanager.properties的`job.log.path`配置,如果你改变了此配置项,需要到配置的目的地寻找日志。stdout日志默认在`/work/openmldb/taskmanager/bin/logs/job_x.log`,job运行日志默认在`/work/openmldb/taskmanager/bin/logs/job_x_error.log`(注意有error后缀), +离线job日志中可能有一定的干扰日志,用户可以使用`openmldb_tool inspect job --id x`进行日志的解析提取,帮助定位错误,更多信息请参考[诊断工具job检查](../maintain/diagnose.md#job-检查)。 + +如果taskmanager是yarn模式,而不是local模式,`job_x_error.log`中的信息会较少,只会打印异常。如果异常不直观,需要更早时间的执行日志,执行日志不在`job_x_error.log`中,需要通过`job_x_error.log`中记录的yarn app id,去yarn系统中查询yarn app的container的日志。yarn app container里,执行日志也保存在stderr中。 -如果taskmanager是yarn模式,而不是local模式,`job_x_error.log`中的信息会较少,不会有job错误的详细信息。需要通过`job_x_error.log`中记录的yarn app id,去yarn系统中查询job的真正错误原因,需要查询到某application中主container的stderr日志。 +```{note} +如果你无法通过show joblog获得日志,或者想要直接拿到日志文件,可以直接在TaskManager机器上获取。日志地址由taskmanager.properties的`job.log.path`配置,如果你改变了此配置项,需要到配置的目录中寻找日志。stdout查询结果默认在`/work/openmldb/taskmanager/bin/logs/job_x.log`,stderr job运行日志默认在`/work/openmldb/taskmanager/bin/logs/job_x_error.log`(注意有error后缀)。 ``` #### 在线 diff --git a/python/openmldb_tool/README.md b/python/openmldb_tool/README.md index 3381751edf9..d5168a4bf25 100644 --- a/python/openmldb_tool/README.md +++ b/python/openmldb_tool/README.md @@ -48,21 +48,27 @@ status [-h] [--helpfull] [--diff DIFF] optional arguments: -h, --help show this help message and exit --helpfull show full help message and exit - --diff check if all endpoints in conf are in cluster. If set, need to set `--conf_file` + --diff check if all endpoints in conf are in cluster. If set, need to set `-f,--conf_file` ``` Use `show components` to show servers(no apiserver now). +--conn: +- ping all servers, brpc /health to check ok,and +- online servers version and cost time, we can get from brpc http:///version. (ns,tablet, apiserver set_version in brpc server) + TODO: -- ping all servers, brpc /health to check ok -- online servers version, we can get from brpc http:///version. (ns,tablet, apiserver set_version in brpc server) - brpc /flags to get all gflags(including openmldb), `--enable_flags_service=true` required ## Inspect -Use `show table status like '%';` in all dbs, even the hidden db(system db). +`inspect` for full report, no offline diag now. + +inspect online: Use `show table status like '%';` in all dbs, even the hidden db(system db). + +inspect offline: failed jobs, no more info. TODO: check register table? -If you found some online tables are not behaving properly, do inspect online. +inspect job: full support of offline job, select jobs, parse job log ## Test diff --git a/python/openmldb_tool/diagnostic_tool/diagnose.py b/python/openmldb_tool/diagnostic_tool/diagnose.py index 8bd67719489..21ee2961421 100644 --- a/python/openmldb_tool/diagnostic_tool/diagnose.py +++ b/python/openmldb_tool/diagnostic_tool/diagnose.py @@ -31,13 +31,15 @@ import diagnostic_tool.server_checker as checker from diagnostic_tool.table_checker import TableChecker from diagnostic_tool.parser import LogParser +from .inspect import server_ins, table_ins, partition_ins, ops_ins, ops_hint, inspect_hint +from .rpc import RPC from absl import app from absl import flags from absl.flags import argparse_flags from absl import logging # --verbosity --log_dir -# only some sub cmd needs dist file +# only some sub cmd needs dist file TODO(hw): better to move then to other py file, to avoid -h show them flags.DEFINE_string( "conf_file", "", @@ -81,7 +83,7 @@ def status(args): # --diff with dist conf file, conf_file is required if args.diff: - assert flags.FLAGS.conf_file, "need --conf_file" + assert flags.FLAGS.conf_file, "need -f,--conf_file" print( "only check components in conf file, if cluster has more components, ignore them" ) @@ -96,39 +98,56 @@ def status(args): def inspect(args): - insepct_online(args) - inspect_offline(args) + # report all + # 1. server level + connect = Connector() + status_checker = checker.StatusChecker(connect) + server_map = status_checker._get_components() + offlines = server_ins(server_map) + + # 3. ns ops level, but show only if has unhealthy tables, so hint later + last_one, should_warn, related_ops = ops_ins(connect) + + # 2. partition level: show unhealthy tables and get some hints about table + hints = partition_ins(server_map, related_ops) + if hints: + # show 3 here + ops_hint(last_one, should_warn) + # 4. hint + # let user know what to do + # 1) start offline servers + # 2) let user know the warning table is fatal or not, related ops, warn if offset is too large + # 3) if table not healthy and no related ops, use recoverdata + inspect_hint(offlines, hints) def insepct_online(args): - """show table status""" - conn = Connector() + """inspect online""" + connect = Connector() # scan all db include system db - fails = [] - rs = conn.execfetch("show table status like '%';") - rs.sort(key=lambda x: x[0]) - print(f"inspect {len(rs)} online tables(including system tables)") - for t in rs: + fails = table_ins(connect) + for t in fails: if t[13]: print(f"unhealthy table {t[2]}.{t[1]}:\n {t[:13]}") # sqlalchemy truncated ref https://github.com/sqlalchemy/sqlalchemy/commit/591e0cf08a798fb16e0ee9b56df5c3141aa48959 # so we print warnings alone print(f"full warnings:\n{t[13]}") - fails.append(f"{t[2]}.{t[1]}") - - assert not fails, f"unhealthy tables: {fails}" - print(f"all tables are healthy") + # if has fails, summary will print in table_ins + if not fails: + print(f"all tables are healthy") if getattr(args, "dist", False): - table_checker = TableChecker(conn) - table_checker.check_distribution(dbs=flags.FLAGS.db.split(",")) + table_checker = TableChecker(connect) + dbs = flags.FLAGS.db + db_list = dbs.split(",") if dbs else None + table_checker.check_distribution(dbs=db_list) def inspect_offline(args): """scan jobs status, show job log if failed""" final_failed = ["failed", "killed", "lost"] total, num, jobs = _get_jobs(final_failed) - # TODO some failed jobs are known, what if we want skip them? + # TODO some failed jobs are known or too old, what if we want skip them? print(f"inspect {total} offline jobs") if num: failed_jobs_str = "\n".join(jobs) @@ -241,7 +260,6 @@ def rpc(args): tm: taskmanager""" ) return - from diagnostic_tool.rpc import RPC # use status connction to get version conns_with_version = { @@ -301,7 +319,7 @@ def parse_arg(argv): status_parser.add_argument( "--diff", action="store_true", - help="check if all endpoints in conf are in cluster. If set, need to set `--conf_file`", + help="check if all endpoints in conf are in cluster. If set, need to set `-f,--conf_file`", ) # TODO action support in all python 3.x? status_parser.add_argument( "--conn", @@ -313,10 +331,10 @@ def parse_arg(argv): # sub inspect inspect_parser = subparsers.add_parser( "inspect", - help="Inspect online and offline. Use `inspect [online/offline]` to inspect one.", + help="Get full inspect report, --nocolor for batch mode, --table_width for partition tables display", ) - # inspect online & offline inspect_parser.set_defaults(command=inspect) + inspect_sub = inspect_parser.add_subparsers() # inspect online online = inspect_sub.add_parser("online", help="only inspect online table.") @@ -325,7 +343,9 @@ def parse_arg(argv): "--dist", action="store_true", help="Inspect online distribution." ) # inspect offline - offline = inspect_sub.add_parser("offline", help="only inspect offline jobs.") + offline = inspect_sub.add_parser( + "offline", help="only inspect offline jobs, show failed jobs." + ) offline.set_defaults(command=inspect_offline) # inspect job ins_job = inspect_sub.add_parser( diff --git a/python/openmldb_tool/diagnostic_tool/inspect.py b/python/openmldb_tool/diagnostic_tool/inspect.py new file mode 100644 index 00000000000..288f819bb78 --- /dev/null +++ b/python/openmldb_tool/diagnostic_tool/inspect.py @@ -0,0 +1,410 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2021 4Paradigm +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" gen multi-level readable reports for cluster devops """ +from absl import flags +import json +from collections import defaultdict +from prettytable import PrettyTable + +from .rpc import RPC + +# ANSI escape codes +flags.DEFINE_bool("nocolor", False, "disable color output", short_name="noc") +flags.DEFINE_integer( + "table_width", + 12, + "max columns in one row, 1 partition use r+1 cols, set k*(r+1)", + short_name="tw", +) +flags.DEFINE_integer( + "offset_diff_thresh", 100, "offset diff threshold", short_name="od" +) + +# color: red, green +RED = "\033[31m" +GREEN = "\033[32m" +BLUE = "\033[34m" +YELLOW = "\033[1;33m" +RESET = "\033[0m" + + +# switch by nocolor flag +def cr_print(color, obj): + if flags.FLAGS.nocolor or color == None: + print(obj) + else: + print(f"{color}{obj}{RESET}") + + +def server_ins(server_map): + print("\n\nServer Detail") + print(server_map) + offlines = [] + for component, value_list in server_map.items(): + for endpoint, status in value_list: + if status != "online": + offlines.append(f"[{component}]{endpoint}") + continue # offline tablet is needlessly to rpc + if offlines: + s = "\n".join(offlines) + cr_print(RED, f"offline servers:\n{s}") + else: + cr_print(GREEN, "all servers online (no backup tm and apiserver)") + return offlines + + +# support nocolor +def light(color, symbol, detail): + if flags.FLAGS.nocolor: + return f"{symbol} {detail}" + else: + return f"{color}{symbol}{RESET} {detail}" + + +def state2light(state): + state = state.ljust(15) # state str should be less than 15 + if not state.startswith("k"): + # meta mismatch status, all red + return light(RED, "X", state) + else: + # meta match, get the real state + state = state[1:] + if state.startswith("TableNormal"): + # green + return light(GREEN, "O", state) + else: + # ref https://github.com/4paradigm/OpenMLDB/blob/0462f8a9682f8d232e8d44df7513cff66870d686/tools/tool.py#L291 + # undefined is loading too: state == "kTableLoading" or state == "kTableUndefined" + # snapshot doesn't mean unhealthy: state == "kMakingSnapshot" or state == "kSnapshotPaused" + return light(YELLOW, "=", state) + + +# similar with `show table status` warnings field, but easier to read +# prettytable.colortable just make table border and header lines colorful, so we color the value +def check_table_info(t, replicas_on_tablet, tablet2idx): + pnum, rnum = t["partition_num"], t["replica_num"] + assert pnum == len(t["table_partition"]) + # multi-line for better display, max display columns in one row + valuable_cols = pnum * (rnum + 1) + display_width = min(flags.FLAGS.table_width, valuable_cols) + # if real multi-line, the last line may < width, padding with empty string + rest = valuable_cols % display_width + total_cols = valuable_cols + (0 if rest == 0 else display_width - rest) + + idx_row = [""] * total_cols + leader_row = [""] * total_cols + followers_row = [""] * total_cols + + table_mark = 0 + hint = "" + for i, p in enumerate(t["table_partition"]): + # each partition add 3 row, and rnum + 1 columns + # tablet idx pid | 1 | 4 | 5 + # leader 1 o + # followers o o + pid = p["pid"] + assert pid == i + + # sort by list tablets + replicas = [] + for r in p["partition_meta"]: + tablet = r["endpoint"] + # tablet_has_partition useless + # print(r["endpoint"], r["is_leader"], r["tablet_has_partition"]) + replicas_on_t = replicas_on_tablet[t["tid"]][p["pid"]] + # may can't find replica on tablet, e.g. tablet server is not ready + info_on_tablet = {} + if r["endpoint"] not in replicas_on_t: + info_on_tablet = {"state": "Miss", "mode": "Miss", "offset": -1} + else: + info_on_tablet = replicas_on_t[r["endpoint"]] + # print(info_on_tablet) + m = { + "role": "leader" if r["is_leader"] else "follower", + "state": info_on_tablet["state"], + "acrole": info_on_tablet["mode"], + "offset": info_on_tablet["offset"], + } + replicas.append((tablet2idx[tablet], m)) + + assert len(replicas) == rnum + replicas.sort(key=lambda x: x[0]) + leader_ind = [i for i, r in enumerate(replicas) if r[1]["role"] == "leader"] + # replica on offline tablet is still in the ns meta, so leader may > 1 + # assert len(ind) <= 1, f"should be only one leader or miss leader in {replicas}" + + # show partition idx and tablet server idx + cursor = i * (rnum + 1) + idx_row[cursor : cursor + rnum + 1] = ["p" + str(pid)] + [ + r[0] for r in replicas + ] + + # fulfill leader line + if leader_ind: + for leader in leader_ind: + # leader state + lrep = replicas[leader][1] + if lrep["state"] != "Miss" and lrep["acrole"] != "kTableLeader": + lrep["state"] = "NotLeaderOnT" # modify the state + leader_row[cursor + leader + 1] = state2light(lrep["state"]) + else: + # can't find leader in nameserver metadata, set in the first column(we can't find leader on any tablet) + leader_row[cursor] = state2light("NotFound") + + # fulfill follower line + for i, r in enumerate(replicas): + idx = cursor + i + 1 + if i in leader_ind: + continue + frep = r[1] + if frep["state"] != "Miss" and frep["acrole"] != "kTableFollower": + frep["state"] = "NotFollowerOnT" + followers_row[idx] = state2light(frep["state"]) + + # after state adjust, diag table + replicas = [r[1] for r in replicas] # tablet server is needless now + # fatal: leader replica is not normal, may read/write fail + # get one normal leader, the partition can work + if not leader_ind or not any( + [replicas[i]["state"] == "kTableNormal" for i in leader_ind] + ): + table_mark = max(4, table_mark) + hint += f"partition {pid} leader replica is not normal\n" + # warn: need repair(may auto repair by auto_failover), but not in emergency + # follower replica is not normal + if any([r["state"] != "kTableNormal" for r in replicas]): + table_mark = max(3, table_mark) + hint += f"partition {pid} has unhealthy replicas\n" + + # offset is not consistent, only check normal replicas + offsets = [r["offset"] for r in replicas if r["state"] == "kTableNormal"] + if offsets and max(offsets) - min(offsets) > flags.FLAGS.offset_diff_thresh: + table_mark = max(3, table_mark) + hint += ( + f"partition {pid} has offset diff > {flags.FLAGS.offset_diff_thresh}\n" + ) + + x = PrettyTable(align="l") + + x.field_names = [i for i in range(display_width)] + step = display_width + for i in range(0, len(idx_row), step): + x.add_row(idx_row[i : i + step]) + x.add_row(leader_row[i : i + step]) + x.add_row(followers_row[i : i + step], divider=True) + + table_summary = "" + if table_mark >= 4: + table_summary = light( + RED, + "X", + f"Fatal table {t['db']}.{t['name']}, read/write may fail, need repair immediately", + ) + elif table_mark >= 3: + table_summary = light( + YELLOW, "=", f"Warn table {t['db']}.{t['name']}, still work, but need repair" + ) + if table_summary: + table_summary += "\n" + hint + return x, table_summary + + +def show_table_info(t, replicas_on_tablet, tablet2idx): + """check table info and display for ut""" + print( + f"Table {t['tid']} {t['db']}.{t['name']} {t['partition_num']} partitions {t['replica_num']} replicas" + ) + table, _ = check_table_info(t, replicas_on_tablet, tablet2idx) + print(table.get_string(border=True, header=False)) + + +def table_ins(connect): + print("\n\nTable Healthy Detail") + rs = connect.execfetch("show table status like '%';") + rs.sort(key=lambda x: x[0]) + print(f"summary: {len(rs)} tables(including system tables)") + warn_tables = [] + for t in rs: + # any warning means unhealthy, partition_unalive may be 0 but already unhealthy, warnings is accurate? + if t[13]: + warn_tables.append(t) + if warn_tables: + # only show tables name + s = "\n".join([f"{t[2]}.{t[1]}" for t in warn_tables]) + cr_print(RED, f"unhealthy tables:\n{s}") + else: + cr_print(GREEN, "all tables are healthy") + return warn_tables + + +def partition_ins(server_map, related_ops): + print("\n\nTable Partition Detail") + # ns table info + rpc = RPC("ns") + res = rpc.rpc_exec("ShowTable", {"show_all": True}) + if not res: + cr_print(RED, "get table info failed or empty from nameserver") + return + res = json.loads(res) + all_table_info = res["table_info"] + + # get table info from tablet server + # >> + replicas = defaultdict(lambda: defaultdict(dict)) + tablets = server_map["tablet"] # has status + invalid_tablets = set() + for tablet, status in tablets: + if status == "offline": + invalid_tablets.add(tablet) + continue + # GetTableStatusRequest empty field means get all + rpc = RPC(tablet) + res = None + try: + res = json.loads(rpc.rpc_exec("GetTableStatus", {})) + except Exception as e: + print(f"rpc {tablet} failed") + # may get empty when tablet server is not ready + if not res or res["code"] != 0: + cr_print(RED, f"get table status failed or empty from {tablet}(online)") + invalid_tablets.add(tablet) + continue + if "all_table_status" not in res: + # just empty replica on tablet, skip + continue + for rep in res["all_table_status"]: + rep["tablet"] = tablet + # tid, pid are int + tid, pid = rep["tid"], rep["pid"] + replicas[tid][pid][tablet] = rep + + tablet2idx = {tablet[0]: i + 1 for i, tablet in enumerate(tablets)} + print(f"tablet server order: {tablet2idx}") + if invalid_tablets: + cr_print( + RED, + f"some tablet servers are offline/bad, can't get table info(exclude empty table server): {invalid_tablets}", + ) + + # display, depends on table info, replicas are used to check + all_table_info.sort(key=lambda x: x["tid"]) + # related op map + related_ops_map = {} + for op in related_ops: + db = op[9] + table = op[10] + if db not in related_ops_map: + related_ops_map[db] = {} + if table not in related_ops_map[db]: + related_ops_map[db][table] = [] + related_ops_map[db][table].append(op) + # print(f"related ops: {related_ops_map}") + print("") # for better display + diag_result = [] + for t in all_table_info: + # no need to print healthy table + table, diag_hint = check_table_info(t, replicas, tablet2idx) + if diag_hint: + print( + f"Table {t['tid']} {t['db']}.{t['name']} {t['partition_num']} partitions {t['replica_num']} replicas" + ) + print(table.get_string(header=False)) + if t["db"] in related_ops_map and t["name"] in related_ops_map[t["db"]]: + diag_hint += f"related op: {sorted(related_ops_map[t['db']][t['name']], key=lambda x: x[11])}" # 11 is pid + diag_result.append(diag_hint) + # comment for table info display, only for unhealthy table TODO: draw a example + if diag_result: + print( + """ +Example: +tablet server order: {'xxx': 1, 'xxx': 2, 'xxx': 3} -> get real tablet addr by idx ++----+-------------------+------------------+------------------+ +| p0 | 1 | 2 | 3 | -> p0: partition 0, 1-3: tablet server idx +| | [light] status | | | -> leader replica is on tablet 1 +| | | [light] status | [light] status | -> follower replicas are on tablet 2, 3 ++----+-------------------+------------------+------------------+ +light: +Green O -> OK +Yellow = -> replica meta is ok but state is not normal +Red X -> NotFound/Miss/NotFollowerOnT/NotLeaderOnT""" + ) + return diag_result + + +def ops_ins(connect): + # op sorted by id TODO: detail to show all include succ op? + rs = connect.execfetch("show jobs from NameServer;") + should_warn = [] + from datetime import datetime + # already in order + ops = [list(op) for op in rs] + for i in range(len(ops)): + op = ops[i] + op[3] = str(datetime.fromtimestamp(int(op[3]) / 1000)) if op[4] else "..." + op[4] = str(datetime.fromtimestamp(int(op[4]) / 1000)) if op[4] else "..." + if op[2] != "FINISHED": + should_warn.append(op) + + recover_type = ["kRecoverTableOP", "kChangeLeaderOP", "kReAddReplicaOP", "kOfflineReplicaOP"] + related_ops = [ + op + for op in should_warn + if op[1] in recover_type and op[2] in ["Submitted", "RUNNING"] + ] + return ops[-1] if ops else None, should_warn, related_ops + +def ops_hint(last_one, should_warn): + print("\n\nOps Detail") + print("> failed ops do not mean cluster is unhealthy, just for reference") + # peek last one to let user know if cluster has tried to recover, or we should wait + if last_one: + print("last one op(check time): ", last_one) + else: + print("no ops in nameserver") + if not should_warn: + print("all nameserver ops are finished") + else: + print("last 10 ops != finished:") + print(*should_warn[-10:], sep="\n") + +def inspect_hint(server_hint, table_hints): + print( + """ + +================== +Summary & Hint +================== +Server: +""" + ) + if server_hint: + cr_print(RED, f"offline servers {server_hint}, restart them first") + else: + cr_print(GREEN, "all servers online") + print("\nTable:\n") + for h in table_hints: + print(h) + if table_hints: + print( + """ + Make sure all servers online, and no ops for the table is running. + Repair table manually, run recoverdata, check https://openmldb.ai/docs/zh/main/maintain/openmldb_ops.html. + Check 'Table Partitions Detail' above for detail. + """ + ) + else: + cr_print(GREEN, "all tables are healthy") diff --git a/python/openmldb_tool/diagnostic_tool/pb.py b/python/openmldb_tool/diagnostic_tool/pb.py new file mode 100644 index 00000000000..06219d00b61 --- /dev/null +++ b/python/openmldb_tool/diagnostic_tool/pb.py @@ -0,0 +1,118 @@ +# Copyright 2021 4Paradigm +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from google.protobuf.descriptor import Descriptor, FieldDescriptor +from absl import flags + + +class DescriptorHelper: + def __init__(self, service): + # lazy import + assert flags.FLAGS.pbdir, "pbdir not set" + import sys + from pathlib import Path + + sys.path.append(Path(flags.FLAGS.pbdir).as_posix()) + import tablet_pb2 + import name_server_pb2 + import taskmanager_pb2 + + # google.protobuf.symbol_database can get service desc by name, but we have already included all pb2 files we need + # just use one file + pb_map = { + "TabletServer": tablet_pb2, + "NameServer": name_server_pb2, + "TaskManagerServer": taskmanager_pb2, + # "ApiServer": api_server_pb2, + # "DataSync": data_sync_pb2, + } + self.descriptor = pb_map[service].DESCRIPTOR.services_by_name[service] + # from google.protobuf import symbol_database + # self.sym_db = symbol_database.Default() + + def get_input_json(self, method): + m = self.descriptor.FindMethodByName(method) + if not m: + return False, f"method {method} not found" + if not m.input_type.fields: # e.g. ShowTabletRequest is emtpy + return False, f"method {method} has no input" + # GeneratedProtocolMessageType __dict__ is complex, can't use it directly + # cl = self.sym_db.GetSymbol(m.input_type.full_name) + + # fields build a map, message is Descriptor, fields in msg is FieldDescriptor + return True, Field.to_json(m.input_type) + + +class Field: + def to_str(typ): + typ2str = { + FieldDescriptor.TYPE_DOUBLE: "double", + FieldDescriptor.TYPE_FLOAT: "float", + FieldDescriptor.TYPE_INT64: "int64", + FieldDescriptor.TYPE_UINT64: "uint64", + FieldDescriptor.TYPE_INT32: "int32", + FieldDescriptor.TYPE_FIXED64: "fixed64", + FieldDescriptor.TYPE_FIXED32: "fixed32", + FieldDescriptor.TYPE_BOOL: "bool", + FieldDescriptor.TYPE_STRING: "string", + FieldDescriptor.TYPE_GROUP: "group", + FieldDescriptor.TYPE_MESSAGE: "message", + FieldDescriptor.TYPE_BYTES: "bytes", + FieldDescriptor.TYPE_UINT32: "uint32", + } + return typ2str[typ] + + def to_json(field): + # label optional, required, or repeated. + label = {1: "optional", 2: "required", 3: "repeated"} + + def is_map(f): + # I'm a map(containing_type = who includes me and my fields name are key-value) + # e.g. tm RunBatchSql --hint the conf field is map + return f.containing_type and [ff.name for ff in f.fields] == [ + "key", + "value", + ] + + if isinstance(field, FieldDescriptor): + if field.message_type: + # message_type is a Descriptor, check if it's a map + if is_map(field.message_type): + m = field.message_type + # treat key-value as map type, can't figure out custom type, no nested, so just generate here + return { + f"<{m.fields[0].name}>": f"<{m.fields[1].name}>", + "...": "...", + } + else: + # normal nested message + return Field.to_json(field.message_type) + elif field.type == FieldDescriptor.TYPE_ENUM: + return "/".join([n.name for n in field.enum_type.values]) + else: + return f"<{Field.to_str(field.type)}>" + + elif isinstance(field, Descriptor): + d = {} + for f in field.fields: + # each one is FieldDescriptor + # map is repeated too, but it's not a list + if f.label == 3 and not is_map(f.message_type): + # json list style + d[f"({label[f.label]})" + f.name] = [Field.to_json(f), "..."] + else: + d[f"({label[f.label]})" + f.name] = Field.to_json(f) + return d + else: + raise ValueError(f"unknown type {type(field)}") diff --git a/python/openmldb_tool/diagnostic_tool/rpc.py b/python/openmldb_tool/diagnostic_tool/rpc.py index 686734e7641..8e3f8efc660 100644 --- a/python/openmldb_tool/diagnostic_tool/rpc.py +++ b/python/openmldb_tool/diagnostic_tool/rpc.py @@ -12,103 +12,44 @@ # See the License for the specific language governing permissions and # limitations under the License. -from absl import flags import json import requests -from bs4 import BeautifulSoup -from google.protobuf.descriptor import FieldDescriptor from .server_checker import StatusChecker from .connector import Connector +from absl import flags + +# used by pb.py but set here for simplicity, we will check pbdir before call hint(import pb) flags.DEFINE_string( "pbdir", "/tmp/diag_cache", "pb2 root dir, if not set, will use the /pb2 directory in the same directory as this script", ) +def validate_ip_address(ip_string): + return not any(c.isalpha() for c in ip_string) -class DescriptorHelper: - def __init__(self, service): - # TODO(hw): symbol_database is useful? - # lazy import - assert flags.FLAGS.pbdir, "pbdir not set" - import sys - from pathlib import Path - sys.path.append(Path(flags.FLAGS.pbdir).as_posix()) - import tablet_pb2 - import name_server_pb2 - import taskmanager_pb2 - - pb_map = { - "TabletServer": tablet_pb2, - "NameServer": name_server_pb2, - "TaskManagerServer": taskmanager_pb2, - # "ApiServer": api_server_pb2, - # "DataSync": data_sync_pb2, - } - self.descriptor = pb_map[service].DESCRIPTOR.services_by_name[service] - - def get_input_json(self, method): - inp = self.descriptor.FindMethodByName(method).input_type - return Field.to_json(inp) - - -class Field: - def to_str(typ): - typ2str = { - FieldDescriptor.TYPE_DOUBLE: "double", - FieldDescriptor.TYPE_FLOAT: "float", - FieldDescriptor.TYPE_INT64: "int64", - FieldDescriptor.TYPE_UINT64: "uint64", - FieldDescriptor.TYPE_INT32: "int32", - FieldDescriptor.TYPE_FIXED64: "fixed64", - FieldDescriptor.TYPE_FIXED32: "fixed32", - FieldDescriptor.TYPE_BOOL: "bool", - FieldDescriptor.TYPE_STRING: "string", - FieldDescriptor.TYPE_GROUP: "group", - FieldDescriptor.TYPE_MESSAGE: "message", - FieldDescriptor.TYPE_BYTES: "bytes", - FieldDescriptor.TYPE_UINT32: "uint32", - } - return typ2str[typ] - - def to_json(field): - # label optional, required, or repeated. - label = {1: "optional", 2: "required", 3: "repeated"} - if isinstance(field, FieldDescriptor): - key = f"({label[field.label]})" + field.name - if field.type == FieldDescriptor.TYPE_MESSAGE: - value = Field.to_json(field.message_type) - elif field.type == FieldDescriptor.TYPE_ENUM: - value = "/".join([n.name for n in field.enum_type.values]) - else: - value = Field.to_str(field.type) - if field.label == 3: - # json list style - return {key: [value, "..."]} - else: - return {key: value} - else: - # field is a message - if field.containing_type and [f.name for f in field.fields] == [ - "key", - "value", - ]: - # treat key-value as map type, can't figure out custom type - # TODO(hw): it's ok to pass a json list to proto map? - return {"k": "v", "...": "..."} - d = {} - for f in field.fields: - d.update(Field.to_json(f)) - return d + +host2service = { + "nameserver": "NameServer", + "taskmanager": "openmldb.taskmanager.TaskManagerServer", + "tablet": "TabletServer", +} class RPC: """rpc service""" def __init__(self, host) -> None: - self.host, self.endpoint, self.service = RPC.get_endpoint_service(host.lower()) + if validate_ip_address(host): + self.endpoint = host + self.host = "tablet" # TODO: you can get ns/tm by name, it's not necessary to input ip + self.service = host2service[self.host] + else: + self.host, self.endpoint, self.service = RPC.get_endpoint_service( + host.lower() + ) def rpc_help(self): if self.host == "taskmanager": @@ -123,26 +64,31 @@ def rpc_exec(self, operation, field): ) return r.text - def hint(self, info): - if not info: + def hint(self, method): + if not method: # show service name and all rpc methods print(self.rpc_help()) return - # input message to json style - # if taskmanager, service in pb2 is TaskManagerServer service = ( self.service if not self.service.endswith("TaskManagerServer") else "TaskManagerServer" ) + from .pb import DescriptorHelper - helper = DescriptorHelper(service) - json_str = json.dumps(helper.get_input_json(info), indent=4) + ok, input_json = DescriptorHelper(service).get_input_json(method) + if not ok: + print(input_json) # if not ok, it's message + return + # input message to json style + json_str = json.dumps(input_json, indent=4) print( - f"You should input json like this, ignore round brackets in the key and double quotation marks in the value: --field '{json_str}'" + f"You should input json like this:\n --field '{json_str}'" ) + print("ignore round brackets in the key, e.g. (required)") + print('"<>" shows the data type, e.g. "" means you should set string') def search_in(self, typ, info): for item in typ: @@ -168,14 +114,12 @@ def get_endpoint_service(host): host = "nameserver" if host == "ns" else "taskmanager" assert host in components_map, f"{host} not found in cluster" endpoint = components_map[host][num][0] - host2service = { - "nameserver": "NameServer", - "taskmanager": "openmldb.taskmanager.TaskManagerServer", - "tablet": "TabletServer", - } + service = host2service[host] return host, endpoint, service def parse_html(html): + from bs4 import BeautifulSoup + soup = BeautifulSoup(html, "html.parser") return soup.get_text("\n") diff --git a/python/openmldb_tool/diagnostic_tool/table_checker.py b/python/openmldb_tool/diagnostic_tool/table_checker.py index 969e7d110e4..f9703054d5c 100644 --- a/python/openmldb_tool/diagnostic_tool/table_checker.py +++ b/python/openmldb_tool/diagnostic_tool/table_checker.py @@ -24,11 +24,11 @@ class TableChecker: def __init__(self, conn: Connector): self.conn = conn - def check_distribution(self, dbs: list): + def check_distribution(self, dbs: list = None): exist_dbs = [db[0] for db in self.conn.execfetch("SHOW DATABASES")] if not exist_dbs: return - if dbs == ['']: + if not dbs or len(dbs) == 0: dbs = exist_dbs assert all([db in exist_dbs for db in dbs]), "some databases are not exist" @@ -36,67 +36,87 @@ def check_distribution(self, dbs: list): url = f"http://{ns_leader}/NameServer/ShowTable" res = requests.get(url, json={"show_all": True}) tables = res.json()["table_info"] - + if not tables or len(tables) == 0: + print("no table") + return tablet2partition = {} tablet2count = {} tablet2mem = {} tablet2dused = {} table_infos = [] - max_values = {'mp': 0, 'mc': 0, 'mm': 0, 'md': 0} + max_values = {"mp": 0, "mc": 0, "mm": 0, "md": 0} for table in tables: - if table['db'] not in dbs: + if table["db"] not in dbs: continue t = {} - t['name'] = table['db'] + "." + table['name'] - parts = table['table_partition'] - part_dist = self._collect(parts, '') - count_dist = self._collect(parts, 'record_cnt') - mem_dist = self._collect(parts, 'record_byte_size') - dused_dist = self._collect(parts, 'diskused') - max_values['mp'] = max(max_values['mp'], *part_dist.values()) - max_values['mc'] = max(max_values['mc'], *count_dist.values()) - max_values['mm'] = max(max_values['mm'], *mem_dist.values()) - max_values['md'] = max(max_values['md'], *dused_dist.values()) - t['part_size'] = len(parts) - t['part_dist'] = part_dist - t['count_dist'] = count_dist - t['mem_dist'] = mem_dist - t['dused_dist'] = dused_dist + t["name"] = table["db"] + "." + table["name"] + parts = table["table_partition"] + part_dist = self._collect(parts, "") + count_dist = self._collect(parts, "record_cnt") + mem_dist = self._collect(parts, "record_byte_size") + dused_dist = self._collect(parts, "diskused") + t["part_size"] = len(parts) + t["part_dist"] = part_dist + t["count_dist"] = count_dist + t["mem_dist"] = mem_dist + t["dused_dist"] = dused_dist table_infos.append(t) self._add_merge(tablet2partition, part_dist) self._add_merge(tablet2count, count_dist) self._add_merge(tablet2mem, mem_dist) self._add_merge(tablet2dused, dused_dist) - max_values['mm'] = round(max_values['mm'] / 1024 / 1024, 4) - max_values['md'] = round(max_values['md'] / 1024 / 1024, 4) + def get_max(di): + return max(di.values()) + + max_values["mp"] = get_max(tablet2partition) + max_values["mc"] = get_max(tablet2count) + max_values["mm"] = round(get_max(tablet2mem) / 1024 / 1024, 4) + max_values["md"] = round(get_max(tablet2dused) / 1024 / 1024, 4) + max_width = 40 for t in table_infos: print() - print(t['name']) - print('partition size:', t['part_size']) - print('partition dist(include replica)') - self._show_dist(t['part_dist'], max_width=max_width * max(*t['part_dist'].values()) / max_values['mp']) - print('record count dist(include replica)') - self._show_dist(t['count_dist'], max_width=0 if max_values['mc'] == 0 else max_width * max(*t['count_dist'].values()) / max_values['mc']) - print('mem dist(include replica)(MB)') - self._byte2mb(t['mem_dist']) - self._show_dist(t['mem_dist'], max_width=0 if max_values['mm'] == 0 else max_width * max(*t['mem_dist'].values()) / max_values['mm']) - print('diskused dist(include replica)(MB)') - self._byte2mb(t['dused_dist']) - self._show_dist(t['dused_dist'], max_width=max_width * max(*t['dused_dist'].values()) / max_values['md']) + print(t["name"], "distribution") + print("partition size:", t["part_size"]) + print("partition dist(include replica)") + self._show_dist( + t["part_dist"], + max_width=max_width * get_max(t["part_dist"]) / max_values["mp"], + ) + print("record count dist(include replica)") + self._show_dist( + t["count_dist"], + max_width=0 + if max_values["mc"] == 0 + else max_width * get_max(t["count_dist"]) / max_values["mc"], + ) + print("mem dist(include replica)(MB)") + self._byte2mb(t["mem_dist"]) + self._show_dist( + t["mem_dist"], + max_width=0 + if max_values["mm"] == 0 + else max_width * get_max(t["mem_dist"]) / max_values["mm"], + ) + print("diskused dist(include replica)(MB)") + self._byte2mb(t["dused_dist"]) + self._show_dist( + t["dused_dist"], + max_width=max_width * get_max(t["dused_dist"]) / max_values["md"], + ) print() - print('total') - print('tablet2partition') + print("tablet server load distribution") + print("tablet2partition") self._show_dist(tablet2partition) - print('tablet2count') + print("tablet2count(row)") self._show_dist(tablet2count) - print('tablet2mem(MB)') + print("tablet2mem(MB)") self._byte2mb(tablet2mem) self._show_dist(tablet2mem) - print('tablet2diskused(MB)') + print("tablet2diskused(MB)") self._byte2mb(tablet2dused) self._show_dist(tablet2dused) @@ -106,16 +126,24 @@ def _byte2mb(self, dist: dict): def _show_dist(self, dist: dict, max_width=40): figc = tpl.figure() - figc.barh(list(dist.values()), labels=list(dist.keys()), max_width=max_width) + if not dist: # protect barh args + print("no data") + return + figc.barh( + list(dist.values()), + labels=list(dist.keys()), + max_width=max_width, + force_ascii=True, + ) figc.show() def _collect(self, parts, field): dist = {} for part in parts: - for replica in part['partition_meta']: - if replica['endpoint'] not in dist: - dist[replica['endpoint']] = 0 - dist[replica['endpoint']] += replica[field] if field else 1 + for replica in part["partition_meta"]: + if replica["endpoint"] not in dist: + dist[replica["endpoint"]] = 0 + dist[replica["endpoint"]] += replica[field] if field else 1 return dist def _add_merge(self, dist, dist2): diff --git a/python/openmldb_tool/setup.py b/python/openmldb_tool/setup.py index 7b9a8dcf27f..555e5b51153 100644 --- a/python/openmldb_tool/setup.py +++ b/python/openmldb_tool/setup.py @@ -28,7 +28,7 @@ "Programming Language :: Python :: 3", ], install_requires=[ - "openmldb >= 0.6.9", + "openmldb >= 0.8.1", "absl-py", "pyyaml", "paramiko", @@ -36,12 +36,12 @@ "requests", ], extras_require={ - "rpc": [ + "pb": [ "protobuf==3.6.1", "beautifulsoup4", ], "test": [ - "openmldb-tool[rpc]", + "openmldb-tool[pb]", "pytest", ], }, diff --git a/python/openmldb_tool/tests/inspect_test.py b/python/openmldb_tool/tests/inspect_test.py new file mode 100644 index 00000000000..6f5ece39c05 --- /dev/null +++ b/python/openmldb_tool/tests/inspect_test.py @@ -0,0 +1,355 @@ +import pytest +from diagnostic_tool.inspect import show_table_info +from absl import flags + +flags.FLAGS["nocolor"].parse(False) +flags.FLAGS["table_width"].parse(12) + + +def test_show(): + # assume 3 tablet server + tablets = ["0.0.0.0:1111", "0.0.0.0:2222", "0.0.0.0:3333"] + tablet2idx = {tablet: i + 1 for i, tablet in enumerate(tablets)} + # simple + t_info = { + "name": "TABLE_A", + "table_partition": [ + { + "pid": 0, + "partition_meta": [ + { + "endpoint": tablets[0], + "is_leader": True, + # "offset": 0, + # "record_cnt": 0, + # "record_byte_size": 0, + # "tablet_has_partition": true, + # "diskused": 9025, + }, + { + "endpoint": tablets[1], + "is_leader": False, + }, + ], + # "term_offset": [{"term": 1, "offset": 0}], + # "record_cnt": 0, + # "record_byte_size": 0, + # "diskused": 9025, + } + ], + "tid": 0, + "partition_num": 1, + "replica_num": 2, + "db": "DB_A", + } + + replicas = { + 0: { + 0: { + tablets[0]: { + "tid": 0, # actually not used in show_table_info + "pid": 0, # not used in show_table_info + "offset": 5, # check offset on tablet, not ns + "mode": "kTableLeader", + "state": "kTableNormal", + # "is_expire": True, + # "record_cnt": 1, + # "idx_cnt": 1, + # "ts_idx_status": [ + # {"idx_name": "id", "seg_cnts": [0, 0, 0, 0, 0, 0, 1, 0]} + # ], + "name": "Foo", + # "record_byte_size": 127, + # "record_idx_byte_size": 177, + # "record_pk_cnt": 1, + # "compress_type": "kNoCompress", + # "skiplist_height": 1, + # "diskused": 10074, + # "storage_mode": "kMemory", + "tablet": tablets[0], + }, + tablets[1]: { + "mode": "kTableFollower", + "state": "kTableNormal", + "offset": 0, + "tablet": tablets[1], + }, + } + } + } + + show_table_info(t_info, replicas, tablet2idx) + + print("healthy ns meta, but replicas on tablet are all follower") + t_info = { + "name": "TABLE_A", + "table_partition": [ + { + "pid": 0, + "partition_meta": [ + { + "endpoint": tablets[0], + "is_leader": True, + }, + { + "endpoint": tablets[1], + "is_leader": False, + }, + { + "endpoint": tablets[2], + "is_leader": False, + }, + ], + } + ], + "tid": 0, + "partition_num": 1, + "replica_num": 3, + "db": "DB_A", + } + replicas = { + 0: { + 0: { + tablets[0]: { + "mode": "kTableFollower", + "state": "kTableNormal", + "offset": 0, + "tablet": tablets[0], + }, + tablets[1]: { + "mode": "kTableFollower", + "state": "kTableNormal", + "offset": 0, + "tablet": tablets[1], + }, + tablets[2]: { + "mode": "kTableFollower", + "state": "kTableNormal", + "offset": 0, + "tablet": tablets[2], + }, + } + } + } + show_table_info(t_info, replicas, tablet2idx) + + print("ns meta all followers, no leader") + t_info = { + "name": "TABLE_A", + "table_partition": [ + { + "pid": 0, + "partition_meta": [ + { + "endpoint": tablets[0], + "is_leader": False, + }, + { + "endpoint": tablets[1], + "is_leader": False, + }, + { + "endpoint": tablets[2], + "is_leader": False, + }, + ], + } + ], + "tid": 0, + "partition_num": 1, + "replica_num": 3, + "db": "DB_A", + } + replicas = { + 0: { + 0: { + tablets[0]: { + "mode": "kTableLeader", + "state": "kTableNormal", + "offset": 0, + "tablet": tablets[0], + }, + tablets[1]: { + "mode": "kTableFollower", + "state": "kTableNormal", + "offset": 0, + "tablet": tablets[1], + }, + tablets[2]: { + "mode": "kTableFollower", + "state": "kTableNormal", + "offset": 0, + "tablet": tablets[2], + }, + } + } + } + show_table_info(t_info, replicas, tablet2idx) + + print("no corresponding replica on tablet server") + t_info = { + "name": "TABLE_A", + "table_partition": [ + { + "pid": 0, + "partition_meta": [ + { + "endpoint": tablets[0], + "is_leader": True, + }, + { + "endpoint": tablets[1], + "is_leader": False, + }, + { + "endpoint": tablets[2], + "is_leader": False, + }, + ], + } + ], + "tid": 0, + "partition_num": 1, + "replica_num": 3, + "db": "DB_A", + } + replicas = { + 0: { + 0: { + tablets[1]: { + "mode": "kTableFollower", + "state": "kTableNormal", + "offset": 0, + "tablet": tablets[1], + }, + tablets[2]: { + "mode": "kTableFollower", + "state": "kTableNormal", + "offset": 0, + "tablet": tablets[2], + }, + } + } + } + show_table_info(t_info, replicas, tablet2idx) + + print("meta match, but state is not normal") + t_info = { + "name": "TABLE_A", + "table_partition": [ + { + "pid": 0, + "partition_meta": [ + { + "endpoint": tablets[0], + "is_leader": True, + }, + { + "endpoint": tablets[1], + "is_leader": False, + }, + { + "endpoint": tablets[2], + "is_leader": False, + }, + ], + }, + { + "pid": 1, + "partition_meta": [ + { + "endpoint": tablets[0], + "is_leader": True, + }, + { + "endpoint": tablets[1], + "is_leader": False, + }, + { + "endpoint": tablets[2], + "is_leader": False, + }, + ], + }, + ], + "tid": 0, + "partition_num": 2, + "replica_num": 3, + "db": "DB_A", + } + replicas = { + 0: { + 0: { + tablets[0]: { + "mode": "kTableFollower", + "state": "kTableLoading", + "offset": 0, + "tablet": tablets[0], + }, + tablets[1]: { + "mode": "kTableFollower", + "state": "kMakingSnapshot", + "offset": 0, + "tablet": tablets[1], + }, + tablets[2]: { + "mode": "kTableFollower", + "state": "kSnapshotPaused", + "offset": 0, + "tablet": tablets[2], + }, + }, + 1: { + tablets[1]: { + "mode": "kTableFollower", + "state": "kTableUndefined", + "offset": 0, + }, + tablets[2]: { + "mode": "kTableFollower", + "state": "kTableNormal", + "offset": 0, + }, + }, + } + } + show_table_info(t_info, replicas, tablet2idx) + + print("more partitions, display well") + partnum = 13 + meta_pattern = { + "partition_meta": [ + { + "endpoint": tablets[0], + "is_leader": True, + }, + ], + } + t_info = { + "name": "TABLE_A", + "table_partition": [], + "tid": 0, + "partition_num": partnum, + "replica_num": 1, + "db": "DB_A", + } + replicas = {0: {}} + + for i in range(partnum): + t_info["table_partition"].append({"pid": i, **meta_pattern}) + + for i in range(partnum): + replicas[0][i] = { + tablets[0]: { + "mode": "kTableLeader", + "state": "kTableNormal", + "offset": 0, + "tablet": tablets[0], + } + } + print(t_info, replicas) + show_table_info(t_info, replicas, tablet2idx) + + print("nocolor") + flags.FLAGS["nocolor"].parse(True) + show_table_info(t_info, replicas, tablet2idx) diff --git a/src/proto/tablet.proto b/src/proto/tablet.proto index 2944794b0d9..0938c9d965c 100755 --- a/src/proto/tablet.proto +++ b/src/proto/tablet.proto @@ -829,7 +829,7 @@ message BulkLoadInfoResponse { required uint32 key = 1; // TODO(hw): java will use int, cpp uses uint32. Not good? required uint32 value = 2; } - repeated MapFieldEntry ts_idx_map = 2; // TODO(hw): proto3 supports map + repeated MapFieldEntry ts_idx_map = 2; // TODO(hw): proto3 can build map in proto2 syntax } repeated Segment segment = 1; } diff --git a/tools/openmldb_ops.py b/tools/openmldb_ops.py index c7ae0663b52..543c0bbfbf9 100644 --- a/tools/openmldb_ops.py +++ b/tools/openmldb_ops.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging - +# for Python 2, don't use f-string log = logging.getLogger(__name__) import os import sys @@ -118,8 +118,8 @@ def RecoverPartition(executor, db, partitions, endpoint_status): db=db, table_name=table_name, pid=pid, leader_endpoint=leader_endpoint)) status = executor.LoadTable(leader_endpoint, table_name, tid, pid) if not status.OK(): - log.error("load table failed. db {db} name {table_name} tid {tid} pid {pid} endpoint {leader_endpoint} msg {status.GetMsg()}".format( - db=db, table_name=table_name, tid=tid, pid=pid, leader_endpoint=leader_endpoint, status=status)) + log.error("load table failed. db {db} name {table_name} tid {tid} pid {pid} endpoint {leader_endpoint} msg {status}".format( + db=db, table_name=table_name, tid=tid, pid=pid, leader_endpoint=leader_endpoint, status=status.GetMsg())) return Status(-1, "recover partition failed") if not partitions[leader_pos].IsAlive(): status = executor.UpdateTableAlive(db, table_name, pid, leader_endpoint, "yes") @@ -204,8 +204,9 @@ def RecoverData(executor): log.error("get all table failed") return for name in tables: + # if recover failed, continue to recover next table if not RecoverTable(executor, db, name).OK(): - return + log.error("recover table failed. db {db} name {name}, check log for detail".format(db=db, name=name)) def ChangeLeader(db, partition, src_endpoint, desc_endpoint, one_replica, restore = True): log.info( diff --git a/tools/tool.py b/tools/tool.py index e64b172b49b..cff6eb1db98 100644 --- a/tools/tool.py +++ b/tools/tool.py @@ -16,6 +16,7 @@ import subprocess import sys import time +# for Python 2, don't use f-string log = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO, format = '%(levelname)s: %(message)s') @@ -276,6 +277,7 @@ def LoadTable(self, endpoint, name, tid, pid, sync = True): cmd = list(self.tablet_base_cmd) cmd.append("--endpoint=" + self.endpoint_map[endpoint]) cmd.append("--cmd=loadtable {} {} {} 0 8".format(name, tid, pid)) + log.info("run {cmd}".format(cmd = cmd)) status, output = self.RunWithRetuncode(cmd) time.sleep(1) if status.OK() and output.find("LoadTable ok") != -1: @@ -289,12 +291,12 @@ def LoadTable(self, endpoint, name, tid, pid, sync = True): if table_stat == "kTableNormal": return Status() elif table_stat == "kTableLoading" or table_stat == "kTableUndefined": - log.info("table is loading... tid {tid} pid {pid}".format(tid, pid)) + log.info("table is loading... tid {tid} pid {pid}".format(tid = tid, pid = pid)) else: - return Status(-1, "table stat is {table_stat}".format(table_stat)) + return Status(-1, "table stat is {table_stat}".format(table_stat = table_stat)) time.sleep(2) - return Status(-1, "execute load table failed") + return Status(-1, "execute load table failed, status {msg}, output {output}".format(msg = status.GetMsg(), output = output)) def GetLeaderFollowerOffset(self, endpoint, tid, pid): cmd = list(self.tablet_base_cmd)