diff --git a/.github/workflows/sdk.yml b/.github/workflows/sdk.yml index 27712f578ae..ed78524a9f6 100644 --- a/.github/workflows/sdk.yml +++ b/.github/workflows/sdk.yml @@ -260,7 +260,6 @@ jobs: - name: prepare python deps run: | - pip install twine "urllib3>=1.26.0,<2.0.0" yum install -y net-tools - name: test sqlalchemy and generate coverage report @@ -288,6 +287,7 @@ jobs: if: > github.repository == '4paradigm/OpenMLDB' && startsWith(github.ref, 'refs/tags/v') run: | + pip install twine "urllib3>=1.26.0,<2.0.0" cp python/openmldb_sdk/dist/openmldb*.whl . cp python/openmldb_tool/dist/openmldb*.whl . twine upload openmldb*.whl diff --git a/demo/Dockerfile b/demo/Dockerfile index ee3eea1c088..354fe86bd66 100644 --- a/demo/Dockerfile +++ b/demo/Dockerfile @@ -16,7 +16,7 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* RUN if [ -f "/additions/pypi.txt" ] ; then pip config set global.index-url $(cat /additions/pypi.txt) ; fi -RUN pip install --no-cache-dir py4j==0.10.9 numpy lightgbm tornado requests pandas xgboost==1.4.2 +RUN pip install --no-cache-dir py4j==0.10.9 numpy lightgbm==3 tornado requests pandas==1.5 xgboost==1.4.2 COPY init.sh /work/ COPY predict-taxi-trip-duration/script /work/taxi-trip/ diff --git a/demo/docker-compose.test.yml b/demo/docker-compose.test.yml index a969a97e461..9fd12df1b35 100644 --- a/demo/docker-compose.test.yml +++ b/demo/docker-compose.test.yml @@ -5,44 +5,54 @@ services: context: . volumes: - ./jd-recommendation:/work/oneflow_demo - - ./job_checker.py:/work/job_checker.py - ./quick_start:/work/quick_start # no mvn in image, so build the java demo outside and mount the jar - - ./java_quickstart/demo/target/demo-1.0-SNAPSHOT.jar:/work/java_quickstart/demo-1.0-SNAPSHOT.jar + - ./java_quickstart/demo/target:/work/java_quickstart - ./python_quickstart:/work/python_quickstart + - ./cxx_quickstart:/work/cxx_quickstart # You can add `cat ` here(e.g. `cat /work/openmldb/taskmanager/bin/logs/job_1_error.log`, cat `predict.log`), to check the log info. # No need to docker-compose build again. But if you modified the Dockerfile, must rebuild it. command: - /bin/bash - - -cx + - -ecx # -e, otherwise, the command may not exit when 'exit' - | ./init.sh sleep 5 # quickstart test cd /work/quick_start /work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client < cluster_quickstart.sql - python3 request_test.py + python3 request_test.py || exit -1 # java/python sdk, no jar in ci, so we should check the java result manually cd /work/java_quickstart + # if no jar, download it + if [ ! -f demo-1.0-SNAPSHOT.jar ]; then + curl -SLO https://openmldb.ai/download/testing/demo-1.0-SNAPSHOT.jar + fi java -cp demo-1.0-SNAPSHOT.jar com.openmldb.demo.App cd /work/python_quickstart python3 demo.py || exit -1 + cd /work/cxx_quickstart + if [ ! 
-f demo ]; then + curl -SLO https://openmldb.ai/download/testing/demo + fi + chmod +x demo + ./demo # taxi use case test cd /work/taxi-trip /work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client < taxi.sql - python3 train.py /tmp/feature_data /tmp/model.txt + python3 train.py /tmp/feature_data /tmp/model.txt || exit -1 # port 8887 ./start_predict_server.sh 127.0.0.1:9080 /tmp/model.txt - python3 predict.py || (cat /tmp/p.log && exit -1) + python3 predict.py || ( cat /tmp/p.log && exit -1 ) # talkingdata demo test cd /work/talkingdata # port 8881 python3 predict_server.py --no-init > predict.log 2>&1 & - python3 train_and_serve.py - python3 predict.py || (cat predict.log && exit -1) + python3 train_and_serve.py || exit -1 + python3 predict.py || ( cat predict.log && exit -1 ) # oneflow sql test cd /work/oneflow_demo/sql_scripts @@ -51,7 +61,8 @@ services: # check deployment, jobs will be checked by openmldb_tool curl http://127.0.0.1:9080/dbs/JD_db/deployments/demo | grep "ok" || exit -1 - # open it after new diag tool released, or you can test in local by USE_ADD_WHL + # TODO(hw): udf test + cd /work openmldb_tool status --diff -f /work/openmldb/conf/hosts openmldb_tool inspect diff --git a/demo/python_quickstart/demo.py b/demo/python_quickstart/demo.py index 670f66439f5..d8d672476ac 100644 --- a/demo/python_quickstart/demo.py +++ b/demo/python_quickstart/demo.py @@ -21,6 +21,8 @@ import openmldb.dbapi +# dbapi接口如果执行失败,会抛出异常,本例不捕获异常,暴露错误 + # 连接集群版OpenMLDB db = openmldb.dbapi.connect(zk="127.0.0.1:2181", zkPath="/openmldb") @@ -65,6 +67,16 @@ ) print(result.fetchone()) +### 执行 Deployment +cursor.execute("DEPLOY d1 SELECT col1 FROM t1") +# dict style +result = cursor.callproc("d1", {"col1": 1000, "col2": None, "col3": None, "col4": None, "col5": None}) +print(result.fetchall()) +# tuple style +result = cursor.callproc("d1", (1001, "2023-07-20", "abc", "def", 1)) +print(result.fetchall()) +# drop deployment before drop table +cursor.execute("DROP DEPLOYMENT d1") ### 2.7 删除表 cursor.execute("DROP TABLE t1") diff --git a/demo/setup_openmldb.sh b/demo/setup_openmldb.sh index 8a7b5fb8f63..8f5dea44f80 100755 --- a/demo/setup_openmldb.sh +++ b/demo/setup_openmldb.sh @@ -43,6 +43,9 @@ mkdir -p "${WORKDIR}/openmldb" tar xzf openmldb.tar.gz -C "${WORKDIR}/openmldb" --strip-components 1 # remove symbols and sections strip -s "${WORKDIR}/openmldb/bin/openmldb" +# do not install sync tools in demo docker +rm "${WORKDIR}/openmldb/bin/data_collector" +rm -rf "${WORKDIR}/openmldb/synctool" mkdir -p "${WORKDIR}/openmldb/spark-3.2.1-bin-openmldbspark" tar xzf spark-3.2.1-bin-openmldbspark.tgz -C "${WORKDIR}/openmldb/spark-3.2.1-bin-openmldbspark" --strip-components 1 diff --git a/docs/en/quickstart/cli.md b/docs/en/quickstart/cli_tutorial.md similarity index 100% rename from docs/en/quickstart/cli.md rename to docs/en/quickstart/cli_tutorial.md diff --git a/docs/en/quickstart/concepts/index.rst b/docs/en/quickstart/concepts/index.rst new file mode 100644 index 00000000000..d02cca2378f --- /dev/null +++ b/docs/en/quickstart/concepts/index.rst @@ -0,0 +1,8 @@ +============================= +Concept +============================= + +.. 
toctree:: + :maxdepth: 1 + + workflow diff --git a/docs/en/quickstart/concepts/workflow.md b/docs/en/quickstart/concepts/workflow.md new file mode 100644 index 00000000000..2ce5c58ff19 --- /dev/null +++ b/docs/en/quickstart/concepts/workflow.md @@ -0,0 +1,93 @@ +# Workflow and Execution Modes + +OpenMLDB supports different execution modes at different stages of the feature engineering development process. This article will introduce the process of using OpenMLDB for feature engineering development and deployment, as well as the different execution modes used in the process. + +## Workflow Overview + +The following diagram illustrates the typical process of using OpenMLDB for feature engineering development and deployment, as well as the execution modes used in the process: + +![image-20220310170024349](https://openmldb.ai/docs/zh/main/_images/modes-flow.png) + +1. Offline Data Import: Import offline data for offline feature engineering development and debugging. +2. Offline Feature Development: Develop feature engineering scripts and debug them until satisfactory results are achieved. This step involves joint debugging of machine learning models (such as XGBoost, LightGBM, etc.), but this article mainly focuses on feature engineering development related to OpenMLDB. +3. Feature Scheme Deployment: Deploy the feature scripts after satisfactory results are achieved. +4. Cold Start Online Data Import: Before official deployment, it is necessary to import the data within the required window for the online storage engine. For example, if the feature scheme involves feature aggregation calculations for data in the past three months, the previous three months' data needs to be imported for cold start. +5. Real-time Data Access: After the system is deployed, the latest data needs to be collected to maintain the window calculation logic, so real-time data access is required. +6. Online Data Preview (optional): Preview and check online data using supported SQL commands. This step is not mandatory. +7. Real-time Feature Calculation: After the feature scheme is deployed and the data is correctly accessed, a real-time feature calculation service that can respond to online requests will be obtained. + +## Overview of execution mode + +As the data objects for offline and online scenarios are different, their underlying storage and computing nodes are also different. Therefore, OpenMLDB provides several built-in execution modes to support completing the above steps. The following table summarizes the execution modes and development tools used for each step, and three execution modes will be discussed in detail later. + +| Steps | Execution Mode | Development Tool | +| ------------------------------ | ------------------- | ------------------------------------------------------------ | +| 1. 
Offline Data Import | Offline Mode | OpenMLDB CLI, SDKs | | 2. Offline Feature Development | Offline Mode | OpenMLDB CLI, SDKs | | 3. Feature Deployment | Offline Mode | OpenMLDB CLI, SDKs | | 4. Cold Start Online Data Import | Online Preview Mode | OpenMLDB CLI, SDKs, [Data Import Tool](https://openmldb.ai/docs/zh/main/tutorial/data_import.html) | | 5. Real-time Data Integration | Online Preview Mode | Connectors, SDKs | | 6. Online Data Preview (optional) | Online Preview Mode | OpenMLDB CLI, SDKs, [Data Export Tool](https://openmldb.ai/docs/zh/main/tutorial/data_export.html) | | 7. Real-time Feature Calculation | Online Request Mode | CLI (REST APIs), SDKs | ### Offline Mode After starting OpenMLDB CLI, the **default mode is offline mode**. Offline data import, offline feature development, and feature deployment are all executed in offline mode. The purpose of offline mode is to manage and compute offline data. The computing nodes involved are supported by OpenMLDB Spark optimized for feature engineering, and the storage nodes support commonly used storage systems such as HDFS. Offline mode has the following main features: - The offline mode supports most of the SQL syntax provided by OpenMLDB, including complex SQL constructs such as `LAST JOIN` and `WINDOW UNION`, which are optimized for feature engineering. - In offline mode, some SQL commands are executed asynchronously, such as `LOAD DATA`, `SELECT`, and `SELECT INTO` commands. Other SQL commands are executed synchronously. - The asynchronous SQL is managed by the internal TaskManager and can be viewed and managed through commands such as `SHOW JOBS`, `SHOW JOB`, and `STOP JOB`. ```{tip} Unlike many relational database systems, the `SELECT` command in offline mode is executed asynchronously by default. If you need to set it to synchronous execution, refer to setting the command to run synchronously in offline mode. During offline feature development, if asynchronous execution is used, it is strongly recommended to use the `SELECT INTO` statement for development and debugging, which can export the results to a file for easy viewing. ``` The `DEPLOY` command for feature deployment is also executed in offline mode. For its specification, refer to the OpenMLDB SQL online specification and requirements. Offline mode setting command (OpenMLDB CLI): `SET @@execute_mode='offline'`. ### Online preview mode Cold start online data import, real-time data access, and online data preview are executed in online preview mode. The purpose of the online preview mode is to manage and preview online data. Storage and computation of online data are supported by the tablet component. The main features of the online preview mode are: - `LOAD DATA`, used for online data import, can be done either locally (load_mode='local') or on the cluster (load_mode='cluster'). Local import is synchronous, while cluster import is asynchronous (same as in offline mode). Other operations are synchronous. - Online preview mode is mainly used for previewing limited data. Selecting and viewing data directly through SELECT in OpenMLDB CLI or SDKs may result in data truncation. If the data volume is large, it is recommended to use an [export tool](https://openmldb.ai/docs/zh/main/tutorial/data_export.html) to view the complete data. - SELECT statements in online preview mode currently do not support more complex queries such as `LAST JOIN` and `ORDER BY`. 
Refer to [SELECT](https://openmldb.ai/docs/zh/main/openmldb_sql/dql/SELECT_STATEMENT.html). +- The server in the online preview mode executes SQL statements on a single thread. For large data processing, it may be slow and may trigger a timeout. To increase the timeout period, the `--request_timeout` can be configured on the client. +- To prevent impact on online services, online preview mode limits the maximum number of accessed records and the number of different keys. This can be configured using `--max_traverse_cnt` and `--max_traverse_key_cnt`. Similarly, the maximum result size can be set using `--scan_max_bytes_size`. For detailed configuration, refer to the configuration file. + +The command for setting online preview mode in OpenMLDB CLI: `SET @@execute_mode='online'` + +### Online request mode + +After deploying feature scripts and accessing online data, the real-time feature computing service is ready to use, and real-time feature extraction can be performed through the online request mode. REST APIs and SDKs support the online request mode. The online request mode is a unique mode in OpenMLDB that supports real-time online computing and is very different from common SQL queries in databases. + +The online request mode requires three inputs: + +1. SQL feature script, which is the SQL script used in the feature deployment and online process, specifying the calculation logic for feature extraction. +2. Online data, which is the online data that has been imported during cold start or in real-time. Generally, it is the latest data for window computing in conjunction with SQL. For example, if the aggregation function in the SQL script defines a time window of the latest three months, then the online storage needs to retain the corresponding latest three months of data. +3. Real-time request row, which includes the current real-time behavior and is used for real-time feature extraction. For example, credit card information in anti-fraud scenarios or search keywords in recommendation scenarios. + +Based on the above inputs, for each real-time request row, the online request mode will return a feature extraction result. The computing logic is as follows: The request row is virtually inserted into the correct position of the online data table based on the logic in the SQL script (such as `PARTITION BY`, `ORDER BY`, etc.), and then only the feature aggregation computing is performed on that row, returning the unique corresponding extraction result. The following diagram intuitively explains the operation process of the online request mode. 
+ +![modes-request](https://openmldb.ai/docs/zh/main/_images/modes-request.png) + +Online request mode is supported in the following ways: + +- OpenMLDB CLI: Not supported + +- [REST API](https://openmldb.ai/docs/zh/main/quickstart/sdk/rest_api.html): Supports requests for single or multiple request rows + +- [Java SDK](https://openmldb.ai/docs/zh/main/quickstart/sdk/java_sdk.html): Supports requests for single or multiple request rows + +- [Python SDK](https://openmldb.ai/docs/zh/main/quickstart/sdk/python_sdk.html): Only supports requests for a single request row + +- [C++ SDK](https://openmldb.ai/docs/zh/main/quickstart/sdk/cxx_sdk.html): Only supports requests for a single request row diff --git a/docs/en/quickstart/data_import_guide.md b/docs/en/quickstart/data_import_guide.md deleted file mode 100644 index 84ee0565488..00000000000 --- a/docs/en/quickstart/data_import_guide.md +++ /dev/null @@ -1,45 +0,0 @@ -# Data Import Quickstart - -There are two versions of OpenMLDB: the standalone version and cluster version. -- For standalone version, datasets are all stored in the memory. Only [`LOAD DATA`](../reference/sql/dml/LOAD_DATA_STATEMENT.md) can be used to import data in this mode. -- For the cluster version, datasets are stored separately in the offline and online storage engines. Offline and online ends don't share the data. - -This tutorial will focus on the data import methods of cluster version. - -## Data Import Methods of Cluster Version - -### 1 Offline Import (`LOAD DATA`) - -- OpenMLDB doesn't have its specialized offline storage engine, but it requires user to specify the offline storage path, that is modifying the configuration option of taskmanager: `offline.data.prefix`. You can use third-party storage engines, like local directory, HDFS, s3 to configure. -- There is only one way to import data offline: using [`LOAD DATA` command](../reference/sql/dml/LOAD_DATA_STATEMENT.md). Hard copy will be adopted as default. -- OpenMLDB will copy the original data to the path of `offline.data.prefix` by default. The files of `csv` and `parquet` format are supported. -- `LOAD DATA` with a soft link is also supported, you can use the option `deep_copy=false` to configure. Only the storage path of the datasets will be saved in OpenMLDB in a soft link. Both the `csv` and `parquet` files are supported as well. - - -```{note} -If the offline path of the table is a soft link, OpenMLDB doesn't support appending data to the table as it doesn't have write access to the files in the **soft link path**. You can overwrire the offline path of the table. If the path has been overwritten, the data in the original directory will not be removed, only the directory in the OpenMLDB will change. -``` - -### 2 Online Import - -The [online modes](../tutorial/modes.md) of OpenMLDB cluster version provide online storage engine (stored in memory). Only **hard copy** can be used in online import. - -#### 2.1 `LOAD DATA` - -[`LOAD DATA` command](../reference/sql/dml/LOAD_DATA_STATEMENT.md) can be used in **Online Request** and **Online Preview** mode to load `csv` files and `parquet` files. - -#### 2.2 Stream - -Data can be loaded from `Pulsar`, `Kafka` and `RocketMQ ` as well, see the following links for detail. 
-- [Pulsar Connector](../use_case/pulsar_connector_demo.md) -- [Kafka Connector](../use_case/kafka_connector_demo.md) -- [RocketMQ Connector](https://openmldb.ai/docs/zh/main/use_case/rocketmq_connector.html) - -## Note - -The [openmldb-import tool](../tutorial/data_import.md) can be used for bulk load, importing the data quickly into the standalone or the online storage of cluster version. - -The bulk load tool is still in development. There are some restrictions for usage: -1. Only `csv` files can be loaded. -2. The tool is supported only on a single machine. The requirement for the memory of the single machine is high and maybe the memory should be larger than the size of the data to be imported. - diff --git a/docs/en/quickstart/index.rst b/docs/en/quickstart/index.rst index 244dc820994..aefceb8f206 100644 --- a/docs/en/quickstart/index.rst +++ b/docs/en/quickstart/index.rst @@ -5,10 +5,7 @@ Quickstart .. toctree:: :maxdepth: 1 - openmldb_quickstart.md - java_sdk - python_sdk - go_sdk - rest_api - data_import_guide - cli + openmldb_quickstart + concepts/index + cli_tutorial + sdk/index diff --git a/docs/en/quickstart/java_sdk.md b/docs/en/quickstart/java_sdk.md deleted file mode 100644 index d12faa6e85c..00000000000 --- a/docs/en/quickstart/java_sdk.md +++ /dev/null @@ -1,432 +0,0 @@ -# Java SDK Quickstart - -## 1. Package Installation - -### Package Installation on Linux -Configure maven pom - -```xml - - com.4paradigm.openmldb - openmldb-jdbc - 0.8.2 - - - com.4paradigm.openmldb - openmldb-native - 0.8.2 - -``` -### Package Installation on Mac -Configure maven pom - -```xml - - com.4paradigm.openmldb - openmldb-jdbc - 0.8.2 - - - com.4paradigm.openmldb - openmldb-native - 0.8.2-macos - -``` -Note that since `openmldb-native` contains the C++ static library compiled by OpenMLDB, by default it is a Linux's static library. On macOS, the version of the above openmldb-native needs to be changed to `0.8.2-macos`, and the version of openmldb-jdbc remains unchanged. - -The macOS native relase only supports macos-12. If you want use in macos-11 or macos 10.15, you should build openmldb-native from source in macos-11/macos-10.15, see [Build Java SDK](../deploy/compile.md#build-java-sdk-with-multi-processes) for details. - -## 2. Quickstart - -We can connect the OpenMLDB by JDBC Connection or SqlClusterExecutor. - -### JDBC Connection - -JDBC Connecton only supports OpenMLDB cluster, no standalone. - -``` -Class.forName("com._4paradigm.openmldb.jdbc.SQLDriver"); -// No database in jdbcUrl -Connection connection = DriverManager.getConnection("jdbc:openmldb:///?zk=localhost:6181&zkPath=/openmldb"); - -// Set database in jdbcUrl -Connection connection1 = DriverManager.getConnection("jdbc:openmldb:///test_db?zk=localhost:6181&zkPath=/openmldb"); -``` - -The database in connection url must exist. - -```{caution} -JDBC Connection default execute mode is`online`. -``` - -#### 使用概览 - -You can use `Statement` to execute all sql in online or offline mode. To switch the execute mode, you should `SET @@execute_mode='...';`. 
For example: -```java -Statement stmt = connection.createStatement(); -stmt.execute("SET @@execute_mode='offline"); // set offline mode -stmt.execute("SELECT * from t1"); // offline select -ResultSet res = stmt.getResultSet(); // get the job info of the offline select -stmt.execute("SET @@execute_mode='online"); // set online mode -res = stmt.executeQuery("SELECT * from t1"); // online select, and executeQuery will return the result -``` - -The offline sql and online `LOAD DATA` are async in default, so the result is the job info(id, state, etc.), not the data. You can execute `show job ` to check if the job is finished. **You should run `ResultSet.next()` to get the first row in result, do not run `ResultSet.getXXX` without `next()`**. - -The job can be set to sync: -``` -SET @@sync_job=true; -``` -```{tip} -If the sync job takes more than 0.5h, you should [change the config](../reference/sql/ddl/SET_STATEMENT.md#offline-commands-configuration-details). -``` - -#### PreparedStatement - -`PreparedStatement` supports `SELECT`, `INSERT` and `DELETE`,`INSERT` only inserts into online. -```java -PreparedStatement selectStatement = connection.prepareStatement("SELECT * FROM t1 WHERE id=?"); -PreparedStatement insertStatement = connection.prepareStatement("INSERT INTO t1 VALUES (?,?)"); -PreparedStatement insertStatement = connection.prepareStatement("DELETE FROM t1 WHERE id=?"); -``` - -### SqlClusterExecutor -#### Create SqlClusterExecutor - -First, the OpenMLDB connection parameters should be configured. SdkOption is cluster mode in default. - -```java -// cluster: -SdkOption option = new SdkOption(); -option.setZkCluster("127.0.0.1:2181"); -option.setZkPath("/openmldb"); -option.setSessionTimeout(10000); -option.setRequestTimeout(60000); - -// standalone: -SdkOption option = new SdkOption(); -option.setHost("127.0.0.1"); -option.setPort(6527); -option.setClusterMode(false); // required -option.setSessionTimeout(10000); -option.setRequestTimeout(60000); -``` - -Then,create the executor. - -```java -sqlExecutor = new SqlClusterExecutor(option); -``` - -`SqlClusterExecutor` is thread-safe, but the execute mode is cached in `SqlClusterExecutor`. If one thread set online and execute an online job, and another thread set offline and execute an offline job, the result is unpredictable. If you want multi-threading and execute in multi modes, you should create multi `SqlClusterExecutor`. - -```{caution} -SqlClusterExecutor execute mode is `offline` in default, it's different with JDBC Connection. -``` -#### Statement - -Create a database: - -```java -java.sql.Statement state = sqlExecutor.getStatement(); -try { - state.execute("create database db_test"); -} catch (Exception e) { - e.printStackTrace(); -} finally { - state.close(); -} -``` - -Create a table in database 'db_test': - -```java -java.sql.Statement state = sqlExecutor.getStatement(); -try { - state.execute("use db_test"); - String createTableSql = "create table trans(c1 string,\n" + - " c3 int,\n" + - " c4 bigint,\n" + - " c5 float,\n" + - " c6 double,\n" + - " c7 timestamp,\n" + - " c8 date,\n" + - " index(key=c1, ts=c7));"; - state.execute(createTableSql); -} catch (Exception e) { - e.printStackTrace(); -} finally { - state.close(); -} -``` - -##### Use Statement to Query - -```java -java.sql.Statement state = sqlExecutor.getStatement(); -try { - state.execute("use db_test"); - // sqlExecutor execute mode is offline in default. 
Set online here - state.execute("SET @@execute_mode='online;"); - // we can `getResultSet` only if returns true - boolean ret = state.execute("select * from trans;"); - Assert.assertTrue(ret); - java.sql.ResultSet rs = state.getResultSet(); -} catch (Exception e) { - e.printStackTrace(); -} -``` - -Read result: - -```java -// print the first three columns for demo -try { - while (result.next()) { - System.out.println(resultSet.getString(1) + "," + resultSet.getInt(2) "," + resultSet.getLong(3)); - } -} catch (SQLException e) { - e.printStackTrace(); -} finally { - try { - if (result != null) { - result.close(); - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } -} -``` - -#### PreparedStatement - -We can get `PreparedStatement` from `SqlClusterExecutor`, e.g. get `InsertPreparedStmt` by `getInsertPreparedStmt`. There're three ways to use `InsertPreparedStmt`. -```{note} -Insertion only supports online, the execute mode won't affect it. -``` - -##### Normal Insert - -1. Using the `SqlClusterExecutor::getInsertPreparedStmt(db, insertSql)` interface to get the `InsertPrepareStatement`. -2. Using the `Statement::execute()` interface to execute the insert statement. - -```java -String insertSql = "insert into trans values(\"aa\",23,33,1.4,2.4,1590738993000,\"2020-05-04\");"; -PreparedStatement pstmt = null; -try { - pstmt = sqlExecutor.getInsertPreparedStmt(db, insertSql); - Assert.assertTrue(pstmt.execute()); -} catch (SQLException e) { - e.printStackTrace(); - Assert.fail(); -} finally { - if (pstmt != null) { - try { - // PrepareStatement must be closed after it is used up - pstmt.close(); - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - } -} -``` - -##### Use Placeholder to Execute Insert Statement - -1. Using the `SqlClusterExecutor::getInsertPreparedStmt(db, insertSqlWithPlaceHolder)` interface to` get the InsertPrepareStatement`. -2. Calling the `PreparedStatement::setType(index, value)` interface to fill data into `InsertPrepareStatement`. -3. Using the `Statement::execute()` interface to execute the insert statement. - -```java -String insertSqlWithPlaceHolder = "insert into trans values(\"aa\", ?, 33, ?, 2.4, 1590738993000, \"2020-05-04\");"; -PreparedStatement pstmt = null; -try { - pstmt = sqlExecutor.getInsertPreparedStmt(db, insertSqlWithPlaceHolder); - pstmt.setInt(1, 24); - pstmt.setInt(2, 1.5f); - pstmt.execute(); -} catch (SQLException e) { - e.printStackTrace(); - Assert.fail(); -} finally { - if (pstmt != null) { - try { - // PrepareStatement must be closed after it is used up - pstmt.close(); - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - } -} -``` - -##### Use Placeholder to Execute Batch Insert - -1. Using the `SqlClusterExecutor::getInsertPreparedStmt(db, insertSqlWithPlaceHolder)` interface to` get the InsertPrepareStatement`. -2. Calling the `PreparedStatement::setType(index, value)` interface to fill data into `InsertPrepareStatement`. -3. Using the `PreparedStatement::addBatch()` interface to build current row. -4. Using the `PreparedStatement::setType(index, value)` and `PreparedStatement::addBatch()` to add new rows. -5. Using the `PreparedStatement::executeBatch()` to execute batch insert. 
- -```java -String insertSqlWithPlaceHolder = "insert into trans values(\"aa\", ?, 33, ?, 2.4, 1590738993000, \"2020-05-04\");"; -PreparedStatement pstmt = null; -try { - pstmt = sqlExecutor.getInsertPreparedStmt(db, insertSqlWithPlaceHolder); - pstmt.setInt(1, 24); - pstmt.setInt(2, 1.5f); - pstmt.addBatch(); - pstmt.setInt(1, 25); - pstmt.setInt(2, 1.6f); - pstmt.addBatch(); - pstmt.executeBatch(); -} catch (SQLException e) { - e.printStackTrace(); - Assert.fail(); -} finally { - if (pstmt != null) { - try { - // PrepareStatement must be closed after it is used up - pstmt.close(); - } catch (SQLException throwables) { - throwables.printStackTrace(); - } - } -} -``` - -#### SQL Queries in the Request Mode - -1. Using the `SqlClusterExecutor::getRequestPreparedStmt(db, selectSql)` interface to get the `RequestPrepareStatement`. -2. Calling the `PreparedStatement::setType(index, value)` interface to set the request data. Please call the `setType` interface and configure a valid value according to the data type corresponding to each column in the data table. -3. Calling the `Statement::executeQuery()` interface to execute the request query statement. - -```java -String selectSql = "SELECT c1, c3, sum(c4) OVER w1 as w1_c4_sum FROM trans WINDOW w1 AS " + - "(PARTITION BY trans.c1 ORDER BY trans.c7 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW);"; -PreparedStatement pstmt = null; -ResultSet resultSet = null; -/* -c1 string,\n" + - " c3 int,\n" + - " c4 bigint,\n" + - " c5 float,\n" + - " c6 double,\n" + - "c7 timestamp,\n" + - " c8 date,\n" + -*/ -try { - // The first step, get RequestPrepareStatement - pstmt= sqlExecutor.getRequestPreparedStmt(db, selectSql); - - // The second step, execute the request mode, you need to set a line of request data in RequestPreparedStatement - pstmt.setString(1, "bb"); - pstmt.setInt(2, 24); - pstmt.setLong(3, 34l); - pstmt.setFloat(4, 1.5f); - pstmt.setDouble(5, 2.5); - pstmt.setTimestamp(6, new Timestamp(1590738994000l)); - pstmt.setDate(7, Date.valueOf("2020-05-05")); - - // Calling executeQuery will execute the select sql, the result in resultSet - resultSet = pstmt.executeQuery(); - - // access resultSet - Assert.assertEquals(resultSet.getMetaData().getColumnCount(), 3); - Assert.assertTrue(resultSet.next()); - Assert.assertEquals(resultSet.getString(1), "bb"); - Assert.assertEquals(resultSet.getInt(2), 24); - Assert.assertEquals(resultSet.getLong(3), 34); - - // The returned result set of a normal request query contains only one row of results, so the result of the second call to resultSet.next() is false - Assert.assertFalse(resultSet.next()); - -} catch (SQLException e) { - e.printStackTrace(); - Assert.fail(); -} finally { - try { - if (resultSet != null) { - // need to close after result is used up - resultSet.close(); - } - if (pstmt != null) { - pstmt.close(); - } - } catch (SQLException throwables) { - throwables.printStackTrace(); - } -} -``` - -#### Delete all data under one key in specific index - -There two methods to delete as below: - -- use delete sql -- use delete preparestatement - -``` -java.sql.Statement state = router.getStatement(); -try { - String sql = "DELETE FROM t1 WHERE col2 = 'key1';"; - state.execute(sql); - sql = "DELETE FROM t1 WHERE col2 = ?;"; - java.sql.PreparedStatement p1 = router.getDeletePreparedStmt("test", sql); - p1.setString(1, "key2"); - p1.executeUpdate(); - p1.close(); -} catch (Exception e) { - e.printStackTrace(); - Assert.fail(); -} finally { - try { - state.close(); - } catch (Exception e) { - 
e.printStackTrace(); - } -} -``` - -### A Complete Example - -See [Java quickstart demo](https://github.com/4paradigm/OpenMLDB/tree/main/demo/java_quickstart/demo). If macOS, add openmldb-native dependency and use the macos version. - -You can run: -``` -mvn package -java -cp target/demo-1.0-SNAPSHOT.jar com.openmldb.demo.App -``` - -## SDK Option - -Connect to cluster must set `zkCluster` and `zkPath`(set methods or add `foo=bar` after `?` in jdbc url). Other options are optional. - -Connect to standalone must set `host`, `port` and `isClusterMode`(`SDKOption.setClusterMode`). No jdbc supports. Notice that, `isClusterMode` is the required option, we can't detect it automatically now. Other options are optional. - -### General Optional Options - -We can set the options in cluster and standalone: -- enableDebug: default false. To enable the hybridse debug log(not the all log), you can see more log about sql compile and running. But the hybridse debug log may in tablet server log, the client won't collect all. -- requestTimeout: default 60000ms. To set the rpc timeout sent by client, exclude the rpc sent to taskmanager(job rpc timeout option is the variable `job_timeout`). -- glogLevel: default 0, the same to glog minloglevel. INFO, WARNING, ERROR, and FATAL are 0, 1, 2, and 3, respectively. so 0 will print INFO and higher levels。 -- glogDir: default empty. When it's empty, it'll print to stderr. -- maxSqlCacheSize: default 50. The max cache num of one db in one sql mode(client side). If client met no cache error(e.g. get error `please use getInsertRow with ... first` but we did `getInsertRow` before), you can set it bigger. - -### Optional Options for cluster - -The OpenMLDB cluster has zk and taskmanager, so there're options about them: -- sessionTimeout: default 10000ms. the session timeout connect to zookeeper. -- zkLogLevel: default 3. 0-disable all zk log, 1-error, 2-warn, 3-info, 4-debug. -- zkLogFile: default empty. If empty, print log to stdout. -- sparkConfPath: default empty. set the spark conf file used by job in the client side, no need to set conf in taskmanager and restart it. - -## SQL Validation - -JAVA client supports validate if the sql can be executed or deployed, there're two modes: batch and request. - -- `validateSQLInBatch` can validate if the sql can be executed on offline. - -- `validateSQLInRequest` can validate if the sql can be deployed. - -The two methods need all tables schema which need by sql, only support all tables in a single db, please **DO NOT** use `db.table` style in sql. \ No newline at end of file diff --git a/docs/en/quickstart/openmldb_quickstart.md b/docs/en/quickstart/openmldb_quickstart.md index fc99d75e7d0..944778abec4 100644 --- a/docs/en/quickstart/openmldb_quickstart.md +++ b/docs/en/quickstart/openmldb_quickstart.md @@ -1,379 +1,265 @@ # OpenMLDB Quickstart -This tutorial provides a quick start guide to use OpenMLDB. Basic steps are: creating a database, offline data import, offline feature extraction, SQL deployment, online data import, and online real-time feature extraction. The steps of the standalone and cluster versions are slightly different, and are demonstrated separately. +## Basic concepts -## 1. Environment and Data Preparation -```{warning} -Docker Engine version requirement: >= 18.03 -``` -This tutorial is demonstrated based on the OpenMLDB CLI, so first you need to download the sample data and start the OpenMLDB CLI. We recommend using the prepared docker image for a quick experience. 
- -```{note} -If you wan to compile and install it by yourself, you can refer to our [installation and deployment documentation](../deploy/install_deploy.md). -``` +The main use case of OpenMLDB is as a real-time feature platform for machine learning. The basic usage process is shown in the following diagram: -### 1.1. Download the Docker Image +![modes-flow](https://openmldb.ai/docs/zh/main/_images/modes-flow.png) -Pull the image (image download size is about 1GB, after decompression is about 1.7 GB) and start the docker container: +As can be seen, OpenMLDB covers the feature computing process of machine learning, from offline development to real-time request service online, providing a complete process. Please refer to the documentation for [the usage process and execution mode](https://openmldb.ai/docs/zh/main/quickstart/concepts/modes.html) in detail. This article will demonstrate a quick start and understanding of OpenMLDB step by step, following the basic usage process. -```bash -docker run -it 4pdosc/openmldb:0.8.2 bash -``` +## The preparation -```{important} -After the container is successfully started, all the subsequent commands in this tutorial are executed within the container by default. -``` +This article is developed and deployed based on OpenMLDB CLI, and it is necessary to download the sample data and start OpenMLDB CLI first. It is recommended to use Docker image for a quick experience (Note: due to some known issues of Docker on macOS, the sample program in this article may encounter problems in completing the operation smoothly on macOS. It is recommended to run it on **Linux or Windows**). -## 2. The Standalone Version +- Docker Version: >= 18.03 -### 2.1. Start the Server and Client +### Pulls the image -- Start the standalone OpenMLDB server +Execute the following command in the command line to pull the OpenMLDB image and start the Docker container: ```bash -# 1. initialize the environment and start standlone openmldb server -./init.sh standalone +docker run -it 4pdosc/openmldb:0.8.2 bash ``` -- Start the standalone OpenMLDB CLI client - -```bash -# Start the OpenMLDB CLI for the cluster deployed OpenMLDB -cd taxi-trip -../openmldb/bin/openmldb --host 127.0.0.1 --port 6527 +``` {note} +After successfully starting the container, all subsequent commands in this tutorial are executed inside the container by default. If you need to access the OpenMLDB server inside the container from outside the container, please refer to the [CLI/SDK-container onebox documentation](https://openmldb.ai/docs/zh/main/reference/ip_tips.html#id3). ``` -### 2.2. Steps +### Download sample data -```{important} -Unless otherwise specified, the commands shown below in this section are executed under the CLI by default (CLI commands start with the prompt `>` for distinction). -``` +Execute the following command inside the container to download the sample data used in the subsequent process (**this step can be skipped for versions 0.7.0 and later**, as the data is already stored in the image): -#### 2.2.1. Create the Database and Table - -```sql -> CREATE DATABASE demo_db; -> USE demo_db; -> CREATE TABLE demo_table1(c1 string, c2 int, c3 bigint, c4 float, c5 double, c6 timestamp, c7 date); +```bash +curl https://openmldb.ai/demo/data.parquet --output /work/taxi-trip/data/data.parquet ``` -#### 2.2.2. Offline Data Import +### Start the server and client -We should first import the previously downloaded sample data (the saved data in {ref}`download_data`) for offline feature extraction. 
- -```sql -> LOAD DATA INFILE 'data/data.csv' INTO TABLE demo_table1; -``` +Start the OpenMLDB server: -We can preview the data by using `SELECT`. - -```sql -> SELECT * FROM demo_table1 LIMIT 10; - ----- ---- ---- ---------- ----------- --------------- - ------------- - c1 c2 c3 c4 c5 c6 c7 - ----- ---- ---- ---------- ----------- --------------- - ------------- - aaa 12 22 2.200000 12.300000 1636097390000 2021-08-19 - aaa 11 22 1.200000 11.300000 1636097290000 2021-07-20 - dd 18 22 8.200000 18.300000 1636097990000 2021-06-20 - aa 13 22 3.200000 13.300000 1636097490000 2021-05-20 - cc 17 22 7.200000 17.300000 1636097890000 2021-05-26 - ff 20 22 9.200000 19.300000 1636098000000 2021-01-10 - bb 16 22 6.200000 16.300000 1636097790000 2021-05-20 - bb 15 22 5.200000 15.300000 1636097690000 2021-03-21 - bb 14 22 4.200000 14.300000 1636097590000 2021-09-23 - ee 19 22 9.200000 19.300000 1636097000000 2021-01-10 - ----- ---- ---- ---------- ----------- --------------- - ------------- +```bash +/work/init.sh ``` -#### 2.2.3. Offline Feature Extraction +Start the OpenMLDB CLI client: -Now we can execute SQL for feature extraction, and store the produced features in a file for subsequent model training. - -```sql -> SELECT c1, c2, sum(c3) OVER w1 AS w1_c3_sum FROM demo_table1 WINDOW w1 AS (PARTITION BY demo_table1.c1 ORDER BY demo_table1.c6 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) INTO OUTFILE '/tmp/feature.csv'; +```bash +/work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client ``` -#### 2.2.4. Online SQL Deployment +After successfully starting OpenMLDB CLI, it will be displayed as shown in the following figure: -When the feature extraction script is ready, we can create an online SQL deployment for it. +![image](https://openmldb.ai/docs/zh/main/_images/cli_cluster.png) -```sql -> DEPLOY demo_data_service SELECT c1, c2, sum(c3) OVER w1 AS w1_c3_sum FROM demo_table1 WINDOW w1 AS (PARTITION BY demo_table1.c1 ORDER BY demo_table1.c6 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW); -``` +## Use process -You can also view the SQL deployments through the command `SHOW DEPLOYMENTS`; +Referring to the core concepts, the process of using OpenMLDB generally includes six steps: creating databases and tables, importing offline data, offline feature computing, deploying SQL solutions, importing online data, and online real-time feature computing. -```sql -> SHOW DEPLOYMENTS; - --------- ------------------- - DB Deployment - --------- ------------------- - demo_db demo_data_service - --------- ------------------- -1 row in set +```{note} +Unless otherwise specified, the commands demonstrated below are executed by default in OpenMLDB CLI. ``` -Note that, this tutorial for the standalone version uses the same data for offline and online feature extraction. You can also use two different data sets for offline and online. Later on, for the cluster version, you will see that we must import another data set for online feature extraction. +### Step 1: Create database and table -#### 2.2.5. Exit the CLI +Create `demo_db` and table `demo_table1`: ```sql -> quit; +-- OpenMLDB CLI +CREATE DATABASE demo_db; +USE demo_db; +CREATE TABLE demo_table1(c1 string, c2 int, c3 bigint, c4 float, c5 double, c6 timestamp, c7 date); ``` -Up to this point, you have completed all the development and deployment steps based on the CLI, and have returned to the OS command line. - -#### 2.2.6. 
Real-Time Feature Extraction +### Step 2: Importing offline data -Real-time online services can be provided through the following Web APIs: +Switch to the offline execution mode, and import the sample data as offline data for offline feature calculation. +```sql +-- OpenMLDB CLI +USE demo_db; +SET @@execute_mode='offline'; +LOAD DATA INFILE 'file:///work/taxi-trip/data/data.parquet' INTO TABLE demo_table1 options(format='parquet', mode='append'); ``` -http://127.0.0.1:8080/dbs/demo_db/deployments/demo_data_service - \___________/ \____/ \_____________/ - | | | - APIServer address Database name Deployment name -``` - -The input data of the real-time request accepts the `json` format, and we put a line of data into the `input` field of the request. Here is the example: - -```bash -curl http://127.0.0.1:8080/dbs/demo_db/deployments/demo_data_service -X POST -d'{"input": [["aaa", 11, 22, 1.2, 1.3, 1635247427000, "2021-05-20"]]}' -``` - -The following is the expected return result for this query: - -```json -{"code":0,"msg":"ok","data":{"data":[["aaa",11,22]]}} -``` -You may refer to [3.3.8. Result Explanation](#3.3.8.-Result Explanation) at the end of the article for the result explanation. - -## 3. The Cluster Version -### 3.1. Preliminary Knowledge +Note that the `LOAD DATA` command is an asynchronous command by default. You can use the following command to check the task status and detailed logs: -The most significant differences between the cluster version and the standalone version are: +- To show the list of submitted tasks: SHOW JOBS -- Some commands in the cluster version are non-blocking tasks, including `LOAD DATA` in online mode, and `LOAD DATA`, `SELECT`, `SELECT INTO` commands in offline mode. After submitting a task for such a given command, you can use related commands such as `SHOW JOBS`, `SHOW JOB` to view the task progress. For details, see the [Offline Task Management](../reference/sql/task_manage/reference.md) document. -- The cluster version needs to maintain offline and online data separately, and cannot use the same data set as the stand-alone version. +- To show the detailed information of a task: SHOW JOB job_id (job_id can be obtained from the SHOW JOBS command) -The above differences will be demonstrated based on examples in the following tutorials. - -### 3.2. Start the Server and Client - -- Start the cluster version of the OpenMLDB server: - -```bash -# 1. initialize the environment and start cluster openmldb server -./init.sh -``` +- To show the task logs: SHOW JOBLOG job_id -- Start the OpenMLDB CLI: +Here, we use `SHOW JOBS` to check the task status. Please wait for the task to be successfully completed (the `state` is changed to `FINISHED`), and then proceed to the next step. -```bash -# Start the OpenMLDB CLI for the cluster deployed OpenMLDB -cd taxi-trip -../openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client -``` +![image-20220111141358808](https://openmldb.ai/docs/zh/main/_images/state_finished.png) -### 3.3. Steps +After the task is completed, if you want to preview the data, you can use the `SELECT * FROM demo_table1` statement. It is recommended to first set the offline command to synchronous mode (`SET @@sync_job=true`); otherwise, the command will submit an asynchronous task, and the result will be saved in the log file of the Spark task, which is less convenient to view. 
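For instance, a minimal synchronous preview sketch following that recommendation (using the database and table created earlier in this tutorial):

```sql
-- OpenMLDB CLI, offline mode: run the query synchronously so the rows print directly in the CLI
SET @@sync_job=true;
SELECT * FROM demo_table1 LIMIT 10;
-- switch back to asynchronous execution afterwards if preferred
SET @@sync_job=false;
```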
-```{important} -Unless otherwise specified, the commands shown below are executed under the OpenMLDB CLI by default (the CLI command starts with a prompt `>`). +```{note} +OpenMLDB also supports importing offline data through linked soft copies, without the need for hard data copying. Please refer to the parameter `deep_copy` in the [LOAD DATA INFILE documentation](https://openmldb.ai/docs/zh/main/openmldb_sql/dml/LOAD_DATA_STATEMENT.html) for more information. ``` -#### 3.3.1. Create Database and Table +### Step 3: Offline feature computing -- Create the database and table: +Assuming that we have determined the SQL script (`SELECT` statement) to be used for feature computation, we can use the following command for offline feature computation: ```sql -> CREATE DATABASE demo_db; -> USE demo_db; -> CREATE TABLE demo_table1(c1 string, c2 int, c3 bigint, c4 float, c5 double, c6 timestamp, c7 date); +-- OpenMLDB CLI +USE demo_db; +SET @@execute_mode='offline'; +SET @@sync_job=false; +SELECT c1, c2, sum(c3) OVER w1 AS w1_c3_sum FROM demo_table1 WINDOW w1 AS (PARTITION BY demo_table1.c1 ORDER BY demo_table1.c6 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) INTO OUTFILE '/tmp/feature_data' OPTIONS(mode='overwrite'); ``` -- You may view the information of the database and table: +The `SELECT INTO` command is an asynchronous task. Use the `SHOW JOBS` command to check the task running status. Please wait for the task to complete successfully (`state` changes to `FINISHED`) before proceeding to the next step. -```sql -> desc demo_table1; - --- ------- ----------- ------ --------- - # Field Type Null Default - --- ------- ----------- ------ --------- - 1 c1 Varchar YES - 2 c2 Int YES - 3 c3 BigInt YES - 4 c4 Float YES - 5 c5 Double YES - 6 c6 Timestamp YES - 7 c7 Date YES - --- ------- ----------- ------ --------- - --- -------------------- ------ ---- ------ ------------- ---- - # name keys ts ttl ttl_type - --- -------------------- ------ ---- ------ ------------- ---- - 1 INDEX_0_1641939290 c1 - 0min kAbsoluteTime - --- -------------------- ------ ---- ------ ------------- ---- -``` +Note: -#### 3.3.2. Offline Data Import +- Similar to the `LOAD DATA` command, the `SELECT` command also runs asynchronously by default in offline mode. -- First, please switch to the offline execution mode by using the command `SET @@execute_mode='offline'`. -- Next, import the previously downloaded sample data (downloaded in {ref}`download_data`) as offline data for offline feature extraction. +- The `SELECT` statement is used to perform SQL-based feature extraction and store the generated features in the directory specified by the `OUTFILE` parameter as `feature_data`, which can be used for subsequent machine learning model training. -```sql -> USE demo_db; -> SET @@execute_mode='offline'; -> LOAD DATA INFILE 'file:///work/taxi-trip/data/data.parquet' INTO TABLE demo_table1 options(format='parquet', header=true, mode='append'); -``` +### Step 4: Deploying SQL solutions -Note that, the `LOAD DATA` command is non-blocking, and you can view the task progress through the task management commands such as `SHOW JOBS` and `SHOW JOBLOG`. +Switch to online preview mode, and deploy the explored SQL plan to online. The SQL plan is named `demo_data_service`, and the online SQL used for feature extraction needs to be consistent with the corresponding offline feature calculation SQL. 
```sql -SHOW JOB $JOB_ID - -SHOW JOBLOG $JOB_ID +-- OpenMLDB CLI +SET @@execute_mode='online'; +USE demo_db; +DEPLOY demo_data_service SELECT c1, c2, sum(c3) OVER w1 AS w1_c3_sum FROM demo_table1 WINDOW w1 AS (PARTITION BY demo_table1.c1 ORDER BY demo_table1.c6 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW); ``` -#### 3.3.3. Offline Feature Extraction - -You can now execute the SQL for feature extraction, and store the produced features in a file for subsequent model training. - -```sql -> USE demo_db; -> SET @@execute_mode='offline'; -> SELECT c1, c2, sum(c3) OVER w1 AS w1_c3_sum FROM demo_table1 WINDOW w1 AS (PARTITION BY demo_table1.c1 ORDER BY demo_table1.c6 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) INTO OUTFILE '/tmp/feature_data'; -``` +After the deployment, you can use the command `SHOW DEPLOYMENTS` to view the deployed SQL solutions. -Note that, the `SELECT INTO` command in offline mode is non-blocking, and you can view the running progress through offline task management commands such as `SHOW JOBS`. +### Step 5: Importing online data -#### 3.3.4. Online SQL Deployment - -The SQL can be deployed online using the below command: - -```sql -> SET @@execute_mode='online'; -> DEPLOY demo_data_service SELECT c1, c2, sum(c3) OVER w1 AS w1_c3_sum FROM demo_table1 WINDOW w1 AS (PARTITION BY demo_table1.c1 ORDER BY demo_table1.c6 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW); -``` - -After going online, you can view the deployed SQL solutions through the command `SHOW DEPLOYMENTS`; +Import the downloaded sample data as online data for online feature computation in online preview mode. ```sql -> SHOW DEPLOYMENTS; - --------- ------------------- - DB Deployment - --------- ------------------- - demo_db demo_data_service - --------- ------------------- -1 row in set +-- OpenMLDB CLI +USE demo_db; +SET @@execute_mode='online'; +LOAD DATA INFILE 'file:///work/taxi-trip/data/data.parquet' INTO TABLE demo_table1 options(format='parquet', header=true, mode='append'); ``` -#### 3.3.5. Online Data Import +`LOAD DATA` is an asynchronous command by default, you can use offline task management commands such as `SHOW JOBS` to check the progress. Please wait for the task to complete successfully (`state` changes to `FINISHED`) before proceeding to the next step. -First, you should switch to the online execution mode by using the command `SET @@execute_mode='online'`. Then in the online mode, you should import the previously downloaded sample data (downloaded in {ref}`download_data`) as online data for online feature extraction. - -```{note} -As the storage engines for the offline and online data are separate in the cluster version, you must import the data again in the online mode even the same data set is used. For most real-world applications, usually two different data sets are used for offline and online modes. -``` +After the task is completed, you can preview the online data: ```sql -> USE demo_db; -> SET @@execute_mode='online'; -> LOAD DATA INFILE 'file:///work/taxi-trip/data/data.parquet' INTO TABLE demo_table1 options(format='parquet', header=true, mode='append'); +-- OpenMLDB CLI +USE demo_db; +SET @@execute_mode='online'; +SELECT * FROM demo_table1 LIMIT 10; ``` -Note that, the online mode of `LOAD DATA` is a non-blocking command, and you can view the progress through the task management commands such as `SHOW JOBS`. +Note that currently, it is required to successfully deploy the SQL plan before importing online data; importing online data before deployment may cause deployment errors. 
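In production, fresh rows keep arriving after this cold-start import; as the note below mentions, that is normally handled through the SDK or a streaming connector, but the effect can be sketched with a plain SQL insert (the values below are hypothetical):

```sql
-- OpenMLDB CLI, online mode: append one hypothetical new row to the online storage
INSERT INTO demo_table1 VALUES ('aaa', 13, 22, 3.3, 13.3, 1637100000000, '2021-11-17');
```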
```{note} The tutorial skips the step of real-time data access after importing data. In practical scenarios, as time progresses, the latest real-time data needs to be updated in the online database. This can be achieved through the OpenMLDB SDK or online data source connectors such as Kafka, Pulsar, etc. ``` ### Step 6: Online real-time feature computing The development and deployment work based on OpenMLDB CLI is now complete. Next, you can make real-time feature calculation requests in the real-time request mode. First, exit OpenMLDB CLI and return to the command line of the operating system. ```sql -- OpenMLDB CLI quit; ``` According to the default deployment configuration, the HTTP port for the APIServer is 9080. Real-time online services can be provided through the following Web API: ```bash http://127.0.0.1:9080/dbs/demo_db/deployments/demo_data_service \___________/ \____/ \_____________/ | | | APIServer address Database name Deployment name ``` Real-time requests accept input data in JSON format. Here are two examples, each putting a row of data in the `input` field of the request. **Example 1:** ```bash curl http://127.0.0.1:9080/dbs/demo_db/deployments/demo_data_service -X POST -d'{"input": [["aaa", 11, 22, 1.2, 1.3, 1635247427000, "2021-05-20"]]}' ``` The expected return result (the calculated features are stored in the `data` field): ```json {"code":0,"msg":"ok","data":{"data":[["aaa",11,22]]}} ``` **Example 2:** ```bash curl http://127.0.0.1:9080/dbs/demo_db/deployments/demo_data_service -X POST -d'{"input": [["aaa", 11, 22, 1.2, 1.3, 1637000000000, "2021-11-16"]]}' ``` Expected query result: ```json {"code":0,"msg":"ok","data":{"data":[["aaa",11,66]]}} ``` ### Description of real-time feature computing results The SQL execution for online real-time requests is different from batch processing mode. The request mode only performs SQL calculations on the data of the request row. In the previous example, it is the input of the POST request that serves as the request row. The specific process is as follows: Assuming that this row of data exists in the table `demo_table1`, and the following feature calculation SQL is executed on it: -The real-time feature extraction is executed in the request mode. Unlike the batch mode, the request mode will only perform SQL extractions on the request row. 
In the previous example, the POST input is used as the request row, assuming this row of data exists in the table demo_table1, and execute SQL on it: ```sql SELECT c1, c2, sum(c3) OVER w1 AS w1_c3_sum FROM demo_table1 WINDOW w1 AS (PARTITION BY demo_table1.c1 ORDER BY demo_table1.c6 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW); ``` -The computation of Example 1 is logically done as follows: -1. According to the request line and the `PARTITION BY` in window clause, filter out the lines whose `c1` is "aaa", and sort them according to `c6` from small to large. So theoretically, the intermediate data table after partition sorting is shown in the following table. Among them, the first row after the request behavior is sorted. -``` - ----- ---- ---- ---------- ----------- --------------- ------------ - c1 c2 c3 c4 c5 c6 c7 - ----- ---- ---- ---------- ----------- --------------- ------------ - aaa 11 22 1.2 1.3 1635247427000 2021-05-20 - aaa 11 22 1.200000 11.300000 1636097290000 1970-01-01 - aaa 12 22 2.200000 12.300000 1636097890000 1970-01-01 - ----- ---- ---- ---------- ----------- --------------- ------------ -``` -2. The window range is `2 PRECEDING AND CURRENT ROW`, so we cut out the real window in the above table, the request row is the smallest row, the previous 2 rows do not exist, but the window contains the current row, so the window has only one row (the request row). -3. Window aggregation is performed, to sum `c3` of the data in the window (only one row), and we have the result 22. -The output is: -``` - ----- ---- ----------- - c1 c2 w1_c3_sum - ----- ---- ----------- - aaa 11 22 - ----- ---- ----------- -``` +**The calculation logic for Example 1 is as follows:** + +1. Filter rows in column c1 with the value "aaa" based on the `PARTITION BY` partition of the request row and window, and sort them in ascending order by column c6. Therefore, in theory, the intermediate data table sorted by partition should be as follows. The request row is the first row after sorting. -Example 2: -1. According to the request line and the `PARTITION BY` in window clause, filter out the lines whose `c1` is "aaa", and sort them according to `c6` from small to large. So theoretically, the intermediate data table after partition sorting is shown in the following table. The request row is the last row. +```sql +----- ---- ---- ---------- ----------- --------------- ------------ +c1 c2 c3 c4 c5 c6 c7 +----- ---- ---- ---------- ----------- --------------- ------------ +aaa 11 22 1.2 1.3 1635247427000 2021-05-20 +aaa 11 22 1.200000 11.300000 1636097290000 1970-01-01 +aaa 12 22 2.200000 12.300000 1636097890000 1970-01-01 +----- ---- ---- ---------- ----------- --------------- ------------ ``` - ----- ---- ---- ---------- ----------- --------------- ------------ - c1 c2 c3 c4 c5 c6 c7 - ----- ---- ---- ---------- ----------- --------------- ------------ - aaa 11 22 1.200000 11.300000 1636097290000 1970-01-01 - aaa 12 22 2.200000 12.300000 1636097890000 1970-01-01 - aaa 11 22 1.2 1.3 1637000000000 2021-11-16 - ----- ---- ---- ---------- ----------- --------------- ------------ + +2. The window range is `2 PRECEDING AND CURRENT ROW`, so in the above table, the actual window is extracted, and the request row is the smallest row with no preceding two rows, but the window includes the current row, so the window only contains the request row. +3. For window aggregation, the sum of column c3 for the data within the window (only one row) is calculated, resulting in 22. 
Therefore, the output result is: + +```sql +----- ---- ----------- +c1 c2 w1_c3_sum +----- ---- ----------- +aaa 11 22 +----- ---- ----------- ``` -2. The window range is `2 PRECEDING AND CURRENT ROW`, so we cut out the real window in the above table, the request row is the largest row, so the previous 2 rows are exist, and the window contains the current row, so the window has 3 rows. -3. Window aggregation is performed, to sum `c3` of the data in the window (3 rows), and we have the result 22*3=66. -The output is: +**The calculation logic for Example 2 is as follows:** + +1. According to the partition of the request line and window by `PARTITION BY`, select the rows where column c1 is "aaa" and sort them in ascending order by column c6. Therefore, theoretically, the intermediate data table after partition and sorting should be as shown in the table below. The request row is the last row after sorting. + +```sql +----- ---- ---- ---------- ----------- --------------- ------------ +c1 c2 c3 c4 c5 c6 c7 +----- ---- ---- ---------- ----------- --------------- ------------ +aaa 11 22 1.200000 11.300000 1636097290000 1970-01-01 +aaa 12 22 2.200000 12.300000 1636097890000 1970-01-01 +aaa 11 22 1.2 1.3 1637000000000 2021-11-16 +----- ---- ---- ---------- ----------- --------------- ------------ ``` - ----- ---- ----------- - c1 c2 w1_c3_sum - ----- ---- ----------- - aaa 11 66 - ----- ---- ----------- + +2. The window range is `2 PRECEDING AND CURRENT ROW`, so the actual window is extracted from the above table, and the two preceding rows of the request row exist, and the current row is also included. Therefore, there are three rows of data in the window. +3. For window aggregation, the sum of column c3 for the data within the window (three rows) is calculated, resulting in 22 + 22 + 22 = 66. Therefore, the output result is: + +```sql +----- ---- ----------- +c1 c2 w1_c3_sum +----- ---- ----------- +aaa 11 66 +----- ---- ----------- ``` + diff --git a/docs/en/quickstart/python_sdk.md b/docs/en/quickstart/python_sdk.md deleted file mode 100644 index efd6d8f551e..00000000000 --- a/docs/en/quickstart/python_sdk.md +++ /dev/null @@ -1,213 +0,0 @@ -# Python SDK Quickstart - -## 1. Install the Python SDK Package - -Install using `pip`. - -```bash -pip install openmldb -``` - -## 2. OpenMLDB DBAPI - -### 2.1 Create Connection - -When creating the connection, the database name is **required** to exist. If it does not exist, you need to create the database before the connection is created. Or you can create a connection without database, then `execute("USE ")` to set the database. 
- -````python -import openmldb.dbapi - -db = openmldb.dbapi.connect(zk="$zkcluster", zkPath="$zkpath") - -cursor = db.cursor() -```` - -### 2.2 Create Database - -````python -cursor.execute("CREATE DATABASE db1") -cursor.execute("USE db1") -```` - -### 2.3 Create Table - -````python -cursor.execute("CREATE TABLE t1 (col1 bigint, col2 date, col3 string, col4 string, col5 int, index(key=col3, ts=col1))") -```` - -### 2.4 Insert Data to Table - -````python -cursor.execute("INSERT INTO t1 VALUES(1000, '2020-12-25', 'guangdon', 'shenzhen', 1)") -```` - -### 2.5 Execute SQL Query - -````python -result = cursor.execute("SELECT * FROM t1") -print(result.fetchone()) -print(result.fetchmany(10)) -print(result.fetchall()) -```` - -### 2.6 Delete Table - -````python -cursor.execute("DROP TABLE t1") -```` - -### 2.7 Delete Database - -````python -cursor.execute("DROP DATABASE db1") -```` - -### 2.8 Close the Connection - -````python -cursor.close() -```` - -## 3. OpenMLDB SQLAlchemy - -### 3.1 Create Connection - -`create_engine('openmldb:///db_name?zk=zkcluster&zkPath=zkpath')` -When creating the connection, the database is **required** to exist. If it does not exist, you need to create the database before the connection is created. Or you can create a connection without database, then `execute("USE ")` to set the database. - -````python -import sqlalchemy as db - -engine = db.create_engine('openmldb:///?zk=127.0.0.1:2181&zkPath=/openmldb') -connection = engine.connect() -```` - -### 3.2 Create Database - -Create a database using the `connection.execute()`: - -````python -try: - connection.execute("CREATE DATABASE db1") -except Exception as e: - print(e) -connection.execute("USE db1") -```` - -### 3.3 Create Table - -Create a table using the `connection.execute()`: - -````python -try: - connection.execute("CREATE TABLE t1 ( col1 bigint, col2 date, col3 string, col4 string, col5 int, index(key=col3, ts=col1))") -except Exception as e: - print(e) -```` - -### 3.4 Insert Data into the Table - -Using the `connection.execute(ddl)` to execute the SQL insert statement to insert data to the table: - -````python -try: - connection.execute("INSERT INTO t1 VALUES(1000, '2020-12-25', 'guangdon', 'shenzhen', 1);") -except Exception as e: - print(e) -```` - -Using the `connection.execute(ddl, data)` to execute the insert statement of SQL with the placeholder, and the inserted data can be dynamically specified: - -````python -try: - insert = "INSERT INTO t1 VALUES(1002, '2020-12-27', ?, ?, 3);" - connection.execute(insert, ({"col3":"fujian", "col4":"fuzhou"})) -except Exception as e: - print(e) -```` - -### 3.5 Execute SQL Batch Query - -Using the `connection.execute(sql)` to execute SQL batch query statements: - -````python -try: - rs = connection.execute("SELECT * FROM t1") - for row in rs: - print(row) - rs = connection.execute("SELECT * FROM t1 WHERE col3 = ?;", ('hefei')) -except Exception as e: - print(e) -```` - -### 3.6 Execute SQL Queries in the Request Mode - -Using the `connection.execute(sql, request)` to execute SQLs in the request mode. You can put the input request row in the second parameter. 
- -````python -try: - rs = connection.execute("SELECT * FROM t1", ({"col1":9999, "col2":'2020-12-27', "col3":'zhejiang', "col4":'hangzhou', " col5":100})) -except Exception as e: - print(e) -```` - -### 3.7 Delete Table - -Using the `connection.execute(ddl)` interface to delete a table: - -````python -try: - connection.execute("DROP TABLE t1") -except Exception as e: - print(e) -```` - -### 3.8 Delete Database - -Using the `connection.execute(ddl)` interface to delete a database: - -````python -try: - connection.execute("DROP DATABASE db1") -except Exception as e: - print(e) -```` - -## 4. Notebook Magic Function - -OpenMLDB Python SDK supports Notebook magic function extension, you can use the following statement to register the function. - -````python -import openmldb - -db = openmldb.dbapi.connect(database='demo_db',zk='0.0.0.0:2181',zkPath='/openmldb') -openmldb.sql_magic.register(db) -```` - -The line magic function `%sql` and block magic function `%%sql` can then be used in Notebook. - -![img](images/openmldb_magic_function.png) - -## Example - -See [Python quickstart demo](https://github.com/4paradigm/OpenMLDB/tree/main/demo/python_quickstart/demo.py), including the usage of DBAPI and SQLAlchemy as shown previously. - -## Option - -Connect to cluster must set `zk` and `zkPath`. - -Connect to standalone must set `host` and `port`. - -Whether use dbapi or url to start Python client, optional options are the same with JAVA client, ref[JAVA SDK Option](./java_sdk.md#5-sdk-option)。 - -## Q&A -Q: How to solve `ImportError: dlopen(.._sql_router_sdk.so, 2): initializer function 0xnnnn not in mapped image for ` when use sqlalchemy? -A: The problem often happends when you import other complicate libs with `import openmldb`, the dl load is wrong. Please use the virtual env(e.g. conda) to test it, and make `import openmldb` to be the 1st import and `import sqlalchemy` to be the 2rd. - -If it can't help, please use `request` http to connect the apiserver. - -Q: How to solve the protobuf error? -``` -[libprotobuf FATAL /Users/runner/work/crossbow/crossbow/vcpkg/buildtrees/protobuf/src/23fa7edd52-3ba2225d30.clean/src/google/protobuf/stubs/common.cc:87] This program was compiled against version 3.6.1 of the Protocol Buffer runtime library, which is not compatible with the installed version (3.15.8). Contact the program author for an update. ... -``` -A: Maybe other libs includes a different version of protobuf, try virtual env(e.g. conda). diff --git a/docs/en/quickstart/rest_api.md b/docs/en/quickstart/rest_api.md deleted file mode 100644 index 00694c84da1..00000000000 --- a/docs/en/quickstart/rest_api.md +++ /dev/null @@ -1,312 +0,0 @@ -# REST APIs - -## Important Information - -- As REST APIs interact with the OpenMLDB servers via APIServer, the APIServer must be deployed. The APIServer is an optional module, please refer to [this document](../deploy/install_deploy.md#Deploy-APIServer) for the deployment. -- Currently, APIServer is mainly designed for function development and testing, thus it is not suggested to use it for performance benchmarking and deployed in production. There is no high-availability for the APIServer, and it also introduces overhead of networking and encoding/decoding. - -## Data Insertion - -The request URL: http://ip:port/dbs/{db_name}/tables/{table_name} - -HTTP method: PUT - -The request body: -```json -{ - "value": [ - [v1, v2, v3] - ] -} -``` - -+ Only one record can be inserted at a time. -+ The data layout should be arranged according to the schema strictly. 
- -**Example** - -```batch -curl http://127.0.0.1:8080/dbs/db/tables/trans -X PUT -d '{ -"value": [ - ["bb",24,34,1.5,2.5,1590738994000,"2020-05-05"] -]}' -``` -The response: - -```json -{ - "code":0, - "msg":"ok" -} -``` - -## Real-Time Feature Extraction - -The request URL: http://ip:port/dbs/{db_name}/deployments/{deployment_name} - -HTTP method: POST - -The request body: -- array style -``` -{ - "input": [["row0_value0", "row0_value1", "row0_value2"], ["row1_value0", "row1_value1", "row1_value2"], ...], - "need_schema": false -} -``` -- json style -```json -{ - "input": [ - {"col0":"row0_value0", "col1":"row0_value1", "col2":"row0_value2", "foo": "bar"}, - {"col0":"row1_value0", "col1":"row1_value1", "col2":"row1_value2"}, - ... - ] -} -``` - -+ Multiple rows of input are supported, whose returned values correspond to the fields in the `data.data` array. -+ A schema will be returned if `need_schema` is `true`. Optional, default is `false`. -+ If input is array style, the response data is array style. If input is json style, the response data is json style. DO NOT use multi styles in one request input. -+ Json style input can provide redundancy columns. - -**Example** - -- array style -```bash -curl http://127.0.0.1:8080/dbs/demo_db/deployments/demo_data_service -X POST -d'{ - "input": [["aaa", 11, 22, 1.2, 1.3, 1635247427000, "2021-05-20"]] - }' -``` - -response: - -```json -{ - "code":0, - "msg":"ok", - "data":{ - "data":[["aaa",11,22]] - } -} -``` - -- json style -```bash -curl http://127.0.0.1:8080/dbs/demo_db/deployments/demo_data_service -X POST -d'{ - "input": [{"c1":"aaa", "c2":11, "c3":22, "c4":1.2, "c5":1.3, "c6":1635247427000, "c7":"2021-05-20", "foo":"bar"}] - }' -``` - -response: - -```json -{ - "code":0, - "msg":"ok", - "data":{ - "data":[{"c1":"aaa","c2":11,"w1_c3_sum":22}] - } -} -``` - -## Query - -The request URL: http://ip:port/dbs/{db_name} - -HTTP method: POST - -request body: - -```json -{ - "mode": "", - "sql": "", - "input": { - "schema": [], - "data": [] - } -} -``` - -- "mode" can be: "offsync", "offasync", "online" -- "input" is optional -- "schema" all supported types (case-insensitive): -`Bool`, `Int16`, `Int32`, `Int64`, `Float`, `Double`, `String`, `Date` and `Timestamp`. - -**Request Body Example** - -- Normal query: - -```json -{ - "mode": "online", - "sql": "select 1" -} -``` - -The response: - -```json -{ - "code":0, - "msg":"ok", - "data": { - "schema":["Int32"], - "data":[[1]] - } -} -``` - -- Parameterized query: - -```json -{ - "mode": "online", - "sql": "SELECT c1, c2, c3 FROM demo WHERE c1 = ? 
AND c2 = ?", - "input": { - "schema": ["Int32", "String"], - "data": [1, "aaa"] - } -} -``` - -The response: - -```json -{ - "code":0, - "msg":"ok", - "data": { - "schema": ["Int32", "String", "Float"], - "data": [[1, "aaa", 1.2], [1, "aaa", 3.4]] - } -} -``` - -## Get Deployment Info - - -The request URL: http://ip:port/dbs/{db_name}/deployments/{deployment_name} - -HTTP method: Get - -The response: - -```json -{ - "code": 0, - "msg": "ok", - "data": { - "name": "", - "procedure": "", - "input_schema": [ - - ], - "input_common_cols": [ - - ], - "output_schema": [ - - ], - "output_common_cols": [ - - ], - "dbs": [ - - ], - "tables": [ - - ] - } -} -``` - - -## List Database - -The request URL: http://ip:port/dbs - -HTTP method: Get - -The response: - -```json -{ - "code": 0, - "msg": "ok", - "dbs": [ - - ] -} -``` - -## List Table - -The request URL: http://ip:port/dbs/{db}/tables - -HTTP method: Get - -The response: - -```json -{ - "code": 0, - "msg": "ok", - "tables": [ - { - "name": "", - "table_partition_size": 8, - "tid": , - "partition_num": 8, - "replica_num": 2, - "column_desc": [ - { - "name": "", - "data_type": "", - "not_null": false - } - ], - "column_key": [ - { - "index_name": "", - "col_name": [ - - ], - "ttl": { - - } - } - ], - "added_column_desc": [ - - ], - "format_version": 1, - "db": "", - "partition_key": [ - - ], - "schema_versions": [ - - ] - } - ] -} -``` - -## Refresh APIServer metadata cache - -The request URL: http://ip:port/refresh - -HTTP method: POST - -Empty request body. - -The response: - -```json -{ - "code":0, - "msg":"ok" -} -``` diff --git a/docs/en/quickstart/sdk/cpp_sdk.md b/docs/en/quickstart/sdk/cpp_sdk.md new file mode 100644 index 00000000000..59f4a284a63 --- /dev/null +++ b/docs/en/quickstart/sdk/cpp_sdk.md @@ -0,0 +1,117 @@ +# C++ SDK + +## C++SDK package compilation and installation + +```plain +git clone git@github.com:4paradigm/OpenMLDB.git +cd OpenMLDB +make && make install +``` + +## Write user code + +The following code demonstrates the basic use of C++ SDK. openmldb_api.h and sdk/result_set.h is the header file that must be included. + +```c++ +#include +#include +#include + +#include "openmldb_api.h" +#include "sdk/result_set.h" + +int main() +{ + //Create and initialize the OpenmldbHandler object + //Stand-alone version: parameter (ip, port), such as: OpenmldbHandler handler ("127.0.0.1", 6527); + //Cluster version: parameters (ip: port, path), such as: OpenmldbHandler handler ("127.0.0.1:6527", "/openmldb"); + //Take the stand-alone version as an example. + OpenmldbHandler handler("127.0.0.1", 6527); + + // Define database name + std::time_t t = std::time(0); + std::string db = "test_db" + std::to_string(t); + + // Create SQL statement and database + std::string sql = "create database " + db + ";"; + // Execute the SQL statement. The execute() function returns the bool value. 
A value of true indicates correct execution + std::cout << execute(handler, sql); + + // Create SQL statement and use database + sql = "use " + db + ";"; + std::cout << execute(handler, sql); + + // Create SQL statement and create table + sql = "create table test_table (" + "col1 string, col2 bigint," + "index(key=col1, ts=col2));"; + std::cout << execute(handler, sql); + + // Create SQL statements and insert rows into the table + sql = "insert test_table values(\"hello\", 1)"; + std::cout << execute(handler, sql); + sql = "insert test_table values(\"Hi~\", 2)"; + std::cout << execute(handler, sql); + + // Basic mode + sql = "select * from test_table;"; + std::cout << execute(handler, sql); + + // Get the latest SQL execution result + auto res = get_resultset(); + // Output SQL execution results + print_resultset(res); + // The output in this example should be: + // +-------+--------+ + // | col1 | col2 | + // +-------+--------+ + // | hello | 1 | + // | Hi~ | 2 | + // +-------+---------+ + + + + // Band-parameter mode + //The position of the parameters to be filled in the SQL statement is set to "?" to express + sql = "select * from test_table where col1 = ? ;"; + // Create a ParameterRow object for filling parameters + ParameterRow para(&handler); + // Fill in parameters + para << "Hi~"; + // Execute SQL statement execute_parameterized() function returns the bool value. A value of true indicates correct execution + execute_parameterized(handler, db, sql, para); + res = get_resultset(); + print_resultset(res); + // The output in this example should be: + // +------+--------+ + // | col1 | col2 | + // +------+-------+ + // | Hi~ | 2 | + // +------+--------+ + + + // Request mode + sql = "select col1, sum(col2) over w as w_col2_sum from test_table " + "window w as (partition by test_table.col1 order by test_table.col2 " + "rows between 2 preceding and current row);"; + RequestRow req(&handler, db, sql); + req << "Hi~" << 3l; + execute_request(req); + res = get_resultset(); + print_resultset(res); + // The output in this example should be: + // +------+--------------------+ + // | col1 | w_col2_sum | + // +------+--------------------+ + // | Hi~ | 5 | + // +------+--------------------+ +} +``` + +## Compile and run + +```plain +gcc .cxx -o -lstdc++ -std=c++17 -I/include -L/lib -lopenmldbsdk -lpthread +./ +``` + diff --git a/docs/en/quickstart/go_sdk.md b/docs/en/quickstart/sdk/go_sdk.md similarity index 62% rename from docs/en/quickstart/go_sdk.md rename to docs/en/quickstart/sdk/go_sdk.md index dd6eaca40f6..c30cbb2e502 100644 --- a/docs/en/quickstart/go_sdk.md +++ b/docs/en/quickstart/sdk/go_sdk.md @@ -1,46 +1,54 @@ -# Go SDK Quickstart +# Go SDK -**Requirements**: -- OpenMLDB Version >= 0.7.0 -- API server component is running +## Requirement -## 1. Install the Go SDK Package +- OpenMLDB version: >= v0.6.2 + +- Deploy and run APIServer (refer to [APIServer deployment](https://openmldb.ai/docs/zh/main/deploy/install_deploy.html#apiserver) document) + +## Go SDK package installment ```bash go get github.com/4paradigm/OpenMLDB/go ``` -## 2. API +## Go SDK usage -### 2.1 Connect +This section describes the basic use of Go SDK. -Go SDK connects to API server. +### Connect to OpenMLDB -```go +The Go SDK needs to be connected to the API server. + +```Go db, err := sql.Open("openmldb", "openmldb://127.0.0.1:8080/test_db") ``` -The DSN schema is +The format of data source (DSN) is: -``` +```plain openmldb://API_SERVER_HOST[:API_SERVER_PORT]/DB_NAME ``` -Note that an existed database is required. 
+You must connect to an existing database. -### 2.2 Create Table +### Create Table -```go +Create a table `demo`: + +```Go db.ExecContext(ctx, "CREATE TABLE demo(c1 int, c2 string);") ``` -### 2.3 Insert Value +### Insert data + +Insert date into table: ```go db.ExecContext(ctx, `INSERT INTO demo VALUES (1, "bb"), (2, "bb");`) ``` -### 2.4 Query +### Query ```go rows, err := db.QueryContext(ctx, `SELECT c1, c2 FROM demo;`) @@ -59,21 +67,21 @@ for rows.Next() { } ``` -### 3. An Example +## Example -```go +```Go package main import ( - "context" - "database/sql" + "context" + "database/sql" - // register openmldb driver - _ "github.com/4paradigm/OpenMLDB/go" + // 加载 OpenMLDB SDK + _ "github.com/4paradigm/OpenMLDB/go" ) func main() { - db, err := sql.Open("openmldb", "openmldb://127.0.0.1:8080/test_db") + db, err := sql.Open("openmldb", "openmldb://127.0.0.1:8080/test_db") if err != nil { panic(err) } @@ -106,3 +114,4 @@ func main() { } } ``` + diff --git a/docs/en/quickstart/sdk/index.rst b/docs/en/quickstart/sdk/index.rst new file mode 100644 index 00000000000..2eec974bee0 --- /dev/null +++ b/docs/en/quickstart/sdk/index.rst @@ -0,0 +1,12 @@ +============================= +SDK +============================= + +.. toctree:: + :maxdepth: 1 + + java_sdk + python_sdk + rest_api + go_sdk + cpp_sdk diff --git a/docs/en/quickstart/sdk/java_sdk.md b/docs/en/quickstart/sdk/java_sdk.md new file mode 100644 index 00000000000..629c715d5e6 --- /dev/null +++ b/docs/en/quickstart/sdk/java_sdk.md @@ -0,0 +1,465 @@ +# Java SDK + +## Java SDK package installation + +- Installing Java SDK package on Linux + + Configure the maven pom: + +```XML + + com.4paradigm.openmldb + openmldb-jdbc + 0.7.2 + + + com.4paradigm.openmldb + openmldb-native + 0.7.2 + +``` + +- Installing Java SDK package on Mac + + Configure the maven pom + +```XML + + com.4paradigm.openmldb + openmldb-jdbc + 0.7.2 + + + com.4paradigm.openmldb + openmldb-native + 0.7.2-macos + +``` + +Note: Since the openmldb-native package contains the C++ static library compiled for OpenMLDB, it is defaults to the Linux static library. For macOS, the version of openmldb-native should be changed to `0.7.2-macos`, while the version of openmldb-jdbc should remain unchanged. + +The macOS version of openmldb-native only supports macOS 12. To run it on macOS 11 or macOS 10.15, the openmldb-native package needs to be compiled from source code on the corresponding OS. For detailed compilation methods, please refer to [Concurrent Compilation of Java SDK](https://openmldb.ai/docs/zh/main/deploy/compile.html#java-sdk). + +To connect to the OpenMLDB service using the Java SDK, you can use JDBC (recommended) or connect directly through SqlClusterExecutor. The following will demonstrate both connection methods in order. + +## JDBC method + +The connection method using JDBC is as follows: + +```java +Class.forName("com._4paradigm.openmldb.jdbc.SQLDriver"); +// No database in jdbcUrl +Connection connection = DriverManager.getConnection("jdbc:openmldb:///?zk=localhost:6181&zkPath=/openmldb"); + +// Set database in jdbcUrl +Connection connection1 = DriverManager.getConnection("jdbc:openmldb:///test_db?zk=localhost:6181&zkPath=/openmldb"); +``` + +The database specified in the Connection address must exist when creating the connection. + +```{caution} +he default execution mode for JDBC Connection is `online`. +``` + +### Usage overview + +All SQL commands can be executed using `Statement`, both in online and offline modes. 
To switch between offline and online modes, use the command `SET @@execute_mode='...';`. For example:
+
+```java
+Statement stmt = connection.createStatement();
+stmt.execute("SET @@execute_mode='offline'"); // Switch to offline mode
+stmt.execute("SELECT * from t1"); // Offline select
+ResultSet res = stmt.getResultSet(); // The ResultSet of the previous execute
+
+stmt.execute("SET @@execute_mode='online'"); // Switch to online mode
+res = stmt.executeQuery("SELECT * from t1"); // For online mode, select or executeQuery can directly obtain the ResultSet result.
+```
+
+The `LOAD DATA` command is an asynchronous command, and the returned ResultSet contains information such as the job ID and state. You can execute `show job` with that job ID to check whether the job has been completed. Note that you need to call the ResultSet's `next()` method to move the cursor to the first row of data.
+
+It is also possible to change it to a synchronous command:
+
+```SQL
+SET @@sync_job=true;
+```
+
+If the actual execution time of the synchronous command exceeds the default maximum idle wait time of 0.5 hours, please [adjust the configuration](https://openmldb.ai/docs/zh/main/openmldb_sql/ddl/SET_STATEMENT.html#id4).
+
+### PreparedStatement
+
+`PreparedStatement` supports `SELECT`, `INSERT`, and `DELETE` operations. Note that `INSERT` only supports online insertion.
+
+```java
+PreparedStatement selectStatement = connection.prepareStatement("SELECT * FROM t1 WHERE id=?");
+PreparedStatement insertStatement = connection.prepareStatement("INSERT INTO t1 VALUES (?,?)");
+PreparedStatement deleteStatement = connection.prepareStatement("DELETE FROM t1 WHERE id=?");
+```
+
+## SqlClusterExecutor method
+
+### Creating a SqlClusterExecutor
+
+First, configure the OpenMLDB connection parameters.
+
+```java
+SdkOption option = new SdkOption();
+option.setZkCluster("127.0.0.1:2181");
+option.setZkPath("/openmldb");
+option.setSessionTimeout(10000);
+option.setRequestTimeout(60000);
+```
+
+Then, use SdkOption to create the Executor.
+
+```java
+sqlExecutor = new SqlClusterExecutor(option);
+```
+
+SQL execution through `SqlClusterExecutor` is thread-safe, so in actual environments a single `SqlClusterExecutor` can be created. However, since the execution mode (execute_mode) is an internal variable of `SqlClusterExecutor`, if you want to execute an offline command and an online command at the same time, unexpected results may occur. In this case, please use multiple `SqlClusterExecutor` instances.
+
+```{caution}
+The default execution mode for SqlClusterExecutor is offline, which is different from the default mode for JDBC.
+```
+
+### Statement
+
+`SqlClusterExecutor` can obtain a `Statement` similar to the JDBC approach and can use `Statement::execute`.
+
+```java
+java.sql.Statement state = sqlExecutor.getStatement();
+try {
+    state.execute("create database db_test");
+} catch (Exception e) {
+    e.printStackTrace();
+} finally {
+    state.close();
+}
+```
+
+Note that `SqlClusterExecutor` does not have the concept of a default database, so you need to execute a `USE` command to select a database before you can continue to create tables.
+
+```java
+java.sql.Statement state = sqlExecutor.getStatement();
+try {
+    state.execute("use db_test");
+    String createTableSql = "create table trans(c1 string,\n" +
+                    "                   c3 int,\n" +
+                    "                   c4 bigint,\n" +
+                    "                   c5 float,\n" +
+                    "                   c6 double,\n" +
+                    "                   c7 timestamp,\n" +
+                    "                   c8 date,\n" +
+                    "                   index(key=c1, ts=c7));";
+    state.execute(createTableSql);
+} catch (Exception e) {
+    e.printStackTrace();
+} finally {
+    state.close();
+}
+```
+
+#### Executing batch SQL queries with Statement
+
+Use the `Statement::execute` interface to execute batch SQL queries:
+
+```java
+java.sql.Statement state = sqlExecutor.getStatement();
+try {
+    state.execute("use db_test");
+    // The default execution mode for sqlExecutor is offline. If the mode has not been changed to online before, the execution mode needs to be set to online here.
+    state.execute("SET @@execute_mode='online';");
+    // If the return value of execute is true, it means that the operation is successful, and the result can be obtained through getResultSet.
+    boolean ret = state.execute("select * from trans;");
+    Assert.assertTrue(ret);
+    java.sql.ResultSet rs = state.getResultSet();
+} catch (Exception e) {
+    e.printStackTrace();
+}
+```
+
+Accessing query results:
+
+```java
+// Accessing the ResultSet and printing the first three columns of data.
+try {
+    while (resultSet.next()) {
+        System.out.println(resultSet.getString(1) + "," + resultSet.getInt(2) + "," + resultSet.getLong(3));
+    }
+} catch (SQLException e) {
+    e.printStackTrace();
+} finally {
+    try {
+        if (resultSet != null) {
+            resultSet.close();
+        }
+    } catch (SQLException throwables) {
+        throwables.printStackTrace();
+    }
+}
+```
+
+### PreparedStatement
+
+`SqlClusterExecutor` can also obtain a `PreparedStatement`, but you need to specify which type of `PreparedStatement` to obtain. For example, when using InsertPreparedStmt for insertion operations, there are three ways to do it.
+
+```{note}
+Insert operations only support online mode and are not affected by the execution mode. The data will always be inserted into the online database.
+```
+
+#### Common Insert
+
+1. Use the `SqlClusterExecutor::getInsertPreparedStmt(db, insertSql)` method to get the InsertPrepareStatement.
+2. Use the `PreparedStatement::execute()` method to execute the insert statement.
+
+```java
+String insertSql = "insert into trans values(\"aa\",23,33,1.4,2.4,1590738993000,\"2020-05-04\");";
+java.sql.PreparedStatement pstmt = null;
+try {
+    pstmt = sqlExecutor.getInsertPreparedStmt(db, insertSql);
+    Assert.assertTrue(pstmt.execute());
+} catch (SQLException e) {
+    e.printStackTrace();
+    Assert.fail();
+} finally {
+    if (pstmt != null) {
+        try {
+            // After using the PrepareStatement, it must be closed.
+            pstmt.close();
+        } catch (SQLException throwables) {
+            throwables.printStackTrace();
+        }
+    }
+}
+```
+
+#### Insert With Placeholder
+
+1. Get the InsertPrepareStatement by calling the `SqlClusterExecutor::getInsertPreparedStmt(db, insertSqlWithPlaceHolder)` interface.
+2. Use the `PreparedStatement::setType(index, value)` interface to fill in data to the InsertPrepareStatement. Note that the index starts from 1.
+3. Use the `PreparedStatement::execute()` interface to execute the insert statement.
+
+```{note}
+When the conditions of the PreparedStatement are the same, you can repeatedly call the set method of the same object to fill in data before executing execute(). There is no need to create a new PreparedStatement object.
+```
+
+```java
+String insertSqlWithPlaceHolder = "insert into trans values(\"aa\", ?, 33, ?, 2.4, 1590738993000, \"2020-05-04\");";
+java.sql.PreparedStatement pstmt = null;
+try {
+    pstmt = sqlExecutor.getInsertPreparedStmt(db, insertSqlWithPlaceHolder);
+    pstmt.setInt(1, 24);
+    pstmt.setFloat(2, 1.5f);
+    pstmt.execute();
+} catch (SQLException e) {
+    e.printStackTrace();
+    Assert.fail();
+} finally {
+    if (pstmt != null) {
+        try {
+            // After using the PrepareStatement, it must be closed.
+            pstmt.close();
+        } catch (SQLException throwables) {
+            throwables.printStackTrace();
+        }
+    }
+}
+```
+
+```{note}
+After execute, the cached data will be cleared and it is not possible to retry execute.
+```
+
+#### Batch Insert With Placeholder
+
+1. To use batch insert, first obtain the InsertPrepareStatement using the `SqlClusterExecutor::getInsertPreparedStmt(db, insertSqlWithPlaceHolder)` interface.
+2. Then use the `PreparedStatement::setType(index, value)` interface to fill data into the InsertPrepareStatement.
+3. Use the `PreparedStatement::addBatch()` interface to complete filling for one row.
+4. Continue to use `setType(index, value)` and `addBatch()` to fill multiple rows.
+5. Use the `PreparedStatement::executeBatch()` interface to complete the batch insertion.
+
+```java
+String insertSqlWithPlaceHolder = "insert into trans values(\"aa\", ?, 33, ?, 2.4, 1590738993000, \"2020-05-04\");";
+java.sql.PreparedStatement pstmt = null;
+try {
+    pstmt = sqlExecutor.getInsertPreparedStmt(db, insertSqlWithPlaceHolder);
+    pstmt.setInt(1, 24);
+    pstmt.setFloat(2, 1.5f);
+    pstmt.addBatch();
+    pstmt.setInt(1, 25);
+    pstmt.setFloat(2, 1.7f);
+    pstmt.addBatch();
+    pstmt.executeBatch();
+} catch (SQLException e) {
+    e.printStackTrace();
+    Assert.fail();
+} finally {
+    if (pstmt != null) {
+        try {
+            // After using the PrepareStatement, it must be closed.
+            pstmt.close();
+        } catch (SQLException throwables) {
+            throwables.printStackTrace();
+        }
+    }
+}
+```
+
+```{note}
+After executeBatch(), all cached data will be cleared and it is not possible to retry executeBatch().
+```
+
+### Execute SQL request query
+
+`RequestPreparedStmt` is a unique query mode (not supported by JDBC). This mode requires both the SQL and a row of request data, so you need to provide the SQL when calling `getRequestPreparedStmt` and then set the request data through the `setType` interfaces.
+
+There are three steps to execute a SQL request query:
+
+```{note}
+Request queries only support online mode and are not affected by the execution mode. They must be performed as online request queries.
+```
+
+1. Use the `SqlClusterExecutor::getRequestPreparedStmt(db, selectSql)` interface to obtain a RequestPrepareStatement.
+2. Call the `PreparedStatement::setType(index, value)` interface to set the request data. Please call the appropriate `setType` interface and provide valid values based on the data type of the corresponding column in the data table.
+3. Call the `Statement::executeQuery()` interface to execute the request-style query statement.
+ +```java +String selectSql = "SELECT c1, c3, sum(c4) OVER w1 as w1_c4_sum FROM trans WINDOW w1 AS " + + "(PARTITION BY trans.c1 ORDER BY trans.c7 ROWS BETWEEN 2 PRECEDING AND CURRENT ROW);"; +PreparedStatement pstmt = null; +ResultSet resultSet = null; +/* +c1 string,\n" + + " c3 int,\n" + + " c4 bigint,\n" + + " c5 float,\n" + + " c6 double,\n" + + " c7 timestamp,\n" + + " c8 date,\n" + +*/ +try { + // Step 1,get RequestPrepareStatement + pstmt = sqlExecutor.getRequestPreparedStmt(db, selectSql); + + // Step 2,To execute the request mode, you need to set a row of request data in the RequestPreparedStatement. + pstmt.setString(1, "bb"); + pstmt.setInt(2, 24); + pstmt.setLong(3, 34l); + pstmt.setFloat(4, 1.5f); + pstmt.setDouble(5, 2.5); + pstmt.setTimestamp(6, new Timestamp(1590738994000l)); + pstmt.setDate(7, Date.valueOf("2020-05-05")); + + // Calling executeQuery will execute the select sql, and then put the result in the resultSet + resultSet = pstmt.executeQuery(); + + // Access resultSet + Assert.assertEquals(resultSet.getMetaData().getColumnCount(), 3); + Assert.assertTrue(resultSet.next()); + Assert.assertEquals(resultSet.getString(1), "bb"); + Assert.assertEquals(resultSet.getInt(2), 24); + Assert.assertEquals(resultSet.getLong(3), 34); + + // The return result set of the ordinary request query contains only one row of results. Therefore, the result of the second call to resultSet. next() is false + Assert.assertFalse(resultSet.next()); + +} catch (SQLException e) { + e.printStackTrace(); + Assert.fail(); +} finally { + try { + if (resultSet != null) { + // result用完之后需要close + resultSet.close(); + } + if (pstmt != null) { + pstmt.close(); + } + } catch (SQLException throwables) { + throwables.printStackTrace(); + } +} +``` + +### Delete all data of a key under the specified index + +There are two ways to delete data through the Java SDK: + +- Execute delete SQL directly + +- Use delete PreparedStatement + +Note that this can only delete data under one index, not all indexes. Refer to [DELETE function boundary](https://openmldb.ai/docs/zh/main/quickstart/function_boundary.html#delete) for details. + +```java +java.sql.Statement state = router.getStatement(); +try { + String sql = "DELETE FROM t1 WHERE col2 = 'key1';"; + state.execute(sql); + sql = "DELETE FROM t1 WHERE col2 = ?;"; + java.sql.PreparedStatement p1 = router.getDeletePreparedStmt("test", sql); + p1.setString(1, "key2"); + p1.executeUpdate(); + p1.close(); +} catch (Exception e) { + e.printStackTrace(); + Assert.fail(); +} finally { + try { + state.close(); + } catch (Exception e) { + e.printStackTrace(); + } +} +``` + +### A complete example of using SqlClusterExecutor + +Refer to the [Java quickstart demo](https://github.com/4paradigm/OpenMLDB/tree/main/demo/java_quickstart/demo). If it is used on macOS, please use openmldb-native of macOS version and increase the dependency of openmldb-native. + +Compile and run: + +``` +mvn package +java -cp target/demo-1.0-SNAPSHOT.jar com.openmldb.demo.App +``` + +## SDK Configuration Details + +You must fill in `zkCluster` and `zkPath` (set method or the configuration `foo=bar` after `?` in JDBC). + +### Optional configuration + +| Optional configuration | Description | +| ---------------------- | ------------------------------------------------------------ | +| enableDebug | The default is false. Enable the debug log of hybridse (note that it is not the global debug log). You can view more logs of sql compilation and operation. 
However, not all of these logs are collected by the client. You need to view the tablet server logs. | +| requestTimeout | The default is 60000 ms. This timeout is the rpc timeout sent by the client, except for those sent to the taskmanager (the rpc timeout of the job is controlled by the variable `job_timeout`). | +| glogLevel | The default is 0, which is similar to the minloglevel of the glog. The `INFO/WARNING/ERROR/FATAL` log corresponds to `0/1/2/3` respectively. 0 means to print INFO and the level on. | +| glogDir | The default is empty. When the log directory is empty, it is printed to stderr. This is referring to the console. | +| maxSqlCacheSize | The default is 50, the maximum number of sql caches for a single execution mode of a single database on the client. If there is an error caused by cache obsolescence, you can increase this size to avoid the problem. | +| sessionTimeout | Default 10000 ms, session timeout of zk | +| zkLogLevel | By default, 3, `0/1/2/3/4` respectively means that `all zk logs/error/warn/info/debug are prohibited` | +| zkLogFile | The default is empty, which is printed to stdout. | +| sparkConfPath | The default is empty. You can change the spark conf used by the job through this configuration without configuring the taskmanager to restart. | + +## SQL verification + +The Java client supports the correct verification of SQL to verify whether it is executable. It is divided into batch and request modes. + +- `ValidateSQLInBatch` can verify whether SQL can be executed at the offline end. +- `ValidateSQLInRequest` can verify whether SQL can be deployed online. + +Both interfaces need to go through all table schemas required by SQL. Currently, only single db is supported. Please do not use `db.table` format in SQL statements. + +For example, verify SQL `select count (c1) over w1 from t3 window w1 as (partition by c1 order by c2 rows between unbounded preceding and current row);`, In addition to this statement, you need to go through in the schema of table `t3` as the second parameter schemaMaps. The format is Map, key is the name of the db, and value is all the table schemas (maps) of each db. In fact, only a single db is supported, so there is usually only one db here, as shown in db3 below. The table schema map key under db is table name, and the value is com._ 4paradigm.openmldb.sdk.Schema, consisting of the name and type of each column. + +```java +Map> schemaMaps = new HashMap<>(); +Map dbSchema = new HashMap<>(); +dbSchema = new HashMap<>(); +dbSchema.put("t3", new Schema(Arrays.asList(new Column("c1", Types.VARCHAR), new Column("c2", Types.BIGINT)))); +schemaMaps.put("db3", dbSchema); +List ret = SqlClusterExecutor.validateSQLInRequest("select count(c1) over w1 from t3 window "+ + "w1 as(partition by c1 order by c2 rows between unbounded preceding and current row);", schemaMaps); +Assert.assertEquals(ret.size(), 0); +``` + diff --git a/docs/en/quickstart/sdk/python_sdk.md b/docs/en/quickstart/sdk/python_sdk.md new file mode 100644 index 00000000000..421f6b8ff93 --- /dev/null +++ b/docs/en/quickstart/sdk/python_sdk.md @@ -0,0 +1,241 @@ +# Python SDK + +## Python SDK package installation + +Execute the following command to install the Python SDK package: + +```bash +pip install openmldb +``` + +## OpenMLDB DBAPI usage + +This section demonstrates the basic use of the OpenMLDB DB API. + +### Create connection + +Parameter `db_name` name must exist, and the database must be created before the connection is created. 
To continue, create a connection without a database and then use the database db through the `execute ("USE")` command. + +```python +import openmldb.dbapi +db = openmldb.dbapi.connect(zk="$zkcluster", zkPath="$zkpath") +cursor = db.cursor() +``` + +#### Configuration Details + +Zk and zkPath configuration are required. + +The Python SDK can be used through OpenMLDB DBAPI/SQLAlchemy. The optional configurations are basically the same as those of the Java client. Please refer to the [Java SDK configuration](https://openmldb.ai/docs/zh/main/quickstart/sdk/java_sdk.html#sdk) for details. + +### Create database + +Create database `db1`: + +```python +cursor.execute("CREATE DATABASE db1") +cursor.execute("USE db1") +``` + +### Create table + +Create table `t1`: + +```python +cursor.execute("CREATE TABLE t1 (col1 bigint, col2 date, col3 string, col4 string, col5 int, index(key=col3, ts=col1))") +``` + +### Insert data into the table + +Insert one sentence of data into the table: + +```python +cursor.execute("INSERT INTO t1 VALUES(1000, '2020-12-25', 'guangdon', 'shenzhen', 1)") +``` + +### Execute SQL query + +```python +result = cursor.execute("SELECT * FROM t1") +print(result.fetchone()) +print(result.fetchmany(10)) +print(result.fetchall()) +``` + +### SQL batch request query + +```python +#In the Batch Request mode, the input parameters of the interface are“SQL”, “Common_Columns”, “Request_Columns” +result = cursor.batch_row_request("SELECT * FROM t1", ["col1","col2"], ({"col1": 2000, "col2": '2020-12-22', "col3": 'fujian', "col4":'xiamen', "col5": 2})) +print(result.fetchone()) +``` + +### Delete table + +Delete table `t1`: + +```python +cursor.execute("DROP TABLE t1") +``` + +### Delete database + +Delete database `db1`: + +```python +cursor.execute("DROP DATABASE db1") +``` + +### Close connection + +```python +cursor.close() +``` + +## OpenMLDB SQLAlchemy usage + +This section demonstrates using the Python SDK through OpenMLDB SQLAlchemy. + +### Create connection + +```python +create_engine('openmldb:///db_name?zk=zkcluster&zkPath=zkpath') +``` + +Parameter `db_name` must exist, and the database must be created before the connection is created. First, create a connection without a database, and then use the database `db` through the `execute ("USE")` command. + +```python +import sqlalchemy as db +engine = db.create_engine('openmldb:///?zk=127.0.0.1:2181&zkPath=/openmldb') +connection = engine.connect() +``` + +### Create database + +Use the `connection.execute()` interface to create database `db1`: + +```python +try: + connection.execute("CREATE DATABASE db1") +except Exception as e: + print(e) + +connection.execute("USE db1") +``` + +### Create table + +Use the `connection.execute()` interface to create table `t1`: + +```python +try: + connection.execute("CREATE TABLE t1 ( col1 bigint, col2 date, col3 string, col4 string, col5 int, index(key=col3, ts=col1))") +except Exception as e: + print(e) +``` + +### Insert data into the table + +Use the `connection.execute (ddl)` interface to execute the SQL insert statement, and you can insert data into the table: + +```python +try: + connection.execute("INSERT INTO t1 VALUES(1000, '2020-12-25', 'guangdon', 'shenzhen', 1);") +except Exception as e: + print(e) +``` + +Use the `connection.execute (ddl, data)` interface to execute the insert statement of SQL with placeholder. 
You can specify the insert data dynamically or insert multiple rows: + +```python +try: + insert = "INSERT INTO t1 VALUES(1002, '2020-12-27', ?, ?, 3);" + connection.execute(insert, ({"col3":"fujian", "col4":"fuzhou"})) + connection.execute(insert, [{"col3":"jiangsu", "col4":"nanjing"}, {"col3":"zhejiang", "col4":"hangzhou"}]) +except Exception as e: + print(e) +``` + +### Execute SQL batch query + +Use the `connection.execute (sql)` interface to execute SQL batch query statements: + +```python +try: + rs = connection.execute("SELECT * FROM t1") + for row in rs: + print(row) + rs = connection.execute("SELECT * FROM t1 WHERE col3 = ?;", ('hefei')) + rs = connection.execute("SELECT * FROM t1 WHERE col3 = ?;",[('hefei'), ('shanghai')]) +except Exception as e: + print(e) +``` + +### Execute SQL request query + +Use the `connection.execute (sql, request)` interface to execute the SQL request query. You can put the input data into the second parameter of the execute function: + +```python +try: + rs = connection.execute("SELECT * FROM t1", ({"col1":9999, "col2":'2020-12-27', "col3":'zhejiang', "col4":'hangzhou', "col5":100})) +except Exception as e: + print(e) +``` + +### Delete table + +Use the `connection.execute (ddl)` interface to delete table `t1`: + +```python +try: + connection.execute("DROP TABLE t1") +except Exception as e: + print(e) +``` + +### Delete database + +Use the connection.execute(ddl)interface to delete database `db1`: + +```python +try: + connection.execute("DROP DATABASE db1") +except Exception as e: + print(e) +``` + +## Notebook Magic Function usage + +The OpenMLDB Python SDK supports the expansion of Notebook magic function. Use the following statement to register the function. + +```python +import openmldb +db = openmldb.dbapi.connect(database='demo_db',zk='0.0.0.0:2181',zkPath='/openmldb') +openmldb.sql_magic.register(db) +``` + +Then you can use line magic function `%sql` and block magic function `%%sql` in Notebook. + +![img](https://openmldb.ai/docs/zh/main/_images/openmldb_magic_function.png) + +## The complete usage example + +Refer to the [Python quickstart demo](https://github.com/4paradigm/OpenMLDB/tree/main/demo/python_quickstart/demo.py), including the above DBAPI and SQLAlchemy usage. + +## common problem + +- **What do I do when error** `ImportError:dlopen (.. _sql_router_sdk. so, 2): initializer function 0xnnnn not in mapped image for` **appears when using SQLAlchemy?** + +In addition to import openmldb, you may also import other third-party libraries, which may cause confusion in the loading order. Due to the complexity of the system, you can try to use the virtual env environment (such as conda) to avoid interference. In addition, import openmldb before importing sqlalchemy, and ensure that the two imports are in the first place. + +If the error still occur, it is recommended to connect to OpenMLDB by using request http to connect to apiserver. + +occur + +- **What do I do if Python SDK encountered the following problems?** + +```plain +[libprotobuf FATAL /Users/runner/work/crossbow/crossbow/vcpkg/buildtrees/protobuf/src/23fa7edd52-3ba2225d30.clean/src/google/protobuf/stubs/common.cc:87] This program was compiled against version 3.6.1 of the Protocol Buffer runtime library, which is not compatible with the installed version (3.15.8). Contact the program author for an update. ... +``` + +This problem may be due to the introduction of other versions of protobuf in other libraries. You can try to use the virtual env environment (such as conda). 
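+
+As a minimal illustration of the import-order advice in the first question above (a sketch only, assuming a clean virtual environment such as conda), keep `import openmldb` ahead of `import sqlalchemy` and of other large third-party libraries:
+
+```python
+# Import openmldb first so that its native library is loaded before sqlalchemy
+# or any other heavy third-party package is imported.
+import openmldb.dbapi
+import sqlalchemy as db
+
+# The remaining imports and the connection setup follow as usual.
+engine = db.create_engine('openmldb:///?zk=127.0.0.1:2181&zkPath=/openmldb')
+```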
diff --git a/docs/en/quickstart/sdk/rest_api.md b/docs/en/quickstart/sdk/rest_api.md new file mode 100644 index 00000000000..7d8f3c4a881 --- /dev/null +++ b/docs/en/quickstart/sdk/rest_api.md @@ -0,0 +1,327 @@ +# REST API + +## Important information + +REST APIs interact with the services of APIServer and OpenMLDB, so the APIServer module must be properly deployed to be used effectively. APIServer is an optional module during installation and deployment. Refer to the APIServer deployment document. + +At this stage, APIServer is mainly used for functional testing, not recommended for performance testing, nor recommended for the production environment. The default deployment of APIServer does not have a high availability mechanism at present and introduces additional network and codec overhead. + +## Data insertion + +Request address: http://ip:port/dbs/{db_name}/tables/{table_name} + +Request method: PUT + +The requestor: + +```JSON + { + "value": [ + [v1, v2, v3] + ] + } +``` + +- Currently, it only supports inserting one piece of data. + +- The data should be arranged in strict accordance with the schema. + +Sample request data: + +```bash +curl http://127.0.0.1:8080/dbs/db/tables/trans -X PUT -d '{ +"value": [ + ["bb",24,34,1.5,2.5,1590738994000,"2020-05-05"] +]}' +``` + +Response: + +```json +{ + "code":0, + "msg":"ok" +} +``` + +## Real-time feature computing + +Request address: http://ip:port/dbs/{db_name}/deployments/{deployment_name} + +Request method: POST + +Requestor + +- Array format: + +```json +{ + "input": [["row0_value0", "row0_value1", "row0_value2"], ["row1_value0", "row1_value1", "row1_value2"], ...], + "need_schema": false +} +``` + +- JSON format: + +```json +{ + "input": [ + {"col0":"row0_value0", "col1":"row0_value1", "col2":"row0_value2", "foo": "bar"}, + {"col0":"row1_value0", "col1":"row1_value1", "col2":"row1_value2"}, + ... + ] +} +``` + +- It can support multiple rows, and its results correspond to the array of data.data fields in the returned response one by one. + +- need_schema can be set to true, and the schema with output results will be returned. For optional parameter, the default is false. + +- When the input is in array format/ JSON format, the returned result is also in array format/ JSON format. The input requested at a time only supports one format. Please do not mix formats. + +- Input data in JSON format can have redundant columns. 
+ +**Sample request data** + +Example 1: Array format + +```plain +curl http://127.0.0.1:8080/dbs/demo_db/deployments/demo_data_service -X POST -d'{ + "input": [["aaa", 11, 22, 1.2, 1.3, 1635247427000, "2021-05-20"]] + }' +``` + +Response: + +```JSON +{ + "code":0, + "msg":"ok", + "data":{ + "data":[["aaa",11,22]] + } +} +``` + +Example 2: JSON format + +```JSON +curl http://127.0.0.1:8080/dbs/demo_db/deployments/demo_data_service -X POST -d'{ + "input": [{"c1":"aaa", "c2":11, "c3":22, "c4":1.2, "c5":1.3, "c6":1635247427000, "c7":"2021-05-20", "foo":"bar"}] + }' +``` + +Response: + +```JSON +{ + "code":0, + "msg":"ok", + "data":{ + "data":[{"c1":"aaa","c2":11,"w1_c3_sum":22}] + } +} +``` + +## Query + +Request address: http://ip:port/dbs/ {db_name} + +Request method: POST + +Requestor: + +```JSON +{ + "mode": "", + "sql": "", + "input": { + "schema": [], + "data": [] + } +} +``` + +Request parameters: + +| Parameters | Type | Requirement | Description | +| ---------- | ------ | ----------- | ------------------------------------------------------------ | +| mode | String | Yes | Available for `offsync` , `offasync`, `online` | +| sql | String | Yes | | +| input | Object | No | | +| schema | Array | No | Support data types (case insensitive): `Bool`, `Int16`, `Int32`, `Int64`, `Float`, `Double`, `String`, `Date and Timestamp` | +| data | Array | No | | + +**Sample request data** + +Example 1: General query + +```JSON +{ + "mode": "online", + "sql": "select 1" +} +``` + +Response: + +```JSON +{ + "code":0, + "msg":"ok", + "data": { + "schema":["Int32"], + "data":[[1]] + } +} +``` + +Example 2: Parametric query + +```JSON +{ + "mode": "online", + "sql": "SELECT c1, c2, c3 FROM demo WHERE c1 = ? AND c2 = ?", + "input": { + "schema": ["Int32", "String"], + "data": [1, "aaa"] + } +} +``` + +Response: + +```json +{ + "code":0, + "msg":"ok", + "data": { + "schema": ["Int32", "String", "Float"], + "data": [[1, "aaa", 1.2], [1, "aaa", 3.4]] + } +} +``` + +## Query deployment information + +Request address: http://ip:port/dbs/{db_name}/deployments/{deployment_name} + +Request method: GET + +Response: + +```JSON +{ + "code": 0, + "msg": "ok", + "data": { + "name": "", + "procedure": "", + "input_schema": [ + + ], + "input_common_cols": [ + + ], + "output_schema": [ + + ], + "output_common_cols": [ + + ], + "dbs": [ + + ], + "tables": [ + + ] + } +} +``` + +## Acquire all library names + +Request address: http://ip:port/dbs + +Request method: GET + +Response: + +```json +{ + "code": 0, + "msg": "ok", + "dbs": [ + + ] +} +``` + +## Acquire all table names + +Request address: http://ip:port/dbs/{db}/tables + +Request method: GET + +Response: + +```json +{ + "code": 0, + "msg": "ok", + "tables": [ + { + "name": "", + "table_partition_size": 8, + "tid": , + "partition_num": 8, + "replica_num": 2, + "column_desc": [ + { + "name": "", + "data_type": "", + "not_null": false + } + ], + "column_key": [ + { + "index_name": "", + "col_name": [ + + ], + "ttl": { + + } + } + ], + "added_column_desc": [ + + ], + "format_version": 1, + "db": "", + "partition_key": [ + + ], + "schema_versions": [ + + ] + } + ] +} +``` + +## Refresh APIServer metadata cache + +Request address: http://ip:port/refresh + +Request method: POST + +Response: + +```json +{ + "code":0, + "msg":"ok" +} +``` + diff --git a/docs/poetry.lock b/docs/poetry.lock index a29c0ded014..01f5d11fa68 100644 --- a/docs/poetry.lock +++ b/docs/poetry.lock @@ -1,10 +1,9 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. 
+# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "alabaster" version = "0.7.12" description = "A configurable sidebar-enabled Sphinx theme" -category = "dev" optional = false python-versions = "*" files = [ @@ -16,7 +15,6 @@ files = [ name = "babel" version = "2.10.3" description = "Internationalization utilities" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -31,7 +29,6 @@ pytz = ">=2015.7" name = "beautifulsoup4" version = "4.11.1" description = "Screen-scraping library" -category = "dev" optional = false python-versions = ">=3.6.0" files = [ @@ -50,7 +47,6 @@ lxml = ["lxml"] name = "certifi" version = "2022.12.7" description = "Python package for providing Mozilla's CA Bundle." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -62,7 +58,6 @@ files = [ name = "charset-normalizer" version = "2.1.1" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." -category = "dev" optional = false python-versions = ">=3.6.0" files = [ @@ -77,7 +72,6 @@ unicode-backport = ["unicodedata2"] name = "colorama" version = "0.4.5" description = "Cross-platform colored terminal text." -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -89,7 +83,6 @@ files = [ name = "docutils" version = "0.17.1" description = "Docutils -- Python Documentation Utilities" -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -101,7 +94,6 @@ files = [ name = "idna" version = "3.3" description = "Internationalized Domain Names in Applications (IDNA)" -category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -113,7 +105,6 @@ files = [ name = "imagesize" version = "1.4.1" description = "Getting image size from png/jpeg/jpeg2000/gif file" -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -125,7 +116,6 @@ files = [ name = "importlib-metadata" version = "4.12.0" description = "Read metadata from Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -145,7 +135,6 @@ testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs name = "jinja2" version = "3.1.2" description = "A very fast and expressive template engine." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -163,7 +152,6 @@ i18n = ["Babel (>=2.7)"] name = "linkify-it-py" version = "1.0.3" description = "Links recognition library with FULL unicode support." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -184,7 +172,6 @@ test = ["coverage", "pytest", "pytest-cov"] name = "markdown-it-py" version = "2.2.0" description = "Python port of markdown-it. Markdown parsing, done right!" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -209,7 +196,6 @@ testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] name = "markupsafe" version = "2.1.1" description = "Safely add untrusted strings to HTML/XML markup." 
-category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -259,7 +245,6 @@ files = [ name = "mdit-py-plugins" version = "0.3.0" description = "Collection of plugins for markdown-it-py" -category = "dev" optional = false python-versions = "~=3.6" files = [ @@ -279,7 +264,6 @@ testing = ["coverage", "pytest (>=3.6,<4)", "pytest-cov", "pytest-regressions"] name = "mdurl" version = "0.1.2" description = "Markdown URL utilities" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -291,7 +275,6 @@ files = [ name = "myst-parser" version = "0.18.0" description = "An extended commonmark compliant parser, with bridges to docutils & sphinx." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -319,7 +302,6 @@ testing = ["beautifulsoup4", "coverage[toml]", "pytest (>=6,<7)", "pytest-cov", name = "packaging" version = "21.3" description = "Core utilities for Python packages" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -334,7 +316,6 @@ pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" name = "pydata-sphinx-theme" version = "0.8.1" description = "Bootstrap-based Sphinx theme from the PyData community" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -356,14 +337,13 @@ test = ["pydata-sphinx-theme[doc]", "pytest"] [[package]] name = "pygments" -version = "2.13.0" +version = "2.15.0" description = "Pygments is a syntax highlighting package written in Python." -category = "dev" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "Pygments-2.13.0-py3-none-any.whl", hash = "sha256:f643f331ab57ba3c9d89212ee4a2dabc6e94f117cf4eefde99a0574720d14c42"}, - {file = "Pygments-2.13.0.tar.gz", hash = "sha256:56a8508ae95f98e2b9bdf93a6be5ae3f7d8af858b43e02c5a2ff083726be40c1"}, + {file = "Pygments-2.15.0-py3-none-any.whl", hash = "sha256:77a3299119af881904cd5ecd1ac6a66214b6e9bed1f2db16993b54adede64094"}, + {file = "Pygments-2.15.0.tar.gz", hash = "sha256:f7e36cffc4c517fbc252861b9a6e4644ca0e5abadf9a113c72d1358ad09b9500"}, ] [package.extras] @@ -373,7 +353,6 @@ plugins = ["importlib-metadata"] name = "pyparsing" version = "3.0.9" description = "pyparsing module - Classes and methods to define and execute parsing grammars" -category = "dev" optional = false python-versions = ">=3.6.8" files = [ @@ -388,7 +367,6 @@ diagrams = ["jinja2", "railroad-diagrams"] name = "pytz" version = "2022.2.1" description = "World timezone definitions, modern and historical" -category = "dev" optional = false python-versions = "*" files = [ @@ -400,7 +378,6 @@ files = [ name = "pyyaml" version = "6.0" description = "YAML parser and emitter for Python" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -450,7 +427,6 @@ files = [ name = "requests" version = "2.31.0" description = "Python HTTP for Humans." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -472,7 +448,6 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] name = "snowballstemmer" version = "2.2.0" description = "This package provides 29 stemmers for 28 languages generated from Snowball algorithms." -category = "dev" optional = false python-versions = "*" files = [ @@ -484,7 +459,6 @@ files = [ name = "soupsieve" version = "2.3.2.post1" description = "A modern CSS selector implementation for Beautiful Soup." 
-category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -496,7 +470,6 @@ files = [ name = "sphinx" version = "4.5.0" description = "Python documentation generator" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -532,7 +505,6 @@ test = ["cython", "html5lib", "pytest", "pytest-cov", "typed-ast"] name = "sphinx-book-theme" version = "0.3.3" description = "A clean book theme for scientific explanations and documentation with Sphinx" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -554,7 +526,6 @@ test = ["beautifulsoup4 (>=4.6.1,<5)", "coverage", "myst-nb (>=0.13.2,<0.14.0)", name = "sphinx-copybutton" version = "0.5.0" description = "Add a copy button to each of your code cells." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -573,7 +544,6 @@ rtd = ["ipython", "myst-nb", "sphinx", "sphinx-book-theme"] name = "sphinx-multiversion" version = "0.2.4" description = "Add support for multiple versions to sphinx" -category = "dev" optional = false python-versions = "*" files = [ @@ -588,7 +558,6 @@ sphinx = ">=2.1" name = "sphinxcontrib-applehelp" version = "1.0.2" description = "sphinxcontrib-applehelp is a sphinx extension which outputs Apple help books" -category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -604,7 +573,6 @@ test = ["pytest"] name = "sphinxcontrib-devhelp" version = "1.0.2" description = "sphinxcontrib-devhelp is a sphinx extension which outputs Devhelp document." -category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -620,7 +588,6 @@ test = ["pytest"] name = "sphinxcontrib-htmlhelp" version = "2.0.0" description = "sphinxcontrib-htmlhelp is a sphinx extension which renders HTML help files" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -636,7 +603,6 @@ test = ["html5lib", "pytest"] name = "sphinxcontrib-jsmath" version = "1.0.1" description = "A sphinx extension which renders display math in HTML via JavaScript" -category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -651,7 +617,6 @@ test = ["flake8", "mypy", "pytest"] name = "sphinxcontrib-qthelp" version = "1.0.3" description = "sphinxcontrib-qthelp is a sphinx extension which outputs QtHelp document." -category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -667,7 +632,6 @@ test = ["pytest"] name = "sphinxcontrib-serializinghtml" version = "1.1.5" description = "sphinxcontrib-serializinghtml is a sphinx extension which outputs \"serialized\" HTML files (json and pickle)." -category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -683,7 +647,6 @@ test = ["pytest"] name = "typing-extensions" version = "4.3.0" description = "Backported and Experimental Type Hints for Python 3.7+" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -695,7 +658,6 @@ files = [ name = "uc-micro-py" version = "1.0.1" description = "Micro subset of unicode data files for linkify-it-py projects." -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -710,7 +672,6 @@ test = ["coverage", "pytest", "pytest-cov"] name = "urllib3" version = "1.26.12" description = "HTTP library with thread-safe connection pooling, file post, and more." 
-category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" files = [ @@ -727,7 +688,6 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] name = "zipp" version = "3.8.1" description = "Backport of pathlib-compatible object wrapper for zip files" -category = "dev" optional = false python-versions = ">=3.7" files = [ diff --git a/docs/zh/developer/python_dev.md b/docs/zh/developer/python_dev.md index 9fba4195b6e..d4b852fe355 100644 --- a/docs/zh/developer/python_dev.md +++ b/docs/zh/developer/python_dev.md @@ -42,6 +42,11 @@ pytest tests/ pytest -so log_cli=true --log-cli-level=DEBUG tests/ ``` +也可以使用module模式运行,适合做实际运行测试: +``` +python -m diagnostic_tool.diagnose ... +``` + ## Conda 如果使用Conda环境,`pytest`命令可能找到错误的python环境,而导致类似`ModuleNotFoundError: No module named 'IPython'`的问题。请使用`python -m pytest`。 diff --git a/docs/zh/integration/online_datasources/kafka_connector_demo.md b/docs/zh/integration/online_datasources/kafka_connector_demo.md index fbe16a8e021..0d6ef42d266 100644 --- a/docs/zh/integration/online_datasources/kafka_connector_demo.md +++ b/docs/zh/integration/online_datasources/kafka_connector_demo.md @@ -14,8 +14,8 @@ OpenMLDB Kafka Connector实现见[extensions/kafka-connect-jdbc](https://github. ### 下载与准备 - 你需要下载kafka,请点击[kafka官网下载](https://kafka.apache.org/downloads)下载kafka_2.13-3.1.0.tgz。 -- 你需要下载connector包以及依赖,请点击[kafka-connect-jdbc.tgz](http://openmldb.ai/download/kafka-connector/kafka-connect-jdbc.tgz)。 -- 你需要下载本文中所需要的配置与脚本等文件,请点击[kafka_demo_files.tgz](http://openmldb.ai/download/kafka-connector/kafka_demo_files.tgz)下载。 +- 你需要下载connector包以及依赖,请点击[kafka-connect-jdbc.tgz](https://openmldb.ai/download/kafka-connector/kafka-connect-jdbc.tgz)。 +- 你需要下载本文中所需要的配置与脚本等文件,请点击[kafka_demo_files.tgz](https://openmldb.ai/download/kafka-connector/kafka_demo_files.tgz)下载。 本文将使用docker方式启动OpenMLDB,所以无需单独下载OpenMLDB。并且,kafka与connector的启动,都可以在同一个容器中进行。 diff --git a/docs/zh/maintain/diagnose.md b/docs/zh/maintain/diagnose.md index 9cd2f124599..eef7db5b5a1 100644 --- a/docs/zh/maintain/diagnose.md +++ b/docs/zh/maintain/diagnose.md @@ -8,7 +8,7 @@ 安装方式与使用: ```bash -pip install openmldb-tool +pip install openmldb-tool # openmldb-tool[rpc] openmldb_tool # 注意下划线 ``` 有以下几个子命令可选择执行: @@ -84,12 +84,12 @@ JOB 检查会检查集群中的离线任务,可以使用`inspect job`或`inspe 以下是一些常见的state: -state | 描述 ----------|-------- -finished | 成功完成的任务 -running | 正在运行的任务 -failed | 失败的任务 -killed | 被终止的任务 +| state | 描述 | +| -------- | -------------- | +| finished | 成功完成的任务 | +| running | 正在运行的任务 | +| failed | 失败的任务 | +| killed | 被终止的任务 | 更多state信息详见[Spark State]( https://spark.apache.org/docs/3.2.1/api/java/org/apache/spark/launcher/SparkAppHandle.State.html),[Yarn State](https://hadoop.apache.org/docs/current/api/org/apache/hadoop/yarn/api/records/YarnApplicationState.html) @@ -193,6 +193,28 @@ nameserver: openmldb_tool static-check --conf_file=/work/openmldb/conf/hosts -VCL --local ``` +### rpc + +`openmldb_tool`还提供了一个RPC接口,但它是一个额外组件,需要通过`pip install openmldb-tool[rpc]`安装。使用方式是`openmldb_tool rpc`,例如,`openmldb_tool rpc ns ShowTable --field '{"show_all":true}'`可以调用`nameserver`的`ShowTable`接口,获取表的状态信息。 + +NameServer与TaskManager只有一个活跃,所以我们用ns和tm来代表这两个组件。 +而TabletServer有多个,我们用`tablet1`,`tablet2`等来指定某个TabletServer,顺序可通过`openmldb_tool rpc`或`openmldb_tool status`来查看。 + +如果对RPC服务的方法或者输入参数不熟悉,可以通过`openmldb_tool rpc [method] --hint`查看帮助信息。例如: +```bash +$ openmldb_tool rpc ns ShowTable --hint +... 
+server proto version is 0.7.0-e1d35fcf6 +hint use pb2 files from /tmp/diag_cache +You should input json like this, ignore round brackets in the key and double quotation marks in the value: --field '{ + "(optional)name": "string", + "(optional)db": "string", + "(optional)show_all": "bool" +}' +``` +hint还需要额外的pb文件,帮助解析输入参数,默认是从`/tmp/diag_cache`中读取,如果不存在则自动下载。如果你已有相应的文件,或者已经手动下载,可以通过`--pbdir`指定该目录。 + ## 附加 可使用`openmldb_tool --helpfull`查看所有配置项。例如,`--sdk_log`可以打印sdk的日志(zk,glog),可用于调试。 + \ No newline at end of file diff --git a/docs/zh/openmldb_sql/ddl/DROP_FUNCTION.md b/docs/zh/openmldb_sql/ddl/DROP_FUNCTION.md index 1445d4384fc..2915698eb7a 100644 --- a/docs/zh/openmldb_sql/ddl/DROP_FUNCTION.md +++ b/docs/zh/openmldb_sql/ddl/DROP_FUNCTION.md @@ -3,7 +3,7 @@ **Syntax** ```sql -DROP FUNCTION FunctionName +DROP FUNCTION [IF EXISTS] FunctionName ``` **Example** @@ -13,3 +13,6 @@ DROP FUNCTION FunctionName DROP FUNCTION cut2; ``` +```{note} +删除函数实际是分布式的删除,会删除所有节点上的函数。如果某个节点删除失败,不会终止整个删除过程。我们只在整体层面,或者说是元数据层面上保证函数的唯一性,底层节点上函数可以重复创建,所以,单个节点函数删除失败不会影响后续的创建操作。但还是建议查询节点上删除失败留下的WARN日志,查看具体的删除失败的原因。 +``` diff --git a/docs/zh/quickstart/sdk/python_sdk.md b/docs/zh/quickstart/sdk/python_sdk.md index 570cbed2558..9a4640f7528 100644 --- a/docs/zh/quickstart/sdk/python_sdk.md +++ b/docs/zh/quickstart/sdk/python_sdk.md @@ -10,7 +10,7 @@ pip install openmldb ## 使用 OpenMLDB DBAPI -本节演示 OpenMLDB DBAPI 的基本使用。 +本节演示 OpenMLDB DBAPI 的基本使用。所有dbapi接口如果执行失败,会抛出异常`DatabaseError`,用户可自行捕获异常并处理。返回值为`Cursor`,DDL SQL 不用处理返回值,其他 SQL 的返回值处理参考下方具体示例。 ### 创建连接 @@ -70,6 +70,22 @@ result = cursor.batch_row_request("SELECT * FROM t1", ["col1","col2"], ({"col1": print(result.fetchone()) ``` +### 执行 Deployment + +请注意,执行 Deployment只有DBAPI支持,OpenMLDB SQLAlchemy无对应接口。而且,仅支持单行请求,不支持批量请求。 + +```python +cursor.execute("DEPLOY d1 SELECT col1 FROM t1") +# dict style +result = cursor.callproc("d1", {"col1": 1000, "col2": None, "col3": None, "col4": None, "col5": None}) +print(result.fetchall()) +# tuple style +result = cursor.callproc("d1", (1001, "2023-07-20", "abc", "def", 1)) +print(result.fetchall()) +# drop deployment before drop table +cursor.execute("DROP DEPLOYMENT d1") +``` + ### 删除表 删除表 `t1`: @@ -94,7 +110,7 @@ cursor.close() ## 使用 OpenMLDB SQLAlchemy -本节演示通过 OpenMLDB SQLAlchemy 使用 Python SDK。 +本节演示通过 OpenMLDB SQLAlchemy 使用 Python SDK。同样的,所有dbapi接口如果执行失败,会抛出异常`DatabaseError`,用户可自行捕获异常并处理。返回值处理参考SQLAlchemy标准。 ### 创建连接 @@ -166,7 +182,7 @@ try: for row in rs: print(row) rs = connection.execute("SELECT * FROM t1 WHERE col3 = ?;", ('hefei')) - rs = connection.execute("SELECT * FROM t1 WHERE col3 = ?;",[('hefei'), ('shanghai')]) + rs = connection.execute("SELECT * FROM t1 WHERE col3 = ?;", [('hefei'), ('shanghai')]) except Exception as e: print(e) ``` @@ -210,7 +226,7 @@ OpenMLDB Python SDK 支持了 Notebook magic function 拓展,使用以下语 ```python import openmldb -db = openmldb.dbapi.connect(database='demo_db',zk='0.0.0.0:2181',zkPath='/openmldb') +db = openmldb.dbapi.connect(database='demo_db', zk='0.0.0.0:2181', zkPath='/openmldb') openmldb.sql_magic.register(db) ``` diff --git a/docs/zh/use_case/JD_recommendation.md b/docs/zh/use_case/JD_recommendation.md index 7aff165a20c..143666d58ec 100644 --- a/docs/zh/use_case/JD_recommendation.md +++ b/docs/zh/use_case/JD_recommendation.md @@ -34,7 +34,7 @@ ls jd-recommendation/ export demodir=/jd-recommendation/ ``` -本例仅使用小数据集做演示。如果你想要使用全量数据集,请下载 [JD_data](http://openmldb.ai/download/jd-recommendation/JD_data.tgz)。 +本例仅使用小数据集做演示。如果你想要使用全量数据集,请下载 
[JD_data](https://openmldb.ai/download/jd-recommendation/JD_data.tgz)。 ### 安装 OneFlow 工具包 diff --git a/hybridse/src/udf/udf_library.cc b/hybridse/src/udf/udf_library.cc index 85dc4ce9641..205563c69e8 100644 --- a/hybridse/src/udf/udf_library.cc +++ b/hybridse/src/udf/udf_library.cc @@ -256,29 +256,29 @@ Status UdfLibrary::RemoveDynamicUdf(const std::string& name, const std::vector lock(mu_); if (table_.erase(canonical_name) <= 0) { - return Status(kCodegenError, "can not find the function " + canonical_name); + return Status(kCodegenError, "udaf function not present in udf table: " + canonical_name); } if (external_symbols_.erase(lib_name + ".init") <= 0) { - return Status(kCodegenError, "can not find the init function " + lib_name); + return Status(kCodegenError, "can not find the init function in symbol table: " + lib_name); } if (external_symbols_.erase(lib_name + ".update") <= 0) { - return Status(kCodegenError, "can not find the update function " + lib_name); + return Status(kCodegenError, "can not find the update function in symbol table: " + lib_name); } if (external_symbols_.erase(lib_name + ".output") <= 0) { - return Status(kCodegenError, "can not find the output function " + lib_name); + return Status(kCodegenError, "can not find the output function in symbol table: " + lib_name); } } else { std::lock_guard lock(mu_); if (table_.erase(canonical_name) <= 0) { - return Status(kCodegenError, "can not find the function " + canonical_name); + return Status(kCodegenError, "udf function not present in udf table: " + canonical_name); } if (external_symbols_.erase(lib_name) <= 0) { - return Status(kCodegenError, "can not find the function " + lib_name); + return Status(kCodegenError, "can not find the function in symbol table: " + lib_name); } } return lib_manager_.RemoveHandler(file); diff --git a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/common/LibraryLoader.java b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/common/LibraryLoader.java index 47226cae2de..d0d395f6e52 100644 --- a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/common/LibraryLoader.java +++ b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/common/LibraryLoader.java @@ -82,8 +82,9 @@ synchronized public static void loadLibrary(String libraryPath) { logger.error(String.format("Fail to find %s in resources", libraryPath)); } } catch (IOException | UnsatisfiedLinkError e) { - logger.error(String.format("Error while load %s from local resource", libraryPath), e); - throw new UnsatisfiedLinkError(String.format("Fail to load library %s", libraryPath)); + String msg = String.format("Error while load %s from local resource", libraryPath); + logger.error(msg, e); + throw new RuntimeException(msg, e); } } diff --git a/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/udf/ExternalFunctionManager.java b/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/udf/ExternalFunctionManager.java index b89d92af130..00bc94e9fb4 100644 --- a/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/udf/ExternalFunctionManager.java +++ b/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/udf/ExternalFunctionManager.java @@ -37,14 +37,13 @@ static public String getLibraryFilePath(String libraryFileName) { static public void addFunction(String fnName, String libraryFileName) throws Exception { if (hasFunction(fnName)) { - logger.warn(String.format("The function %s exists, ignore adding function", fnName)); - } 
else { - String libraryFilePath = getLibraryFilePath(libraryFileName); - if(!(new File(libraryFilePath).exists())) { - throw new Exception("The library file does not exist in path: " + libraryFilePath); - } - nameFileMap.put(fnName, libraryFileName); + logger.warn(String.format("The function %s exists, replace", fnName)); + } + String libraryFilePath = getLibraryFilePath(libraryFileName); + if(!(new File(libraryFilePath).exists())) { + throw new Exception("The library file does not exist in path: " + libraryFilePath); } + nameFileMap.put(fnName, libraryFileName); } static public void dropFunction(String fnName) { diff --git a/onebox/stop_all.sh b/onebox/stop_all.sh index 03c9f6fe0cb..747adcdf929 100755 --- a/onebox/stop_all.sh +++ b/onebox/stop_all.sh @@ -19,6 +19,6 @@ set -x -e if [[ "$OSTYPE" = "darwin"* ]]; then pkill -9 -x -l openmldb else - pkill -9 -x -e openmldb + pgrep -a -f "openmldb.*onebox.*" | awk '{print $1}' | xargs -I {} kill -9 {} fi diff --git a/python/openmldb_tool/README.md b/python/openmldb_tool/README.md index e1749a97c86..3381751edf9 100644 --- a/python/openmldb_tool/README.md +++ b/python/openmldb_tool/README.md @@ -3,14 +3,20 @@ In `diagnostic_tool/`: ``` -|-- collector.py # collect version/config/logs (local or remote ssh/scp, defined by distribution conf file) -|-- conf_validator.py -|-- connector.py # openmldb singleton connection -|-- diagnose.py # main -|-- dist_conf.py # read distribution conf file, dist.yml or hosts -|-- log_analyzer.py # analyze log, you can add your own rules -|-- server_checker.py # server status checker, sql tester, you can add more checks -`-- util.py +├── collector.py # collect version/config/logs (local or remote ssh/scp, defined by distribution conf file) +├── common_err.yml +├── conf_validator.py +├── connector.py # openmldb singleton connection +├── diagnose.py # main +├── dist_conf.py # read distribution conf file, dist.yml or hosts +├── __init__.py +├── log_analyzer.py # analyze log, you can add your own rules +├── parser.py +├── __pycache__ +├── rpc.py # optional module, rpc helper and executor for servers +├── server_checker.py # server status checker, sql tester, you can add more checks +├── table_checker.py +└── util.py ``` ## Subcommands @@ -25,6 +31,7 @@ inspect no sub means inspect all test test online insert&select, test offline select if taskmanager exists static-check needs config file(dist.yml or hosts) [-V,--version/-C,--conf/-L,--log/-VCL] +rpc user-friendly rpc tool ``` For example: @@ -122,3 +129,15 @@ log_analysis.py read logs from local collection path ``. - show warning logs in `nameserver.info.log`, `tablet.info.log` - show warning logs and exceptions in `taskmanager.log` + +## RPC + +Optional module, rpc helper and executor for servers. You can install it by `pip install openmldb-tool[rpc]`. You can execute rpc directly, but if you want rpc hint, you need to download or compile protobuf files in `OpenMLDB/src/proto`. + +```bash +cd OpenMLDB +make thirdparty +# install to any dir +.deps/usr/bin/protoc --python_out=$(pwd)/pb2 --proto_path=src/proto/ src/proto/*.proto +``` Then use `openmldb_tool rpc --pbdir=` to run rpc commands.
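A minimal usage sketch of the `rpc` subcommand documented above, assuming a reachable cluster; `<pb2_dir>` stands in for wherever the compiled or downloaded pb2 files were placed:

```bash
# with no host argument the tool prints the component table, so you can pick ns / tm / tablet1, tablet2, ...
openmldb_tool rpc
# show the JSON template expected by a method (needs the pb2 files for parsing)
openmldb_tool rpc ns ShowTable --hint --pbdir=<pb2_dir>
# call the method, passing the request message as JSON via --field
openmldb_tool rpc ns ShowTable --field '{"show_all": true}'
```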
diff --git a/python/openmldb_tool/diagnostic_tool/common_err.yml b/python/openmldb_tool/diagnostic_tool/common_err.yml index 2f45945fbcf..6a4a0f96a84 100644 --- a/python/openmldb_tool/diagnostic_tool/common_err.yml +++ b/python/openmldb_tool/diagnostic_tool/common_err.yml @@ -13,3 +13,4 @@ errors: - "fail to init zk handler with hosts" description: "Error: fail to init zk handler with hosts" solution: "zk_conn_err" + diff --git a/python/openmldb_tool/diagnostic_tool/diagnose.py b/python/openmldb_tool/diagnostic_tool/diagnose.py index 6b2c742f03e..8bd67719489 100644 --- a/python/openmldb_tool/diagnostic_tool/diagnose.py +++ b/python/openmldb_tool/diagnostic_tool/diagnose.py @@ -15,6 +15,7 @@ # limitations under the License. import argparse +import json import os import textwrap import time @@ -118,7 +119,7 @@ def insepct_online(args): assert not fails, f"unhealthy tables: {fails}" print(f"all tables are healthy") - if getattr(args, 'dist', False): + if getattr(args, "dist", False): table_checker = TableChecker(conn) table_checker.check_distribution(dbs=flags.FLAGS.db.split(",")) @@ -131,7 +132,9 @@ def inspect_offline(args): print(f"inspect {total} offline jobs") if num: failed_jobs_str = "\n".join(jobs) - raise AssertionError(f"{num} offline final jobs are failed\nfailed jobs:\n{failed_jobs_str}") + raise AssertionError( + f"{num} offline final jobs are failed\nfailed jobs:\n{failed_jobs_str}" + ) print("all offline final jobs are finished") @@ -142,7 +145,9 @@ def _get_jobs(states=None): total_num = len(jobs) # jobs sorted by id jobs.sort(key=lambda x: x[0]) - show_jobs = [_format_job_row(row) for row in jobs if not states or row[2].lower() in states] + show_jobs = [ + _format_job_row(row) for row in jobs if not states or row[2].lower() in states + ] return total_num, len(show_jobs), show_jobs @@ -222,6 +227,65 @@ def static_check(args): LogAnalyzer(dist_conf, flags.FLAGS.collect_dir).run() +def rpc(args): + connect = Connector() + status_checker = checker.StatusChecker(connect) + + host = args.host + if not host: + status_checker.check_components() + print( + """choose one host to connect, e.g. "openmldb_tool rpc ns". + ns: nameserver(master only, no need to choose) + tablet:you can get from component table, e.g. the first tablet in table is tablet1 + tm: taskmanager""" + ) + return + from diagnostic_tool.rpc import RPC + + # use status connction to get version + conns_with_version = { + endpoint: version + for endpoint, version, _, _ in status_checker.check_connection() + } + _, endpoint, _ = RPC.get_endpoint_service(host) + proto_version = conns_with_version[endpoint] + print(f"server proto version is {proto_version}") + + operation = args.operation + field = json.loads(args.field) + rpc_service = RPC(host) + if args.hint: + pb2_dir = flags.FLAGS.pbdir + print(f"hint use pb2 files from {pb2_dir}") + # check about rpc depends proto compiled dir + if ( + not os.path.isdir(pb2_dir) + or len([pb for pb in os.listdir(pb2_dir) if pb.endswith("_pb2.py")]) < 8 + ): + print(f"{pb2_dir} is broken, mkdir and download") + os.system(f"mkdir -p {pb2_dir}") + import tarfile + import requests + + # pb2.tar has no dir, extract to pb2_dir + url = "https://openmldb.ai/download/diag/pb2.tgz" + r = requests.get(url) + with open(f"{pb2_dir}/pb2.tgz", "wb") as f: + f.write(r.content) + + with tarfile.open(f"{pb2_dir}/pb2.tgz", "r:gz") as tar: + tar.extractall(pb2_dir) + rpc_service.hint(args.operation) + return + if not operation: + print( + "choose one operation, e.g. 
`openmldb_tool rpc ns ShowTable`, --hint for methods list or one method help" + ) + return + rpc_service(operation, field) + + def parse_arg(argv): """parser definition, absl.flags + argparse""" parser = argparse_flags.ArgumentParser( @@ -258,41 +322,32 @@ def parse_arg(argv): online = inspect_sub.add_parser("online", help="only inspect online table.") online.set_defaults(command=insepct_online) online.add_argument( - "--dist", - action="store_true", - help="Inspect online distribution." + "--dist", action="store_true", help="Inspect online distribution." ) # inspect offline - offline = inspect_sub.add_parser( - "offline", help="only inspect offline jobs." - ) + offline = inspect_sub.add_parser("offline", help="only inspect offline jobs.") offline.set_defaults(command=inspect_offline) # inspect job - ins_job = inspect_sub.add_parser("job", help="show jobs by state, show joblog or parse joblog by id.") - ins_job.set_defaults(command=inspect_job) - ins_job.add_argument( - "--state", - default="all", - help="Specify which state offline jobs, split by ','" + ins_job = inspect_sub.add_parser( + "job", help="show jobs by state, show joblog or parse joblog by id." ) + ins_job.set_defaults(command=inspect_job) ins_job.add_argument( - "--id", - help="inspect joblog by id" + "--state", default="all", help="Specify which state offline jobs, split by ','" ) + ins_job.add_argument("--id", help="inspect joblog by id") ins_job.add_argument( "--detail", action="store_true", - help="show detailed joblog information, use with `--id`" + help="show detailed joblog information, use with `--id`", ) ins_job.add_argument( "--conf-url", default="https://raw.githubusercontent.com/4paradigm/OpenMLDB/main/python/openmldb_tool/diagnostic_tool/common_err.yml", - help="url used to update the log parser configuration. If downloading is slow, you can try mirror source 'https://openmldb.ai/download/diag/common_err.yml'" + help="url used to update the log parser configuration. If downloading is slow, you can try mirror source 'https://openmldb.ai/download/diag/common_err.yml'", ) ins_job.add_argument( - "--conf-update", - action="store_true", - help="update the log parser configuration" + "--conf-update", action="store_true", help="update the log parser configuration" ) # sub test @@ -325,6 +380,40 @@ def parse_arg(argv): ) static_check_parser.set_defaults(command=static_check) + # sub rpc + rpc_parser = subparsers.add_parser( + "rpc", + help="user-friendly rpc tool", + ) + rpc_parser.add_argument( + "host", + nargs="?", + help=textwrap.dedent( + """ \ + host name, if no value, print the component table. + ns: nameserver(master only, no need to choose) + tablet:you can get from component table, e.g. the first tablet in table is tablet1 + tm: taskmanager + """ + ), + ) + rpc_parser.add_argument( + "operation", + nargs="?", + default="", + ) + rpc_parser.add_argument( + "--field", + default="{}", + help='json format, e.g. 
\'{"db":"db1","table":"t1"}\', default is \'{}\'', + ) + rpc_parser.add_argument( + "--hint", + action="store_true", + help="print rpc hint for current operation(rpc method), if no operation, print all possible rpc methods", + ) + rpc_parser.set_defaults(command=rpc) + def help(args): parser.print_help() diff --git a/python/openmldb_tool/diagnostic_tool/parser.py b/python/openmldb_tool/diagnostic_tool/parser.py index ae57c5597af..5b336de5992 100644 --- a/python/openmldb_tool/diagnostic_tool/parser.py +++ b/python/openmldb_tool/diagnostic_tool/parser.py @@ -1,3 +1,17 @@ +# Copyright 2021 4Paradigm +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import re import requests diff --git a/python/openmldb_tool/diagnostic_tool/rpc.py b/python/openmldb_tool/diagnostic_tool/rpc.py new file mode 100644 index 00000000000..686734e7641 --- /dev/null +++ b/python/openmldb_tool/diagnostic_tool/rpc.py @@ -0,0 +1,181 @@ +# Copyright 2021 4Paradigm +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from absl import flags +import json +import requests +from bs4 import BeautifulSoup +from google.protobuf.descriptor import FieldDescriptor + +from .server_checker import StatusChecker +from .connector import Connector + +flags.DEFINE_string( + "pbdir", + "/tmp/diag_cache", + "pb2 root dir, if not set, will use the /pb2 directory in the same directory as this script", +) + + +class DescriptorHelper: + def __init__(self, service): + # TODO(hw): symbol_database is useful? 
+ # lazy import + assert flags.FLAGS.pbdir, "pbdir not set" + import sys + from pathlib import Path + sys.path.append(Path(flags.FLAGS.pbdir).as_posix()) + import tablet_pb2 + import name_server_pb2 + import taskmanager_pb2 + + pb_map = { + "TabletServer": tablet_pb2, + "NameServer": name_server_pb2, + "TaskManagerServer": taskmanager_pb2, + # "ApiServer": api_server_pb2, + # "DataSync": data_sync_pb2, + } + self.descriptor = pb_map[service].DESCRIPTOR.services_by_name[service] + + def get_input_json(self, method): + inp = self.descriptor.FindMethodByName(method).input_type + return Field.to_json(inp) + + +class Field: + def to_str(typ): + typ2str = { + FieldDescriptor.TYPE_DOUBLE: "double", + FieldDescriptor.TYPE_FLOAT: "float", + FieldDescriptor.TYPE_INT64: "int64", + FieldDescriptor.TYPE_UINT64: "uint64", + FieldDescriptor.TYPE_INT32: "int32", + FieldDescriptor.TYPE_FIXED64: "fixed64", + FieldDescriptor.TYPE_FIXED32: "fixed32", + FieldDescriptor.TYPE_BOOL: "bool", + FieldDescriptor.TYPE_STRING: "string", + FieldDescriptor.TYPE_GROUP: "group", + FieldDescriptor.TYPE_MESSAGE: "message", + FieldDescriptor.TYPE_BYTES: "bytes", + FieldDescriptor.TYPE_UINT32: "uint32", + } + return typ2str[typ] + + def to_json(field): + # label optional, required, or repeated. + label = {1: "optional", 2: "required", 3: "repeated"} + if isinstance(field, FieldDescriptor): + key = f"({label[field.label]})" + field.name + if field.type == FieldDescriptor.TYPE_MESSAGE: + value = Field.to_json(field.message_type) + elif field.type == FieldDescriptor.TYPE_ENUM: + value = "/".join([n.name for n in field.enum_type.values]) + else: + value = Field.to_str(field.type) + if field.label == 3: + # json list style + return {key: [value, "..."]} + else: + return {key: value} + else: + # field is a message + if field.containing_type and [f.name for f in field.fields] == [ + "key", + "value", + ]: + # treat key-value as map type, can't figure out custom type + # TODO(hw): it's ok to pass a json list to proto map? 
+ return {"k": "v", "...": "..."} + d = {} + for f in field.fields: + d.update(Field.to_json(f)) + return d + + +class RPC: + """rpc service""" + + def __init__(self, host) -> None: + self.host, self.endpoint, self.service = RPC.get_endpoint_service(host.lower()) + + def rpc_help(self): + if self.host == "taskmanager": + r = requests.post(f"http://{self.endpoint}") + else: + r = requests.post(f"http://{self.endpoint}/{self.service}") + return RPC.parse_html(r.text) + + def rpc_exec(self, operation, field): + r = requests.post( + f"http://{self.endpoint}/{self.service}/{operation}", json=field + ) + return r.text + + def hint(self, info): + if not info: + # show service name and all rpc methods + print(self.rpc_help()) + return + + # input message to json style + + # if taskmanager, service in pb2 is TaskManagerServer + service = ( + self.service + if not self.service.endswith("TaskManagerServer") + else "TaskManagerServer" + ) + + helper = DescriptorHelper(service) + json_str = json.dumps(helper.get_input_json(info), indent=4) + print( + f"You should input json like this, ignore round brackets in the key and double quotation marks in the value: --field '{json_str}'" + ) + + def search_in(self, typ, info): + for item in typ: + if info in item.keys(): + return item[info] + + def __call__(self, operation, field): + if not operation: + text = self.rpc_help() + else: + text = self.rpc_exec(operation, field) + print(text) + + def get_endpoint_service(host): + conn = Connector() + components_map = StatusChecker(conn)._get_components() + if host.startswith("tablet"): + num = int(host[6:]) - 1 + host = "tablet" + else: + assert host in ["ns", "tm"] + num = 0 + host = "nameserver" if host == "ns" else "taskmanager" + assert host in components_map, f"{host} not found in cluster" + endpoint = components_map[host][num][0] + host2service = { + "nameserver": "NameServer", + "taskmanager": "openmldb.taskmanager.TaskManagerServer", + "tablet": "TabletServer", + } + service = host2service[host] + return host, endpoint, service + + def parse_html(html): + soup = BeautifulSoup(html, "html.parser") + return soup.get_text("\n") diff --git a/python/openmldb_tool/diagnostic_tool/server_checker.py b/python/openmldb_tool/diagnostic_tool/server_checker.py index 0a18c74de34..35c50103b46 100644 --- a/python/openmldb_tool/diagnostic_tool/server_checker.py +++ b/python/openmldb_tool/diagnostic_tool/server_checker.py @@ -19,7 +19,6 @@ from prettytable import PrettyTable import re import requests -import time from .connector import Connector from .dist_conf import DistConf, COMPONENT_ROLES, ServerInfo @@ -43,19 +42,25 @@ def check_connection(self): t.title = "Connections" t.field_names = ["Endpoint", "Version", "Cost_time", "Extra"] err = "" - taskmanager = component_map.pop("taskmanager") # extract taskmanager + taskmanager = [] + if "taskmanager" in component_map: + taskmanager = component_map.pop("taskmanager") # extract taskmanager other_components = [component for role in component_map.values() for component in role] # extract other components + conns = [] for (endpoint, _) in other_components: version, response_time, ex, e = self._get_information(endpoint) - t.add_row([endpoint, version, response_time, ex]) + conns.append([endpoint, version, response_time, ex]) err += e for (endpoint, _) in taskmanager: version, response_time, ex, e = self._get_information_taskmanager(endpoint) - t.add_row([endpoint, version, response_time, ex]) + conns.append([endpoint, version, response_time, ex]) err += e + for conn in conns: + 
t.add_row(conn) print(t) if err: print(err) + return conns def _get_information(self, endpoint): """get informations from components except taskmanager""" diff --git a/python/openmldb_tool/setup.py b/python/openmldb_tool/setup.py index fafaeae3a88..f7120cfa256 100644 --- a/python/openmldb_tool/setup.py +++ b/python/openmldb_tool/setup.py @@ -17,15 +17,15 @@ from setuptools import setup, find_packages setup( - name='openmldb-tool', - version='0.7.0a0', - author='OpenMLDB Team', - author_email=' ', - url='https://github.com/4paradigm/OpenMLDB', - description='OpenMLDB Tool', + name="openmldb-tool", + version="0.7.0a0", + author="OpenMLDB Team", + author_email=" ", + url="https://github.com/4paradigm/OpenMLDB", + description="OpenMLDB Tool", license="copyright 4paradigm.com", classifiers=[ - 'Programming Language :: Python :: 3', + "Programming Language :: Python :: 3", ], install_requires=[ "openmldb >= 0.6.9", @@ -35,15 +35,20 @@ "termplotlib", "requests", ], - extras_require={'test': [ - "pytest", - ]}, - packages=find_packages(exclude=['tests']), - exclude_package_data={ - 'openmldb-tool': ['diagnostic_tool/common_err.yml'] + extras_require={ + "rpc": [ + "protobuf==3.6.1", + "beautifulsoup4", + ], + "test": [ + "openmldb-tool[rpc]", + "pytest", + ], }, + packages=find_packages(exclude=["tests"]), + exclude_package_data={"openmldb-tool": ["diagnostic_tool/common_err.yml"]}, entry_points={ - 'console_scripts': ['openmldb_tool = diagnostic_tool.diagnose:run'], + "console_scripts": ["openmldb_tool = diagnostic_tool.diagnose:run"], }, zip_safe=False, ) diff --git a/python/openmldb_tool/tests/cmd_test.py b/python/openmldb_tool/tests/cmd_test.py index 29e669d9458..461bba274d9 100644 --- a/python/openmldb_tool/tests/cmd_test.py +++ b/python/openmldb_tool/tests/cmd_test.py @@ -1,3 +1,17 @@ +# Copyright 2021 4Paradigm +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pytest from diagnostic_tool.diagnose import parse_arg, main from absl import flags @@ -19,7 +33,8 @@ def test_helpmsg(): parse_arg(["foo", "static-check", "-h"]) with pytest.raises(SystemExit): parse_arg(["foo", "--helpfull"]) - + with pytest.raises(SystemExit): + parse_arg(["foo", "rpc", "-h"]) def test_argparse(): cluster_arg = f"--cluster={OpenMLDB_ZK_CLUSTER}" diff --git a/python/openmldb_tool/tests/rpc_test.py b/python/openmldb_tool/tests/rpc_test.py new file mode 100644 index 00000000000..e2804418ef9 --- /dev/null +++ b/python/openmldb_tool/tests/rpc_test.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2021 4Paradigm +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from diagnostic_tool.diagnose import parse_arg, main +from .case_conf import OpenMLDB_ZK_CLUSTER + + +def test_rpc(): + cluster_arg = f"--cluster={OpenMLDB_ZK_CLUSTER}" + args = parse_arg( + [ + "foo", + "rpc", + cluster_arg, + ] + ) + main(args) + + main(parse_arg(["foo", "rpc", cluster_arg, "ns"])) + main(parse_arg(["foo", "rpc", cluster_arg, "tablet1"])) + # no taskmanager in test onebox + if not "onebox" in OpenMLDB_ZK_CLUSTER: + main(parse_arg(["foo", "rpc", cluster_arg, "tm"])) diff --git a/src/base/proto_util.h b/src/base/proto_util.h index be755642e44..1652d3ca9d8 100644 --- a/src/base/proto_util.h +++ b/src/base/proto_util.h @@ -39,6 +39,14 @@ void SetResponseStatus(int code, const std::string& msg, Response* response) { } } +/// @brief Set code and msg, and log it at warning. Must be not ok, skip check code +#define SET_RESP_AND_WARN(s, c, m) \ + do { \ + (s)->set_code(static_cast(c)); \ + (s)->set_msg((m)); \ + LOG(WARNING) << "Set resp: " << (s)->code() << ", " << (s)->msg(); \ + } while (0) + template void SetResponseStatus(const Status& status, Response* response) { if (response != nullptr) { diff --git a/src/nameserver/name_server_impl.cc b/src/nameserver/name_server_impl.cc index f193854701b..550558b6132 100644 --- a/src/nameserver/name_server_impl.cc +++ b/src/nameserver/name_server_impl.cc @@ -1118,8 +1118,6 @@ void NameServerImpl::UpdateTablets(const std::vector& endpoints) { } } - - auto it = tablet_endpoints.begin(); for (; it != tablet_endpoints.end(); ++it) { alive.insert(*it); @@ -3716,8 +3714,6 @@ void NameServerImpl::CreateTable(RpcController* controller, const CreateTableReq } } - - bool NameServerImpl::SaveTableInfo(std::shared_ptr table_info) { std::string table_value; table_info->SerializeToString(&table_value); @@ -9733,16 +9729,16 @@ void NameServerImpl::CreateFunction(RpcController* controller, const CreateFunct } auto tablets = GetAllHealthTablet(); std::vector> succ_tablets; + std::string error_msgs; + // try create on every tablet for (const auto& tablet : tablets) { std::string msg; if (!tablet->client_->CreateFunction(request->fun(), &msg)) { - PDLOG(WARNING, "create function failed. endpoint %s, msg %s", - tablet->client_->GetEndpoint().c_str(), msg.c_str()); - response->set_msg(msg); - break; + error_msgs.append("create function failed on " + tablet->client_->GetEndpoint() + ", reason: " + msg + ";"); } succ_tablets.emplace_back(tablet); } + // rollback and return, it's ok if tablet rollback failed if (succ_tablets.size() < tablets.size()) { for (const auto& tablet : succ_tablets) { std::string msg; @@ -9751,6 +9747,7 @@ void NameServerImpl::CreateFunction(RpcController* controller, const CreateFunct } PDLOG(INFO, "drop function on endpoint %s", tablet->client_->GetEndpoint().c_str()); } + SET_RESP_AND_WARN(response, base::ReturnCode::kCreateFunctionFailedOnTablet, error_msgs); return; } auto fun = std::make_shared<::openmldb::common::ExternalFun>(request->fun()); @@ -9759,8 +9756,7 @@ void NameServerImpl::CreateFunction(RpcController* controller, const CreateFunct fun->SerializeToString(&value); std::string fun_node = zk_path_.external_function_path_ + "/" + fun->name(); if (!zk_client_->CreateNode(fun_node, value)) { - PDLOG(WARNING, "create function node[%s] failed! 
value[%s] value_size[%u]", - fun_node.c_str(), value.c_str(), value.length()); + SET_RESP_AND_WARN(response, base::ReturnCode::kCreateZkFailed, "create function on zk failed: " + fun_node); return; } } @@ -9771,9 +9767,8 @@ void NameServerImpl::CreateFunction(RpcController* controller, const CreateFunct } void NameServerImpl::DropFunction(RpcController* controller, const DropFunctionRequest* request, - DropFunctionResponse* response, Closure* done) { + DropFunctionResponse* response, Closure* done) { brpc::ClosureGuard done_guard(done); - response->set_code(base::kRPCRunError); std::shared_ptr<::openmldb::common::ExternalFun> fun; { std::lock_guard lock(mu_); @@ -9786,27 +9781,26 @@ void NameServerImpl::DropFunction(RpcController* controller, const DropFunctionR if (request->if_exists()) { base::SetResponseOK(response); } else { - response->set_msg("fun does not exist"); - LOG(WARNING) << request->name() << " does not exist"; + SET_RESP_AND_WARN(response, base::ReturnCode::kError, "fun does not exist in nameserver meta"); } return; } auto tablets = GetAllHealthTablet(); for (const auto& tablet : tablets) { std::string msg; + // if drop function failed on tablet, treat it as success(only log warning) if (!tablet->client_->DropFunction(*fun, &msg)) { - response->set_msg(msg); - LOG(WARNING) << "drop function failed on " << tablet->client_->GetEndpoint(); - return; + LOG(WARNING) << "drop function failed on " << tablet->client_->GetEndpoint() << ", reason: " << msg; } } if (IsClusterMode()) { std::string fun_node = zk_path_.external_function_path_ + "/" + fun->name(); if (!zk_client_->DeleteNode(fun_node)) { - PDLOG(WARNING, "delete function node[%s] failed", fun_node.c_str()); - response->set_msg("delete function node failed"); + // if drop zk node failed, the whole drop function failed + SET_RESP_AND_WARN(response, base::ReturnCode::kDelZkFailed, "delete function zk node failed:" + fun_node); return; } + // func in taskmanager is deleted by client, not in here } base::SetResponseOK(response); LOG(INFO) << "drop function " << request->name() << " success"; @@ -9815,7 +9809,7 @@ void NameServerImpl::DropFunction(RpcController* controller, const DropFunctionR } void NameServerImpl::ShowFunction(RpcController* controller, const ShowFunctionRequest* request, - ShowFunctionResponse* response, Closure* done) { + ShowFunctionResponse* response, Closure* done) { brpc::ClosureGuard done_guard(done); std::lock_guard lock(mu_); if (request->has_name() && !request->name().empty()) { diff --git a/src/sdk/CMakeLists.txt b/src/sdk/CMakeLists.txt index db2aadc638c..cc959f6a23b 100644 --- a/src/sdk/CMakeLists.txt +++ b/src/sdk/CMakeLists.txt @@ -295,7 +295,10 @@ function(get_lib_path X RET) endif() else() # if target has no location, handle it before call this function - get_target_property(RET_V ${X} LOCATION) + get_target_property(type ${X} TYPE) + if (NOT ${type} STREQUAL "INTERFACE_LIBRARY") + get_target_property(RET_V ${X} LOCATION) + endif() # message(STATUS "get ${X} path: ${RET_V}") endif() if("${RET_V}" STREQUAL "RET_V-NOTFOUND") @@ -343,10 +346,6 @@ list(REMOVE_DUPLICATES ABSL_LLVM_TGTS) # get absl llvm libs path foreach(X IN LISTS ABSL_LLVM_TGTS) - get_target_property(type ${X} TYPE) - if (${type} STREQUAL "INTERFACE_LIBRARY") - continue() - endif() get_lib_path(${X} Y) # message(STATUS "get ${X} path: ${Y}") list(APPEND CXXSDK_THIRDPARTY_LIBS ${Y}) diff --git a/src/sdk/db_sdk.cc b/src/sdk/db_sdk.cc index 6b78d4069ec..c04e86d4f03 100644 --- a/src/sdk/db_sdk.cc +++ b/src/sdk/db_sdk.cc @@ -426,6 
+426,15 @@ bool ClusterSDK::BuildCatalog() { return UpdateCatalog(table_datas, sp_datas); } +std::vector DBSDK::GetAllDbs() { + std::lock_guard<::openmldb::base::SpinMutex> lock(mu_); + std::vector all_dbs; + for (auto db_name_iter = table_to_tablets_.begin(); db_name_iter != table_to_tablets_.end(); db_name_iter++) { + all_dbs.push_back(db_name_iter->first); + } + return all_dbs; +} + uint32_t DBSDK::GetTableId(const std::string& db, const std::string& tname) { auto table_handler = GetCatalog()->GetTable(db, tname); auto* sdk_table_handler = dynamic_cast<::openmldb::catalog::SDKTableHandler*>(table_handler.get()); diff --git a/src/sdk/db_sdk.h b/src/sdk/db_sdk.h index 48bb1ea80ab..71e3e321241 100644 --- a/src/sdk/db_sdk.h +++ b/src/sdk/db_sdk.h @@ -77,6 +77,7 @@ class DBSDK { std::shared_ptr<::openmldb::client::TaskManagerClient> GetTaskManagerClient(); + std::vector GetAllDbs(); uint32_t GetTableId(const std::string& db, const std::string& tname); std::shared_ptr<::openmldb::nameserver::TableInfo> GetTableInfo(const std::string& db, const std::string& tname); std::vector> GetTables(const std::string& db); diff --git a/src/sdk/sql_cluster_router.cc b/src/sdk/sql_cluster_router.cc index c125b554e51..2c7f473d6eb 100644 --- a/src/sdk/sql_cluster_router.cc +++ b/src/sdk/sql_cluster_router.cc @@ -1674,18 +1674,22 @@ std::shared_ptr SQLClusterRouter::HandleSQLCmd(const h std::string name = cmd_node->GetArgs()[0]; auto base_status = ns_ptr->DropFunction(name, cmd_node->IsIfExists()); if (base_status.OK()) { + *status = {}; + // zk deleted already, remove from cluster_sdk, only failed when func not exist in sdk, ignore error cluster_sdk_->RemoveExternalFun(name); + // drop function from taskmanager, ignore error, taskmanager can recreate the function auto taskmanager_client = cluster_sdk_->GetTaskManagerClient(); if (taskmanager_client) { base_status = taskmanager_client->DropFunction(name, GetJobTimeout()); if (!base_status.OK()) { - *status = {StatusCode::kCmdError, base_status.msg}; + LOG(WARNING) << "drop function " << name << " failed: [" << base_status.GetCode() << "] " + << base_status.GetMsg(); return {}; } } - *status = {}; } else { - *status = {StatusCode::kCmdError, base_status.msg}; + // not exists or nameserver delete failed on zk + APPEND_FROM_BASE_AND_WARN(status, base_status, "drop function failed"); } return {}; } @@ -1930,11 +1934,12 @@ base::Status SQLClusterRouter::HandleSQLCreateTable(hybridse::node::CreatePlanNo return base::Status(base::ReturnCode::kSQLCmdRunError, "fail to execute plan : null pointer"); } - if (create_node->like_clause_ == nullptr) { - std::string db_name = create_node->GetDatabase().empty() ? db : create_node->GetDatabase(); - if (db_name.empty()) { - return base::Status(base::ReturnCode::kSQLCmdRunError, "ERROR: Please use database first"); + std::string db_name = create_node->GetDatabase().empty() ? 
db : create_node->GetDatabase(); + if (db_name.empty()) { + return base::Status(base::ReturnCode::kSQLCmdRunError, "ERROR: Please use database first"); } + + if (create_node->like_clause_ == nullptr) { ::openmldb::nameserver::TableInfo table_info; table_info.set_db(db_name); @@ -1955,6 +1960,12 @@ base::Status SQLClusterRouter::HandleSQLCreateTable(hybridse::node::CreatePlanNo return base::Status(base::ReturnCode::kSQLCmdRunError, msg); } } else { + auto dbs = cluster_sdk_->GetAllDbs(); + auto it = std::find(dbs.begin(), dbs.end(), db_name); + if (it == dbs.end()) { + return base::Status(base::ReturnCode::kSQLCmdRunError, "fail to create, database does not exist!"); + } + LOG(WARNING) << "CREATE TABLE LIKE will run in offline job, please wait."; std::map config; @@ -3324,22 +3335,25 @@ hybridse::sdk::Status SQLClusterRouter::HandleCreateFunction(const hybridse::nod } fun->set_arg_nullable(iter->second->GetBool()); } + hybridse::sdk::Status st; if (cluster_sdk_->IsClusterMode()) { auto taskmanager_client = cluster_sdk_->GetTaskManagerClient(); if (taskmanager_client) { auto ret = taskmanager_client->CreateFunction(fun, GetJobTimeout()); if (!ret.OK()) { - return {StatusCode::kCmdError, ret.msg}; + APPEND_FROM_BASE_AND_WARN(&st, ret, "create function failed on taskmanager"); + return st; } } } auto ns = cluster_sdk_->GetNsClient(); auto ret = ns->CreateFunction(*fun); if (!ret.OK()) { - return {StatusCode::kCmdError, ret.msg}; + APPEND_FROM_BASE_AND_WARN(&st, ret, "create function failed on nameserver"); + return st; } cluster_sdk_->RegisterExternalFun(fun); - return {}; + return st; } hybridse::sdk::Status SQLClusterRouter::HandleDeploy(const std::string& db, diff --git a/src/sdk/sql_cluster_test.cc b/src/sdk/sql_cluster_test.cc index 8115124881c..6d794692846 100644 --- a/src/sdk/sql_cluster_test.cc +++ b/src/sdk/sql_cluster_test.cc @@ -98,6 +98,13 @@ class SQLClusterDDLTest : public SQLClusterTest { std::string db; }; +TEST_F(SQLClusterDDLTest, TestCreateTableLike) { + ::hybridse::sdk::Status status; + + ASSERT_FALSE(router->ExecuteDDL(db, "create table db2.tb like hive 'hive://db.tb';", &status)); + ASSERT_FALSE(router->ExecuteDDL(db, "drop table db2.tb;", &status)); +} + TEST_F(SQLClusterDDLTest, TestIfExists) { std::string name = "test" + GenRand(); ::hybridse::sdk::Status status; diff --git a/src/tablet/tablet_impl.cc b/src/tablet/tablet_impl.cc index 4a70aed3abc..9a1b77cd6b2 100644 --- a/src/tablet/tablet_impl.cc +++ b/src/tablet/tablet_impl.cc @@ -3547,6 +3547,7 @@ void TabletImpl::GetTableFollower(RpcController* controller, const ::openmldb::a if (info_map.empty()) { response->set_msg("has no follower"); response->set_code(::openmldb::base::ReturnCode::kNoFollower); + return; } for (const auto& kv : info_map) { ::openmldb::api::FollowerInfo* follower_info = response->add_follower_info(); @@ -5678,9 +5679,10 @@ void TabletImpl::DropFunction(RpcController* controller, const openmldb::api::Dr LOG(INFO) << "Drop function success. name " << fun.name() << " path " << fun.file(); base::SetResponseOK(response); } else { - LOG(WARNING) << "Drop function failed. 
name " << fun.name() << " msg " << status.msg; - response->set_msg(status.msg); - response->set_code(base::kRPCRunError); + // udf remove failed but it's ok to recreate even it exists, nameserver should treat it as success + SET_RESP_AND_WARN(response, base::ReturnCode::kDeleteFailed, + absl::StrCat("drop function failed, name ", fun.name(), ", error: [", status.GetCode(), "] ", + status.str())); } } diff --git a/steps/test_python.sh b/steps/test_python.sh index 1fe32015f35..8c366f77b0c 100644 --- a/steps/test_python.sh +++ b/steps/test_python.sh @@ -18,36 +18,32 @@ set -ex ROOT_DIR=$(pwd) +bash onebox/stop_all.sh + # on hybridsql 0.4.1 or later, 'THIRD_PARTY_SRC_DIR' is defined and is '/deps/src' THIRDSRC=${THIRD_PARTY_SRC_DIR:-thirdsrc} -test -d /rambuild/ut_zookeeper && rm -rf /rambuild/ut_zookeeper/* -cp steps/zoo.cfg "$THIRDSRC/zookeeper-3.4.14/conf" -cd "$THIRDSRC/zookeeper-3.4.14" -# TODO(hw): macos no -p -if [[ "$OSTYPE" =~ ^darwin ]]; then - lsof -ni | grep 6181 | awk '{print $2}'| xargs -I{} kill -9 {} -elif [[ "$OSTYPE" =~ ^linux ]]; then - netstat -anp | grep 6181 | awk '{print $NF}' | awk -F '/' '{print $1}'| xargs -I{} kill -9 {} -fi -./bin/zkServer.sh start && cd "$ROOT_DIR" -echo "zk started" +bash steps/ut_zookeeper.sh reset sleep 5 -cd onebox && sh start_onebox.sh && cd "$ROOT_DIR" +bash onebox/start_onebox.sh echo "onebox started, check" sleep 5 pgrep -f openmldb echo "ROOT_DIR:${ROOT_DIR}" +# debug +python3 -m pip --version + cd "${ROOT_DIR}"/python/openmldb_sdk/dist/ whl_name_sdk=$(ls openmldb*.whl) echo "whl_name_sdk:${whl_name_sdk}" -python3 -m pip install "${whl_name_sdk}" +python3 -m pip install "${whl_name_sdk}[test]" cd "${ROOT_DIR}"/python/openmldb_tool/dist/ whl_name_tool=$(ls openmldb*.whl) echo "whl_name_tool:${whl_name_tool}" -python3 -m pip install "${whl_name_tool}" +# pip 23.1.2 just needs to install test(rpc is required by test) +python3 -m pip install "${whl_name_tool}[rpc,test]" python3 -m pip install pytest-cov diff --git a/tools/openmldb_ops.py b/tools/openmldb_ops.py index f430b08a282..399bdc3605e 100644 --- a/tools/openmldb_ops.py +++ b/tools/openmldb_ops.py @@ -144,7 +144,7 @@ def RecoverTable(executor : Executor, db, table_name) -> Status: log.info(f"recover {table_name} in {db}") status, table_info = executor.GetTableInfo(db, table_name) if not status.OK(): - log.warn(f"get table info failed. msg is {status.GetMsg()}") + log.warning(f"get table info failed. msg is {status.GetMsg()}") return Status(-1, f"get table info failed. msg is {status.GetMsg()}") partition_dict = executor.ParseTableInfo(table_info) endpoints = set() @@ -154,7 +154,7 @@ def RecoverTable(executor : Executor, db, table_name) -> Status: for endpoint in endpoints: status, result = executor.GetTableStatus(endpoint) if not status.OK(): - log.warn(f"get table status failed. msg is {status.GetMsg()}") + log.warning(f"get table status failed. msg is {status.GetMsg()}") return Status(-1, f"get table status failed. 
msg is {status.GetMsg()}") endpoint_status[endpoint] = result max_pid = int(table_info[-1][2]) @@ -180,7 +180,7 @@ def RecoverTable(executor : Executor, db, table_name) -> Status: if status.OK(): log.info(f"{table_name} in {db} recover success") else: - log.warn(status.GetMsg()) + log.warning(status.GetMsg()) return status def RecoverData(executor : Executor): diff --git a/tools/tool.py b/tools/tool.py index 3759518fd82..358751b4db9 100644 --- a/tools/tool.py +++ b/tools/tool.py @@ -212,10 +212,10 @@ def GetTableStatus(self, endpoint, tid = '', pid = '') -> tuple([Status, Dict]): cmd.append("--cmd=gettablestatus " + tid + " " + pid) status, output = self.RunWithRetuncode(cmd) if not status.OK(): - log.error("gettablestatus failed") + log.error("gettablestatus failed on " + str(cmd)) return status, None if "failed" in output: - log.error("gettablestatus failed") + log.error("gettablestatus failed on " + str(cmd)) return Status(-1, output), None result = {} for record in self.ParseResult(output):