diff --git a/.github/workflows/udf-doc.yml b/.github/workflows/udf-doc.yml index 5a0e6b33807..b18263fcbc7 100644 --- a/.github/workflows/udf-doc.yml +++ b/.github/workflows/udf-doc.yml @@ -7,8 +7,8 @@ on: pull_request: paths: - '.github/workflows/udf-doc.yml' - - 'hybridse/tools/documentation/**' - - 'hybridse/src/cmd/export_udf_info.cc' + - 'include/**' + - 'hybridse/**' workflow_dispatch: jobs: diff --git a/README.md b/README.md index aacea720498..0b499efebdd 100644 --- a/README.md +++ b/README.md @@ -157,11 +157,11 @@ We really appreciate the contribution from our community. ![wechat](docs/en/about/images/wechat.png) ## 12. Publications - +- [PECJ: Stream Window Join on Disorder Data Streams with Proactive Error Compensation](https://tonyskyzeng.github.io/downloads/PECJ_TR.pdf). Xianzhi Zeng, Shuhao Zhang, Hongbin Zhong, Hao Zhang, Mian Lu, Zhao Zheng, and Yuqiang Chen. International Conference on Management of Data (SIGMOD/PODS) 2024. - [Principles and Practices of Real-Time Feature Computing Platforms for ML](https://cacm.acm.org/magazines/2023/7/274061-principles-and-practices-of-real-time-feature-computing-platforms-for-ml/fulltext). Hao Zhang, Jun Yang, Cheng Chen, Siqi Wang, Jiashu Li, and Mian Lu. 2023. Communications of the ACM 66, 7 (July 2023), 77–78. - [Scalable Online Interval Join on Modern Multicore Processors in OpenMLDB](docs/paper/scale_oij_icde2023.pdf). Hao Zhang, Xianzhi Zeng, Shuhao Zhang, Xinyi Liu, Mian Lu, and Zhao Zheng. In 2023 IEEE 39rd International Conference on Data Engineering (ICDE) 2023. [[code]](https://github.com/4paradigm/OpenMLDB/tree/stream) - [FEBench: A Benchmark for Real-Time Relational Data Feature Extraction](https://github.com/decis-bench/febench/blob/main/report/febench.pdf). Xuanhe Zhou, Cheng Chen, Kunyi Li, Bingsheng He, Mian Lu, Qiaosheng Liu, Wei Huang, Guoliang Li, Zhao Zheng, Yuqiang Chen. International Conference on Very Large Data Bases (VLDB) 2023. [[code]](https://github.com/decis-bench/febench). -- [A System for Time Series Feature Extraction in Federated Learning](https://dl.acm.org/doi/pdf/10.1145/3511808.3557176). Siqi Wang, Jiashu Li, Mian Lu, Zhao Zheng, Yuqiang Chen, and Bingsheng He. 2022. In Proceedings of the 31st ACM International Conference on Information & Knowledge Management (CIKM) 2022. [[code]](https://github.com/4paradigm/tsfe). +- [A System for Time Series Feature Extraction in Federated Learning](https://dl.acm.org/doi/pdf/10.1145/3511808.3557176). Siqi Wang, Jiashu Li, Mian Lu, Zhao Zheng, Yuqiang Chen, and Bingsheng He. 2022. In Proceedings of the 31st ACM International Conference on Information & Knowledge Management (CIKM) 2022. [[code]](https://github.com/4paradigm/tsfe). - [Optimizing in-memory database engine for AI-powered on-line decision augmentation using persistent memory](http://vldb.org/pvldb/vol14/p799-chen.pdf). Cheng Chen, Jun Yang, Mian Lu, Taize Wang, Zhao Zheng, Yuqiang Chen, Wenyuan Dai, Bingsheng He, Weng-Fai Wong, Guoan Wu, Yuping Zhao, and Andy Rudoff. International Conference on Very Large Data Bases (VLDB) 2021. ## 13. [The User List](https://github.com/4paradigm/OpenMLDB/discussions/707) diff --git a/README_cn.md b/README_cn.md index 7ff8b3a0d3a..b4ac59aa280 100644 --- a/README_cn.md +++ b/README_cn.md @@ -144,7 +144,7 @@ OpenMLDB 的整体架构设计是为了达到特征平台从开发到部署的 ![wechat](docs/zh/about/images/wechat.png) ## 12. 学术论文 - +- [PECJ: Stream Window Join on Disorder Data Streams with Proactive Error Compensation](https://tonyskyzeng.github.io/downloads/PECJ_TR.pdf). 
Xianzhi Zeng, Shuhao Zhang, Hongbin Zhong, Hao Zhang, Mian Lu, Zhao Zheng, and Yuqiang Chen. International Conference on Management of Data (SIGMOD/PODS) 2024. - [Principles and Practices of Real-Time Feature Computing Platforms for ML](https://cacm.acm.org/magazines/2023/7/274061-principles-and-practices-of-real-time-feature-computing-platforms-for-ml/fulltext). Hao Zhang, Jun Yang, Cheng Chen, Siqi Wang, Jiashu Li, and Mian Lu. 2023. Communications of the ACM 66, 7 (July 2023), 77–78. - [Scalable Online Interval Join on Modern Multicore Processors in OpenMLDB](docs/paper/scale_oij_icde2023.pdf). Hao Zhang, Xianzhi Zeng, Shuhao Zhang, Xinyi Liu, Mian Lu, and Zhao Zheng. In 2023 IEEE 39rd International Conference on Data Engineering (ICDE) 2023. [[code]](https://github.com/4paradigm/OpenMLDB/tree/stream) - [FEBench: A Benchmark for Real-Time Relational Data Feature Extraction](https://github.com/decis-bench/febench/blob/main/report/febench.pdf). Xuanhe Zhou, Cheng Chen, Kunyi Li, Bingsheng He, Mian Lu, Qiaosheng Liu, Wei Huang, Guoliang Li, Zhao Zheng, Yuqiang Chen. International Conference on Very Large Data Bases (VLDB) 2023. [[code]](https://github.com/decis-bench/febench). diff --git a/cases/function/join/test_lastjoin_simple.yaml b/cases/function/join/test_lastjoin_simple.yaml index 4d23b312ef2..589e98bd05b 100644 --- a/cases/function/join/test_lastjoin_simple.yaml +++ b/cases/function/join/test_lastjoin_simple.yaml @@ -1067,4 +1067,4 @@ cases: rows: - [ "aa",2,131,1590738990000 ] - [ "bb",21,NULL,NULL ] - - [ "dd", 41, NULL, NULL ] \ No newline at end of file + - [ "dd", 41, NULL, NULL ] diff --git a/cases/plan/back_quote_identifier.yaml b/cases/plan/back_quote_identifier.yaml index cafce9e5b2d..4743634c370 100644 --- a/cases/plan/back_quote_identifier.yaml +++ b/cases/plan/back_quote_identifier.yaml @@ -131,12 +131,10 @@ cases: | | +-node[kColumnDesc] | | +-column_name: a-1 | | +-column_type: int32 - | | +-NOT NULL: 0 | +-1: | | +-node[kColumnDesc] | | +-column_name: b-1 | | +-column_type: string - | | +-NOT NULL: 0 | +-2: | +-node[kColumnIndex] | +-keys: [a-1, b-1] diff --git a/cases/plan/const_query.yaml b/cases/plan/const_query.yaml index 98ce9ff2119..c388f163b7c 100644 --- a/cases/plan/const_query.yaml +++ b/cases/plan/const_query.yaml @@ -39,3 +39,36 @@ cases: mode: request-unsupport sql: | select int(NULL) as c1, bigint(NULL) as c2, float(NULL) as c3, double(NULL) as c4, timestamp(NULL) as c5, date(NULL) as c6, string(NULL) as c7; + + - id: map_data_type + mode: request-unsupport + desc: access map value with []operator + sql: | + select map(1, 2)[1] + expect: + node_tree_str: | + +-node[kQuery]: kQuerySelect + +-distinct_opt: false + +-where_expr: null + +-group_expr_list: null + +-having_expr: null + +-order_expr_list: null + +-limit: null + +-select_list[list]: + | +-0: + | +-node[kResTarget] + | +-val: + | | map(1, 2)[1] + | +-name: + +-tableref_list: [] + +-window_list: [] + plan_tree_str: | + +-[kQueryPlan] + +-[kProjectPlan] + +-table: + +-project_list_vec[list]: + +-[kProjectList] + +-projects on table [list]: + +-[kProjectNode] + +-[0]map(1, 2)[1]: map(1, 2)[1] + null diff --git a/cases/plan/create.yaml b/cases/plan/create.yaml index 66bb1ee548c..6210401ee9d 100644 --- a/cases/plan/create.yaml +++ b/cases/plan/create.yaml @@ -163,12 +163,10 @@ cases: | | +-node[kColumnDesc] | | +-column_name: a | | +-column_type: int32 - | | +-NOT NULL: 0 | +-1: | | +-node[kColumnDesc] | | +-column_name: b | | +-column_type: string - | | +-NOT NULL: 0 | +-2: | +-node[kColumnIndex] | 
+-keys: [a, b] @@ -218,12 +216,10 @@ cases: | | +-node[kColumnDesc] | | +-column_name: a | | +-column_type: int16 - | | +-NOT NULL: 0 | +-1: | | +-node[kColumnDesc] | | +-column_name: b | | +-column_type: float - | | +-NOT NULL: 0 | +-2: | +-node[kColumnIndex] | +-keys: [a] @@ -274,12 +270,10 @@ cases: | | +-node[kColumnDesc] | | +-column_name: a | | +-column_type: int32 - | | +-NOT NULL: 0 | +-1: | | +-node[kColumnDesc] | | +-column_name: b | | +-column_type: timestamp - | | +-NOT NULL: 0 | +-2: | +-node[kColumnIndex] | +-keys: [a] @@ -627,12 +621,10 @@ cases: | | +-node[kColumnDesc] | | +-column_name: a | | +-column_type: int32 - | | +-NOT NULL: 0 | +-1: | | +-node[kColumnDesc] | | +-column_name: b | | +-column_type: timestamp - | | +-NOT NULL: 0 | +-2: | +-node[kColumnIndex] | +-keys: [a] @@ -685,33 +677,27 @@ cases: | +-0: | | +-node[kColumnDesc] | | +-column_name: column1 - | | +-column_type: int32 - | | +-NOT NULL: 1 + | | +-column_type: int32 NOT NULL | +-1: | | +-node[kColumnDesc] | | +-column_name: column2 - | | +-column_type: int16 - | | +-NOT NULL: 1 + | | +-column_type: int16 NOT NULL | +-2: | | +-node[kColumnDesc] | | +-column_name: column5 - | | +-column_type: string - | | +-NOT NULL: 1 + | | +-column_type: string NOT NULL | +-3: | | +-node[kColumnDesc] | | +-column_name: column6 - | | +-column_type: string - | | +-NOT NULL: 1 + | | +-column_type: string NOT NULL | +-4: | | +-node[kColumnDesc] | | +-column_name: std_ts - | | +-column_type: timestamp - | | +-NOT NULL: 1 + | | +-column_type: timestamp NOT NULL | +-5: | | +-node[kColumnDesc] | | +-column_name: std_date - | | +-column_type: date - | | +-NOT NULL: 1 + | | +-column_type: date NOT NULL | +-6: | +-node[kColumnIndex] | +-keys: [column2] @@ -743,33 +729,27 @@ cases: | +-0: | | +-node[kColumnDesc] | | +-column_name: column1 - | | +-column_type: int32 - | | +-NOT NULL: 1 + | | +-column_type: int32 NOT NULL | +-1: | | +-node[kColumnDesc] | | +-column_name: column2 - | | +-column_type: int16 - | | +-NOT NULL: 1 + | | +-column_type: int16 NOT NULL | +-2: | | +-node[kColumnDesc] | | +-column_name: column5 - | | +-column_type: string - | | +-NOT NULL: 1 + | | +-column_type: string NOT NULL | +-3: | | +-node[kColumnDesc] | | +-column_name: column6 - | | +-column_type: string - | | +-NOT NULL: 1 + | | +-column_type: string NOT NULL | +-4: | | +-node[kColumnDesc] | | +-column_name: std_ts - | | +-column_type: timestamp - | | +-NOT NULL: 1 + | | +-column_type: timestamp NOT NULL | +-5: | | +-node[kColumnDesc] | | +-column_name: std_date - | | +-column_type: date - | | +-NOT NULL: 1 + | | +-column_type: date NOT NULL | +-6: | +-node[kColumnIndex] | +-keys: [column2] @@ -796,17 +776,11 @@ cases: | +-0: | | +-node[kColumnDesc] | | +-column_name: column1 - | | +-column_type: int32 - | | +-NOT NULL: 0 - | | +-default_value: - | | +-expr[primary] - | | +-value: 1 - | | +-type: int32 + | | +-column_type: int32 DEFAULT 1 | +-1: | +-node[kColumnDesc] | +-column_name: column2 | +-column_type: int32 - | +-NOT NULL: 0 +-table_option_list: [] - id: 27 desc: Column default value with explicit type @@ -824,20 +798,11 @@ cases: | +-0: | | +-node[kColumnDesc] | | +-column_name: column1 - | | +-column_type: string - | | +-NOT NULL: 0 - | | +-default_value: - | | +-expr[cast] - | | +-cast_type: string - | | +-expr: - | | +-expr[primary] - | | +-value: 1 - | | +-type: int32 + | | +-column_type: string DEFAULT string(1) | +-1: | +-node[kColumnDesc] | +-column_name: column3 | +-column_type: int32 - | +-NOT NULL: 0 +-table_option_list: [] - id: 28 
desc: Create table with database.table @@ -856,12 +821,10 @@ cases: | | +-node[kColumnDesc] | | +-column_name: column1 | | +-column_type: string - | | +-NOT NULL: 0 | +-1: | +-node[kColumnDesc] | +-column_name: column3 | +-column_type: int32 - | +-NOT NULL: 0 +-table_option_list: [] - id: 29 desc: create index with db name prefix @@ -898,12 +861,10 @@ cases: | | +-node[kColumnDesc] | | +-column_name: column1 | | +-column_type: int32 - | | +-NOT NULL: 0 | +-1: | | +-node[kColumnDesc] | | +-column_name: column2 | | +-column_type: timestamp - | | +-NOT NULL: 0 | +-2: | +-node[kColumnIndex] | +-keys: [column1] @@ -934,12 +895,10 @@ cases: | | +-node[kColumnDesc] | | +-column_name: a | | +-column_type: int32 - | | +-NOT NULL: 0 | +-1: | | +-node[kColumnDesc] | | +-column_name: b | | +-column_type: timestamp - | | +-NOT NULL: 0 | +-2: | +-node[kColumnIndex] | +-keys: [a] @@ -1049,12 +1008,10 @@ cases: | | +-node[kColumnDesc] | | +-column_name: column1 | | +-column_type: int32 - | | +-NOT NULL: 0 | +-1: | | +-node[kColumnDesc] | | +-column_name: column2 | | +-column_type: timestamp - | | +-NOT NULL: 0 | +-2: | +-node[kColumnIndex] | +-keys: [column1] @@ -1068,3 +1025,45 @@ cases: +-0: +-node[kCompressType] +-compress_type: snappy + - id: 35 + desc: Create table with array & map type + sql: | + create table t1 (id int, + member ARRAY NOT NULL, + attrs MAP NOT NULL); + expect: + node_tree_str: | + +-node[CREATE] + +-table: t1 + +-IF NOT EXIST: 0 + +-column_desc_list[list]: + | +-0: + | | +-node[kColumnDesc] + | | +-column_name: id + | | +-column_type: int32 + | +-1: + | | +-node[kColumnDesc] + | | +-column_name: member + | | +-column_type: array NOT NULL + | +-2: + | +-node[kColumnDesc] + | +-column_name: attrs + | +-column_type: map NOT NULL + +-table_option_list: [] + plan_tree_str: | + +-[kCreatePlan] + +-table: t1 + +-column_desc_list[list]: + | +-0: + | | +-node[kColumnDesc] + | | +-column_name: id + | | +-column_type: int32 + | +-1: + | | +-node[kColumnDesc] + | | +-column_name: member + | | +-column_type: array NOT NULL + | +-2: + | +-node[kColumnDesc] + | +-column_name: attrs + | +-column_type: map NOT NULL + +-table_option_list: [] diff --git a/cases/plan/simple_query.yaml b/cases/plan/simple_query.yaml index 66cc542fbc0..95372e7803f 100644 --- a/cases/plan/simple_query.yaml +++ b/cases/plan/simple_query.yaml @@ -644,3 +644,4 @@ cases: +-[kTablePlan] +-table: t +-alias: t1 + diff --git a/cases/query/udf_query.yaml b/cases/query/udf_query.yaml index ded80e003ce..c2fdc4678de 100644 --- a/cases/query/udf_query.yaml +++ b/cases/query/udf_query.yaml @@ -554,3 +554,37 @@ cases: - c1 bool data: | true, false + + # ================================================================ + # Map data type + # ================================================================ + - id: 13 + mode: request-unsupport + sql: | + select + map(1, "2")[1] as e1, + map("abc", 100)["abc"] as e2, + map(1, "2", 3, "4")[5] as e3, + map("c", 99, "d", 101)["d"] as e4, + map(date("2012-12-12"), "e", date("2013-11-11"), "f", date("2014-10-10"), "g")[date("2013-11-11")] as e5, + map(timestamp(88), timestamp(1000), timestamp(99), timestamp(2000)) [timestamp(99)] as e6, + map('1', 2, '3', 4, '5', 6, '7', 8, '9', 10, '11', 12)['9'] as e7, + map('1', 2, '3', 4, '5', 6, '7', 8, '9', 10, '11', 12)['10'] as e8, + # first match on duplicate keys + map('1', 2, '1', 4, '1', 6, '7', 8, '9', 10, '11', 12)['1'] as e9, + map("c", 99, "d", NULL)["d"] as e10, + expect: + columns: ["e1 string", "e2 int", "e3 string", "e4 int", "e5 string", "e6 
timestamp", "e7 int", "e8 int", "e9 int", "e10 int"] + data: | + 2, 100, NULL, 101, f, 2000, 10, NULL, 2, NULL + - id: 14 + mode: request-unsupport + sql: | + select + array_contains(map_keys(map(1, '2', 3, '4')), 1) as e1, + array_contains(map_keys(map('1', 2, '3', 4)), '2') as e2, + array_contains(map_keys(map(timestamp(88), timestamp(1000), timestamp(99), timestamp(2000))) , timestamp(99)) as e3, + expect: + columns: ["e1 bool", "e2 bool", "e3 bool"] + data: | + true, false, true diff --git a/demo/usability_testing/data_mocker.py b/demo/usability_testing/data_mocker.py index f873daec9dc..6729ef0b70b 100644 --- a/demo/usability_testing/data_mocker.py +++ b/demo/usability_testing/data_mocker.py @@ -6,6 +6,7 @@ from typing import Optional import numpy as np import pandas as pd +import dateutil # to support save csv, and faster parquet, we don't use faker-cli directly @@ -146,8 +147,9 @@ def type_converter(sql_type): if sql_type in ['varchar', 'string']: # TODO(hw): set max length return 'pystr', {} + # timestamp should > 0 cuz tablet insert will check it, use utc if sql_type in ['date', 'timestamp']: - return 'iso8601', {} + return 'iso8601', {"tzinfo": dateutil.tz.UTC} if sql_type in ['float', 'double']: return 'pyfloat', ranges[sql_type] return 'py' + sql_type, {} diff --git a/docs/en/app_ecosystem/feature_platform/concept.md b/docs/en/app_ecosystem/feature_platform/concept.md new file mode 100644 index 00000000000..8a7a6339644 --- /dev/null +++ b/docs/en/app_ecosystem/feature_platform/concept.md @@ -0,0 +1,12 @@ +## Introduction + +The OpenMLDB Feature Platform is a sophisticated feature store service, leveraging [OpenMLDB](https://github.com/4paradigm/OpenMLDB) for efficient feature management and orchestration. + + + +* Feature: Data obtained through feature extraction from raw data that can be directly used for model training and inference. +* Feature View: A set of features defined by a single SQL computation statement. +* Data Table: In OpenMLDB, data tables include online storage that supports real-time queries and distributed offline storage. +* Online Scenario: By deploying online feature services, it provides hard real-time online feature extraction interfaces using online data. +* Offline Scenario: Uses distributed computing to process offline data for feature computation and exports sample files needed for machine learning. +* Online-Offline Consistency: Ensuring that the feature results computed in online and offline scenarios are consistent through the same SQL definitions. diff --git a/docs/en/app_ecosystem/feature_platform/index.rst b/docs/en/app_ecosystem/feature_platform/index.rst new file mode 100644 index 00000000000..93e31d3a062 --- /dev/null +++ b/docs/en/app_ecosystem/feature_platform/index.rst @@ -0,0 +1,12 @@ +============================= +OpenMLDB Feature Platform +============================= + +.. toctree:: + :maxdepth: 1 + + concept + installation + tutorial + usage + \ No newline at end of file diff --git a/docs/en/app_ecosystem/feature_platform/installation.md b/docs/en/app_ecosystem/feature_platform/installation.md new file mode 100644 index 00000000000..1369a3a297f --- /dev/null +++ b/docs/en/app_ecosystem/feature_platform/installation.md @@ -0,0 +1,55 @@ +## Installation + +### Java + +Download the jar file. + +``` +wget https://openmldb.ai/download/feature-platform/openmldb-feature-platform-0.8-SNAPSHOT.jar +``` + +Prepare the config file which may be named as `application.yml`. 
+ +``` +server: + port: 8888 + +openmldb: + zk_cluster: 127.0.0.1:2181 + zk_path: /openmldb + apiserver: 127.0.0.1:9080 +``` + +Start the feature platform server. + +``` +java -jar ./openmldb-feature-platform-0.8-SNAPSHOT.jar +``` + +### Docker + +Prepare the config file `application.yml` and start the docker container. + +``` +docker run -d -p 8888:8888 -v `pwd`/application.yml:/app/application.yml registry.cn-shenzhen.aliyuncs.com/tobe43/openmldb-feature-platform +``` + +### Compiling from Source + +Clone the source code and build from scratch. + +``` +git clone https://github.com/4paradigm/feature-platform + +cd ./feature-platform/frontend/ +npm run build + +cd ../ +mvn clean package +``` + +Start the server with local config file. + +``` +./start_server.sh +``` \ No newline at end of file diff --git a/docs/en/app_ecosystem/feature_platform/tutorial.md b/docs/en/app_ecosystem/feature_platform/tutorial.md new file mode 100644 index 00000000000..f7b34200706 --- /dev/null +++ b/docs/en/app_ecosystem/feature_platform/tutorial.md @@ -0,0 +1,9 @@ +## Tutorial + +Access the feature platform by navigating to http://127.0.0.1:8888/ using any conventional web browser. + +1. Importing Data: Create databases, create data tables, import online data, and import offline data using SQL commands or frontend forms. +2. Creating Features: Define feature views using SQL statements. The feature platform will use a SQL compiler to analyze the features and create corresponding entities. +3. Offline Scenario: Select the desired features to import. You can choose features from different feature views simultaneously and use distributed computing to import sample files into local or distributed storage. +4. Online Scenario: Select the desired features to go live. Publish them as an online feature extraction service with one click, and then use an HTTP client to request and return online feature extraction results. +5. SQL Debugging: Execute any online or offline computing SQL statement and view the execution results and logs on the web frontend. \ No newline at end of file diff --git a/docs/en/app_ecosystem/feature_platform/usage.md b/docs/en/app_ecosystem/feature_platform/usage.md new file mode 100644 index 00000000000..2dfd2d78c85 --- /dev/null +++ b/docs/en/app_ecosystem/feature_platform/usage.md @@ -0,0 +1,2 @@ +## Application Guide + diff --git a/docs/en/app_ecosystem/sql_emulator/index.rst b/docs/en/app_ecosystem/sql_emulator/index.rst new file mode 100644 index 00000000000..f67223a96b1 --- /dev/null +++ b/docs/en/app_ecosystem/sql_emulator/index.rst @@ -0,0 +1,8 @@ +============================= +OpenMLDB SQL Emulator +============================= + +.. toctree:: + :maxdepth: 1 + + sql_emulator \ No newline at end of file diff --git a/docs/en/app_ecosystem/sql_emulator/sql_emulator.md b/docs/en/app_ecosystem/sql_emulator/sql_emulator.md new file mode 100644 index 00000000000..0ffe8525e44 --- /dev/null +++ b/docs/en/app_ecosystem/sql_emulator/sql_emulator.md @@ -0,0 +1,264 @@ +# Quickstart + +OpenMLDB SQL Emulator is a lightweight SQL simulator for [OpenMLDB](https://github.com/4paradigm/OpenMLDB), designed to facilitate more efficient and convenient development and debugging of OpenMLDB SQL without the cumbersome deployment of a running OpenMLDB cluster. + +To efficiently perform time-series feature calculations, OpenMLDB SQL has been improved and extended from standard SQL. Therefore, beginners using OpenMLDB SQL often encounter issues related to unfamiliar syntax and confusion regarding execution modes. 
Developing and debugging directly on an OpenMLDB cluster can lead to significant time wasted on irrelevant tasks such as deployment, index building, handling large volumes of data, and may also make it challenging to pinpoint the root cause of SQL errors. + +The OpenMLDB SQL Emulator serves as a lightweight tool for simulating the development and debugging of OpenMLDB SQL queries without the need for deployment within an OpenMLDB cluster. We highly recommend this tool to our application developers, as it allows them to initially validate SQL correctness and suitability for deployment before transitioning to the actual OpenMLDB environment for deployment and production. + + +## Installation and Startup + +From [release page](https://github.com/vagetablechicken/OpenMLDBSQLEmulator/releases), download the runtime package `emulator-1.0.jar` and launch it using the following command (please note that the current release version 1.0 corresponds to the SQL syntax of OpenMLDB 0.8.3): + +```bash +java -jar emulator-1.0.jar +``` +Please note that in order to execute SQL queries using the `run` command to validate computation results, you'll also need to download the `toydb_run_engine` from the same page, and put this file in the `/tmp` directory of your system. + +## Usage +Upon starting the emulator, it will directly enter the default database, emudb. No additional database creation is required. + +- Databases do not need explicit creation. You can either use the command `use ` or specify the database name when creating a table to automatically create the database. +- Use the commands `addtable` or `t` to create virtual tables. Repeatedly creating a table with the same name will perform an update operation, using the most recent table schema. Simplified SQL-like syntax is used to manage tables. For instance, the following example creates a table with two columns: + +```sql +addtable t1 a int, b int64 +``` +- Use the command `showtables` or `st` to view all current databases and tables. + + +### OpenMLDB SQL Validation + +Typically, to validate whether OpenMLDB SQL can be deployed, you can do so in a real cluster using `DEPLOY`. However, using this method requires managing `DEPLOYMENT` and indexes. For instance, you may need to manually delete unnecessary `DEPLOYMENT` or clean up indexes if they are created unnecessarily. Therefore, we recommend testing and validating in the Emulator environment. + +You can use `val` and `valreq` to respectively perform validation of OpenMLDB SQL in online batch mode and online request mode (i.e., deploying as a service). For instance, to test if an SQL query can be deployed, you can use the `valreq` command: + +```sql +# table creations - t/addtable: create table +addtable t1 a int, b int64 + +# validate in online request mode +valreq select count(*) over w1 from t1 window w1 as (partition by a order by b rows between unbounded preceding and current row); +``` +If the test fails, it will print the SQL compilation error. If it passes, it will print `validate * success`. The entire process takes place in a virtual environment, without concerns about resource usage after table creation or any side effects. Any SQL that passes validation through `valreq` will definitely be deployable in a real cluster. + +### OpenMLDB SQL Computation Test + +The OpenMLDB SQL Emulator is also capable of returning computation results, facilitating the testing of whether the SQL computations align with expectations. 
You can recursively perform calculations and validations until the final satisfactory SQL is obtained. This functionality can be achieved using the `run` command in the Emulator. + +Please note that the `run` command requires support from `toydb_run_engine`. You can either use the pre-existing emulator package containing `toydb` or download the `toydb` program from [this page](https://github.com/vagetablechicken/OpenMLDBSQLEmulator/releases) and place it into the `/tmp` directory. + +Assuming the Emulator already has `toydb`, the steps for computation test are as follows: + +``` +# step 1, generate a yaml template +gencase + +# step 2 modify the yaml file to add table and data +# ... + +# step 3 load yaml and show tables +loadcase +st + +# step 4 use val/valreq to validate the sql +valreq select count(*) over w1 from t1 window w1 as (partition by id order by std_ts rows between unbounded preceding and current row); + +# step 5 dump the sql you want to run next, this will rewrite the yaml file +dumpcase select count(*) over w1 from t1 window w1 as (partition by id order by std_ts rows between unbounded preceding and current row); + +# step 6 run sql using toydb +run +``` +#### Explanations + +**step 1:** Run command `gencase` to generate a template yaml file. The default directory is `/tmp/emu-case.yaml`. + +Example yaml file: +```yaml +# call toydb_run_engine to run this yaml file +# you can generate yaml cases for reproduction by emulator dump or by yourself + +# you can set the global default db +db: emudb +cases: + - id: 0 + desc: describe this case + # you can set batch mode + mode: request + db: emudb # you can set default db for case, if not set, use the global default db + inputs: + - name: t1 + db: emudb # you can set db for each table, if not set, use the default db(table db > case db > global db) + # must set table schema, emulator can't do this + columns: ["id int", "pk1 string","col1 int32", "std_ts timestamp"] + # gen by emulator, just to init table, not the deployment index + indexs: [] + # must set the data, emulator can't do this + data: | + 1, A, 1, 1590115420000 + 2, B, 1, 1590115420000 + # query: only support single query, to check the result by `expect` + sql: | + + # optional, you can just check the output, or add your expect + # expect: + # schema: id:int, pk1:string, col1:int, std_ts:timestamp, w1_col1_sum:int, w2_col1_sum:int, w3_col1_sum:int + # order: id + # data: | + # 1, A, 1, 1590115420000, 1, 1, 1 + # 2, B, 1, 1590115420000, 1, 1, 1 +``` + +**step 2:** Edit this yaml file. Note the following: +- You must modify the table name, table schema, and its data; these cannot be modified within the Emulator. +- You can modify the `mode` of operation, which accepts either `batch` or `request` mode. +- It's not necessary to fill in the SQL. You can write it into a file in the Emulator using `dumpcase `. The common practice is to first validate the SQL, then dump it into the case once the SQL passes validation. Afterwards, use the `run` command to confirm that the SQL computation aligns with expectations. +- The table's indexes don't need to be manually filled in. They can be automatically generated during `dumpcase` based on the table schema (indexes are not specific to SQL and are unrelated to SQL queries; they are only required when creating a table). If not using `dumpcase`, then manually enter at least one index. Indexes have no specific requirements. Examples of manual creation: `["index1:c1:c2", ".."]`, `["index1:c1:c4:(10m,2):absorlat"]`. 
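For illustration, a hand-edited case file (before `dumpcase` fills in the SQL) might look like the sketch below; the table name, schema, rows, and index shown here are hypothetical values derived from the template above:

```yaml
db: emudb
cases:
  - id: 0
    desc: minimal hand-edited case
    mode: request
    inputs:
      - name: t1
        db: emudb
        columns: ["id int", "pk1 string", "col1 int32", "std_ts timestamp"]
        # at least one index is needed if you skip dumpcase; dumpcase generates it otherwise
        indexs: ["index1:pk1:std_ts"]
        data: |
          1, A, 1, 1590115420000
          2, B, 1, 1590115420000
    # left empty here; written by `dumpcase <sql>` in step 5
    sql: |
```
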
+ +**step 3:** Execute `loadcase`, and the table information from this case will be loaded into the Emulator. Confirm the successful loading of the case's table by using `st/showtables`. The displayed information should be similar to: +```bash +emudb> st +emudb={t1=id:int32,pk1:string,col1:int32,std_ts:timestamp} +``` + +**step 4:** Use `valreq` to confirm that the SQL we've written is syntactically correct and suitable for deployment. + +**step 5 & 6:** Perform computation testing on this SQL using the `dumpcase` and `run` commands. `dumpcase` effectively writes the SQL and default indexes into a case file, while the `run` command executes this case file. If you are proficient enough, you can also directly modify the case file and run it in the Emulator using `run`, or directly use `toydb_run_engine --yaml_path=...` to run it. + +## Additional Information +### Build +You can build the Emulator on your own. If you need to verify SQL computation results using the `run` command, you must place `toydb_run_engine` in `src/main/resources` and then execute the build process. + +```bash +# pack without toydb +mvn package -DskipTests + +# pack with toydb +cp toydb_run_engine src/main/resources +mvn package +``` + +To build `toydb_run_engine` from the source code: +``` +git clone https://github.com/4paradigm/OpenMLDB.git +cd OpenMLDB +make configure +cd build +make toydb_run_engine -j # minimum build +``` + +### Compatible OpenMLDB Versions + +The Emulator employs `openmldb-jdbc` for validations. The current compatible OpenMLDB version is: +|Emulator Version | Compatible OpenMLDB Versions | +|--|--| +| 1.0 | 0.8.3 | + +### Command List + + +#### Commands for Creation +Note that if a table already exists, the creation of a new table will replace the existing table. The default database is `emudb`. + + +- `use ` Use a database. If it doesn't exist, it will be created. +- `addtable c1 t1,c2 t2, ...` Create/replace a table in the current database. + - abbreviate: `t c1 t1,c2 t2, ...` + +- `adddbtable c1 t1,c2 t2, ...` Create/replace a table in the specified database. If the database doesn't exist, it will be created. + - abbreviate: `dt c1 t1,c2 t2, ...` +- `sql ` Create a table by SQL. + +- `showtables` / `st` list all tables. + + +#### `genddl ` + +If you want to create tables without redundant indexes, you can use `genddl` to generate ddl from the query SQL. + +Note that method `genDDL` in `openmldb-jdbc` does not support multiple databases yet, so we can't use this method to parse SQLs that have multiple dbs. + +- Example1 +``` +t t1 a int, b bigint +t t2 a int, b bigint +genddl select *, count(b) over w1 from t1 window w1 as (partition by a order by b rows between 1 preceding and current row) +``` +output: +``` +CREATE TABLE IF NOT EXISTS t1( + a int, + b bigint, + index(key=(a), ttl=1, ttl_type=latest, ts=`b`) +); +CREATE TABLE IF NOT EXISTS t2( + a int, + b bigint +); +``` +Since the SQL doesn't involve operations on t2, the SQL that creates t2 is just a simple create table, while the SQL that creates t1 is a create table with an index. 
+ +- Example2 +``` +t t1 a int, b bigint +t t2 a int, b bigint +genddl select *, count(b) over w1 from t1 window w1 as (union t2 partition by a order by b rows_range between 1d preceding and current row) +``` +output: +``` +CREATE TABLE IF NOT EXISTS t1( + a int, + b bigint, + index(key=(a), ttl=1440m, ttl_type=absolute, ts=`b`) +); +CREATE TABLE IF NOT EXISTS t2( + a int, + b bigint, + index(key=(a), ttl=1440m, ttl_type=absolute, ts=`b`) +); +``` +Since there's a union window, the SQLs that create t1 and t2 both have an index. + +#### SQL Validation Commands + +- `val ` validates SQL in batch mode; tables should be created beforehand +- `valreq ` validates SQL in request mode; tables should be created beforehand +``` +t t1 a int, b int64 +val select * from t1 where a == 123; +valreq select count(*) over w1 from t1 window w1 as (partition by a order by b rows between unbounded preceding and current row); +``` + +#### toydb Execution Commands + +`run ` + +Run the yaml file in toydb. You can use `gencase` to generate it. Currently, a single case is supported. The case should include table creation commands and a single SQL query. The default mode is `request` but can be changed to `batch` mode. + +Since the Emulator does not support add/del table or table data, please include the relevant operations in the yaml file. + +This yaml file can also be used to reproduce errors. If you need assistance, please provide us with the corresponding yaml file. + +#### Miscellaneous +- `#` comment. +- You **cannot** run a command in multi-lines, e.g. `val select * from t1;` cannot be written as +``` +# wrong +val select * +from +t1; +``` +- `?help` hint +- `?list` list all cmds +- `!run-script $filename` reads and executes commands from a given script file. The script file is just a text file with commands in it. e.g. +``` +!run-script src/test/resources/simple.emu +``` +- `!set-display-time true/false` toggles display of command execution time. Time is shown in milliseconds and is physical time of the method. +- `!enable-logging filename` and `!disable-logging` control logging settings, i.e. duplication of all Shell's input and output in a file. + +### CLI Framework + +We use `cliche` for the CLI interface. See [Manual](https://code.google.com/archive/p/cliche/wikis/Manual.wiki) and [source](https://github.com/budhash/cliche). diff --git a/docs/en/deploy/conf.md b/docs/en/deploy/conf.md index 138a414fa3d..5ca1ba9dcee 100644 --- a/docs/en/deploy/conf.md +++ b/docs/en/deploy/conf.md @@ -187,8 +187,8 @@ #--max_traverse_cnt=0 # max table traverse unique key number(batch query), default: 0 #--max_traverse_key_cnt=0 -# max result size in byte (default: 2MB) -#--scan_max_bytes_size=2097152 +# max result size in byte (default: 0 unlimited) +#--scan_max_bytes_size=0 # loadtable # The number of data bars to submit a task to the thread pool when loading diff --git a/docs/en/index.rst b/docs/en/index.rst index 4ebb3d4389d..3b6bd1e7599 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -17,3 +17,10 @@ OpenMLDB Docs (|version|) reference/index developer/index +.. 
toctree:: + :maxdepth: 1 + :hidden: + :caption: 📚 Application Ecosystem + + app_ecosystem/feature_platform/index + app_ecosystem/sql_emulator/index \ No newline at end of file diff --git a/docs/en/integration/deploy_integration/images/demo_steps.png b/docs/en/integration/deploy_integration/images/demo_steps.png new file mode 100644 index 00000000000..f7ea3519f2c Binary files /dev/null and b/docs/en/integration/deploy_integration/images/demo_steps.png differ diff --git a/docs/en/integration/online_datasources/images/demo_steps.png b/docs/en/integration/online_datasources/images/demo_steps.png new file mode 100644 index 00000000000..f7ea3519f2c Binary files /dev/null and b/docs/en/integration/online_datasources/images/demo_steps.png differ diff --git a/docs/en/integration/online_datasources/images/init_sink_status.png b/docs/en/integration/online_datasources/images/init_sink_status.png new file mode 100644 index 00000000000..17637f27bd3 Binary files /dev/null and b/docs/en/integration/online_datasources/images/init_sink_status.png differ diff --git a/docs/en/integration/online_datasources/images/kafka_connector_steps.png b/docs/en/integration/online_datasources/images/kafka_connector_steps.png new file mode 100644 index 00000000000..1ed49e8f8b7 Binary files /dev/null and b/docs/en/integration/online_datasources/images/kafka_connector_steps.png differ diff --git a/docs/en/integration/online_datasources/images/kafka_openmldb_result.png b/docs/en/integration/online_datasources/images/kafka_openmldb_result.png new file mode 100644 index 00000000000..8d986acc9be Binary files /dev/null and b/docs/en/integration/online_datasources/images/kafka_openmldb_result.png differ diff --git a/docs/en/integration/online_datasources/images/kafka_topic_describe.png b/docs/en/integration/online_datasources/images/kafka_topic_describe.png new file mode 100644 index 00000000000..fc329b15c2d Binary files /dev/null and b/docs/en/integration/online_datasources/images/kafka_topic_describe.png differ diff --git a/docs/en/integration/online_datasources/images/openmldb_result.png b/docs/en/integration/online_datasources/images/openmldb_result.png new file mode 100644 index 00000000000..8ac56daee5c Binary files /dev/null and b/docs/en/integration/online_datasources/images/openmldb_result.png differ diff --git a/docs/en/integration/online_datasources/images/producer_code.png b/docs/en/integration/online_datasources/images/producer_code.png new file mode 100644 index 00000000000..2ce00c602a3 Binary files /dev/null and b/docs/en/integration/online_datasources/images/producer_code.png differ diff --git a/docs/en/integration/online_datasources/images/sink_status.png b/docs/en/integration/online_datasources/images/sink_status.png new file mode 100644 index 00000000000..ba9af5126cc Binary files /dev/null and b/docs/en/integration/online_datasources/images/sink_status.png differ diff --git a/docs/en/integration/online_datasources/images/table.png b/docs/en/integration/online_datasources/images/table.png new file mode 100644 index 00000000000..ee6c72b4586 Binary files /dev/null and b/docs/en/integration/online_datasources/images/table.png differ diff --git a/docs/en/integration/online_datasources/images/test_data.png b/docs/en/integration/online_datasources/images/test_data.png new file mode 100644 index 00000000000..bd25ee5be4e Binary files /dev/null and b/docs/en/integration/online_datasources/images/test_data.png differ diff --git a/docs/en/integration/online_datasources/images/topic_schema.png 
b/docs/en/integration/online_datasources/images/topic_schema.png new file mode 100644 index 00000000000..12d66ecebb4 Binary files /dev/null and b/docs/en/integration/online_datasources/images/topic_schema.png differ diff --git a/docs/en/integration/online_datasources/kafka_connector_demo.md b/docs/en/integration/online_datasources/kafka_connector_demo.md new file mode 100644 index 00000000000..b59d3c4a02b --- /dev/null +++ b/docs/en/integration/online_datasources/kafka_connector_demo.md @@ -0,0 +1,275 @@ +# Kafka + +## Introduction + +Apache Kafka is an event streaming platform that can be used as an online data source for OpenMLDB, allowing real-time data streams to be imported into OpenMLDB online. For more information on Kafka, please refer to the official website https://kafka.apache.org/. We have developed a Kafka Connector to seamlessly connect Kafka and OpenMLDB, facilitating data integration between the two platforms. In this document, you will learn about the concepts and usage of this connector. + +Please note that, for the sake of simplicity, this article will demonstrate the use of the Kafka Connect standalone mode to start the connector. However, the connector can also be fully started in distributed mode. + +```{seealso} +The implementation of the OpenMLDB Kafka Connector can be found in the [extensions/kafka-connect-jdbc](https://github.com/4paradigm/OpenMLDB/tree/main/extensions/kafka-connect-jdbc) directory. +``` + +## Overview + +### Download and Preperation + +- If you need to download Kafka, please click on the [Kafka Official Download](https://kafka.apache.org/downloads) link and download `kafka_2.13-3.1.0.tgz`. +- If you need to download the connector package and its dependencies, please click on [kafka-connect-jdbc.tgz](http://openmldb.ai/download/kafka-connector/kafka-connect-jdbc.tgz). +- If you need to download the configuration and script files required in this article, please click on [kafka_demo_files.tgz](http://openmldb.ai/download/kafka-connector/kafka_demo_files.tgz). + +This article will use Docker mode to start OpenMLDB, so there is no need to download OpenMLDB separately. Additionally, both Kafka and the connector can be started in the same container. + +We recommend that you bind all three downloaded file packages to the `kafka` directory. Alternatively, you can download the file packages after starting the container. For our demonstration, we assume that the file packages are all in the `/work/kafka` directory. + +``` +docker run -it -v `pwd`:/work/kafka 4pdosc/openmldb:0.8.4 bash +``` + +### Note + +Timestamp is in ms, value is set to JsonConvertor, only integer is supported. Depends on different messages, other Convertor can be selected. + +Connector can be used in earlier versions of Kafka Server, e.g. 1.1.1. However, note that the earlier versions may not have Kafka Broker "auto create topics" on. You will need to [enable it](https://kafka.apache.org/documentation/#brokerconfigs_auto.create.topics.enable). + + +### Process + +The brief process of using a connector is shown in the following figure. We will now provide a detailed introduction to each step. + +Overall, the usage process can be summarized into four steps: + +1. Start OpenMLDB and create a database + +2. Start Kafka and create a topic + +3. Start OpenMLDB Kafka Connector + +4. 
Conduct testing or normal use + +![demo steps](images/kafka_connector_steps.png) + +## Step 1: Start OpenMLDB and Create Database + +### Start OpenMLDB Cluster + +Start cluster in OpenMLDB container: + +``` +/work/init.sh +``` + +```{caution} +Currently, only the OpenMLDB cluster version can serve as the receiver of sink, and data will only be sink to the online storage of the cluster. +``` + +### Create Database + +We can quickly create a database through the pipe without logging into the client CLI: + +``` +echo "create database kafka_test;" | /work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client +``` + +## Step 2: Start Kafka and Create Topic + +### Start Kafka + +Unzip Kafka, and then use the start script to start Kafka. + +``` +cd kafka +tar -xzf kafka_2.13-3.1.0.tgz +cd kafka_2.13-3.1.0 +./bin/kafka-server-start.sh -daemon config/server.properties +``` + +```{note} +The OpenMLDB service has already started ZooKeeper using port 2181, and Kafka does not need to start ZooKeeper again. So, all you need to do here is start the server. +``` + +You can use the `ps` command to check whether Kafka is running normally. If the startup fails, please check the log file `logs/server.log`. + +``` +ps axu|grep kafka +``` + +### Create Topic + +We have created a topic named `topic1`. Please note that special characters should be avoided as much as possible in the topic name. + +``` +./bin/kafka-topics.sh --create --topic topic1 --bootstrap-server localhost:9092 +``` + +You can `describe` topic to double-check if it is running normally. + +``` +./bin/kafka-topics.sh --describe --topic topic1 --bootstrap-server localhost:9092 +``` + +![topic status](images/kafka_topic_describe.png) + +## Step 3: Start Connector + +First, upzip `/work/kafka` in the connector and kafka_demo_files package. + +``` +cd /work/kafka +tar zxf kafka-connect-jdbc.tgz +tar zxf kafka_demo_files.tgz +``` + +To start the connector, you need two configuration files from the `kafka_demo_files` and place the connector plugin in the correct location. + +The first configuration file is `connect-standalone.properties` for the connect worker. The key configuration to focus on is the "plugin.path," please make sure this configuration is set as follows: + +``` +plugin.path=/usr/local/share/java +``` + +The connector and all dependent packages required to run, need to be placed in this directory. The command is as follows: + +``` +mkdir -p /usr/local/share/java +cp -r /work/kafka/kafka-connect-jdbc /usr/local/share/java/ +``` + +The second configuration file is the Sink Connector configuration for connecting to OpenMLDB, named `openmldb-sink.properties`, as shown below: + +``` +name=test-sink +connector.class=io.confluent.connect.jdbc.JdbcSinkConnector +tasks.max=1 +topics=topic1 +connection.url=jdbc:openmldb:///kafka_test?zk=127.0.0.1:2181&zkPath=/openmldb +auto.create=true +value.converter=org.apache.kafka.connect.json.JsonConverter +value.converter.schemas.enable=true +``` + +In the connection configuration, it is essential to fill in the correct OpenMLDB URL address. This connector receives messages from `topic1` and automatically creates a table (auto.create). We have set the value converter in the `openmldb-sink.properties` configuration of the connector. Alternatively, you can set the default converter in the `connect-standalone.properties` of the connector worker without the need for additional configuration in the connector. 
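As an illustrative sketch of that alternative (these are standard Kafka Connect worker properties; the actual contents of the packaged `connect-standalone.properties` may differ), the worker-level defaults would look roughly like this:

```
# connect-standalone.properties (worker-level defaults, illustrative)
bootstrap.servers=localhost:9092
value.converter=org.apache.kafka.connect.json.JsonConverter
value.converter.schemas.enable=true
plugin.path=/usr/local/share/java
```
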
+ +```{tip} +Details of configuration items can be found in the Kafka documentation under [Configuring Connectors](https://kafka.apache.org/documentation/#connect_configuring). + +Among them, `connection.url` needs to be configured with the correct OpenMLDB cluster address and database name, and the database must already exist. + +Attributes such as `value.converter` can also be configured at the connector, and they will override the default configuration of the connector worker. After the connector is started, properties can also be dynamically modified through the HTTP API. +``` + +Next, use Kafka Connector standalone mode to start the connect worker. + +``` +cd /work/kafka/kafka_2.13-3.1.0 +./bin/connect-standalone.sh -daemon ../kafka_demo_files/connect-standalone.properties ../kafka_demo_files/openmldb-sink.properties +``` + +Please confirm whether the connect worker is running and if the sink task is correctly connected to the OpenMLDB cluster. You can check the `logs/connect.log`, and under normal circumstances, the log should display `Executing sink task`. + +## Step 4: Testing + +### Send Message + +For testing, we will use the console producer provided by Kafka as the messaging tool. + +Since the table has not been created yet, our message should include a schema to help Kafka parse the information and write it to OpenMLDB. + +``` +{"schema":{"type":"struct","fields":[{"type":"int16","optional":true,"field":"c1_int16"},{"type":"int32","optional":true,"field":"c2_int32"},{"type":"int64","optional":true,"field":"c3_int64"},{"type":"float","optional":true,"field":"c4_float"},{"type":"double","optional":true,"field":"c5_double"},{"type":"boolean","optional":true,"field":"c6_boolean"},{"type":"string","optional":true,"field":"c7_string"},{"type":"int64","name":"org.apache.kafka.connect.data.Date","optional":true,"field":"c8_date"},{"type":"int64","name":"org.apache.kafka.connect.data.Timestamp","optional":true,"field":"c9_timestamp"}],"optional":false,"name":"foobar"},"payload":{"c1_int16":1,"c2_int32":2,"c3_int64":3,"c4_float":4.4,"c5_double":5.555,"c6_boolean":true,"c7_string":"c77777","c8_date":19109,"c9_timestamp":1651051906000}} +``` + +To simplify the process, we will save the above information in the file `kafka_demo_files/message`. You can use this file directly to send the message to Kafka using the console producer. + +``` +./bin/kafka-console-producer.sh --topic topic1 --bootstrap-server localhost:9092 < ../kafka_demo_files/message +``` + +```{tip} +If you prefer the messages not to contain a schema but do not have additional components like Schema Registry, you can first create a table in OpenMLDB and then configure `auto.schema=true` in the connector. For detailed configuration methods, please refer to the [kafka connect jdbc documentation](https://github.com/4paradigm/OpenMLDB/blob/main/extensions/kafka-connect-jdbc/DEVELOP.md). Currently, this method only supports usage with JsonConverter. +``` + +### Check + +We can query whether the insertion was successful in OpenMLDB. Query script `kafka_demo_files/select.sql`, the content is as follows: + +``` +set @@execute_mode='online'; +use kafka_test; +select * from topic1; +``` + +You can directly run the query script to query: + +``` +/work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client < ../kafka_demo_files/select.sql +``` + +![openmldb result](images/kafka_openmldb_result.png) + +## Debugging + +### Log + +The logs of the Kafka server can be found in `log/server.log`. 
If Kafka itself is not functioning correctly, please check this log for potential issues. + +The log for the connector can be found in `log/connect.log`. If the producer is not running as expected or you encounter difficulties querying data in OpenMLDB, please check this log for any relevant information. + +### Reinitialization + +If you encounter any problems during testing, you can reinitialize the environment to facilitate the retry process. + +To stop Kafka, you need to terminate the two daemon processes by using the command `kill kafka`. + +``` +ps axu|grep kafka | grep -v grep | awk '{print $2}' | xargs kill -9 +``` + +Kafka's data can also be deleted, please refer to [TERMINATE THE KAFKA ENVIRONMENT](https://kafka.apache.org/quickstart#quickstart_kafkaterminate): + +``` +rm -rf /tmp/kafka-logs /tmp/kraft-combined-logs +``` + +Please do not delete the `/tmp/zookeeper` directory here, nor kill the zookeeper process, as OpenMLDB also uses this zookeeper cluster. When reinitializing OpenMLDB, it will handle the zookeeper termination and deletion of this directory. + +``` +/work/init.sh +``` + +To start a new OpenMLDB cluster, run the command `init.sh`. Afterward, you can create a database in OpenMLDB and restart Kafka for further testing. + +## Deployment using Kubernetes + +To deploy the OpenMLDB Kafka Connect service, you need to prepare the configuration files locally. It is recommended to modify the "plugin.path=/tmp" configuration item and adjust the ZK address of the OpenMLDB connection. + +Configuration files required for the deployment: + +- connect-standalone.properties +- openmldb-sink.properties + +After preparing the configuration files locally, create a ConfigMap using the following command. + +``` +kubectl create configmap openmldb-kafka-connect-configmap --from-file=connect-standalone.properties=connect-standalone.properties --from-file=openmldb-sink.properties=openmldb-sink.properties +``` + +Next, use the provided Dockerfile by OpenMLDB to create images and push them to the image repository. + +``` +docker build -t registry.cn-shenzhen.aliyuncs.com/tobe43/openmldb-kafka-connect -f Dockerfile . +``` + +Use the Yaml file provided by OpenMLDB to create a deployment, and make necessary modifications to the launched configuration repository. Please note that currently, only a single node Connect service is supported, so the replica configuration for deployment must be set to 1. + +``` +kubectl create -f ./openmldb-kafka-connect-deployment.yaml +``` + +Once the deployment is created, you can check the running status and logs of the corresponding Pod, and then send a message to Kafka to verify if the service functions correctly. + +If you want to access the RESTful interface of the Connect service, you can refer to the provided Yaml file by OpenMLDB to create the service. By default, NodePort is used, and the corresponding service can be accessed on port 8083 of the host. + +``` +kubectl create -f ./openmldb-kafka-connect-service.yaml +``` diff --git a/docs/en/integration/online_datasources/pulsar_connector_demo.md b/docs/en/integration/online_datasources/pulsar_connector_demo.md new file mode 100644 index 00000000000..43ee64988d7 --- /dev/null +++ b/docs/en/integration/online_datasources/pulsar_connector_demo.md @@ -0,0 +1,334 @@ +# Pulsar + +## Introduction + +Apache Pulsar is a cloud-native, distributed messaging platform that can serve as an online data source for OpenMLDB, allowing real-time data streams to be imported into OpenMLDB online. 
For more information about Pulsar, please refer to the official website https://pulsar.apache.org/. We have developed the OpenMLDB JDBC Connector for Pulsar, which facilitates the seamless connection between Pulsar and OpenMLDB. In this document, you will learn about the concept and usage of this connector. + +Please note that for the sake of simplicity, this article will use Pulsar Standalone, an OpenMLDB cluster, and a simple JSON message producer program to demonstrate how the OpenMLDB JDBC Connector works. However, this connector is fully functional in a Pulsar Cluster. + +```{seealso} +For detailed information on Pulsar's OpenMLDB Connector, you can also refer to Pulsar's [Official Website for Relevant Information](https://pulsar.apache.org/docs/en/next/io-connectors/#jdbc-Openmldb). +``` + +## Overview + +### Download + +- To proceed with the usage, you will need to download all the necessary files for this article. Please click on [files](https://openmldb.ai/download/pulsar-connector/files.tar.gz) to download them. These files include the connector packages, schema files, configuration files, and more. + +- Alternatively, if you only want to download the connector package for your own project, please click on [connector snapshot](https://github.com/4paradigm/OpenMLDB/releases/download/v0.4.4/pulsar-io-jdbc-openmldb-2.11.0-SNAPSHOT.nar). + +### The Process + +The overall process of using the connector is illustrated in the figure below. We will now provide a detailed introduction to each step. Additionally, we have recorded the complete steps, and the details can be found in [terminalizer](https://terminalizer.com/view/be2309235671). You can also download the script in [demo.yml](https://github.com/vagetablechicken/pulsar-openmldb-connector-demo/blob/main/demo.yml). + +In summary, the usage process can be broken down into three steps: + +1. Create the relevant databases and tables in OpenMLDB. +2. Create a sink in Pulsar to connect the Pulsar data stream with OpenMLDB and configure the corresponding schema in Pulsar to ensure that the data stream is correctly received by OpenMLDB and stored in the online database. +3. Conduct testing or normal usage. + +![demo steps](images/demo_steps.png) + +## Step 1: Create a Database and Data Table in OpenMLDB + +### Start OpenMLDB Cluster + +Using Docker, you can quickly start OpenMLDB and create tables for testing. For more information on creating an OpenMLDB cluster, please refer to [Quickstart](../../quickstart/openmldb_quickstart.md). + +```{caution} +Currently, only the OpenMLDB cluster version can act as the receiver of sinks, and data will only be sunk to the online storage of the cluster. +``` + +We recommend using the 'host network' mode to run Docker and bind the file directory 'files' where the SQL script is located. + +``` +docker run -dit --network host -v `pwd`/files:/work/pulsar_files --name openmldb 4pdosc/openmldb:0.8.4 bash +docker exec -it openmldb bash +``` + +Start cluster in OpenMLDB cluster: + +``` +./init.sh +``` + +```{caution} +Please note that on the macOS platform, even when using the host network, it is not supported to connect to the OpenMLDB server inside the container from outside. However, it is feasible to connect to OpenMLDB services in other containers from within the container. 
+``` + +### Create Table + +We use a script to quickly create tables, with the following content: + +``` +create database pulsar_test; +use pulsar_test; +create table connector_test(id string, vendor_id int, pickup_datetime bigint, dropoff_datetime bigint, passenger_count int, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, store_and_fwd_flag string, trip_duration int); +desc connector_test; +``` + +Execute script: + +``` +/work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client < /work/pulsar_files/create.sql +``` + +![table desc](images/table.png) + +```{note} +Currently, both JSONSchema and JDBC base connectors in Pulsar do not support 'java.sql.Timestamp'. Therefore, we use 'long' as the data type for the timestamp column (in OpenMLDB, long can be used as the timestamp). +``` + +## Step 2: Create Sink and Schema in Pulsar + +### Start Pulsar Standalone + +Using Docker makes it easier and quicker to launch Pulsar. We recommend using the 'host network' mode to run Docker, as it can avoid many container-related network connection issues. Additionally, we need to use `pulsar-admin` for sink creation, which is within the Pulsar image. Therefore, we use bash to run the container and execute commands one by one inside the container. It is also necessary to bind the 'files' directory. + +``` +docker run -dit --network host -v `pwd`/files:/pulsar/files --name pulsar apachepulsar/pulsar:2.9.1 bash +docker exec -it pulsar bash +``` + +Start the standalone server in Pulsar container. + +``` +bin/pulsar-daemon start standalone --zookeeper-port 5181 +``` + +```{note} +Since the OpenMLDB service is already using port 2181, we will set a different zk port for Pulsar. We will use port 2181 to connect to OpenMLDB, but the zk port within Pulsar standalone will not have any external impact. +``` + +You can use `ps` to check whether Pulsar is running normally. If the startup fails, check the log `logs/pulsar-standalone-....log`. + +``` +ps axu|grep pulsar +``` + +When you start a local standalone cluster, it will automatically create a 'public/default' namespace. This namespace is used for development, as mentioned in the [Pulsar Documentation](https://pulsar.apache.org/docs/en/2.9.0/standalone/#start-pulsar-standalone). + +**We will create sink in this namespace** + +```{seealso} +If you want to directly start Pulsar locally, refer to [Set Up a Standalone Pulsar Locally] (https://pulsar.apache.org/docs/en/standalone/) +``` + +#### Q&A + +Q: What is the reason when encountering such issue? + +``` +2022-04-07T03:15:59,289+0000 [main] INFO org.apache.zookeeper.server.NIOServerCnxnFactory - binding to port 0.0.0.0/0.0.0.0:5181 +2022-04-07T03:15:59,289+0000 [main] ERROR org.apache.pulsar.zookeeper.LocalBookkeeperEnsemble - Exception while instantiating ZooKeeper +java.net.BindException: Address already in use +``` + +A: Pulsar requires an unused port to start zk. Port 5181 is already in use, and the port number of '--zookeeper-port' needs to be changed. + +Q: Is port 8080 already in use? + +A: Port 8080 is the default configuration port for 'webServicePort', which can be replaced in `conf/standalone.conf`. However, note that pulsar-admin will use the 'webServiceUrl' in `conf/client.conf` to connect, and changes will also need to be synchronized. + +Q: Is port 6650 already in use? 
+ +A: Changes need to be synchronized to the 'brokerServicePort' in `conf/standalone.conf` and the 'brokerServiceUrl' configuration item in `conf/client.conf`. + +### Install Connector (Optional) + +In the previous steps, we bound the 'files' directory, which already provides the nar package for the connector. We can use the "non-built-in connector" mode to set the connector (i.e. specify the 'archive' configuration item in the sink configuration, which will be described in the next step). + +But if you want to use the OpenMLDB connector as a built-in connector, you need to create the 'connectors' directory and copy the nar file to the 'connectors' directory. + +``` +mkdir connectors +cp files/pulsar-io-jdbc-openmldb-2.11.0-SNAPSHOT.nar connectors/ +``` + +If you want to change or add a connector while Pulsar is running, you can notify Pulsar to update the information: + +``` +bin/pulsar-admin sinks reload +``` + +When the OpenMLDB connector becomes a built-in connector, its sink type name is 'jdbc-openmldb', and you can directly use this type name to specify the use of the OpenMLDB connector. + +### Create Sink + +We use the 'public/default' namespace to create a sink, and we need a configuration file for the sink, which is located in `files/pulsar-openmldb-jdbc-sink.yaml`. The content is as follows: + +``` + tenant: "public" + namespace: "default" + name: "openmldb-test-sink" + archive: "files/pulsar-io-jdbc-openmldb-2.11.0-SNAPSHOT.nar" + inputs: ["test_openmldb"] + configs: + jdbcUrl: "jdbc:openmldb:///pulsar_test?zk=localhost:2181&zkPath=/openmldb" + tableName: "connector_test" +``` + +```{note} +'name': The name of the sink. + +'archive': We use 'archive' to specify the sink connector, so here we use the OpenMLDB connector as a non-built-in connector. + +'input': This can be the name of multiple topics, but in this article, we will use only one topic. + +'config': JDBC configuration used to connect to the OpenMLDB cluster. +``` + +Next, create a sink and check. Please note that the input topic we set is 'test_openmldb', which will be used in subsequent steps. + +``` +./bin/pulsar-admin sinks create --sink-config-file files/pulsar-openmldb-jdbc-sink.yaml +./bin/pulsar-admin sinks status --name openmldb-test-sink +``` + +![init sink status](images/init_sink_status.png) + +### Create Schema + +Upload the schema to the topic 'test_openmldb', and the schema type is in JSON format. In the following steps, we will produce JSON messages with the same schema. 
The schema file is `files/openmldb-table-schema`, and its content is as follows:
+
+```
+{
+  "type": "JSON",
+  "schema":"{\"type\":\"record\",\"name\":\"OpenMLDBSchema\",\"namespace\":\"com.foo\",\"fields\":[{\"name\":\"id\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"vendor_id\",\"type\":\"int\"},{\"name\":\"pickup_datetime\",\"type\":\"long\"},{\"name\":\"dropoff_datetime\",\"type\":\"long\"},{\"name\":\"passenger_count\",\"type\":\"int\"},{\"name\":\"pickup_longitude\",\"type\":\"double\"},{\"name\":\"pickup_latitude\",\"type\":\"double\"},{\"name\":\"dropoff_longitude\",\"type\":\"double\"},{\"name\":\"dropoff_latitude\",\"type\":\"double\"},{\"name\":\"store_and_fwd_flag\",\"type\":[\"null\",\"string\"],\"default\":null},{\"name\":\"trip_duration\",\"type\":\"int\"}]}",
+  "properties": {}
+}
+```
+
+The command to upload and check the schema is as follows:
+
+```
+./bin/pulsar-admin schemas upload test_openmldb -f ./files/openmldb-table-schema
+./bin/pulsar-admin schemas get test_openmldb
+```
+
+![topic schema](images/topic_schema.png)
+
+## Step 3: Testing
+
+### Send Message
+
+We use two rows of the sample data `data/taxi_tour_table_train_simple.csv` from the OpenMLDB image as the test messages. The data is shown in the following figure:
+![test data](images/test_data.png)
+
+#### Java Producer
+
+For the Java producer code, see the [demo producer](https://github.com/vagetablechicken/pulsar-client-java) for details. The core code is as follows:
+![snippet](images/producer_code.png)
+
+As you can see, the producer sends two messages to the topic 'test_openmldb'. Afterward, Pulsar will read the messages and write them to the online storage of the OpenMLDB cluster.
+
+The program package is located in the 'files' directory, and you can run it directly:
+
+```
+java -cp files/pulsar-client-java-1.0-SNAPSHOT-jar-with-dependencies.jar org.example.Client
+```
+
+#### Python Producer
+
+The producer can also be implemented in Python, as detailed in `files/pulsar_client.py`. Before running it, you need to install the Pulsar Python client:
+
+```
+pip3 install pulsar-client==2.9.1
+```
+
+Run:
+
+```
+python3 files/pulsar_client.py
+```
+
+```{note}
+Known issue: if a Long value in the message is relatively small, the Pulsar Function sink may read it from the JsonRecord as a Java Integer during the JDBC bindValue phase, so [setColumnValue](https://github.com/apache/pulsar/blob/82237d3684fe506bcb6426b3b23f413422e6e4fb/pulsar-io/jdbc/core/src/main/java/org/apache/pulsar/io/jdbc/BaseJdbcAutoSchemaSink.java#L170) calls `statement.setInt` instead of `setLong` for the Long type column. As a result, an SQLException with `data type not match` is raised.
+
+If you want to see which specific column has the data type issue, please enable the function's debug-level logs. The method is described in [Debugging](#debugging).
+```
+
+### Check
+
+#### Check in Pulsar
+
+We can check the sink status in Pulsar:
+
+```
+./bin/pulsar-admin sinks status --name openmldb-test-sink
+```
+
+![sink status](images/sink_status.png)
+
+```{note}
+"numReadFromPulsar": Pulsar sent 2 messages to the sink instance.
+
+"numWrittenToSink": the sink instance wrote 2 messages to OpenMLDB.
+```
+
+#### Check in OpenMLDB
+
+We can query these message data in OpenMLDB online storage.
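+
+For a quick check, you can simply count the rows that have arrived. This is a minimal sketch; assuming both test messages above were produced successfully, the count should be 2:
+
+```
+set @@execute_mode='online';
+use pulsar_test;
+select count(*) from connector_test;
+```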
+
+The content of the query script `select.sql` is as follows:
+
+```
+set @@execute_mode='online';
+use pulsar_test;
+select *, string(timestamp(pickup_datetime)), string(timestamp(dropoff_datetime)) from connector_test;
+```
+
+Execute the script in the OpenMLDB container:
+
+```
+/work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client < /work/pulsar_files/select.sql
+```
+
+![openmldb result](images/openmldb_result.png)
+
+
+### Debugging
+
+If OpenMLDB has no data even though the sink status looks normal, the sink is probably failing to write. Check the sink log at `logs/functions/public/default/openmldb-test-sink/openmldb-test-sink-0.log`. If you used a different sink name, locate the corresponding log file.
+
+Pulsar keeps retrying messages that could not be written, so if an erroneous message was sent earlier, its error will keep appearing in the log even after new messages are written successfully. During testing, it is recommended to truncate the topic and test again:
+
+```
+./bin/pulsar-admin topics truncate persistent://public/default/test_openmldb
+```
+
+If you used a custom topic name, you can query the topic's full name with `./bin/pulsar-admin topics list public/default`.
+
+#### Debug Log
+
+If the sink log does not provide enough information to locate the issue, you can enable the debug log. Edit `conf/functions_log4j2.xml` (e.g. with vim), make the following modifications, and then restart the sink:
+
+```xml
+        <Property>
+            <name>pulsar.log.level</name>
+            <value>debug</value>
+        </Property>
+```
+
+```xml
+        <Root>
+            <level>${sys:pulsar.log.level}</level>
+            <AppenderRef>
+                <ref>${sys:pulsar.log.appender}</ref>
+                <level>${sys:pulsar.log.level}</level>
+            </AppenderRef>
+        </Root>
+```
+
+Then restart the sink:
+
+```
+./bin/pulsar-admin sinks restart --name openmldb-test-sink
+```
+
+#### Restart Pulsar
+
+```
+bin/pulsar-daemon stop standalone --zookeeper-port 5181
+rm -r data logs
+bin/pulsar-daemon start standalone --zookeeper-port 5181
+```
diff --git a/docs/en/maintain/monitoring.md b/docs/en/maintain/monitoring.md
index 356dd08eb2a..f20af17304c 100644
--- a/docs/en/maintain/monitoring.md
+++ b/docs/en/maintain/monitoring.md
@@ -25,6 +25,13 @@ The OpenMLDB exporter is a Prometheus exporter implemented in Python. The core c
 - Python >= 3.8
 - OpenMLDB >= 0.5.0
 
+### Compatibility
+
+| [OpenMLDB Exporter version](https://pypi.org/project/openmldb-exporter/) | [OpenMLDB supported version](https://github.com/4paradigm/OpenMLDB/releases) | [Grafana Dashboard revision](https://grafana.com/grafana/dashboards/17843-openmldb-dashboard/?tab=revisions) | Explanation |
+| ---- | ---- | ---- | ------- |
+| >= 0.9.0 | >= 0.8.4 | >=4 | OpenMLDB removed deploy response time from the database since 0.8.4 |
+| < 0.9.0 | >= 0.5.0, < 0.8.4 | 3 | |
+
 ### Preparation
 
 1.
Get OpenMLDB diff --git a/docs/en/reference/sql/data_types/date_and_time_types.md b/docs/en/openmldb_sql/data_types/date_and_time_types.md similarity index 100% rename from docs/en/reference/sql/data_types/date_and_time_types.md rename to docs/en/openmldb_sql/data_types/date_and_time_types.md diff --git a/docs/en/reference/sql/data_types/index.rst b/docs/en/openmldb_sql/data_types/index.rst similarity index 100% rename from docs/en/reference/sql/data_types/index.rst rename to docs/en/openmldb_sql/data_types/index.rst diff --git a/docs/en/reference/sql/data_types/numeric_types.md b/docs/en/openmldb_sql/data_types/numeric_types.md similarity index 100% rename from docs/en/reference/sql/data_types/numeric_types.md rename to docs/en/openmldb_sql/data_types/numeric_types.md diff --git a/docs/en/reference/sql/data_types/string_types.md b/docs/en/openmldb_sql/data_types/string_types.md similarity index 62% rename from docs/en/reference/sql/data_types/string_types.md rename to docs/en/openmldb_sql/data_types/string_types.md index 328bd22ad34..4f556a9ec76 100644 --- a/docs/en/reference/sql/data_types/string_types.md +++ b/docs/en/openmldb_sql/data_types/string_types.md @@ -7,7 +7,7 @@ A string is a sequence of bytes or characters, enclosed within either single quo "another string" ``` -| type | size | use | -| :----- | :--- | :--------- | -| STRING | 2M | 变长字符串 | +| type | size | use | +| :----- | :--- | :--------------------- | +| STRING | 2M | Variable-length character strings | diff --git a/docs/en/openmldb_sql/dml/LOAD_DATA_STATEMENT.md b/docs/en/openmldb_sql/dml/LOAD_DATA_STATEMENT.md index e10a91e98be..d8998fbf3ce 100644 --- a/docs/en/openmldb_sql/dml/LOAD_DATA_STATEMENT.md +++ b/docs/en/openmldb_sql/dml/LOAD_DATA_STATEMENT.md @@ -65,6 +65,7 @@ The following table introduces the parameters of `LOAD DATA INFILE`. - As metioned in the above table, online execution mode only supports append input mode. - When `deep_copy=false`, OpenMLDB doesn't support to modify the data in the soft link. Therefore, if the current offline data comes from a soft link, `append` import is no longer supported. Moreover, if current connection is soft copy, using the hard copy with `overwrite` will not delete the data of the soft connection. 
+- If the `insert_memory_usage_limit` session variable is set, a failure will be returned if the server memory usage exceeds the set value during online import ``` diff --git a/docs/en/reference/sql/dql/GROUP_BY_CLAUSE.md b/docs/en/openmldb_sql/dql/GROUP_BY_CLAUSE.md similarity index 100% rename from docs/en/reference/sql/dql/GROUP_BY_CLAUSE.md rename to docs/en/openmldb_sql/dql/GROUP_BY_CLAUSE.md diff --git a/docs/en/reference/sql/dql/HAVING_CLAUSE.md b/docs/en/openmldb_sql/dql/HAVING_CLAUSE.md similarity index 100% rename from docs/en/reference/sql/dql/HAVING_CLAUSE.md rename to docs/en/openmldb_sql/dql/HAVING_CLAUSE.md diff --git a/docs/en/reference/sql/dql/JOIN_CLAUSE.md b/docs/en/openmldb_sql/dql/JOIN_CLAUSE.md similarity index 100% rename from docs/en/reference/sql/dql/JOIN_CLAUSE.md rename to docs/en/openmldb_sql/dql/JOIN_CLAUSE.md diff --git a/docs/en/reference/sql/dql/LIMIT_CLAUSE.md b/docs/en/openmldb_sql/dql/LIMIT_CLAUSE.md similarity index 100% rename from docs/en/reference/sql/dql/LIMIT_CLAUSE.md rename to docs/en/openmldb_sql/dql/LIMIT_CLAUSE.md diff --git a/docs/en/reference/sql/dql/NO_TABLE_SELECT_CLAUSE.md b/docs/en/openmldb_sql/dql/NO_TABLE_SELECT_CLAUSE.md similarity index 100% rename from docs/en/reference/sql/dql/NO_TABLE_SELECT_CLAUSE.md rename to docs/en/openmldb_sql/dql/NO_TABLE_SELECT_CLAUSE.md diff --git a/docs/en/reference/sql/dql/SELECT_INTO_STATEMENT.md b/docs/en/openmldb_sql/dql/SELECT_INTO_STATEMENT.md similarity index 100% rename from docs/en/reference/sql/dql/SELECT_INTO_STATEMENT.md rename to docs/en/openmldb_sql/dql/SELECT_INTO_STATEMENT.md diff --git a/docs/en/reference/sql/dql/SELECT_STATEMENT.md b/docs/en/openmldb_sql/dql/SELECT_STATEMENT.md similarity index 91% rename from docs/en/reference/sql/dql/SELECT_STATEMENT.md rename to docs/en/openmldb_sql/dql/SELECT_STATEMENT.md index 01a7180c914..534f71f7280 100644 --- a/docs/en/reference/sql/dql/SELECT_STATEMENT.md +++ b/docs/en/openmldb_sql/dql/SELECT_STATEMENT.md @@ -138,7 +138,7 @@ TableAsName ```{warning} The `SELECT` running in online mode or the stand-alone version may not obtain complete data. -Because a query may perform a large number of scans on multiple tablets, for stability, the largest number of bytes to scan is limited, namely `scan_max_bytes_size`. +The largest number of bytes to scan is limited, namely `scan_max_bytes_size`, default value is unlimited. But if you set the value of `scan_max_bytes_size` to a specific value, the `SELECT` statement will only scan the data within the specified size. If the select results are truncated, the message of `reach the max byte ...` will be recorded in the tablet's log, but there will be no error. -If the select results are truncated, the message of `reach the max byte ...` will be recorded in the tablet's log, but there will be no error. +Even if the `scan_max_bytes_size` is set to unlimited, the `SELECT` statement may failed, e.g. client errors `body_size=xxx from xx:xxxx is too large`, ` Fail to parse response from xx:xxxx by baidu_std at client-side`. We don't recommend to use `SELECT` in online mode or the stand-alone version. If you want to get the count of the online table, please use `SELECT COUNT(*) FROM table_name;`. 
``` diff --git a/docs/en/reference/sql/dql/WHERE_CLAUSE.md b/docs/en/openmldb_sql/dql/WHERE_CLAUSE.md similarity index 100% rename from docs/en/reference/sql/dql/WHERE_CLAUSE.md rename to docs/en/openmldb_sql/dql/WHERE_CLAUSE.md diff --git a/docs/en/reference/sql/dql/WINDOW_CLAUSE.md b/docs/en/openmldb_sql/dql/WINDOW_CLAUSE.md similarity index 100% rename from docs/en/reference/sql/dql/WINDOW_CLAUSE.md rename to docs/en/openmldb_sql/dql/WINDOW_CLAUSE.md diff --git a/docs/en/reference/sql/dql/WITH_CLAUSE.md b/docs/en/openmldb_sql/dql/WITH_CLAUSE.md similarity index 100% rename from docs/en/reference/sql/dql/WITH_CLAUSE.md rename to docs/en/openmldb_sql/dql/WITH_CLAUSE.md diff --git a/docs/en/openmldb_sql/dql/images/dql_images.pptx b/docs/en/openmldb_sql/dql/images/dql_images.pptx new file mode 100644 index 00000000000..17e4a0c8dae Binary files /dev/null and b/docs/en/openmldb_sql/dql/images/dql_images.pptx differ diff --git a/docs/en/reference/sql/dql/images/last_join_with_order1.png b/docs/en/openmldb_sql/dql/images/last_join_with_order1.png similarity index 100% rename from docs/en/reference/sql/dql/images/last_join_with_order1.png rename to docs/en/openmldb_sql/dql/images/last_join_with_order1.png diff --git a/docs/en/reference/sql/dql/images/last_join_with_order2.png b/docs/en/openmldb_sql/dql/images/last_join_with_order2.png similarity index 100% rename from docs/en/reference/sql/dql/images/last_join_with_order2.png rename to docs/en/openmldb_sql/dql/images/last_join_with_order2.png diff --git a/docs/en/reference/sql/dql/images/last_join_without_order.png b/docs/en/openmldb_sql/dql/images/last_join_without_order.png similarity index 100% rename from docs/en/reference/sql/dql/images/last_join_without_order.png rename to docs/en/openmldb_sql/dql/images/last_join_without_order.png diff --git a/docs/en/reference/sql/dql/images/last_join_without_order2.png b/docs/en/openmldb_sql/dql/images/last_join_without_order2.png similarity index 100% rename from docs/en/reference/sql/dql/images/last_join_without_order2.png rename to docs/en/openmldb_sql/dql/images/last_join_without_order2.png diff --git a/docs/en/reference/sql/dql/images/window_exclude_current_row.png b/docs/en/openmldb_sql/dql/images/window_exclude_current_row.png similarity index 100% rename from docs/en/reference/sql/dql/images/window_exclude_current_row.png rename to docs/en/openmldb_sql/dql/images/window_exclude_current_row.png diff --git a/docs/en/reference/sql/dql/images/window_exclude_current_time.png b/docs/en/openmldb_sql/dql/images/window_exclude_current_time.png similarity index 100% rename from docs/en/reference/sql/dql/images/window_exclude_current_time.png rename to docs/en/openmldb_sql/dql/images/window_exclude_current_time.png diff --git a/docs/en/reference/sql/dql/images/window_frame_type.png b/docs/en/openmldb_sql/dql/images/window_frame_type.png similarity index 100% rename from docs/en/reference/sql/dql/images/window_frame_type.png rename to docs/en/openmldb_sql/dql/images/window_frame_type.png diff --git a/docs/en/reference/sql/dql/images/window_max_size.png b/docs/en/openmldb_sql/dql/images/window_max_size.png similarity index 100% rename from docs/en/reference/sql/dql/images/window_max_size.png rename to docs/en/openmldb_sql/dql/images/window_max_size.png diff --git a/docs/en/reference/sql/dql/images/window_union_1_table.png b/docs/en/openmldb_sql/dql/images/window_union_1_table.png similarity index 100% rename from docs/en/reference/sql/dql/images/window_union_1_table.png rename to 
docs/en/openmldb_sql/dql/images/window_union_1_table.png diff --git a/docs/en/reference/sql/dql/images/window_union_1_table_instance_not_in_window.png b/docs/en/openmldb_sql/dql/images/window_union_1_table_instance_not_in_window.png similarity index 100% rename from docs/en/reference/sql/dql/images/window_union_1_table_instance_not_in_window.png rename to docs/en/openmldb_sql/dql/images/window_union_1_table_instance_not_in_window.png diff --git a/docs/en/reference/sql/dql/images/window_union_2_table.png b/docs/en/openmldb_sql/dql/images/window_union_2_table.png similarity index 100% rename from docs/en/reference/sql/dql/images/window_union_2_table.png rename to docs/en/openmldb_sql/dql/images/window_union_2_table.png diff --git a/docs/en/reference/sql/dql/index.rst b/docs/en/openmldb_sql/dql/index.rst similarity index 100% rename from docs/en/reference/sql/dql/index.rst rename to docs/en/openmldb_sql/dql/index.rst diff --git a/docs/en/openmldb_sql/functions_and_operators/index.rst b/docs/en/openmldb_sql/functions_and_operators/index.rst new file mode 100644 index 00000000000..b889a6e8a87 --- /dev/null +++ b/docs/en/openmldb_sql/functions_and_operators/index.rst @@ -0,0 +1,10 @@ +============================= +Expressions, Functions, and Operations +============================= + + +.. toctree:: + :maxdepth: 1 + + operators + Files/udfs_8h diff --git a/docs/en/openmldb_sql/functions_and_operators/operators.md b/docs/en/openmldb_sql/functions_and_operators/operators.md new file mode 100644 index 00000000000..dcdc12b8107 --- /dev/null +++ b/docs/en/openmldb_sql/functions_and_operators/operators.md @@ -0,0 +1,116 @@ +# Operator + +## Operator Precedence + +```yacc +%left "OR" +%left "AND" +%left "XOR" +%left UNARY_NOT_PRECEDENCE // (NOT / !) +%nonassoc "=" "==" "<>" ">" "<" ">=" "<=" "!=" "LIKE" "ILIKE" "RLIKE" "IN" "DISTINCT" "BETWEEN" "IS" "NOT_SPECIAL" +%nonassoc "ESCAPE" +%left "|" +%left "^" +%left "&" +%left "<<" ">>" +%left "+" "-" +%left "||" +%left "*" "/" "DIV" "%" "MOD" +%left UNARY_PRECEDENCE // For all unary operators, +, -, ~ +``` + +## Various Operations + +### 1. Comparison Operation + +| operator name | function description | +| :-------------- | :-------------------------- | +| `>` | Greater than | +| `>=` | Greater than or equal to | +| `<` | Less than | +| `<=` | Less than or equal to | +| `!=`, `<>` | Not equal to | +| `=`, `==` | Equal | +| `BEWTEEN...AND` | Between left and right | +| `IN` | In the collection | +| `LIKE` | Comparison, case sensitive | +| `ILIKE` | Comparison, case insensitive | +| `RLIKE` | Regular expression comparison | + +### 2. Logic Operation + +| operator name | function description | +| :---------- | :------- | +| `AND` | Logical and | +| `OR` | Logical or | +| `XOR` | Logical xor | +| `NOT`, `!` | Logical not, unary operator | + +### 3. Arithmetic Operations + +| operator name | function description | +| :--------- | :-------------------------------------------------- | +| `%`, `MOD` | Modulo | +| `*` | Multiplication | +| `+` | Addition | +| `-` | Subtraction | +| `/` | Float division | +| `DIV` | Integer division | +| `+` | Unary plus | +| `-` | Unary minus, support only `-number` | + +### 4. Bit Operation + +| operator name | Description | +| :------- | :---------- | +| `&` | Bitwise AND | +| `\|` | Bitwise OR | +| `^` | Bitwise XOR | +| `~` | Bitwise NOT, unary operator | + +### 5. 
Type Operations and Functions + +| operator name | Description | +| :------------- | :--------------------------------------------------------- | +| `CAST` | ```CAST expr AS dist_type```,cast expression `expr` to target type | +| `bool` | `bool(expr)`,convert expression to BOOL type | +| `smallint` | `smallint(expr)`,convert expression to SMALLINT type | +| `int` | `int(expr)`,convert expression to INT type | +| `bigint` | `bigint(expr)`,convert expression to type BIGINT | +| `string(expr)` | `string(expr),convert expression to type STRING` | + +**Conversion Compatibility Between Types** + +Safe: Indicates that the conversion from the original type to the target type is safe without loss of precision and no computation exceptions. For example, converting from int to bigint is unsafe: + +```sql +SELECT BIGINT(12345); +-- 12345 +``` + +Unsafe: Indicates that the conversion from the original type to the target type is unsafe, and the precision may be lost or an exception may occur after data conversion. + +```sql +SELECT INT(1.2); +-- output 1 +``` + +X:Indicates that a conversion from the original type to the target type is not supported + +| src\|dist | bool | smallint | int | float | int64 | double | timestamp | date | string | +| :------------ | :----- | :------- | :----- | :----- | :----- | :----- | :-------- | :----- | :----- | +| **bool** | Safe | Safe | Safe | Safe | Safe | Safe | UnSafe | X | Safe | +| **smallint** | UnSafe | Safe | Safe | Safe | Safe | Safe | UnSafe | X | Safe | +| **int** | UnSafe | UnSafe | Safe | Safe | Safe | Safe | UnSafe | X | Safe | +| **float** | UnSafe | UnSafe | UnSafe | Safe | Safe | Safe | UnSafe | X | Safe | +| **bigint** | UnSafe | UnSafe | UnSafe | UnSafe | Safe | UnSafe | UnSafe | X | Safe | +| **double** | UnSafe | UnSafe | UnSafe | UnSafe | UnSafe | Safe | UnSafe | X | Safe | +| **timestamp** | UnSafe | UnSafe | UnSafe | UnSafe | Safe | UnSafe | Safe | UnSafe | Safe | +| **date** | UnSafe | X | X | X | X | X | UnSafe | Safe | Safe | +| **string** | UnSafe | UnSafe | UnSafe | UnSafe | UnSafe | UnSafe | UnSafe | UnSafe | Safe | + +## assignment operator + +| operator name | function description | +| :------- | :------------------------ | +| `=` | Assignment (can be used in SET statement) | diff --git a/docs/en/openmldb_sql/index.rst b/docs/en/openmldb_sql/index.rst new file mode 100644 index 00000000000..6380c333c9d --- /dev/null +++ b/docs/en/openmldb_sql/index.rst @@ -0,0 +1,18 @@ +============================= +OpenMLDB SQL +============================= + + +.. toctree:: + :maxdepth: 1 + + sql_difference + language_structure/index + data_types/index + functions_and_operators/index + dql/index + dml/index + ddl/index + deployment_manage/index + task_manage/index + udf_develop_guide diff --git a/docs/en/openmldb_sql/sql_difference.md b/docs/en/openmldb_sql/sql_difference.md new file mode 100644 index 00000000000..feee7e6a9c4 --- /dev/null +++ b/docs/en/openmldb_sql/sql_difference.md @@ -0,0 +1,266 @@ +# Main Differences from Standard SQL + +This article provides a comparison between the main usage of OpenMLDB SQL (SELECT query statements) and standard SQL (using MySQL-supported syntax as an example). It aims to help developers with SQL experience quickly adapt to OpenMLDB SQL. 
+
+Unless otherwise specified, the descriptions below assume OpenMLDB >= v0.7.1.
+
+## Support Overview
+
+The table below summarizes the differences in support between OpenMLDB SQL and standard SQL for each SELECT statement element across the three execution modes (for execution mode details, please refer to [Workflow and Execution Modes](../quickstart/concepts/modes.md)). OpenMLDB SQL is currently partially compatible with standard SQL, with additional syntax introduced to accommodate specific business scenarios. New syntax is indicated in bold in the table.
+
+Note: ✓ indicates that the statement is supported, while ✕ indicates that it is not.
+
+| | **OpenMLDB SQL**<br/>**Offline Mode** | **OpenMLDB SQL**<br/>**Online Preview Mode** | **OpenMLDB SQL**<br/>
**Online Request Mode** | **Standard SQL** | **Remarks** | +| -------------- | ---------------------------- | -------------------------------- | -------------------------------- | ------------ | ------------------------------------------------------------ | +| WHERE Clause | ✓ | ✓ | ✕ | ✓ | Some functionalities can be achieved through built-in functions with the `_where` suffix. | +| HAVING Clause | ✓ | ✓ | X | ✓ | | +| JOIN Clause | ✓ | ✕ | ✓ | ✓ | OpenMLDB only supports **LAST JOIN** and **LEFT JOIN**. | +| GROUP BY | ✓ | ✕ | ✕ | ✓ | | +| ORDER BY | ✓ | ✓ | ✓ | ✓ | Support is limited to usage within the `WINDOW` and `LAST JOIN` clauses; it does not support reverse sorting in `DESC`. | +| LIMIT | ✓ | ✓ | ✕ | ✓ | | +| WINDOW Clause | ✓ | ✓ | ✓ | ✓ | OpenMLDB includes new syntax **WINDOW UNION** and **WINDOW ATTRIBUTES**. | +| WITH Clause | ✕ | ✕ | ✕ | ✓ | OpenMLDB supports begins from version v0.8.0. | +| Aggregate Function | ✓ | ✓ | ✓ | ✓ | OpenMLDB has more extension functions. | + + + +## Explanation of Differences + +### Difference Dimension + +Compared to standard SQL, the differences in OpenMLDB SQL can be explained from three main perspectives: + +1. **Execution Mode**: OpenMLDB SQL has varying support for different SQL statements in three distinct execution modes: offline mode, online preview mode, and online request mode. The choice of execution mode depends on specific requirements. In general, for real-time computations in SQL, business SQL must adhere to the constraints of the online request mode. +2. **Clause Combinations**: The combination of different clauses can introduce additional limitations. In these scenarios, one clause operates on the result set of another clause. For example, when LIMIT is applied to WHERE, the SQL would resemble `SELECT * FROM (SELECT * FROM t1 WHERE id >= 2) LIMIT 2`. The term 'table reference' used here refers to `FROM TableRef`, which does not represent a subquery or a complex FROM clause involving JOIN or UNION. +3. **Special Restrictions**: Unique restrictions that do not fit the previous categories are explained separately. These restrictions are usually due to incomplete functionality or known program issues. + +### Configuration of Scanning Limits + +To prevent user errors from affecting online performance, OpenMLDB has introduced relevant parameters that limit the number of full table scans in offline mode and online preview mode. If these limitations are enabled, certain operations involving scans of multiple records (such as SELECT *, aggregation operations, etc.) may result in truncated results and, consequently, incorrect outcomes. It's essential to note that these parameters do not affect the accuracy of results in online request mode. + +The configuration of these parameters is done within the tablet configuration file `conf/tablet.flags`, as detailed in the document on [Configuration File](../deploy/conf.md#the-configuration-file-for-tablet-conftabletflags). The parameters affecting scan limits include: + +- Maximum Number of Scans: `--max_traverse_cnt` +- Maximum Number of Scanned Keys: `--max_traverse_pk_cnt` +- Size Limit for Returned Results: `--scan_max_bytes_size` + +In versions from v0.7.3 onwards, it's expected that the default values for these parameters will be set to 0, implying there will be no related restrictions. Users of earlier versions should take note of the parameter settings. 
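+
+As a rough illustration of the kind of statement these limits affect (the table name `t1` here is hypothetical), a full-table preview query may be silently truncated once a limit is reached, while a plain count is the recommended way to check the size of an online table:
+
+```SQL
+-- May be truncated in online preview mode when the scan limits above are enabled
+SELECT * FROM t1;
+-- Recommended way to check how many rows an online table holds
+SELECT COUNT(*) FROM t1;
+```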
+ +### WHERE Clause + +| **Apply To** | **Offline Mode** | **Online Preview Mode** | **Online Request Mode** | +| ------------------ | ------------ | ---------------- | ---------------- | +| Table References | ✓ | ✓ | ✕ | +| LAST JOIN | ✓ | ✓ | ✕ | +| Subquery/ WITH Clause | ✓ | ✓ | ✕ | + +In the online request mode, the `WHERE` clause isn't supported. However, some functionalities can be achieved through computation functions with the `_where` suffix, like `count_where` and `avg_where`, among others. For detailed information, please refer to [Built-In Functions](./udfs_8h.md). + +### LIMIT Clause + +LIMIT is followed by an INT literal, and it does not support other expressions. It indicates the maximum number of rows for returned data. However, LIMIT is not supported in the online mode. + +| **Apply to** | **Offline Mode** | **Online Preview Mode** | **Online Request Mode** | +| ----------------- | ---------------- | ----------------------- | ----------------------- | +| Table Reference | ✓ | ✓ | ✕ | +| WHERE | ✓ | ✓ | ✕ | +| WINDOW | ✓ | ✓ | ✕ | +| LAST JOIN | ✓ | ✓ | ✕ | +| GROUP BY & HAVING | ✕ | ✓ | ✕ | + +### WINDOW Clause + +The WINDOW clause and the GROUP BY & HAVING clause cannot be used simultaneously. When transitioning to the online mode, the input table for the WINDOW clause must be either a physical table or a simple column filtering, along with LAST JOIN concatenation of the physical table. Simple column filtering entails a select list containing only column references or renaming columns, without additional expressions. You can refer to the table below for specific support scenarios. If a scenario is not listed, it means that it's not supported. + +| **Apply to** | **Offline Mode** | **Online Preview Mode** | **Online Request Mode** | +| ------------------------------------------------------------ | ---------------- | ----------------------- | ----------------------- | +| Table Reference | ✓ | ✓ | ✓ | +| GROUP BY & HAVING | ✕ | ✕ | ✕ | +| LAST JOIN | ✓ | ✓ | ✓ | +| Subqueries are only allowed under these conditions:
1. Simple column filtering from a single table
2. Multi-table LAST JOIN
3. Simple column filtering after a dual-table LAST JOIN
| ✓ | ✓ | ✓ | + +Special Restrictions: + +- In online request mode, the input for WINDOW can be a LAST JOIN or a LAST JOIN within a subquery. It's important to note that the columns for `PARTITION BY` and `ORDER BY` in the window definition must all originate from the leftmost table of the JOIN. + +### GROUP BY & HAVING Clause + +The GROUP BY statement is still considered an experimental feature and only supports a physical table as the input table. It's not supported in other scenarios. GROUP BY is also not available in the online mode. + +| **Apply to** | **Offline Mode** | **Online Preview Mode** | **Online Request Mode** | +| --------------- | ---------------- | ----------------------- | ----------------------- | +| Table Reference | ✓ | ✓ | ✕ | +| WHERE | ✕ | ✕ | ✕ | +| LAST JOIN | ✕ | ✕ | ✕ | +| Subquery | ✕ | ✕ | ✕ | + +### JOIN Clause + +OpenMLDB exclusively supports the LAST JOIN and LEFT JOIN syntax. For a detailed description, please refer to the section on JOIN in the extended syntax. A JOIN consists of two inputs, the left and right. In the online request mode, it supports two inputs as physical tables or specific subqueries. You can refer to the table for specific details. If a scenario is not listed, it means it's not supported. + +| **Apply to** | **Offline Mode** | **Online Preview Mode** | **Online Request Mode** | +| ---------------------------------------------- | ------------ | ---------------- | ---------------- | +| LAST JOIN + two table reference | ✓ | ✕ | ✓ | +| LAST JOIN + simple column filtering for both tables| ✓ | ✕ | ✓ | +| LAST JOIN + left table is filtering with WHERE | ✓ | ✕ | ✓ | +| LAST JOIN one of the table is WINDOW or LAST JOIN | ✓ | ✕ | ✓ | +| LAST JOIN + right table is LEFT JOIN subquery | ✕ | ✕ | ✓ | +| LEFT JOIN | ✕ | ✕ | ✕ | + +Special Restrictions: +- Launching LAST JOIN for specific subqueries involves additional requirements. For more information, please refer to [Online Requirements](../openmldb_sql/deployment_manage/ONLINE_REQUEST_REQUIREMENTS.md#specifications-of-last-join-under-online-request-mode). +- LAST JOIN and LEFT JOIN is currently not supported in online preview mode. + +### WITH Clause + +OpenMLDB (>= v0.7.2) supports non-recursive WITH clauses. The WITH clause functions equivalently to how other clauses work when applied to subqueries. To understand how the WITH statement is supported, please refer to its corresponding subquery writing methods as explained in the table above. + +No special restrictions apply in this case. + +### ORDER BY Keyword + +The sorting keyword `ORDER BY` is only supported within the `WINDOW` and `LAST JOIN` clauses in the window definition, and the reverse sorting keyword `DESC` is not supported. Detailed guidance on these clauses can be found in the WINDOW and LAST JOIN sections. + +### Aggregate Function + +Aggregation functions can be applied to all tables or windows. Window aggregation queries are supported in all three modes. Full table aggregation queries are only supported in online preview mode and are not available in offline and online request modes. + +- Regarding full table aggregation, OpenMLDB v0.6.0 began supporting this feature in online preview mode. However, it's essential to pay attention to the described [Scanning Limit Configuration](https://openmldb.feishu.cn/wiki/wikcnhBl4NsKcAX6BO9NDtKAxDf#doxcnLWICKzccMuPiWwdpVjSaIe). + +- OpenMLDB offers various extensions for aggregation functions. 
To find the specific functions supported, please consult the product documentation in [OpenMLDB Built-In Function](../openmldb_sql/udfs_8h.md). + +## Extended Syntax + +OpenMLDB has focused on deep customization of the `WINDOW` and `LAST JOIN` statements and this section will provide an in-depth explanation of these two statements. + +### WINDOW Clause + +A typical WINDOW statement in OpenMLDB generally includes the following elements: + +- Data Definition: Defines the data within the window using `PARTITION BY`. +- Data Sorting: Defines the data sorting within the window using `ORDER BY`. +- Scope Definition: Determines the direction of time extension through `PRECEDING`, `CURRENT ROW`, and `UNBOUNDED`. +- Range Unit: Utilizes `ROWS` and `ROWS_RANGE` to specify the unit of window sliding range. +- Window Attributes: Includes OpenMLDB-specific window attribute definitions, such as `MAXSIZE`, `EXCLUDE CURRENT_ROW`, `EXCLUDE CURRENT_TIME`, and `INSTANCE_NOT_IN_WINDOW`. +- Multi-table Definition: Uses the extended syntax `WINDOW ... UNION` to determine whether concatenation of cross-table data sources is required. + +For a detailed syntax of the WINDOW statement, please refer to the [WINDOW Documentation](../openmldb_sql/dql/WINDOW_CLAUSE.md) + +| **Statement Element** | **Support Syntax** | **Description** | Required? | +| ---------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | --------- | +| Data Definition | PARTITION BY | OpenMLDB supports multiple column data types: bool, int16, int32, int64, string, date, timestamp. | ✓ | +| Data Sorting | ORDER BY | - It only supports sorting on a single column.
- Supported data types for sorting include int16, int32, int64, and timestamp.
- Reverse order (`DESC`) is not supported.
- Must specify for versions before v0.8.4 | - | +| Scope Definition | Basic upper and lower bounds definition: ROWS/ROWS_RANGE BETWEEN ... AND ... Scope definition is supported with keywords PRECEDING, OPEN PRECEDING, CURRENT ROW, UNBOUNDED | - Must specify both upper and lower boundaries.
- The boundary keyword `FOLLOWING` is not supported.
- In online request mode, `CURRENT ROW` represents the present request line. From a table perspective, the current row is virtually inserted into the appropriate position in the table based on the `ORDER BY` criteria. | ✓ | +| Scope Unit | ROWS
ROWS_RANGE (Extended) | - ROW_RANGE is an extended syntax for defining window boundaries similar to standard SQL RANGE-type windows. It allows defining window boundaries with either numerical values or values with time units. This is an extended syntax.
- Window ranges defined in time units are equivalent to window definitions where time is converted into milliseconds. For example, `ROWS_RANGE 10s PRECEDING ...` and `ROWS_RANGE 10000 PRECEDING...` are equivalent. | ✓ | +| Window Properties (Extended) | MAXSIZE
EXCLUDE CURRENT_ROW
EXCLUDE CURRENT_TIME
INSTANCE_NOT_IN_WINDOW | MAXSIZE is only valid to ROWS_RANGE Without ORDER BY and EXCLUDE CURRENT_TIME cannot be used together | - | +| Multi Table Definition (Extension) | In practical use, the syntax form is relatively complex. Please refer to:
[Cross Table Feature Development Tutorial](../tutorial/tutorial_sql_2.md)
[WINDOW UNION Syntax Documentation](../openmldb_sql/dql/WINDOW_CLAUSE.md#1-window--union) | - Merging of multiple tables is allowed
- Union of simple subqueries is allowed
- It is commonly used in combination with aggregation functions for cross-table aggregation operations. | - | +| Incognito Window | - | Complete window definition must include `PARTITION BY`, `ORDER BY`, and window range definition. | - | + +#### Special Restrictions + +In online preview mode or offline mode, there are certain known issues when using LIMIT or WHERE clauses as inputs to the WINDOW clause, and it's generally not recommended. + +#### Example of Window Definition + +Define a `ROWS` type window with a range from the first 1000 rows to the current row. + +```SQL +SELECT + sum(col2) OVER w1 as w1_col2_sum +FROM + t1WINDOW w1 AS ( + PARTITION BY col1 + ORDER BY + col5 ROWS BETWEEN 1000 PRECEDING + AND CURRENT ROW + ); +``` + +Define a `ROWS_RANGE` type window with a range covering all rows in the first 10 seconds of the current row, including the current row. + +```SQL +SELECT + sum(col2) OVER w1 as w1_col2_sum +FROM + t1WINDOW w1 AS ( + PARTITION BY col1 + ORDER BY + col5 ROWS_RANGE BETWEEN 10s PRECEDING + AND CURRENT ROW + ); +``` + +Define a `ROWS` type window with a range from the first 1000 rows to the current row, containing only the current row and no other data at the current time. + +```SQL +SELECT + sum(col2) OVER w1 as w1_col2_sum +FROM + t1 WINDOW w1 AS ( + PARTITION BY col1 + ORDER BY + col5 ROWS BETWEEN 1000 PRECEDING + AND CURRENT ROW EXCLUDE CURRENT_TIME + ); +``` + +Define a `ROWS_RANGE` type window with a range from the current time to the past 10 seconds, excluding the current request line. + +```SQL +SELECT + sum(col2) OVER w1 as w1_col2_sum +FROM + t1 WINDOW w1 AS ( + PARTITION BY col1 + ORDER BY + col5 ROWS_RANGE BETWEEN 10s PRECEDING + AND CURRENT ROW EXCLUDE CURRENT_ROW + ); +``` + +Anonymous window: + +```SQL +SELECT + id, + pk1, + col1, + std_ts, + sum(col1) OVER ( + PARTITION BY pk1 + ORDER BY + std_ts ROWS BETWEEN 1 PRECEDING + AND CURRENT ROW + ) as w1_col1_sumfrom t1; +``` + +#### Example of WINDOW ... UNION + +In practical development, many applications store data in multiple tables. In such cases, the syntax `WINDOW ... UNION` is commonly used for cross-table aggregation operations. Please refer to the "Multi-Table Aggregation Features" section in the [Cross-Table Feature Development Tutorial](../tutorial/tutorial_sql_2.md). + +### LAST JOIN Clause + +For detailed syntax specifications for LAST JOIN, please refer to the [LAST JOIN Documentation](../openmldb_sql/dql/JOIN_CLAUSE.md#join-clause). + +| **Statement Element** | **Support Syntax** | **Description** | Required? | +| --------------------- | ------------------ | ------------------------------------------------------------ | --------- | +| ON | ✓ | Supported column types include: BOOL, INT16, INT32, INT64, STRING, DATE, TIMESTAMP. | ✓ | +| USING | X | - | - | +| ORDER BY | ✓ | - LAST JOIN extended syntax, not supported by LEFT JOIN.
- Only the following column types can be used: INT16, INT32, INT64, TIMESTAMP.
- The reverse order keyword DESC is not supported. | - | + +#### Example of LAST JOIN + +```SQL +SELECT + * +FROM + t1 +LAST JOIN t2 ON t1.col1 = t2.col1; + +SELECT + * +FROM + t1 +LEFT JOIN t2 ON t1.col1 = t2.col1; +``` + diff --git a/docs/en/openmldb_sql/udf_develop_guide.md b/docs/en/openmldb_sql/udf_develop_guide.md new file mode 100644 index 00000000000..1a2d73335a8 --- /dev/null +++ b/docs/en/openmldb_sql/udf_develop_guide.md @@ -0,0 +1,230 @@ +# UDF Development Guideline +## Background +Although OpenMLDB provides over a hundred built-in functions for data scientists to perform data analysis and feature extraction, there are scenarios where these functions might not fully meet the requirements. To facilitate users in quickly and flexibly implementing specific feature computation needs, we have introduced support for user-defined functions (UDFs) based on C++ development. Additionally, we enable the loading of dynamically generated user-defined function libraries. + +```{seealso} +Users can also extend OpenMLDB's computation function library using the method of developing built-in functions. However, developing built-in functions requires modifying the source code and recompiling. If users wish to contribute extended functions to the OpenMLDB codebase, they can refer to [Built-in Function Develop Guide](./built_in_function_develop_guide.md). +``` + +## Development Procedures +### Develop UDF functions +#### Naming Convention of C++ Built-in Function +- The naming of C++ built-in function should follow the [snake_case](https://en.wikipedia.org/wiki/Snake_case) style. +- The name should clearly express the function's purpose. +- The name of a function should not be the same as the name of a built-in function or other custom functions. The list of all built-in functions can be seen [here](../openmldb_sql/udfs_8h.md). + +#### C++ Type and SQL Type Correlation +The types of the built-in C++ functions' parameters should be BOOL, NUMBER, TIMESTAMP, DATE, or STRING. +The SQL types corresponding to C++ types are shown as follows: + +| SQL Type | C/C++ Type | +|:----------|:------------| +| BOOL | `bool` | +| SMALLINT | `int16_t` | +| INT | `int32_t` | +| BIGINT | `int64_t` | +| FLOAT | `float` | +| DOUBLE | `double` | +| STRING | `StringRef` | +| TIMESTAMP | `Timestamp` | +| DATE | `Date` | + + +#### Parameters and Return Values + +**Return Value**: + +* If the output type of the UDF is a basic type and `return_nullable` set to false, it will be processed as a return value. +* If the output type of the UDF is a basic type and `return_nullable` set to true, it will be processed as a function parameter. +* If the output type of the UDF is STRING, TIMESTAMP or DATE, it will return through the **last parameter** of the function. + +**Parameters**: + +* If the parameter is a basic type, it will be passed by value. +* If the output type of the UDF is STRING, TIMESTAMP or DATE, it will be passed by a pointer. +* The first parameter must be `UDFContext* ctx`. The definition of [UDFContext](../../../include/udf/openmldb_udf.h) is: + +```c++ + struct UDFContext { + ByteMemoryPool* pool; // Used for memory allocation. + void* ptr; // Used for the storage of temporary variables for aggregate functions. + }; +``` + +**Function Declaration**: + +* The functions must be declared by extern "C". + +#### Memory Management + +- In scalar functions, the use of 'new' and 'malloc' to allocate space for input and output parameters is not allowed. 
However, temporary space allocation using 'new' and 'malloc' is permissible within the function, and the allocated space must be freed before the function returns. + +- In aggregate functions, space allocation using 'new' or 'malloc' can be performed in the 'init' function but must be released in the 'output' function. The final return value, if it is a string, needs to be stored in the space allocated by mempool. + +- If dynamic memory allocation is required, OpenMLDB provides memory management interfaces. Upon function execution completion, OpenMLDB will automatically release the memory. +```c++ +char *buffer = ctx->pool->Alloc(size); +``` +- The maximum size allocated at once cannot exceed 2M. + +**Note**: +- If the parameters are declared as nullable, then all parameters are nullable, and each input parameter will have an additional `is_null` parameter. +- If the return value is declared as nullable, it will be returned through parameters, and an additional `is_null` parameter will indicate whether the return value is null. + +For instance, to declare a UDF scalar function, sum, which has two parameters, if the input and return value are nullable: +```c++ +extern "C" +void sum(::openmldb::base::UDFContext* ctx, int64_t input1, bool is_null, int64_t input2, bool is_null, int64_t* output, bool* is_null) { +``` +#### Scalar Function Implementation + +Scalar functions process individual data rows and return a single value, such as abs, sin, cos, date, year. +The process is as follows: +- The head file `udf/openmldb_udf.h` should be included. +- Develop the logic of the function. + +```c++ +#include "udf/openmldb_udf.h" // must include this header file + +// Develop a UDF that slices the first 2 characters of a given string. +extern "C" +void cut2(::openmldb::base::UDFContext* ctx, ::openmldb::base::StringRef* input, ::openmldb::base::StringRef* output) { + if (input == nullptr || output == nullptr) { + return; + } + uint32_t size = input->size_ <= 2 ? input->size_ : 2; + //use ctx->pool for memory allocation + char *buffer = ctx->pool->Alloc(size); + memcpy(buffer, input->data_, size); + output->size_ = size; + output->data_ = buffer; +} +``` + + +#### Aggregation Function Implementation + +Aggregate functions process a dataset (such as a column of data) and perform computations, returning a single value, such as sum, avg, max, min, count. +The process is as follows: +- The head file `udf/openmldb_udf.h` should be included. +- Develop the logic of the function. + +To develop an aggregate function, you need to implement the following three C++ methods: + +- init function: Perform initialization tasks such as allocating space for intermediate variables. Function naming format: 'aggregate_function_name_init'. + +- update function: Implement the logic for processing each row of the respective field in the update function. Function naming format: 'aggregate_function_name_update'. + +- output function: Process the final aggregated value and return the result. Function naming format: 'aggregate_function_name_output'." + +**Node**: Return `UDFContext*` as the return value in the init and update function. + +```c++ +#include "udf/openmldb_udf.h" //must include this header file +// implementation of aggregation function special_sum +extern "C" +::openmldb::base::UDFContext* special_sum_init(::openmldb::base::UDFContext* ctx) { + // allocate space for intermediate variables and assign to 'ptr' in UDFContext. 
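+    // Allocating from ctx->pool (rather than new/malloc) lets OpenMLDB release
+    // the memory automatically after execution; ctx->ptr is then read and
+    // updated in the update function and consumed in the output function.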
+    ctx->ptr = ctx->pool->Alloc(sizeof(int64_t));
+    // init the value
+    *(reinterpret_cast<int64_t*>(ctx->ptr)) = 10;
+    // return pointer of UDFContext, cannot be omitted
+    return ctx;
+}
+
+extern "C"
+::openmldb::base::UDFContext* special_sum_update(::openmldb::base::UDFContext* ctx, int64_t input) {
+    // get the value from ptr in UDFContext
+    int64_t cur = *(reinterpret_cast<int64_t*>(ctx->ptr));
+    cur += input;
+    *(reinterpret_cast<int64_t*>(ctx->ptr)) = cur;
+    // return the pointer of UDFContext, cannot be omitted
+    return ctx;
+}
+
+// get the aggregation result from ptr in UDFContext and return
+extern "C"
+int64_t special_sum_output(::openmldb::base::UDFContext* ctx) {
+    return *(reinterpret_cast<int64_t*>(ctx->ptr)) + 5;
+}
+
+```
+
+
+For more UDF implementations, see [here](../../../src/examples/test_udf.cc).
+
+
+### Compile Dynamic Library
+
+- Copy the `include` directory (`https://github.com/4paradigm/OpenMLDB/tree/main/include`) to a certain path (like `/work/OpenMLDB/`) for later compiling.
+- Run the compiling command. `-I` specifies the path of the `include` directory. `-o` specifies the name of the dynamic library.
+
+```shell
+g++ -shared -o libtest_udf.so examples/test_udf.cc -I /work/OpenMLDB/include -std=c++11 -fPIC
+```
+
+### Copy Dynamic Library
+The compiled dynamic libraries should be copied into the `udf` directories for both TaskManager and tablets. Please create a new `udf` directory if it does not exist.
+- The `udf` directory of a tablet is `path_to_tablet/udf`.
+- The `udf` directory of TaskManager is `path_to_taskmanager/taskmanager/bin/udf`.
+
+For example, if the deployment paths of a tablet and TaskManager are both `/work/openmldb`, the structure of the directory is shown below:
+
+```
+ /work/openmldb/
+ ├── bin
+ ├── conf
+ ├── taskmanager
+ │   ├── bin
+ │   │   ├── taskmanager.sh
+ │   │   └── udf
+ │   │       └── libtest_udf.so
+ │   ├── conf
+ │   └── lib
+ ├── tools
+ └── udf
+     └── libtest_udf.so
+```
+
+```{note}
+- For multiple tablets, the library needs to be copied to every tablet.
+- Dynamic libraries should not be deleted before the execution of `DROP FUNCTION`.
+```
+
+
+### Register, Drop and Show the Functions
+For registering, please use [CREATE FUNCTION](../openmldb_sql/ddl/CREATE_FUNCTION.md).
+
+Register a scalar function:
+```sql
+CREATE FUNCTION cut2(x STRING) RETURNS STRING OPTIONS (FILE='libtest_udf.so');
+```
+Register an aggregation function:
+```sql
+CREATE AGGREGATE FUNCTION special_sum(x BIGINT) RETURNS BIGINT OPTIONS (FILE='libtest_udf.so');
+```
+Register an aggregation function whose input value and return value support null:
+```sql
+CREATE AGGREGATE FUNCTION third(x BIGINT) RETURNS BIGINT OPTIONS (FILE='libtest_udf.so', ARG_NULLABLE=true, RETURN_NULLABLE=true);
+```
+
+**Note**:
+- The types of parameters and return values must be consistent with the implementation of the code.
+- `FILE` specifies the file name of the dynamic library. It is not necessary to include a path.
+- A UDF function can only work on one type. Please create multiple functions for multiple types.
+
+
+After successful registration, the function can be used.
+```sql
+SELECT cut2(c1) FROM t1;
+```
+
+You can view registered functions through `SHOW FUNCTIONS`.
+```sql
+SHOW FUNCTIONS;
+```
+
+Use `DROP FUNCTION` to delete a registered function.
+```sql +DROP FUNCTION cut2; +``` diff --git a/docs/en/quickstart/concepts/modes.md b/docs/en/quickstart/concepts/modes.md index d27f33ab001..0a66b02b0c2 100644 --- a/docs/en/quickstart/concepts/modes.md +++ b/docs/en/quickstart/concepts/modes.md @@ -59,7 +59,7 @@ The main features of the online preview mode are: - Online preview mode is mainly used for previewing limited data. Selecting and viewing data directly through SELECT in OpenMLDB CLI or SDKs may result in data truncation. If the data volume is large, it is recommended to use an [export tool](../../tutorial/data_export.html) to view the complete data. - SELECT statements in online preview mode currently do not support more complex queries such as `LAST JOIN` and `ORDER BY`. Refer to [SELECT](../../openmldb_sql/dql/SELECT_STATEMENT.html). - The server in the online preview mode executes SQL statements on a single thread. For large data processing, it may be slow and may trigger a timeout. To increase the timeout period, the `--request_timeout` can be configured on the client. -- To prevent impact on online services, online preview mode limits the maximum number of accessed records and the number of different keys. This can be configured using `--max_traverse_cnt` and `--max_traverse_key_cnt`. Similarly, the maximum result size can be set using `--scan_max_bytes_size`. For detailed configuration, refer to the [configuration file](../../deploy/conf.md). +- To prevent impact on online services, you can limit the maximum number of accessed records and the number of different keys in online preview mode. This can be configured using `--max_traverse_cnt` and `--max_traverse_key_cnt`. Similarly, the maximum result size can be set using `--scan_max_bytes_size`. For detailed configuration, refer to the [configuration file](../../deploy/conf.md). The command for setting online preview mode in OpenMLDB CLI: `SET @@execute_mode='online'` diff --git a/docs/en/reference/sql/ddl/SET_STATEMENT.md b/docs/en/reference/sql/ddl/SET_STATEMENT.md index 6c0e83de75a..25d03370eaf 100644 --- a/docs/en/reference/sql/ddl/SET_STATEMENT.md +++ b/docs/en/reference/sql/ddl/SET_STATEMENT.md @@ -35,6 +35,7 @@ The following format is also equivalent. | @@session.sync_job|@@sync_job | When the value is `true`, the offline command will be executed synchronously, waiting for the final result of the execution.
When the value is `false`, the offline command returns immediately. If you need to check the execution, please use `SHOW JOB` command. | `true`,
`false` | `false` | | @@session.sync_timeout|@@sync_timeout | When `sync_job=true`, you can configure the waiting time for synchronization commands. The timeout will return immediately. After the timeout returns, you can still view the command execution through `SHOW JOB`. | Int | 20000 | | @@session.spark_config|@@spark_config | Set the Spark configuration for offline jobs, configure like 'spark.executor.memory=2g;spark.executor.cores=2'. Notice that the priority of this Spark configuration is higer than TaskManager Spark configuration but lower than CLI Spark configuration file. | String | "" | +| @@session.insert_memory_usage_limit |@@insert_memory_usage_limit | Set server memory usage limit when inserting or importing data. If the server memory usage exceeds the set value, the insertion will fail. The value range is 0-100. 0 means unlimited | Int | "0" | ## Example diff --git a/docs/en/use_case/JD_recommendation_en.md b/docs/en/use_case/JD_recommendation.md similarity index 57% rename from docs/en/use_case/JD_recommendation_en.md rename to docs/en/use_case/JD_recommendation.md index 089bb7e810b..9a2a8cfa870 100644 --- a/docs/en/use_case/JD_recommendation_en.md +++ b/docs/en/use_case/JD_recommendation.md @@ -3,36 +3,43 @@ In this article, we will use [JD Prediction of purchase intention for high potential customers problem](https://jdata.jd.com/html/detail.html?id=1) as a demonstration,to show how we can use [OpenMLDB](https://github.com/4paradigm/OpenMLDB) and [OneFlow](https://github.com/Oneflow-Inc/oneflow) together to build a complete machine learning application. Full dataset [download here](https://openmldb.ai/download/jd-recommendation/JD_data.tgz). +## Background -Extracting patterns from historical data to predict the future purchase intentions, to bring together the most suitable products and customers who need them most, is the key issue in the application of big data in precision marketing, and is also the key technology in digitalization for all e-commerce platforms. As the largest self-operated e-commerce company in China, JD.com has accumulated hundreds of millions of loyal customers and massive amounts of real-life data. This demonstration is based on the real-life data, including real customers, product and behavior data (after desensitization) from Jingdong Mall, and utilizes data mining technology and machine learning algorithm to build a prediction model for user purchase intentions, and output matching results between high-potential customers and target products. This aims to provide high-quality target groups for precision marketing, mine the potential meaning behind the data, and provide e-commerce customers with a simpler, faster and more worry-free shopping experience. In this demonstration, OpenMLDB is used for data mining, and the [DeepFM](https://github.com/Oneflow-Inc/models/tree/main/RecommenderSystems/deepfm) model in OneFlow is used for high-performance training and inference to provide accurate product recommendations. +Extracting patterns from historical data to predict future purchase intentions, to bring together the most suitable products and customers who need them most, is the key issue in the application of big data in precision marketing, and is also the key technology in digitalization for all e-commerce platforms. As the largest self-operated e-commerce company in China, JD.com has accumulated hundreds of millions of loyal customers and massive amounts of real-life data. 
-Note that: (1) this case is based on the OpenMLDB cluster version for tutorial demonstration; (2) this document uses the pre-compiled docker image. If you want to test it in the OpenMLDB environment compiled and built by yourself, you need to configure and use our [Spark Distribution for Feature Engineering Optimization](https://github.com/4paradigm/spark). Please refer to relevant documents of [compilation](https://openmldb.ai/docs/en/main/deploy/compile.html) (Refer to Chapter: "Spark Distribution Optimized for OpenMLDB") and the [installation and deployment documents](https://openmldb.ai/docs/en/main/deploy/install_deploy.html) (Refer to the section: [Deploy TaskManager](https://openmldb.ai/docs/en/main/deploy/install_deploy.html#deploy-taskmanager)). +This demonstration is based on real-life data, including real customers, product and behavior data (after desensitization) from Jingdong Mall, and utilizes data mining technology and machine learning algorithm to build a prediction model for user purchase intentions, and output matching results between high-potential customers and target products. This aims to provide high-quality target groups for precision marketing, mine the potential meaning behind the data, and provide e-commerce customers with a simpler, faster, and more worry-free shopping experience. -## 1. Preparation and Preliminary Knowledge +In this demonstration, OpenMLDB is used for data mining, and the [DeepFM](https://github.com/Oneflow-Inc/models/tree/main/RecommenderSystems/deepfm) model in OneFlow is used for high-performance training and inference to provide accurate product recommendations. -### 1.1 Demo Scripts +```{note} +Note that this document uses the pre-compiled docker image. If you want to test it in the OpenMLDB environment compiled and built by yourself, please refer to relevant documents of [compilation](https://openmldb.ai/docs/en/main/deploy/compile.html) and the [installation and deployment documents](https://openmldb.ai/docs/en/main/deploy/install_deploy.html). +``` + +## Preparation and Preliminary Knowledge -Download demo scripts, or you can checkout `demo/jd-recommendation` in Github repo. +### Download Demo Materials + +Download demo data and scripts. ``` -wget http://openmldb.ai/download/jd-recommendation/demo.tgz +wget https://openmldb.ai/download/jd-recommendation/demo-0.8.1.tgz tar xzf demo.tgz -ls demo +ls jd-recommendation/ ``` - -Export `demodir`, we'll use the variable `demodir` in the future. +or you can checkout branch `demo/jd-recommendation`. The directory of this demo is set as `demodir`, which will be extensively used in the scripts. Therefore you need to set this environment variable: ``` export demodir=/demo ``` We'll use the small dataset in demo.tgz. If you want to test on full dataset, please download [JD_data](http://openmldb.ai/download/jd-recommendation/JD_data.tgz). -### 1.2 OneFlow Installation -OneFlow framework leverage on the great computational power from GPU. Therefore please ensure that the machines for deployment are equipped with NVidia GPUs, and ensure the driver version is >=460.X.X [driver version support for CUDA 11.0](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions). +### OneFlow Installation +OneFlow framework leverages on the great computational power from GPU. 
Therefore please ensure that the machines for deployment are equipped with NVidia GPUs, and ensure the driver version is >=460.X.X [driver version support for CUDA 11.0](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions). Install OneFlow with the following commands: ```bash conda create -y -n oneflow python=3.9.2 conda activate oneflow -pip install -f https://staging.oneflow.info/branch/master/cu112 --pre oneflow +pip install numpy==1.23 nvidia-cudnn-cu11 # for oneflow +pip install -f https://release.oneflow.info oneflow==0.9.0+cu112 pip install psutil petastorm pandas sklearn xxhash "tritonclient[all]" geventhttpclient tornado ``` @@ -44,23 +51,24 @@ docker pull oneflowinc/oneflow-serving:nightly Note that we are installing Oneflow nightly versions here. The versions tested in this guide are as follows: Oneflow:https://github.com/Oneflow-Inc/oneflow/tree/fcf205cf57989a5ecb7a756633a4be08444d8a28 Oneflow-serving:https://github.com/Oneflow-Inc/serving/tree/ce5d667468b6b3ba66d3be6986f41f965e52cf16 +If this docker image is not available, you can use this [backup image](https://openmldb.ai/download/jd-recommendation/oneflow-image.tar.gz),and then use `docker load < oneflow-image.tar.gz`. ``` -### 1.3 Pull and Start the OpenMLDB Docker Image -- Note: Please make sure that the Docker Engine version number is > = 18.03 - +### Pull and Start the OpenMLDB Docker Image Pull the OpenMLDB docker image and run. +- Docker: >=18.03 +Since the OpenMLDB cluster needs to communicate with other components, we will use the host network straightaway. In this example, we will use downloaded scripts in the docker, therefore we map the `demodir` directory into the docker container. ```bash docker run -dit --name=openmldb --network=host -v $demodir:/work/oneflow_demo 4pdosc/openmldb:0.8.4 bash docker exec -it openmldb bash ``` ```{note} -Note that all the commands for OpenMLDB part below run in the docker container by default. All the commands for OneFlow are to run in the environment as installed in 1.1. +Note that all the commands for OpenMLDB part below run in the docker container by default. All the commands for OneFlow are to run in the virtual environment `oneflow`. ``` -### 1.4 Start OpenMLDB cluster +### Start OpenMLDB cluster In container: ```bash @@ -70,39 +78,38 @@ We provide the init.sh script in the image that helps users to quickly initializ - Configure zookeeper - Start cluster version OpenMLDB -### 1.4 Start OpenMLDB CLI Client +### Start OpenMLDB CLI Client ```bash /work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client ``` -```{note} -Note that most of the commands in this tutorial are executed under the OpenMLDB CLI. In order to distinguish from the ordinary shell environment, the commands executed under the OpenMLDB CLI use a special prompt of >. -``` -```{important} -Some commands in the cluster version are non-blocking tasks, including `LOAD DATA` in online mode and `LOAD DATA`, `SELECT`, `SELECT INTO` commands in the offline mode. After submitting a task, you can use relevant commands such as `SHOW JOBS` and `SHOW JOB` to view the task progress. For details, see the offline task management document. -``` +### Preliminary +Some commands in the cluster version are non-blocking tasks, including `LOAD DATA` in online mode and `LOAD DATA`, `SELECT`, `SELECT INTO` commands in online/offline mode. 
After submitting a task, you can use relevant commands such as `SHOW JOBS` and `SHOW JOB` to view the task progress. For details, see the [offline task management document](../openmldb_sql/task_manage/SHOW_JOB.md). -## 2. Machine Learning Process Based on OpenMLDB and OneFlow +## Machine Learning Process Based on OpenMLDB and OneFlow -### 2.1 Overview +### Overview Machine learning with OpenMLDB and OneFlow can be summarized into a few main steps: -1. OpenMLDB offlien feature design and extraction (SQL) -1. OneFlow model training -1. SQL and model serving +1. OpenMLDB offline feature design and extraction (SQL) +2. OneFlow model training +3. SQL and model serving + We will detail each step in the following sections. -### 2.2 Offline feature extraction with OpenMLDB -#### 2.2.1 Creating Databases and Data Tables -The following commands are executed in the OpenMLDB CLI environment. +### Offline feature extraction with OpenMLDB +The following commands are all executed in OpenMLDB CLI. + +#### Creating Databases and Data Tables ```sql -> CREATE DATABASE JD_db; -> USE JD_db; -> CREATE TABLE action(reqId string, eventTime timestamp, ingestionTime timestamp, actionValue int); -> CREATE TABLE flattenRequest(reqId string, eventTime timestamp, main_id string, pair_id string, user_id string, sku_id string, time bigint, split_id int, time1 string); -> CREATE TABLE bo_user(ingestionTime timestamp, user_id string, age string, sex string, user_lv_cd string, user_reg_tm bigint); -> CREATE TABLE bo_action(ingestionTime timestamp, pair_id string, time bigint, model_id string, type string, cate string, br string); -> CREATE TABLE bo_product(ingestionTime timestamp, sku_id string, a1 string, a2 string, a3 string, cate string, br string); -> CREATE TABLE bo_comment(ingestionTime timestamp, dt bigint, sku_id string, comment_num int, has_bad_comment string, bad_comment_rate float); +-- OpenMLDB CLI +CREATE DATABASE JD_db; +USE JD_db; +CREATE TABLE action(reqId string, eventTime timestamp, ingestionTime timestamp, actionValue int); +CREATE TABLE flattenRequest(reqId string, eventTime timestamp, main_id string, pair_id string, user_id string, sku_id string, time bigint, split_id int, time1 string); +CREATE TABLE bo_user(ingestionTime timestamp, user_id string, age string, sex string, user_lv_cd string, user_reg_tm bigint); +CREATE TABLE bo_action(ingestionTime timestamp, pair_id string, time bigint, model_id string, type string, cate string, br string); +CREATE TABLE bo_product(ingestionTime timestamp, sku_id string, a1 string, a2 string, a3 string, cate string, br string); +CREATE TABLE bo_comment(ingestionTime timestamp, dt bigint, sku_id string, comment_num int, has_bad_comment string, bad_comment_rate float); ``` You can also use sql script to execute (`/work/oneflow_demo/sql_scripts/create_tables.sql`) as shown below: @@ -110,26 +117,24 @@ You can also use sql script to execute (`/work/oneflow_demo/sql_scripts/create_t /work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client < /work/oneflow_demo/sql_scripts/create_tables.sql ``` -#### 2.2.2 Offline Data Preparation -First, you need to switch to offline execution mode. Next, import the sample data as offline data for offline feature calculation. - -The following commands are executed under the OpenMLDB CLI. +#### Offline Data Preparation +First, you need to switch to offline execution mode. Next, import the sample data as offline data for offline feature calculation. 
If you are importing a larger dataset, you can consider using soft links to reduce import time. In this demo, only small amount of data is imported, thus we use hard copy. For multiple data imports, the asynchronous mode will be more time-efficient. But you need to make sure that all imports are done before going into the next step. ```sql -> USE JD_db; -> SET @@execute_mode='offline'; -> LOAD DATA INFILE '/root/project/data/JD_data/action/*.parquet' INTO TABLE action options(format='parquet', header=true, mode='overwrite'); -> LOAD DATA INFILE '/root/project/data/JD_data/flattenRequest_clean/*.parquet' INTO TABLE flattenRequest options(format='parquet', header=true, mode='overwrite'); -> LOAD DATA INFILE '/root/project/data/JD_data/bo_user/*.parquet' INTO TABLE bo_user options(format='parquet', header=true, mode='overwrite'); -> LOAD DATA INFILE '/root/project/data/JD_data/bo_action/*.parquet' INTO TABLE bo_action options(format='parquet', header=true, mode='overwrite'); -> LOAD DATA INFILE '/root/project/data/JD_data/bo_product/*.parquet' INTO TABLE bo_product options(format='parquet', header=true, mode='overwrite'); -> LOAD DATA INFILE '/root/project/data/JD_data/bo_comment/*.parquet' INTO TABLE bo_comment options(format='parquet', header=true, mode='overwrite'); +-- OpenMLDB CLI +USE JD_db; +SET @@execute_mode='offline'; +LOAD DATA INFILE '/work/oneflow_demo/data/action/*.parquet' INTO TABLE action options(format='parquet', header=true, mode='overwrite'); +LOAD DATA INFILE '/work/oneflow_demo/data/flattenRequest_clean/*.parquet' INTO TABLE flattenRequest options(format='parquet', header=true, mode='overwrite'); +LOAD DATA INFILE '/work/oneflow_demo/data/JD_data/bo_user/*.parquet' INTO TABLE bo_user options(format='parquet', header=true, mode='overwrite'); +LOAD DATA INFILE '/work/oneflow_demo/data/JD_data/bo_action/*.parquet' INTO TABLE bo_action options(format='parquet', header=true, mode='overwrite'); +LOAD DATA INFILE '/work/oneflow_demo/data/JD_data/bo_product/*.parquet' INTO TABLE bo_product options(format='parquet', header=true, mode='overwrite'); +LOAD DATA INFILE '/work/oneflow_demo/data/JD_data/bo_comment/*.parquet' INTO TABLE bo_comment options(format='parquet', header=true, mode='overwrite'); ``` -or use script to execute, and check the job status with the following commands: +or use a script to execute, and check the job status with the following commands: ``` /work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client < /work/oneflow_demo/sql_scripts/load_offline_data.sql - echo "show jobs;" | /work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client ``` @@ -137,14 +142,17 @@ echo "show jobs;" | /work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk Note that `LOAD DATA` is a non-blocking task. You can use the command `SHOW JOBS` to view the running status of the task. Please wait for the task to run successfully (`state` to `FINISHED` status) before proceeding to the next step. ``` -#### 2.2.3 The Feature Extraction Script -Usually, users need to analyse the data according to the goal of machine learning before designing the features, and then design and investigate the features according to the analysis. Data analysis and feature research of the machine learning are not the scope of this demo, and we will not expand it. 
We assume that users already have the basic theoretical knowledge of machine learning, the ability to solve machine learning problems, the ability to understand SQL syntax, and the ability to use SQL syntax to construct features. For this case, we have designed several features after the analysis and research. +#### The Feature Extraction Script +Usually, users need to analyze the data according to the goal of machine learning before designing the features, and then design and investigate the features according to the analysis. Data analysis and feature research of machine learning are not in the scope of this demo, and we will not expand it. We assume that users already have the basic theoretical knowledge of machine learning, the ability to solve machine learning problems, the ability to understand SQL syntax, and the ability to use SQL syntax to construct features. For this case, we have designed several features after the analysis and research. + +In the actual process of machine learning feature exploration, scientists repeatedly experiment with features, seeking the best feature set for model effectiveness. Therefore, they continuously repeat the process of "feature design -> offline feature extraction -> model training," constantly adjusting features to achieve the desired results. -#### 2.2.4 Offline Feature Extraction -In the offline mode, the user extracts features and outputs the feature results to `'/root/project/out/1`(mapped to`$demodir/out/1`) that is saved in the data directory for subsequent model training. The `SELECT` command corresponds to the SQL feature extraction script generated based on the above table. The following commands are executed under the OpenMLDB CLI. +#### Offline Feature Extraction +In the offline mode, the user extracts features and outputs the feature results to `'/work/oneflow_demo/out/1`(mapped to`$demodir/out/1`) which is saved in the data directory for subsequent model training. The `SELECT` command corresponds to the SQL feature extraction script generated based on the above table. The following commands are executed under the OpenMLDB CLI. ```sql -> USE JD_db; -> select * from +-- OpenMLDB CLI +USE JD_db; +select * from ( select `reqId` as reqId_1, @@ -230,17 +238,18 @@ as out3 on out0.reqId_1 = out3.reqId_17 INTO OUTFILE '/work/oneflow_demo/out/1' OPTIONS(mode='overwrite'); ``` +```{note} +Note that the cluster version `SELECT INTO` is a non-blocking task. You can use the command `SHOW JOBS` to view the running status of the task. Please wait for the task to run successfully (`state` to `FINISHED` status) before proceeding to the next step. It takes around 1.5 minites. +``` Since there is only one command, we can directly execute the sql script `sync_select_out.sql`: ``` /work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client < /work/oneflow_demo/sql_scripts/sync_select_out.sql ``` + +### Pre-process Dataset to Match DeepFM Model Requirements ```{note} -Note that the cluster version `SELECT INTO` is a non-blocking task. You can use the command `SHOW JOBS` to view the running status of the task. Please wait for the task to run successfully (`state` to `FINISHED` status) before proceeding to the next step. -``` -### 2.3 Pre-process Dataset to Match DeepFM Model Requirements -```{note} -Note that following commands are executed outside the demo docker. They are executed in the environment as installed in section 1.1. +Note that following commands are executed outside the demo docker. 
They are executed in the virtual environment for OneFlow. ``` According to [DeepFM paper](https://arxiv.org/abs/1703.04247), we treat both categorical and continuous features as sparse features. @@ -279,9 +288,9 @@ out/ 3 directories, 4 files ``` -### 2.4 Launch OneFlow for Model Training +### Launch OneFlow for Model Training ```{note} -Note that following commands are executed in the environment as installed in section 1.2. +Note that the following commands are executed in the virtual environment for OneFlow. ``` ```bash @@ -290,34 +299,39 @@ sh train_deepfm.sh -h Usage: train_deepfm.sh DATA_DIR(abs) We'll read required args in $DATA_DIR/data_info.txt, and save results in path ./ ``` -The usage is shown above. So we run: +The training in OneFlow is done with the script `train_deepfm.sh`, with usage shown above. Normally, no special configurations are required. The scripts will read the parameters from `$DATA_DIR/data_info.txt`, including `num_train_samples`, `num_val_samples`, `num_test_samples` and `table_size_array`. Please use the output directory as follows: ```bash bash train_deepfm.sh $demodir/feature_preprocess/out ``` -Trained model will be saved in `$demodir/oneflow_process/model_out`, saved model for serving will be saved in `$demodir/oneflow_process/model/embedding/1/model`. +The trained model will be saved in `$demodir/oneflow_process/model_out`, saved model for serving will be saved in `$demodir/oneflow_process/model/embedding/1/model`. -## 3. Model Serving -### 3.1 Overview +## Model Serving +### Overview Model serving with OpenMLDB+OneFlow can be summarized into a few main steps. 1. OpenMLDB deploying: deploy SQL and prepare the online data -1. Oneflow serving: load model -1. Predict serving demo +2. Oneflow serving: load model +3. Predict serving demo + We will detail each step in the following sections. -### 3.2 OpenMLDB deploying +### OpenMLDB Deploying -#### 3.2.1 Online SQL Deployment -Assuming that the model produced by the features designed in Section 2.2.3 in the previous model training meets the expectation. The next step is to deploy the feature extraction SQL script online to provide real-time feature extraction. +#### Online SQL Deployment +Assuming that the model produced by the features designed in the previous model training meets the expectation. The next step is to deploy the feature extraction SQL script online to provide real-time feature extraction. In OpenMLDB docker(if exited, enter with `docker exec -it openmldb bash`): 1. Restart OpenMLDB CLI for SQL online deployment. ```bash - docker exec -it demo bash /work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client ``` -2. Deploy the sql(see [Offline Feature Extracion](#224-offline-feature-extraction)) - +2. Deploy the sql(see [Offline Feature Extracion](#offline-feature-extraction)) +```sql +-- OpenMLDB CLI +USE JD_db; +DEPLOY demo OPTIONS(RANGE_BIAS='inf', ROWS_BIAS='inf') ; +``` +Or you can deploy with script inside the docker: ``` /work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client < /work/oneflow_demo/sql_scripts/deploy.sql ``` @@ -326,18 +340,22 @@ Use the following command to check the deployment details: ```sql show deployment demo; ``` +After deployment, you can access the service through OpenMLDB ApiServer `127.0.0.1:9080`. + +#### Online Data Import +We need to import the data for real-time feature extraction. For simplicity, we directly import and use the same dataset as offline. 
In production, typically the offline dataset comprises a large volume of cold data, while the online dataset consists of recent hot data. -#### 3.2.2 Online Data Import -We need to import the data for real-time feature extraction. First, you need to switch to **online** execution mode. Then, in the online mode, import the sample data as the online data source. The following commands are executed under the OpenMLDB CLI. +The following commands are executed under the OpenMLDB CLI. ```sql -> USE JD_db; -> SET @@execute_mode='online'; -> LOAD DATA INFILE '/work/oneflow_demo/data/JD_data/action/*.parquet' INTO TABLE action options(format='parquet', mode='append'); -> LOAD DATA INFILE '/work/oneflow_demo/data/JD_data/flattenRequest_clean/*.parquet' INTO TABLE flattenRequest options(format='parquet', mode='append'); -> LOAD DATA INFILE '/work/oneflow_demo/data/JD_data/bo_user/*.parquet' INTO TABLE bo_user options(format='parquet', mode='append'); -> LOAD DATA INFILE '/work/oneflow_demo/data/JD_data/bo_action/*.parquet' INTO TABLE bo_action options(format='parquet', mode='append'); -> LOAD DATA INFILE '/work/oneflow_demo/data/JD_data/bo_product/*.parquet' INTO TABLE bo_product options(format='parquet', mode='append'); -> LOAD DATA INFILE '/work/oneflow_demo/data/JD_data/bo_comment/*.parquet' INTO TABLE bo_comment options(format='parquet', mode='append'); +-- OpenMLDB CLI +USE JD_db; +SET @@execute_mode='online'; +LOAD DATA INFILE '/work/oneflow_demo/data/JD_data/action/*.parquet' INTO TABLE action options(format='parquet', mode='append'); +LOAD DATA INFILE '/work/oneflow_demo/data/JD_data/flattenRequest_clean/*.parquet' INTO TABLE flattenRequest options(format='parquet', mode='append'); +LOAD DATA INFILE '/work/oneflow_demo/data/JD_data/bo_user/*.parquet' INTO TABLE bo_user options(format='parquet', mode='append'); +LOAD DATA INFILE '/work/oneflow_demo/data/JD_data/bo_action/*.parquet' INTO TABLE bo_action options(format='parquet', mode='append'); +LOAD DATA INFILE '/work/oneflow_demo/data/JD_data/bo_product/*.parquet' INTO TABLE bo_product options(format='parquet', mode='append'); +LOAD DATA INFILE '/work/oneflow_demo/data/JD_data/bo_comment/*.parquet' INTO TABLE bo_comment options(format='parquet', mode='append'); ``` You can run the script: @@ -346,16 +364,16 @@ You can run the script: ``` And check the import job status by: ``` - echo "show jobs;" | /work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client + echo "show jobs;" | /work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client ``` ```{note} Note that the cluster version `LOAD DATA` is a non-blocking task. You can use the command `SHOW JOBS` to view the running status of the task. Please wait for the task to run successfully (`state` to `FINISHED` status) before proceeding to the next step. ``` -### 3.3 Oneflow serving +### Oneflow Serving -#### 3.3.1 Check Config +#### Check Config Check if model files `$demodir/oneflow_process/model` are correctly organized and saved as shown below: ``` @@ -384,9 +402,9 @@ model/ | `-- one_embedding_options.json `-- config.pbtxt ``` -1. Field `name` in `config.pbtxt` should be consistent with the name of the folder(`embedding`). And `persistent_table.path` will be generated automatically in `model/embedding/1/model/one_embedding_options.json`, you can check if it's the absolute path of`$demodir/oneflow_process/persistent`. +Field `name` in `config.pbtxt` should be consistent with the name of the folder(`embedding`). 
And `persistent_table.path` will be generated automatically in `model/embedding/1/model/one_embedding_options.json`, you can check if it's the absolute path of`$demodir/oneflow_process/persistent`. -#### 3.3.2 Start OneFlow serving +#### Start OneFlow serving Start OneFlow model serving with the following commands: ``` docker run --runtime=nvidia --rm -p 8001:8001 -p8000:8000 -p 8002:8002 \ @@ -395,8 +413,22 @@ docker run --runtime=nvidia --rm -p 8001:8001 -p8000:8000 -p 8002:8002 \ oneflowinc/oneflow-serving:nightly \ bash -c '/opt/tritonserver/bin/tritonserver --model-repository=/models' ``` -If sucessful, the output will look like the following: -``` +If successful, the output will look like the following: +``` +I0711 09:58:55.199227 1 server.cc:549] ++---------+---------------------------------------------------------+--------+ +| Backend | Path | Config | ++---------+---------------------------------------------------------+--------+ +| oneflow | /opt/tritonserver/backends/oneflow/libtriton_oneflow.so | {} | ++---------+---------------------------------------------------------+--------+ + +I0711 09:58:55.199287 1 server.cc:592] ++-----------+---------+--------+ +| Model | Version | Status | ++-----------+---------+--------+ +| embedding | 1 | READY | ++-----------+---------+--------+ +... I0929 07:28:34.281655 1 grpc_server.cc:4117] Started GRPCInferenceService at 0.0.0.0:8001 I0929 07:28:34.282343 1 http_server.cc:2815] Started HTTPService at 0.0.0.0:8000 I0929 07:28:34.324662 1 http_server.cc:167] Started Metrics Service at 0.0.0.0:8002 @@ -406,29 +438,33 @@ We can request `http://127.0.0.1:8000` to do predict. You can check if the servi ``` curl -v localhost:8000/v2/health/ready ``` - -If the repsonse is `Connection refused`, the serving failed to start. - +If the response is `Connection refused`, the serving failed to start. +Furthermore, check if model is successfully loaded: +``` +curl -v localhost:8000/v2/models/stats +``` +If successful, you will be able to see `embedding` information. Else, check the model path and the organization. ```{note} -If port 800x confict, you can change the host port. For example, use `-p 18000:8000`. If you change the host port mapping of 8000, you should change the oneflow request port in predict server demo too. +If port 800x confict, you can change the host port. For example, use `-p 18000:8000`. If you change the host port mapping of 8000, you should change the oneflow request port in the predict server demo too. ``` -### 3.4 Predict Serving Demo +### Predict Serving Demo ```{note} -Note that following commands are executed in the environment as installed in section 1.2. +Note that the following commands can be executed in any environment. Because of Python dependencies, we recommend using the virtual environment of OneFlow. ``` -The start script use `127.0.0.1:9080` to query OpenMLDB ApiServer, and `127.0.0.1:8000`to query OneFlow Triton serving。 +Upon receiving a request, the prediction service first obtains real-time features through OpenMLDB and the request the inference service with real-time features. The script uses `127.0.0.1:9080` to query OpenMLDB ApiServer, and `127.0.0.1:8000` to query OneFlow Triton serving. ```bash sh $demodir/serving/start_predict_server.sh ``` +You can check execution logs from `/tmp/p.log`. -### 3.5 Send Real-Time Request to test +### Send Real-Time Request to test Requests can be executed outside the OpenMLDB docker. 
The details can be found in [IP Configuration](https://openmldb.ai/docs/en/main/reference/ip_tips.html). -Execute `predict.py` in command window. This script will send a line of request data to the prediction service. Results will be received and printed out. +`predict.py` will send a line of request data to the prediction service. Results will be received and printed out. ```bash python $demodir/serving/predict.py @@ -436,6 +472,7 @@ python $demodir/serving/predict.py Sample output: ``` ----------------ins--------------- + ['200080_5505_2016-03-15 20:43:04' 1458045784000 '200080_5505_2016-03-15 20:43:04' '200080_5505' '5505' '200080' 1 1.0 1.0 1 1 3 1 '200080_5505_2016-03-15 20:43:04' None '3' '1' '1' '214' '8' @@ -443,6 +480,12 @@ Sample output: 0.02879999950528145 0.0 0.0 2 2 '1,,NULL' '4,0,NULL' '200080_5505_2016-03-15 20:43:04' ',NULL,NULL' ',NULL,NULL' ',NULL,NULL' 1 1 1 ',NULL,NULL' ',NULL,NULL'] + ---------------predict change of purchase ------------- + [[b'0.007005:0']] + +``` +```{note} +If an error occurs, use client.py in the serving directory, or [download](https://github.com/4paradigm/OpenMLDB/blob/f2d985c986c5c4cbe538b01dabdbd1956588da40/demo/jd-recommendation/serving/client.py), to separately debug triton infer. ``` diff --git a/docs/en/use_case/OpenMLDB_Byzer_taxi.md b/docs/en/use_case/OpenMLDB_Byzer_taxi.md deleted file mode 100644 index 9554f77ea87..00000000000 --- a/docs/en/use_case/OpenMLDB_Byzer_taxi.md +++ /dev/null @@ -1,276 +0,0 @@ -# Build End-to-end Machine Learning Applications Based on SQL (OpenMLDB + Byzer) - -This tutorial will show you how to complete a machine learning workflow with the help of [OpenMLDB](https://github.com/4paradigm/OpenMLDB) and [Byzer](https://www.byzer.org/home). -OpenMLDB will compute real-time features based on the data and queries from Byzer, and then return results to Byzer for subsequent model training and inference. - -## 1. Preparations - -### 1.1 Install OpenMLDB - -1. The demo will use the OpenMLDB cluster version running in Docker. See [OpenMLDB Quickstart](../quickstart/openmldb_quickstart.md) for detail installation procedures. -2. Please modify the OpenMLDB IP configuration in order to enable the Byzer engine to access the OpenMLDB service out of the container. See [IP Configuration](../reference/ip_tips.md) for detail guidance. - -### 1.2 Install the Byzer Engine and the Byzer Notebook - -1. For detail installation procedures of Byzer engine, see [Byzer Language Doc](https://docs.byzer.org/#/byzer-lang/en-us/). - -2. We have to use the [OpenMLDB plugin](https://github.com/byzer-org/byzer-extension/tree/master/byzer-openmldb) developed by Byzer to transmit messages between two platforms. To use a plugin in Byzer, please configure `streaming.datalake.path`, see [the manual of Byzer Configuration](https://docs.byzer.org/#/byzer-lang/zh-cn/installation/configuration/byzer-lang-configuration) for detail. - -3. Byzer Notebook is used in this demo. Please install it after the installation of Byzer engine. You can also use the [VSCode Byzer plugin](https://docs.byzer.org/#/byzer-lang/zh-cn/installation/vscode/byzer-vscode-extension-installation) to connect your Byzer engine. The interface of Byzer Notebook is shown below, see [Byzer Notebook Doc](https://docs.byzer.org/#/byzer-notebook/zh-cn/) for more about it. - -![Byzer_Notebook](images/Byzer_Notebook.jpg) - - -### 1.3 Dataset Preparation -In this case, the dataset comes from the Kaggle taxi trip duration prediction problem. 
If it is not in your Byzer `Deltalake`, [download](https://www.kaggle.com/c/nyc-taxi-trip-duration/overview) it first. Please remember to import it into Byzer Notebook after download. - - -## 2. The Workflow of Machine Learning - -### 2.1 Load the Dataset - -Please import the origin dataset into the `File System` of Byzer Notebook, it will automatically generate the storage path `tmp/upload`. -Use the `load` Byzer Lang command as below to load this dataset. -```sql -load csv.`tmp/upload/train.csv` where delimiter="," -and header = "true" -as taxi_tour_table_train_simple; -``` - -### 2.2 Import the Dataset into OpenMLDB - -Install the OpenMLDB plugin in Byzer. - -```sql -!plugin app add - "byzer-openmldb-3.0"; -``` - -Now you can use this plugin to connect OpenMLDB. **Please make sure the OpenMLDB engine has started and there is a database named `db1` before you run the following code block in Byzer Notebook.** - -```sql -run command as FeatureStoreExt.`` where -zkAddress="172.17.0.2:7527" -and `sql-0`=''' -SET @@execute_mode='offline'; -''' -and `sql-1`=''' -SET @@job_timeout=20000000; -''' -and `sql-2`=''' -CREATE TABLE t1(id string, vendor_id int, pickup_datetime timestamp, dropoff_datetime timestamp, passenger_count int, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, store_and_fwd_flag string, trip_duration int); -''' -and `sql-3`=''' -LOAD DATA INFILE 'tmp/upload/train.csv' -INTO TABLE t1 options(format='csv',header=true,mode='append'); -''' -and db="db1" -and action="ddl"; -``` - -```{note} -1. The port number of zkAddress should correspond with the files' IP configuration under the OpenMLDB `conf/` path. -2. You can check the `streaming.plugin.clzznames` of the `\byzer.properties.override` file, which is under the `$BYZER_HOME\conf` path of Byzer, to see if the `byzer-openmldb-3.0` plugin is successfully installed. You can see the main class name `tech.mlsql.plugins.openmldb.ByzerApp` after installation. -3. If the plugin installation fail, download the `.jar` files and [install it offline](https://docs.byzer.org/#/byzer-lang/zh-cn/extension/installation/offline_install). -``` - -### 2.3 Real-time Feature Extractions - -The features developed in the [OpenMLDB + LightGBM: Taxi Trip Duration Prediction](./lightgbm_demo.md) Section 2.3 will be used in this demo. -The processed data will be exported to a local `csv` file. - -```sql -run command as FeatureStoreExt.`` where -zkAddress="172.17.0.2:7527" -and `sql-0`=''' -SET @@execute_mode='offline'; -''' -and `sql-1`=''' -SET @@job_timeout=20000000; -''' -and `sql-2`=''' -SELECT trp_duration, passanger_count, -sum(pickup_latitude) OVER w AS vendor_sum_pl, -max(pickup_latitude) OVER w AS vendor_max_pl, -min(pickup_latitude) OVER w AS vendor_min_pl, -avg(pickup_latitude) OVER W AS vendor_avg_pl, -sum(pickup_latitude) OVER w2 AS pc_sum_pl, -max(pickup_latitude) OVER w2 AS pc_max_pl, -min(pickup_latitude) OVER w2 AS pc_min_pl, -avg(pickup_latitude) OVER w2 AS pc_avg_pl, -count(vendor_id) OVER w2 AS pc_cnt, -count(vendor_id) OVER w AS vendor_cnt -FROM t1 -WINDOW w AS(PARTITION BY vendor_id ORDER BY ickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW), -w2 AS(PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW) INTO OUTFILE '/tmp/feature_data'; -''' -and db="db1" -and action="ddl"; -``` - - - -### 2.4 Data Vectorization -Convert all `int` type fields to `double` in Byzer Notebook. 
- -```sql -select *, -cast(passenger_count as double) as passenger_count_d, -cast(pc_cnt as double) as pc_cnt_d, -cast(vendor_cnt as double) as vendor_cnt_d -from feature_data -as new_feature_data; -``` - -Then merge all the fields into a vector. - -```sql -select vec_dense(array( -passenger_count_d, -vendor_sum_pl, -vendor_max_pl, -vendor_min_pl, -vendor_avg_pl, -pc_sum_pl, -pc_max_pl, -pc_min_pl, -pc_avg_pl, -pc_cnt_d, -vendor_cnt -)) as features,cast(trip_duration as double) as label -from new_feature_data -as trainning_table; - -``` - - - -### 2.5 Training - -Use the `train` Byzer Lang command and its [built-in Linear Regression Algorithm](https://docs.byzer.org/#/byzer-lang/zh-cn/ml/algs/linear_regression) to train the model, and save it to `/model/tax-trip`. - -```sql -train trainning_table as LinearRegression.`/model/tax-trip` where - -keepVersion="true" - -and evaluateTable="trainning_table" -and `fitParam.0.labelCol`="label" -and `fitParam.0.featuresCol`= "features" -and `fitParam.0.maxIter`="50"; - -``` - -```{note} -To check the parameters of Byzer's inbuilt Linear Regression Algorithm, please use `!show et/params/LinearRegression;` command. -``` - -### 2.6 Feature Deployment - -Deploy the feature extraction script onto OpenMLDB: copy the best performance code and set the `execute_mode` to `online`. -The following example uses the code the same as that in the feature extraction, which might not be the 'best'. -```sql -run command as FeatureStoreExt.`` where -zkAddress="172.17.0.2:7527" -and `sql-0`=''' -SET @@execute_mode='online'; -''' -and `sql-1`=''' -SET @@job_timeout=20000000; -''' -and `sql-2`=''' -SELECT trp_duration, passanger_count, -sum(pickup_latitude) OVER w AS vendor_sum_pl, -max(pickup_latitude) OVER w AS vendor_max_pl, -min(pickup_latitude) OVER w AS vendor_min_pl, -avg(pickup_latitude) OVER W AS vendor_avg_pl, -sum(pickup_latitude) OVER w2 AS pc_sum_pl, -max(pickup_latitude) OVER w2 AS pc_max_pl, -min(pickup_latitude) OVER w2 AS pc_min_pl, -avg(pickup_latitude) OVER w2 AS pc_avg_pl, -count(vendor_id) OVER w2 AS pc_cnt, -count(vendor_id) OVER w AS vendor_cnt -FROM t1 -WINDOW w AS(PARTITION BY vendor_id ORDER BY ickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW), -w2 AS(PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW) INTO OUTFILE '/tmp/feature_data_test'; -''' -and db="db1" -and action="ddl"; - -``` - -Import the online data: the following example uses the test set from Kaggle, real-time data source can be connected instead in production. - -```sql -run command as FeatureStoreExt.`` where -zkAddress="172.17.0.2:7527" -and `sql-0`=''' -SET @@execute_mode='online'; -''' -and `sql-1`=''' -SET @@job_timeout=20000000; -''' -and `sql-2`=''' -CREATE TABLE t1(id string, vendor_id int, pickup_datetime timestamp, dropoff_datetime timestamp, passenger_count int, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, store_and_fwd_flag string, trip_duration int); -''' -and `sql-3`=''' -LOAD DATA INFILE 'tmp/upload/test.csv' -INTO TABLE t1 options(format='csv',header=true,mode='append'); -''' -and db="db1" -and action="ddl"; -``` - - - -### 2.7 Model Deployment - -Register the previously trained and saved model as a UDF function in Byzer Notebook in order to use it more conveniently. 
- -```sql -register LinearRegression.`/model/tax-trip` as tax_trip_model_predict; -``` - -### 2.8 Prediction - -Convert all `int` type fields of the online dataset, after processed by OpenMLDB, to `double`. - -```sql -select *, -cast(passenger_count as double) as passenger_count_d, -cast(pc_cnt as double) as pc_cnt_d, -cast(vendor_cnt as double) as vendor_cnt_d -from feature_data_test -as new_feature_data_test; -``` - -Then merge all the fields into a vector. - - -```sql -select vec_dense(array( -passenger_count_d, -vendor_sum_pl, -vendor_max_pl, -vendor_min_pl, -vendor_avg_pl, -pc_sum_pl, -pc_max_pl, -pc_min_pl, -pc_avg_pl, -pc_cnt_d, -vendor_cnt -)) as features, -from new_feature_data_test -as testing_table; -``` - -Use this processed test set to predict. - -```sql -select tax_trip_model_predict(testing_table) as predict_label; -``` - - - - - diff --git a/docs/en/use_case/images/ds_tenant_manage.png b/docs/en/use_case/images/ds_tenant_manage.png deleted file mode 100644 index 0f221e6e048..00000000000 Binary files a/docs/en/use_case/images/ds_tenant_manage.png and /dev/null differ diff --git a/docs/en/use_case/index.rst b/docs/en/use_case/index.rst index 34d8d1e0316..c13463d9c7f 100644 --- a/docs/en/use_case/index.rst +++ b/docs/en/use_case/index.rst @@ -5,11 +5,6 @@ Use Cases .. toctree:: :maxdepth: 1 - lightgbm_demo - pulsar_connector_demo - kafka_connector_demo - dolphinscheduler_task_demo + taxi_tour_duration_prediction + JD_recommendation talkingdata_demo - JD_recommendation_en - airflow_provider_demo - OpenMLDB_Byzer_taxi diff --git a/docs/en/use_case/taxi_tour_duration_prediction.md b/docs/en/use_case/taxi_tour_duration_prediction.md new file mode 100644 index 00000000000..fb790441793 --- /dev/null +++ b/docs/en/use_case/taxi_tour_duration_prediction.md @@ -0,0 +1,218 @@ +# Taxi Journey Time Prediction (OpenMLDB+LightGBM) + +This article will use [The Problem of Predicting Taxi Travel Time on Kaggle](https://www.kaggle.com/c/nyc-taxi-trip-duration/overview) as an example to demonstrate how to use the combination of OpenMLDB and LightGBM to create a complete machine-learning application. + +Please note that this document employs a pre-compiled Docker image. If you wish to perform tests in your self-compiled and built OpenMLDB environment, you will need to configure and utilize the [Spark Distribution Documentation for Feature Engineering Optimization](https://github.com/4paradigm/Spark/). Refer to the [Spark Distribution Documentation for OpenMLDB Optimization](../tutorial/openmldbspark_distribution.md#openmldb-spark-distribution) and the [Installation and Deployment Documentation](../deploy/install_deploy.md#modifyingtheconfigurationfileconftaskmanagerproperties) for more detailed information. + +## Preparation and Preliminary Knowledge + +This article is centered around the development and deployment of OpenMLDB CLI. To begin, you should download the sample data and initiate the OpenMLDB CLI. We recommend using Docker images for a streamlined experience. + +- Docker version: >= 18.03 + +### Pull Image + +Execute the following command from the command line to pull the OpenMLDB image and start the Docker container: + +```bash +docker run -it 4pdosc/openmldb:0.8.4 bash +``` + +This image comes pre-installed with OpenMLDB and encompasses all the scripts, third-party libraries, open-source tools, and training data necessary for this case. 
+ +```{note} +Keep in mind that the demonstration commands in the OpenMLDB section of this tutorial are executed by default within the Docker container that has been started. +``` + +### Initialize Environment + +```bash +./init.sh +cd taxi-trip +``` + +The init.sh script provided within the image helps users initialize the environment quickly, including: + +- Configuring Zookeeper +- Starting the cluster version of OpenMLDB + +### Start OpenMLDB CLI + +```bash +/work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client +``` + +### Preliminary Knowledge: Asynchronous Tasks + +Some OpenMLDB commands are asynchronous, such as the `LOAD DATA`, `SELECT`, and `SELECT INTO` commands in online/offline mode. After submitting a task, you can use relevant commands such as `SHOW JOBS` and `SHOW JOB` to view the progress of the task. For details, please refer to the [Offline Task Management Document](../openmldb_sql/task_manage/index.rst). + +## Machine Learning Process + +### Step 1: Create Database and Table + +Create database `demo_db` and table `t1`: + +```sql +--OpenMLDB CLI +CREATE DATABASE demo_db; +USE demo_db; +CREATE TABLE t1(id string, vendor_id int, pickup_datetime timestamp, dropoff_datetime timestamp, passenger_count int, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, store_and_fwd_flag string, trip_duration int); +``` + +### Step 2: Import Offline Data + +First, switch to offline execution mode. Then, import the sample data `/work/taxi-trip/data/taxi_tour_table_train_simple.snappy.parquet` as offline data for offline feature computation. + +```sql +--OpenMLDB CLI +USE demo_db; +SET @@execute_mode='offline'; +LOAD DATA INFILE '/work/taxi-trip/data/taxi_tour_table_train_simple.snappy.parquet' INTO TABLE t1 options(format='parquet', header=true, mode='append'); +``` + +```{note} +`LOAD DATA` is an asynchronous task. Please use the `SHOW JOBS` command to check the task's running status. Wait for the task to run successfully (from `state` transitions to `FINISHED` status) before proceeding to the next step. +``` + +### Step 3: Feature Design + +Typically, before designing features, users need to analyze the data based on machine learning goals and then design and explore features based on this analysis. However, this article does not cover data analysis and feature research in machine learning. It assumes that users have a basic understanding of machine learning theory, the ability to solve machine learning problems, proficiency in SQL syntax, and the capacity to construct features using SQL syntax. In this case, it is assumed that the user has designed the following features through analysis and research: + +| Feature Name | Characteristic Meaning | SQL Feature Representation | +| --------------- | ------------------------------------------------------------ | --------------------------------------- | +| trip_duration | Travel time for a single trip | `trip_duration` | +| passenger_count | Number of passengers | `passenger_count` | +| vendor_sum_pl | The cumulative pickup_latitude for taxis of the same brand within the last 1-day time window. 
| m(pickup_latitude) OVER w` | +| vendor_max_pl | The maximum pickup_latitude for taxis of the same brand within the last 1-day time window | `max(pickup_latitude) OVER w` | +| vendor_min_pl | The minimum pickup_latitude for taxis of the same brand within the last 1-day time window | `min(pickup_latitude) OVER w` | +| vendor_avg_pl | The average pickup_latitude for taxis of the same brand within the last 1-day time window | `avg(pickup_latitude) OVER w` | +| pc_sum_pl | The cumulative pickup_latitude for trips with the same passenger count within the last 1-day time window. | `sum(pickup_latitude) OVER w2` | +| pc_max_pl | The maximum pickup_latitude for trips with the same passenger count within the last 1-day time window. | `max(pickup_latitude) OVER w2` | +| pc_min_pl | The minimum pickup_latitude for trips with the same passenger count within the last 1-day time window. | `min(pickup_latitude) OVER w2` | +| pc_avg_pl | The average pickup_latitude for trips with the same passenger count within the last 1-day time window. | `avg(pickup_latitude) OVER w2` | +| pc_cnt | The total number of trips with the same passenger capacity within the last 1-day time window | `count(vendor_id) OVER w2` | +| vendor_cnt | The total number of trips for taxis of the same brand within the last day's time window | `count(vendor_id) OVER w AS vendor_cnt` | + +In the actual process of machine learning feature research, scientists conduct repeated experiments on features to find the best feature set for the model. So the process of "feature design -> offline feature extraction -> model training" will be repeated multiple times, and the features will be continuously adjusted to achieve the desired outcome. + +### Step 4: Offline Feature Extraction + +Users perform feature extraction in offline mode and save the feature results in the `/tmp/feature_data` directory for future model training. The `SELECT` command corresponds to the SQL feature computation script generated based on the above feature design. + +```sql +--OpenMLDB CLI +USE demo_db; +SET @@execute_mode='offline'; +SELECT trip_duration, passenger_count, +sum(pickup_latitude) OVER w AS vendor_sum_pl, +max(pickup_latitude) OVER w AS vendor_max_pl, +min(pickup_latitude) OVER w AS vendor_min_pl, +avg(pickup_latitude) OVER w AS vendor_avg_pl, +sum(pickup_latitude) OVER w2 AS pc_sum_pl, +max(pickup_latitude) OVER w2 AS pc_max_pl, +min(pickup_latitude) OVER w2 AS pc_min_pl, +avg(pickup_latitude) OVER w2 AS pc_avg_pl, +count(vendor_id) OVER w2 AS pc_cnt, +count(vendor_id) OVER w AS vendor_cnt +FROM t1 +WINDOW w AS (PARTITION BY vendor_id ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW), +w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW) INTO OUTFILE '/tmp/feature_data'; +``` + +```{note} +`SELECT INTO` is an asynchronous task. Please use the SHOW JOBS command to check the task's running status and wait for it to run successfully (transitioning to the FINISHED state) before proceeding to the next step. +``` + +### Step 5: Model Training + +1. Model training is not completed within OpenMLDB. Therefore, exit the OpenMLDB CLI first using the `quit` command. + + ``` + quit; + ``` + +2. On the regular command line, run `train.py` (located in the `/work/taxi-trip` directory) and use the open-source training tool `LightGBM` to train the model based on the offline feature table generated in the previous step. The training results are saved in `/tmp/model.txt`. 
+ + ```bash + python3 train.py /tmp/feature_data /tmp/model.txt + ``` + +### Step 6: Launch Feature Extraction SQL Script + +Assuming that the features designed in [Step 3: Feature Design](#step-3-feature-design) have produced the expected model in the previous training, the next step is to deploy the feature extraction SQL script online to provide online feature extraction services. + +1. Restart the OpenMLDB CLI for SQL online deployment: + +```bash +/work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client +``` + +2. Execute online deployment: + +```sql +--OpenMLDB CLI +USE demo_db; +SET @@execute_mode='online'; +DEPLOY demo SELECT trip_duration, passenger_count, +sum(pickup_latitude) OVER w AS vendor_sum_pl, +max(pickup_latitude) OVER w AS vendor_max_pl, +min(pickup_latitude) OVER w AS vendor_min_pl, +avg(pickup_latitude) OVER w AS vendor_avg_pl, +sum(pickup_latitude) OVER w2 AS pc_sum_pl, +max(pickup_latitude) OVER w2 AS pc_max_pl, +min(pickup_latitude) OVER w2 AS pc_min_pl, +avg(pickup_latitude) OVER w2 AS pc_avg_pl, +count(vendor_id) OVER w2 AS pc_cnt, +count(vendor_id) OVER w AS vendor_cnt +FROM t1 +WINDOW w AS (PARTITION BY vendor_id ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW), +w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW); +``` + +```note +DEPLOY contains BIAS OPTIONS here because importing data files into online storage will not be updated. For the current time, it may exceed the TTL (Time-To-Live) of the table index after DEPLOY, leading to the expiration of this data in the table. Time-based expiration relies solely on the 'ts' column and 'ttl' of each index. If the value in that column of the data is < (current time - abs_ttl), it will be expired for that index, irrespective of other factors. The different indexes also do not influence each other. If your data does not generate real-time new timestamps, you also need to consider including BIAS OPTIONS. +``` + +### Step 7: Import Online Data + +Firstly, switch to **online** execution mode. Then, in online mode, import the sample data from `/work/taxi-trip/data/taxi_tour_table_train_simple.csv` as online data for online feature computation. + +```sql +--OpenMLDB CLI +USE demo_db; +SET @@execute_mode='online'; +LOAD DATA INFILE 'file:///work/taxi-trip/data/taxi_tour_table_train_simple.csv' INTO TABLE t1 options(format='csv', header=true, mode='append'); +``` + +```{note} +`LOAD DATA` is an asynchronous task. Please use the SHOW JOBS command to monitor the task's progress and wait for it to successfully complete (transition to the FINISHED state) before proceeding to the next step. +``` + +### Step 8: Start Prediction Service + +1. If you have not already exited the OpenMLDB CLI, exit the OpenMLDB CLI first. + + ``` + quit; + ``` + +2. Start the estimation service on the regular command line: + + ```bash + ./start_predict_server.sh 127.0.0.1:9080 /tmp/model.txt + ``` + +### Step 9: Send Prediction Request + +Execute the built-in `predict.py` script from the regular command line. This script sends a request data line to the prediction service, receives the estimation results in return, and prints them out. + +```bash +# Run inference with a HTTP request +python3 predict.py +# The following output is expected (the numbers might be slightly different) +----------------ins--------------- +[[ 2. 40.774097 40.774097 40.774097 40.774097 40.774097 40.774097 + 40.774097 40.774097 1. 1. 
]] +---------------predict trip_duration ------------- +848.014745715936 s +``` diff --git a/docs/zh/app_ecosystem/feat_insight/faq.md b/docs/zh/app_ecosystem/feat_insight/faq.md new file mode 100644 index 00000000000..1d260f2bdbf --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/faq.md @@ -0,0 +1,38 @@ +# 常见问题 + +## FeatInsight 和主流 Feature Store 有什么区别? + +主流 Feature Store 包括 Feast、Tecton、Feathr 等提供了特征管理和计算能力,在线存储主要使用 Redis 等预聚合 Key-value 存储。FeatInsight 提供的是实时计算特征的能力,特征抽取方案无论怎样修改都可以直接一键上线而不需要重新上线和同步在线数据。主要的功能对比如下。 + +| 特征存储系统 | Feast | Tecton | Feathr | FeatInsight | +| ----------------- | ------------------ | ----------------- | ----------------- | ----------------- | +| 数据源支持 | 多种数据源 | 多种数据源 | 多种数据源 | 多种数据源 | +| 可扩展性 | 高 | 高 | 中到高 | 高 | +| 实时特征服务 | 支持 | 支持 | 支持 | 支持 | +| 批处理特征服务 | 支持 | 支持 | 支持 | 支持 | +| 特征转换 | 支持基本转换 | 支持复杂转换和 SQL | 支持复杂转换 | 支持复杂转换和 SQL | +| 数据存储 | 支持多种存储选项 | 主要支持云存储 | 支持多种存储选项 | 内置高性能时序数据库,支持多种存储选项 | +| 社区和支持 | 开源社区 | 商业支持 | 开源社区 | 开源社区 | +| 实时特征计算 | 不支持 | 不支持 | 不支持 | 支持 | + +## 部署 FeatInsight 是否需要 OpenMLDB ? + +需要,因为 FeatInsight 的元数据存储以及特征计算依赖 OpenMLDB 集群,因此部署 FeatInsight 需要提前部署 OpenMLDB 集群,也可以使用整合两者的 [Docker 镜像](./install/docker.md)一键部署。 + +使用 FeatInsight 后用户可以不依赖 OpenMLDB CLI 或 SDK 来实现特征的开发和上线,通过 Web 界面就可以完成特征工程的所有上线需求。 + +## 如何基于 FeatInsight 实现 MLOps 工作流? + +使用 FeatInsight 可以在 Web 前端完成数据库、数据表的创建,然后提交在线数据和离线数据的导入工作。使用 OpenMLDB SQL 语法进行数据的探索以及特征的创建,然后就可以离线特征的导出以及在线特征的一键上线,从 MLOps 对离线到在线流程不需要任何额外的开发工作,具体流程可参考[快速入门](./quickstart.md)。 + +## FeatInsight 的生态集成支持如何? + +FeatInsight 依托于 OpenMLDB 生态,支持与 OpenMLDB 生态中的其他组件进行集成。 + +例如与 OpenMLDB 生态中的数据集成组件进行集成,支持 [Kafka](../../integration/online_datasources/kafka_connector_demo.md)、[Pulsar](../../integration/online_datasources/pulsar_connector_demo.md)、[RocketMQ](../../integration/online_datasources/rocketmq_connector.md)、[Hive](../../integration/offline_data_sources/hive.md)、[Amazon S3](../../integration/offline_data_sources/s3.md),调度系统支持 [Airflow](../../integration/deploy_integration/airflow_provider_demo.md)、[DolphinScheduler](../../integration/deploy_integration/dolphinscheduler_task_demo.md)、[Byzer](../../integration/deploy_integration/OpenMLDB_Byzer_taxi.md) 等,对于 Spark Connector 支持的 HDFS、Iceberg 等和云计算相关的 Kubernetes、阿里云 MaxCompute 等也有一定程度的支持。 + +## FeatInsight 有什么业务价值和技术含量? 
+ +相比于使用 HDFS 存储离线数据、Redis 存储在线数据的简易版 Feature Store,FeatInsight 的价值在于使用了 OpenMLDB SQL 这种在线离线一致性的特征抽取语言。对于特征开发的科学家,只需要编写 SQL 逻辑就可以完成特征定义,在离线场景下这个 SQL 会被翻译成分布式 Spark 应用来执行,在在线场景下同样的 SQL 会被翻译成在线时序数据库的查询语句来执行,实现特征的在线和离线一致性。 + +目前 SQL 编译器、在线存储引擎、离线计算引擎都是基于 C++ 和 Scala 等编程语言实现的,对于非技术背景的科学家来说,使用 SQL 语言来定义特征开发流程,可以降低学习成本,提高开发效率。所有代码都是开源可用,OpenMLDB 项目地址 https://github.com/4paradigm/openmldb ,FeatInsight 项目地址 https://github.com/4paradigm/FeatInsight 。 diff --git a/docs/zh/app_ecosystem/feat_insight/functions/computed_features.md b/docs/zh/app_ecosystem/feat_insight/functions/computed_features.md new file mode 100644 index 00000000000..e64625c16f9 --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/functions/computed_features.md @@ -0,0 +1,23 @@ +# 预计算特征 + +## 介绍 + +用户可以通过预计算把特征值直接存入OpenMLDB在线表中,然后访问在线表数据进行读写特征。 + +OpenMLDB 提供了类似 Redis 的内存数据索引查询功能,可以快速高效得提取预先计算好的特征值,这个功能也和传统 Feature Store 的在线存储实现类似,但如果特征抽取逻辑修改了也需要在外部的预计算逻辑上提前修改。 + +![](../images/computed_feature_page.png) + +## 预览样本特征 + +用户可以直接选择特征表,然后选择需要提取的特征列,点击“预览样本特征”即可进行特征的预览。 + +![](../images/computed_feature_sample.png) + +注意,如果需要实现更复杂的特征计算逻辑,例如对特征列进行表达式计算,可以通过命令行或 SDK 来编写 OpenMLDB SQL 实现。 + +## 通过索引过滤 + +用户除了预览随机的样本特征,还可以通过数据表的索引来检索。首先选择表内包含的索引,如果想通过其他索引方式查询则可以使用 SQL 来创建新的索引,然后根据索引填写对应的值,这样就可以实现精确的预计算特征检索了。 + +![](../images/computed_feature_with_index.png) diff --git a/docs/zh/app_ecosystem/feat_insight/functions/import_data.md b/docs/zh/app_ecosystem/feat_insight/functions/import_data.md new file mode 100644 index 00000000000..75d85fd79e4 --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/functions/import_data.md @@ -0,0 +1,110 @@ + +# 数据导入 + +## 介绍 + +FeatInsight 在前端支持数据导入相关功能,功能如下: + +* 数据库管理 + * 创建数据库 + * 查看数据库 + * 删除数据库 +* 数据表管理 + * 创建数据表 + * 查看数据表 + * 删除数据表 +* 数据导入 + * 在线数据导入 + * 离线数据导入 + +用户可以在此页面完成特征工程需要的所有在线和离线数据表的创建和导入。 + +![](../images/import_data.png) + +## 创建数据库 + +在创建数据库页面,可以通过点击“创建数据库”按钮来创建新的数据库,只需要输入不重复的数据库名即可。 + +![](../images/create_database_form.png) + +在同一个页面展示了 OpenMLDB 的所有数据库列表,可以通过列表来查看数据库名称和执行删除操作。 + +![](../images/database_list.png) + +点击数据库名称,还可以查看数据库内详情,包括该数据库下所有数据表,还能进一步点击查看数据表详情。 + +![](../images/database_detail.png) + +## 创建数据表 + +在创建数据表页面,包含了“创建数据表”、“使用 SQL 创建表”、“使用 Parquet 创建表”和“使用 Hive 创建表”四种功能。 + +选择“创建数据表”按钮,只需要填写表名以及每一列的字段名和类型即可,简单易用但限制是无法直接指定索引。 + +![](../images/create_table_form.png) + +选择“使用 SQL 创建表”按钮,用户可以输入 SQL 语句来创建数据表,灵活性最强并且可以指定索引。 + +![](../images/create_table_from_sql.png) + +选择“使用 Parquet 创建表”按钮,用户可以输入 Parquet 文件路径来创建数据表,同样无法指定索引。 + +![](../images/create_table_from_parquet.png) + +选择“使用 Hive 创建表”按钮,用户可以输入 Hive 表名来创建数据表,同样无法指定索引。 + +![](../images/create_table_from_hive.png) + +在同一个页面展示了 OpenMLDB 的所有数据表列表,可以通过列表来查看数据表名称和执行删除操作。 + +![](../images/table_list.png) + +点击数据库名称,可以参看数据库详情,点击数据表名称,则查看数据表的详情信息,并且可以在详情页快速预览在线表的数据。 + +![](../images/table_detail.png) + +## 导入在线数据 + +在导入在线表页面,包含了“使用 SQL 导入”、“插入单行数据”、“使用 Parquet 导入”、“使用 CSV 导入”和“使用 Hive 导入”的五种功能。 + +选择“使用 SQL 导入”, 用户可以执行任意的“INSERT”或“LOAD DATA”语句来导入数据。 + +![](../images/import_online_from_sql.png) + +选择“插入单行数据”,用户可以手动输入单行数据来导入数据。 + +![](../images/import_online_from_insert.png) + +选择“使用 Parquet 导入”,用户可以输入 Parquet 文件路径来导入数据。 + +![](../images/import_online_from_parquet.png) + +选择”使用 CSV 导入”,用户可以输入 CSV 文件路径来导入数据。 + +![](../images/import_online_from_csv.png) + +选择“使用 Hive 导入”,用户可以输入 Hive 表名来导入数据。 + +![](../images/import_online_from_hive.png) + +## 导入离线数据 + +导入离线数据功能和导入在线数据功能类似,但暂时不支持插入单行离线数据,并且所有的导入任务都会切换到离线模式执行。 + +在导入离线表页面,包含了“使用 SQL 导入”、“使用 Parquet 导入”、“使用 CSV 导入”和“使用 Hive 导入”的四种功能。 + +选择“使用 SQL 导入”, 用户可以执行任意的“INSERT”或“LOAD DATA”语句来导入数据。 + 
+![](../images/import_offline_from_sql.png) + +选择“使用 Parquet 导入”,用户可以输入 Parquet 文件路径来导入数据。 + +![](../images/import_offline_from_parquet.png) + +选择”使用 CSV 导入”,用户可以输入 CSV 文件路径来导入数据。 + +![](../images/import_offline_from_csv.png) + +选择“使用 Hive 导入”,用户可以输入 Hive 表名来导入数据。 + +![](../images/import_offline_from_hive.png) diff --git a/docs/zh/app_ecosystem/feat_insight/functions/index.rst b/docs/zh/app_ecosystem/feat_insight/functions/index.rst new file mode 100644 index 00000000000..26d65eace5d --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/functions/index.rst @@ -0,0 +1,17 @@ +============================= +功能列表 +============================= + +.. toctree:: + :maxdepth: 1 + + import_data + manage_feature + online_scenario + offline_scenario + sql_playground + manage_center + computed_features + + sql_tool + diff --git a/docs/zh/app_ecosystem/feat_insight/functions/manage_center.md b/docs/zh/app_ecosystem/feat_insight/functions/manage_center.md new file mode 100644 index 00000000000..d785a48f4a3 --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/functions/manage_center.md @@ -0,0 +1,33 @@ +# 管理中心 + +## 介绍 + +FeatInsight 提供了对于数据表,特征,任务以及服务的管理功能,用户可以在管理中心查看和管理相关资源。 + +目前支持查看和管理的资源如下: + +* 特征:查看目前已创建的特征视图以及其中的单个特征。具体信息包括名称、对应数据库、SQL、依赖表等。 +* 数据表:查看目前已创建的数据库及数据表。数据表的具体信息包括schema、关联特征视图、关联特征服务等。 +* 离线样本:查看已导出的离线样本信息。具体信息包括特征名、导出路径、SQL、相关联的离线任务状态、日志等。 +* 离线任务:查看运行中/已运行的离线任务。具体信息包括任务类型、状态、运行日志等。 +* 特征服务:查看已上线的特征服务。具体信息包括版本、特征名、SQL、Deployment、相关联的特征及依赖表等。并提供服务测试界面。 + +## 特征管理 + +![](../images/features_page.png) + +## 数据表管理 + +![](../images/tables_page.png) + +## 离线样本管理 + +![](../images/offline_samples_page.png) + +## 离线任务管理 + +![](../images/offline_jobs_page.png) + +## 特征服务管理 + +![](../images/feature_services_page.png) diff --git a/docs/zh/app_ecosystem/feat_insight/functions/manage_feature.md b/docs/zh/app_ecosystem/feat_insight/functions/manage_feature.md new file mode 100644 index 00000000000..86f9361b96e --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/functions/manage_feature.md @@ -0,0 +1,50 @@ +# 特征管理 + +## 介绍 + +OpenMLDB 提供完备的特征生命周期管理功能,包含的功能如下: + +* 创建 + * 创建特征视图 + * 自动创建特征视图包含的特征 +* 查看 + * 查看特征视图详情 + * 查看特征详情 + * 预览样本特征 +* 删除 + * 删除特征视图 + * 自动删除特征视图包含的特征 + +![](../images/features_page.png) + +## 创建 + +用户想要创建特征,首先要创建一个特征视图,也就是用 SQL 定义的一组特征。 + +在特征页面,点击创建按钮,可以填写特征视图的名称,选择默认的数据库,然后填写要抽取特征的 SQL 语句,如果不了解 SQL 语法可点击旁边的“SQL 使用案例”来参考。 + +![](../images/create_feature_form1.png) + +填写完成后需要点击“分析 SQL”,这一步会对 SQL 语法进行校验,并且检查 SQL 语句是否符合上线需求。校验成功后会自动分析出 SQL 对应的特征数量和类型,用户可以在创建前给每个特征添加特征描述,方便后期检索。 + +![](../images/create_feature_form2.png) + +如果用户填写了错误的 SQL 语句,会提示“执行失败”,用户可以通过日志查看错误信息,并且修改 SQL 保证分析成功后才能创建。 + +## 查看 + +成功创建特征后,用户可以在所有特征列表和以及所有特征视图列表中查看系统中所有特征的详情信息。 + +点击特征视图的名称,可以查看特征视图的详情信息,包括创建时指定的 SQL 语句,以及关联的特征列表,还有依赖的数据表等。 + +![](../images/feature_view_detail.png) + +点击特征的名称,则可以查看特征的详细信息,包括特征所在的特征视图,提取单个特征自动生成的 SQL 语句,以及可以快速预览样本特征。 + +![](../images/feature_detail.png) + +## 删除 + +在特征视图列表中,有“删除”按钮,用户点击确认后可以删除对应的特征视图,并且自动删除特征视图包含的特征。注意删除操作一旦执行就无法撤回,请谨慎操作。 + +![](../images/delete_feature_view.png) diff --git a/docs/zh/app_ecosystem/feat_insight/functions/offline_scenario.md b/docs/zh/app_ecosystem/feat_insight/functions/offline_scenario.md new file mode 100644 index 00000000000..dbfb53cd110 --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/functions/offline_scenario.md @@ -0,0 +1,32 @@ + +# 离线场景 + +## 介绍 + +用户可以在此界面完成所有跟离线场景有关的需求,功能包括: + +* 创建离线样本 +* 查看离线样本 +* 删除离线样本 + +离线场景下所有导出任务都使用分布式计算,读取离线数据后进行特征抽取并导出成离线样本文本,使用开源的机器学习训练框架即可使用。 + +![](../images/offline_scenario.png) + +## 创建离线样本 + 
+用户点击创建按钮,可以创建和导出一次离线样本,用户只需要选择需要导出的特征和导出路径即可,选择的特征要求可参考[在线场景文档](./online_scenario.md)。 + +![](../images/create_offline_sample.png) + +除了基础参数,用户点击“更多选项”,还可以指定导出任务的 Spark 资源参数、导出的样本文件格式、导出模式等等。 + +## 查看离线样本 + +用户点击离线样本的编号,可以查看离线样本的详情信息,包括样本的导出路径、导出时间、导出模式、导出样本的文件格式等等,也可以查看关联的离线任务的状态和日志,只有离线任务执行成功过才可以开始使用离线样本文件。 + +![](../images/offline_sample_detail.png) + +## 删除离线样本 + +在离线样本列表有“删除”按钮,用户点击并确认后可以删除离线样本的记录,注意这里不会删除实际的样本文件,用户如果需要可以手动到导出路径处删除。删除操作同样时不可撤回,请谨慎操作。 diff --git a/docs/zh/app_ecosystem/feat_insight/functions/online_scenario.md b/docs/zh/app_ecosystem/feat_insight/functions/online_scenario.md new file mode 100644 index 00000000000..a84540eb7e1 --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/functions/online_scenario.md @@ -0,0 +1,55 @@ +# 在线场景 + +## 介绍 + +用户可以在此界面完成所有跟在线场景有关的需求,功能包括: + +* 创建特征服务 +* 查看特征服务 +* 设置服务版本 +* 请求特征服务 +* 删除特征服务 + +![](../images/online_scenario.png) + +## 创建特征服务 + +用户点击创建按钮,可以创建一个特征服务,需要填写特征服务名称(可重复)和服务版本,两者可以确定一个上线的服务实例,然后选择对应的特征。 + +选择特征时,可以直接选择特征视图名称,这样会自动包含特征视图内所有特征,也可以选择特征视图下某个特征,如果两者都选会自动合并而不会输出重复特征。 + +![](../images/create_feature_service.png) + +如果用户选择的特征来自于不同的特征视图,还需要填写一个主表主键,首先所有选择的特征视图必须有相同的主表,这里填写的就是相同主表的一个主键,可以是某一列的名称也可以是联合主键,必须保证主键的值在主表内是唯一的,否则上线后 SQL 合并会有计算错误。 + +![](../images/create_feature_service_with_keys.png) + +## 查看特征服务 + +用户点击特征服务名称,可以查看当前特征服务的详细信息,包括所有的服务版本。 + +![](../images/feature_service_detail.png) + +如果点击对应的服务版本,还可以查看对应服务版本的详细信息,包括上线部署使用的 SQL 语句,服务关联的特征列表以及依赖的数据表等。 + +![](../images/feature_service_version_detail.png) + +## 设置服务版本 + +在特征服务详情页,用户可以设置当前服务版本为最新版本。 + +注意,多个服务版本可以同时对外提供服务,如果用户请求特征服务的时候不指定服务版本,则会自动请求到最新版本。如果期望客户端不感知后端升级自动使用新版本特征服务,可以使用此功能,但需要保证不同版本特征服务返回结果一致。 + +## 请求特征服务 + +点击“请求特征服务”按钮,可以请求当前特征服务,并返回特征服务返回的结果。 + +![](../images/request_feature_service.png) + +这里提供表单模式和 JSON 模式,用户可以直接填写表单内容来请求特征服务,也可以参考上面提供的 JSON 示例来请求特征服务。如果使用 JSON 模式可以一次性请求多行数据,并且请求格式于 OpenMLDB APIServer 线上服务要求的格式一致。 + +## 删除特征服务 + +在特征服务列表页有“删除服务版本“按钮,每次只能删除一个版本,如果需要删除多个版本,需要多次点击所有服务版本的删除按钮。 + +注意,删除操作无法撤回,请务必谨慎操作。 diff --git a/docs/zh/app_ecosystem/feat_insight/functions/sql_playground.md b/docs/zh/app_ecosystem/feat_insight/functions/sql_playground.md new file mode 100644 index 00000000000..d4c8d14d207 --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/functions/sql_playground.md @@ -0,0 +1,25 @@ +# SQL 实验室 + +## 介绍 + +SQL 实验室提供了 OpenMLDB SQL 语句的调试和执行功能,方便用户执行任意 SQL 操作并调试特征抽取的 SQL 语句。 + +## 在线模式 + +在线模式下会请求在线数据库,可进行在线数据的增删改查。 + +![](../images/sql_playground_online.png) + +在线模式下查询结果会直接返回到前端暂时,注意避免因为超时或数据量太大导致的执行失败问题。 + +如果用户的 SQL 执行错误,可以通过返回的错误信息进行调试和排查。 + +![](../images/sql_playground_fail.png) + +## 离线模式 + +离线模式下会提交分布式执行的SQL,可进行离线探索或样本生成。 + +![](../images/sql_playground_offline.png) + +离线模式则会返回离线任务信息,可以点击任务ID查看任务的详情和日志。 diff --git a/docs/zh/app_ecosystem/feat_insight/functions/sql_tool.md b/docs/zh/app_ecosystem/feat_insight/functions/sql_tool.md new file mode 100644 index 00000000000..e9187e0c515 --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/functions/sql_tool.md @@ -0,0 +1,15 @@ +# 可视化 SQL 工具 + +## 介绍 + +由于OpenMLDB目前仅支持单条SQL语句生成特征组,我们提供了一个可视化的SQL工具,来方便用户通过拖拉拽SQL算子将多条SQL语句通过`WITH`语句生成一条复杂的SQL语句。 + +## 使用 + +可视化SQL工具的入口在创建特征页面中,如下图所示: + +![](../images/sql_tool_entry.png) + +使用SQL工具拖拽算子、连接、填入SQL后,可点击“预览生成SQL”生成对应的复杂SQL。如下图所示: + +![](../images/sql_tool_tutorial.png) diff --git a/docs/zh/app_ecosystem/feat_insight/images/bigscreen.png b/docs/zh/app_ecosystem/feat_insight/images/bigscreen.png new file mode 100644 index 00000000000..ff341bb40dc Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/bigscreen.png differ 
diff --git a/docs/zh/app_ecosystem/feat_insight/images/computed_feature_page.png b/docs/zh/app_ecosystem/feat_insight/images/computed_feature_page.png new file mode 100644 index 00000000000..b240611e047 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/computed_feature_page.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/computed_feature_sample.png b/docs/zh/app_ecosystem/feat_insight/images/computed_feature_sample.png new file mode 100644 index 00000000000..cffe3fc214a Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/computed_feature_sample.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/computed_feature_with_index.png b/docs/zh/app_ecosystem/feat_insight/images/computed_feature_with_index.png new file mode 100644 index 00000000000..12daca885c6 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/computed_feature_with_index.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/create_database_form.png b/docs/zh/app_ecosystem/feat_insight/images/create_database_form.png new file mode 100644 index 00000000000..3374877602e Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/create_database_form.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/create_feature_form1.png b/docs/zh/app_ecosystem/feat_insight/images/create_feature_form1.png new file mode 100644 index 00000000000..238fc4d7b16 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/create_feature_form1.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/create_feature_form2.png b/docs/zh/app_ecosystem/feat_insight/images/create_feature_form2.png new file mode 100644 index 00000000000..159cf1ba80b Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/create_feature_form2.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/create_feature_service.png b/docs/zh/app_ecosystem/feat_insight/images/create_feature_service.png new file mode 100644 index 00000000000..b47b80f90d8 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/create_feature_service.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/create_feature_service_with_keys.png b/docs/zh/app_ecosystem/feat_insight/images/create_feature_service_with_keys.png new file mode 100644 index 00000000000..1b428bfa0fb Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/create_feature_service_with_keys.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/create_offline_sample.png b/docs/zh/app_ecosystem/feat_insight/images/create_offline_sample.png new file mode 100644 index 00000000000..dad96e751bd Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/create_offline_sample.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/create_table_form.png b/docs/zh/app_ecosystem/feat_insight/images/create_table_form.png new file mode 100644 index 00000000000..801492953c3 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/create_table_form.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/create_table_from_hive.png b/docs/zh/app_ecosystem/feat_insight/images/create_table_from_hive.png new file mode 100644 index 00000000000..dd015060dcd Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/create_table_from_hive.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/create_table_from_parquet.png 
b/docs/zh/app_ecosystem/feat_insight/images/create_table_from_parquet.png new file mode 100644 index 00000000000..9b0f4a7d32b Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/create_table_from_parquet.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/create_table_from_sql.png b/docs/zh/app_ecosystem/feat_insight/images/create_table_from_sql.png new file mode 100644 index 00000000000..3813c993826 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/create_table_from_sql.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/create_test_feature_service.png b/docs/zh/app_ecosystem/feat_insight/images/create_test_feature_service.png new file mode 100644 index 00000000000..a00fc2da3c3 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/create_test_feature_service.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/create_test_featureview.png b/docs/zh/app_ecosystem/feat_insight/images/create_test_featureview.png new file mode 100644 index 00000000000..5dd19888f06 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/create_test_featureview.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/create_test_table.png b/docs/zh/app_ecosystem/feat_insight/images/create_test_table.png new file mode 100644 index 00000000000..08780718313 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/create_test_table.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/csv_import_test_table.png b/docs/zh/app_ecosystem/feat_insight/images/csv_import_test_table.png new file mode 100644 index 00000000000..0410ae42196 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/csv_import_test_table.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/database_detail.png b/docs/zh/app_ecosystem/feat_insight/images/database_detail.png new file mode 100644 index 00000000000..8097c1e2060 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/database_detail.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/database_list.png b/docs/zh/app_ecosystem/feat_insight/images/database_list.png new file mode 100644 index 00000000000..a739eb8d79f Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/database_list.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/delete_feature_view.png b/docs/zh/app_ecosystem/feat_insight/images/delete_feature_view.png new file mode 100644 index 00000000000..3fe4e6a95e1 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/delete_feature_view.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/export_test_offline_samples.png b/docs/zh/app_ecosystem/feat_insight/images/export_test_offline_samples.png new file mode 100644 index 00000000000..19c6c64e8eb Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/export_test_offline_samples.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/feature_detail.png b/docs/zh/app_ecosystem/feat_insight/images/feature_detail.png new file mode 100644 index 00000000000..018dff2d0af Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/feature_detail.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/feature_service_detail.png b/docs/zh/app_ecosystem/feat_insight/images/feature_service_detail.png new file mode 100644 index 00000000000..16344ea0956 Binary files /dev/null and 
b/docs/zh/app_ecosystem/feat_insight/images/feature_service_detail.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/feature_service_version_detail.png b/docs/zh/app_ecosystem/feat_insight/images/feature_service_version_detail.png new file mode 100644 index 00000000000..fd5430a9e51 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/feature_service_version_detail.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/feature_services_page.png b/docs/zh/app_ecosystem/feat_insight/images/feature_services_page.png new file mode 100644 index 00000000000..c3e91eb685f Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/feature_services_page.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/feature_view_detail.png b/docs/zh/app_ecosystem/feat_insight/images/feature_view_detail.png new file mode 100644 index 00000000000..eab8fdc0ff8 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/feature_view_detail.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/features_page.png b/docs/zh/app_ecosystem/feat_insight/images/features_page.png new file mode 100644 index 00000000000..fb15f55b871 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/features_page.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/ide_develop_featuer_platform.png b/docs/zh/app_ecosystem/feat_insight/images/ide_develop_featuer_platform.png new file mode 100644 index 00000000000..4e506ccf867 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/ide_develop_featuer_platform.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/import_data.png b/docs/zh/app_ecosystem/feat_insight/images/import_data.png new file mode 100644 index 00000000000..cf8092b6fc0 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/import_data.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/import_job_result.png b/docs/zh/app_ecosystem/feat_insight/images/import_job_result.png new file mode 100644 index 00000000000..378f0d6e182 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/import_job_result.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/import_offline_from_csv.png b/docs/zh/app_ecosystem/feat_insight/images/import_offline_from_csv.png new file mode 100644 index 00000000000..155ad2cb139 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/import_offline_from_csv.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/import_offline_from_hive.png b/docs/zh/app_ecosystem/feat_insight/images/import_offline_from_hive.png new file mode 100644 index 00000000000..2d8e5c4b774 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/import_offline_from_hive.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/import_offline_from_parquet.png b/docs/zh/app_ecosystem/feat_insight/images/import_offline_from_parquet.png new file mode 100644 index 00000000000..3c2ddec0ed7 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/import_offline_from_parquet.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/import_offline_from_sql.png b/docs/zh/app_ecosystem/feat_insight/images/import_offline_from_sql.png new file mode 100644 index 00000000000..e3adc2f7300 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/import_offline_from_sql.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/import_online_from_csv.png 
b/docs/zh/app_ecosystem/feat_insight/images/import_online_from_csv.png new file mode 100644 index 00000000000..b9bb99f07ec Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/import_online_from_csv.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/import_online_from_hive.png b/docs/zh/app_ecosystem/feat_insight/images/import_online_from_hive.png new file mode 100644 index 00000000000..d704c533bb4 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/import_online_from_hive.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/import_online_from_insert.png b/docs/zh/app_ecosystem/feat_insight/images/import_online_from_insert.png new file mode 100644 index 00000000000..94f7f419818 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/import_online_from_insert.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/import_online_from_parquet.png b/docs/zh/app_ecosystem/feat_insight/images/import_online_from_parquet.png new file mode 100644 index 00000000000..cbcfcf78f2a Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/import_online_from_parquet.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/import_online_from_sql.png b/docs/zh/app_ecosystem/feat_insight/images/import_online_from_sql.png new file mode 100644 index 00000000000..28c7251b5d7 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/import_online_from_sql.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/local_test_offline_samples.png b/docs/zh/app_ecosystem/feat_insight/images/local_test_offline_samples.png new file mode 100644 index 00000000000..dcb9aeff76b Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/local_test_offline_samples.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/offline_jobs_page.png b/docs/zh/app_ecosystem/feat_insight/images/offline_jobs_page.png new file mode 100644 index 00000000000..92efd33b652 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/offline_jobs_page.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/offline_sample_detail.png b/docs/zh/app_ecosystem/feat_insight/images/offline_sample_detail.png new file mode 100644 index 00000000000..de9c6c2f6b9 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/offline_sample_detail.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/offline_samples_page.png b/docs/zh/app_ecosystem/feat_insight/images/offline_samples_page.png new file mode 100644 index 00000000000..4befb2796c0 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/offline_samples_page.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/offline_scenario.png b/docs/zh/app_ecosystem/feat_insight/images/offline_scenario.png new file mode 100644 index 00000000000..80f06cda445 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/offline_scenario.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/online_csv_import_test_table.png b/docs/zh/app_ecosystem/feat_insight/images/online_csv_import_test_table.png new file mode 100644 index 00000000000..c212a442d84 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/online_csv_import_test_table.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/online_scenario.png b/docs/zh/app_ecosystem/feat_insight/images/online_scenario.png new file mode 100644 index 00000000000..df39b93d558 Binary files /dev/null 
and b/docs/zh/app_ecosystem/feat_insight/images/online_scenario.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/preview_test_features.png b/docs/zh/app_ecosystem/feat_insight/images/preview_test_features.png new file mode 100644 index 00000000000..275304ce378 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/preview_test_features.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/preview_test_table.png b/docs/zh/app_ecosystem/feat_insight/images/preview_test_table.png new file mode 100644 index 00000000000..3bb32c63b47 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/preview_test_table.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/request_feature_service.png b/docs/zh/app_ecosystem/feat_insight/images/request_feature_service.png new file mode 100644 index 00000000000..33cdc1ac2de Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/request_feature_service.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/request_test_feature_service.png b/docs/zh/app_ecosystem/feat_insight/images/request_test_feature_service.png new file mode 100644 index 00000000000..008f4dd81eb Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/request_test_feature_service.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/sql_playground_fail.png b/docs/zh/app_ecosystem/feat_insight/images/sql_playground_fail.png new file mode 100644 index 00000000000..ae77fb35844 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/sql_playground_fail.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/sql_playground_offline.png b/docs/zh/app_ecosystem/feat_insight/images/sql_playground_offline.png new file mode 100644 index 00000000000..5a8b51c148a Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/sql_playground_offline.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/sql_playground_online.png b/docs/zh/app_ecosystem/feat_insight/images/sql_playground_online.png new file mode 100644 index 00000000000..a5ded8ea3ec Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/sql_playground_online.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/sql_tool_entry.png b/docs/zh/app_ecosystem/feat_insight/images/sql_tool_entry.png new file mode 100644 index 00000000000..0dc479f859f Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/sql_tool_entry.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/sql_tool_tutorial.png b/docs/zh/app_ecosystem/feat_insight/images/sql_tool_tutorial.png new file mode 100644 index 00000000000..bafcd166450 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/sql_tool_tutorial.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/table_detail.png b/docs/zh/app_ecosystem/feat_insight/images/table_detail.png new file mode 100644 index 00000000000..d10fcbb31d2 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/table_detail.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/table_list.png b/docs/zh/app_ecosystem/feat_insight/images/table_list.png new file mode 100644 index 00000000000..bfa97f9d0a6 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/table_list.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/tables_page.png b/docs/zh/app_ecosystem/feat_insight/images/tables_page.png new file mode 100644 index 
00000000000..48b8dc53bc5 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/tables_page.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/test_feature_service_detail.png b/docs/zh/app_ecosystem/feat_insight/images/test_feature_service_detail.png new file mode 100644 index 00000000000..89c5f12197c Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/test_feature_service_detail.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/test_features_list.png b/docs/zh/app_ecosystem/feat_insight/images/test_features_list.png new file mode 100644 index 00000000000..6ad5b8da86d Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/test_features_list.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/images/test_offline_sample_detail.png b/docs/zh/app_ecosystem/feat_insight/images/test_offline_sample_detail.png new file mode 100644 index 00000000000..61ff9d843f1 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/images/test_offline_sample_detail.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/index.rst b/docs/zh/app_ecosystem/feat_insight/index.rst new file mode 100644 index 00000000000..296bfd07586 --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/index.rst @@ -0,0 +1,13 @@ +============================= +FeatInsight +============================= + +.. toctree:: + :maxdepth: 1 + + introduction + quickstart + install/index + functions/index + use_cases/index + faq diff --git a/docs/zh/app_ecosystem/feat_insight/install/config_file.md b/docs/zh/app_ecosystem/feat_insight/install/config_file.md new file mode 100644 index 00000000000..17d8539540d --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/install/config_file.md @@ -0,0 +1,30 @@ +# FeatInsight 配置文件 + +## 介绍 + +FeatInsight 基于 Spring Boot 开发,使用 `application.yml` 规范作为配置文件。 + +## 配置示例 + +简化版配置示例如下: + +``` +server: + port: 8888 + +openmldb: + zk_cluster: 127.0.0.1:2181 + zk_path: /openmldb + apiserver: 127.0.0.1:9080 +``` + +## 配置项目 + + +| 配置项目 | 介绍 | 类型 | 示例 | +| ------- | --- | --- | ---- | +| server.port | 服务端口 | int | 8888 | +| openmldb.zk_cluster | ZooKeeper 集群地址 | string | 127.0.0.1:2181 | +| openmldb.zk_path | OpenMLDB 根路径 | string | /openmldb | +| openmldb.apiserver | OpenMLDB APIServer 地址 | string | 127.0.0.1:9080 | +| openmldb.skip_index_check | 是否跳过索引检查 | boolean | false | \ No newline at end of file diff --git a/docs/zh/app_ecosystem/feat_insight/install/docker.md b/docs/zh/app_ecosystem/feat_insight/install/docker.md new file mode 100644 index 00000000000..3625d61bd47 --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/install/docker.md @@ -0,0 +1,46 @@ +# Docker + +## 介绍 + +使用官方构建好的 Docker 镜像, 可以快速部署 OpenMLDB 特征服务. + +## 内置 OpenMLDB 镜像 + +使用内置 OpenMLDB 的镜像,可以一键启动 OpenMLDB 集群和 OpenMLDB 特征服务,无需额外部署即可使用特征服务。 + +``` +docker run -d -p 8888:8888 registry.cn-shenzhen.aliyuncs.com/tobe43/portable-openmldb +``` + +启动 OpenMLDB 和 FeatInsight 需要约一分钟,可通过 `docker logs` 查看日志,启动成功后在本地浏览器打开 `http://127.0.0.1:8888` 即可访问 FeatInsight 服务。 + + +## 不包含 OpenMLDB 镜像 + +使用不包含 OpenMLDB 的镜像,需要提前部署 OpenMLDB 集群,然后启动 OpenMLDB 特征服务容器,部署步骤较繁琐但灵活性高。 + +首先参考 [OpenMLDB 部署文档](../../../deploy/index.rst) 提前部署 OpenMLDB 集群。 + +然后参考 [FeatInsight 配置文件](./config_file.md),创建 `application.yml` 配置文件。 + +``` +server: + port: 8888 + +openmldb: + zk_cluster: 127.0.0.1:2181 + zk_path: /openmldb + apiserver: 127.0.0.1:9080 +``` + +对于 Linux 操作系统可以使用下面命令启动 FeatInsight 容器. 
+ +``` +docker run -d -p 8888:8888 --net=host -v `pwd`/application.yml:/app/application.yml registry.cn-shenzhen.aliyuncs.com/tobe43/featinsight +``` + +由于 MacOS 通过虚拟机启动 Docker 容器,使用 `--net=host` 参数无法正常工作,需要提前修改配置文件指向正确的 OpenMLDB 服务。 + +``` +docker run -d -p 8888:8888 -v `pwd`/application.yml:/app/application.yml registry.cn-shenzhen.aliyuncs.com/tobe43/featinsight +``` diff --git a/docs/zh/app_ecosystem/feat_insight/install/index.rst b/docs/zh/app_ecosystem/feat_insight/install/index.rst new file mode 100644 index 00000000000..9c23f1f778d --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/install/index.rst @@ -0,0 +1,13 @@ +============================= +安装部署 +============================= + +.. toctree:: + :maxdepth: 1 + + docker + package + source + config_file + upgrade + diff --git a/docs/zh/app_ecosystem/feat_insight/install/package.md b/docs/zh/app_ecosystem/feat_insight/install/package.md new file mode 100644 index 00000000000..73b9188104d --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/install/package.md @@ -0,0 +1,38 @@ +# 安装包 + +## 介绍 + +使用官方预编译的安装包,只需要本地有 Java 环境就可以快速部署 FeatInsight 服务。 + +注意,需参考 [OpenMLDB 部署文档](../../../deploy/index.rst) 提前部署 OpenMLDB 集群。 + +## 下载 + +下载 Jar 文件。 + +``` +wget https://openmldb.ai/download/featinsight/featinsight-0.1.0-SNAPSHOT.jar +``` + +## 配置 + +参考 [FeatInsight 配置文件](./config_file.md),创建 `application.yml` 配置文件。 + +``` +server: + port: 8888 + +openmldb: + zk_cluster: 127.0.0.1:2181 + zk_path: /openmldb + apiserver: 127.0.0.1:9080 +``` + +## 启动 + +启动 FeatInsight 服务。 + +``` +java -jar ./featinsight-0.1.0-SNAPSHOT.jar +``` + diff --git a/docs/zh/app_ecosystem/feat_insight/install/source.md b/docs/zh/app_ecosystem/feat_insight/install/source.md new file mode 100644 index 00000000000..875d843817d --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/install/source.md @@ -0,0 +1,39 @@ +# 源码编译 + +## 介绍 + +通过源码编译 FeatInsight 项目,可以按需使用特定源码功能,本文档提供源码编译的完整流程。 + +## 下载源码 + +下载项目源码。 + +``` +git clone https://github.com/4paradigm/FeatInsight +``` + +## 编译源码 + +进入项目根目录,执行以下命令编译前端和后端代码。 + +``` +cd ./FeatInsight/frontend/ +npm run build + +cd ../ +mvn clean package +``` + +## 启动服务 + +部署 OpenMLDB 集群并配置好配置文件后,可以使用下面命令启动服务。 + +``` +./start_server.sh +``` + +## 使用 IDE 开发 + +如果使用 IDE 开发,修改 `application.yml` 配置文件,找到 `HtttpServer.java` 类直接启动即可。 + +![](../images/ide_develop_featuer_platform.png) diff --git a/docs/zh/app_ecosystem/feat_insight/install/upgrade.md b/docs/zh/app_ecosystem/feat_insight/install/upgrade.md new file mode 100644 index 00000000000..1155258a3ec --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/install/upgrade.md @@ -0,0 +1,11 @@ +# 版本升级 + +## 介绍 + +FeatInsight 对外提供 HTTP 接口,底层依赖 OpenMLDB 数据库存储元数据,因此可以通过多实例和 Rolling update 等方法进行版本升级。 + +## 单实例升级步骤 + +1. 下载新版本的 OpenMLDB 安装包或 Docker 镜像。 +2. 停止当前正在运行的 OpenMLDB 特征服务实例。 +3. 
基于新版本 OpenMLDB 特征服务包启动新实例。 diff --git a/docs/zh/app_ecosystem/feat_insight/introduction.md b/docs/zh/app_ecosystem/feat_insight/introduction.md new file mode 100644 index 00000000000..49f477d6f41 --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/introduction.md @@ -0,0 +1,40 @@ +# 简介 + +FeatInsight 是一个先进的特征存储(Feature Store)服务,基于 [OpenMLDB](https://github.com/4paradigm/OpenMLDB) 数据库实现高效的特征管理和编排功能。 + +FeatInsight 提供简便易用的 UI 界面,用户可以进行机器学习特征开发的全流程,包括数据的导入、查看、编辑,特征的生成、存储、上线等功能。 针对离线场景中,用户可以选择特征生成离线样本用于后续的机器学习开发;针对在线场景中,用户可以选择特征创建特征服务,实现实时特征计算。 + +![](./images/bigscreen.png) + +## 主要功能 + +FeatInsight 包括以下几个主要功能: + +- [数据管理](./functions/import_data.md):用于导入和管理特征工程需要的原始数据和在线数据。 +- [特征管理](./functions/manage_feature.md):用于存储原始特征数据和派生特征数据的存储系统。 +- [在线场景](./functions/online_scenario.md):上线特征服务,使用在线数据提供硬实时的在线特征抽取接口。 +- [离线场景](./functions/offline_scenario.md):对离线数据进行特征计算并导出样本文件,提供离线样本、任务管理功能。 +- [SQL实验室](./functions/sql_playground.md):可调试和执行任意的 OpenMLDB SQL 语句,使用在线模式或离线模型完成特征计算任务。 +- [预计算特征](./functions/computed_features.md):用户可以通过预计算把特征值直接存入 OpenMLDB 在线表中,然后访问在线表数据进行读写特征。 + +## 核心特性 + +FeatInsight 的主要目的是解决在机器学习项目中常见的问题,包括简便快捷地进行特征提取、转换、组合、选择以及血缘管理,特征的重用和共享,特征服务版本控制,以及确保在训练和推理过程中使用的特征数据的一致和可靠。一些 FeatInsight 的范例应用场景包括: + +* 上线在线特征服务:提供本地化部署的高性能特征存储和在线特征计算功能。 +* 搭建 MLOps 平台:基于 OpenMLDB 在线离线一致性快速实现完成的 MLOps 工作流。 +* 搭建 FeatureStore 平台:提供完备的特征创建、删除、上线、血缘管理等功能,低成本实现本地 FeatureStore 服务。 +* 复用开源特征方案:在本地复用开源共建的特征数据集,实现特征重用和共享。 +* 机器学习业务组件:为推荐系统、自然语言处理、金融医疗等领域机器学习模型提供一站式特征工程落地方案。 + +## 核心概念 + +以下是 FeatInsight 所使用到的一些术语及其定义,以方便理解: + +* 特征:通过对原始数据进行特征抽取得到的可直接用于模型训练和推理的数据。 +* 预计算特征:通过外部批计算或流式处理后存储的特征值,可直接上线使用。 +* 特征视图:通过单个SQL计算语句定义的一组特征。 +* 特征服务:将单个或多个特征组成一个特征服务,提供给在线场景使用。 +* 在线场景:通过上线特征服务,使用在线数据提供硬实时的在线特征抽取接口。 +* 离线场景:使用分布式计算,对离线数据进行特征计算并导出机器学习所需的样本文件。 +* 在线离线一致性:通过相同的SQL定义可保证在线场景和离线场景计算的特征结果一致。 diff --git a/docs/zh/app_ecosystem/feat_insight/quickstart.md b/docs/zh/app_ecosystem/feat_insight/quickstart.md new file mode 100644 index 00000000000..88a255b198c --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/quickstart.md @@ -0,0 +1,116 @@ +# 快速入门 + +本文将介绍如何快速入门 FeatInsight,基于一个 SQL 示例来演示如何使用。 + +安装部署可参考 [OpenMLDB 部署文档](../../../deploy/index.rst) 和 [FeatInsight 部署文档](./install/index.rst)。 + +## 使用流程 + +FeatInsight 的基本使用流程包括以下几个步骤: + +1. 导入数据:使用SQL命令或前端表单进行创建数据库、创建数据表、导入在线数据和导入离线数据等操作。 +2. 创建特征:使用SQL语句来定义特征视图,FeatInsight 使用SQL编译器进行特征分析并创建对应的特征。 +3. 离线场景:选择想要导入的特征,可以同时选择不同特征视图的特征,并使用分布式计算把样本文件导入到本地或分布式存储。 +4. 在线场景:选择想要上线的特征,一键发布成在线特征抽取服务,然后可使用HTTP客户端进行请求和返回在线特征抽取结果。 + +### 1. 导入数据 + +首先创建数据库`test_db`表,和数据表`test_table`,可以直接执行 SQL 来创建。 + +``` +CREATE DATABASE test_db; + +CREATE TABLE test_db.test_table (id STRING, trx_time DATE); +``` + +也可以在 FeatInsight 的“数据导入”前端页面直接创建。 + +![](./images/create_test_table.png) + +为了测试方便,我们准备一个 CSV 文件并保存到 `/tmp/test_table.csv`。注意,这里本地是 OpenMLDB TaskManager 服务器的本地路径,一般也是 FeatInsight 的服务器路径,需要提前登陆编辑。 + +``` +id,trx_time +user1,2024-01-01 +user2,2024-01-02 +user3,2024-01-03 +user4,2024-01-04 +user5,2024-01-05 +user6,2024-01-06 +user7,2024-01-07 +``` + +在线数据可以使用 `LOAD DATA` 或 `INSERT` 命令来导入,这里演示通过点击 “使用 CSV 导入” 来执行。 + +![](./images/online_csv_import_test_table.png) + +通过前端页面可以预览已导入的在线数据。 + +![](./images/preview_test_table.png) + +离线数据也可以使用 `LOAD DATA` 命令或前端选择“使用 CSV 导入”来执行。 + +![](./images/csv_import_test_table.png) + +等待半分钟后导入任务完成,可以查看任务的状态以及日志。 + +![](./images/import_job_result.png) + +### 2. 
创建特征 + +数据导入完成后,可以开始创建特征,本示例使用 SQL 来创建两个基本特征。 + +``` +SELECT id, dayofweek(trx_time) as trx_day FROM test_table +``` + +在”特征“页面选择“创建特征”,填写特征组名称以及 SQL 语句。 + +![](./images/create_test_featureview.png) + +创建完成后,可以在“特征”页面查看成功创建的特征。 + +![](./images/test_features_list.png) + +点击特征名称,进入特征详情页,可以查看特征基础信息,并提供特征预览功能。 + +![](./images/preview_test_features.png) + +### 3. 生成离线样本 + +在“离线场景”页面,可以选择导出离线样本,只要选择刚创建好的特征和提供导出路径即可,前端还提供了“更多选项”可以选择到处格式、运行参数等。 + +![](./images/export_test_offline_samples.png) + +提交导出任务后,可以在“离线样本”详情页查看导出信息,大概半分钟后成功完成。 + +![](./images/test_offline_sample_detail.png) + +在本地即可查看导出的样本文件内容。为了验证 FeatInsight 提供的在线离线一致性,可记录离线特征结果,并于后面的在线特征计算做比较。 + +![](./images/local_test_offline_samples.png) + +### 4. 创建在线服务 + +在“特征服务”页面可以选择创建特征服务,同样是只需要选择上线的特征,以及提供特征服务名称和版本即可。 + +![](./images/create_test_feature_service.png) + +创建成功后,可以在特征服务详情页查看到服务的基本信息,上线包含的特征列表,以及依赖数据表的血缘关系等。 + +![](./images/test_feature_service_detail.png) + +最后通过“请求特征服务”页面,我们可以输入测试数据进行在线特征计算,并且和离线样本的特征结果进行比对。 + +![](./images/request_test_feature_service.png) + +## 总结 + +本示例演示了使用 FeatInsight 的完整流程,通过编写简单的 SQL 即可实现在线和离线的特征定义,通过选择不同的特征,甚至是组合不同特征组的特征,即可实现快速的特征复用和上线,并且对比离线和在线的计算结果验证了特征计算的一致性。 + +## 附录:高级功能 + +除了特征工程的基本功能之外,FeatInsight 还提供了高级功能以方便用户进行特征工程的开发: + +* SQL 实验室:提供了 OpenMLDB SQL 语句的调试和执行功能,方便用户执行任意 SQL 操作并调试特征抽取的 SQL 语句。详情请见[这里](./functions/sql_playground)。 +* 预计算特征:可以将通过外部批计算或流式处理后得到的特征值直接存入OpenMLDB在线表中,然后访问在线表数据进行读写特征。详情请见[这里](./functions/computed_features)。 \ No newline at end of file diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_create_feature.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_create_feature.png new file mode 100644 index 00000000000..ad6b53b0ab3 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_create_feature.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_create_feature_service.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_create_feature_service.png new file mode 100644 index 00000000000..83a6d260ddd Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_create_feature_service.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_create_tables.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_create_tables.png new file mode 100644 index 00000000000..156ad25b474 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_create_tables.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_feature_view_detail.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_feature_view_detail.png new file mode 100644 index 00000000000..2cdd1bcb4a2 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_feature_view_detail.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_request_feature_service.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_request_feature_service.png new file mode 100644 index 00000000000..e731ee79a5a Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/recommend_request_feature_service.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_create_feature.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_create_feature.png new file mode 100644 index 00000000000..a01520f8464 Binary files /dev/null and 
b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_create_feature.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_create_feature_service.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_create_feature_service.png new file mode 100644 index 00000000000..8296a3907e1 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_create_feature_service.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_create_table.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_create_table.png new file mode 100644 index 00000000000..86a098c491f Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_create_table.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_export_offline_samples.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_export_offline_samples.png new file mode 100644 index 00000000000..13ae5329408 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_export_offline_samples.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_feature_service_detail.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_feature_service_detail.png new file mode 100644 index 00000000000..79426e410b2 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_feature_service_detail.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_features.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_features.png new file mode 100644 index 00000000000..aeb439453c2 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_features.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_import_offline_data.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_import_offline_data.png new file mode 100644 index 00000000000..0b69b9cb00a Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_import_offline_data.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_import_online_data.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_import_online_data.png new file mode 100644 index 00000000000..9d8fb565873 Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_import_online_data.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_offline_samples_data.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_offline_samples_data.png new file mode 100644 index 00000000000..a02f861f0ec Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_offline_samples_data.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_preview_online_table.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_preview_online_table.png new file mode 100644 index 00000000000..ed086f6fb9b Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_preview_online_table.png differ diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_request_feature_service.png b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_request_feature_service.png new file mode 100644 index 00000000000..0a72b79170b Binary files /dev/null and b/docs/zh/app_ecosystem/feat_insight/use_cases/images/taxi_request_feature_service.png differ diff --git 
a/docs/zh/app_ecosystem/feat_insight/use_cases/index.rst b/docs/zh/app_ecosystem/feat_insight/use_cases/index.rst new file mode 100644 index 00000000000..cae20e85faa --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/use_cases/index.rst @@ -0,0 +1,9 @@ +============================= +应用案例 +============================= + +.. toctree:: + :maxdepth: 1 + + taxi_tour_duration_prediction + recommend_system \ No newline at end of file diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/recommend_system.md b/docs/zh/app_ecosystem/feat_insight/use_cases/recommend_system.md new file mode 100644 index 00000000000..1d071e5c34e --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/use_cases/recommend_system.md @@ -0,0 +1,113 @@ +# 电商推荐系统物料统计场景 + +## 场景介绍 + +在常见的电商推荐系统中,需对每次推荐请求前的特定时间段内(近7天),用户对各类标签广告的浏览次数进行精确统计,这些统计数据将被反馈给推荐系统,以便进行更深入的规则分析和判断。 + +## 场景数据 + +这里准备3张数据表,首先是请求数据表,用户通过 ID 以及请求时间查询当前窗口所需的特征。 + +``` +CREATE TABLE recommend_system.request (uid string, event_time timestamp) +``` + +然后是曝光表,需要提供用户 ID 以及物料 ID 信息,为了简化把其他无关的列都去掉。 + +``` +CREATE TABLE recommend_system.feeds (uid string, material_id string, event_time timestamp) +``` + +最后是物料表,主要包含物料基本信息,包括本场景需要统计的物料类型等,同样简化把无关的字段先去掉。 + +``` +CREATE TABLE recommend_system.material (material_id string, tag string); +``` + +## 特征设计 + +根据场景的背景描述,只需要提取用户 ID 以及物料的不同标签出现的次数即可,使用以下的 OpenMLDB SQL 进行特征抽取。 + +``` +SELECT + uid, + count_cate(material_id, tag) OVER w AS category_count +FROM + (SELECT uid, CAST (null AS string) AS material_id, CAST (null AS string) AS tag, event_time FROM request) +WINDOW + w AS ( + UNION ( + SELECT + uid, feeds.material_id, material.tag AS tag, event_time + FROM feeds + LAST JOIN material ON feeds.material_id = material.material_id) + PARTITION BY uid ORDER BY event_time ROWS_RANGE BETWEEN 7d PRECEDING AND CURRENT ROW) +``` + +可以参考下面的逻辑来理解 SQL 语句的含义: + +1. 将曝光表与物料表进行 Join 操作,这样拼接后的表就可以获得物料的标签类型等需要的属性。 +2. 对请求表进行拓展,增加 material_id 和 tag 列并使用 null 值填充,这样方便后续与第一步的输出表进行 Union 操作。 +3. 使用 Window Union 将第一步和第二步的表进行 Union 操作,这样就得到了一个完整的表,然后基于这个完整表进行窗口操作和查询操作。注意,这里使用 Window Union 而不是 Join + Window 是为了避免 Left Join 可能一行数据产生多行样本,而使用 Last Join 则可能导致副表只能拼接一行数据。 +4. 最后使用 count_cate 函数对物料标签进行计数,得到特征。 + +## 实现流程 + +### 1. 数据导入 + +首先创建数据库和数据表,为了方便上线这里把索引也提前加上了。 + +``` +CREATE DATABASE recommend_system; + +CREATE TABLE recommend_system.request (uid string, event_time timestamp, INDEX(key=uid, TS=event_time)); + +CREATE TABLE recommend_system.feeds (uid string, material_id string, event_time timestamp, INDEX(key=uid, TS=event_time)); + +CREATE TABLE recommend_system.material (material_id string, tag string); +``` + +因为实际数据需要脱敏,用户可以根据实际情况进行测试数据的导入,本文只演示特征上线流程。 + +### 2. 定义特征 + +使用前面介绍的 SQL 语句定义特征。 + +``` +SELECT + uid, + count_cate(material_id, tag) OVER w AS category_count +FROM + (SELECT uid, CAST (null AS string) AS material_id, CAST (null AS string) AS tag, event_time FROM request) +WINDOW + w AS ( + UNION ( + SELECT + uid, feeds.material_id, material.tag AS tag, event_time + FROM feeds + LAST JOIN material ON feeds.material_id = material.material_id) + PARTITION BY uid ORDER BY event_time ROWS_RANGE BETWEEN 7d PRECEDING AND CURRENT ROW) +``` + +在前端页面创建特征,并自动分析出需要创建的两个特征。 + +![](./images/recommend_create_feature.png) + +创建成功后可以通过特征视图查看详情。 + +![](./images/recommend_feature_view_detail.png) + +### 3. 
特征上线 + +在在线场景页面,选择需要上线的特征,并确认创建。 + +![](./images/recommend_create_feature_service.png) + +特征服务上线成功后,就可以通过输入请求数据进行在线请求测试了。 + +![](./images/recommend_request_feature_service.png) + +## 总结 + +对于推荐系统模型来说,特征工程是非常重要的一环,FeatInsight 提供了一个简单快速的特征管理和特征上线流程,帮助用户快速上线特征,提升推荐系统的效果,对于更复杂的特征也都可以使用 SQL 来描述和上线。 + diff --git a/docs/zh/app_ecosystem/feat_insight/use_cases/taxi_tour_duration_prediction.md b/docs/zh/app_ecosystem/feat_insight/use_cases/taxi_tour_duration_prediction.md new file mode 100644 index 00000000000..8a948c368ed --- /dev/null +++ b/docs/zh/app_ecosystem/feat_insight/use_cases/taxi_tour_duration_prediction.md @@ -0,0 +1,104 @@ +# 出租车行程时间预测场景 + +## 场景介绍 + +场景来自 Kaggle 的 [New York City Taxi Trip Duration](https://www.kaggle.com/c/nyc-taxi-trip-duration/overview), 对纽约市出租车公司的行程时间进行预测,预测的输入为出发地经纬度、目的地经纬度、出发时间、天气情况等,需要抽取特征最终预测出行程时间。 + +## 特征设计 + +特征设计参考 [出租车行程时间预测 (OpenMLDB + LightGBM)](../../../use_case/taxi_tour_duration_prediction.md),使用下面的 OpenMLDB SQL 进行特征工程和数据导出。 + +``` +SELECT + trip_duration, + passenger_count, + sum(pickup_latitude) OVER w AS vendor_sum_pl, + max(pickup_latitude) OVER w AS vendor_max_pl, + min(pickup_latitude) OVER w AS vendor_min_pl, + avg(pickup_latitude) OVER w AS vendor_avg_pl, + sum(pickup_latitude) OVER w2 AS pc_sum_pl, + max(pickup_latitude) OVER w2 AS pc_max_pl, + min(pickup_latitude) OVER w2 AS pc_min_pl, + avg(pickup_latitude) OVER w2 AS pc_avg_pl, + count(vendor_id) OVER w2 AS pc_cnt, + count(vendor_id) OVER w AS vendor_cnt +FROM t1 +WINDOW + w AS (PARTITION BY vendor_id ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW), + w2 AS (PARTITION BY passenger_count ORDER BY pickup_datetime ROWS_RANGE BETWEEN 1d PRECEDING AND CURRENT ROW) +``` + +## 实现流程 + +### 1. 数据导入 + +创建测试数据库 `taxi_trip_duration` 和测试数据表 `t1`。 + +``` +CREATE DATABASE taxi_trip_duration; + +CREATE TABLE taxi_trip_duration.t1 (id string, vendor_id int, pickup_datetime timestamp, dropoff_datetime timestamp, passenger_count int, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, store_and_fwd_flag string, trip_duration int); +``` + +![](./images/taxi_create_table.png) + +注意,在 OpenMLDB 0.8.4及前序版本不支持自动创建索引,因此需要在创建表时添加索引。 + +``` +CREATE TABLE taxi_trip_duration.t1(id string, vendor_id int, pickup_datetime timestamp, dropoff_datetime timestamp, passenger_count int, pickup_longitude double, pickup_latitude double, dropoff_longitude double, dropoff_latitude double, store_and_fwd_flag string, trip_duration int, INDEX(KEY=vendor_id, TS=pickup_datetime), INDEX(KEY=passenger_count, TS=pickup_datetime)); +``` + +然后从 Kaggle 下载数据集进行导入数据,下载命令如下。 + +``` +kaggle competitions download -c nyc-taxi-trip-duration +``` + +下载后解压得到 `train.csv` 文件,放在 `/tmp/train.csv` 路径下,在前端页面选择“使用 CSV 导入”在线数据。 + +![](./images/taxi_import_online_data.png) + +导入成功后,可以预览在线表数据。 + +![](./images/taxi_preview_online_table.png) + +然后进行离线的数据导入,同样在前端页面选择“使用 CSV 导入”操作即可。 + +![](./images/taxi_import_offline_data.png) + +### 2. 创建特征 + +根据前面设计的 SQL 语句,我们选择创建一个特征组,创建时会”分析 SQL“,并且根据 SQL 自动分析出创建的特征列表。 + +![](./images/taxi_create_feature.png) + +![](./images/taxi_features.png) + +### 3. 离线场景 + +在离线场景,我们选择刚生成的特征视图的所有特征,把离线样本导出到本地进行模型训练。 + +![](./images/taxi_export_offline_samples.png) + +离线任务执行成功后,可以查看本地路径 `/tmp/taxi_tour_features/`,发现特征数据已经计算出来,并且导出到本地可以直接给模型训练使用。模型训练可参考[出租车行程时间预测 (OpenMLDB + LightGBM)](../../../use_case/taxi_tour_duration_prediction.md)。 + +![](./images/taxi_offline_samples_data.png) + +### 4. 
在线场景 + +通过离线场景验证特征 SQL 正确后,可通过在线场景把特征上线成特征服务。 + +![](./images/taxi_create_feature_service.png) + +创建成功后,可以查看特征服务的详情页面。 + +![](./images/taxi_feature_service_detail.png) + +最后可以在请求页面进行在线测试,并且验证在线离线特征结果是否一致。 + +![](./images/taxi_request_feature_service.png) + +## 总结 + +使用 FeatInsight 实现出租车行程时间预测场景,整个过程非常简单且步骤清晰,相比于使用 OpenMLDB 命令行工具更加直观,而且只需要有浏览器就可以操作,免去科学家搭建环境的麻烦,在线调试特征以及特征复用也更加简单。 + diff --git a/docs/zh/app_ecosystem/sql_emulator/index.rst b/docs/zh/app_ecosystem/sql_emulator/index.rst new file mode 100644 index 00000000000..f67223a96b1 --- /dev/null +++ b/docs/zh/app_ecosystem/sql_emulator/index.rst @@ -0,0 +1,8 @@ +============================= +OpenMLDB SQL Emulator +============================= + +.. toctree:: + :maxdepth: 1 + + sql_emulator \ No newline at end of file diff --git a/docs/zh/app_ecosystem/sql_emulator/sql_emulator.md b/docs/zh/app_ecosystem/sql_emulator/sql_emulator.md new file mode 100644 index 00000000000..7dc5ba1b208 --- /dev/null +++ b/docs/zh/app_ecosystem/sql_emulator/sql_emulator.md @@ -0,0 +1,257 @@ +# 应用指南 + +OpenMLDB SQL Emulator 是一个[OpenMLDB](https://github.com/4paradigm/OpenMLDB)的轻量级的SQL模拟器,旨在于更加高效方便的开发、调试 OpenMLDB SQL。 + +为了高效的实现时序特征计算,OpenMLDB SQL对标准SQL做了改进和扩展,因此初学者在使用OpenMLDB SQL的时候,经常会碰到语法不熟悉、执行模式混淆等问题。如果直接在OpenMLDB集群上进行开发、调试,由于部署、构建索引、大数据量等问题,经常会浪费大量时间在无关任务上,并且可能无法找到SQL本身的错误原因。OpenMLDB SQL Emulator是一个轻量级OpenMLDB SQL模拟开发调试工具,可以在脱离OpenMLDB集群部署的情况下,进行SQL的验证和调试操作。我们强烈推荐此工具给我们的应用开发人员,可以首先基于此工具快速验证SQL的正确性、可上线性以后,再切换到OpenMLDB真实环境上进行部署上线。 + +## 安装和启动 + +从[项目页面](https://github.com/vagetablechicken/OpenMLDBSQLEmulator/releases)下载运行包 `emulator-1.0.jar`,使用如下方式启动(注意当前发布的 1.0 版本对应于 OpenMLDB 0.8.3 的 SQL 语法): + +```bash +java -jar emulator-1.0.jar +``` + +注意,如果想使用`run`命令执行 SQL来验证计算结果,还需要同时下载该页面下的`toydb_run_engine`,并且存放在系统`/tmp`目录下。 + +## 使用流程 + +启动emulator后,将直接进入到默认的数据库 emudb,不需要额外创建数据库。 +- 数据库不需要被显式创建,只需要`use `或建表时指定数据库名,即可自动创建数据库。 +- 使用命令`addtable`或者`t`来创建虚拟表,重复创建同名表就是更新操作,将使用最新的表schema。我们使用简化的类SQL语法管理表,比如下面的例子创建了一个含有两列的表。 +```sql +addtable t1 a int, b int64 +``` +- 使用命令`showtables`或者`st`来查看当前所有的数据库和表。 + +### 验证 OpenMLDB SQL + +通常情况下,如需要验证OpenMLDB SQL 是否可以上线,可以在真实集群中使用`DEPLOY`进行上线测试。使用这种方法需要管理`DEPLOYMENT`与索引。例如,如果不需要某些测试用的`DEPLOYMENT`,需要手动删除;如果创建了不需要的索引,还需要清理索引。所以,我们建议在Emulator中测试验证。 + +你可以使用`val`和`valreq`分别进行在线批模式和在线请求模式(即服务部署上线)的OpenMLDB SQL验证。例如,我们测试一个SQL是否能被`DEPLOY`上线,使用`valreq`命令: + +```sql +# table creations - t/addtable: create table +addtable t1 a int, b int64 + +# validate in online request mode +valreq select count(*) over w1 from t1 window w1 as (partition by a order by b rows between unbounded preceding and current row); +``` + +如果测试不通过,将打印SQL编译错误;通过则打印`validate * success`。整个过程在虚拟环境中,无需担心建表后的资源占用,也没有任何副作用。只要`valreq`验证通过的 SQL,则一定能在真实集群中上线。 + +### 测试运行 OpenMLDB SQL + +OpenMLDB SQL Emulator也可以返回计算结果,用于测试SQL的计算是否符合预期。你可以在其中不断进行计算和上线验证,直到调试得到最终的上线SQL。该功能可以通过Emulator的`run`命令实现。 + +注意,使用`run`命令需要额外的`toydb_run_engine`支持,可以使用自带`toydb`的`emulator`包,或在[此页面下载](https://github.com/vagetablechicken/OpenMLDBSQLEmulator/releases)`toydb` 程序,并将其直接放入`/tmp`中。 + +假设`Emulator`已有`toydb`,测试运行步骤如下: + +``` +# step 1, generate a yaml template +gencase + +# step 2 modify the yaml file to add table and data +# ... 
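# 例如(仅为示意,具体字段与数据请参考下文的 yaml 模版):
#   columns: ["id int", "pk1 string", "col1 int32", "std_ts timestamp"]
#   data: |
#     1, A, 1, 1590115420000
#     2, B, 1, 1590115420000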
+ +# step 3 load yaml and show tables +loadcase +st + +# step 4 use val/valreq to validate the sql +valreq select count(*) over w1 from t1 window w1 as (partition by id order by std_ts rows between unbounded preceding and current row); + +# step 5 dump the sql you want to run next, this will rewrite the yaml file +dumpcase select count(*) over w1 from t1 window w1 as (partition by id order by std_ts rows between unbounded preceding and current row); + +# step 6 run sql using toydb +run +``` + +#### 步骤解释 +**step 1:** 运行命令`gencase`生成一个yaml模版文件,默认创建目录为是`/tmp/emu-case.yaml`。 + +范例yaml文件: +```yaml +# call toydb_run_engine to run this yaml file +# you can generate yaml cases for reproduction by emulator dump or by yourself + +# you can set the global default db +db: emudb +cases: + - id: 0 + desc: describe this case + # you can set batch mode + mode: request + db: emudb # you can set default db for case, if not set, use the global default db + inputs: + - name: t1 + db: emudb # you can set db for each table, if not set, use the default db(table db > case db > global db) + # must set table schema, emulator can't do this + columns: ["id int", "pk1 string","col1 int32", "std_ts timestamp"] + # gen by emulator, just to init table, not the deployment index + indexs: [] + # must set the data, emulator can't do this + data: | + 1, A, 1, 1590115420000 + 2, B, 1, 1590115420000 + # query: only support single query, to check the result by `expect` + sql: | + + # optional, you can just check the output, or add your expect + # expect: + # schema: id:int, pk1:string, col1:int, std_ts:timestamp, w1_col1_sum:int, w2_col1_sum:int, w3_col1_sum:int + # order: id + # data: | + # 1, A, 1, 1590115420000, 1, 1, 1 + # 2, B, 1, 1590115420000, 1, 1, 1 +``` + +**step 2:** 编辑这个yaml文件,编辑需要注意以下几点: +- 必须修改表名,表schema及其数据,这些不可在Emulator中修改。 +- 可以修改运行`mode`,接受`batch`或`request`模式。 +- 可以不填写SQL,可以在Emulator中通过`dumpcase `写入文件。常见使用方法是,先validate SQL,SQL通过校验后dump到case中,再使用`run`命令确认 SQL 的计算符合预期。 +- 表的indexs也无需手动填写,`dumpcase`时可以根据表schema自动生成(indexs并非特殊的索引,与SQL也无关,仅仅是创建表时需要创建至少一个索引)。如果不使用`dumpcase`,那么请手动填写至少一个索引,索引没有特别要求。手动创建范例:`["index1:c1:c2", ".."]`,`["index1:c1:c4:(10m,2):absorlat"]`。 + +**step 3:** 执行`loadcase`,这个case的表信息将被加载到Emulator中,通过`st/showtables`确认 case 的表加载成功,显示信息如下: +```bash +emudb> st +emudb={t1=id:int32,pk1:string,col1:int32,std_ts:timestamp} +``` + +**step 4:** 使用`valreq`来确认我们编写的 SQL 是语法正确且可以上线的。 + +**step 5 & 6:** 对这个SQL进行计算测试,使用命令`dumpcase`和`run`。 `dumpcase`实际是将SQL与默认索引写入case文件中,`run`命令运行该case文件。 如果你足够熟练,也可以直接修改case文件,再在Emulator中使用`run`运行它,或直接使用`toydb_run_engine --yaml_path=...`来运行。 + + +## 更多信息 +### 编译 + +你可以自行编译Emulator,如果需要使用`run`命令验证SQL计算结果,需要将`toydb_run_engine`放在`src/main/resources`中并执行编译。 + +```bash +# pack without toydb +mvn package -DskipTests + +# pack with toydb +cp toydb_run_engine src/main/resources +mvn package +``` + +从源码编译`toydb_run_engine`: +``` +git clone https://github.com/4paradigm/OpenMLDB.git +cd OpenMLDB +make configure +cd build +make toydb_run_engine -j # minimum build +``` + +### OpenMLDB适配版本 +Emulator使用`openmldb-jdbc`进行验证,目前支持的OpenMLDB版本为: +|Emulator Version | Compatible OpenMLDB Versions | +|--|--| +| 1.0 | 0.8.3 | + +### 常用命令 + +#### 创建类命令 +注意,如果新建表已存在,将会替换原有表。 + +默认虚拟数据库为`emudb`。 + +- `use ` 使用数据库,如果不存在,将会创建。 +- `addtable c1 t1,c2 t2, ...` 创建/替换先数据库表 + - 简写: `t c1 t1,c2 t2, ...` + +- `adddbtable c1 t1,c2 t2, ...` 创建/替换指定数据库中的表 + - 简写: `dt c1 t1,c2 t2, ...` +- `sql ` 使用sql创建表 +- `showtables` / `st` 显示所有表 + +#### `genddl ` + +可以帮助用户根据SQL直接生成最佳索引的建表语句,避免冗余索引(目前仅支持单数据库)。 + +- 范例1 +``` +t t1 a int, 
b bigint +t t2 a int, b bigint +genddl select *, count(b) over w1 from t1 window w1 as (partition by a order by b rows between 1 preceding and current row) +``` +输出: +``` +CREATE TABLE IF NOT EXISTS t1( + a int, + b bigint, + index(key=(a), ttl=1, ttl_type=latest, ts=`b`) +); +CREATE TABLE IF NOT EXISTS t2( + a int, + b bigint +); +``` +因为SQL不涉及t2的操作,所以t2创建为简单创建表格。t1创建为有索引的表格创建。 + +- 范例2 +``` +t t1 a int, b bigint +t t2 a int, b bigint +genddl select *, count(b) over w1 from t1 window w1 as (union t2 partition by a order by b rows_range between 1d preceding and current row) +``` +输出: +``` +CREATE TABLE IF NOT EXISTS t1( + a int, + b bigint, + index(key=(a), ttl=1440m, ttl_type=absolute, ts=`b`) +); +CREATE TABLE IF NOT EXISTS t2( + a int, + b bigint, + index(key=(a), ttl=1440m, ttl_type=absolute, ts=`b`) +); +``` +因为SQL涉及union window,t1和t2均为有索引的表格创建。 + +#### SQL验证命令 + +- `val ` 在线批模式验证 +- `valreq ` 在线请求模式验证 +``` +t t1 a int, b int64 +val select * from t1 where a == 123; +valreq select count(*) over w1 from t1 window w1 as (partition by a order by b rows between unbounded preceding and current row); +``` +#### toydb运行命令 + +`run ` + +在toydb中运行yaml文件。 可使用`gencase`生成。目前支持单个case。case中应包含创建表命令和单个SQL。默认模式为`request`,可以更改为`batch`模式。 + +由于Emulator中不支持表的添加和删除,请在yaml文件中添加相关操作。 + +该yaml文件也可用于错误的复现。如需帮助,可向我们提供对应的yaml文件。 + +#### 其他命令 +- `#` 注释. +- 单个命令**不可**写为多行,例如`val select * from t1;`不能写为: +``` +# 错误写法 +val select * +from +t1; +``` +- `?help` 提示 +- `?list` 查看所有命令 +- `!run-script $filename` 从文件中读取并运行命令。文件可为任意包含命令的文本文件,例如: +``` +!run-script src/test/resources/simple.emu +``` +- `!set-display-time true/false` 开启/关闭命令运行时间。单位为毫秒(ms),为方法运行的物理时间。 +- `!enable-logging filename` and `!disable-logging` shell输入输出log控制。 + +### CLI框架 + +我们使用`cliche`作为CLI框架,详见[操作手册](https://code.google.com/archive/p/cliche/wikis/Manual.wiki) 和[source](https://github.com/budhash/cliche)。 diff --git a/docs/zh/deploy/conf.md b/docs/zh/deploy/conf.md index de538720e5d..56b3f7f3df3 100644 --- a/docs/zh/deploy/conf.md +++ b/docs/zh/deploy/conf.md @@ -192,8 +192,8 @@ #--max_traverse_cnt=0 # 最大扫描不同key的个数(批处理),默认:0 #--max_traverse_key_cnt=0 -# 结果最大大小(byte),默认:2MB -#--scan_max_bytes_size=2097152 +# 结果最大大小(byte),默认:0 unlimited +#--scan_max_bytes_size=0 # loadtable # load时給线程池提交一次任务的数据条数 diff --git a/docs/zh/deploy/install_deploy.md b/docs/zh/deploy/install_deploy.md index 9b7a67fa857..cf719c24ef6 100644 --- a/docs/zh/deploy/install_deploy.md +++ b/docs/zh/deploy/install_deploy.md @@ -155,6 +155,7 @@ OpenMLDB 提供了两种启动模式:普通和守护进程启动。守护进 如果想要使守护进程模式启动,请使用`bash bin/start.sh start mon`或者`sbin/start-all.sh mon`的方式启动。守护进程模式中,`bin/.pid`将是 mon 进程的 pid,`bin/.pid.child` 为组件真实的 pid。 ## 部署方式一:一键部署(推荐) + OpenMLDB集群版需要部署ZooKeeper、NameServer、TabletServer、TaskManager等模块。其中ZooKeeper用于服务发现和保存元数据信息。NameServer用于管理TabletServer,实现高可用和failover。TabletServer用于存储数据和主从同步数据。APIServer是可选的,如果要用http的方式和OpenMLDB交互需要部署此模块。TaskManager 用于管理离线 job。我们提供了一键部署脚本,可以简化手动在每台机器上下载和配置的复杂性。 **注意:** 同一台机器部署多个组件时,一定要部署在不同的目录里,便于单独管理。尤其是部署TabletServer,一定不能重复使用目录,避免数据文件和日志文件冲突。 @@ -164,9 +165,9 @@ DataCollector和SyncTool暂不支持一键部署。请参考手动部署方式 ### 环境要求 - 部署机器(执行部署脚本的机器)可以免密登录其他部署节点 -- 部署机器安装 `rsync` 工具 -- 部署机器安装 Python3 -- 部署Zookeeper和TaskManager的机器安装 JRE (Java Runtime Environment) +- 部署机器需安装 `rsync` 工具 +- 部署机器需安装 Python3 +- Zookeeper和TaskManager的运行机器上需安装 JRE (Java Runtime Environment) ### 下载OpenMLDB发行版 @@ -176,28 +177,55 @@ tar -zxvf openmldb-0.8.4-linux.tar.gz cd openmldb-0.8.4-linux ``` +### 脚本使用逻辑 + +部署脚本均在sbin中,我们也称一键部署为sbin部署。初次部署过程一般是“修改环境和配置文件 -> sbin/deploy-all.sh -> 
sbin/start-all.sh”。如果需要停止服务,执行`sbin/stop-all.sh`。清理已部署的数据和日志,执行`sbin/clear-all.sh`。Docker镜像中的`/work/init.sh`脚本便是进行“deploy-all -> stop-all -> clear-all -> start-all”。 + +如果集群正在运行,需要修改配置(不能只deploy到单台,但全部覆盖配置不影响进程运行)并重启某一个组件(不能指定单进程,但可以指定组件),需要“修改配置 -> deploy-all.sh -> stop-tablets.sh -> start-tablets.sh”。但需要注意重启tablet可能会导致数据加载失败(影响服务),需要进行集群诊断与恢复,可使用[一键inspect](../maintain/diagnose.md#一键inspect)。数据量较大或不可出现服务中断时,更推荐使用扩缩容方式或手动重启单进程。 + ### 环境配置 -环境变量定义在`conf/openmldb-env.sh`,如下表所示: - -| 环境变量 | 默认值 | 定义 | -|-----------------------------------|------------------------------------|-------------------------------------------------------------------------| -| OPENMLDB_VERSION | 0.8.4 | OpenMLDB版本 | -| OPENMLDB_MODE | standalone | standalone或者cluster | -| OPENMLDB_HOME | 当前发行版的根目录 | openmldb发行版根目录 | -| SPARK_HOME | $OPENMLDB_HOME/spark | openmldb spark发行版根目录,如果该目录不存在,自动从网上下载 | -| OPENMLDB_TABLET_PORT | 10921 | TabletServer默认端口 | -| OPENMLDB_NAMESERVER_PORT | 7527 | NameServer默认端口 | -| OPENMLDB_TASKMANAGER_PORT | 9902 | taskmanager默认端口 | -| OPENMLDB_APISERVER_PORT | 9080 | APIServer默认端口 | -| OPENMLDB_USE_EXISTING_ZK_CLUSTER | false | 是否使用已经部署的ZooKeeper集群。如果是`false`,会在部署脚本里自动启动ZooKeeper集群 | -| OPENMLDB_ZK_HOME | $OPENMLDB_HOME/zookeeper | ZooKeeper发行版根目录 | -| OPENMLDB_ZK_CLUSTER | 自动从`conf/hosts`中的`[zookeeper]`配置获取 | ZooKeeper集群地址 | -| OPENMLDB_ZK_ROOT_PATH | /openmldb | OpenMLDB在ZooKeeper的根目录 | -| OPENMLDB_ZK_CLUSTER_CLIENT_PORT | 2181 | ZooKeeper client port, 即zoo.cfg里面的clientPort | -| OPENMLDB_ZK_CLUSTER_PEER_PORT | 2888 | ZooKeeper peer port,即zoo.cfg里面这种配置server.1=zoo1:2888:3888中的第一个端口配置 | -| OPENMLDB_ZK_CLUSTER_ELECTION_PORT | 3888 | ZooKeeper election port, 即zoo.cfg里面这种配置server.1=zoo1:2888:3888中的第二个端口配置 | + +环境变量定义在`conf/openmldb-env.sh`,主要变量如下表所示: + +| 环境变量 | 默认值 | 定义 | +| -------------------------------- | ------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------- | +| OPENMLDB_VERSION | 0.8.4 | OpenMLDB版本,主要用于spark下载,一般不改动。 | +| OPENMLDB_MODE | cluster | standalone或者cluster | +| OPENMLDB_HOME | 当前发行版的根目录 | openmldb发行版根目录,不则使用当前根目录,也就是openmldb-0.8.4-linux所在目录。 | +| SPARK_HOME | $OPENMLDB_HOME/spark | openmldb spark发行版根目录,如果该目录不存在,自动从网上下载。**此路径也将成为TaskManager运行机器上的Spark安装目录。** | +| RUNNER_EXISTING_SPARK_HOME | | 配置此项,运行TaskManager的机器将使用该Spark环境,将不下载、部署OpenMLDB Spark发行版。 | +| OPENMLDB_USE_EXISTING_ZK_CLUSTER | false | 是否使用已经运行的ZooKeeper集群。如果是`true`,将跳过ZooKeeper集群的部署与管理。 | +| OPENMLDB_ZK_HOME | $OPENMLDB_HOME/zookeeper | ZooKeeper发行版根目录,如果该目录不存在,自动从网上下载。 | +| OPENMLDB_ZK_CLUSTER | | ZooKeeper集群地址,为空时自动从`conf/hosts`中的`[zookeeper]`配置获取。建议自建ZooKeeper集群时在hosts中创建,使用已有ZooKeeper集群时配置此项。 | +| OPENMLDB_ZK_ROOT_PATH | /openmldb | OpenMLDB在ZooKeeper集群的根目录 | +| OPENMLDB_FORCE_LOCAL | false | 如果为`true`,所有部署将认定为本地拷贝。单机部署集群,又需要使用公网IP时,开启此项,避免ssh | +| RUNNER_JAVA_HOME | | 运行ZooKeeper和TaskManager的机器ssh可能无Java相关环境变量,可使用此变量设置。不设置则不覆盖环境。 | +| CLEAR_OPENMLDB_INSTALL_DIR | false | sbin/clear-all.sh只清理运行产生的数据与日志,如果是`true`,将把运行机器上的整个安装目录删除。 | + +通常来讲,需要确认以下几点: +- ZooKeeper集群地址,如果使用已有ZooKeeper集群,需要配置`OPENMLDB_USE_EXISTING_ZK_CLUSTER=true`,并配置`OPENMLDB_ZK_CLUSTER`。(如果在`conf/hosts`中配置外部ZK集群,请注释标注其不受sbin部署影响,避免混乱。) +- 需要此工具部署ZooKeeper集群时,在`conf/hosts`中配置`[zookeeper]`。填写多个ZooKeeper节点,即部署ZooKeeper集群,无需额外配置。 +- Spark环境,如果需要使用运行机器上已有的Spark环境,需要配置`RUNNER_EXISTING_SPARK_HOME`(地址为TaskManager运行机器上的路径)。如果部署机器存在Spark环境,并想要在TaskManager机器上使用此套环境,可配置`SPARK_HOME`(部署到TaskManager机器同名路径上)。`SPARK_HOME`不进行配置时,将自动下载、使用OpenMLDB 
Spark发行版。 + +#### 默认端口 +| 环境变量 | 默认值 | 定义 | +| ------------------------- | ------ | -------------------- | +| OPENMLDB_TABLET_PORT | 10921 | TabletServer默认端口 | +| OPENMLDB_NAMESERVER_PORT | 7527 | NameServer默认端口 | +| OPENMLDB_TASKMANAGER_PORT | 9902 | TaskManager默认端口 | +| OPENMLDB_APISERVER_PORT | 9080 | APIServer默认端口 | + +默认端口只会在节点配置不显式配置端口号时才会被使用,更推荐**直接在节点配置文件hosts中配置好端口号**。 + +#### ZooKeeper高级配置 +| 环境变量 | 默认值 | 定义 | +| --------------------------------- | ------ | --------------------------------------------------------------------------------------- | +| OPENMLDB_ZK_CLUSTER_CLIENT_PORT | 2181 | ZooKeeper client port, 即zoo.cfg里面的clientPort | +| OPENMLDB_ZK_CLUSTER_PEER_PORT | 2888 | ZooKeeper peer port,即zoo.cfg里面这种配置server.1=zoo1:2888:3888中的第一个端口配置 | +| OPENMLDB_ZK_CLUSTER_ELECTION_PORT | 3888 | ZooKeeper election port, 即zoo.cfg里面这种配置server.1=zoo1:2888:3888中的第二个端口配置 | ### 节点配置 + 节点配置文件为`conf/hosts`,示例如下: ```bash [tablet] @@ -229,14 +257,16 @@ node3:2181:2888:3888 /tmp/openmldb/zk-1 对于`[zookeeper]`, 会有额外端口参数,包括follower用来连接leader的`zk_peer_port`和用于leader选择的`zk_election_port`, 其格式为`host:port:zk_peer_port:zk_election_port WORKDIR`。 -每一行节点列表,除了`host`是必须的,其他均为可选,如果没有提供,会使用默认配置,默认配置参考`conf/openmldb-env.sh`。 +每一行节点列表,除了`host`是必须的,其他均为可选,如果没有提供,会使用默认配置,默认配置参考`conf/openmldb-env.sh`。无`WORKDIR`配置的节点,所有OpenMLDB Server的默认运行目录为`OPENMLDB_HOME`,ZooKeeper默认目录为`OPENMLDB_ZK_HOME`。 + +host配置为localhost或127.0.0.1时,将自动识别为部署到本地,不会进行ssh和rsync。当集群在本地部署且需要对外暴露服务,hosts中节点需配置为外网IP,如果不想配置本机ssh免密,可在`conf/openmldb-env.sh`中配置`OPENMLDB_FORCE_LOCAL=true`。 ```{warning} 如果在不同机器上部署多个 TaskManager,其 `offline.data.prefix` 配置的路径,这些机器必须可以访问,建议配置hdfs路径。 ``` ### 修改机器环境配置 (可选) -``` +```bash bash sbin/init_env.sh ``` 说明: @@ -248,15 +278,40 @@ bash sbin/init_env.sh ```bash sbin/deploy-all.sh ``` -该脚本会把相关的文件分发到`conf/hosts`里面配置的机器上,同时根据`conf/hosts`和`conf/openmldb-env.sh` -的配置,对相关组件的配置做出相应的更新。 +该脚本会把相关的文件分发到`conf/hosts`里面配置的机器上,同时根据`conf/hosts`和`conf/openmldb-env.sh`的配置,对相关组件的配置做出相应的更新。 -如果希望为每个节点添加一些额外的相同的定制化配置,可以在执行deploy脚本之前,修改`conf/xx.template`的配置, -这样在分发配置文件的时候,每个节点都可以用到更改后的配置。 -重复执行`sbin/deploy-all.sh`会覆盖上一次的配置。 +如果希望为每个节点添加一些额外的相同的定制化配置,可以在执行deploy脚本之前,**修改`conf/xx.template`的配置**。只有和openmldb-env.sh中相关的配置会被部署工具自动追加到配置尾部,其他配置不会被覆盖,可放心修改。执行deploy,将配置文件分发到运行节点中,重复执行`sbin/deploy-all.sh`会覆盖上一次的配置。 详细配置说明见[配置文件](./conf.md),请注意TaskManager Spark的选择与细节配置[Spark Config详解](./conf.md#spark-config详解)。 +执行阶段日志类似下文,请注意部署到的host与目录: +``` +deploy tablet to localhost:10921 /tmp/openmldb/tablet-1 +copy /work/openmldb to localhost:/tmp/openmldb/tablet-1 +deploy tablet to localhost:10922 /tmp/openmldb/tablet-2 +copy /work/openmldb to localhost:/tmp/openmldb/tablet-2 +deploy nameserver to localhost:7527 /work/openmldb +skip rsync as dest=src: /work/openmldb +deploy apiserver to localhost:9080 /work/openmldb +skip rsync as dest=src: /work/openmldb +/work/openmldb/spark already exists. Skip deploy spark locally +deploy taskmanager to localhost:9902 /work/openmldb +skip rsync as dest=src: /work/openmldb +/work/openmldb/zookeeper already exists. Skip download zookeeper. 
+deploy zookeeper to localhost:2181 /tmp/openmldb/zk-1 +copy /work/openmldb/zookeeper to localhost:/tmp/openmldb/zk-1 +``` + +对环境变量有疑问,注意日志`OPENMLDB envs:`的打印结果。 + +- 配置 +deploy不支持对单个组件的配置更新,更改单个组件也需要使用`deploy-all.sh`。如果你在部署host上单独修改,需要修改`xx.flags`/`taskmanager.properties`而不是template配置,而且`deploy-all.sh`将对该配置进行覆盖,请谨慎配置。检查配置时以host的运行目录中的`xx.flags`/`taskmanager.properties`为准。 + +- 日志 +相应的,各个节点的日志也在各自的运行目录中,具体位置参考[部署方式二:手动部署](#部署方式二手动部署)中各个组件的日志位置说明。 + +收集日志与配置,可以使用诊断工具[检查内容](../maintain/diagnose.md#检查内容),默认将各个节点的配置和日志都收集到`/tmp/diag_collect`目录中,可以统一查看。 + ### 启动服务 普通模式启动: @@ -270,17 +325,26 @@ sbin/start-all.sh sbin/start-all.sh mon ``` -该脚本会把 `conf/hosts` 里面配置的所有服务启动起来。启动完成以后,可以通过辅助脚本启动 CLI (`sbin/openmldb-cli.sh`),来验证集群是否正常启动。 +该脚本会把 `conf/hosts` 里面配置的所有服务启动起来。启动完成以后,可以通过辅助脚本启动 CLI (`sbin/openmldb-cli.sh`),来验证集群是否正常启动。对环境变量有疑问,注意日志`OPENMLDB envs:`的打印结果。 ```{tip} start-all.sh 是一个非常有用的工具。除了在部署阶段可以使用,也可以在运维阶段用于启动某一个下线的 OpenMLDB 进程。比如某一个 tablet 进程意外下线,你可以直接执行 start-all.sh。该脚本对于已经启动的进程不会产生副作用,对于已配置、但是未启动的进程,将会自动进行启动。 ``` ### 停止服务 + 如果需要停止所有服务,可以执行以下脚本: ```bash sbin/stop-all.sh ``` +### 清理数据和日志 + +如果需要清理所有服务的数据和日志,可以执行以下脚本: +```bash +sbin/clean-all.sh +``` + +如果需要保留集群数据,请不要执行该脚本。 ## 部署方式二:手动部署 OpenMLDB集群版需要部署ZooKeeper、NameServer、TabletServer、TaskManager等模块。其中ZooKeeper用于服务发现和保存元数据信息。NameServer用于管理TabletServer,实现高可用和failover。TabletServer用于存储数据和主从同步数据。APIServer是可选的,如果要用http的方式和OpenMLDB交互需要部署此模块。TaskManager用于管理离线job。 diff --git a/docs/zh/index.rst b/docs/zh/index.rst index f3b3f63106b..cd827813914 100644 --- a/docs/zh/index.rst +++ b/docs/zh/index.rst @@ -17,3 +17,11 @@ OpenMLDB 文档 (|version|) reference/index developer/index faq/index + +.. toctree:: + :maxdepth: 1 + :hidden: + :caption: 📚 应用生态 + + app_ecosystem/feat_insight/index + app_ecosystem/sql_emulator/index diff --git a/docs/zh/maintain/diagnose.md b/docs/zh/maintain/diagnose.md index cb5d7a30f74..17ba3319c79 100644 --- a/docs/zh/maintain/diagnose.md +++ b/docs/zh/maintain/diagnose.md @@ -185,7 +185,7 @@ JOB 检查是更灵活的离线任务检查命令,可以按条件筛选job, ### static-check 静态检查 -`static-check`静态检查,根据集群部署配置文件(通过参数`-f,--conf_file`指定),登录各个服务组件的部署地址,可以收集版本信息、配置文件、日志文件,检查版本是否一致,对收集到的配置文件和日志文件做分析。可以在集群未部署前进行检查,避免因程序版本或配置文件错误导致的集群部署失败。或在集群异常时,将分布式的日志文件收集在一起,方便调查问题。 +`static-check`静态检查,根据集群部署配置文件(通过参数`-f,--conf_file`指定),登录各个服务组件的部署地址,可以收集版本信息、配置文件、日志文件,检查版本是否一致,对收集到的配置文件和日志文件做分析。可以在集群*未部署前*进行检查,避免因程序版本或配置文件错误导致的集群部署失败。或在集群异常时,将分布式的日志文件收集在一起,方便调查问题。 ```bash openmldb_tool static-check -h @@ -260,7 +260,7 @@ nameserver: 检查可通过组合FLAG来来指定检查哪些内容,例如,`-V`只检查版本,`-CL`只检查配置文件和日志,`-VCL`检查全部。 -- `-V,--version`检查版本,检查各个组件的版本是否一致,如果不一致,会输出不一致的组件和版本信息。 +- `-V,--version`检查版本,检查各个组件的版本是否一致,如果不一致,会输出不一致的组件和版本信息(由于复杂度较高,openmldb-batch包的地址可能查不到,将忽略检查,替换batch包非常容易,可以推后检查)。 - `-C,--conf`收集配置文件,检查各个组件的配置文件中ZooKeeper地址是否一致等。 - `-L,--log`收集日志,输出WARNING及以上的日志。 diff --git a/docs/zh/maintain/monitoring.md b/docs/zh/maintain/monitoring.md index e51f0a3b8bc..b291ae90f2a 100644 --- a/docs/zh/maintain/monitoring.md +++ b/docs/zh/maintain/monitoring.md @@ -23,6 +23,15 @@ OpenMLDB exporter 是以 Python 实现的 Prometheus exporter,核心是通过 - Python >= 3.8 - OpenMLDB >= 0.5.0 +### 兼容性说明 + +**请根据部署的 OpenMLDB 版本选择正确的 openmldb-exporter.** + +| [OpenMLDB Exporter version](https://pypi.org/project/openmldb-exporter/) | [OpenMLDB supported version](https://github.com/4paradigm/OpenMLDB/releases) | [Grafana Dashboard revision](https://grafana.com/grafana/dashboards/17843-openmldb-dashboard/?tab=revisions) | Explaination | +| ---- | ---- | ---- | ------- | +| >= 0.9.0 | >= 0.8.4 | >=4 | OpenMLDB 0.8.4 移除了数据库表里的 deploy response 
time 信息 | +| < 0.9.0 | >= 0.5.0, < 0.8.4 | 3 | | + ### 准备 1. 获取 OpenMLDB diff --git a/docs/zh/openmldb_sql/ddl/CREATE_TABLE_STATEMENT.md b/docs/zh/openmldb_sql/ddl/CREATE_TABLE_STATEMENT.md index a44f699eed3..0113ef730b0 100644 --- a/docs/zh/openmldb_sql/ddl/CREATE_TABLE_STATEMENT.md +++ b/docs/zh/openmldb_sql/ddl/CREATE_TABLE_STATEMENT.md @@ -233,8 +233,8 @@ IndexOption ::= | ----------- | ------------------------------------------------------------ | ---------------------------------------------------- | ------------------------------------------------------------ | | `ABSOLUTE` | TTL的值代表过期时间。配置值为时间段如`100m, 12h, 1d, 365d`。最大可以配置的过期时间为`15768000m`(即30年) | 当记录过期时,会被淘汰。 | `INDEX(KEY=col1, TS=std_time, TTL_TYPE=absolute, TTL=100m)`
OpenMLDB将会删除100分钟之前的数据。 | | `LATEST` | TTL的值代表最大存活条数。即同一个索引下面,最大允许存在的数据条数。最大可以配置1000条 | 记录超过最大条数时,会被淘汰。 | `INDEX(KEY=col1, TS=std_time, TTL_TYPE=LATEST, TTL=10)`。OpenMLDB只会保留最近10条记录,删除以前的记录。 | -| `ABSORLAT` | 配置过期时间和最大存活条数。配置值是一个2元组,形如`(100m, 10), (1d, 1)`。最大可以配置`(15768000m, 1000)`。 | 当且仅当记录过期**或**记录超过最大条数时,才会淘汰。 | `INDEX(key=c1, ts=c6, ttl=(120min, 100), ttl_type=absorlat)`。当记录超过100条,**或者**当记录过期时,会被淘汰 | -| `ABSANDLAT` | 配置过期时间和最大存活条数。配置值是一个2元组,形如`(100m, 10), (1d, 1)`。最大可以配置`(15768000m, 1000)`。 | 当记录过期**且**记录超过最大条数时,记录会被淘汰。 | `INDEX(key=c1, ts=c6, ttl=(120min, 100), ttl_type=absandlat)`。当记录超过100条,**而且**记录过期时,会被淘汰 | +| `ABSORLAT` | 配置过期时间和最大存活条数。配置值是一个2元组,形如`(100m, 10), (1d, 1)`。最大可以配置`(15768000m, 1000)`。 | 当且仅当记录过期**或**记录超过最大条数时,才会淘汰。 | `INDEX(key=c1, ts=c6, ttl=(120m, 100), ttl_type=absorlat)`。当记录超过100条,**或者**当记录过期时,会被淘汰 | +| `ABSANDLAT` | 配置过期时间和最大存活条数。配置值是一个2元组,形如`(100m, 10), (1d, 1)`。最大可以配置`(15768000m, 1000)`。 | 当记录过期**且**记录超过最大条数时,记录会被淘汰。 | `INDEX(key=c1, ts=c6, ttl=(120m, 100), ttl_type=absandlat)`。当记录超过100条,**而且**记录过期时,会被淘汰 | ```{note} 最大过期时间和最大存活条数的限制,是出于性能考虑。如果你一定要配置更大的TTL值,请使用UpdateTTL来增大(可无视max限制),或者调整nameserver配置`absolute_ttl_max`和`latest_ttl_max`,重启生效。 diff --git a/docs/zh/openmldb_sql/ddl/SET_STATEMENT.md b/docs/zh/openmldb_sql/ddl/SET_STATEMENT.md index 1b513913e10..4b63861e59f 100644 --- a/docs/zh/openmldb_sql/ddl/SET_STATEMENT.md +++ b/docs/zh/openmldb_sql/ddl/SET_STATEMENT.md @@ -36,6 +36,7 @@ sessionVariableName ::= '@@'Identifier | '@@session.'Identifier | '@@global.'Ide | @@session.sync_job|@@sync_job | 当该变量值为 `true`,离线的命令将变为同步,等待执行的最终结果。
当该变量值为 `false`,离线的命令即时返回,若要查看命令的执行情况,请使用`SHOW JOB`。 | "true" \| "false" | "false" | | @@session.job_timeout|@@job_timeout | 可配置离线异步命令或离线管理命令的等待时间(以*毫秒*为单位),将立即返回。离线异步命令返回后仍可通过`SHOW JOB`查看命令执行情况。 | Int | "20000" | | @@session.spark_config|@@spark_config | 设置离线任务的 Spark 参数,配置项参考 'spark.executor.memory=2g;spark.executor.cores=2'。注意此 Spark 配置优先级高于 TaskManager 默认 Spark 配置,低于命令行的 Spark 配置文件。 | String | "" | +| @@session.insert_memory_usage_limit|@@insert_memory_usage_limit | 设置数据插入或者数据导入时服务端内存使用率限制。取值范围为0-100。如果服务端内存使用率超过设置的值,就会插入失败。设置为0表示不限制 | Int | "0" | ## Example ### 设置和显示会话系统变量 diff --git a/docs/zh/openmldb_sql/dml/INSERT_STATEMENT.md b/docs/zh/openmldb_sql/dml/INSERT_STATEMENT.md index 6ecf98390a3..4799e557577 100644 --- a/docs/zh/openmldb_sql/dml/INSERT_STATEMENT.md +++ b/docs/zh/openmldb_sql/dml/INSERT_STATEMENT.md @@ -5,7 +5,7 @@ OpenMLDB 支持一次插入单行或多行数据。 ## syntax ``` -INSERT INFO tbl_name (column_list) VALUES (value_list) [, value_list ...] +INSERT [[OR] IGNORE] INTO tbl_name (column_list) VALUES (value_list) [, value_list ...] column_list: col_name [, col_name] ... @@ -16,6 +16,7 @@ value_list: **说明** - `INSERT` 只能用在在线模式 +- 默认`INSERT`不会去重,`INSERT OR IGNORE` 则可以忽略已存在于表中的数据,可以反复重试。 ## Examples diff --git a/docs/zh/openmldb_sql/dml/LOAD_DATA_STATEMENT.md b/docs/zh/openmldb_sql/dml/LOAD_DATA_STATEMENT.md index 4fcc94c15fc..d2c456b2913 100644 --- a/docs/zh/openmldb_sql/dml/LOAD_DATA_STATEMENT.md +++ b/docs/zh/openmldb_sql/dml/LOAD_DATA_STATEMENT.md @@ -58,6 +58,7 @@ FilePathPattern | load_mode | String | cluster | `load_mode='local'`仅支持从csv本地文件导入在线存储, 它通过本地客户端同步插入数据;
`load_mode='cluster'`仅支持集群版, 通过spark插入数据,支持同步或异步模式 | | thread | Integer | 1 | 仅在本地文件导入时生效,即`load_mode='local'`或者单机版,表示本地插入数据的线程数。 最大值为`50`。 | | writer_type | String | single | 集群版在线导入中插入数据的writer类型。可选值为`single`和`batch`,默认为`single`。`single`表示数据即读即写,节省内存。`batch`则是将整个rdd分区读完,确认数据类型有效性后,再写入集群,需要更多内存。在部分情况下,`batch`模式有利于筛选未写入的数据,方便重试这部分数据。 | +| put_if_absent | Boolean | false | 在源数据无重复行也不与表中已有数据重复时,可以使用此选项避免插入重复数据,特别是job失败后可以重试。等价于使用`INSERT OR IGNORE`。更多详情见下文。 | ```{note} 在集群版中,`LOAD DATA INFILE`语句会根据当前执行模式(execute_mode)决定将数据导入到在线或离线存储。单机版中没有存储区别,只会导入到在线存储中,同时也不支持`deep_copy`选项。 @@ -73,6 +74,7 @@ FilePathPattern 所以,请尽量使用绝对路径。单机测试中,本地文件用`file://`开头;生产环境中,推荐使用hdfs等文件系统。 ``` + ## SQL语句模版 ```sql @@ -115,6 +117,8 @@ LOAD DATA INFILE 'hive://db1.t1' INTO TABLE t1; 在线导入只允许`mode='append'`,无法`overwrite`或`error_if_exists`。 +如果设置了 `insert_memory_usage_limit` session变量,服务端内存使用率超过设定的值就会返回失败。 + ## 离线导入规则 表的离线信息可通过`desc `查看。我们将数据地址分为两类,离线地址是OpenMLDB的内部存储路径,硬拷贝将写入此地址,仅一个;软链接地址是软链接导入的地址列表。 @@ -156,3 +160,12 @@ null,null 第二行两列都是两个双引号。 - cluster模式默认quote为`"`,所以这一行是两个空字符串。 - local模式默认quote为`\0`,所以这一行两列都是两个双引号。local模式quote可以配置为`"`,但escape规则是`""`为单个`"`,和Spark不一致,具体见[issue3015](https://github.com/4paradigm/OpenMLDB/issues/3015)。 + +## PutIfAbsent说明 + +PutIfAbsent是一个特殊的选项,它可以避免插入重复数据,仅需一个配置,操作简单,特别适合load datajob失败后重试,等价于使用`INSERT OR IGNORE`。如果你想要导入的数据中存在重复,那么通过PutIfAbsent导入,会导致部分数据丢失。如果你需要保留重复数据,不应使用此选项,建议通过其他方式去重后再导入。 + +PutIfAbsent需要去重这一额外开销,所以,它的性能与去重的复杂度有关: + +- 表中只存在ts索引,且同一key+ts的数据量少于10k时(为了精确去重,在同一个key+ts下会逐行对比整行数据),PutIfAbsent的性能表现不会很差,通常导入时间在普通导入时间的2倍以内。 +- 表中如果存在time索引(ts列为空),或者ts索引同一key+ts的数据量大于100k时,PutIfAbsent的性能会很差,导入时间可能超过普通导入时间的10倍,无法正常使用。这样的数据条件下,更建议进行去重后再导入。 diff --git a/docs/zh/openmldb_sql/dql/SELECT_STATEMENT.md b/docs/zh/openmldb_sql/dql/SELECT_STATEMENT.md index 01b34828382..215e3917b3a 100644 --- a/docs/zh/openmldb_sql/dql/SELECT_STATEMENT.md +++ b/docs/zh/openmldb_sql/dql/SELECT_STATEMENT.md @@ -115,12 +115,9 @@ select_expression: | `ORDER BY` Clause | **``x``** | **``x``** | **``x``** | 标准SQL还支持Order By子句。OpenMLDB目前尚未支持Order子句。例如,查询语句`SELECT * from t1 ORDER BY col1;`在OpenMLDB中不被支持。 | ```{warning} -在线模式或单机版的select,可能无法获取完整数据。 -因为一次查询可能在多台tablet 上进行大量的扫描,为了tablet 的稳定性,单个tablet 限制了最大扫描数据量,即`scan_max_bytes_size`。 +在线模式或单机版的select,可能无法获取完整数据。单个tablet 限制了最大扫描数据量,即`scan_max_bytes_size`,默认为无限。但如果你配置了它,查询的数据量超过这个值,会出现结果截断。如果出现select结果截断,tablet 会出现`reach the max byte ...`的日志,但查询不会报错。 -如果出现select结果截断,tablet 会出现`reach the max byte ...`的日志,但查询不会报错。 - -在线模式或单机版都不适合做大数据的扫描,推荐使用集群版的离线模式。如果一定要调大扫描量,需要对每台tablet配置`--scan_max_bytes_size=xxx`,并重启tablet生效。 +即使你没有配置`scan_max_bytes_size`,也可能出现select失败,比如 `body_size=xxx from xx:xxxx is too large`, ` Fail to parse response from xx:xxxx by baidu_std at client-side`等错误。我们不推荐全表扫描在线表,如果你想获得在线表的数据条数,可以使用`SELECT COUNT(*) FROM table_name`。 ``` ## FROM Clause @@ -142,17 +139,20 @@ FROM 子句的来源可以是: ```{attention} 离线同步模式 Query 仅用于展示,不保证结果完整。整个结果收集中可能出现文件写入失败,丢失HTTP包等问题,我们允许结果缺失。 ``` + ### 相关配置参数 TaskManager配置`batch.job.result.max.wait.time`,在 Query job完成后,我们会等待所有结果被收集并保存在TaskManager所在主机的文件系统中,超过这一时间将结束等待,返回错误。如果认为整个收集结果的过程没有问题,仅仅是等待时间不够,可以调大这一配置项,单位为ms,默认为10min。 Batch配置(spark.default.conf): + - spark.openmldb.savejobresult.rowperpost: 为了防止HTTP传送过多数据,我们对数据进行切割,默认为16000行。如果单行数据量较大,可以调小该值。 - spark.openmldb.savejobresult.posttimeouts: HTTP传送数据的超时配置,共三个超时配置项,用`,`分隔,分别为`ConnectionRequestTimeout,ConnectTimeout,SocketTimeout`,默认为`10000,10000,10000`。如果出现HTTP传输超时,可调整这一参数。 ### 重置 如果使用过程中出现错误,可能导致Result Id无法正确重置。所有Result Id都被虚假占用时,会出现错误"too much running jobs to save job result, reject this 
spark job"。这时可以通过HTTP请求TaskManager来重置,POST内容如下: + ``` curl -H "Content-Type:application/json" http://0.0.0.0:9902/openmldb.taskmanager.TaskManagerServer/SaveJobResult -X POST -d '{"result_id":-1, "json_data": "reset"}' ``` diff --git a/docs/zh/openmldb_sql/sql_difference.md b/docs/zh/openmldb_sql/sql_difference.md index b224db6d567..8effa43939e 100644 --- a/docs/zh/openmldb_sql/sql_difference.md +++ b/docs/zh/openmldb_sql/sql_difference.md @@ -41,10 +41,10 @@ 相关参数会在 tablet 配置文件 conf/tablet.flags 里进行配置,详见文档[配置文件](../deploy/conf.md#tablet配置文件-conftabletflags) 。影响到扫描限制的参数为: - 最大扫描条数 `--max_traverse_cnt` -- 最大扫描 key 的个数 `--max_traverse_pk_cnt` +- 最大扫描 key 的个数 `--max_traverse_pk_cnt/max_traverse_key_cnt` - 返回的结果大小限制 `--scan_max_bytes_size` -预计在 v0.7.3 以及以后版本中,以上参数的默认值都为 0,即不做相关限制。之前的版本需要注意相关参数的设置。 +v0.8.0及以后版本,删除`max_traverse_pk_cnt`,使用`max_traverse_key_cnt`,前两个参数默认值改为0,即不受限制。v0.8.4以后(不包括)版本`--scan_max_bytes_size`默认也为0。之前的版本需要注意相关参数的设置。 ### WHERE 子句 diff --git a/docs/zh/openmldb_sql/udf_develop_guide.md b/docs/zh/openmldb_sql/udf_develop_guide.md index 761e66dea6f..89771df4b5a 100644 --- a/docs/zh/openmldb_sql/udf_develop_guide.md +++ b/docs/zh/openmldb_sql/udf_develop_guide.md @@ -1,18 +1,19 @@ # 自定义函数(UDF)开发 -## 1. 背景 +## 背景 虽然OpenMLDB内置了上百个函数,以供数据科学家作数据分析和特征抽取。但是在某些场景下还是不能很好的满足要求,为了便于用户快速灵活实现特定的特征计算需求,我们支持了基于 C++ 的用户自定义函数(UDF)开发,以及动态用户自定义函数库的加载。 ```{seealso} 用户也可以使用内置函数开发的方式扩展 OpenMLDB 的计算函数库。但是内置函数开发需要修改源代码和重新编译。如果用户希望贡献扩展函数到 OpenMLDB 代码库,那么可以参考[内置函数的开发文档](../developer/built_in_function_develop_guide.md)。 ``` -## 2. 开发步骤 -### 2.1 开发自定义函数 -#### 2.1.1 C++函数名规范 +## 开发步骤 +### 开发自定义函数 +#### C++函数名规范 - C++内置函数名统一使用[snake_case](https://en.wikipedia.org/wiki/Snake_case)风格 - 要求函数名能清晰表达函数功能 - 函数不能重名。函数名不能和内置函数及其他自定义函数重名。所有内置函数的列表参考[这里](../openmldb_sql/udfs_8h.md) -#### 2.1.2 C++类型与SQL类型对应关系 +#### C++类型与SQL类型对应关系 + 内置C++函数的参数类型限定为:BOOL类型,数值类型,时间戳日期类型和字符串类型。C++类型SQL类型对应关系如下: | SQL类型 | C/C++ 类型 | @@ -26,7 +27,7 @@ | STRING | `StringRef` | | TIMESTAMP | `Timestamp` | | DATE | `Date` | -#### 2.1.3 函数参数和返回值 +#### 函数参数和返回值 返回值: * 如果udf输出类型是基本类型,并且`return_nullable`设置为false, 则通过函数返回值返回 * 如果udf输出类型是基本类型,并且`return_nullable`设置为true, 则通过函数参数返回 @@ -46,7 +47,7 @@ 函数声明: * 函数必须用extern "C"来声明 -#### 2.1.4 内存管理 +#### 内存管理 - 在单行函数中,不允许使用`new`和`malloc`给输入和输出参数开辟空间。函数内部可以使用`new`和`malloc`申请临时空间, 申请的空间在函数返回前需要释放掉。 - 在聚合函数中,在init函数中可以使用`new`/`malloc`开辟空间,但是必须在output函数中释放。最后的返回值如果是string需要保存在mempool开辟的空间中 @@ -67,7 +68,7 @@ extern "C" void sum(::openmldb::base::UDFContext* ctx, int64_t input1, bool is_null, int64_t input2, bool is_null, int64_t* output, bool* is_null) { ``` -#### 2.1.5 单行函数开发 +#### 单行函数开发 单行函数(scalar function)对单行数据进行处理,返回单个值,比如 `abs`, `sin`, `cos`, `date`, `year` 等。 @@ -94,7 +95,7 @@ void cut2(::openmldb::base::UDFContext* ctx, ::openmldb::base::StringRef* input, } ``` -#### 2.1.6 聚合函数开发 +#### 聚合函数开发 聚合函数(aggregate function)对一个数据集(比如一列数据)执行计算,返回单个值,比如 `sum`, `avg`, `max`, `min`, `count` 等。 @@ -144,15 +145,15 @@ int64_t special_sum_output(::openmldb::base::UDFContext* ctx) { 更多udf/udaf实现参考[这里](../../../src/examples/test_udf.cc)。 -### 2.2 编译动态库 +### 编译动态库 - 拷贝include目录 `https://github.com/4paradigm/OpenMLDB/tree/main/include` 到某个路径下,下一步编译会用到。如/work/OpenMLDB/ - 执行编译命令,其中 -I 指定inlcude目录的路径 -o 指定产出动态库的名称 -- + ```shell g++ -shared -o libtest_udf.so examples/test_udf.cc -I /work/OpenMLDB/include -std=c++17 -fPIC ``` -### 2.3 拷贝动态库 +### 拷贝动态库 编译过的动态库需要被拷贝到 TaskManager 和 tablets中。如果 TaskManager 和 tablets中不存在`udf`目录,请先创建并重启这些进程(保证环境变量生效)。 - tablet的UDF目录是 `path_to_tablet/udf`。 - TaskManager的UDF目录是 
`path_to_taskmanager/taskmanager/bin/udf`。 @@ -181,7 +182,7 @@ g++ -shared -o libtest_udf.so examples/test_udf.cc -I /work/OpenMLDB/include -st - 在执行' DROP FUNCTION '之前请勿删除动态库。 ``` -### 2.4 注册、删除和查看函数 +### 注册、删除和查看函数 注册函数使用[CREATE FUNCTION](../openmldb_sql/ddl/CREATE_FUNCTION.md) 注册单行函数 diff --git a/docs/zh/quickstart/beginner_must_read.md b/docs/zh/quickstart/beginner_must_read.md index 60522283942..ad403a6b423 100644 --- a/docs/zh/quickstart/beginner_must_read.md +++ b/docs/zh/quickstart/beginner_must_read.md @@ -69,6 +69,16 @@ OpenMLDB是在线离线存储计算分离的,所以,你需要明确自己导 关于如何设计你的数据流入流出,可参考[实时决策系统中 OpenMLDB 的常见架构整合方式](../tutorial/app_arch.md)。 +### 在线表 + +在线表是存在内存中的数据,同时也会使用硬盘进行备份恢复。在线表的数据,可以通过`select count(*) from t1`来检查条数,或者使用`show table status`来查看表状态(可能有一定延迟,可以稍等再查)。 + +在线表是可以有多个索引的,通过`desc
`可以查看。写入一条数据时每个索引中都会写入一条,区别是各个索引的分类排序不同。但由于索引还有TTL淘汰机制,各个索引的数据量可能不一致。`select count(*) from t1`和`show table status`的结果是第一个索引的数据量,它并不代表其他索引的数据量。SQL查询会使用哪一个索引,是由SQL Engine选择的最优索引,可以通过SQL物理计划来查看。 + +建表时,可以指定索引,也可以不指定,不指定时,会默认创建一个索引。如果是默认索引,它无ts列(用当前time作为排序列,我们称为time索引)将会永不淘汰数据,可以以它为标准检查数据量是否准确,但这样的索引会占用太多的内存,目前也不可以删除第一条索引(计划未来支持),可以通过NS Client修改TTL淘汰数据,减少它的内存占用。 + +time索引(无ts的索引)还会影响PutIfAbsent导入。如果你的数据导入可能中途失败,无其他方法进行删除或去重,想要使用PutIfAbsent来进行导入重试时,请参考[PutIfAbsent说明](../openmldb_sql/dml/LOAD_DATA_STATEMENT.md#putifabsent说明)对自己的数据进行评估,避免PutIfAbsent效率太差。 + ## 源数据 ### LOAD DATA diff --git a/docs/zh/quickstart/concepts/modes.md b/docs/zh/quickstart/concepts/modes.md index b55ef32dd71..df8beaa79c6 100644 --- a/docs/zh/quickstart/concepts/modes.md +++ b/docs/zh/quickstart/concepts/modes.md @@ -58,7 +58,7 @@ OpenMLDB CLI 启动以后的**默认模式为离线模式**。离线数据导入 - 在线预览模式主要用于有限数据的预览,在 OpenMLDB CLI 或者 SDKs 执行 SELECT 直接查看数据可能出现数据截断;如果数据量较大,建议使用[导出工具](https://openmldb.ai/docs/zh/main/tutorial/data_export.html)查看完整数据。 - 在线预览模式的 SELECT 语句目前不支持 `LAST JOIN` 和 `ORDER BY` 等较复杂的查询,参考 [`SELECT`](https://openmldb.ai/docs/zh/main/openmldb_sql/dql/SELECT_STATEMENT.html)。 - 在线预览模式服务端均为单线程执行 SQL,对于大数据处理,会比较慢,有可能会触发超时,可以通过在客户端配置 `--request_timeout` 来提高超时时间。 -- 为了防止影响线上服务,在线预览模式控制了最大访问的条数和不同key的个数,可以通过`--max_traverse_cnt` 和 `--max_traverse_key_cnt` 来设置; +- 为了防止影响线上服务,可以在在线预览模式中控制最大访问的条数和不同key的个数,可以通过`--max_traverse_cnt` 和 `--max_traverse_key_cnt` 来设置; 同时,通过 `--scan_max_bytes_size` 来限制结果的大小。详细配置可参考[配置文件](../../deploy/conf.md)。 在线预览模式设置命令 (OpenMLDB CLI):`SET @@execute_mode='online'` diff --git a/docs/zh/quickstart/openmldb_quickstart.md b/docs/zh/quickstart/openmldb_quickstart.md index c9a0dee18a8..77b1c1e29c1 100644 --- a/docs/zh/quickstart/openmldb_quickstart.md +++ b/docs/zh/quickstart/openmldb_quickstart.md @@ -38,12 +38,16 @@ docker run -it 4pdosc/openmldb:0.8.4 bash ```bash /work/openmldb/bin/openmldb --zk_cluster=127.0.0.1:2181 --zk_root_path=/openmldb --role=sql_client +# 或脚本 +/work/openmldb/sbin/openmldb-cli.sh ``` 成功启动 OpenMLDB CLI 后如下图显示: ![image](./images/cli_cluster.png) +如果你需要对 OpenMLDB 集群进行配置修改,/work/init.sh使用的是sbin一键部署方式,具体参考[一键部署](../deploy/install_deploy.md#部署方式一一键部署推荐)。 + ## 使用流程 参照核心概念,使用 OpenMLDB 的流程一般包含:建立数据库和表、导入离线数据、离线特征计算、SQL 方案上线、导入在线数据、在线实时特征计算六个步骤。 diff --git a/hybridse/include/base/fe_status.h b/hybridse/include/base/fe_status.h index b91b8d8fb16..8f11a16a8c8 100644 --- a/hybridse/include/base/fe_status.h +++ b/hybridse/include/base/fe_status.h @@ -16,11 +16,12 @@ #ifndef HYBRIDSE_INCLUDE_BASE_FE_STATUS_H_ #define HYBRIDSE_INCLUDE_BASE_FE_STATUS_H_ + +#include #include #include -#include "glog/logging.h" + #include "proto/fe_common.pb.h" -#include "proto/fe_type.pb.h" namespace hybridse { namespace base { diff --git a/hybridse/include/codec/list_iterator_codec.h b/hybridse/include/codec/list_iterator_codec.h index c31f2e4bddc..e1dcbd6dfea 100644 --- a/hybridse/include/codec/list_iterator_codec.h +++ b/hybridse/include/codec/list_iterator_codec.h @@ -534,7 +534,7 @@ class ColumnIterator : public ConstIterator { ColumnIterator(ListV *list, const ColumnImpl *column_impl) : ConstIterator(), column_impl_(column_impl) { row_iter_ = list->GetIterator(); - if (!row_iter_) { + if (row_iter_ != nullptr) { row_iter_->SeekToFirst(); } } diff --git a/hybridse/include/node/expr_node.h b/hybridse/include/node/expr_node.h index 442064b6873..490e4d48c28 100644 --- a/hybridse/include/node/expr_node.h +++ b/hybridse/include/node/expr_node.h @@ -18,7 +18,6 @@ #define HYBRIDSE_INCLUDE_NODE_EXPR_NODE_H_ #include -#include 
#include "base/fe_status.h" #include "codec/fe_row_codec.h" diff --git a/hybridse/include/node/node_base.h b/hybridse/include/node/node_base.h index 8aa678c90a8..c6894f2b682 100644 --- a/hybridse/include/node/node_base.h +++ b/hybridse/include/node/node_base.h @@ -22,7 +22,6 @@ #include #include "base/fe_object.h" -#include "glog/logging.h" #include "node/node_enum.h" namespace hybridse { diff --git a/hybridse/include/node/node_enum.h b/hybridse/include/node/node_enum.h index 7c9ebf0ecbe..38d8336258f 100644 --- a/hybridse/include/node/node_enum.h +++ b/hybridse/include/node/node_enum.h @@ -17,9 +17,6 @@ #ifndef HYBRIDSE_INCLUDE_NODE_NODE_ENUM_H_ #define HYBRIDSE_INCLUDE_NODE_NODE_ENUM_H_ -#include -#include "proto/fe_common.pb.h" -#include "proto/fe_type.pb.h" namespace hybridse { namespace node { @@ -98,6 +95,7 @@ enum SqlNodeType { kAlterTableStmt, kShowStmt, kCompressType, + kColumnSchema, kSqlNodeTypeLast, // debug type }; @@ -143,7 +141,8 @@ enum ExprType { kExprIn, kExprEscaped, kExprArray, - kExprFake, // not a real one + kExprArrayElement, // extract value from a array or map, with `[]` operator + kExprFake, // not a real one kExprLast = kExprFake, }; @@ -175,9 +174,21 @@ enum DataType { kArray, // fixed size. In SQL: [1, 2, 3] or ARRAY[1, 2, 3] kDataTypeFake, // not a data type, for testing purpose only kLastDataType = kDataTypeFake, + // the tree type are not moved above kLastDataType for compatibility // it may necessary to do it in the further + + // kVoid + // A distinct data type: signifies no value or meaningful result. + // Typically used for function that does not returns value. kVoid = 100, + // kNull + // A special marker representing the absence of a value. + // Not a true data type but a placeholder for missing or unknown information. 
+ // A `NULL` literal can be eventually resolved to: + // - NULL of void type, if no extra info provided: 'SELECT NULL' + // - NULL of int (or any other) type, extra information provided, e.g with 'CAST' operator + // 'SELECT CAST(NULL as INT)' kNull = 101, kPlaceholder = 102 }; diff --git a/hybridse/include/node/node_manager.h b/hybridse/include/node/node_manager.h index 6949faf6f88..fdee40b20e9 100644 --- a/hybridse/include/node/node_manager.h +++ b/hybridse/include/node/node_manager.h @@ -21,7 +21,6 @@ #ifndef HYBRIDSE_INCLUDE_NODE_NODE_MANAGER_H_ #define HYBRIDSE_INCLUDE_NODE_NODE_MANAGER_H_ -#include #include #include #include @@ -166,16 +165,12 @@ class NodeManager { SqlNode *MakeInsertTableNode(const std::string &db_name, const std::string &table_name, const ExprListNode *column_names, - const ExprListNode *values); + const ExprListNode *values, InsertStmt::InsertMode insert_mode); CreateStmt *MakeCreateTableNode(bool op_if_not_exist, const std::string &db_name, const std::string &table_name, SqlNodeList *column_desc_list, SqlNodeList *partition_meta_list); - SqlNode *MakeColumnDescNode(const std::string &column_name, - const DataType data_type, - bool op_not_null, - ExprNode* default_value = nullptr); SqlNode *MakeColumnIndexNode(SqlNodeList *keys, SqlNode *ts, SqlNode *ttl, SqlNode *version); SqlNode *MakeColumnIndexNode(SqlNodeList *index_item_list); diff --git a/hybridse/include/node/sql_node.h b/hybridse/include/node/sql_node.h index 8d641ad8283..14e139bdd3f 100644 --- a/hybridse/include/node/sql_node.h +++ b/hybridse/include/node/sql_node.h @@ -450,9 +450,7 @@ class ExprNode : public SqlNode { uint32_t GetChildNum() const { return children_.size(); } const ExprType GetExprType() const { return expr_type_; } - void PushBack(ExprNode *node_ptr) { children_.push_back(node_ptr); } - std::vector children_; void Print(std::ostream &output, const std::string &org_tab) const override; virtual const std::string GetExprString() const; virtual const std::string GenerateExpressionName() const; @@ -542,6 +540,8 @@ class ExprNode : public SqlNode { static Status RlikeTypeAccept(node::NodeManager* nm, const TypeNode* lhs, const TypeNode* rhs, const TypeNode** output); + std::vector children_; + private: const TypeNode *output_type_ = nullptr; bool nullable_ = true; @@ -570,10 +570,26 @@ class ArrayExpr : public ExprNode { Status InferAttr(ExprAnalysisContext *ctx) override; - // array type may specific already in SQL, e.g. ARRAY[1,2,3] + // array type may specified type in SQL already, e.g. 
ARRAY[1,2,3] TypeNode* specific_type_ = nullptr; }; +// extract value from array or map value, using '[]' operator +class ArrayElementExpr : public ExprNode { + public: + ArrayElementExpr(ExprNode *array, ExprNode *pos); + ~ArrayElementExpr() override {} + + ExprNode *array() const; + ExprNode *position() const; + + void Print(std::ostream &output, const std::string &org_tab) const override; + const std::string GetExprString() const override; + ArrayElementExpr *ShadowCopy(NodeManager *nm) const override; + + Status InferAttr(ExprAnalysisContext *ctx) override; +}; + class FnNode : public SqlNode { public: FnNode() : SqlNode(kFn, 0, 0), indent(0) {} @@ -1836,48 +1852,86 @@ class ResTarget : public SqlNode { NodePointVector indirection_; /* subscripts, field names, and '*', or NIL */ }; +class ColumnSchemaNode : public SqlNode { + public: + ColumnSchemaNode(DataType type, bool attr_not_null, const ExprNode *default_val = nullptr) + : SqlNode(kColumnSchema, 0, 0), type_(type), not_null_(attr_not_null), default_value_(default_val) {} + + ColumnSchemaNode(DataType type, absl::Span generics, bool attr_not_null, + const ExprNode *default_val) + : SqlNode(kColumnSchema, 0, 0), + type_(type), + generics_(generics.begin(), generics.end()), + not_null_(attr_not_null), + default_value_(default_val) {} + ~ColumnSchemaNode() override {} + + DataType type() const { return type_; } + absl::Span generics() const { return generics_; } + bool not_null() const { return not_null_; } + const ExprNode *default_value() const { return default_value_; } + + std::string DebugString() const; + + private: + DataType type_; + std::vector generics_; + bool not_null_; + const ExprNode* default_value_ = nullptr; +}; + class ColumnDefNode : public SqlNode { public: - ColumnDefNode() : SqlNode(kColumnDesc, 0, 0), column_name_(""), column_type_() {} - ColumnDefNode(const std::string &name, const DataType &data_type, bool op_not_null, ExprNode *default_value) - : SqlNode(kColumnDesc, 0, 0), - column_name_(name), - column_type_(data_type), - op_not_null_(op_not_null), - default_value_(default_value) {} + ColumnDefNode(const std::string &name, const ColumnSchemaNode *schema) + : SqlNode(kColumnDesc, 0, 0), column_name_(name), schema_(schema) {} ~ColumnDefNode() {} std::string GetColumnName() const { return column_name_; } - DataType GetColumnType() const { return column_type_; } + DataType GetColumnType() const { return schema_->type(); } + + const ExprNode* GetDefaultValue() const { return schema_->default_value(); } - ExprNode* GetDefaultValue() const { return default_value_; } + bool GetIsNotNull() const { return schema_->not_null(); } - bool GetIsNotNull() const { return op_not_null_; } void Print(std::ostream &output, const std::string &org_tab) const; private: std::string column_name_; - DataType column_type_; - bool op_not_null_; - ExprNode* default_value_ = nullptr; + const ColumnSchemaNode* schema_; }; class InsertStmt : public SqlNode { public: + // ref zetasql ASTInsertStatement + enum InsertMode { + DEFAULT_MODE, // plain INSERT + REPLACE, // INSERT OR REPLACE + UPDATE, // INSERT OR UPDATE + IGNORE // INSERT OR IGNORE + }; + InsertStmt(const std::string &db_name, const std::string &table_name, const std::vector &columns, - const std::vector &values) + const std::vector &values, + InsertMode insert_mode) : SqlNode(kInsertStmt, 0, 0), db_name_(db_name), table_name_(table_name), columns_(columns), values_(values), - is_all_(columns.empty()) {} + is_all_(columns.empty()), + insert_mode_(insert_mode) {} - 
InsertStmt(const std::string &db_name, const std::string &table_name, const std::vector &values) - : SqlNode(kInsertStmt, 0, 0), db_name_(db_name), table_name_(table_name), values_(values), is_all_(true) {} + InsertStmt(const std::string &db_name, const std::string &table_name, const std::vector &values, + InsertMode insert_mode) + : SqlNode(kInsertStmt, 0, 0), + db_name_(db_name), + table_name_(table_name), + values_(values), + is_all_(true), + insert_mode_(insert_mode) {} void Print(std::ostream &output, const std::string &org_tab) const; const std::string db_name_; @@ -1885,6 +1939,7 @@ class InsertStmt : public SqlNode { const std::vector columns_; const std::vector values_; const bool is_all_; + const InsertMode insert_mode_; }; class StorageModeNode : public SqlNode { diff --git a/hybridse/include/node/type_node.h b/hybridse/include/node/type_node.h index e27ef34ce46..110b6329e59 100644 --- a/hybridse/include/node/type_node.h +++ b/hybridse/include/node/type_node.h @@ -21,6 +21,7 @@ #include #include "codec/fe_row_codec.h" +#include "node/expr_node.h" #include "node/sql_node.h" #include "vm/schemas_context.h" @@ -31,7 +32,7 @@ class NodeManager; class TypeNode : public SqlNode { public: - TypeNode() : SqlNode(node::kType, 0, 0), base_(hybridse::node::kNull) {} + TypeNode() : SqlNode(node::kType, 0, 0), base_(hybridse::node::kVoid) {} explicit TypeNode(hybridse::node::DataType base) : SqlNode(node::kType, 0, 0), base_(base), generics_({}) {} explicit TypeNode(hybridse::node::DataType base, const TypeNode *v1) @@ -48,44 +49,44 @@ class TypeNode : public SqlNode { generics_nullable_({false, false}) {} ~TypeNode() override {} - friend bool operator==(const TypeNode& lhs, const TypeNode& rhs); + friend bool operator==(const TypeNode &lhs, const TypeNode &rhs); + + // Return this node cast as a NodeType. + // Use only when this node is known to be that type, otherwise, behavior is undefined. 
+ template + const NodeType *GetAsOrNull() const { + static_assert(std::is_base_of::value, + "NodeType must be a member of the TypeNode class hierarchy"); + return dynamic_cast(this); + } + + template + NodeType *GetAsOrNull() { + static_assert(std::is_base_of::value, + "NodeType must be a member of the TypeNode class hierarchy"); + return dynamic_cast(this); + } // canonical name for the type // this affect the function generated by codegen - virtual const std::string GetName() const { - std::string type_name = DataTypeName(base_); - if (!generics_.empty()) { - for (auto type : generics_) { - type_name.append("_"); - type_name.append(type->GetName()); - } - } - return type_name; - } + virtual const std::string GetName() const; // readable string representation virtual std::string DebugString() const; - const hybridse::node::TypeNode *GetGenericType(size_t idx) const { - return generics_[idx]; - } + const hybridse::node::TypeNode *GetGenericType(size_t idx) const; bool IsGenericNullable(size_t idx) const { return generics_nullable_[idx]; } size_t GetGenericSize() const { return generics_.size(); } hybridse::node::DataType base() const { return base_; } - const std::vector &generics() const { - return generics_; - } + const std::vector &generics() const { return generics_; } - void AddGeneric(const node::TypeNode *dtype, bool nullable) { - generics_.push_back(dtype); - generics_nullable_.push_back(nullable); - } + void AddGeneric(const node::TypeNode *dtype, bool nullable); void Print(std::ostream &output, const std::string &org_tab) const override; - virtual bool Equals(const SqlNode *node) const; + bool Equals(const SqlNode *node) const override; TypeNode *ShadowCopy(NodeManager *) const override; TypeNode *DeepCopy(NodeManager *) const override; @@ -105,9 +106,22 @@ class TypeNode : public SqlNode { bool IsFloating() const; bool IsGeneric() const; + virtual bool IsMap() const { return false; } + virtual bool IsArray() const { return base_ == kArray; } + static Status CheckTypeNodeNotNull(const TypeNode *left_type); hybridse::node::DataType base_; + + // generics_ not empty if it is a complex data type: + // 1. base = ARRAY, generics = [ element_type ] + // 2. base = MAP, generics = [ key_type, value_type ] + // 3. base = STRUCT, generics = [ fileld_type, ... ] (unimplemented) + // inner types, not exists in SQL level + // 4. base = LIST, generics = [ element_type ] + // 5. base = ITERATOR, generics = [ element_type ] + // 6. base = TUPLE (like STRUCT), generics = [ element_type, ... ] + // 7. ... 
(might others, undocumented) std::vector generics_; std::vector generics_nullable_; }; @@ -120,9 +134,7 @@ class OpaqueTypeNode : public TypeNode { size_t bytes() const { return bytes_; } - const std::string GetName() const override { - return "opaque<" + std::to_string(bytes_) + ">"; - } + const std::string GetName() const override; OpaqueTypeNode *ShadowCopy(NodeManager *) const override; @@ -173,11 +185,28 @@ class FixedArrayType : public TypeNode { std::string DebugString() const override; FixedArrayType *ShadowCopy(NodeManager *) const override; + bool IsArray() const override { return true; } + private: const TypeNode* ele_ty_; uint64_t num_elements_; }; +class MapType : public TypeNode { + public: + MapType(const TypeNode *key_ty, const TypeNode *value_ty, bool value_not_null = false) ABSL_ATTRIBUTE_NONNULL(); + ~MapType() override; + + bool IsMap() const override { return true; } + + const TypeNode *key_type() const; + const TypeNode *value_type() const; + bool value_nullable() const; + + // test if input args can safely apply to a map function + static absl::StatusOr InferMapType(NodeManager *, absl::Span types); +}; + } // namespace node } // namespace hybridse #endif // HYBRIDSE_INCLUDE_NODE_TYPE_NODE_H_ diff --git a/hybridse/include/plan/plan_api.h b/hybridse/include/plan/plan_api.h index 0ad45f91f9f..1e4f3b74845 100644 --- a/hybridse/include/plan/plan_api.h +++ b/hybridse/include/plan/plan_api.h @@ -15,9 +15,13 @@ */ #ifndef HYBRIDSE_INCLUDE_PLAN_PLAN_API_H_ #define HYBRIDSE_INCLUDE_PLAN_PLAN_API_H_ + #include #include + #include "node/node_manager.h" +#include "vm/sql_ctx.h" + namespace hybridse { namespace plan { @@ -27,6 +31,10 @@ using hybridse::node::NodePointVector; using hybridse::node::PlanNodeList; class PlanAPI { public: + // parse SQL string to logic plan. ASTNode and LogicNode saved in SqlContext + static base::Status CreatePlanTreeFromScript(vm::SqlContext* ctx); + + // deprecated, use CreatePlanTreeFromScript(vm::SqlContext*) instead static bool CreatePlanTreeFromScript(const std::string& sql, PlanNodeList& plan_trees, // NOLINT NodeManager* node_manager, @@ -34,6 +42,7 @@ class PlanAPI { bool is_batch_mode = true, bool is_cluster = false, bool enable_batch_window_parallelization = false, const std::unordered_map* extra_options = nullptr); + static const int GetPlanLimitCount(node::PlanNode* plan_trees); static const std::string GenerateName(const std::string prefix, int id); }; diff --git a/hybridse/include/sdk/base.h b/hybridse/include/sdk/base.h index e5da48094f8..8d0cd4d9e1c 100644 --- a/hybridse/include/sdk/base.h +++ b/hybridse/include/sdk/base.h @@ -20,7 +20,6 @@ #include #include -#include #include #include #include diff --git a/hybridse/include/vm/sql_ctx.h b/hybridse/include/vm/sql_ctx.h new file mode 100644 index 00000000000..25182b86647 --- /dev/null +++ b/hybridse/include/vm/sql_ctx.h @@ -0,0 +1,91 @@ +/** + * Copyright (c) 2023 OpenMLDB Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef HYBRIDSE_INCLUDE_VM_SQL_CTX_H_ +#define HYBRIDSE_INCLUDE_VM_SQL_CTX_H_ + +#include +#include +#include + +#include "node/node_manager.h" +#include "vm/engine_context.h" + +namespace zetasql { +class ParserOutput; +} + +namespace hybridse { +namespace vm { + +class HybridSeJitWrapper; +class ClusterJob; + +struct SqlContext { + // mode: batch|request|batch request + ::hybridse::vm::EngineMode engine_mode; + bool is_cluster_optimized = false; + bool is_batch_request_optimized = false; + bool enable_expr_optimize = false; + bool enable_batch_window_parallelization = true; + bool enable_window_column_pruning = false; + + // the sql content + std::string sql; + // the database + std::string db; + + std::unique_ptr ast_node; + // the logical plan + ::hybridse::node::PlanNodeList logical_plan; + ::hybridse::vm::PhysicalOpNode* physical_plan = nullptr; + + std::shared_ptr cluster_job; + // TODO(wangtaize) add a light jit engine + // eg using bthead to compile ir + hybridse::vm::JitOptions jit_options; + std::shared_ptr jit = nullptr; + Schema schema; + Schema request_schema; + std::string request_db_name; + std::string request_name; + Schema parameter_types; + uint32_t row_size; + uint32_t limit_cnt = 0; + std::string ir; + std::string logical_plan_str; + std::string physical_plan_str; + std::string encoded_schema; + std::string encoded_request_schema; + ::hybridse::node::NodeManager nm; + ::hybridse::udf::UdfLibrary* udf_library = nullptr; + + ::hybridse::vm::BatchRequestInfo batch_request_info; + + std::shared_ptr> options; + + // [ALPHA] SQL diagnostic infos + // not standardized, only index hints, no error, no warning, no other hint/info + std::shared_ptr index_hints; + + SqlContext(); + ~SqlContext(); +}; + +} // namespace vm +} // namespace hybridse + +#endif // HYBRIDSE_INCLUDE_VM_SQL_CTX_H_ diff --git a/hybridse/src/case/sql_case.cc b/hybridse/src/case/sql_case.cc index c98defb679b..be0633dc703 100644 --- a/hybridse/src/case/sql_case.cc +++ b/hybridse/src/case/sql_case.cc @@ -751,7 +751,7 @@ const std::string SqlCase::case_name() const { } bool SqlCase::ExtractInputTableDef(type::TableDef& table, int32_t input_idx) const { - if (inputs_.size() <= input_idx) { + if (inputs_.size() <= static_cast(input_idx)) { return false; } return ExtractInputTableDef(inputs_[input_idx], table); diff --git a/hybridse/src/codegen/aggregate_ir_builder.cc b/hybridse/src/codegen/aggregate_ir_builder.cc index 19e2a4f5cc3..22de3d3d742 100644 --- a/hybridse/src/codegen/aggregate_ir_builder.cc +++ b/hybridse/src/codegen/aggregate_ir_builder.cc @@ -21,10 +21,10 @@ #include #include +#include "codegen/buf_ir_builder.h" #include "codegen/expr_ir_builder.h" #include "codegen/ir_base_builder.h" #include "codegen/variable_ir_builder.h" -#include "gflags/gflags.h" #include "glog/logging.h" namespace hybridse { namespace codegen { diff --git a/hybridse/src/codegen/array_ir_builder.cc b/hybridse/src/codegen/array_ir_builder.cc index 5bf1bf06e99..5f3d22edc5c 100644 --- a/hybridse/src/codegen/array_ir_builder.cc +++ b/hybridse/src/codegen/array_ir_builder.cc @@ -17,26 +17,26 @@ #include "codegen/array_ir_builder.h" #include + +#include "codegen/context.h" #include "codegen/ir_base_builder.h" namespace hybridse { namespace codegen { +#define SZ_IDX 2 +#define RAW_IDX 0 +#define NULL_IDX 1 + ArrayIRBuilder::ArrayIRBuilder(::llvm::Module* m, llvm::Type* ele_ty) : StructTypeIRBuilder(m), element_type_(ele_ty) { InitStructType(); } -ArrayIRBuilder::ArrayIRBuilder(::llvm::Module* m, llvm::Type* ele_ty, llvm::Value* 
num_ele) - : StructTypeIRBuilder(m), element_type_(ele_ty), num_elements_(num_ele) { - InitStructType(); -} - void ArrayIRBuilder::InitStructType() { // name must unique between different array type std::string name = absl::StrCat("fe.array_", GetLlvmObjectString(element_type_)); - ::llvm::StringRef sr(name); - ::llvm::StructType* stype = m_->getTypeByName(sr); + ::llvm::StructType* stype = m_->getTypeByName(name); if (stype != NULL) { struct_type_ = stype; return; @@ -46,29 +46,36 @@ void ArrayIRBuilder::InitStructType() { ::llvm::Type* arr_type = element_type_->getPointerTo(); ::llvm::Type* nullable_type = ::llvm::IntegerType::getInt1Ty(m_->getContext())->getPointerTo(); ::llvm::Type* size_type = ::llvm::IntegerType::getInt64Ty(m_->getContext()); - std::vector<::llvm::Type*> elements = {arr_type, nullable_type, size_type}; - stype->setBody(::llvm::ArrayRef<::llvm::Type*>(elements)); + stype->setBody({arr_type, nullable_type, size_type}); struct_type_ = stype; } -base::Status ArrayIRBuilder::NewFixedArray(llvm::BasicBlock* bb, const std::vector& elements, - NativeValue* output) const { - // TODO(ace): reduce IR size with loop block - - CHECK_TRUE(num_elements_ != nullptr, common::kCodegenError, "num elements unknown"); - +absl::StatusOr ArrayIRBuilder::Construct(CodeGenContext* ctx, + absl::Span elements) const { + auto bb = ctx->GetCurrentBlock(); // alloc array struct llvm::Value* array_alloca = nullptr; - CHECK_TRUE(Create(bb, &array_alloca), common::kCodegenError, "can't create struct type for array"); + if (!Allocate(bb, &array_alloca)) { + return absl::InternalError("can't create struct type for array"); + } // ============================ // Init array elements // ============================ llvm::IRBuilder<> builder(bb); + auto num_elements = ctx->GetBuilder()->getInt64(elements.size()); + if (!Set(bb, array_alloca, SZ_IDX, num_elements)) { + return absl::InternalError("fail to set array size"); + } + + if (elements.empty()) { + // empty array + return NativeValue::Create(array_alloca); + } // init raw array and nullable array - auto* raw_array_ptr = builder.CreateAlloca(element_type_, num_elements_); - auto* nullables_ptr = builder.CreateAlloca(builder.getInt1Ty(), num_elements_); + auto* raw_array_ptr = builder.CreateAlloca(element_type_, num_elements); + auto* nullables_ptr = builder.CreateAlloca(builder.getInt1Ty(), num_elements); // fullfill the array struct auto* idx_val_ptr = builder.CreateAlloca(builder.getInt64Ty()); @@ -88,41 +95,26 @@ base::Status ArrayIRBuilder::NewFixedArray(llvm::BasicBlock* bb, const std::vect } // Set raw array - CHECK_TRUE(Set(bb, array_alloca, 0, raw_array_ptr), common::kCodegenError); + if (!Set(bb, array_alloca, RAW_IDX, raw_array_ptr)) { + return absl::InternalError("fail to set array values"); + } // Set nullable list - CHECK_TRUE(Set(bb, array_alloca, 1, nullables_ptr), common::kCodegenError); - - ::llvm::Value* array_sz = builder.CreateLoad(idx_val_ptr); - CHECK_TRUE(Set(bb, array_alloca, 2, array_sz), common::kCodegenError); - - *output = NativeValue::Create(array_alloca); - return base::Status::OK(); -} - - -base::Status ArrayIRBuilder::NewEmptyArray(llvm::BasicBlock* bb, NativeValue* output) const { - llvm::Value* array_alloca = nullptr; - CHECK_TRUE(Create(bb, &array_alloca), common::kCodegenError, "can't create struct type for array"); - - llvm::IRBuilder<> builder(bb); - - ::llvm::Value* array_sz = builder.getInt64(0); - CHECK_TRUE(Set(bb, array_alloca, 2, array_sz), common::kCodegenError); - - *output = 
NativeValue::Create(array_alloca); + if (!Set(bb, array_alloca, NULL_IDX, nullables_ptr)) { + return absl::InternalError("fail to set array nulls"); + } - return base::Status::OK(); + return NativeValue::Create(array_alloca); } bool ArrayIRBuilder::CreateDefault(::llvm::BasicBlock* block, ::llvm::Value** output) { llvm::Value* array_alloca = nullptr; - if (!Create(block, &array_alloca)) { + if (!Allocate(block, &array_alloca)) { return false; } llvm::IRBuilder<> builder(block); ::llvm::Value* array_sz = builder.getInt64(0); - if (!Set(block, array_alloca, 2, array_sz)) { + if (!Set(block, array_alloca, SZ_IDX, array_sz)) { return false; } diff --git a/hybridse/src/codegen/array_ir_builder.h b/hybridse/src/codegen/array_ir_builder.h index 66ef2fe05da..b6ff275ac45 100644 --- a/hybridse/src/codegen/array_ir_builder.h +++ b/hybridse/src/codegen/array_ir_builder.h @@ -17,9 +17,6 @@ #ifndef HYBRIDSE_SRC_CODEGEN_ARRAY_IR_BUILDER_H_ #define HYBRIDSE_SRC_CODEGEN_ARRAY_IR_BUILDER_H_ -#include - -#include "absl/base/attributes.h" #include "codegen/struct_ir_builder.h" namespace hybridse { @@ -29,27 +26,15 @@ namespace codegen { // - Array of raw values: T* // - Array of nullable values: bool* // - array size: int64 -class ArrayIRBuilder : public StructTypeIRBuilder { +class ArrayIRBuilder : public StructTypeIRBuilder { public: // Array builder with num elements unknown ArrayIRBuilder(::llvm::Module* m, llvm::Type* ele_ty); - // Array builder with num elements known at some point - ArrayIRBuilder(::llvm::Module* m, llvm::Type* ele_ty, llvm::Value* num_ele); - ~ArrayIRBuilder() override {} // create a new array from `elements` as value - ABSL_MUST_USE_RESULT - base::Status NewFixedArray(llvm::BasicBlock* bb, const std::vector& elements, - NativeValue* output) const; - - ABSL_MUST_USE_RESULT - base::Status NewEmptyArray(llvm::BasicBlock* bb, NativeValue* output) const; - - void InitStructType() override; - - bool CreateDefault(::llvm::BasicBlock* block, ::llvm::Value** output) override; + absl::StatusOr Construct(CodeGenContext* ctx, absl::Span args) const override; bool CopyFrom(::llvm::BasicBlock* block, ::llvm::Value* src, ::llvm::Value* dist) override { return true; } @@ -57,9 +42,13 @@ class ArrayIRBuilder : public StructTypeIRBuilder { CHECK_TRUE(false, common::kCodegenError, "casting to array un-implemented"); }; + private: + void InitStructType() override; + + bool CreateDefault(::llvm::BasicBlock* block, ::llvm::Value** output) override; + private: ::llvm::Type* element_type_ = nullptr; - ::llvm::Value* num_elements_ = nullptr; }; } // namespace codegen diff --git a/hybridse/src/codegen/block_ir_builder.cc b/hybridse/src/codegen/block_ir_builder.cc index 6f53e80aa40..818229553ca 100644 --- a/hybridse/src/codegen/block_ir_builder.cc +++ b/hybridse/src/codegen/block_ir_builder.cc @@ -15,15 +15,15 @@ */ #include "codegen/block_ir_builder.h" + #include "codegen/context.h" #include "codegen/expr_ir_builder.h" +#include "codegen/ir_base_builder.h" #include "codegen/list_ir_builder.h" #include "codegen/struct_ir_builder.h" #include "codegen/type_ir_builder.h" #include "codegen/variable_ir_builder.h" #include "glog/logging.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/IR/CFG.h" #include "llvm/IR/IRBuilder.h" using ::hybridse::common::kCodegenError; diff --git a/hybridse/src/codegen/date_ir_builder.cc b/hybridse/src/codegen/date_ir_builder.cc index 19bf319d7c3..1bfb1d31160 100644 --- a/hybridse/src/codegen/date_ir_builder.cc +++ b/hybridse/src/codegen/date_ir_builder.cc @@ -55,7 +55,7 @@ bool 
DateIRBuilder::NewDate(::llvm::BasicBlock* block, ::llvm::Value** output) { return false; } ::llvm::Value* date; - if (!Create(block, &date)) { + if (!Allocate(block, &date)) { return false; } if (!SetDate(block, date, @@ -73,7 +73,7 @@ bool DateIRBuilder::NewDate(::llvm::BasicBlock* block, ::llvm::Value* days, return false; } ::llvm::Value* date; - if (!Create(block, &date)) { + if (!Allocate(block, &date)) { return false; } if (!SetDate(block, date, days)) { diff --git a/hybridse/src/codegen/date_ir_builder.h b/hybridse/src/codegen/date_ir_builder.h index d9004d48da1..1d51cc98ceb 100644 --- a/hybridse/src/codegen/date_ir_builder.h +++ b/hybridse/src/codegen/date_ir_builder.h @@ -28,8 +28,6 @@ class DateIRBuilder : public StructTypeIRBuilder { explicit DateIRBuilder(::llvm::Module* m); ~DateIRBuilder(); - void InitStructType() override; - bool CreateDefault(::llvm::BasicBlock* block, ::llvm::Value** output) override; bool CopyFrom(::llvm::BasicBlock* block, ::llvm::Value* src, ::llvm::Value* dist) override; base::Status CastFrom(::llvm::BasicBlock* block, const NativeValue& src, NativeValue* output) override; @@ -46,6 +44,9 @@ class DateIRBuilder : public StructTypeIRBuilder { ::llvm::Value** output, base::Status& status); // NOLINT bool Year(::llvm::BasicBlock* block, ::llvm::Value* date, ::llvm::Value** output, base::Status& status); // NOLINT + private: + void InitStructType() override; + bool CreateDefault(::llvm::BasicBlock* block, ::llvm::Value** output) override; }; } // namespace codegen } // namespace hybridse diff --git a/hybridse/src/codegen/expr_ir_builder.cc b/hybridse/src/codegen/expr_ir_builder.cc index 6b95bfb8ce1..ccf3838cbcf 100644 --- a/hybridse/src/codegen/expr_ir_builder.cc +++ b/hybridse/src/codegen/expr_ir_builder.cc @@ -19,8 +19,10 @@ #include #include #include +#include #include "base/numeric.h" +#include "codegen/arithmetic_expr_ir_builder.h" #include "codegen/array_ir_builder.h" #include "codegen/buf_ir_builder.h" #include "codegen/cond_select_ir_builder.h" @@ -28,11 +30,19 @@ #include "codegen/date_ir_builder.h" #include "codegen/ir_base_builder.h" #include "codegen/list_ir_builder.h" +#include "codegen/map_ir_builder.h" +#include "codegen/predicate_expr_ir_builder.h" +#include "codegen/scope_var.h" #include "codegen/timestamp_ir_builder.h" #include "codegen/type_ir_builder.h" #include "codegen/udf_ir_builder.h" +#include "codegen/variable_ir_builder.h" #include "codegen/window_ir_builder.h" #include "glog/logging.h" +#include "llvm/IR/IRBuilder.h" +#include "node/node_manager.h" +#include "node/type_node.h" +#include "passes/resolve_fn_and_attrs.h" #include "proto/fe_common.pb.h" #include "udf/default_udf_library.h" #include "vm/schemas_context.h" @@ -199,6 +209,10 @@ Status ExprIRBuilder::Build(const ::hybridse::node::ExprNode* node, CHECK_STATUS(BuildArrayExpr(dynamic_cast(node), output)); break; } + case ::hybridse::node::kExprArrayElement: { + CHECK_STATUS(BuildArrayElement(dynamic_cast(node), output)); + break; + } default: { return Status(kCodegenError, "Expression Type " + @@ -1157,13 +1171,6 @@ Status ExprIRBuilder::BuildArrayExpr(const ::hybridse::node::ArrayExpr* node, Na llvm::IRBuilder<> builder(ctx_->GetCurrentBlock()); - if (node->GetChildNum() == 0) { - // build empty array - ArrayIRBuilder ir_builder(ctx_->GetModule(), ele_type); - CHECK_STATUS(ir_builder.NewEmptyArray(ctx_->GetCurrentBlock(), output)); - return Status::OK(); - } - CastExprIRBuilder cast_builder(ctx_->GetCurrentBlock()); std::vector elements; for (auto& ele : node->children_) { 
@@ -1178,11 +1185,46 @@ Status ExprIRBuilder::BuildArrayExpr(const ::hybridse::node::ArrayExpr* node, Na } } - ::llvm::Value* num_elements = builder.getInt64(elements.size()); - ArrayIRBuilder array_builder(ctx_->GetModule(), ele_type, num_elements); - CHECK_STATUS(array_builder.NewFixedArray(ctx_->GetCurrentBlock(), elements, output)); + ArrayIRBuilder array_builder(ctx_->GetModule(), ele_type); + auto rs = array_builder.Construct(ctx_, elements); + if (!rs.ok()) { + FAIL_STATUS(kCodegenError, rs.status()); + } + + *output = rs.value(); return Status::OK(); } +Status ExprIRBuilder::BuildArrayElement(const ::hybridse::node::ArrayElementExpr* expr, NativeValue* output) { + auto* arr_type = expr->array()->GetOutputType(); + NativeValue arr_val; + CHECK_STATUS(Build(expr->array(), &arr_val)); + + NativeValue pos_val; + CHECK_STATUS(Build(expr->position(), &pos_val)); + + std::unique_ptr type_builder; + + if (arr_type->IsMap()) { + auto* map_type = arr_type->GetAsOrNull(); + ::llvm::Type* key_type = nullptr; + ::llvm::Type* value_type = nullptr; + CHECK_TRUE(GetLlvmType(ctx_->GetModule(), map_type->key_type(), &key_type), kCodegenError); + CHECK_TRUE(GetLlvmType(ctx_->GetModule(), map_type->value_type(), &value_type), kCodegenError); + type_builder.reset(new MapIRBuilder(ctx_->GetModule(), key_type, value_type)); + } else if (arr_type->IsArray()) { + ::llvm::Type* ele_type = nullptr; + CHECK_TRUE(GetLlvmType(ctx_->GetModule(), arr_type->GetGenericType(0), &ele_type), kCodegenError); + type_builder.reset(new ArrayIRBuilder(ctx_->GetModule(), ele_type)); + } else { + return {common::kCodegenError, absl::StrCat("can't get element from type ", arr_type->DebugString())}; + } + + auto res = type_builder->ExtractElement(ctx_, arr_val, pos_val); + CHECK_TRUE(res.ok(), common::kCodegenError, res.status().ToString()); + *output = res.value(); + + return {}; +} } // namespace codegen } // namespace hybridse diff --git a/hybridse/src/codegen/expr_ir_builder.h b/hybridse/src/codegen/expr_ir_builder.h index 6838d96a88b..051c9a32bfd 100644 --- a/hybridse/src/codegen/expr_ir_builder.h +++ b/hybridse/src/codegen/expr_ir_builder.h @@ -17,24 +17,12 @@ #ifndef HYBRIDSE_SRC_CODEGEN_EXPR_IR_BUILDER_H_ #define HYBRIDSE_SRC_CODEGEN_EXPR_IR_BUILDER_H_ -#include -#include #include #include + #include "base/fe_status.h" -#include "codegen/arithmetic_expr_ir_builder.h" -#include "codegen/buf_ir_builder.h" -#include "codegen/predicate_expr_ir_builder.h" -#include "codegen/row_ir_builder.h" -#include "codegen/scope_var.h" -#include "codegen/variable_ir_builder.h" -#include "codegen/window_ir_builder.h" -#include "llvm/IR/IRBuilder.h" -#include "node/node_manager.h" +#include "codegen/context.h" #include "node/sql_node.h" -#include "node/type_node.h" -#include "passes/resolve_fn_and_attrs.h" -#include "vm/schemas_context.h" namespace hybridse { namespace codegen { @@ -117,6 +105,8 @@ class ExprIRBuilder { Status BuildArrayExpr(const ::hybridse::node::ArrayExpr* node, NativeValue* output); + Status BuildArrayElement(const ::hybridse::node::ArrayElementExpr*, NativeValue*); + private: CodeGenContext* ctx_; diff --git a/hybridse/src/codegen/fn_let_ir_builder.cc b/hybridse/src/codegen/fn_let_ir_builder.cc index 362e4a83df6..6d8e86e3933 100644 --- a/hybridse/src/codegen/fn_let_ir_builder.cc +++ b/hybridse/src/codegen/fn_let_ir_builder.cc @@ -15,13 +15,14 @@ */ #include "codegen/fn_let_ir_builder.h" + #include "codegen/aggregate_ir_builder.h" +#include "codegen/buf_ir_builder.h" #include "codegen/context.h" #include 
"codegen/expr_ir_builder.h" #include "codegen/ir_base_builder.h" #include "codegen/variable_ir_builder.h" #include "glog/logging.h" -#include "vm/transform.h" using ::hybridse::common::kCodegenError; diff --git a/hybridse/src/codegen/ir_base_builder.cc b/hybridse/src/codegen/ir_base_builder.cc index 992d41d0998..81fadbfdd3d 100644 --- a/hybridse/src/codegen/ir_base_builder.cc +++ b/hybridse/src/codegen/ir_base_builder.cc @@ -556,7 +556,24 @@ bool GetFullType(node::NodeManager* nm, ::llvm::Type* type, return false; } case hybridse::node::kMap: { - LOG(WARNING) << "fail to get type for map"; + if (type->isPointerTy()) { + auto type_pointee = type->getPointerElementType(); + if (type_pointee->isStructTy()) { + auto* key_type = type_pointee->getStructElementType(1); + const node::TypeNode* key = nullptr; + if (key_type->isPointerTy() && !GetFullType(nm, key_type->getPointerElementType(), &key)) { + return false; + } + const node::TypeNode* value = nullptr; + auto* value_type = type_pointee->getStructElementType(2); + if (value_type->isPointerTy() && !GetFullType(nm, value_type->getPointerElementType(), &value)) { + return false; + } + + *type_node = nm->MakeNode(key, value); + return true; + } + } return false; } default: { @@ -643,6 +660,9 @@ bool GetBaseType(::llvm::Type* type, ::hybridse::node::DataType* output) { } else if (struct_name.startswith("fe.array_")) { *output = hybridse::node::kArray; return true; + } else if (struct_name.startswith("fe.map_")) { + *output = hybridse::node::kMap; + return true; } LOG(WARNING) << "no mapping pointee_ty for llvm pointee_ty " << pointee_ty->getStructName().str(); diff --git a/hybridse/src/codegen/ir_base_builder_test.h b/hybridse/src/codegen/ir_base_builder_test.h index 478d8ae5ea3..af29e4fd56c 100644 --- a/hybridse/src/codegen/ir_base_builder_test.h +++ b/hybridse/src/codegen/ir_base_builder_test.h @@ -22,8 +22,8 @@ #include #include +#include "codegen/ir_base_builder.h" #include "llvm/IR/Verifier.h" -#include "llvm/Support/InitLLVM.h" #include "llvm/Support/TargetSelect.h" #include "base/fe_status.h" @@ -34,8 +34,7 @@ #include "passes/resolve_fn_and_attrs.h" #include "udf/default_udf_library.h" #include "udf/literal_traits.h" -#include "udf/udf.h" -#include "vm/sql_compiler.h" +#include "vm/jit_wrapper.h" namespace hybridse { namespace codegen { @@ -360,8 +359,7 @@ void ModuleFunctionBuilderWithFullInfo::ExpandApplyArg( ::llvm::Value* alloca; if (TypeIRBuilder::IsStructPtr(expect_ty)) { auto struct_builder = - StructTypeIRBuilder::CreateStructTypeIRBuilder( - function->getEntryBlock().getModule(), expect_ty); + StructTypeIRBuilder::CreateStructTypeIRBuilder(function->getEntryBlock().getModule(), expect_ty); struct_builder->CreateDefault(&function->getEntryBlock(), &alloca); arg = builder.CreateSelect( diff --git a/hybridse/src/codegen/map_ir_builder.cc b/hybridse/src/codegen/map_ir_builder.cc new file mode 100644 index 00000000000..8945c88f9b7 --- /dev/null +++ b/hybridse/src/codegen/map_ir_builder.cc @@ -0,0 +1,326 @@ +/* + * Copyright 2022 OpenMLDB authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "codegen/map_ir_builder.h" + +#include + +#include "absl/status/status.h" +#include "codegen/array_ir_builder.h" +#include "codegen/cast_expr_ir_builder.h" +#include "codegen/context.h" +#include "codegen/ir_base_builder.h" +#include "codegen/cond_select_ir_builder.h" +#include "codegen/predicate_expr_ir_builder.h" + +namespace hybridse { +namespace codegen { + +static const char* PREFIX = "fe.map"; +#define SZ_IDX 0 +#define KEY_VEC_IDX 1 +#define VALUE_VEC_IDX 2 +#define VALUE_NULL_VEC_IDX 3 + +MapIRBuilder::MapIRBuilder(::llvm::Module* m, ::llvm::Type* key_ty, ::llvm::Type* value_ty) + : StructTypeIRBuilder(m), key_type_(key_ty), value_type_(value_ty) { + InitStructType(); +} + +void MapIRBuilder::InitStructType() { + std::string name = + absl::StrCat(PREFIX, "__", GetLlvmObjectString(key_type_), "_", GetLlvmObjectString(value_type_), "__"); + ::llvm::StringRef sr(name); + ::llvm::StructType* stype = m_->getTypeByName(sr); + if (stype != NULL) { + struct_type_ = stype; + return; + } + stype = ::llvm::StructType::create(m_->getContext(), name); + + // %map__{key}_{value}__ = { size, vec, vec, vec } + ::llvm::Type* size_type = ::llvm::IntegerType::getInt64Ty(m_->getContext()); + // ::llvm::Type* key_vec = ::llvm::VectorType::get(key_type_, {MIN_VEC_SIZE, true}); + LOG(INFO) << "key vec is " << GetLlvmObjectString(key_type_); + ::llvm::Type* key_vec = key_type_->getPointerTo(); + ::llvm::Type* value_vec = value_type_->getPointerTo(); + ::llvm::Type* value_null_type = ::llvm::IntegerType::getInt1Ty(m_->getContext())->getPointerTo(); + stype->setBody({size_type, key_vec, value_vec, value_null_type}); + struct_type_ = stype; +} + +absl::StatusOr MapIRBuilder::Construct(CodeGenContext* ctx, absl::Span args) const { + EnsureOK(); + + ::llvm::Value* map_alloca = nullptr; + if (!Allocate(ctx->GetCurrentBlock(), &map_alloca)) { + return absl::FailedPreconditionError(absl::StrCat("unable to allocate ", GetLlvmObjectString(struct_type_))); + } + + auto builder = ctx->GetBuilder(); + auto* original_size = builder->getInt64(args.size() / 2); + auto* key_vec = builder->CreateAlloca(key_type_, original_size, "key_vec"); + auto* value_vec = builder->CreateAlloca(value_type_, original_size, "value_vec"); + auto* value_nulls_vec = builder->CreateAlloca(builder->getInt1Ty(), original_size, "value_nulls_vec"); + + // creating raw values for map + + CastExprIRBuilder cast_builder(ctx->GetCurrentBlock()); + + // original vector, may contain duplicate keys + auto* original_keys = builder->CreateAlloca(key_type_, original_size, "original_keys"); + auto* original_keys_is_null = builder->CreateAlloca(builder->getInt1Ty(), original_size, "original_keys_is_null"); + auto* original_values = builder->CreateAlloca(value_type_, original_size, "original_values"); + auto* original_values_is_null = + builder->CreateAlloca(builder->getInt1Ty(), original_size, "original_values_is_null"); + for (size_t i = 0; i < args.size(); i += 2) { + auto* update_idx = builder->getInt64(i / 2); + NativeValue key = args[i]; + if (key.GetValue(builder)->getType() != key_type_) { + auto s = cast_builder.Cast(key, key_type_, &key); + if (!s.isOK()) { + return absl::InternalError(absl::StrCat("fail to cast map key: ", s.str())); + } + } + NativeValue value = args[i + 1]; + if (value.GetValue(builder)->getType() != value_type_) { + auto s = cast_builder.Cast(value, value_type_, &value); + if (!s.isOK()) { + return
absl::InternalError(absl::StrCat("fail to cast map value: ", s.str())); + } + } + builder->CreateStore(key.GetIsNull(ctx), builder->CreateGEP(original_keys_is_null, update_idx)); + builder->CreateStore(key.GetValue(ctx), builder->CreateGEP(original_keys, update_idx)); + builder->CreateStore(value.GetIsNull(ctx), builder->CreateGEP(original_values_is_null, update_idx)); + builder->CreateStore(value.GetValue(ctx), builder->CreateGEP(original_values, update_idx)); + } + + ::llvm::Value* update_idx_ptr = builder->CreateAlloca(builder->getInt64Ty(), nullptr, "update_idx"); + builder->CreateStore(builder->getInt64(0), update_idx_ptr); + ::llvm::Value* true_idx_ptr = builder->CreateAlloca(builder->getInt64Ty(), nullptr, "true_idx"); + builder->CreateStore(builder->getInt64(0), true_idx_ptr); + + auto s = ctx->CreateWhile( + [&](llvm::Value** cond) -> base::Status { + *cond = builder->CreateAnd( + builder->CreateICmpSLT(builder->CreateLoad(update_idx_ptr), original_size, "if_while_true"), + builder->CreateICmpSLT(builder->CreateLoad(true_idx_ptr), original_size)); + return {}; + }, + [&]() -> base::Status { + auto idx = builder->CreateLoad(update_idx_ptr, "update_idx_value"); + auto true_idx = builder->CreateLoad(true_idx_ptr, "true_idx_value"); + CHECK_STATUS(ctx->CreateBranchNot( + builder->CreateLoad(builder->CreateGEP(original_keys_is_null, idx)), [&]() -> base::Status { + // write to map if key is not null + builder->CreateStore(builder->CreateLoad(builder->CreateGEP(original_keys, idx)), + builder->CreateGEP(key_vec, true_idx)); + builder->CreateStore(builder->CreateLoad(builder->CreateGEP(original_values, idx)), + builder->CreateGEP(value_vec, true_idx)); + builder->CreateStore(builder->CreateLoad(builder->CreateGEP(original_values_is_null, idx)), + builder->CreateGEP(value_nulls_vec, true_idx)); + + builder->CreateStore(builder->CreateAdd(builder->getInt64(1), true_idx), true_idx_ptr); + return {}; + })); + + builder->CreateStore(builder->CreateAdd(builder->getInt64(1), idx), update_idx_ptr); + return {}; + }); + if (!s.isOK()) { + return absl::InternalError(s.str()); + } + + auto* final_size = builder->CreateLoad(true_idx_ptr, "true_size"); + auto as = Set(ctx, map_alloca, {final_size, key_vec, value_vec, value_nulls_vec}); + + if (!as.ok()) { + return as; + } + + return NativeValue::Create(map_alloca); +} + +bool MapIRBuilder::CreateDefault(::llvm::BasicBlock* block, ::llvm::Value** output) { + llvm::Value* map_alloca = nullptr; + if (!Allocate(block, &map_alloca)) { + return false; + } + + llvm::IRBuilder<> builder(block); + ::llvm::Value* size = builder.getInt64(0); + if (!Set(block, map_alloca, SZ_IDX, size)) { + return false; + } + + *output = map_alloca; + return true; +} + +absl::StatusOr MapIRBuilder::ExtractElement(CodeGenContext* ctx, const NativeValue& arr, + const NativeValue& key) const { + EnsureOK(); + + auto builder = ctx->GetBuilder(); + auto* arr_is_null = arr.GetIsNull(ctx); + auto* key_is_null = key.GetIsNull(ctx); + + auto* out_val_alloca = builder->CreateAlloca(value_type_); + builder->CreateStore(::llvm::UndefValue::get(value_type_), out_val_alloca); + auto* out_null_alloca = builder->CreateAlloca(builder->getInt1Ty()); + builder->CreateStore(builder->getInt1(true), out_null_alloca); + + auto s = ctx->CreateBranch( + builder->CreateOr(arr_is_null, key_is_null), + [&]() -> base::Status { + return {}; + }, + [&]() -> base::Status { + NativeValue casted_key = key; + if (key.GetType() != key_type_) { + CastExprIRBuilder cast_builder(ctx->GetCurrentBlock()); +
CHECK_STATUS(cast_builder.Cast(key, key_type_, &casted_key)); + } + auto* key_val = casted_key.GetValue(ctx); + + auto* map_ptr = arr.GetValue(ctx); + ::llvm::Value* sz = nullptr; + CHECK_TRUE(Load(ctx->GetCurrentBlock(), map_ptr, SZ_IDX, &sz), common::kCodegenError); + + ::llvm::Value* keys = nullptr; + CHECK_TRUE(Load(ctx->GetCurrentBlock(), map_ptr, KEY_VEC_IDX, &keys), common::kCodegenError); + + ::llvm::Value* idx_alloc = builder->CreateAlloca(builder->getInt64Ty()); + builder->CreateStore(builder->getInt64(0), idx_alloc); + ::llvm::Value* found_idx_alloc = builder->CreateAlloca(builder->getInt64Ty()); + builder->CreateStore(builder->getInt64(-1), found_idx_alloc); + + CHECK_STATUS(ctx->CreateWhile( + [&](::llvm::Value** cond) -> base::Status { + ::llvm::Value* idx = builder->CreateLoad(idx_alloc); + ::llvm::Value* found = builder->CreateLoad(found_idx_alloc); + *cond = builder->CreateAnd(builder->CreateICmpSLT(idx, sz), + builder->CreateICmpSLT(found, builder->getInt64(0))); + return {}; + }, + [&]() -> base::Status { + ::llvm::Value* idx = builder->CreateLoad(idx_alloc); + // key never null + auto* ele = builder->CreateLoad(builder->CreateGEP(keys, idx)); + ::llvm::Value* eq = nullptr; + base::Status s; + PredicateIRBuilder::BuildEqExpr(ctx->GetCurrentBlock(), ele, key_val, &eq, s); + CHECK_STATUS(s); + + ::llvm::Value* update_found_idx = builder->CreateSelect(eq, idx, builder->getInt64(-1)); + + builder->CreateStore(update_found_idx, found_idx_alloc); + builder->CreateStore(builder->CreateAdd(idx, builder->getInt64(1)), idx_alloc); + return {}; + })); + + auto* found_idx = builder->CreateLoad(found_idx_alloc); + + CHECK_STATUS(ctx->CreateBranch( + builder->CreateAnd(builder->CreateICmpSLT(found_idx, sz), + builder->CreateICmpSGE(found_idx, builder->getInt64(0))), + [&]() -> base::Status { + ::llvm::Value* values = nullptr; + CHECK_TRUE(Load(ctx->GetCurrentBlock(), map_ptr, VALUE_VEC_IDX, &values), common::kCodegenError); + + ::llvm::Value* value_nulls = nullptr; + CHECK_TRUE(Load(ctx->GetCurrentBlock(), map_ptr, VALUE_NULL_VEC_IDX, &value_nulls), + common::kCodegenError); + + auto* val = builder->CreateLoad(builder->CreateGEP(values, found_idx)); + auto* val_nullable = builder->CreateLoad(builder->CreateGEP(value_nulls, found_idx)); + + builder->CreateStore(val, out_val_alloca); + builder->CreateStore(val_nullable, out_null_alloca); + return {}; + }, + [&]() -> base::Status { return {}; })); + + return {}; + }); + + if (!s.isOK()) { + return absl::InvalidArgumentError(s.str()); + } + + auto* out_val = builder->CreateLoad(out_val_alloca); + auto* out_null_val = builder->CreateLoad(out_null_alloca); + + return NativeValue::CreateWithFlag(out_val, out_null_val); +} + +absl::StatusOr MapIRBuilder::MapKeys(CodeGenContext* ctx, const NativeValue& in) const { + EnsureOK(); + + auto map_is_null = in.GetIsNull(ctx); + auto map_ptr = in.GetValue(ctx); + + auto builder = ctx->GetBuilder(); + ::llvm::Value* keys_ptr = nullptr; + if (!Load(ctx->GetCurrentBlock(), map_ptr, KEY_VEC_IDX, &keys_ptr)) { + return absl::FailedPreconditionError("failed to extract map keys"); + } + if (!keys_ptr->getType()->isPointerTy()) { + return absl::FailedPreconditionError("map keys entry is not pointer"); + } + ::llvm::Value* size = nullptr; + if (!Load(ctx->GetCurrentBlock(), map_ptr, SZ_IDX, &size)) { + return absl::FailedPreconditionError("failed to extract map size"); + } + + // construct nulls as [false ...] 
+ auto nulls = builder->CreateAlloca(builder->getInt1Ty(), size); + auto idx_ptr = builder->CreateAlloca(builder->getInt64Ty()); + builder->CreateStore(builder->getInt64(0), idx_ptr); + ctx->CreateWhile( + [&](::llvm::Value** cond) -> base::Status { + *cond = builder->CreateICmpSLT(builder->CreateLoad(idx_ptr), size); + return {}; + }, + [&]() -> base::Status { + auto idx = builder->CreateLoad(idx_ptr); + + builder->CreateStore(builder->getInt1(false), builder->CreateGEP(nulls, idx)); + + builder->CreateStore(builder->CreateAdd(idx, builder->getInt64(1)), idx_ptr); + return {}; + }); + + ArrayIRBuilder array_builder(ctx->GetModule(), keys_ptr->getType()->getPointerElementType()); + auto rs = array_builder.ConstructFromRaw(ctx, {keys_ptr, nulls, size}); + + if (!rs.ok()) { + return rs.status(); + } + + NativeValue out; + CondSelectIRBuilder cond_builder; + auto s = cond_builder.Select(ctx->GetCurrentBlock(), NativeValue::Create(map_is_null), + NativeValue::CreateNull(array_builder.GetType()), NativeValue::Create(rs.value()), &out); + + if (!s.isOK()) { + return absl::FailedPreconditionError(s.str()); + } + + return out; +} +} // namespace codegen +} // namespace hybridse diff --git a/hybridse/src/codegen/map_ir_builder.h b/hybridse/src/codegen/map_ir_builder.h new file mode 100644 index 00000000000..478c6cc975b --- /dev/null +++ b/hybridse/src/codegen/map_ir_builder.h @@ -0,0 +1,55 @@ +/* + * Copyright 2022 OpenMLDB authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef HYBRIDSE_SRC_CODEGEN_MAP_IR_BUILDER_H_ +#define HYBRIDSE_SRC_CODEGEN_MAP_IR_BUILDER_H_ + +#include "codegen/struct_ir_builder.h" + +namespace hybridse { +namespace codegen { + +class MapIRBuilder final : public StructTypeIRBuilder { + public: + MapIRBuilder(::llvm::Module* m, ::llvm::Type* key_ty, ::llvm::Type* value_ty); + ~MapIRBuilder() override {} + + absl::StatusOr Construct(CodeGenContext* ctx, absl::Span args) const override; + + bool CopyFrom(::llvm::BasicBlock* block, ::llvm::Value* src, ::llvm::Value* dist) override { return true; } + base::Status CastFrom(::llvm::BasicBlock* block, const NativeValue& src, NativeValue* output) override { + return {}; + } + + absl::StatusOr ExtractElement(CodeGenContext* ctx, const NativeValue&, + const NativeValue&) const override; + + absl::StatusOr MapKeys(CodeGenContext*, const NativeValue&) const; + + private: + void InitStructType() override; + + bool CreateDefault(::llvm::BasicBlock* block, ::llvm::Value** output) override; + + private: + ::llvm::Type* key_type_ = nullptr; + ::llvm::Type* value_type_ = nullptr; +}; + +} // namespace codegen +} // namespace hybridse + +#endif // HYBRIDSE_SRC_CODEGEN_MAP_IR_BUILDER_H_ diff --git a/hybridse/src/codegen/string_ir_builder.cc b/hybridse/src/codegen/string_ir_builder.cc index 8c41d326ee0..083c907fbe4 100644 --- a/hybridse/src/codegen/string_ir_builder.cc +++ b/hybridse/src/codegen/string_ir_builder.cc @@ -66,7 +66,7 @@ bool StringIRBuilder::CreateDefault(::llvm::BasicBlock* block, bool StringIRBuilder::NewString(::llvm::BasicBlock* block, ::llvm::Value** output) { - if (!Create(block, output)) { + if (!Allocate(block, output)) { LOG(WARNING) << "Fail to Create Default String"; return false; } @@ -86,7 +86,7 @@ bool StringIRBuilder::NewString(::llvm::BasicBlock* block, } bool StringIRBuilder::NewString(::llvm::BasicBlock* block, ::llvm::Value* size, ::llvm::Value* data, ::llvm::Value** output) { - if (!Create(block, output)) { + if (!Allocate(block, output)) { LOG(WARNING) << "Fail to Create Default String"; return false; } diff --git a/hybridse/src/codegen/struct_ir_builder.cc b/hybridse/src/codegen/struct_ir_builder.cc index 7adfb5d950f..4b0be401065 100644 --- a/hybridse/src/codegen/struct_ir_builder.cc +++ b/hybridse/src/codegen/struct_ir_builder.cc @@ -15,10 +15,15 @@ */ #include "codegen/struct_ir_builder.h" + +#include "absl/status/status.h" +#include "absl/strings/substitute.h" +#include "codegen/context.h" #include "codegen/date_ir_builder.h" #include "codegen/ir_base_builder.h" #include "codegen/string_ir_builder.h" #include "codegen/timestamp_ir_builder.h" + namespace hybridse { namespace codegen { StructTypeIRBuilder::StructTypeIRBuilder(::llvm::Module* m) @@ -54,6 +59,8 @@ StructTypeIRBuilder* StructTypeIRBuilder::CreateStructTypeIRBuilder(::llvm::Modu } absl::StatusOr StructTypeIRBuilder::CreateNull(::llvm::BasicBlock* block) { + EnsureOK(); + ::llvm::Value* value = nullptr; if (!CreateDefault(block, &value)) { return absl::InternalError(absl::StrCat("fail to construct ", GetLlvmObjectString(GetType()))); @@ -62,16 +69,17 @@ absl::StatusOr StructTypeIRBuilder::CreateNull(::llvm::BasicBlock* return NativeValue::CreateWithFlag(value, builder.getInt1(true)); } -::llvm::Type* StructTypeIRBuilder::GetType() { return struct_type_; } +::llvm::Type* StructTypeIRBuilder::GetType() const { return struct_type_; } -bool StructTypeIRBuilder::Create(::llvm::BasicBlock* block, +bool StructTypeIRBuilder::Allocate(::llvm::BasicBlock* block, ::llvm::Value** output) const { if (block == 
NULL || output == NULL) { LOG(WARNING) << "the output ptr or block is NULL "; return false; } ::llvm::IRBuilder<> builder(block); - ::llvm::Value* value = CreateAllocaAtHead(&builder, struct_type_, "struct_alloca"); + // value is a pointer to struct type + ::llvm::Value* value = CreateAllocaAtHead(&builder, struct_type_, GetLlvmObjectString(struct_type_)); *output = value; return true; } @@ -96,22 +104,10 @@ bool StructTypeIRBuilder::Set(::llvm::BasicBlock* block, ::llvm::Value* struct_v LOG(WARNING) << "Fail set Struct value: struct pointer is required"; return false; } - if (struct_value->getType()->getPointerElementType() != struct_type_) { - LOG(WARNING) << "Fail set Struct value: struct value type invalid " - << struct_value->getType() - ->getPointerElementType() - ->getStructName() - .str(); - return false; - } + ::llvm::IRBuilder<> builder(block); - builder.getInt64(1); - ::llvm::Value* value_ptr = - builder.CreateStructGEP(struct_type_, struct_value, idx); - if (nullptr == builder.CreateStore(value, value_ptr)) { - LOG(WARNING) << "Fail Set Struct Value idx = " << idx; - return false; - } + ::llvm::Value* value_ptr = builder.CreateStructGEP(struct_type_, struct_value, idx); + builder.CreateStore(value, value_ptr); return true; } @@ -137,5 +133,77 @@ bool StructTypeIRBuilder::Get(::llvm::BasicBlock* block, ::llvm::Value* struct_v *output = builder.CreateStructGEP(struct_type_, struct_value, idx); return true; } +absl::StatusOr StructTypeIRBuilder::Construct(CodeGenContext* ctx, + absl::Span args) const { + return absl::UnimplementedError(absl::StrCat("Construct for type ", GetLlvmObjectString(struct_type_))); +} + +absl::StatusOr<::llvm::Value*> StructTypeIRBuilder::ConstructFromRaw(CodeGenContext* ctx, + absl::Span<::llvm::Value* const> args) const { + EnsureOK(); + + llvm::Value* alloca = nullptr; + if (!Allocate(ctx->GetCurrentBlock(), &alloca)) { + return absl::FailedPreconditionError("failed to allocate array"); + } + + auto s = Set(ctx, alloca, args); + if (!s.ok()) { + return s; + } + + return alloca; +} + +absl::StatusOr StructTypeIRBuilder::ExtractElement(CodeGenContext* ctx, const NativeValue& arr, + const NativeValue& key) const { + return absl::UnimplementedError( + absl::StrCat("extract element unimplemented for ", GetLlvmObjectString(struct_type_))); +} + +void StructTypeIRBuilder::EnsureOK() const { + assert(struct_type_ != nullptr); + // it's an identified type + assert(!struct_type_->getName().empty()); +} +std::string StructTypeIRBuilder::GetTypeDebugString() const { return GetLlvmObjectString(struct_type_); } + +absl::Status StructTypeIRBuilder::Set(CodeGenContext* ctx, ::llvm::Value* struct_value, + absl::Span<::llvm::Value* const> members) const { + if (ctx == nullptr || struct_value == nullptr) { + return absl::InvalidArgumentError("ctx or struct pointer is null"); + } + + if (!IsStructPtr(struct_value->getType())) { + return absl::InvalidArgumentError( + absl::StrCat("value not a struct pointer: ", GetLlvmObjectString(struct_value->getType()))); + } + + if (struct_value->getType()->getPointerElementType() != struct_type_) { + return absl::InvalidArgumentError(absl::Substitute("input value has different type, expect $0 but got $1", + GetLlvmObjectString(struct_type_), + GetLlvmObjectString(struct_value->getType()))); + } + + if (members.size() != struct_type_->getNumElements()) { + return absl::InvalidArgumentError(absl::Substitute("struct $0 requires exactly $1 members, but got $2", + GetLlvmObjectString(struct_type_), + struct_type_->getNumElements(),
members.size())); + } + + for (unsigned idx = 0; idx < struct_type_->getNumElements(); ++idx) { + auto ele_type = struct_type_->getElementType(idx); + if (ele_type != members[idx]->getType()) { + return absl::InvalidArgumentError(absl::Substitute("$0th member: expect $1 but got $2", idx, + GetLlvmObjectString(ele_type), + GetLlvmObjectString(members[idx]->getType()))); + } + ::llvm::Value* value_ptr = ctx->GetBuilder()->CreateStructGEP(struct_type_, struct_value, idx); + ctx->GetBuilder()->CreateStore(members[idx], value_ptr); + } + + return absl::OkStatus(); +} + } // namespace codegen } // namespace hybridse diff --git a/hybridse/src/codegen/struct_ir_builder.h b/hybridse/src/codegen/struct_ir_builder.h index e197665855b..f9b6ca30731 100644 --- a/hybridse/src/codegen/struct_ir_builder.h +++ b/hybridse/src/codegen/struct_ir_builder.h @@ -17,6 +17,8 @@ #ifndef HYBRIDSE_SRC_CODEGEN_STRUCT_IR_BUILDER_H_ #define HYBRIDSE_SRC_CODEGEN_STRUCT_IR_BUILDER_H_ +#include + #include "absl/status/statusor.h" #include "base/fe_status.h" #include "codegen/native_value.h" @@ -27,20 +29,46 @@ namespace codegen { class StructTypeIRBuilder : public TypeIRBuilder { public: + // TODO(ace): construct with CodeGenContext instead of llvm::Module explicit StructTypeIRBuilder(::llvm::Module*); ~StructTypeIRBuilder(); static StructTypeIRBuilder* CreateStructTypeIRBuilder(::llvm::Module*, ::llvm::Type*); static bool StructCopyFrom(::llvm::BasicBlock* block, ::llvm::Value* src, ::llvm::Value* dist); - virtual void InitStructType() = 0; virtual bool CopyFrom(::llvm::BasicBlock* block, ::llvm::Value* src, ::llvm::Value* dist) = 0; virtual base::Status CastFrom(::llvm::BasicBlock* block, const NativeValue& src, NativeValue* output) = 0; - virtual bool CreateDefault(::llvm::BasicBlock* block, ::llvm::Value** output) = 0; + // construct the default null safe struct absl::StatusOr CreateNull(::llvm::BasicBlock* block); - ::llvm::Type* GetType(); - bool Create(::llvm::BasicBlock* block, ::llvm::Value** output) const; + + virtual bool CreateDefault(::llvm::BasicBlock* block, ::llvm::Value** output) = 0; + + // Allocate and Initialize the struct value from args, each element in list represent exact argument in SQL literal. + // So for map data type, we create it in SQL with `map(key1, value1, ...)`, args is key or value for the result map + virtual absl::StatusOr Construct(CodeGenContext* ctx, absl::Span args) const; + + // construct struct value from llvm values, each element in list represent exact + // llvm struct field at that index + virtual absl::StatusOr<::llvm::Value*> ConstructFromRaw(CodeGenContext* ctx, + absl::Span<::llvm::Value* const> args) const; + + // Extract element value from composite data type + // 1. extract from array type by index + // 2. extract from struct type by field name + // 3. 
extract from map type by key + virtual absl::StatusOr ExtractElement(CodeGenContext* ctx, const NativeValue& arr, + const NativeValue& key) const; + + ::llvm::Type* GetType() const; + + std::string GetTypeDebugString() const; + + protected: + virtual void InitStructType() = 0; + + // allocate the given struct on current stack, no initialization + bool Allocate(::llvm::BasicBlock* block, ::llvm::Value** output) const; // Load the 'idx' th field into ''*output' // NOTE: not all types are loaded correctly, e.g for array type @@ -50,9 +78,13 @@ class StructTypeIRBuilder : public TypeIRBuilder { // Get the address of 'idx' th field bool Get(::llvm::BasicBlock* block, ::llvm::Value* struct_value, unsigned int idx, ::llvm::Value** output) const; + absl::Status Set(CodeGenContext* ctx, ::llvm::Value* struct_value, absl::Span<::llvm::Value* const> members) const; + + void EnsureOK() const; + protected: ::llvm::Module* m_; - ::llvm::Type* struct_type_; + ::llvm::StructType* struct_type_; }; } // namespace codegen } // namespace hybridse diff --git a/hybridse/src/codegen/timestamp_ir_builder.cc b/hybridse/src/codegen/timestamp_ir_builder.cc index c3a8054e1cd..a07c29ee3de 100644 --- a/hybridse/src/codegen/timestamp_ir_builder.cc +++ b/hybridse/src/codegen/timestamp_ir_builder.cc @@ -267,7 +267,7 @@ bool TimestampIRBuilder::NewTimestamp(::llvm::BasicBlock* block, return false; } ::llvm::Value* timestamp; - if (!Create(block, ×tamp)) { + if (!Allocate(block, ×tamp)) { return false; } if (!SetTs(block, timestamp, @@ -286,7 +286,7 @@ bool TimestampIRBuilder::NewTimestamp(::llvm::BasicBlock* block, return false; } ::llvm::Value* timestamp; - if (!Create(block, ×tamp)) { + if (!Allocate(block, ×tamp)) { return false; } if (!SetTs(block, timestamp, ts)) { diff --git a/hybridse/src/codegen/type_ir_builder.cc b/hybridse/src/codegen/type_ir_builder.cc index 07adfb21855..0cba6015b9d 100644 --- a/hybridse/src/codegen/type_ir_builder.cc +++ b/hybridse/src/codegen/type_ir_builder.cc @@ -103,11 +103,7 @@ bool TypeIRBuilder::IsStringPtr(::llvm::Type* type) { } bool TypeIRBuilder::IsStructPtr(::llvm::Type* type) { - if (type->getTypeID() == ::llvm::Type::PointerTyID) { - type = reinterpret_cast<::llvm::PointerType*>(type)->getElementType(); - return type->isStructTy(); - } - return false; + return type->isPointerTy() && type->getPointerElementType()->isStructTy(); } base::Status TypeIRBuilder::UnaryOpTypeInfer( diff --git a/hybridse/src/codegen/udf_ir_builder.cc b/hybridse/src/codegen/udf_ir_builder.cc index 5030f3cd8ae..c9f613e5748 100644 --- a/hybridse/src/codegen/udf_ir_builder.cc +++ b/hybridse/src/codegen/udf_ir_builder.cc @@ -16,6 +16,8 @@ #include "codegen/udf_ir_builder.h" +#include +#include #include #include "codegen/context.h" @@ -172,7 +174,7 @@ Status UdfIRBuilder::BuildCodeGenUdfCall( } NativeValue gen_output; - CHECK_STATUS(gen_impl->gen(ctx_, args, &gen_output)); + CHECK_STATUS(gen_impl->gen(ctx_, args, {fn->GetReturnType(), fn->IsReturnNullable()}, &gen_output)); if (ret_null != nullptr) { if (gen_output.IsNullable()) { diff --git a/hybridse/src/node/expr_node.cc b/hybridse/src/node/expr_node.cc index 44acc336cef..8ad099a98b4 100644 --- a/hybridse/src/node/expr_node.cc +++ b/hybridse/src/node/expr_node.cc @@ -19,8 +19,6 @@ #include "absl/strings/str_cat.h" #include "absl/strings/substitute.h" #include "codec/fe_row_codec.h" -#include "codegen/arithmetic_expr_ir_builder.h" -#include "codegen/type_ir_builder.h" #include "node/node_manager.h" #include "node/sql_node.h" #include 
"passes/expression/expr_pass.h" @@ -210,18 +208,26 @@ Status ExprNode::IsCastAccept(node::NodeManager* nm, const TypeNode* src, // this handles compatible type when both lhs and rhs are basic types // composited types like array, list, tuple are not handled correctly, so do not expect the function to handle those -// types absl::StatusOr ExprNode::CompatibleType(NodeManager* nm, const TypeNode* lhs, const TypeNode* rhs) { if (*lhs == *rhs) { // include Null = Null return rhs; } + + if (lhs->base() == kVoid && rhs->base() == kNull) { + return lhs; + } + + if (lhs->base() == kNull && rhs->base() == kVoid) { + return rhs; + } + if (lhs->IsNull()) { - // NULL + T -> T + // NULL/VOID + T -> T return rhs; } if (rhs->IsNull()) { - // T + NULL -> T + // T + NULL/VOID -> T return lhs; } @@ -845,21 +851,15 @@ Status ArrayExpr::InferAttr(ExprAnalysisContext* ctx) { return Status::OK(); } - // auto top_type = ctx->node_manager()->MakeTypeNode(kArray); TypeNode* top_type = nullptr; auto nm = ctx->node_manager(); - if (children_.empty()) { - FAIL_STATUS(kTypeError, "element type unknown for empty array expression"); - } else { - const TypeNode* ele_type = children_[0]->GetOutputType(); - for (size_t i = 1; i < children_.size() ; ++i) { - auto res = CompatibleType(ctx->node_manager(), ele_type, children_[i]->GetOutputType()); - CHECK_TRUE(res.ok(), kTypeError, res.status()); - ele_type = res.value(); - } - CHECK_TRUE(!ele_type->IsNull(), kTypeError, "unable to infer array type, all elements are null"); - top_type = nm->MakeArrayType(ele_type, children_.size()); + const TypeNode* ele_type = nm->MakeNode(); // void type + for (size_t i = 0; i < children_.size(); ++i) { + auto res = CompatibleType(ctx->node_manager(), ele_type, children_[i]->GetOutputType()); + CHECK_TRUE(res.ok(), kTypeError, res.status()); + ele_type = res.value(); } + top_type = nm->MakeArrayType(ele_type, children_.size()); SetOutputType(top_type); // array is nullable SetNullable(true); @@ -1142,5 +1142,50 @@ ExprNode* ExprNode::DeepCopy(NodeManager* nm) const { return root; } +ArrayElementExpr::ArrayElementExpr(ExprNode* array, ExprNode* pos) : ExprNode(kExprArrayElement) { + AddChild(array); + AddChild(pos); +} + +void ArrayElementExpr::Print(std::ostream& output, const std::string& org_tab) const { + // Print for ExprNode just talk too much, I don't intend impl that + // GetExprString is much simpler + output << org_tab << GetExprString(); +} + +const std::string ArrayElementExpr::GetExprString() const { + return absl::StrCat(array()->GetExprString(), "[", position()->GetExprString(), "]"); +} + +ArrayElementExpr* ArrayElementExpr::ShadowCopy(NodeManager* nm) const { + return nm->MakeNode(array(), position()); +} + +Status ArrayElementExpr::InferAttr(ExprAnalysisContext* ctx) { + auto* arr_type = array()->GetOutputType(); + auto* pos_type = position()->GetOutputType(); + + if (arr_type->IsMap()) { + auto map_type = arr_type->GetAsOrNull(); + CHECK_TRUE(node::ExprNode::IsSafeCast(pos_type, map_type->key_type()), common::kTypeError, + "incompatiable key type for ArrayElement, expect ", map_type->key_type()->DebugString(), ", got ", + pos_type->DebugString()); + + SetOutputType(map_type->value_type()); + SetNullable(map_type->value_nullable()); + } else if (arr_type->IsArray()) { + CHECK_TRUE(pos_type->IsInteger(), common::kTypeError, + "index type mismatch for ArrayElement, expect integer, got ", pos_type->DebugString()); + CHECK_TRUE(arr_type->GetGenericSize() == 1, common::kTypeError, "internal error: array of empty T"); + + 
SetOutputType(arr_type->GetGenericType(0)); + SetNullable(arr_type->IsGenericNullable(0)); + } else { + FAIL_STATUS(common::kTypeError, "can't get element from ", arr_type->DebugString(), ", expect map or array"); + } + return {}; +} +ExprNode *ArrayElementExpr::array() const { return GetChild(0); } +ExprNode *ArrayElementExpr::position() const { return GetChild(1); } } // namespace node } // namespace hybridse diff --git a/hybridse/src/node/node_manager.cc b/hybridse/src/node/node_manager.cc index 86d51249e19..5b1d18e5973 100644 --- a/hybridse/src/node/node_manager.cc +++ b/hybridse/src/node/node_manager.cc @@ -484,12 +484,6 @@ SqlNode *NodeManager::MakeColumnIndexNode(SqlNodeList *keys, SqlNode *ts, SqlNod return RegisterNode(node_ptr); } -SqlNode *NodeManager::MakeColumnDescNode(const std::string &column_name, const DataType data_type, bool op_not_null, - ExprNode *default_value) { - SqlNode *node_ptr = new ColumnDefNode(column_name, data_type, op_not_null, default_value); - return RegisterNode(node_ptr); -} - SqlNodeList *NodeManager::MakeNodeList() { SqlNodeList *new_list_ptr = new SqlNodeList(); RegisterNode(new_list_ptr); @@ -792,9 +786,10 @@ AllNode *NodeManager::MakeAllNode(const std::string &relation_name, const std::s } SqlNode *NodeManager::MakeInsertTableNode(const std::string &db_name, const std::string &table_name, - const ExprListNode *columns_expr, const ExprListNode *values) { + const ExprListNode *columns_expr, const ExprListNode *values, + InsertStmt::InsertMode insert_mode) { if (nullptr == columns_expr) { - InsertStmt *node_ptr = new InsertStmt(db_name, table_name, values->children_); + InsertStmt *node_ptr = new InsertStmt(db_name, table_name, values->children_, insert_mode); return RegisterNode(node_ptr); } else { std::vector column_names; @@ -811,7 +806,7 @@ SqlNode *NodeManager::MakeInsertTableNode(const std::string &db_name, const std: } } } - InsertStmt *node_ptr = new InsertStmt(db_name, table_name, column_names, values->children_); + InsertStmt *node_ptr = new InsertStmt(db_name, table_name, column_names, values->children_, insert_mode); return RegisterNode(node_ptr); } } diff --git a/hybridse/src/node/plan_node_test.cc b/hybridse/src/node/plan_node_test.cc index 5ffb76142a7..aac111f8bf3 100644 --- a/hybridse/src/node/plan_node_test.cc +++ b/hybridse/src/node/plan_node_test.cc @@ -234,11 +234,12 @@ TEST_F(PlanNodeTest, ExtractColumnsAndIndexsTest) { index_node->SetName("index1"); CreatePlanNode *node = manager_->MakeCreateTablePlanNode( "", "t1", - {manager_->MakeColumnDescNode("col1", node::kInt32, true), - manager_->MakeColumnDescNode("col2", node::kInt32, true), - manager_->MakeColumnDescNode("col3", node::kFloat, true), - manager_->MakeColumnDescNode("col4", node::kVarchar, true), - manager_->MakeColumnDescNode("col5", node::kTimestamp, true), index_node}, + {manager_->MakeNode("col1", manager_->MakeNode(node::kInt32, true)), + manager_->MakeNode("col2", manager_->MakeNode(node::kInt32, true)), + manager_->MakeNode("col3", manager_->MakeNode(node::kFloat, true)), + manager_->MakeNode("col4", manager_->MakeNode(node::kVarchar, true)), + manager_->MakeNode("col5", manager_->MakeNode(node::kTimestamp, true)), + index_node}, {manager_->MakeReplicaNumNode(3), manager_->MakePartitionNumNode(8), manager_->MakeNode(kMemory)}, false); diff --git a/hybridse/src/node/sql_node.cc b/hybridse/src/node/sql_node.cc index 9114bad2d53..f5543d6e8b8 100644 --- a/hybridse/src/node/sql_node.cc +++ b/hybridse/src/node/sql_node.cc @@ -17,7 +17,6 @@ #include "node/sql_node.h" 
#include -#include #include #include #include @@ -142,6 +141,7 @@ static absl::flat_hash_map CreateExprTypeNamesMap() {kExprOrderExpression, "order"}, {kExprEscaped, "escape"}, {kExprArray, "array"}, + {kExprArrayElement, "array element"}, }; for (auto kind = 0; kind < ExprType::kExprLast; ++kind) { DCHECK(map.find(static_cast(kind)) != map.end()); @@ -1185,6 +1185,7 @@ static absl::flat_hash_map CreateSqlNodeTypeToNa {kDynamicUdafFnDef, "kDynamicUdafFnDef"}, {kWithClauseEntry, "kWithClauseEntry"}, {kAlterTableStmt, "kAlterTableStmt"}, + {kColumnSchema, "kColumnSchema"}, }; for (auto kind = 0; kind < SqlNodeType::kSqlNodeTypeLast; ++kind) { DCHECK(map.find(static_cast(kind)) != map.end()) @@ -1454,19 +1455,35 @@ void CreateTableLikeClause::Print(std::ostream &output, const std::string &tab) output << "\n"; } +std::string ColumnSchemaNode::DebugString() const { + auto res = DataTypeName(type()); + if (!generics().empty()) { + absl::StrAppend(&res, "<", + absl::StrJoin(generics(), ", ", + [](std::string *out, const ColumnSchemaNode *in) { + absl::StrAppend(out, in->DebugString()); + }), + ">"); + } + + if (not_null()) { + absl::StrAppend(&res, " NOT NULL"); + } + + if (default_value()) { + absl::StrAppend(&res, " DEFAULT ", default_value()->GetExprString()); + } + + return res; +} + void ColumnDefNode::Print(std::ostream &output, const std::string &org_tab) const { SqlNode::Print(output, org_tab); const std::string tab = org_tab + INDENT + SPACE_ED; output << "\n"; - PrintValue(output, tab, column_name_, "column_name", false); - output << "\n"; - PrintValue(output, tab, DataTypeName(column_type_), "column_type", false); + PrintValue(output, tab, GetColumnName(), "column_name", false); output << "\n"; - PrintValue(output, tab, std::to_string(op_not_null_), "NOT NULL", !default_value_); - if (default_value_) { - output << "\n"; - PrintSqlNode(output, tab, default_value_, "default_value", true); - } + PrintValue(output, tab, schema_->DebugString(), "column_type", true); } void ColumnIndexNode::SetTTL(ExprListNode *ttl_node_list) { @@ -1995,25 +2012,6 @@ void StructExpr::Print(std::ostream &output, const std::string &org_tab) const { PrintSqlNode(output, tab, methods_, "methods", true); } -void TypeNode::Print(std::ostream &output, const std::string &org_tab) const { - SqlNode::Print(output, org_tab); - const std::string tab = org_tab + INDENT + SPACE_ED; - - output << "\n"; - PrintValue(output, tab, GetName(), "type", true); -} -bool TypeNode::Equals(const SqlNode *node) const { - if (!SqlNode::Equals(node)) { - return false; - } - - const TypeNode *that = dynamic_cast(node); - return this->base_ == that->base_ && - std::equal( - this->generics_.cbegin(), this->generics_.cend(), that->generics_.cbegin(), - [&](const hybridse::node::TypeNode *a, const hybridse::node::TypeNode *b) { return TypeEquals(a, b); }); -} - void JoinNode::Print(std::ostream &output, const std::string &org_tab) const { TableRefNode::Print(output, org_tab); diff --git a/hybridse/src/node/sql_node_test.cc b/hybridse/src/node/sql_node_test.cc index e2938656dcc..67bb861a812 100644 --- a/hybridse/src/node/sql_node_test.cc +++ b/hybridse/src/node/sql_node_test.cc @@ -209,11 +209,11 @@ TEST_F(SqlNodeTest, MakeWindowDefNodetTest) { ExprListNode *partitions = node_manager_->MakeExprList(); ExprNode *ptr1 = node_manager_->MakeColumnRefNode("keycol", ""); - partitions->PushBack(ptr1); + partitions->AddChild(ptr1); ExprNode *ptr2 = node_manager_->MakeOrderExpression(node_manager_->MakeColumnRefNode("col1", ""), true); ExprListNode 
*orders = node_manager_->MakeExprList(); - orders->PushBack(ptr2); + orders->AddChild(ptr2); int64_t maxsize = 0; SqlNode *frame = @@ -286,29 +286,30 @@ TEST_F(SqlNodeTest, NewFrameNodeTest) { TEST_F(SqlNodeTest, MakeInsertNodeTest) { ExprListNode *column_expr_list = node_manager_->MakeExprList(); ExprNode *ptr1 = node_manager_->MakeColumnRefNode("col1", ""); - column_expr_list->PushBack(ptr1); + column_expr_list->AddChild(ptr1); ExprNode *ptr2 = node_manager_->MakeColumnRefNode("col2", ""); - column_expr_list->PushBack(ptr2); + column_expr_list->AddChild(ptr2); ExprNode *ptr3 = node_manager_->MakeColumnRefNode("col3", ""); - column_expr_list->PushBack(ptr3); + column_expr_list->AddChild(ptr3); ExprNode *ptr4 = node_manager_->MakeColumnRefNode("col4", ""); - column_expr_list->PushBack(ptr4); + column_expr_list->AddChild(ptr4); ExprListNode *value_expr_list = node_manager_->MakeExprList(); ExprNode *value1 = node_manager_->MakeConstNode(1); ExprNode *value2 = node_manager_->MakeConstNode(2.3f); ExprNode *value3 = node_manager_->MakeConstNode(2.3); ExprNode *value4 = node_manager_->MakeParameterExpr(1); - value_expr_list->PushBack(value1); - value_expr_list->PushBack(value2); - value_expr_list->PushBack(value3); - value_expr_list->PushBack(value4); + value_expr_list->AddChild(value1); + value_expr_list->AddChild(value2); + value_expr_list->AddChild(value3); + value_expr_list->AddChild(value4); ExprListNode *insert_values = node_manager_->MakeExprList(); - insert_values->PushBack(value_expr_list); - SqlNode *node_ptr = node_manager_->MakeInsertTableNode("", "t1", column_expr_list, insert_values); + insert_values->AddChild(value_expr_list); + SqlNode *node_ptr = node_manager_->MakeInsertTableNode("", "t1", column_expr_list, insert_values, + InsertStmt::InsertMode::DEFAULT_MODE); ASSERT_EQ(kInsertStmt, node_ptr->GetType()); InsertStmt *insert_stmt = dynamic_cast(node_ptr); @@ -670,11 +671,17 @@ TEST_F(SqlNodeTest, CreateIndexNodeTest) { ColumnIndexNode *index_node = dynamic_cast(node_manager_->MakeColumnIndexNode(index_items)); CreatePlanNode *node = node_manager_->MakeCreateTablePlanNode( "", "t1", - {node_manager_->MakeColumnDescNode("col1", node::kInt32, true), - node_manager_->MakeColumnDescNode("col2", node::kInt32, true), - node_manager_->MakeColumnDescNode("col3", node::kFloat, true), - node_manager_->MakeColumnDescNode("col4", node::kVarchar, true), - node_manager_->MakeColumnDescNode("col5", node::kTimestamp, true), index_node}, + {node_manager_->MakeNode( + "col1", node_manager_->MakeNode(node::kInt32, true, nullptr)), + node_manager_->MakeNode( + "col2", node_manager_->MakeNode(node::kInt32, true, nullptr)), + node_manager_->MakeNode( + "col3", node_manager_->MakeNode(node::kFloat, true, nullptr)), + node_manager_->MakeNode( + "col4", node_manager_->MakeNode(node::kVarchar, true, nullptr)), + node_manager_->MakeNode( + "col5", node_manager_->MakeNode(node::kTimestamp, true, nullptr)), + index_node}, {node_manager_->MakeReplicaNumNode(3), node_manager_->MakePartitionNumNode(8), node_manager_->MakeNode(kMemory)}, false); diff --git a/hybridse/src/node/type_node.cc b/hybridse/src/node/type_node.cc index e0052fca74c..c3c1015ce8f 100644 --- a/hybridse/src/node/type_node.cc +++ b/hybridse/src/node/type_node.cc @@ -20,7 +20,6 @@ #include "absl/strings/str_join.h" #include "absl/strings/str_cat.h" #include "node/node_manager.h" -#include "vm/physical_op.h" namespace hybridse { namespace node { @@ -52,7 +51,11 @@ bool TypeNode::IsTimestamp() const { return base_ == node::kTimestamp; } bool 
TypeNode::IsString() const { return base_ == node::kVarchar; } bool TypeNode::IsArithmetic() const { return IsInteger() || IsFloating(); } bool TypeNode::IsNumber() const { return IsInteger() || IsFloating(); } -bool TypeNode::IsNull() const { return base_ == node::kNull; } + +// Better function name ? Note the difference of VOID and NULL, VOID is a data type +// while NULL is a placeholder for missing or unknown information, not a real data type. +bool TypeNode::IsNull() const { return base_ == node::kNull || base_ == node::kVoid; } + bool TypeNode::IsBool() const { return base_ == node::kBool; } bool TypeNode::IsIntegral() const { @@ -137,5 +140,89 @@ FixedArrayType *FixedArrayType::ShadowCopy(NodeManager *nm) const { return nm->MakeArrayType(element_type(), num_elements_); } +void TypeNode::AddGeneric(const node::TypeNode *dtype, bool nullable) { + generics_.push_back(dtype); + generics_nullable_.push_back(nullable); +} +const hybridse::node::TypeNode *TypeNode::GetGenericType(size_t idx) const { return generics_[idx]; } +const std::string TypeNode::GetName() const { + std::string type_name = DataTypeName(base_); + if (!generics_.empty()) { + for (auto type : generics_) { + type_name.append("_"); + type_name.append(type->GetName()); + } + } + return type_name; +} + +void TypeNode::Print(std::ostream &output, const std::string &org_tab) const { + SqlNode::Print(output, org_tab); + const std::string tab = org_tab + INDENT + SPACE_ED; + + output << "\n"; + PrintValue(output, tab, GetName(), "type", true); +} +bool TypeNode::Equals(const SqlNode *node) const { + if (!SqlNode::Equals(node)) { + return false; + } + + const TypeNode *that = dynamic_cast(node); + return this->base_ == that->base_ && + std::equal( + this->generics_.cbegin(), this->generics_.cend(), that->generics_.cbegin(), + [&](const hybridse::node::TypeNode *a, const hybridse::node::TypeNode *b) { return TypeEquals(a, b); }); +} + +const std::string OpaqueTypeNode::GetName() const { return "opaque<" + std::to_string(bytes_) + ">"; } + +MapType::MapType(const TypeNode *key_ty, const TypeNode *value_ty, bool value_not_null) : TypeNode(node::kMap) { + // map key does not accept null, value is nullable unless extra attributes specified + AddGeneric(key_ty, false); + AddGeneric(value_ty, !value_not_null); +} +MapType::~MapType() {} +const TypeNode *MapType::key_type() const { return GetGenericType(0); } +const TypeNode *MapType::value_type() const { return GetGenericType(1); } +bool MapType::value_nullable() const { return IsGenericNullable(1); } + +// MAP +// 1. ALL KEYs or VALUEs must share a least common type. +// 2. KEY is simple type only: void/bool/numeric/data/timestamp/string +// 3. 
Resolve to MAP if arguments is empty +absl::StatusOr MapType::InferMapType(NodeManager* nm, absl::Span types) { + if (types.size() % 2 != 0) { + return absl::InvalidArgumentError("map expects a positive even number of arguments"); + } + + const node::TypeNode* key = nm->MakeNode(); // void type + const node::TypeNode* value = nm->MakeNode(); // void type + for (size_t i = 0; i < types.size(); i += 2) { + if (!types[i].type()->IsBaseOrNullType()) { + return absl::FailedPreconditionError( + absl::StrCat("key type for map should be void/bool/numeric/data/timestamp/string only, got ", + types[i].type()->DebugString())); + } + auto key_res = node::ExprNode::CompatibleType(nm, key, types[i].type()); + if (!key_res.ok()) { + return key_res.status(); + } + key = key_res.value(); + auto value_res = node::ExprNode::CompatibleType(nm, value, types[i + 1].type()); + if (!value_res.ok()) { + return value_res.status(); + } + value = value_res.value(); + } + + if (!types.empty() && (key->base() == kVoid || value->base() == kVoid)) { + // only empty map resolved to MAP + return absl::FailedPreconditionError("KEY/VALUE type of non-empty map can't be VOID"); + } + + return nm->MakeNode(key, value); +} + } // namespace node } // namespace hybridse diff --git a/hybridse/src/passes/lambdafy_projects.h b/hybridse/src/passes/lambdafy_projects.h index 3371cd12902..6afed956ee3 100644 --- a/hybridse/src/passes/lambdafy_projects.h +++ b/hybridse/src/passes/lambdafy_projects.h @@ -17,16 +17,12 @@ #ifndef HYBRIDSE_SRC_PASSES_LAMBDAFY_PROJECTS_H_ #define HYBRIDSE_SRC_PASSES_LAMBDAFY_PROJECTS_H_ -#include #include #include #include #include "node/expr_node.h" -#include "node/plan_node.h" #include "node/sql_node.h" -#include "udf/udf_library.h" -#include "vm/schemas_context.h" namespace hybridse { namespace passes { diff --git a/hybridse/src/plan/planner.cc b/hybridse/src/plan/planner.cc index 164dba11f2b..e05e639efb1 100644 --- a/hybridse/src/plan/planner.cc +++ b/hybridse/src/plan/planner.cc @@ -345,7 +345,8 @@ base::Status Planner::CreateSelectQueryPlan(const node::SelectQueryNode *root, n return base::Status::OK(); } -base::Status Planner::CreateSetOperationPlan(const node::SetOperationNode *root, node::SetOperationPlanNode **plan_tree) { +base::Status Planner::CreateSetOperationPlan(const node::SetOperationNode *root, + node::SetOperationPlanNode **plan_tree) { CHECK_TRUE(nullptr != root, common::kPlanError, "can not create query plan node with null query node") auto list = node_manager_->MakeList(); diff --git a/hybridse/src/plan/planner.h b/hybridse/src/plan/planner.h index 731663ab246..6da3068fdd8 100644 --- a/hybridse/src/plan/planner.h +++ b/hybridse/src/plan/planner.h @@ -49,6 +49,7 @@ class Planner { virtual ~Planner() {} virtual base::Status CreatePlanTree(const NodePointVector &parser_trees, PlanNodeList &plan_trees) = 0; // NOLINT (runtime/references) + static base::Status TransformTableDef(const std::string &table_name, const NodePointVector &column_desc_list, type::TableDef *table); bool MergeWindows(const std::map &map, @@ -132,11 +133,11 @@ class SimplePlanner : public Planner { bool enable_batch_window_parallelization = true, const std::unordered_map* extra_options = nullptr) : Planner(manager, is_batch_mode, is_cluster_optimized, enable_batch_window_parallelization, extra_options) {} - ~SimplePlanner() {} + ~SimplePlanner() override {} protected: base::Status CreatePlanTree(const NodePointVector &parser_trees, - PlanNodeList &plan_trees); // NOLINT + PlanNodeList &plan_trees) override; // NOLINT }; } 
// namespace plan diff --git a/hybridse/src/planv2/ast_node_converter.cc b/hybridse/src/planv2/ast_node_converter.cc index 5d9eb939113..2c6225be9a8 100644 --- a/hybridse/src/planv2/ast_node_converter.cc +++ b/hybridse/src/planv2/ast_node_converter.cc @@ -25,8 +25,10 @@ #include "absl/strings/match.h" #include "absl/types/span.h" #include "base/fe_status.h" +#include "node/sql_node.h" #include "udf/udf.h" #include "zetasql/parser/ast_node_kind.h" +#include "zetasql/parser/parse_tree_manual.h" namespace hybridse { namespace plan { @@ -57,6 +59,10 @@ static base::Status ConvertAlterTableStmt(const zetasql::ASTAlterTableStatement* node::SqlNode** out); static base::Status ConvertSetOperation(const zetasql::ASTSetOperation* stmt, node::NodeManager* nm, node::SetOperationNode** out); +static base::Status ConvertSchemaNode(const zetasql::ASTColumnSchema* stmt, node::NodeManager* nm, + node::ColumnSchemaNode** out); +static base::Status ConvertArrayElement(const zetasql::ASTArrayElement* expr, node::NodeManager* nm, + node::ArrayElementExpr** out); /// Used to convert zetasql ASTExpression Node into our ExprNode base::Status ConvertExprNode(const zetasql::ASTExpression* ast_expression, node::NodeManager* node_manager, @@ -107,6 +113,13 @@ base::Status ConvertExprNode(const zetasql::ASTExpression* ast_expression, node: } return base::Status::OK(); } + case zetasql::AST_ARRAY_ELEMENT: { + node::ArrayElementExpr* expr = nullptr; + CHECK_STATUS( + ConvertGuard(ast_expression, node_manager, &expr, ConvertArrayElement)); + *output = expr; + return base::Status::OK(); + } case zetasql::AST_CASE_VALUE_EXPRESSION: { auto* case_expression = ast_expression->GetAsOrDie(); auto& arguments = case_expression->arguments(); @@ -123,7 +136,7 @@ base::Status ConvertExprNode(const zetasql::ASTExpression* ast_expression, node: node::ExprNode* then_expr = nullptr; CHECK_STATUS(ConvertExprNode(arguments[i], node_manager, &when_expr)) CHECK_STATUS(ConvertExprNode(arguments[i + 1], node_manager, &then_expr)) - when_list_expr->PushBack(node_manager->MakeWhenNode(when_expr, then_expr)); + when_list_expr->AddChild(node_manager->MakeWhenNode(when_expr, then_expr)); i += 2; } else { CHECK_STATUS(ConvertExprNode(arguments[i], node_manager, &else_expr)) @@ -147,7 +160,7 @@ base::Status ConvertExprNode(const zetasql::ASTExpression* ast_expression, node: node::ExprNode* then_expr = nullptr; CHECK_STATUS(ConvertExprNode(arguments[i], node_manager, &when_expr)) CHECK_STATUS(ConvertExprNode(arguments[i + 1], node_manager, &then_expr)) - when_list_expr->PushBack(node_manager->MakeWhenNode(when_expr, then_expr)); + when_list_expr->AddChild(node_manager->MakeWhenNode(when_expr, then_expr)); i += 2; } else { CHECK_STATUS(ConvertExprNode(arguments[i], node_manager, &else_expr)) @@ -1475,9 +1488,7 @@ base::Status ConvertCreateProcedureNode(const zetasql::ASTCreateProcedureStateme } // case element -// ASTColumnDefinition -> case element.schema -// ASSTSimpleColumnSchema -> ColumnDeefNode -// otherwise -> not implemented +// ASTColumnDefinition -> ColumnDefNode // ASTIndexDefinition -> ColumnIndexNode // otherwise -> not implemented base::Status ConvertTableElement(const zetasql::ASTTableElement* element, node::NodeManager* node_manager, @@ -1489,38 +1500,10 @@ base::Status ConvertTableElement(const zetasql::ASTTableElement* element, node:: auto column_def = element->GetAsOrNull(); CHECK_TRUE(column_def != nullptr, common::kSqlAstError, "not an ASTColumnDefinition"); - auto not_null_columns = column_def->schema()->FindAttributes( - 
zetasql::AST_NOT_NULL_COLUMN_ATTRIBUTE); - bool not_null = !not_null_columns.empty(); - const std::string name = column_def->name()->GetAsString(); - - auto kind = column_def->schema()->node_kind(); - switch (kind) { - case zetasql::AST_SIMPLE_COLUMN_SCHEMA: { - // only simple column schema is supported - auto simple_column_schema = column_def->schema()->GetAsOrNull(); - CHECK_TRUE(simple_column_schema != nullptr, common::kSqlAstError, "not and ASTSimpleColumnSchema"); - - std::string type_name = ""; - CHECK_STATUS(AstPathExpressionToString(simple_column_schema->type_name(), &type_name)) - node::DataType type; - CHECK_STATUS(node::StringToDataType(type_name, &type)); - - node::ExprNode* default_value = nullptr; - if (simple_column_schema->default_expression()) { - CHECK_STATUS( - ConvertExprNode(simple_column_schema->default_expression(), node_manager, &default_value)); - } - - *node = node_manager->MakeColumnDescNode(name, type, not_null, default_value); - return base::Status::OK(); - } - default: { - return base::Status(common::kSqlAstError, absl::StrCat("unsupported column schema type: ", - zetasql::ASTNode::NodeKindToString(kind))); - } - } + node::ColumnSchemaNode* schema = nullptr; + CHECK_STATUS(ConvertSchemaNode(column_def->schema(), node_manager, &schema)); + *node = node_manager->MakeNode(name, schema); break; } case zetasql::AST_INDEX_DEFINITION: { @@ -1528,13 +1511,14 @@ base::Status ConvertTableElement(const zetasql::ASTTableElement* element, node:: node::ColumnIndexNode* index_node = nullptr; CHECK_STATUS(ConvertColumnIndexNode(ast_index_node, node_manager, &index_node)); *node = index_node; - return base::Status::OK(); + break; } default: { return base::Status(common::kSqlAstError, absl::StrCat("unsupported table column elemnt: ", element->GetNodeKindString())); } } + return base::Status::OK(); } // ASTIndexDefinition node @@ -1628,14 +1612,14 @@ base::Status ConvertIndexOption(const zetasql::ASTOptionsEntry* entry, node::Nod node::DataType unit; CHECK_STATUS(ASTIntervalLIteralToNum(entry->value(), &value, &unit)); auto node = node_manager->MakeConstNode(value, unit); - ttl_list->PushBack(node); + ttl_list->AddChild(node); break; } case zetasql::AST_INT_LITERAL: { int64_t value; CHECK_STATUS(ASTIntLiteralToNum(entry->value(), &value)); auto node = node_manager->MakeConstNode(value, node::kLatest); - ttl_list->PushBack(node); + ttl_list->AddChild(node); break; } case zetasql::AST_STRUCT_CONSTRUCTOR_WITH_PARENS: { @@ -1649,11 +1633,11 @@ base::Status ConvertIndexOption(const zetasql::ASTOptionsEntry* entry, node::Nod CHECK_STATUS(ASTIntervalLIteralToNum(struct_parens->field_expression(0), &value, &unit)); auto node = node_manager->MakeConstNode(value, unit); - ttl_list->PushBack(node); + ttl_list->AddChild(node); value = 0; CHECK_STATUS(ASTIntLiteralToNum(struct_parens->field_expression(1), &value)); - ttl_list->PushBack(node_manager->MakeConstNode(value, node::kLatest)); + ttl_list->AddChild(node_manager->MakeConstNode(value, node::kLatest)); break; } default: { @@ -1962,8 +1946,9 @@ base::Status ConvertInsertStatement(const zetasql::ASTInsertStatement* root, nod } CHECK_TRUE(nullptr == root->query(), common::kSqlAstError, "Un-support insert statement with query"); - CHECK_TRUE(zetasql::ASTInsertStatement::InsertMode::DEFAULT_MODE == root->insert_mode(), common::kSqlAstError, - "Un-support insert mode ", root->GetSQLForInsertMode()); + CHECK_TRUE(zetasql::ASTInsertStatement::InsertMode::DEFAULT_MODE == root->insert_mode() || + zetasql::ASTInsertStatement::InsertMode::IGNORE 
== root->insert_mode(), + common::kSqlAstError, "Un-support insert mode ", root->GetSQLForInsertMode()); CHECK_TRUE(nullptr == root->returning(), common::kSqlAstError, "Un-support insert statement with return clause currently", root->GetSQLForInsertMode()); CHECK_TRUE(nullptr == root->assert_rows_modified(), common::kSqlAstError, @@ -1972,7 +1957,7 @@ base::Status ConvertInsertStatement(const zetasql::ASTInsertStatement* root, nod node::ExprListNode* column_list = node_manager->MakeExprList(); if (nullptr != root->column_list()) { for (auto column : root->column_list()->identifiers()) { - column_list->PushBack(node_manager->MakeColumnRefNode(column->GetAsString(), "")); + column_list->AddChild(node_manager->MakeColumnRefNode(column->GetAsString(), "")); } } @@ -2000,8 +1985,8 @@ base::Status ConvertInsertStatement(const zetasql::ASTInsertStatement* root, nod if (names.size() == 2) { db_name = names[0]; } - *output = - dynamic_cast(node_manager->MakeInsertTableNode(db_name, table_name, column_list, rows)); + *output = dynamic_cast(node_manager->MakeInsertTableNode( + db_name, table_name, column_list, rows, static_cast(root->insert_mode()))); return base::Status::OK(); } base::Status ConvertDropStatement(const zetasql::ASTDropStatement* root, node::NodeManager* node_manager, @@ -2307,6 +2292,19 @@ base::Status ConvertASTType(const zetasql::ASTType* ast_type, node::NodeManager* }))); break; } + case zetasql::AST_MAP_TYPE: { + CHECK_STATUS((ConvertGuard( + ast_type, nm, output, + [](const zetasql::ASTMapType* map_tp, node::NodeManager* nm, node::TypeNode** out) -> base::Status { + node::TypeNode* key = nullptr; + node::TypeNode* value = nullptr; + CHECK_STATUS(ConvertASTType(map_tp->key_type(), nm, &key)); + CHECK_STATUS(ConvertASTType(map_tp->value_type(), nm, &value)); + *out = nm->MakeNode(key, value); + return base::Status::OK(); + }))); + break; + } default: { return base::Status(common::kSqlAstError, "Un-support type: " + ast_type->GetNodeKindString()); } @@ -2406,5 +2404,82 @@ base::Status ConvertSetOperation(const zetasql::ASTSetOperation* set_op, node::N } } +base::Status ConvertSchemaNode(const zetasql::ASTColumnSchema* stmt, node::NodeManager* nm, + node::ColumnSchemaNode** out) { + auto not_null_columns = + stmt->FindAttributes(zetasql::AST_NOT_NULL_COLUMN_ATTRIBUTE); + bool not_null = !not_null_columns.empty(); + + node::ExprNode* default_value = nullptr; + if (stmt->default_expression()) { + CHECK_STATUS(ConvertExprNode(stmt->default_expression(), nm, &default_value)); + } + + switch (stmt->node_kind()) { + case zetasql::AST_SIMPLE_COLUMN_SCHEMA: { + auto simple_column_schema = stmt->GetAsOrNull(); + CHECK_TRUE(simple_column_schema != nullptr, common::kSqlAstError, "not an ASTSimpleColumnSchema"); + + std::string type_name = ""; + CHECK_STATUS(AstPathExpressionToString(simple_column_schema->type_name(), &type_name)) + node::DataType type; + CHECK_STATUS(node::StringToDataType(type_name, &type)); + + *out = nm->MakeNode(type, not_null, default_value); + break; + } + case zetasql::AST_ARRAY_COLUMN_SCHEMA: { + CHECK_STATUS((ConvertGuard( + stmt, nm, out, + [not_null, default_value](const zetasql::ASTArrayColumnSchema* array_type, node::NodeManager* nm, + node::ColumnSchemaNode** out) -> base::Status { + node::ColumnSchemaNode* element_ty = nullptr; + CHECK_STATUS(ConvertSchemaNode(array_type->element_schema(), nm, &element_ty)); + + *out = nm->MakeNode( + node::DataType::kArray, std::initializer_list{element_ty}, + not_null, default_value); + return base::Status::OK(); + }))); + 
break; + } + case zetasql::AST_MAP_COLUMN_SCHEMA: { + CHECK_STATUS((ConvertGuard( + stmt, nm, out, + [not_null, default_value](const zetasql::ASTMapColumnSchema* map_type, node::NodeManager* nm, + node::ColumnSchemaNode** out) -> base::Status { + node::ColumnSchemaNode* key = nullptr; + CHECK_STATUS(ConvertSchemaNode(map_type->key_schema(), nm, &key)); + node::ColumnSchemaNode* value = nullptr; + CHECK_STATUS(ConvertSchemaNode(map_type->value_schema(), nm, &value)); + + *out = nm->MakeNode( + node::DataType::kMap, std::initializer_list{key, value}, + not_null, default_value); + return base::Status::OK(); + }))); + break; + } + default: { + return base::Status(common::kSqlAstError, + absl::StrCat("unsupported column schema type: ", stmt->GetNodeKindString())); + } + } + + return base::Status::OK(); +} + +base::Status ConvertArrayElement(const zetasql::ASTArrayElement* expr, node::NodeManager* nm, + node::ArrayElementExpr** out) { + node::ExprNode* array = nullptr; + node::ExprNode* pos = nullptr; + + CHECK_STATUS(ConvertExprNode(expr->array(), nm, &array)); + CHECK_STATUS(ConvertExprNode(expr->position(), nm, &pos)); + + *out = nm->MakeNode(array, pos); + return {}; +} + } // namespace plan } // namespace hybridse diff --git a/hybridse/src/planv2/ast_node_converter_test.cc b/hybridse/src/planv2/ast_node_converter_test.cc index 51447011f78..b2a36f9420f 100644 --- a/hybridse/src/planv2/ast_node_converter_test.cc +++ b/hybridse/src/planv2/ast_node_converter_test.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include "absl/strings/match.h" #include "case/sql_case.h" @@ -945,20 +946,6 @@ TEST_F(ASTNodeConverterTest, ConvertCreateTableNodeErrorTest) { auto status = ConvertCreateTableNode(create_stmt, &node_manager, &output); EXPECT_EQ(common::kTypeError, status.code); } - { - // not supported schema - const std::string sql = "create table t (a Array) "; - - std::unique_ptr parser_output; - ZETASQL_ASSERT_OK(zetasql::ParseStatement(sql, zetasql::ParserOptions(), &parser_output)); - const auto* statement = parser_output->statement(); - ASSERT_TRUE(statement->Is()); - - const auto create_stmt = statement->GetAsOrDie(); - node::CreateStmt* output = nullptr; - auto status = ConvertCreateTableNode(create_stmt, &node_manager, &output); - EXPECT_EQ(common::kSqlAstError, status.code); - } { // not supported table element const std::string sql = "create table t (a int64, primary key (a)) "; @@ -1206,6 +1193,8 @@ INSTANTIATE_TEST_SUITE_P(ASTHWindowQueryTest, ASTNodeConverterTest, testing::ValuesIn(sqlcase::InitCases("cases/plan/window_query.yaml", FILTERS))); INSTANTIATE_TEST_SUITE_P(ASTUnionQueryTest, ASTNodeConverterTest, testing::ValuesIn(sqlcase::InitCases("cases/plan/union_query.yaml", FILTERS))); +INSTANTIATE_TEST_SUITE_P(ASTConstQueryTest, ASTNodeConverterTest, + testing::ValuesIn(sqlcase::InitCases("cases/plan/const_query.yaml", FILTERS))); } // namespace plan } // namespace hybridse diff --git a/hybridse/src/planv2/plan_api.cc b/hybridse/src/planv2/plan_api.cc index affe2ca80f0..d3f8f7644bf 100644 --- a/hybridse/src/planv2/plan_api.cc +++ b/hybridse/src/planv2/plan_api.cc @@ -16,13 +16,36 @@ #include "plan/plan_api.h" #include "planv2/planner_v2.h" +#include "zetasql/parser/parser.h" #include "zetasql/public/error_helpers.h" #include "zetasql/public/error_location.pb.h" namespace hybridse { namespace plan { -using hybridse::plan::SimplePlannerV2; +base::Status PlanAPI::CreatePlanTreeFromScript(vm::SqlContext *ctx) { + zetasql::ParserOptions parser_opts; + zetasql::LanguageOptions 
language_opts; + language_opts.EnableLanguageFeature(zetasql::FEATURE_V_1_3_COLUMN_DEFAULT_VALUE); + parser_opts.set_language_options(&language_opts); + // save parse result into SqlContext so SQL engine can reference fields inside ASTNode during whole compile stage + auto zetasql_status = + zetasql::ParseScript(ctx->sql, parser_opts, zetasql::ERROR_MESSAGE_MULTI_LINE_WITH_CARET, &ctx->ast_node); + zetasql::ErrorLocation location; + if (!zetasql_status.ok()) { + zetasql::ErrorLocation location; + GetErrorLocation(zetasql_status, &location); + return {common::kSyntaxError, zetasql::FormatError(zetasql_status)}; + } + + DLOG(INFO) << "AST Node:\n" << ctx->ast_node->script()->DebugString(); + + const zetasql::ASTScript *script = ctx->ast_node->script(); + auto planner_ptr = + std::make_unique(&ctx->nm, ctx->engine_mode == vm::kBatchMode, ctx->is_cluster_optimized, + ctx->enable_batch_window_parallelization, ctx->options.get()); + return planner_ptr->CreateASTScriptPlan(script, ctx->logical_plan); +} bool PlanAPI::CreatePlanTreeFromScript(const std::string &sql, PlanNodeList &plan_trees, NodeManager *node_manager, Status &status, bool is_batch_mode, bool is_cluster, diff --git a/hybridse/src/planv2/planner_v2.h b/hybridse/src/planv2/planner_v2.h index 46627f10a90..2555ffd66e2 100644 --- a/hybridse/src/planv2/planner_v2.h +++ b/hybridse/src/planv2/planner_v2.h @@ -35,12 +35,12 @@ using node::PlanNodeList; class SimplePlannerV2 : public SimplePlanner { public: - explicit SimplePlannerV2(node::NodeManager *manager) : SimplePlanner(manager, true, false, false) {} SimplePlannerV2(node::NodeManager *manager, bool is_batch_mode, bool is_cluster_optimized = false, bool enable_batch_window_parallelization = false, const std::unordered_map *extra_options = nullptr) : SimplePlanner(manager, is_batch_mode, is_cluster_optimized, enable_batch_window_parallelization, extra_options) {} + base::Status CreateASTScriptPlan(const zetasql::ASTScript *script, PlanNodeList &plan_trees); // NOLINT (runtime/references) }; diff --git a/hybridse/src/sdk/hybridse_interface_core.i b/hybridse/src/sdk/hybridse_interface_core.i index 660f9bac7a1..9c053b69b71 100644 --- a/hybridse/src/sdk/hybridse_interface_core.i +++ b/hybridse/src/sdk/hybridse_interface_core.i @@ -118,6 +118,7 @@ SWIG_JAVABODY_PROXY(public, public, SWIGTYPE) #include "base/iterator.h" #include "vm/catalog.h" #include "vm/engine.h" +#include "vm/sql_ctx.h" #include "vm/engine_context.h" #include "vm/sql_compiler.h" #include "vm/jit_wrapper.h" @@ -140,6 +141,7 @@ using hybridse::vm::WindowOp; using hybridse::vm::EngineMode; using hybridse::vm::EngineOptions; using hybridse::vm::IndexHintHandler; +using hybridse::vm::SqlContext; using hybridse::base::Iterator; using hybridse::base::ConstIterator; using hybridse::base::Trace; diff --git a/hybridse/src/testing/engine_test_base.cc b/hybridse/src/testing/engine_test_base.cc index 7d02528b5ce..3aebea8f2de 100644 --- a/hybridse/src/testing/engine_test_base.cc +++ b/hybridse/src/testing/engine_test_base.cc @@ -409,7 +409,7 @@ Status EngineTestRunner::Compile() { DLOG(INFO) << "Physical plan:\n" << oss.str(); std::ostringstream runner_oss; - std::dynamic_pointer_cast(session_->GetCompileInfo())->GetClusterJob().Print(runner_oss, ""); + std::dynamic_pointer_cast(session_->GetCompileInfo())->GetClusterJob()->Print(runner_oss, ""); DLOG(INFO) << "Runner plan:\n" << runner_oss.str(); } return status; diff --git a/hybridse/src/udf/default_defs/map_defs.cc b/hybridse/src/udf/default_defs/map_defs.cc new file mode 100644 
index 00000000000..c1cae3e554c --- /dev/null +++ b/hybridse/src/udf/default_defs/map_defs.cc @@ -0,0 +1,123 @@ +/** + * Copyright (c) 2023 4Paradigm Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "codegen/map_ir_builder.h" +#include "codegen/ir_base_builder.h" +#include "node/expr_node.h" +#include "node/type_node.h" +#include "udf/default_udf_library.h" +#include "udf/udf_registry.h" + +namespace hybridse { +namespace udf { + +void DefaultUdfLibrary::InitMapUdfs() { + RegisterCodeGenUdf("map") + .variadic_args<>( + // infer + [](UdfResolveContext* ctx, const std::vector& arg_attrs, + ExprAttrNode* out) -> base::Status { + auto ret = node::MapType::InferMapType(ctx->node_manager(), arg_attrs); + CHECK_TRUE(ret.ok(), common::kTypeError, ret.status().ToString()); + out->SetType(ret.value()); + out->SetNullable(true); + return {}; + }, + // gen + [](codegen::CodeGenContext* ctx, const std::vector& args, + const ExprAttrNode& return_info, codegen::NativeValue* out) -> base::Status { + CHECK_TRUE(return_info.type()->IsMap(), common::kTypeError, "not a map type output"); + auto* map_type = return_info.type()->GetAsOrNull(); + CHECK_TRUE(map_type != nullptr, common::kTypeError, "can not cast to MapType"); + + ::llvm::Type* key_type = nullptr; + ::llvm::Type* value_type = nullptr; + CHECK_TRUE(codegen::GetLlvmType(ctx->GetModule(), map_type->key_type(), &key_type), + common::kCodegenError); + CHECK_TRUE(codegen::GetLlvmType(ctx->GetModule(), map_type->value_type(), &value_type), + common::kCodegenError); + codegen::MapIRBuilder builder(ctx->GetModule(), key_type, value_type); + auto res = builder.Construct(ctx, args); + if (res.ok()) { + *out = res.value(); + return {}; + } + return {common::kCodegenError, res.status().ToString()}; + }) + .doc(R"( + @brief map(key1, value1, key2, value2, ...) - Creates a map with the given key/value pairs. 
+ + Example: + + @code{.sql} + select map(1, '1', 2, '2'); + -- {1: "1", 2: "2"} + @endcode + + @since 0.9.0 + )"); + + RegisterCodeGenUdf("map_keys") + .args( + [](UdfResolveContext* ctx, const ExprAttrNode& in, ExprAttrNode* out) -> base::Status { + CHECK_TRUE(in.type()->IsMap(), common::kTypeError, "map_keys requires a map data type, got ", + in.type()->DebugString()); + + auto map_type = in.type()->GetAsOrNull(); + CHECK_TRUE(map_type != nullptr, common::kTypeError); + + out->SetType(ctx->node_manager()->MakeNode(node::kArray, map_type->key_type())); + out->SetNullable(true); + return {}; + }, + [](codegen::CodeGenContext* ctx, codegen::NativeValue in, const node::ExprAttrNode& return_info, + codegen::NativeValue* out) -> base::Status { + const node::TypeNode* type = nullptr; + CHECK_TRUE(codegen::GetFullType(ctx->node_manager(), in.GetType(), &type), common::kTypeError); + auto map_type = type->GetAsOrNull(); + CHECK_TRUE(map_type != nullptr, common::kTypeError); + + ::llvm::Type* key_type = nullptr; + ::llvm::Type* value_type = nullptr; + CHECK_TRUE(codegen::GetLlvmType(ctx->GetModule(), map_type->key_type(), &key_type), + common::kCodegenError); + CHECK_TRUE(codegen::GetLlvmType(ctx->GetModule(), map_type->value_type(), &value_type), + common::kCodegenError); + codegen::MapIRBuilder builder(ctx->GetModule(), key_type, value_type); + + auto res = builder.MapKeys(ctx, in); + if (res.ok()) { + *out = res.value(); + return {}; + } + return {common::kCodegenError, res.status().ToString()}; + }) + .doc(R"( + @brief map_keys(map) - Returns an unordered array containing the keys of the map. + + Example: + + @code{.sql} + select map_keys(map(1, '2', 3, '4')); + -- [1, 3] + @endcode + + @since 0.9.0 + )"); +} + +} // namespace udf +} // namespace hybridse diff --git a/hybridse/src/udf/default_udf_library.cc b/hybridse/src/udf/default_udf_library.cc index e6a546095ec..265a1e09250 100644 --- a/hybridse/src/udf/default_udf_library.cc +++ b/hybridse/src/udf/default_udf_library.cc @@ -665,6 +665,7 @@ void DefaultUdfLibrary::Init() { InitFeatureZero(); InitArrayUdfs(); + InitMapUdfs(); InitEarthDistanceUdf(); InitJsonUdfs(); @@ -794,7 +795,7 @@ void DefaultUdfLibrary::InitStringUdf() { RegisterCodeGenUdf("concat").variadic_args<>( /* infer */ [](UdfResolveContext* ctx, - const std::vector& arg_attrs, + const std::vector& arg_attrs, ExprAttrNode* out) { out->SetType(ctx->node_manager()->MakeTypeNode(node::kVarchar)); out->SetNullable(false); @@ -802,7 +803,7 @@ void DefaultUdfLibrary::InitStringUdf() { }, /* gen */ [](CodeGenContext* ctx, const std::vector& args, - NativeValue* out) { + const ExprAttrNode& return_info, NativeValue* out) { codegen::StringIRBuilder string_ir_builder(ctx->GetModule()); return string_ir_builder.Concat(ctx->GetCurrentBlock(), args, out); }) @@ -821,16 +822,16 @@ void DefaultUdfLibrary::InitStringUdf() { RegisterCodeGenUdf("concat_ws") .variadic_args( /* infer */ - [](UdfResolveContext* ctx, const ExprAttrNode* arg, - const std::vector& arg_types, + [](UdfResolveContext* ctx, const ExprAttrNode& arg, + const std::vector& arg_types, ExprAttrNode* out) { out->SetType(ctx->node_manager()->MakeTypeNode(node::kVarchar)); out->SetNullable(false); return Status::OK(); }, /* gen */ - [](CodeGenContext* ctx, NativeValue arg, - const std::vector& args, NativeValue* out) { + [](CodeGenContext* ctx, NativeValue arg, const std::vector& args, + const ExprAttrNode& return_info, NativeValue* out) { codegen::StringIRBuilder string_ir_builder(ctx->GetModule()); return 
string_ir_builder.ConcatWS(ctx->GetCurrentBlock(), arg, @@ -1651,7 +1652,7 @@ void DefaultUdfLibrary::InitMathUdf() { RegisterExprUdf("round") .variadic_args( - [](UdfResolveContext* ctx, ExprNode* x, const std::vector& other) -> ExprNode* { + [](UdfResolveContext* ctx, ExprNode* x, absl::Span other) -> ExprNode* { if (!x->GetOutputType()->IsArithmetic() || x->GetOutputType()->IsBool()) { ctx->SetError("round do not support first parameter of type " + x->GetOutputType()->GetName()); return nullptr; @@ -2233,18 +2234,15 @@ void DefaultUdfLibrary::InitTimeAndDateUdf() { )"); RegisterCodeGenUdf("year") - .args( - [](CodeGenContext* ctx, NativeValue date, NativeValue* out) { - codegen::DateIRBuilder date_ir_builder(ctx->GetModule()); - ::llvm::Value* ret = nullptr; - Status status; - CHECK_TRUE(date_ir_builder.Year(ctx->GetCurrentBlock(), - date.GetRaw(), &ret, status), - kCodegenError, - "Fail to build udf year(date): ", status.str()); - *out = NativeValue::Create(ret); - return status; - }) + .args([](CodeGenContext* ctx, NativeValue date, const node::ExprAttrNode& return_info, NativeValue* out) { + codegen::DateIRBuilder date_ir_builder(ctx->GetModule()); + ::llvm::Value* ret = nullptr; + Status status; + CHECK_TRUE(date_ir_builder.Year(ctx->GetCurrentBlock(), date.GetRaw(), &ret, status), kCodegenError, + "Fail to build udf year(date): ", status.str()); + *out = NativeValue::Create(ret); + return status; + }) .returns(); RegisterExternal("month") @@ -2264,7 +2262,7 @@ void DefaultUdfLibrary::InitTimeAndDateUdf() { RegisterCodeGenUdf("month") .args( - [](CodeGenContext* ctx, NativeValue date, NativeValue* out) { + [](CodeGenContext* ctx, NativeValue date, const node::ExprAttrNode& ri, NativeValue* out) { codegen::DateIRBuilder date_ir_builder(ctx->GetModule()); ::llvm::Value* ret = nullptr; Status status; @@ -2298,7 +2296,7 @@ void DefaultUdfLibrary::InitTimeAndDateUdf() { )"); RegisterCodeGenUdf("dayofmonth").args( - [](CodeGenContext* ctx, NativeValue date, NativeValue* out) { + [](CodeGenContext* ctx, NativeValue date, const node::ExprAttrNode& ri, NativeValue* out) { codegen::DateIRBuilder date_ir_builder(ctx->GetModule()); ::llvm::Value* ret = nullptr; Status status; @@ -2554,13 +2552,13 @@ void DefaultUdfLibrary::InitTimeAndDateUdf() { .variadic_args<>( /* infer */ [](UdfResolveContext* ctx, - const std::vector& args, + const std::vector& args, ExprAttrNode* out) { auto nm = ctx->node_manager(); auto tuple_type = nm->MakeTypeNode(node::kTuple); for (auto attr : args) { - tuple_type->generics_.push_back(attr->type()); - tuple_type->generics_nullable_.push_back(attr->nullable()); + tuple_type->generics_.push_back(attr.type()); + tuple_type->generics_nullable_.push_back(attr.nullable()); } out->SetType(tuple_type); out->SetNullable(false); @@ -2568,7 +2566,7 @@ void DefaultUdfLibrary::InitTimeAndDateUdf() { }, /* gen */ [](CodeGenContext* ctx, const std::vector& args, - NativeValue* out) { + const ExprAttrNode& return_info, NativeValue* out) { *out = NativeValue::CreateTuple(args); return Status::OK(); }); diff --git a/hybridse/src/udf/default_udf_library.h b/hybridse/src/udf/default_udf_library.h index be5ed6c2414..92152649fa0 100644 --- a/hybridse/src/udf/default_udf_library.h +++ b/hybridse/src/udf/default_udf_library.h @@ -52,6 +52,9 @@ class DefaultUdfLibrary : public UdfLibrary { // Array Udf defines, udfs either accept array as parameter or returns array void InitArrayUdfs(); + // Map functions + void InitMapUdfs(); + // aggregate functions for statistic void 
InitStatisticsUdafs(); diff --git a/hybridse/src/udf/dynamic_lib_manager.cc b/hybridse/src/udf/dynamic_lib_manager.cc index c6a034247cd..b3b281a0346 100644 --- a/hybridse/src/udf/dynamic_lib_manager.cc +++ b/hybridse/src/udf/dynamic_lib_manager.cc @@ -19,6 +19,8 @@ #include #include +#include "glog/logging.h" + namespace hybridse { namespace udf { diff --git a/hybridse/src/udf/literal_traits.h b/hybridse/src/udf/literal_traits.h index 13c876951e8..2c79c8a365d 100644 --- a/hybridse/src/udf/literal_traits.h +++ b/hybridse/src/udf/literal_traits.h @@ -18,15 +18,12 @@ #define HYBRIDSE_SRC_UDF_LITERAL_TRAITS_H_ #include -#include #include #include #include -#include #include #include -#include "base/fe_status.h" #include "base/string_ref.h" #include "base/type.h" #include "codec/fe_row_codec.h" @@ -139,8 +136,10 @@ static bool operator==(const Nullable& x, const Nullable& y) { // ===================================== // // ArrayRef // ===================================== // -template ::CCallArgType> +template struct ArrayRef { + using CType = typename DataTypeTrait::CCallArgType; + CType* raw; bool* nullables; uint64_t size; diff --git a/hybridse/src/udf/udf_registry.cc b/hybridse/src/udf/udf_registry.cc index 932174d8145..60e93460c24 100644 --- a/hybridse/src/udf/udf_registry.cc +++ b/hybridse/src/udf/udf_registry.cc @@ -206,20 +206,17 @@ Status ExprUdfRegistry::ResolveFunction(UdfResolveContext* ctx, Status LlvmUdfRegistry::ResolveFunction(UdfResolveContext* ctx, node::FnDefNode** result) { std::vector arg_types; - std::vector arg_attrs; + std::vector arg_attrs; for (size_t i = 0; i < ctx->arg_size(); ++i) { auto arg_type = ctx->arg_type(i); bool nullable = ctx->arg_nullable(i); CHECK_TRUE(arg_type != nullptr, kCodegenError, i, "th argument node type is unknown: ", name()); arg_types.push_back(arg_type); - arg_attrs.push_back(new ExprAttrNode(arg_type, nullable)); + arg_attrs.emplace_back(arg_type, nullable); } ExprAttrNode out_attr(nullptr, true); auto status = gen_impl_func_->infer(ctx, arg_attrs, &out_attr); - for (auto ptr : arg_attrs) { - delete const_cast(ptr); - } CHECK_STATUS(status, "Infer llvm output attr failed: ", status.str()); auto return_type = out_attr.type(); diff --git a/hybridse/src/udf/udf_registry.h b/hybridse/src/udf/udf_registry.h index 3ea96d25c13..d9512e581f0 100644 --- a/hybridse/src/udf/udf_registry.h +++ b/hybridse/src/udf/udf_registry.h @@ -28,13 +28,11 @@ #include #include "base/fe_status.h" -#include "codec/list_iterator_codec.h" #include "codegen/context.h" #include "node/node_manager.h" #include "node/sql_node.h" #include "udf/literal_traits.h" #include "udf/udf_library.h" -#include "vm/schemas_context.h" namespace hybridse { namespace udf { @@ -394,10 +392,11 @@ class LlvmUdfGenBase { public: virtual Status gen(codegen::CodeGenContext* ctx, const std::vector& args, + const ExprAttrNode& return_info, codegen::NativeValue* res) = 0; virtual Status infer(UdfResolveContext* ctx, - const std::vector& args, + const std::vector& args, ExprAttrNode*) = 0; node::TypeNode* fixed_ret_type() const { return fixed_ret_type_; } @@ -417,33 +416,36 @@ struct LlvmUdfGen : public LlvmUdfGenBase { using FType = std::function::second_type..., + const ExprAttrNode& return_info, codegen::NativeValue*)>; using InferFType = std::function::second_type..., + typename std::pair::second_type..., ExprAttrNode*)>; Status gen(codegen::CodeGenContext* ctx, const std::vector& args, + const ExprAttrNode& return_info, codegen::NativeValue* result) override { CHECK_TRUE(args.size() == 
sizeof...(Args), common::kCodegenError, "Fail to invoke LlvmUefGen::gen, args size do not " "match with template args)"); - return gen_internal(ctx, args, result, + return gen_internal(ctx, args, return_info, result, std::index_sequence_for()); } template Status gen_internal(codegen::CodeGenContext* ctx, const std::vector& args, + const ExprAttrNode& return_info, codegen::NativeValue* result, const std::index_sequence&) { - return gen_func(ctx, args[I]..., result); + return gen_func(ctx, args[I]..., return_info, result); } Status infer(UdfResolveContext* ctx, - const std::vector& args, + const std::vector& args, ExprAttrNode* out) override { return infer_internal(ctx, args, out, std::index_sequence_for()); @@ -451,7 +453,7 @@ struct LlvmUdfGen : public LlvmUdfGenBase { template Status infer_internal(UdfResolveContext* ctx, - const std::vector& args, + const std::vector& args, ExprAttrNode* out, const std::index_sequence&) { if (this->infer_func) { return infer_func(ctx, args[I]..., out); @@ -475,39 +477,39 @@ struct LlvmUdfGen : public LlvmUdfGenBase { template struct VariadicLLVMUdfGen : public LlvmUdfGenBase { using FType = std::function::second_type..., - const std::vector&, codegen::NativeValue*)>; + codegen::CodeGenContext*, typename std::pair::second_type..., + const std::vector&, const ExprAttrNode& return_info, codegen::NativeValue*)>; using InferFType = std::function::second_type..., - const std::vector&, ExprAttrNode*)>; + typename std::pair::second_type..., + const std::vector&, ExprAttrNode*)>; Status gen(codegen::CodeGenContext* ctx, const std::vector& args, + const ExprAttrNode& return_info, codegen::NativeValue* result) override { CHECK_TRUE(args.size() >= sizeof...(Args), common::kCodegenError, "Fail to invoke VariadicLLVMUdfGen::gen, " "args size do not match with template args)"); - return gen_internal(ctx, args, result, - std::index_sequence_for()); + return gen_internal(ctx, args, return_info, result, std::index_sequence_for()); }; template Status gen_internal(codegen::CodeGenContext* ctx, const std::vector& args, + const ExprAttrNode& return_info, codegen::NativeValue* result, const std::index_sequence&) { std::vector variadic_args; for (size_t i = sizeof...(I); i < args.size(); ++i) { variadic_args.emplace_back(args[i]); } - return this->gen_func(ctx, args[I]..., variadic_args, result); + return this->gen_func(ctx, args[I]..., variadic_args, return_info, result); } Status infer(UdfResolveContext* ctx, - const std::vector& args, + const std::vector& args, ExprAttrNode* out) override { return infer_internal(ctx, args, out, std::index_sequence_for()); @@ -515,9 +517,9 @@ struct VariadicLLVMUdfGen : public LlvmUdfGenBase { template Status infer_internal(UdfResolveContext* ctx, - const std::vector& args, + const std::vector& args, ExprAttrNode* out, const std::index_sequence&) { - std::vector variadic_args; + std::vector variadic_args; for (size_t i = sizeof...(I); i < args.size(); ++i) { variadic_args.emplace_back(args[i]); } @@ -723,9 +725,8 @@ class CodeGenUdfTemplateRegistryHelper { LlvmUdfRegistryHelper& helper) { // NOLINT helper.args( [](codegen::CodeGenContext* ctx, - typename std::pair< - Args, codegen::NativeValue>::second_type... args, - codegen::NativeValue* result) { + typename std::pair::second_type... 
args, + const ExprAttrNode& return_info, codegen::NativeValue* result) { return FTemplate()(ctx, args..., result); }); return helper.cur_def(); diff --git a/hybridse/src/udf/udf_registry_test.cc b/hybridse/src/udf/udf_registry_test.cc index 962b367819b..aac28fc8f17 100644 --- a/hybridse/src/udf/udf_registry_test.cc +++ b/hybridse/src/udf/udf_registry_test.cc @@ -384,14 +384,14 @@ TEST_F(UdfRegistryTest, test_codegen_udf_register) { library.RegisterCodeGenUdf("add").args( /* infer */ - [](UdfResolveContext* ctx, const ExprAttrNode* x, const ExprAttrNode* y, + [](UdfResolveContext* ctx, const ExprAttrNode& x, const ExprAttrNode& y, ExprAttrNode* out) { - out->SetType(x->type()); + out->SetType(x.type()); return Status::OK(); }, /* gen */ [](CodeGenContext* ctx, NativeValue x, NativeValue y, - NativeValue* out) { + const ExprAttrNode& ri, NativeValue* out) { *out = x; return Status::OK(); }); @@ -409,14 +409,14 @@ TEST_F(UdfRegistryTest, test_variadic_codegen_udf_register) { library.RegisterCodeGenUdf("concat").variadic_args<>( /* infer */ [](UdfResolveContext* ctx, - const std::vector& arg_attrs, + const std::vector& arg_attrs, ExprAttrNode* out) { - out->SetType(arg_attrs[0]->type()); + out->SetType(arg_attrs[0].type()); return Status::OK(); }, /* gen */ [](CodeGenContext* ctx, const std::vector& args, - NativeValue* out) { + const ExprAttrNode& return_info, NativeValue* out) { *out = args[0]; return Status::OK(); }); diff --git a/hybridse/src/vm/engine.cc b/hybridse/src/vm/engine.cc index c0d9be8c333..0865655f3c1 100644 --- a/hybridse/src/vm/engine.cc +++ b/hybridse/src/vm/engine.cc @@ -160,7 +160,7 @@ bool Engine::Get(const std::string& sql, const std::string& db, RunSession& sess sql_context.enable_expr_optimize = options_.IsEnableExprOptimize(); sql_context.jit_options = options_.jit_options(); sql_context.options = session.GetOptions(); - sql_context.index_hints_ = session.index_hints_; + sql_context.index_hints = session.index_hints_; if (session.engine_mode() == kBatchMode) { sql_context.parameter_types = dynamic_cast(&session)->GetParameterSchema(); } else if (session.engine_mode() == kBatchRequestMode) { @@ -191,7 +191,7 @@ bool Engine::Get(const std::string& sql, const std::string& db, RunSession& sess LOG(INFO) << "physical plan:\n" << plan_oss.str() << std::endl; } std::ostringstream runner_oss; - sql_context.cluster_job.Print(runner_oss, ""); + sql_context.cluster_job->Print(runner_oss, ""); LOG(INFO) << "cluster job:\n" << runner_oss.str() << std::endl; } return true; @@ -377,20 +377,20 @@ bool RunSession::SetCompileInfo(const std::shared_ptr& compile_info int32_t RequestRunSession::Run(const Row& in_row, Row* out_row) { DLOG(INFO) << "Request Row Run with main task"; - return Run(std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job.main_task_id(), + return Run(std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job->main_task_id(), in_row, out_row); } int32_t RequestRunSession::Run(const uint32_t task_id, const Row& in_row, Row* out_row) { auto task = std::dynamic_pointer_cast(compile_info_) ->get_sql_context() - .cluster_job.GetTask(task_id) + .cluster_job->GetTask(task_id) .GetRoot(); if (nullptr == task) { LOG(WARNING) << "fail to run request plan: taskid" << task_id << " not exist!"; return -2; } DLOG(INFO) << "Request Row Run with task_id " << task_id; - RunnerContext ctx(&std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job, in_row, + RunnerContext 
ctx(std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job, in_row, sp_name_, is_debug_); auto output = task->RunWithCache(ctx); if (!output) { @@ -405,15 +405,15 @@ int32_t RequestRunSession::Run(const uint32_t task_id, const Row& in_row, Row* o } int32_t BatchRequestRunSession::Run(const std::vector& request_batch, std::vector& output) { - return Run(std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job.main_task_id(), + return Run(std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job->main_task_id(), request_batch, output); } int32_t BatchRequestRunSession::Run(const uint32_t id, const std::vector& request_batch, std::vector& output) { - RunnerContext ctx(&std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job, + RunnerContext ctx(std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job, request_batch, sp_name_, is_debug_); auto task = - std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job.GetTask(id).GetRoot(); + std::dynamic_pointer_cast(compile_info_)->get_sql_context().cluster_job->GetTask(id).GetRoot(); if (nullptr == task) { LOG(WARNING) << "Fail to run request plan: taskid" << id << " not exist!"; return -2; @@ -435,8 +435,8 @@ int32_t BatchRunSession::Run(std::vector& rows, uint64_t limit) { } int32_t BatchRunSession::Run(const Row& parameter_row, std::vector& rows, uint64_t limit) { auto& sql_ctx = std::dynamic_pointer_cast(compile_info_)->get_sql_context(); - RunnerContext ctx(&sql_ctx.cluster_job, parameter_row, is_debug_); - auto output = sql_ctx.cluster_job.GetTask(0).GetRoot()->RunWithCache(ctx); + RunnerContext ctx(sql_ctx.cluster_job, parameter_row, is_debug_); + auto output = sql_ctx.cluster_job->GetTask(0).GetRoot()->RunWithCache(ctx); if (!output) { DLOG(INFO) << "Run batch plan output is empty"; return 0; diff --git a/hybridse/src/vm/runner_ctx.h b/hybridse/src/vm/runner_ctx.h index 0924015450a..350d2372a09 100644 --- a/hybridse/src/vm/runner_ctx.h +++ b/hybridse/src/vm/runner_ctx.h @@ -29,8 +29,7 @@ namespace vm { class RunnerContext { public: - explicit RunnerContext(hybridse::vm::ClusterJob* cluster_job, - const hybridse::codec::Row& parameter, + explicit RunnerContext(std::shared_ptr cluster_job, const hybridse::codec::Row& parameter, const bool is_debug = false) : cluster_job_(cluster_job), sp_name_(""), @@ -39,7 +38,7 @@ class RunnerContext { parameter_(parameter), is_debug_(is_debug), batch_cache_() {} - explicit RunnerContext(hybridse::vm::ClusterJob* cluster_job, + explicit RunnerContext(std::shared_ptr cluster_job, const hybridse::codec::Row& request, const std::string& sp_name = "", const bool is_debug = false) @@ -50,7 +49,7 @@ class RunnerContext { parameter_(), is_debug_(is_debug), batch_cache_() {} - explicit RunnerContext(hybridse::vm::ClusterJob* cluster_job, + explicit RunnerContext(std::shared_ptr cluster_job, const std::vector& request_batch, const std::string& sp_name = "", const bool is_debug = false) @@ -68,7 +67,7 @@ class RunnerContext { return requests_[idx]; } const hybridse::codec::Row& GetParameterRow() const { return parameter_; } - hybridse::vm::ClusterJob* cluster_job() { return cluster_job_; } + std::shared_ptr cluster_job() { return cluster_job_; } void SetRequest(const hybridse::codec::Row& request); void SetRequests(const std::vector& requests); bool is_debug() const { return is_debug_; } @@ -81,7 +80,7 @@ class RunnerContext { void SetBatchCache(int64_t id, std::shared_ptr data); private: - hybridse::vm::ClusterJob* 
cluster_job_; + std::shared_ptr cluster_job_; const std::string sp_name_; hybridse::codec::Row request_; std::vector requests_; diff --git a/hybridse/src/vm/runner_test.cc b/hybridse/src/vm/runner_test.cc index ea8d9c9643e..bce8c8712d3 100644 --- a/hybridse/src/vm/runner_test.cc +++ b/hybridse/src/vm/runner_test.cc @@ -75,13 +75,13 @@ void RunnerCheck(std::shared_ptr catalog, const std::string sql, ASSERT_TRUE(ok) << compile_status; ASSERT_TRUE(sql_compiler.BuildClusterJob(sql_context, compile_status)); ASSERT_TRUE(nullptr != sql_context.physical_plan); - ASSERT_TRUE(sql_context.cluster_job.IsValid()); + ASSERT_TRUE(sql_context.cluster_job->IsValid()); std::ostringstream oss; sql_context.physical_plan->Print(oss, ""); std::cout << "physical plan:\n" << sql << "\n" << oss.str() << std::endl; std::ostringstream runner_oss; - sql_context.cluster_job.Print(runner_oss, ""); + sql_context.cluster_job->Print(runner_oss, ""); std::cout << "runner: \n" << runner_oss.str() << std::endl; std::ostringstream oss_schema; @@ -349,7 +349,7 @@ TEST_F(RunnerTest, KeyGeneratorTest) { ASSERT_TRUE(sql_context.physical_plan != nullptr); auto root = GetFirstRunnerOfType( - sql_context.cluster_job.GetTask(0).GetRoot(), kRunnerGroup); + sql_context.cluster_job->GetTask(0).GetRoot(), kRunnerGroup); auto group_runner = dynamic_cast(root); std::vector rows; hybridse::type::TableDef temp_table; diff --git a/hybridse/src/vm/sql_compiler.cc b/hybridse/src/vm/sql_compiler.cc index c686e1401b4..ea5626545ee 100644 --- a/hybridse/src/vm/sql_compiler.cc +++ b/hybridse/src/vm/sql_compiler.cc @@ -159,7 +159,7 @@ Status SqlCompiler::BuildBatchModePhysicalPlan(SqlContext* ctx, const ::hybridse vm::BatchModeTransformer transformer(&ctx->nm, ctx->db, cl_, &ctx->parameter_types, llvm_module, library, ctx->is_cluster_optimized, ctx->enable_expr_optimize, ctx->enable_batch_window_parallelization, ctx->enable_window_column_pruning, - ctx->options.get(), ctx->index_hints_); + ctx->options.get(), ctx->index_hints); transformer.AddDefaultPasses(); CHECK_STATUS(transformer.TransformPhysicalPlan(plan_list, output), "Fail to generate physical plan batch mode"); ctx->schema = *(*output)->GetOutputSchema(); @@ -172,7 +172,7 @@ Status SqlCompiler::BuildRequestModePhysicalPlan(SqlContext* ctx, const ::hybrid PhysicalOpNode** output) { vm::RequestModeTransformer transformer(&ctx->nm, ctx->db, cl_, &ctx->parameter_types, llvm_module, library, {}, ctx->is_cluster_optimized, false, ctx->enable_expr_optimize, - enable_request_performance_sensitive, ctx->options.get(), ctx->index_hints_); + enable_request_performance_sensitive, ctx->options.get(), ctx->index_hints); if (ctx->options && ctx->options->count(LONG_WINDOWS)) { transformer.AddPass(passes::kPassSplitAggregationOptimized); transformer.AddPass(passes::kPassLongWindowOptimized); @@ -196,7 +196,7 @@ Status SqlCompiler::BuildBatchRequestModePhysicalPlan(SqlContext* ctx, const ::h vm::RequestModeTransformer transformer(&ctx->nm, ctx->db, cl_, &ctx->parameter_types, llvm_module, library, ctx->batch_request_info.common_column_indices, ctx->is_cluster_optimized, ctx->is_batch_request_optimized, ctx->enable_expr_optimize, true, - ctx->options.get(), ctx->index_hints_); + ctx->options.get(), ctx->index_hints); if (ctx->options && ctx->options->count(LONG_WINDOWS)) { transformer.AddPass(passes::kPassSplitAggregationOptimized); transformer.AddPass(passes::kPassLongWindowOptimized); @@ -297,7 +297,10 @@ bool SqlCompiler::BuildClusterJob(SqlContext& ctx, Status& status) { // NOLINT 
ctx.is_cluster_optimized && is_request_mode, ctx.batch_request_info.common_column_indices, ctx.batch_request_info.common_node_set); - ctx.cluster_job = runner_builder.BuildClusterJob(ctx.physical_plan, status); + if (ctx.cluster_job == nullptr) { + ctx.cluster_job = std::make_shared(); + } + *ctx.cluster_job = runner_builder.BuildClusterJob(ctx.physical_plan, status); return status.isOK(); } @@ -310,11 +313,8 @@ bool SqlCompiler::BuildClusterJob(SqlContext& ctx, Status& status) { // NOLINT */ bool SqlCompiler::Parse(SqlContext& ctx, ::hybridse::base::Status& status) { // NOLINT - bool is_batch_mode = ctx.engine_mode == kBatchMode; - if (!::hybridse::plan::PlanAPI::CreatePlanTreeFromScript(ctx.sql, ctx.logical_plan, &ctx.nm, status, is_batch_mode, - ctx.is_cluster_optimized, - ctx.enable_batch_window_parallelization, - ctx.options.get())) { + status = hybridse::plan::PlanAPI::CreatePlanTreeFromScript(&ctx); + if (!status.isOK()) { LOG(WARNING) << "Fail create sql plan: " << status; return false; } diff --git a/hybridse/src/vm/sql_compiler.h b/hybridse/src/vm/sql_compiler.h index a70f5275276..a874be405fa 100644 --- a/hybridse/src/vm/sql_compiler.h +++ b/hybridse/src/vm/sql_compiler.h @@ -19,7 +19,7 @@ #include #include -#include + #include "base/fe_status.h" #include "llvm/IR/Module.h" #include "udf/udf_library.h" @@ -30,60 +30,13 @@ #include "vm/physical_op.h" #include "vm/physical_plan_context.h" #include "vm/runner.h" +#include "vm/sql_ctx.h" namespace hybridse { namespace vm { using hybridse::base::Status; -struct SqlContext { - // mode: batch|request|batch request - ::hybridse::vm::EngineMode engine_mode; - bool is_cluster_optimized = false; - bool is_batch_request_optimized = false; - bool enable_expr_optimize = false; - bool enable_batch_window_parallelization = true; - bool enable_window_column_pruning = false; - - // the sql content - std::string sql; - // the database - std::string db; - // the logical plan - ::hybridse::node::PlanNodeList logical_plan; - ::hybridse::vm::PhysicalOpNode* physical_plan = nullptr; - hybridse::vm::ClusterJob cluster_job; - // TODO(wangtaize) add a light jit engine - // eg using bthead to compile ir - hybridse::vm::JitOptions jit_options; - std::shared_ptr jit = nullptr; - Schema schema; - Schema request_schema; - std::string request_db_name; - std::string request_name; - Schema parameter_types; - uint32_t row_size; - uint32_t limit_cnt = 0; - std::string ir; - std::string logical_plan_str; - std::string physical_plan_str; - std::string encoded_schema; - std::string encoded_request_schema; - ::hybridse::node::NodeManager nm; - ::hybridse::udf::UdfLibrary* udf_library = nullptr; - - ::hybridse::vm::BatchRequestInfo batch_request_info; - - std::shared_ptr> options; - - // [ALPHA] SQL diagnostic infos - // not standardized, only index hints, no error, no warning, no other hint/info - std::shared_ptr index_hints_; - - SqlContext() {} - ~SqlContext() {} -}; - class SqlCompileInfo : public CompileInfo { public: SqlCompileInfo() : sql_ctx() {} @@ -111,13 +64,13 @@ class SqlCompileInfo : public CompileInfo { const std::string& GetRequestDbName() const override { return sql_ctx.request_db_name; } const hybridse::vm::BatchRequestInfo& GetBatchRequestInfo() const override { return sql_ctx.batch_request_info; } const hybridse::vm::PhysicalOpNode* GetPhysicalPlan() const override { return sql_ctx.physical_plan; } - hybridse::vm::Runner* GetMainTask() { return sql_ctx.cluster_job.GetMainTask().GetRoot(); } - hybridse::vm::ClusterJob& GetClusterJob() { return 
sql_ctx.cluster_job; } + hybridse::vm::Runner* GetMainTask() { return sql_ctx.cluster_job->GetMainTask().GetRoot(); } + std::shared_ptr GetClusterJob() { return sql_ctx.cluster_job; } void DumpPhysicalPlan(std::ostream& output, const std::string& tab) override { sql_ctx.physical_plan->Print(output, tab); } void DumpClusterJob(std::ostream& output, const std::string& tab) override { - sql_ctx.cluster_job.Print(output, tab); + sql_ctx.cluster_job->Print(output, tab); } static SqlCompileInfo* CastFrom(CompileInfo* node) { return dynamic_cast(node); } diff --git a/hybridse/src/vm/sql_ctx.cc b/hybridse/src/vm/sql_ctx.cc new file mode 100644 index 00000000000..b328801978c --- /dev/null +++ b/hybridse/src/vm/sql_ctx.cc @@ -0,0 +1,29 @@ +/** + * Copyright (c) 2023 OpenMLDB Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vm/sql_ctx.h" + +// DONT DELETE: unique_ptr requires full specification for underlying type +#include "zetasql/parser/parser.h" // IWYU pragma: keep + +namespace hybridse { +namespace vm { +SqlContext::SqlContext() {} + +SqlContext::~SqlContext() {} + +} // namespace vm +} // namespace hybridse diff --git a/hybridse/src/vm/transform.cc b/hybridse/src/vm/transform.cc index 82e96b3c094..49a76d95273 100644 --- a/hybridse/src/vm/transform.cc +++ b/hybridse/src/vm/transform.cc @@ -26,6 +26,7 @@ #include "codegen/context.h" #include "codegen/fn_ir_builder.h" #include "codegen/fn_let_ir_builder.h" +#include "codegen/ir_base_builder.h" #include "passes/physical/batch_request_optimize.h" #include "passes/physical/cluster_optimized.h" #include "passes/physical/condition_optimized.h" @@ -39,9 +40,9 @@ #include "passes/physical/window_column_pruning.h" #include "plan/planner.h" #include "proto/fe_common.pb.h" +#include "vm/internal/node_helper.h" #include "vm/physical_op.h" #include "vm/schemas_context.h" -#include "vm/internal/node_helper.h" namespace hybridse { namespace vm { diff --git a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/api/OpenmldbSession.scala b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/api/OpenmldbSession.scala index 9a3113b4c09..7059f0146bd 100755 --- a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/api/OpenmldbSession.scala +++ b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/api/OpenmldbSession.scala @@ -174,6 +174,8 @@ class OpenmldbSession { * @return */ def openmldbSql(sqlText: String): OpenmldbDataframe = { + logger.info("Try to execute OpenMLDB SQL: " + sqlText) + if (config.enableSparksql) { return OpenmldbDataframe(this, sparksql(sqlText)) } @@ -278,7 +280,15 @@ class OpenmldbSession { def close(): Unit = stop() def registerOpenmldbOfflineTable(catalogService: OpenmldbCatalogService): Unit = { + if (catalogService == null) { + return + } + val databases = catalogService.getDatabases + if (databases == null) { + return + } + databases.map(dbName => { val tableInfos = catalogService.getTableInfos(dbName) tableInfos.map(tableInfo => { @@ -323,7 +333,7 @@ 
class OpenmldbSession { } } catch { case e: Exception => { - logger.warn(s"Fail to register table $dbName.$tableName " + ExceptionUtils.getStackTrace(e)) + logger.warn(s"Fail to register table $dbName.$tableName, exception: " + ExceptionUtils.getStackTrace(e)) } } } diff --git a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/nodes/LoadDataPlan.scala b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/nodes/LoadDataPlan.scala index a04b46ab650..ec9946b839a 100644 --- a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/nodes/LoadDataPlan.scala +++ b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/nodes/LoadDataPlan.scala @@ -55,16 +55,19 @@ object LoadDataPlan { loadDataSql) // write - logger.info("write data to storage {}, writer[mode {}], is deep? {}", storage, mode, deepCopy.toString) + logger.info("write data to storage {}, writer mode {}, is deep {}", storage, mode, deepCopy.toString) if (storage == "online") { // Import online data require(deepCopy && mode == "append", "import to online storage, can't do soft copy, and mode must be append") val writeType = extra.get("writer_type").get + val putIfAbsent = extra.get("put_if_absent").get.toBoolean + logger.info(s"online write type ${writeType}, put if absent ${putIfAbsent}") val writeOptions = Map( "db" -> db, "table" -> table, "zkCluster" -> ctx.getConf.openmldbZkCluster, "zkPath" -> ctx.getConf.openmldbZkRootPath, - "writerType" -> writeType + "writerType" -> writeType, + "putIfAbsent" -> putIfAbsent.toString ) df.write.options(writeOptions).format("openmldb").mode(mode).save() } else { // Import offline data diff --git a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/utils/HybridseUtil.scala b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/utils/HybridseUtil.scala index 8bf6897d82f..6f3e5b78d40 100644 --- a/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/utils/HybridseUtil.scala +++ b/java/openmldb-batch/src/main/scala/com/_4paradigm/openmldb/batch/utils/HybridseUtil.scala @@ -247,16 +247,17 @@ object HybridseUtil { } // extra options for some special case - // only for PhysicalLoadDataNode var extraOptions: mutable.Map[String, String] = mutable.Map() + // only for PhysicalLoadDataNode extraOptions += ("deep_copy" -> parseOption(getOptionFromNode(node, "deep_copy"), "true", getBoolOrDefault)) - - // only for select into, "" means N/A - extraOptions += ("coalesce" -> parseOption(getOptionFromNode(node, "coalesce"), "0", getIntOrDefault)) - extraOptions += ("sql" -> parseOption(getOptionFromNode(node, "sql"), "", getStringOrDefault)) extraOptions += ("writer_type") -> parseOption(getOptionFromNode(node, "writer_type"), "single", getStringOrDefault) + extraOptions += ("sql" -> parseOption(getOptionFromNode(node, "sql"), "", getStringOrDefault)) + extraOptions += ("put_if_absent" -> parseOption(getOptionFromNode(node, "put_if_absent"), "false", + getBoolOrDefault)) + // only for select into, "" means N/A + extraOptions += ("coalesce" -> parseOption(getOptionFromNode(node, "coalesce"), "0", getIntOrDefault)) extraOptions += ("create_if_not_exists" -> parseOption(getOptionFromNode(node, "create_if_not_exists"), "true", getBoolOrDefault)) diff --git a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/jdbc/SQLConnection.java b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/jdbc/SQLConnection.java index 5383eaf246d..8259682755d 100644 --- 
a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/jdbc/SQLConnection.java +++ b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/jdbc/SQLConnection.java @@ -82,7 +82,8 @@ public java.sql.Statement createStatement() throws SQLException { @Override public java.sql.PreparedStatement prepareStatement(String sql) throws SQLException { String lower = sql.toLowerCase(); - if (lower.startsWith("insert into")) { + // insert, insert or xxx + if (lower.startsWith("insert ")) { return client.getInsertPreparedStmt(this.defaultDatabase, sql); } else if (lower.startsWith("select")) { return client.getPreparedStatement(this.defaultDatabase, sql); diff --git a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/DAGNode.java b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/DAGNode.java new file mode 100644 index 00000000000..c3334f281b1 --- /dev/null +++ b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/DAGNode.java @@ -0,0 +1,31 @@ +/** + * Copyright (c) 2023 OpenMLDB authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com._4paradigm.openmldb.sdk; + +import java.util.ArrayList; + +public class DAGNode { + public DAGNode(String name, String sql, ArrayList producers) { + this.name = name; + this.sql = sql; + this.producers = producers; + } + + public String name; + public String sql; + public ArrayList producers; +} diff --git a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/SdkOption.java b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/SdkOption.java index eca5289bf32..66d0d83bef9 100644 --- a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/SdkOption.java +++ b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/SdkOption.java @@ -17,13 +17,14 @@ package com._4paradigm.openmldb.sdk; import lombok.Data; +import java.io.Serializable; import com._4paradigm.openmldb.BasicRouterOptions; import com._4paradigm.openmldb.SQLRouterOptions; import com._4paradigm.openmldb.StandaloneOptions; @Data -public class SdkOption { +public class SdkOption implements Serializable { // TODO(hw): set isClusterMode automatically private boolean isClusterMode = true; // options for cluster mode diff --git a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/SqlExecutor.java b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/SqlExecutor.java index b55da67a430..1d81c271b7f 100644 --- a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/SqlExecutor.java +++ b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/SqlExecutor.java @@ -83,6 +83,13 @@ PreparedStatement getBatchRequestPreparedStmt(String db, String sql, NS.TableInfo getTableInfo(String db, String table); List getTableNames(String db); + /** + * Parse SQL query into DAG representation + * + * @param query SQL query string + * @throws SQLException exception if input query not valid for SQL parser + */ + DAGNode SQLToDAG(String query) throws SQLException; void close(); } diff --git 
a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/InsertPreparedStatementImpl.java b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/InsertPreparedStatementImpl.java index 6acefe8acff..ecc39b467c1 100644 --- a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/InsertPreparedStatementImpl.java +++ b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/InsertPreparedStatementImpl.java @@ -319,7 +319,7 @@ public boolean execute() throws SQLException { // actually only one row boolean ok = router.ExecuteInsert(cache.getDatabase(), cache.getName(), cache.getTid(), cache.getPartitionNum(), - dimensions.array(), dimensions.capacity(), value.array(), value.capacity(), status); + dimensions.array(), dimensions.capacity(), value.array(), value.capacity(), cache.isPutIfAbsent(), status); // cleanup rows even if insert failed // we can't execute() again without set new row, so we must clean up here clearParameters(); @@ -381,7 +381,7 @@ public int[] executeBatch() throws SQLException { boolean ok = router.ExecuteInsert(cache.getDatabase(), cache.getName(), cache.getTid(), cache.getPartitionNum(), pair.getKey().array(), pair.getKey().capacity(), - pair.getValue().array(), pair.getValue().capacity(), status); + pair.getValue().array(), pair.getValue().capacity(), cache.isPutIfAbsent(), status); if (!ok) { // TODO(hw): may lost log, e.g. openmldb-batch online import in yarn mode? logger.warn(status.ToString()); diff --git a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/InsertPreparedStatementMeta.java b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/InsertPreparedStatementMeta.java index 448438e9d31..cf2bd05cb58 100644 --- a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/InsertPreparedStatementMeta.java +++ b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/InsertPreparedStatementMeta.java @@ -31,6 +31,7 @@ public class InsertPreparedStatementMeta { private Set indexPos = new HashSet<>(); private Map> indexMap = new HashMap<>(); private Map defaultIndexValue = new HashMap<>(); + private boolean putIfAbsent; public InsertPreparedStatementMeta(String sql, NS.TableInfo tableInfo, SQLInsertRow insertRow) { this.sql = sql; @@ -51,6 +52,7 @@ public InsertPreparedStatementMeta(String sql, NS.TableInfo tableInfo, SQLInsert VectorUint32 idxArray = insertRow.GetHoleIdx(); buildHoleIdx(idxArray); idxArray.delete(); + putIfAbsent = insertRow.IsPutIfAbsent(); } private void buildIndex(NS.TableInfo tableInfo) { @@ -215,4 +217,8 @@ Map> getIndexMap() { Map getDefaultIndexValue() { return defaultIndexValue; } + + public boolean isPutIfAbsent() { + return putIfAbsent; + } } diff --git a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/SqlClusterExecutor.java b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/SqlClusterExecutor.java index 3a88fb9489e..0f1cd191911 100644 --- a/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/SqlClusterExecutor.java +++ b/java/openmldb-jdbc/src/main/java/com/_4paradigm/openmldb/sdk/impl/SqlClusterExecutor.java @@ -665,4 +665,30 @@ public boolean updateOfflineTableInfo(NS.TableInfo info) { public boolean refreshCatalog() { return sqlRouter.RefreshCatalog(); } + + @Override + public DAGNode SQLToDAG(String query) throws SQLException { + Status status = new Status(); + final com._4paradigm.openmldb.DAGNode dag = sqlRouter.SQLToDAG(query, status); + + try { + if (status.getCode() != 0) { + throw new 
SQLException(status.ToString()); + } + return convertDAG(dag); + } finally { + dag.delete(); + status.delete(); + } + } + + private static DAGNode convertDAG(com._4paradigm.openmldb.DAGNode dag) { + ArrayList convertedProducers = new ArrayList<>(); + for (com._4paradigm.openmldb.DAGNode producer : dag.getProducers()) { + final DAGNode converted = convertDAG(producer); + convertedProducers.add(converted); + } + + return new DAGNode(dag.getName(), dag.getSql(), convertedProducers); + } } diff --git a/java/openmldb-jdbc/src/test/java/com/_4paradigm/openmldb/jdbc/SQLRouterSmokeTest.java b/java/openmldb-jdbc/src/test/java/com/_4paradigm/openmldb/jdbc/SQLRouterSmokeTest.java index bc92f20d3f5..60a0ef744f5 100644 --- a/java/openmldb-jdbc/src/test/java/com/_4paradigm/openmldb/jdbc/SQLRouterSmokeTest.java +++ b/java/openmldb-jdbc/src/test/java/com/_4paradigm/openmldb/jdbc/SQLRouterSmokeTest.java @@ -21,6 +21,7 @@ import com._4paradigm.openmldb.common.Pair; import com._4paradigm.openmldb.proto.NS; import com._4paradigm.openmldb.sdk.Column; +import com._4paradigm.openmldb.sdk.DAGNode; import com._4paradigm.openmldb.sdk.Schema; import com._4paradigm.openmldb.sdk.SdkOption; import com._4paradigm.openmldb.sdk.SqlExecutor; @@ -870,4 +871,60 @@ public void testMergeSQL() throws SQLException { + "(select db.main.id as merge_id_3, db.main.c1 as merge_c1_3, sum(c2) over w1 from main window w1 as (union (select \"\" as id, * from t1) partition by c1 order by c2 rows between unbounded preceding and current row)) as out3 " + "on out0.merge_id_0 = out3.merge_id_3 and out0.merge_c1_0 = out3.merge_c1_3;"); } + + @Test(dataProvider = "executor") + public void testSQLToDag(SqlExecutor router) throws SQLException { + String sql = " WITH q1 as (WITH q3 as (select * from t1 LIMIT 10), q4 as (select * from t2) select * from q3 left join q4 on q3.id = q4.id)," + + + "q2 as (select * from t3)" + + "select * from q1 last join q2 on q1.id = q2.id"; + + DAGNode dag = router.SQLToDAG(sql); + + Assert.assertEquals(dag.name, ""); + Assert.assertEquals(dag.sql, "SELECT\n" + + " *\n" + + "FROM\n" + + " q1\n" + + " LAST JOIN\n" + + " q2\n" + + " ON q1.id = q2.id\n"); + Assert.assertEquals(dag.producers.size(), 2); + + DAGNode input1 = dag.producers.get(0); + Assert.assertEquals(input1.name, "q1"); + Assert.assertEquals(input1.sql, "SELECT\n" + + " *\n" + + "FROM\n" + + " q3\n" + + " LEFT JOIN\n" + + " q4\n" + + " ON q3.id = q4.id\n"); + Assert.assertEquals(2, input1.producers.size()); + + DAGNode input2 = dag.producers.get(1); + Assert.assertEquals(input2.name, "q2"); + Assert.assertEquals(input2.sql, "SELECT\n" + + " *\n" + + "FROM\n" + + " t3\n"); + Assert.assertEquals(input2.producers.size(), 0); + + DAGNode q1In1 = input1.producers.get(0); + Assert.assertEquals(q1In1.producers.size(), 0); + Assert.assertEquals(q1In1.name, "q3"); + Assert.assertEquals(q1In1.sql, "SELECT\n" + + " *\n" + + "FROM\n" + + " t1\n" + + "LIMIT 10\n"); + + DAGNode q1In2 = input1.producers.get(1); + Assert.assertEquals(q1In2.producers.size(), 0); + Assert.assertEquals(q1In2.name, "q4"); + Assert.assertEquals(q1In2.sql, "SELECT\n" + + " *\n" + + "FROM\n" + + " t2\n"); + } } diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbConfig.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbConfig.java new file mode 100644 index 00000000000..7c0981d0a6c --- /dev/null +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbConfig.java @@ -0,0 +1,101 @@ 
+/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com._4paradigm.openmldb.spark; + +import com._4paradigm.openmldb.sdk.SdkOption; + +import java.io.Serializable; + +import org.sparkproject.guava.base.Preconditions; + +// Must serializable +public class OpenmldbConfig implements Serializable { + public final static String DB = "db"; + public final static String TABLE = "table"; + public final static String ZK_CLUSTER = "zkCluster"; + public final static String ZK_PATH = "zkPath"; + + /* read&write */ + private String dbName; + private String tableName; + private SdkOption option = null; + + /* write */ + // single: insert when read one row + // batch: insert when commit(after read a whole partition) + private String writerType = "single"; + private int insertMemoryUsageLimit = 0; + private boolean putIfAbsent = false; + + public OpenmldbConfig() { + } + + public void setDB(String dbName) { + Preconditions.checkArgument(dbName != null && !dbName.isEmpty(), "db name must not be empty"); + this.dbName = dbName; + } + + public String getDB() { + return this.dbName; + } + + public void setTable(String tableName) { + Preconditions.checkArgument(tableName != null && !tableName.isEmpty(), "table name must not be empty"); + this.tableName = tableName; + } + + public String getTable() { + return this.tableName; + } + + public void setSdkOption(SdkOption option) { + this.option = option; + } + + public SdkOption getSdkOption() { + return this.option; + } + + public void setWriterType(String string) { + Preconditions.checkArgument(string.equals("single") || string.equals("batch"), + "writerType must be 'single' or 'batch'"); + this.writerType = string; + } + + public void setInsertMemoryUsageLimit(int int1) { + Preconditions.checkArgument(int1 >= 0, "insert_memory_usage_limit must be >= 0"); + this.insertMemoryUsageLimit = int1; + } + + public void setPutIfAbsent(Boolean valueOf) { + this.putIfAbsent = valueOf; + } + + public boolean isBatchWriter() { + return this.writerType.equals("batch"); + } + + public boolean putIfAbsent() { + return this.putIfAbsent; + } + + public int getInsertMemoryUsageLimit() { + return this.insertMemoryUsageLimit; + } + +} diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbSource.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbSource.java index 978c3cca694..9dfe78f0197 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbSource.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbSource.java @@ -18,7 +18,6 @@ package com._4paradigm.openmldb.spark; import com._4paradigm.openmldb.sdk.SdkOption; -import com.google.common.base.Preconditions; import 
org.apache.spark.sql.connector.catalog.Table; import org.apache.spark.sql.connector.catalog.TableProvider; import org.apache.spark.sql.connector.expressions.Transform; @@ -29,31 +28,20 @@ import java.util.Map; public class OpenmldbSource implements TableProvider, DataSourceRegister { - private final String DB = "db"; - private final String TABLE = "table"; - private final String ZK_CLUSTER = "zkCluster"; - private final String ZK_PATH = "zkPath"; - private String dbName; - private String tableName; - private SdkOption option = null; - // single: insert when read one row - // batch: insert when commit(after read a whole partition) - private String writerType = "single"; + private OpenmldbConfig config = new OpenmldbConfig(); @Override public StructType inferSchema(CaseInsensitiveStringMap options) { - Preconditions.checkNotNull(dbName = options.get(DB)); - Preconditions.checkNotNull(tableName = options.get(TABLE)); + config.setDB(options.get(OpenmldbConfig.DB)); + config.setTable(options.get(OpenmldbConfig.TABLE)); - String zkCluster = options.get(ZK_CLUSTER); - String zkPath = options.get(ZK_PATH); - Preconditions.checkNotNull(zkCluster); - Preconditions.checkNotNull(zkPath); - option = new SdkOption(); - option.setZkCluster(zkCluster); - option.setZkPath(zkPath); + SdkOption option = new SdkOption(); + option.setZkCluster(options.get(OpenmldbConfig.ZK_CLUSTER)); + option.setZkPath(options.get(OpenmldbConfig.ZK_PATH)); option.setLight(true); + config.setSdkOption(option); + String timeout = options.get("sessionTimeout"); if (timeout != null) { option.setSessionTimeout(Integer.parseInt(timeout)); @@ -68,15 +56,21 @@ public StructType inferSchema(CaseInsensitiveStringMap options) { } if (options.containsKey("writerType")) { - writerType = options.get("writerType"); + config.setWriterType(options.get("writerType")); + } + if (options.containsKey("putIfAbsent")) { + config.setPutIfAbsent(Boolean.valueOf(options.get("putIfAbsent"))); } + if (options.containsKey("insert_memory_usage_limit")) { + config.setInsertMemoryUsageLimit(Integer.parseInt(options.get("insert_memory_usage_limit"))); + } return null; } @Override public Table getTable(StructType schema, Transform[] partitioning, Map properties) { - return new OpenmldbTable(dbName, tableName, option, writerType); + return new OpenmldbTable(config); } @Override diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbTable.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbTable.java index 0cf98b7d19e..e5cbcfe40ca 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbTable.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/OpenmldbTable.java @@ -22,10 +22,8 @@ import com._4paradigm.openmldb.sdk.SqlException; import com._4paradigm.openmldb.sdk.SqlExecutor; import com._4paradigm.openmldb.sdk.impl.SqlClusterExecutor; -import com._4paradigm.openmldb.spark.read.OpenmldbReadConfig; import com._4paradigm.openmldb.spark.read.OpenmldbScanBuilder; import com._4paradigm.openmldb.spark.write.OpenmldbWriteBuilder; -import com._4paradigm.openmldb.spark.write.OpenmldbWriteConfig; import org.apache.spark.sql.connector.catalog.SupportsRead; import org.apache.spark.sql.connector.catalog.SupportsWrite; import org.apache.spark.sql.connector.catalog.TableCapability; @@ -45,38 +43,32 @@ import java.util.Set; public class OpenmldbTable implements SupportsWrite, SupportsRead { - private final String dbName; - private 
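
The options collected into OpenmldbConfig above map directly onto Spark DataFrame writer options. A rough usage sketch, assuming the source is registered under the short name "openmldb" and that the database, table, ZooKeeper address and input path below are placeholders:

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    SparkSession spark = SparkSession.builder().appName("openmldb-writer-demo").getOrCreate();
    Dataset<Row> df = spark.read().parquet("/tmp/feature_data.parquet"); // placeholder input

    df.write()
      .format("openmldb")                        // assumed DataSourceRegister short name
      .option("db", "demo_db")                   // placeholder database
      .option("table", "t1")                     // placeholder table
      .option("zkCluster", "127.0.0.1:2181")     // placeholder ZooKeeper quorum
      .option("zkPath", "/openmldb")
      .option("writerType", "batch")             // "single" (default, row-by-row) or "batch" (per partition)
      .option("putIfAbsent", "true")             // writers then build INSERT OR IGNORE statements
      .option("insert_memory_usage_limit", "50") // forwarded as SET @@insert_memory_usage_limit
      .mode("append")
      .save();
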
final String tableName; - private final SdkOption option; - private final String writerType; - private SqlExecutor executor = null; + private OpenmldbConfig config; + private SqlExecutor executor; private Set capabilities; - public OpenmldbTable(String dbName, String tableName, SdkOption option, String writerType) { - this.dbName = dbName; - this.tableName = tableName; - this.option = option; - this.writerType = writerType; + public OpenmldbTable(OpenmldbConfig config) { + this.config = config; try { - this.executor = new SqlClusterExecutor(option); + this.executor = new SqlClusterExecutor(config.getSdkOption()); // no need to check table exists, schema() will check it later } catch (SqlException e) { e.printStackTrace(); + throw new RuntimeException("conn openmldb failed", e); } // TODO: cache schema & delete executor? } @Override public WriteBuilder newWriteBuilder(LogicalWriteInfo info) { - OpenmldbWriteConfig config = new OpenmldbWriteConfig(dbName, tableName, option, writerType); return new OpenmldbWriteBuilder(config, info); } @Override public String name() { // TODO(hw): db? - return tableName; + return config.getTable(); } public static DataType sdkTypeToSparkType(int sqlType) { @@ -107,7 +99,7 @@ public static DataType sdkTypeToSparkType(int sqlType) { @Override public StructType schema() { try { - Schema schema = executor.getTableSchema(dbName, tableName); + Schema schema = executor.getTableSchema(config.getDB(), config.getTable()); List schemaList = schema.getColumnList(); StructField[] fields = new StructField[schemaList.size()]; for (int i = 0; i < schemaList.size(); i++) { @@ -134,7 +126,6 @@ public Set capabilities() { @Override public ScanBuilder newScanBuilder(CaseInsensitiveStringMap caseInsensitiveStringMap) { - OpenmldbReadConfig config = new OpenmldbReadConfig(dbName, tableName, option); return new OpenmldbScanBuilder(config); } } diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbPartitionReaderFactory.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbPartitionReaderFactory.java index d5e435fc247..929d30b728e 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbPartitionReaderFactory.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbPartitionReaderFactory.java @@ -17,15 +17,16 @@ package com._4paradigm.openmldb.spark.read; +import com._4paradigm.openmldb.spark.OpenmldbConfig; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.connector.read.InputPartition; import org.apache.spark.sql.connector.read.PartitionReader; import org.apache.spark.sql.connector.read.PartitionReaderFactory; public class OpenmldbPartitionReaderFactory implements PartitionReaderFactory { - private final OpenmldbReadConfig config; + private final OpenmldbConfig config; - public OpenmldbPartitionReaderFactory(OpenmldbReadConfig config) { + public OpenmldbPartitionReaderFactory(OpenmldbConfig config) { this.config = config; } diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbReadConfig.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbReadConfig.java deleted file mode 100644 index 91489888ba9..00000000000 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbReadConfig.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation 
(ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com._4paradigm.openmldb.spark.read; - -import com._4paradigm.openmldb.sdk.SdkOption; -import java.io.Serializable; - -// Must serializable -public class OpenmldbReadConfig implements Serializable { - public final String dbName, tableName, zkCluster, zkPath; - - public OpenmldbReadConfig(String dbName, String tableName, SdkOption option) { - this.dbName = dbName; - this.tableName = tableName; - this.zkCluster = option.getZkCluster(); - this.zkPath = option.getZkPath(); - // TODO(hw): other configs in SdkOption - } -} diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbScan.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbScan.java index fb7adb46b8e..4eeac9a6013 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbScan.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbScan.java @@ -17,6 +17,7 @@ package com._4paradigm.openmldb.spark.read; +import com._4paradigm.openmldb.spark.OpenmldbConfig; import org.apache.spark.sql.connector.read.Batch; import org.apache.spark.sql.connector.read.InputPartition; import org.apache.spark.sql.connector.read.PartitionReaderFactory; @@ -24,9 +25,9 @@ import org.apache.spark.sql.types.StructType; public class OpenmldbScan implements Scan, Batch { - private final OpenmldbReadConfig config; + private final OpenmldbConfig config; - public OpenmldbScan(OpenmldbReadConfig config) { + public OpenmldbScan(OpenmldbConfig config) { this.config = config; } diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbScanBuilder.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbScanBuilder.java index 2b500a6592e..de59a811f46 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbScanBuilder.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/read/OpenmldbScanBuilder.java @@ -17,13 +17,14 @@ package com._4paradigm.openmldb.spark.read; +import com._4paradigm.openmldb.spark.OpenmldbConfig; import org.apache.spark.sql.connector.read.Scan; import org.apache.spark.sql.connector.read.ScanBuilder; public class OpenmldbScanBuilder implements ScanBuilder { - private final OpenmldbReadConfig config; + private final OpenmldbConfig config; - public OpenmldbScanBuilder(OpenmldbReadConfig config) { + public OpenmldbScanBuilder(OpenmldbConfig config) { this.config = config; } diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbBatchWrite.java 
b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbBatchWrite.java index ca90a07d63a..d19fd9f6aeb 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbBatchWrite.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbBatchWrite.java @@ -17,6 +17,7 @@ package com._4paradigm.openmldb.spark.write; +import com._4paradigm.openmldb.spark.OpenmldbConfig; import org.apache.spark.sql.connector.write.BatchWrite; import org.apache.spark.sql.connector.write.DataWriterFactory; import org.apache.spark.sql.connector.write.LogicalWriteInfo; @@ -24,10 +25,10 @@ import org.apache.spark.sql.connector.write.WriterCommitMessage; public class OpenmldbBatchWrite implements BatchWrite { - private final OpenmldbWriteConfig config; + private final OpenmldbConfig config; private final LogicalWriteInfo info; - public OpenmldbBatchWrite(OpenmldbWriteConfig config, LogicalWriteInfo info) { + public OpenmldbBatchWrite(OpenmldbConfig config, LogicalWriteInfo info) { this.config = config; this.info = info; } diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataSingleWriter.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataSingleWriter.java index 2885aaba70e..cc5f0150cc3 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataSingleWriter.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataSingleWriter.java @@ -17,8 +17,9 @@ package com._4paradigm.openmldb.spark.write; +import com._4paradigm.openmldb.spark.OpenmldbConfig; + import com._4paradigm.openmldb.sdk.Schema; -import com._4paradigm.openmldb.sdk.SdkOption; import com._4paradigm.openmldb.sdk.SqlException; import com._4paradigm.openmldb.sdk.impl.SqlClusterExecutor; import com.google.common.base.Preconditions; @@ -27,31 +28,26 @@ import org.apache.spark.sql.connector.write.WriterCommitMessage; import java.io.IOException; -import java.sql.Date; import java.sql.PreparedStatement; import java.sql.ResultSetMetaData; import java.sql.SQLException; -import java.sql.Timestamp; -import java.sql.Types; public class OpenmldbDataSingleWriter implements DataWriter { private final int partitionId; private final long taskId; private PreparedStatement preparedStatement = null; - public OpenmldbDataSingleWriter(OpenmldbWriteConfig config, int partitionId, long taskId) { + public OpenmldbDataSingleWriter(OpenmldbConfig config, int partitionId, long taskId) { try { - SdkOption option = new SdkOption(); - option.setZkCluster(config.zkCluster); - option.setZkPath(config.zkPath); - option.setLight(true); - SqlClusterExecutor executor = new SqlClusterExecutor(option); - String dbName = config.dbName; - String tableName = config.tableName; + SqlClusterExecutor executor = new SqlClusterExecutor(config.getSdkOption()); + String dbName = config.getDB(); + String tableName = config.getTable(); + executor.executeSQL(dbName, "SET @@insert_memory_usage_limit=" + config.getInsertMemoryUsageLimit()); Schema schema = executor.getTableSchema(dbName, tableName); // create insert placeholder - StringBuilder insert = new StringBuilder("insert into " + tableName + " values(?"); + String insert_part = config.putIfAbsent()? 
"insert or ignore into " : "insert into "; + StringBuilder insert = new StringBuilder(insert_part + tableName + " values(?"); for (int i = 1; i < schema.getColumnList().size(); i++) { insert.append(",?"); } @@ -59,6 +55,7 @@ public OpenmldbDataSingleWriter(OpenmldbWriteConfig config, int partitionId, lon preparedStatement = executor.getInsertPreparedStmt(dbName, insert.toString()); } catch (SQLException | SqlException e) { e.printStackTrace(); + throw new RuntimeException("create openmldb writer failed", e); } this.partitionId = partitionId; @@ -72,7 +69,12 @@ public void write(InternalRow record) throws IOException { ResultSetMetaData metaData = preparedStatement.getMetaData(); Preconditions.checkState(record.numFields() == metaData.getColumnCount()); OpenmldbDataWriter.addRow(record, preparedStatement); - preparedStatement.execute(); + // check return for put result + // you can cache failed rows and throw exception when commit/close, + // but it still may interrupt other writers(pending or slow writers) + if(!preparedStatement.execute()) { + throw new IOException("execute failed"); + } } catch (Exception e) { throw new IOException("write row to openmldb failed on " + record, e); } @@ -80,24 +82,13 @@ public void write(InternalRow record) throws IOException { @Override public WriterCommitMessage commit() throws IOException { - try { - preparedStatement.close(); - } catch (SQLException e) { - e.printStackTrace(); - throw new IOException("commit error", e); - } - // TODO(hw): need to return new WriterCommitMessageImpl(partitionId, taskId); ? + // no transaction, no commit return null; } @Override public void abort() throws IOException { - try { - preparedStatement.close(); - } catch (SQLException e) { - e.printStackTrace(); - throw new IOException("abort error", e); - } + // no transaction, no abort } @Override diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataWriter.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataWriter.java index 5da75e99348..65bc2e5a457 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataWriter.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataWriter.java @@ -17,6 +17,8 @@ package com._4paradigm.openmldb.spark.write; +import com._4paradigm.openmldb.spark.OpenmldbConfig; + import com._4paradigm.openmldb.sdk.Schema; import com._4paradigm.openmldb.sdk.SdkOption; import com._4paradigm.openmldb.sdk.SqlException; @@ -39,19 +41,17 @@ public class OpenmldbDataWriter implements DataWriter { private final long taskId; private PreparedStatement preparedStatement = null; - public OpenmldbDataWriter(OpenmldbWriteConfig config, int partitionId, long taskId) { + public OpenmldbDataWriter(OpenmldbConfig config, int partitionId, long taskId) { try { - SdkOption option = new SdkOption(); - option.setZkCluster(config.zkCluster); - option.setZkPath(config.zkPath); - option.setLight(true); - SqlClusterExecutor executor = new SqlClusterExecutor(option); - String dbName = config.dbName; - String tableName = config.tableName; + SqlClusterExecutor executor = new SqlClusterExecutor(config.getSdkOption()); + String dbName = config.getDB(); + String tableName = config.getTable(); + executor.executeSQL(dbName, "SET @@insert_memory_usage_limit=" + config.getInsertMemoryUsageLimit()); Schema schema = executor.getTableSchema(dbName, tableName); // create insert placeholder - StringBuilder 
insert = new StringBuilder("insert into " + tableName + " values(?"); + String insert_part = config.putIfAbsent()? "insert or ignore into " : "insert into "; + StringBuilder insert = new StringBuilder(insert_part + tableName + " values(?"); for (int i = 1; i < schema.getColumnList().size(); i++) { insert.append(",?"); } @@ -59,6 +59,7 @@ public OpenmldbDataWriter(OpenmldbWriteConfig config, int partitionId, long task preparedStatement = executor.getInsertPreparedStmt(dbName, insert.toString()); } catch (SQLException | SqlException e) { e.printStackTrace(); + throw new RuntimeException("create openmldb data writer failed", e); } this.partitionId = partitionId; @@ -146,12 +147,7 @@ public WriterCommitMessage commit() throws IOException { @Override public void abort() throws IOException { - try { - preparedStatement.close(); - } catch (SQLException e) { - e.printStackTrace(); - throw new IOException("abort error", e); - } + // no transaction, no abort } @Override diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataWriterFactory.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataWriterFactory.java index 96e78979b2f..12cefb3928b 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataWriterFactory.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbDataWriterFactory.java @@ -17,20 +17,21 @@ package com._4paradigm.openmldb.spark.write; +import com._4paradigm.openmldb.spark.OpenmldbConfig; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.connector.write.DataWriter; import org.apache.spark.sql.connector.write.DataWriterFactory; public class OpenmldbDataWriterFactory implements DataWriterFactory { - private final OpenmldbWriteConfig config; + private final OpenmldbConfig config; - public OpenmldbDataWriterFactory(OpenmldbWriteConfig config) { + public OpenmldbDataWriterFactory(OpenmldbConfig config) { this.config = config; } @Override public DataWriter createWriter(int partitionId, long taskId) { - if (!config.writerType.equals("batch")) { + if (!config.isBatchWriter()) { return new OpenmldbDataSingleWriter(config, partitionId, taskId); } return new OpenmldbDataWriter(config, partitionId, taskId); diff --git a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbWriteBuilder.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbWriteBuilder.java index a3c905b15c1..ccd588df0c4 100644 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbWriteBuilder.java +++ b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbWriteBuilder.java @@ -17,15 +17,16 @@ package com._4paradigm.openmldb.spark.write; +import com._4paradigm.openmldb.spark.OpenmldbConfig; import org.apache.spark.sql.connector.write.BatchWrite; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.connector.write.WriteBuilder; public class OpenmldbWriteBuilder implements WriteBuilder { - private final OpenmldbWriteConfig config; + private final OpenmldbConfig config; private final LogicalWriteInfo info; - public OpenmldbWriteBuilder(OpenmldbWriteConfig config, LogicalWriteInfo info) { + public OpenmldbWriteBuilder(OpenmldbConfig config, LogicalWriteInfo info) { this.config = config; this.info = info; } diff --git 
a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbWriteConfig.java b/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbWriteConfig.java deleted file mode 100644 index 89c2d801ca5..00000000000 --- a/java/openmldb-spark-connector/src/main/java/com/_4paradigm/openmldb/spark/write/OpenmldbWriteConfig.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com._4paradigm.openmldb.spark.write; - -import com._4paradigm.openmldb.sdk.SdkOption; - -import java.io.Serializable; - -// Must serializable -public class OpenmldbWriteConfig implements Serializable { - public final String dbName, tableName, zkCluster, zkPath, writerType; - - public OpenmldbWriteConfig(String dbName, String tableName, SdkOption option, String writerType) { - this.dbName = dbName; - this.tableName = tableName; - this.zkCluster = option.getZkCluster(); - this.zkPath = option.getZkPath(); - this.writerType = writerType; - // TODO(hw): other configs in SdkOption - } -} diff --git a/java/openmldb-spark-connector/src/main/scala/com/_4paradigm/openmldb/spark/read/OpenmldbPartitionReader.scala b/java/openmldb-spark-connector/src/main/scala/com/_4paradigm/openmldb/spark/read/OpenmldbPartitionReader.scala index d8eeb89e7ab..86921d0f4d5 100644 --- a/java/openmldb-spark-connector/src/main/scala/com/_4paradigm/openmldb/spark/read/OpenmldbPartitionReader.scala +++ b/java/openmldb-spark-connector/src/main/scala/com/_4paradigm/openmldb/spark/read/OpenmldbPartitionReader.scala @@ -1,5 +1,6 @@ package com._4paradigm.openmldb.spark.read +import com._4paradigm.openmldb.spark.OpenmldbConfig import com._4paradigm.openmldb.sdk.{Schema, SdkOption} import com._4paradigm.openmldb.sdk.impl.SqlClusterExecutor import org.apache.spark.sql.catalyst.InternalRow @@ -8,15 +9,10 @@ import org.apache.spark.unsafe.types.UTF8String import java.sql.Types -class OpenmldbPartitionReader(config: OpenmldbReadConfig) extends PartitionReader[InternalRow] { - - val option = new SdkOption - option.setZkCluster(config.zkCluster) - option.setZkPath(config.zkPath) - option.setLight(true) - val executor = new SqlClusterExecutor(option) - val dbName: String = config.dbName - val tableName: String = config.tableName +class OpenmldbPartitionReader(config: OpenmldbConfig) extends PartitionReader[InternalRow] { + val executor = new SqlClusterExecutor(config.getSdkOption) + val dbName: String = config.getDB + val tableName: String = config.getTable val schema: Schema = executor.getTableSchema(dbName, tableName) executor.executeSQL(dbName, "SET @@execute_mode='online'") diff --git a/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/server/TaskManagerServer.java 
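
On the read side the same OpenmldbConfig is reused: OpenmldbPartitionReader builds an executor from getSdkOption, switches the session to online mode and scans the table, with the schema coming from getTableSchema. A rough sketch of reading the table back through Spark, under the same assumptions as the write example (short name "openmldb", placeholder db/table/ZooKeeper values):

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    SparkSession spark = SparkSession.builder().appName("openmldb-reader-demo").getOrCreate();

    Dataset<Row> rows = spark.read()
        .format("openmldb")                    // assumed DataSourceRegister short name
        .option("db", "demo_db")               // placeholder database
        .option("table", "t1")                 // placeholder table
        .option("zkCluster", "127.0.0.1:2181") // placeholder ZooKeeper quorum
        .option("zkPath", "/openmldb")
        .load();

    rows.show(); // online-mode scan of the OpenMLDB table
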
b/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/server/TaskManagerServer.java index 0a75c2e37b2..031b02764e7 100644 --- a/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/server/TaskManagerServer.java +++ b/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/server/TaskManagerServer.java @@ -70,9 +70,12 @@ public void start(Boolean blocking) throws ConfigException, IOException, Interru logger.info("The server becomes active master and prepare to do business logic"); if (TaskManagerConfig.getTrackUnfinishedJobs()) { // Start threads to track unfinished jobs - JobTrackerService.startTrackerThreads(); + JobTrackerService.startTrackerThreads(); // may throw exception + } + // if blocking, start a bg thread to reconnect zk + if (blocking) { + failoverWatcher.startReconnectThread(); } - // Start brpc server startRpcServer(blocking); } diff --git a/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/zk/FailoverWatcher.java b/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/zk/FailoverWatcher.java index 69c7689bd45..c176919d92c 100644 --- a/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/zk/FailoverWatcher.java +++ b/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/zk/FailoverWatcher.java @@ -5,7 +5,7 @@ * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -32,316 +32,354 @@ * blocking and notification for leader election. */ public class FailoverWatcher implements Watcher { - private static final Log LOG = LogFactory.getLog(FailoverWatcher.class); + private static final Log LOG = LogFactory.getLog(FailoverWatcher.class); - private final String baseZnode; - private final String masterZnode; - private final String zkQuorum; - private final int sessionTimeout; - private final int connectRetryTimes; - private final HostPort hostPort; - private ZooKeeper zooKeeper; - private final AtomicBoolean hasActiveServer = new AtomicBoolean(false); - private final AtomicBoolean becomeActiveServer = new AtomicBoolean(false); + private final String baseZnode; + private final String masterZnode; + private final String zkQuorum; + private final int sessionTimeout; + private final int connectRetryTimes; + private final HostPort hostPort; + private RecoverableZooKeeper zooKeeper; // thread-safe + private final AtomicBoolean connected = new AtomicBoolean(false); // record zookeeper connection status + private final AtomicBoolean hasActiveServer = new AtomicBoolean(false); + private final AtomicBoolean becomeActiveServer = new AtomicBoolean(false); - /** - * Initialize FailoverWatcher with properties. - * - * @throws IOException throw when can't connect with ZooKeeper - */ - public FailoverWatcher() throws IOException { + /** + * Initialize FailoverWatcher with properties. 
+ * + * @throws IOException throw when can't connect with ZooKeeper + */ + public FailoverWatcher() throws IOException { - baseZnode = TaskManagerConfig.getZkRootPath() + "/taskmanager"; - masterZnode = baseZnode + "/leader"; - zkQuorum = TaskManagerConfig.getZkCluster(); - sessionTimeout = TaskManagerConfig.getZkSessionTimeout(); - connectRetryTimes = 3; - String serverHost = TaskManagerConfig.getServerHost(); - int serverPort = TaskManagerConfig.getServerPort(); - hostPort = new HostPort(serverHost, serverPort); + baseZnode = TaskManagerConfig.getZkRootPath() + "/taskmanager"; + masterZnode = baseZnode + "/leader"; + zkQuorum = TaskManagerConfig.getZkCluster(); + sessionTimeout = TaskManagerConfig.getZkSessionTimeout(); + connectRetryTimes = 3; + String serverHost = TaskManagerConfig.getServerHost(); + int serverPort = TaskManagerConfig.getServerPort(); + hostPort = new HostPort(serverHost, serverPort); - connectZooKeeper(); + connectZooKeeper(); - initZnode(); - } + initZnode(); + } - /** - * Connect with ZooKeeper with retries. - * - * @throws IOException when error to construct ZooKeeper object after retrying - */ - protected void connectZooKeeper() throws IOException { - LOG.info("Connecting ZooKeeper " + zkQuorum); + /** + * Connect with ZooKeeper with retries. + * + * @throws IOException when error to construct ZooKeeper object after retrying + */ + protected void connectZooKeeper() throws IOException { + LOG.info("Connecting ZooKeeper " + zkQuorum); - for (int i = 0; i <= connectRetryTimes; i++) { - try { - zooKeeper = new ZooKeeper(zkQuorum, sessionTimeout, this); - break; - } catch (IOException e) { - if (i == connectRetryTimes) { - throw new IOException("Can't connect ZooKeeper after retrying", e); + for (int i = 0; i <= connectRetryTimes; i++) { + try { + zooKeeper = new RecoverableZooKeeper(zkQuorum, sessionTimeout, this); + break; + } catch (IOException e) { + if (i == connectRetryTimes) { + throw new IOException("Can't connect ZooKeeper after retrying", e); + } + LOG.error("Exception to connect ZooKeeper, retry " + (i + 1) + " times"); + } } - LOG.error("Exception to connect ZooKeeper, retry " + (i + 1) + " times"); - } } - } - /** - * Initialize the base znodes. - */ - protected void initZnode() { - try { - ZooKeeperUtil.createAndFailSilent(this, TaskManagerConfig.getZkRootPath()); - ZooKeeperUtil.createAndFailSilent(this, baseZnode); - } catch (Exception e) { - LOG.fatal("Error to create znode " + baseZnode - + ", exit immediately", e); - System.exit(0); + /** + * Initialize the base znodes. + */ + protected void initZnode() { + try { + ZooKeeperUtil.createAndFailSilent(this, TaskManagerConfig.getZkRootPath()); + ZooKeeperUtil.createAndFailSilent(this, baseZnode); + } catch (Exception e) { + LOG.fatal("Error to create znode " + baseZnode + ", exit immediately", e); + System.exit(0); + } } - } - /** - * Override this mothod to deal with events for leader election. - * - * @param event the ZooKeeper event - */ - @Override - public void process(WatchedEvent event) { - if (LOG.isDebugEnabled()) { - LOG.debug("Received ZooKeeper Event, " + "type=" + event.getType() + ", " + "state=" - + event.getState() + ", " + "path=" + event.getPath()); - } + /** + * Override this mothod to deal with events for leader election. 
+ * + * @param event the ZooKeeper event + */ + @Override + public void process(WatchedEvent event) { + if (LOG.isDebugEnabled()) { + LOG.debug("Received ZooKeeper Event, " + "type=" + event.getType() + ", " + "state=" + event.getState() + + ", " + "path=" + event.getPath()); + } - switch (event.getType()) { - case None: { - processConnection(event); - break; - } - case NodeCreated: { - processNodeCreated(event.getPath()); - break; - } - case NodeDeleted: { - processNodeDeleted(event.getPath()); - break; - } - case NodeDataChanged: { - processDataChanged(event.getPath()); - break; - } - case NodeChildrenChanged: { - processNodeChildrenChanged(event.getPath()); - break; - } - default: - break; + switch (event.getType()) { + case None: { + processConnection(event); + break; + } + case NodeCreated: { + processNodeCreated(event.getPath()); + break; + } + case NodeDeleted: { + processNodeDeleted(event.getPath()); + break; + } + case NodeDataChanged: { + processDataChanged(event.getPath()); + break; + } + case NodeChildrenChanged: { + processNodeChildrenChanged(event.getPath()); + break; + } + default: + break; + } } - } - - /** - * Deal with connection event, exit current process if auth fails or session expires. - * - * @param event the ZooKeeper event - */ - protected void processConnection(WatchedEvent event) { - switch (event.getState()) { - case SyncConnected: - LOG.info(hostPort.getHostPort() + " sync connect from ZooKeeper"); - try { - waitToInitZooKeeper(2000); // init zookeeper in another thread, wait for a while - } catch (Exception e) { - LOG.fatal("Error to init ZooKeeper object after sleeping 2000 ms, exit immediately"); - System.exit(0); - } - break; - /* - case Disconnected: // be triggered when kill the server or the leader of zk cluster - LOG.warn(hostPort.getHostPort() + " received disconnected from ZooKeeper"); - if (becomeActiveServer.get()) { - // Exit if this is master and disconnect from ZK - System.exit(0); - } - break; - */ - case AuthFailed: - LOG.fatal(hostPort.getHostPort() + " auth fail, exit immediately"); - System.exit(0); - case Expired: - LOG.fatal(hostPort.getHostPort() + " received expired from ZooKeeper, exit immediately"); - System.exit(0); - break; - default: - break; + /** + * Deal with connection event, exit current process if auth fails or session expires. 
+ * + * @param event the ZooKeeper event + */ + protected void processConnection(WatchedEvent event) { + switch (event.getState()) { + case SyncConnected: + LOG.info(hostPort.getHostPort() + " sync connect from ZooKeeper"); + try { + waitToInitZooKeeper(2000); // init zookeeper in another thread, wait for a while + } catch (Exception e) { + LOG.fatal("Error to init ZooKeeper object after sleeping 2000 ms, exit immediately"); + System.exit(0); + } + LOG.info(hostPort.getHostPort() + " init ZooKeeper object successfully, session id is 0x" + Long.toHexString( + zooKeeper.getSessionId())); + connected.set(true); + break; + /* + case Disconnected: // be triggered when kill the server or the leader of zk cluster + LOG.warn(hostPort.getHostPort() + " received disconnected from ZooKeeper"); + + if (becomeActiveServer.get()) { + // Exit if this is master and disconnect from ZK + System.exit(0); + } + break; + */ + case AuthFailed: + LOG.fatal(hostPort.getHostPort() + " auth fail, exit immediately"); + System.exit(0); + case Expired: + LOG.warn(hostPort.getHostPort() + " received expired from ZooKeeper"); + default: + // expired or other unknown state: mark as disconnected + connected.set(false); + break; + } } - } - /** - * Deal with create node event, just call the leader election. - * - * @param path which znode is created - */ - protected void processNodeCreated(String path) { - if (path.equals(masterZnode)) { - LOG.info(masterZnode + " created and try to become active master"); - handleMasterNodeChange(); + /** + * Deal with create node event, just call the leader election. + * + * @param path which znode is created + */ + protected void processNodeCreated(String path) { + if (path.equals(masterZnode)) { + LOG.info(masterZnode + " created and try to become active master"); + handleMasterNodeChange(); + } } - } - /** - * Deal with delete node event, just call the leader election. - * - * @param path which znode is deleted - */ - protected void processNodeDeleted(String path) { - if (path.equals(masterZnode)) { - LOG.info(masterZnode + " deleted and try to become active master"); - handleMasterNodeChange(); + /** + * Deal with delete node event, just call the leader election. + * + * @param path which znode is deleted + */ + protected void processNodeDeleted(String path) { + if (path.equals(masterZnode)) { + LOG.info(masterZnode + " deleted and try to become active master"); + handleMasterNodeChange(); + } } - } - /** - * Do nothing when data changes, should be overrided. - * - * @param path which znode's data is changed - */ - protected void processDataChanged(String path) { + /** + * Do nothing when data changes, should be overrided. + * + * @param path which znode's data is changed + */ + protected void processDataChanged(String path) { - } + } - /** - * Do nothing when children znode changes, should be overrided. - * - * @param path which znode's children is changed. - */ - protected void processNodeChildrenChanged(String path) { + /** + * Do nothing when children znode changes, should be overrided. + * + * @param path which znode's children is changed. + */ + protected void processNodeChildrenChanged(String path) { - } + } - /** - * Implement the logic of leader election. 
- */ - private void handleMasterNodeChange() { - try { - synchronized (hasActiveServer) { - if (ZooKeeperUtil.watchAndCheckExists(this, masterZnode)) { - // A master node exists, there is an active master - if (LOG.isDebugEnabled()) { - LOG.debug("A master is now available"); - } - hasActiveServer.set(true); - } else { - // Node is no longer there, cluster does not have an active master - if (LOG.isDebugEnabled()) { - LOG.debug("No master available. Notifying waiting threads"); - } - hasActiveServer.set(false); - // Notify any thread waiting to become the active master - hasActiveServer.notifyAll(); + /** + * Implement the logic of leader election. + */ + private void handleMasterNodeChange() { + try { + synchronized (hasActiveServer) { + if (ZooKeeperUtil.watchAndCheckExists(this, masterZnode)) { + // A master node exists, there is an active master + if (LOG.isDebugEnabled()) { + LOG.debug("A master is now available"); + } + hasActiveServer.set(true); + } else { + // Node is no longer there, cluster does not have an active master + if (LOG.isDebugEnabled()) { + LOG.debug("No master available. Notifying waiting threads"); + } + hasActiveServer.set(false); + // Notify any thread waiting to become the active master + hasActiveServer.notifyAll(); + } + } + } catch (KeeperException ke) { + LOG.error("Received an unexpected KeeperException, aborting", ke); } - } - } catch (KeeperException ke) { - LOG.error("Received an unexpected KeeperException, aborting", ke); } - } - /** - * Implement the logic of server to wait to become active master. - * - * @return false if error to wait to become active master - */ - public boolean blockUntilActive() { - while (true) { - try { - if (ZooKeeperUtil.createEphemeralNodeAndWatch(this, masterZnode, hostPort.getHostPort() - .getBytes())) { + /** + * Implement the logic of server to wait to become active master. + * + * @return false if error to wait to become active master + */ + public boolean blockUntilActive() { + while (true) { + try { + if (ZooKeeperUtil.createEphemeralNodeAndWatch(this, masterZnode, hostPort.getHostPort().getBytes())) { + // We are the master, return + hasActiveServer.set(true); + becomeActiveServer.set(true); + LOG.info("Become active master in " + hostPort.getHostPort()); + return true; + } - // We are the master, return - hasActiveServer.set(true); - becomeActiveServer.set(true); - LOG.info("Become active master in " + hostPort.getHostPort()); - return true; - } + hasActiveServer.set(true); - hasActiveServer.set(true); + // we start the server with the same ip_port stored in master znode, that means we want to + // restart the server? + String msg; + byte[] bytes = ZooKeeperUtil.getDataAndWatch(this, masterZnode); + if (bytes == null) { + msg = ("A master was detected, but went down before its address " + + "could be read. Attempting to become the next active master"); + } else { + if (hostPort.getHostPort().equals(new String(bytes))) { + msg = ("Current master has this master's address, " + hostPort.getHostPort() + + "; master was restarted? Deleting node."); + // Hurry along the expiration of the znode. 
+ ZooKeeperUtil.deleteNode(this, masterZnode); + } else { + msg = "Another master " + new String(bytes) + " is the active master, " + hostPort.getHostPort() + + "; waiting to become the next active master"; + } + } + LOG.info(msg); + } catch (KeeperException ke) { + LOG.error("Received an unexpected KeeperException when block to become active, aborting", ke); + return false; + } - // we start the server with the same ip_port stored in master znode, that means we want to - // restart the server? - String msg; - byte[] bytes = ZooKeeperUtil.getDataAndWatch(this, masterZnode); - if (bytes == null) { - msg = ("A master was detected, but went down before its address " - + "could be read. Attempting to become the next active master"); - } else { - if (hostPort.getHostPort().equals(new String(bytes))) { - msg = ("Current master has this master's address, " + hostPort.getHostPort() + "; master was restarted? Deleting node."); - // Hurry along the expiration of the znode. - ZooKeeperUtil.deleteNode(this, masterZnode); - } else { - msg = "Another master " + new String(bytes) + " is the active master, " - + hostPort.getHostPort() + "; waiting to become the next active master"; - } + synchronized (hasActiveServer) { + while (hasActiveServer.get()) { + try { + hasActiveServer.wait(); + } catch (InterruptedException e) { + // We expect to be interrupted when a master dies, will fall out if so + if (LOG.isDebugEnabled()) { + LOG.debug("Interrupted while waiting to be master"); + } + return false; + } + } + } } - LOG.info(msg); - } catch (KeeperException ke) { - LOG.error("Received an unexpected KeeperException when block to become active, aborting", - ke); - return false; - } + } - synchronized (hasActiveServer) { - while (hasActiveServer.get()) { - try { - hasActiveServer.wait(); - } catch (InterruptedException e) { - // We expect to be interrupted when a master dies, will fall out if so - if (LOG.isDebugEnabled()) { - LOG.debug("Interrupted while waiting to be master"); + /** + * Close the ZooKeeper object. + */ + public void close() { + if (zooKeeper != null) { + try { + connected.set(false); + zooKeeper.close(); + } catch (InterruptedException e) { + LOG.error("Interrupt when closing zookeeper connection", e); } - return false; - } } - } } - } - /** - * Close the ZooKeeper object. - */ - public void close() { - if (zooKeeper != null) { - try { - zooKeeper.close(); - } catch (InterruptedException e) { - LOG.error("Interrupt when closing zookeeper connection", e); - } - } - } + /** + * Wait to init ZooKeeper object, only sleep when it's null. + * + * @param maxWaitMillis the max sleep time + * @throws Exception if ZooKeeper object is still null + */ + public void waitToInitZooKeeper(long maxWaitMillis) throws Exception { + long finished = System.currentTimeMillis() + maxWaitMillis; + while (System.currentTimeMillis() < finished) { + if (this.zooKeeper != null) { + return; + } - /** - * Wait to init ZooKeeper object, only sleep when it's null. 
- * - * @param maxWaitMillis the max sleep time - * @throws Exception if ZooKeeper object is still null - */ - public void waitToInitZooKeeper(long maxWaitMillis) throws Exception { - long finished = System.currentTimeMillis() + maxWaitMillis; - while (System.currentTimeMillis() < finished) { - if (this.zooKeeper != null) { - return; - } - - try { - Thread.sleep(1); - } catch (InterruptedException e) { - throw new Exception(e); - } + try { + Thread.sleep(1); + } catch (InterruptedException e) { + throw new Exception(e); + } + } + throw new Exception(); } - throw new Exception(); - } - public ZooKeeper getZooKeeper() { - return zooKeeper; - } + public ZooKeeper getZooKeeper() throws KeeperException { + return zooKeeper.checkZk(); // return raw zookeeper here + } -} \ No newline at end of file + // we can't know from the zookeeper client whether it's connected or not, so we use a flag to record it + // Another way is reconnect after get expired exception when get/set from zk + public void startReconnectThread() { + // TODO: just create a thread now, if more threads for new tasks, should use thread pool + // becomeActiveServer will be set only in this thread + new Thread(new Runnable() { + @Override + public void run() { + while (true) { + try { + Thread.sleep(2000); + synchronized (connected) { + if (connected.get() == false) { + LOG.info("Try to reconnect ZooKeeper"); + // set connected in event + zooKeeper.reconnectAfterExpiration(); + becomeActiveServer.set(false); + // Won't init znode again to avoid exit in initZnode, it's already created in init. + // If znode is deleted, should restart taskmanager + } + } + synchronized(connected) { + if (connected.get() == true && becomeActiveServer.get() == false) { + LOG.info("Available cxn, try to become active master after reconnecting ZooKeeper"); + if(!blockUntilActive()) { + LOG.warn("block failed, try later"); + } + } + } + } catch (Exception e) { + LOG.error("Error to reconnect ZooKeeper", e); + } + } + } + }).start(); + } +} diff --git a/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/zk/RecoverableZooKeeper.java b/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/zk/RecoverableZooKeeper.java new file mode 100644 index 00000000000..9ff2b9349b4 --- /dev/null +++ b/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/zk/RecoverableZooKeeper.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com._4paradigm.openmldb.taskmanager.zk; + +import java.io.IOException; +import java.lang.management.ManagementFactory; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; +import org.apache.zookeeper.AsyncCallback; +import org.apache.zookeeper.CreateMode; +import org.apache.zookeeper.KeeperException; +import org.apache.zookeeper.Op; +import org.apache.zookeeper.OpResult; +import org.apache.zookeeper.Watcher; +import org.apache.zookeeper.ZooDefs; +import org.apache.zookeeper.ZooKeeper; +import org.apache.zookeeper.ZooKeeper.States; +import org.apache.zookeeper.data.ACL; +import org.apache.zookeeper.data.Stat; +import org.apache.zookeeper.proto.CreateRequest; +import org.apache.zookeeper.proto.SetDataRequest; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.concurrent.ThreadSafe; + +/** + * ref + * https://github.com/apache/hbase/blob/25e9228e2c0a9a752db02e48d55010e0197fd203/hbase-zookeeper/src/main/java/org/apache/hadoop/hbase/zookeeper/RecoverableZooKeeper.java + * It's a thread-safe class. No opentelemetry trace, and no retry mechanism. If + * we need retry, we can include RetryCounter. + */ +@ThreadSafe +public class RecoverableZooKeeper { + private static final Logger LOG = LoggerFactory.getLogger(RecoverableZooKeeper.class); + // the actual ZooKeeper client instance + private ZooKeeper zk; + // private final RetryCounterFactory retryCounterFactory; + // An identifier of this process in the cluster + private final String identifier; + private final byte[] id; + private final Watcher watcher; + private final int sessionTimeout; + private final String quorumServers; + private final int maxMultiSize; // unused now + + @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "DE_MIGHT_IGNORE", justification = "None. Its always been this way.") + public RecoverableZooKeeper(String quorumServers, int sessionTimeout, Watcher watcher) throws IOException { + // TODO: Add support for zk 'chroot'; we don't add it to the quorumServers + // String as we should. + String identifier = null; + if (identifier == null || identifier.length() == 0) { + // the identifier = processID@hostName + identifier = ManagementFactory.getRuntimeMXBean().getName(); + } + LOG.info("Process identifier={} connecting to ZooKeeper ensemble={}", identifier, quorumServers); + this.identifier = identifier; + this.id = identifier.getBytes(StandardCharsets.UTF_8.name()); + + this.watcher = watcher; + this.sessionTimeout = sessionTimeout; + this.quorumServers = quorumServers; + this.maxMultiSize = 1024 * 1024; + + try { + checkZk(); + } catch (Exception x) { + /* ignore */ + } + } + + /** + * Returns the maximum size (in bytes) that should be included in any single + * multi() call. NB: + * This is an approximation, so there may be variance in the msg actually sent + * over the wire. + * Please be sure to set this approximately, with respect to your ZK server + * configuration for + * jute.maxbuffer. + */ + public int getMaxMultiSizeLimit() { + return maxMultiSize; + } + + /** + * Try to create a ZooKeeper connection. Turns any exception encountered into a + * KeeperException.OperationTimeoutException so it can retried. 
+ * + * @return The created ZooKeeper connection object + * @throws KeeperException if a ZooKeeper operation fails + */ + protected synchronized ZooKeeper checkZk() throws KeeperException { + if (this.zk == null) { + try { + this.zk = new ZooKeeper(quorumServers, sessionTimeout, watcher); + } catch (IOException ex) { + LOG.warn("Unable to create ZooKeeper Connection", ex); + throw new KeeperException.OperationTimeoutException(); + } + } + return zk; + } + + public synchronized void reconnectAfterExpiration() throws IOException, KeeperException, InterruptedException { + if (zk != null) { + LOG.info("Closing dead ZooKeeper connection, session" + " was: 0x" + Long.toHexString(zk.getSessionId())); + zk.close(); + // reset the ZooKeeper connection + zk = null; + } + checkZk(); + LOG.info("Recreated a ZooKeeper, session" + " is: 0x" + Long.toHexString(zk.getSessionId())); + } + + public synchronized long getSessionId() { + return zk == null ? -1 : zk.getSessionId(); + } + + public synchronized void close() throws InterruptedException { + if (zk != null) { + zk.close(); + } + } + + public synchronized States getState() { + return zk == null ? null : zk.getState(); + } + + public synchronized ZooKeeper getZooKeeper() { + return zk; + } + + public void sync(String path, AsyncCallback.VoidCallback cb, Object ctx) throws KeeperException { + checkZk().sync(path, cb, ctx); + } + + /** + * Filters the given node list by the given prefixes. This method is + * all-inclusive--if any element + * in the node list starts with any of the given prefixes, then it is included + * in the result. + * + * @param nodes the nodes to filter + * @param prefixes the prefixes to include in the result + * @return list of every element that starts with one of the prefixes + */ + private static List filterByPrefix(List nodes, String... prefixes) { + List lockChildren = new ArrayList<>(); + for (String child : nodes) { + for (String prefix : prefixes) { + if (child.startsWith(prefix)) { + lockChildren.add(child); + break; + } + } + } + return lockChildren; + } + + public String getIdentifier() { + return identifier; + } +} diff --git a/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/zk/ZooKeeperUtil.java b/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/zk/ZooKeeperUtil.java index 406488292a7..c2283629ee7 100644 --- a/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/zk/ZooKeeperUtil.java +++ b/java/openmldb-taskmanager/src/main/java/com/_4paradigm/openmldb/taskmanager/zk/ZooKeeperUtil.java @@ -5,7 +5,7 @@ * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -31,119 +31,114 @@ * Most code is from HBase ZKUtil and ZooKeeperWatcher is replaced with FailoverWatcher. 
*/ public class ZooKeeperUtil { - private static final Logger LOG = LoggerFactory.getLogger(ZooKeeperUtil.class); + private static final Logger LOG = LoggerFactory.getLogger(ZooKeeperUtil.class); - public static boolean watchAndCheckExists(FailoverWatcher failoverWatcher, String znode) - throws KeeperException { - try { - Stat s = failoverWatcher.getZooKeeper().exists(znode, failoverWatcher); - boolean exists = s != null; - if (LOG.isDebugEnabled()) { - if (exists) { - LOG.debug("Set watcher on existing znode " + znode); - } else { - LOG.debug(znode + " does not exist. Watcher is set."); + public static boolean watchAndCheckExists(FailoverWatcher failoverWatcher, String znode) throws KeeperException { + try { + Stat s = failoverWatcher.getZooKeeper().exists(znode, failoverWatcher); + boolean exists = s != null; + if (LOG.isDebugEnabled()) { + if (exists) { + LOG.debug("Set watcher on existing znode " + znode); + } else { + LOG.debug(znode + " does not exist. Watcher is set."); + } + } + return exists; + } catch (KeeperException e) { + LOG.warn("Unable to set watcher on znode " + znode, e); + LOG.warn("Received unexpected KeeperException, re-throwing exception"); + throw e; + } catch (InterruptedException e) { + LOG.warn("Unable to set watcher on znode " + znode, e); + return false; } - } - return exists; - } catch (KeeperException e) { - LOG.warn("Unable to set watcher on znode " + znode, e); - LOG.warn("Received unexpected KeeperException, re-throwing exception"); - throw e; - } catch (InterruptedException e) { - LOG.warn("Unable to set watcher on znode " + znode, e); - return false; } - } - public static boolean createEphemeralNodeAndWatch(FailoverWatcher failoverWatcher, String znode, - byte[] data) throws KeeperException { - try { - LOG.info("Try to create emphemeral znode " + znode); - failoverWatcher.getZooKeeper().create(znode, data, Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL); - } catch (KeeperException.NodeExistsException nee) { - if (!watchAndCheckExists(failoverWatcher, znode)) { - // It did exist but now it doesn't, try again - return createEphemeralNodeAndWatch(failoverWatcher, znode, data); - } - return false; - } catch (InterruptedException e) { - LOG.info("Interrupted", e); - Thread.currentThread().interrupt(); + public static boolean createEphemeralNodeAndWatch(FailoverWatcher failoverWatcher, String znode, byte[] data) + throws KeeperException { + try { + LOG.info("Try to create emphemeral znode " + znode); + failoverWatcher.getZooKeeper().create(znode, data, Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL); + } catch (KeeperException.NodeExistsException nee) { + if (!watchAndCheckExists(failoverWatcher, znode)) { + // It did exist but now it doesn't, try again + return createEphemeralNodeAndWatch(failoverWatcher, znode, data); + } + return false; + } catch (InterruptedException e) { + LOG.info("Interrupted", e); + Thread.currentThread().interrupt(); + } + return true; } - return true; - } - public static void deleteNode(FailoverWatcher failoverWatcher, String node) - throws KeeperException { - deleteNode(failoverWatcher, node, -1); - } + public static void deleteNode(FailoverWatcher failoverWatcher, String node) throws KeeperException { + deleteNode(failoverWatcher, node, -1); + } - public static boolean deleteNode(FailoverWatcher failoverWatcher, String node, int version) - throws KeeperException { - try { - failoverWatcher.getZooKeeper().delete(node, version); - return true; - } catch (KeeperException.BadVersionException bve) { - LOG.debug("Bad version exception when delete 
node '{}'", node, bve); - return false; - } catch (InterruptedException ie) { - LOG.debug("Received InterruptedException, doing nothing here", ie); - return false; + public static boolean deleteNode(FailoverWatcher failoverWatcher, String node, int version) throws KeeperException { + try { + failoverWatcher.getZooKeeper().delete(node, version); + return true; + } catch (KeeperException.BadVersionException bve) { + LOG.debug("Bad version exception when delete node '{}'", node, bve); + return false; + } catch (InterruptedException ie) { + LOG.debug("Received InterruptedException, doing nothing here", ie); + return false; + } } - } - public static byte[] getDataAndWatch(FailoverWatcher failoverWatcher, String znode) { - return getDataInternal(failoverWatcher, znode, null); - } + public static byte[] getDataAndWatch(FailoverWatcher failoverWatcher, String znode) { + return getDataInternal(failoverWatcher, znode, null); + } - @Nullable - private static byte[] getDataInternal(FailoverWatcher failoverWatcher, String znode, Stat stat) { - try { - byte[] data = failoverWatcher.getZooKeeper().getData(znode, failoverWatcher, stat); - if (LOG.isDebugEnabled()) { - LOG.debug("Retrieved " + ((data == null) ? 0 : data.length) - + " byte(s) of data from znode " + znode); - } - return data; - } catch (KeeperException.NoNodeException e) { - if (LOG.isDebugEnabled()) { - LOG.debug("Unable to get data of znode " + znode + " " - + "because node does not exist (not an error)"); - } - return null; - } catch (KeeperException | InterruptedException e) { - LOG.warn("Unable to get data of znode " + znode, e); - LOG.warn("Received unexpected KeeperException, re-throwing exception"); - return null; + @Nullable + private static byte[] getDataInternal(FailoverWatcher failoverWatcher, String znode, Stat stat) { + try { + byte[] data = failoverWatcher.getZooKeeper().getData(znode, failoverWatcher, stat); + if (LOG.isDebugEnabled()) { + LOG.debug("Retrieved " + ((data == null) ? 0 : data.length) + " byte(s) of data from znode " + znode); + } + return data; + } catch (KeeperException.NoNodeException e) { + if (LOG.isDebugEnabled()) { + LOG.debug("Unable to get data of znode " + znode + " " + "because node does not exist (not an error)"); + } + return null; + } catch (KeeperException | InterruptedException e) { + LOG.warn("Unable to get data of znode " + znode, e); + LOG.warn("Received unexpected KeeperException, re-throwing exception"); + return null; + } } - } - public static void createAndFailSilent(FailoverWatcher failoverWatcher, String znode) - throws KeeperException, InterruptedException { - try { - LOG.info("Try to create persistent znode " + znode); - ZooKeeper zk = failoverWatcher.getZooKeeper(); - if (zk.exists(znode, false) == null) { - zk.create(znode, new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); - } - } catch (KeeperException.NodeExistsException ignore) { - //we just ignore result if the node already exist - LOG.info("Znode " + znode + " already exist"); - } catch (KeeperException.NoAuthException nee) { - try { - if (null == failoverWatcher.getZooKeeper().exists(znode, false)) { - // If we failed to create the file and it does not already exist. 
- throw nee; + public static void createAndFailSilent(FailoverWatcher failoverWatcher, String znode) + throws KeeperException, InterruptedException { + try { + LOG.info("Try to create persistent znode " + znode); + ZooKeeper zk = failoverWatcher.getZooKeeper(); + if (zk.exists(znode, false) == null) { + zk.create(znode, new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); + } + } catch (KeeperException.NodeExistsException ignore) { + // we just ignore result if the node already exist + LOG.info("Znode " + znode + " already exist"); + } catch (KeeperException.NoAuthException nee) { + try { + if (null == failoverWatcher.getZooKeeper().exists(znode, false)) { + // If we failed to create the file and it does not already exist. + throw nee; + } + } catch (InterruptedException ie) { + LOG.debug("Received InterruptedException, re-throw the exception", ie); + throw ie; + } + } catch (InterruptedException ie) { + LOG.debug("Received InterruptedException, re-throw the exception", ie); + throw ie; } - } catch (InterruptedException ie) { - LOG.debug("Received InterruptedException, re-throw the exception", ie); - throw ie; - } - } catch (InterruptedException ie) { - LOG.debug("Received InterruptedException, re-throw the exception", ie); - throw ie; } - } } diff --git a/java/openmldb-taskmanager/src/main/scala/com/_4paradigm/openmldb/taskmanager/OpenmldbBatchjobManager.scala b/java/openmldb-taskmanager/src/main/scala/com/_4paradigm/openmldb/taskmanager/OpenmldbBatchjobManager.scala index 62e442f65a6..6d942b1eb9e 100644 --- a/java/openmldb-taskmanager/src/main/scala/com/_4paradigm/openmldb/taskmanager/OpenmldbBatchjobManager.scala +++ b/java/openmldb-taskmanager/src/main/scala/com/_4paradigm/openmldb/taskmanager/OpenmldbBatchjobManager.scala @@ -57,10 +57,10 @@ object OpenmldbBatchjobManager { if (TaskManagerConfig.isK8s) { val args = List(sql) - K8sJobManager.submitSparkJob(jobType, mainClass, args, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, + K8sJobManager.submitSparkJob(jobType, mainClass, args, sql, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, defaultDb) } else { - SparkJobManager.submitSparkJob(jobType, mainClass, args, tempSqlFile.getAbsolutePath, + SparkJobManager.submitSparkJob(jobType, mainClass, args, sql, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, defaultDb, blocking = true) } } @@ -73,11 +73,11 @@ object OpenmldbBatchjobManager { if (TaskManagerConfig.isK8s) { val args = List(sql) - K8sJobManager.submitSparkJob(jobType, mainClass, args, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, + K8sJobManager.submitSparkJob(jobType, mainClass, args, sql, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, defaultDb) } else { val args = List(tempSqlFile.getAbsolutePath) - SparkJobManager.submitSparkJob(jobType, mainClass, args, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, + SparkJobManager.submitSparkJob(jobType, mainClass, args, sql, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, defaultDb) } } @@ -90,11 +90,11 @@ object OpenmldbBatchjobManager { if (TaskManagerConfig.isK8s) { val args = List(sql) - K8sJobManager.submitSparkJob(jobType, mainClass, args, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, + K8sJobManager.submitSparkJob(jobType, mainClass, args, sql, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, defaultDb) } else { val args = List(tempSqlFile.getAbsolutePath) - SparkJobManager.submitSparkJob(jobType, mainClass, args, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, + SparkJobManager.submitSparkJob(jobType, 
mainClass, args, sql, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, defaultDb) } } @@ -107,11 +107,11 @@ object OpenmldbBatchjobManager { if (TaskManagerConfig.isK8s) { val args = List(sql) - K8sJobManager.submitSparkJob(jobType, mainClass, args, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, + K8sJobManager.submitSparkJob(jobType, mainClass, args, sql, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, defaultDb) } else { val args = List(tempSqlFile.getAbsolutePath) - SparkJobManager.submitSparkJob(jobType, mainClass, args, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, + SparkJobManager.submitSparkJob(jobType, mainClass, args, sql, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, defaultDb) } } @@ -124,11 +124,11 @@ object OpenmldbBatchjobManager { if (TaskManagerConfig.isK8s) { val args = List(sql) - K8sJobManager.submitSparkJob(jobType, mainClass, args, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, + K8sJobManager.submitSparkJob(jobType, mainClass, args, sql, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, defaultDb) } else { val args = List(tempSqlFile.getAbsolutePath) - SparkJobManager.submitSparkJob(jobType, mainClass, args, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, + SparkJobManager.submitSparkJob(jobType, mainClass, args, sql, tempSqlFile.getAbsolutePath, sparkConf.asScala.toMap, defaultDb) } } diff --git a/java/openmldb-taskmanager/src/main/scala/com/_4paradigm/openmldb/taskmanager/k8s/K8sJobManager.scala b/java/openmldb-taskmanager/src/main/scala/com/_4paradigm/openmldb/taskmanager/k8s/K8sJobManager.scala index b9985a263b0..0c459969003 100644 --- a/java/openmldb-taskmanager/src/main/scala/com/_4paradigm/openmldb/taskmanager/k8s/K8sJobManager.scala +++ b/java/openmldb-taskmanager/src/main/scala/com/_4paradigm/openmldb/taskmanager/k8s/K8sJobManager.scala @@ -44,12 +44,18 @@ object K8sJobManager { def submitSparkJob(jobType: String, mainClass: String, args: List[String] = List(), + sql: String = "", localSqlFile: String = "", sparkConf: Map[String, String] = Map(), defaultDb: String = "", blocking: Boolean = false): JobInfo = { - val jobInfo = JobInfoManager.createJobInfo(jobType, args, sparkConf) + val jobInfoArgs = if (sql.nonEmpty) { + List(sql) + } else { + args + } + val jobInfo = JobInfoManager.createJobInfo(jobType, jobInfoArgs, sparkConf) val jobName = getK8sJobName(jobInfo.getId) jobInfo.setApplicationId(jobName) diff --git a/java/openmldb-taskmanager/src/main/scala/com/_4paradigm/openmldb/taskmanager/spark/SparkJobManager.scala b/java/openmldb-taskmanager/src/main/scala/com/_4paradigm/openmldb/taskmanager/spark/SparkJobManager.scala index bc8c5dfebbe..44fc619e536 100644 --- a/java/openmldb-taskmanager/src/main/scala/com/_4paradigm/openmldb/taskmanager/spark/SparkJobManager.scala +++ b/java/openmldb-taskmanager/src/main/scala/com/_4paradigm/openmldb/taskmanager/spark/SparkJobManager.scala @@ -77,11 +77,18 @@ object SparkJobManager { def submitSparkJob(jobType: String, mainClass: String, args: List[String] = List(), + sql: String = "", localSqlFile: String = "", sparkConf: Map[String, String] = Map(), defaultDb: String = "", blocking: Boolean = false): JobInfo = { - val jobInfo = JobInfoManager.createJobInfo(jobType, args, sparkConf) + + val jobInfoArgs = if (sql.nonEmpty) { + List(sql) + } else { + args + } + val jobInfo = JobInfoManager.createJobInfo(jobType, jobInfoArgs, sparkConf) // Submit Spark application with SparkLauncher val launcher = createSparkLauncher(mainClass) diff --git 
a/java/openmldb-taskmanager/src/test/scala/com/_4paradigm/openmldb/taskmanager/spark/TestSparkJobManager.scala b/java/openmldb-taskmanager/src/test/scala/com/_4paradigm/openmldb/taskmanager/spark/TestSparkJobManager.scala index 3294b83e7db..902ff4a6af1 100644 --- a/java/openmldb-taskmanager/src/test/scala/com/_4paradigm/openmldb/taskmanager/spark/TestSparkJobManager.scala +++ b/java/openmldb-taskmanager/src/test/scala/com/_4paradigm/openmldb/taskmanager/spark/TestSparkJobManager.scala @@ -36,7 +36,7 @@ class TestSparkJobManager extends FunSuite { val jobType = "DummySparkApp" val sparkConf = Map(SparkLauncher.DRIVER_EXTRA_CLASSPATH -> System.getProperty("java.class.path")) - SparkJobManager.submitSparkJob(jobType, mainClass, List[String](), "", sparkConf) + SparkJobManager.submitSparkJob(jobType, mainClass, List[String](), "", "", sparkConf) JobInfoManager.getAllJobs().map(println) Thread.sleep(5000) diff --git a/python/openmldb_tool/diagnostic_tool/collector.py b/python/openmldb_tool/diagnostic_tool/collector.py index 7e143025e11..41403061610 100644 --- a/python/openmldb_tool/diagnostic_tool/collector.py +++ b/python/openmldb_tool/diagnostic_tool/collector.py @@ -115,7 +115,8 @@ def get_spark_home(server_info: ServerInfo): tm_conf_path = server_info.conf_path_pair("")[0] config_name = "spark.home=" log.debug("get %s from %s", config_name, tm_conf_path) - grep_str, _ = server_info.cmd_on_host(f"grep {config_name} {tm_conf_path}") + # last one option + grep_str, _ = server_info.cmd_on_host(f"grep {config_name} {tm_conf_path} | tail -n 1") if not grep_str: # TODO(hw):no config in file, get env SPARK_HOME? diff --git a/python/openmldb_tool/tests/sbin_test/tmp/openmldb/tablet-1/conf/tablet.flags b/python/openmldb_tool/tests/sbin_test/tmp/openmldb/tablet-1/conf/tablet.flags index 0ad78be8f67..568acd398d0 100644 --- a/python/openmldb_tool/tests/sbin_test/tmp/openmldb/tablet-1/conf/tablet.flags +++ b/python/openmldb_tool/tests/sbin_test/tmp/openmldb/tablet-1/conf/tablet.flags @@ -82,12 +82,12 @@ #--key_entry_max_height=8 # query conf -# max table traverse iteration(full table scan/aggregation),default: 50000 -#--max_traverse_cnt=50000 -# max table traverse pk number(batch query), default: 5000 -#--max_traverse_pk_cnt=5000 -# max result size in byte (default: 2MB) -#--scan_max_bytes_size=2097152 +# max table traverse iteration(full table scan/aggregation),default: 0 +#--max_traverse_cnt=0 +# max table traverse pk number(batch query), default: 0 +#--max_traverse_pk_cnt=0 +# max result size in byte (default: 0 ulimited) +#--scan_max_bytes_size=0 # loadtable #--load_table_batch=30 diff --git a/python/openmldb_tool/tests/sbin_test/tmp/openmldb/tablet-2/conf/tablet.flags b/python/openmldb_tool/tests/sbin_test/tmp/openmldb/tablet-2/conf/tablet.flags index d7866ff32bd..51f629ff2ba 100644 --- a/python/openmldb_tool/tests/sbin_test/tmp/openmldb/tablet-2/conf/tablet.flags +++ b/python/openmldb_tool/tests/sbin_test/tmp/openmldb/tablet-2/conf/tablet.flags @@ -82,12 +82,12 @@ #--key_entry_max_height=8 # query conf -# max table traverse iteration(full table scan/aggregation),default: 50000 -#--max_traverse_cnt=50000 -# max table traverse pk number(batch query), default: 5000 -#--max_traverse_pk_cnt=5000 -# max result size in byte (default: 2MB) -#--scan_max_bytes_size=2097152 +# max table traverse iteration(full table scan/aggregation),default: 0 +#--max_traverse_cnt=0 +# max table traverse pk number(batch query), default: 0 +#--max_traverse_pk_cnt=0 +# max result size in byte (default: 0 ulimited) 
+#--scan_max_bytes_size=0 # loadtable #--load_table_batch=30 diff --git a/release/conf/openmldb-env.sh b/release/conf/openmldb-env.sh index 5ba917c49e7..3b4b83dd5ef 100644 --- a/release/conf/openmldb-env.sh +++ b/release/conf/openmldb-env.sh @@ -2,15 +2,12 @@ export OPENMLDB_VERSION=0.8.4 # openmldb mode: standalone / cluster export OPENMLDB_MODE=${OPENMLDB_MODE:=cluster} -# tablet port -export OPENMLDB_TABLET_PORT=10921 -# nameserver port -export OPENMLDB_NAMESERVER_PORT=7527 -# taskmanager port -export OPENMLDB_TASKMANAGER_PORT=9902 -# apiserver port -export OPENMLDB_APISERVER_PORT=9080 - +# openmldb root path +export OPENMLDB_HOME= +# the root path of openmldb spark release, default is $OPENMLDB_HOME/spark +# if not exists, download from online +export SPARK_HOME= +export RUNNER_EXISTING_SPARK_HOME= # if OPENMLDB_USE_EXISTING_ZK_CLUSTER is set, will use existing zk cluster export OPENMLDB_USE_EXISTING_ZK_CLUSTER=false # the root path of zookeeper release, default is $OPENMLDB_HOME/zookeeper @@ -20,17 +17,26 @@ export OPENMLDB_ZK_HOME= export OPENMLDB_ZK_CLUSTER= # zookeeper root path export OPENMLDB_ZK_ROOT_PATH=/openmldb + +export OPENMLDB_FORCE_LOCAL=false + +export RUNNER_JAVA_HOME= + +# if CLEAR_OPENMLDB_INSTALL_DIR is set, all files in the WORKDIR will be deleted when running sbin/clear-all.sh +export CLEAR_OPENMLDB_INSTALL_DIR=false + +# tablet port +export OPENMLDB_TABLET_PORT=10921 +# nameserver port +export OPENMLDB_NAMESERVER_PORT=7527 +# taskmanager port +export OPENMLDB_TASKMANAGER_PORT=9902 +# apiserver port +export OPENMLDB_APISERVER_PORT=9080 + # zookeeper client port, clientPort=2181 in zoo.cfg export OPENMLDB_ZK_CLUSTER_CLIENT_PORT=2181 # zookeeper peer port, which is the first port in this config server.1=zoo1:2888:3888 in zoo.cfg export OPENMLDB_ZK_CLUSTER_PEER_PORT=2888 # zookeeper election port, which is the second port in this config server.1=zoo1:2888:3888 in zoo.cfg export OPENMLDB_ZK_CLUSTER_ELECTION_PORT=3888 - -# openmldb root path -export OPENMLDB_HOME= -# the root path of openmldb spark release, default is $OPENMLDB_HOME/spark -# if not exists, download from online -export SPARK_HOME= -# if CLEAR_OPENMLDB_INSTALL_DIR is set, all files in the WORKDIR will be deleted when running sbin/clear-all.sh -export CLEAR_OPENMLDB_INSTALL_DIR=false diff --git a/release/conf/tablet.flags.template b/release/conf/tablet.flags.template index d5109a9abaf..d4f615d5c69 100644 --- a/release/conf/tablet.flags.template +++ b/release/conf/tablet.flags.template @@ -88,8 +88,8 @@ #--max_traverse_cnt=0 # max table traverse unique key number(batch query), default: 0 #--max_traverse_key_cnt=0 -# max result size in byte (default: 2MB) -#--scan_max_bytes_size=2097152 +# max result size in byte (default: 0 ulimited) +#--scan_max_bytes_size=0 # loadtable #--load_table_batch=30 diff --git a/release/sbin/deploy-all.sh b/release/sbin/deploy-all.sh index 3a4f101b15b..ddfc7e712cd 100755 --- a/release/sbin/deploy-all.sh +++ b/release/sbin/deploy-all.sh @@ -33,7 +33,7 @@ distribute() { type=$4 fi local use_ssh=true - if [[ $host = "localhost" || $host = "127.0.0.1" ]]; then + if [[ "$OPENMLDB_FORCE_LOCAL" = true || "$host" = "localhost" || "$host" = "127.0.0.1" ]]; then use_ssh=false if [[ "$dest" = "$src" ]]; then echo "skip rsync as dest=src: $dest" @@ -56,7 +56,10 @@ distribute() { else if [[ "$type" = "taskmanager" ]]; then dir_list=(bin sbin conf taskmanager) - if [[ "$use_ssh" = true ]]; then + if [[ -n "$RUNNER_EXISTING_SPARK_HOME" ]]; then + echo "use existing spark 
$RUNNER_EXISTING_SPARK_HOME on $host, skip deploy spark" + elif [[ "$use_ssh" = true ]]; then + run_auto "$host" "mkdir -p ${SPARK_HOME} > /dev/null 2>&1" rsync -arz "${SPARK_HOME}/" "$host:${SPARK_HOME}/" fi else @@ -146,6 +149,10 @@ function download_spark { # deploy taskmanagers downloaded=false +if [[ -n "${RUNNER_EXISTING_SPARK_HOME}" ]]; then + echo "use $RUNNER_EXISTING_SPARK_HOME, skip download openmldbspark" + downloaded=true +fi for line in $(parse_host conf/hosts taskmanager) do if ! $downloaded; then @@ -158,7 +165,7 @@ do echo "deploy taskmanager to $host:$port $dir" distribute "$host" "$dir" "$home" taskmanager - cmd="cd $dir && OPENMLDB_VERSION=${OPENMLDB_VERSION} SPARK_HOME=${SPARK_HOME} OPENMLDB_HOST=$host OPENMLDB_TASKMANAGER_PORT=$port OPENMLDB_ZK_CLUSTER=${OPENMLDB_ZK_CLUSTER} OPENMLDB_ZK_ROOT_PATH=${OPENMLDB_ZK_ROOT_PATH} sbin/deploy.sh taskmanager" + cmd="cd $dir && SPARK_HOME=${SPARK_HOME} OPENMLDB_HOST=$host OPENMLDB_TASKMANAGER_PORT=$port OPENMLDB_ZK_CLUSTER=${OPENMLDB_ZK_CLUSTER} OPENMLDB_ZK_ROOT_PATH=${OPENMLDB_ZK_ROOT_PATH} sbin/deploy.sh taskmanager" run_auto "$host" "$cmd" done diff --git a/release/sbin/init.sh b/release/sbin/init.sh index b73ab226b81..1b20442bb48 100755 --- a/release/sbin/init.sh +++ b/release/sbin/init.sh @@ -90,7 +90,7 @@ function parse_host { run_auto() { local host=$1 local cmd=$2 - if [[ $host = "localhost" || $host = "127.0.0.1" ]]; then + if [[ "$OPENMLDB_FORCE_LOCAL" = true || "$host" = "localhost" || "$host" = "127.0.0.1" ]]; then local cur_dir cur_dir=$(pwd) bash -c "$cmd" @@ -105,7 +105,11 @@ if [ -z "${OPENMLDB_HOME}" ]; then export OPENMLDB_HOME fi -if [ -z "${SPARK_HOME}" ]; then +if [ -n "$RUNNER_EXISTING_SPARK_HOME" ]; then + echo "use existing spark $RUNNER_EXISTING_SPARK_HOME on runner, overwrite SPARK_HOME" + SPARK_HOME="$RUNNER_EXISTING_SPARK_HOME" + export SPARK_HOME +elif [ -z "${SPARK_HOME}" ]; then SPARK_HOME=${OPENMLDB_HOME}/spark export SPARK_HOME fi diff --git a/release/sbin/openmldb-cli.sh b/release/sbin/openmldb-cli.sh index 2102990164a..19bd5160a7b 100755 --- a/release/sbin/openmldb-cli.sh +++ b/release/sbin/openmldb-cli.sh @@ -20,7 +20,7 @@ sbin="$(cd "$(dirname "$0")" || exit 1; pwd)" . "$home"/conf/openmldb-env.sh . 
"$sbin"/init.sh cd "$home" || exit 1 - +echo "${OPENMLDB_MODE} ${OPENMLDB_ZK_CLUSTER} ${OPENMLDB_ZK_ROOT_PATH}" if [[ -n "$OPENMLDB_MODE" && "$OPENMLDB_MODE" = "cluster" ]]; then bin/openmldb --zk_cluster="${OPENMLDB_ZK_CLUSTER}" --zk_root_path="${OPENMLDB_ZK_ROOT_PATH}" --role=sql_client "$@" else diff --git a/release/sbin/start-taskmanagers.sh b/release/sbin/start-taskmanagers.sh index b6873c33089..322824dfbbf 100755 --- a/release/sbin/start-taskmanagers.sh +++ b/release/sbin/start-taskmanagers.sh @@ -38,11 +38,13 @@ else echo "start taskmanager in $dir with endpoint $host:$port " cmd="cd $dir && SPARK_HOME=${SPARK_HOME} bin/start.sh start taskmanager $*" - run_auto "$host" "$cmd" - - # Print the log of taskmanager if fail - #cmd="cd $dir && cat taskmanager/bin/logs/taskmanager.log" - #run_auto "$host" "$cmd" + # special for java + pre="" + if [[ -n $RUNNER_JAVA_HOME ]]; then + echo "overwrite java env by RUNNER_JAVA_HOME:$RUNNER_JAVA_HOME" + pre="export JAVA_HOME=$RUNNER_JAVA_HOME && export PATH=$JAVA_HOME/bin:$PATH &&" + fi + run_auto "$host" "$pre $cmd" done IFS="$old_IFS" fi diff --git a/release/sbin/start-zks.sh b/release/sbin/start-zks.sh index c13b762be90..775d52715ac 100755 --- a/release/sbin/start-zks.sh +++ b/release/sbin/start-zks.sh @@ -33,6 +33,12 @@ do echo "start zookeeper in $dir with endpoint $host:$port " cmd="cd $dir && bin/zkServer.sh start" - run_auto "$host" "$cmd" + # special for java + pre="" + if [[ -n $RUNNER_JAVA_HOME ]]; then + echo "overwrite java env by RUNNER_JAVA_HOME:$RUNNER_JAVA_HOME" + pre="export JAVA_HOME=$RUNNER_JAVA_HOME && export PATH=$JAVA_HOME/bin:$PATH &&" + fi + run_auto "$host" "$pre $cmd" done -IFS="$old_IFS" \ No newline at end of file +IFS="$old_IFS" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index aa341bea32d..d24b41fef9f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -60,7 +60,7 @@ endfunction(compile_lib) set(TEST_LIBS openmldb_test_base apiserver nameserver tablet query_response_time openmldb_sdk - openmldb_catalog schema client zk_client storage replica base openmldb_codec openmldb_proto log + openmldb_catalog client zk_client storage schema replica base openmldb_codec openmldb_proto log common zookeeper_mt tcmalloc_minimal ${RocksDB_LIB} ${VM_LIBS} ${LLVM_LIBS} ${ZETASQL_LIBS} ${BRPC_LIBS}) if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9.1") # GNU implementation prior to 9.1 requires linking with -lstdc++fs diff --git a/src/apiserver/api_server_impl.h b/src/apiserver/api_server_impl.h index ee41e34935b..fc8e8022417 100644 --- a/src/apiserver/api_server_impl.h +++ b/src/apiserver/api_server_impl.h @@ -27,15 +27,13 @@ #include "absl/status/status.h" #include "apiserver/interface_provider.h" #include "apiserver/json_helper.h" -#include "rapidjson/document.h" // raw rapidjson 1.1.0, not in butil +#include "bvar/bvar.h" +#include "bvar/multi_dimension.h" // latency recorder #include "proto/api_server.pb.h" +#include "rapidjson/document.h" // raw rapidjson 1.1.0, not in butil #include "sdk/sql_cluster_router.h" #include "sdk/sql_request_row.h" -#include "absl/status/status.h" -#include "bvar/bvar.h" -#include "bvar/multi_dimension.h" // latency recorder - namespace openmldb { namespace apiserver { diff --git a/src/base/ddl_parser.cc b/src/base/ddl_parser.cc index 2af468bea95..4f3d631a1fc 100644 --- a/src/base/ddl_parser.cc +++ b/src/base/ddl_parser.cc @@ -443,13 +443,13 @@ void IndexMapBuilder::Report(absl::string_view db, absl::string_view table, absl // we encode table, keys and 
ts to one string auto index = Encode(db, table, keys, ts); if (index.empty()) { - LOG(WARNING) << "index encode failed for table " << table; + LOG(WARNING) << "index encode failed for table " << db << "." << table; return; } if (index_map_.find(index) != index_map_.end()) { // index id has unique idx, can't be dup. It's a weird case - LOG(DFATAL) << "index " << index << " existed in cache"; + LOG(DFATAL) << db << "." << table << " index " << index << " existed in cache"; return; } diff --git a/src/base/status.h b/src/base/status.h index 5995138edd6..8ac134b18bd 100644 --- a/src/base/status.h +++ b/src/base/status.h @@ -96,6 +96,7 @@ enum ReturnCode { kInvalidArgs = 161, kCheckIndexFailed = 162, kCatalogUpdateFailed = 163, + kExceedPutMemoryLimit = 164, kNameserverIsNotLeader = 300, kAutoFailoverIsEnabled = 301, kEndpointIsNotExist = 302, diff --git a/src/base/sys_info.h b/src/base/sys_info.h new file mode 100644 index 00000000000..4b61b5e22d4 --- /dev/null +++ b/src/base/sys_info.h @@ -0,0 +1,107 @@ +/* + * Copyright 2021 4Paradigm + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SRC_BASE_SYS_INFO_H_ +#define SRC_BASE_SYS_INFO_H_ + +#include "absl/strings/ascii.h" +#include "absl/strings/match.h" +#include "absl/strings/numbers.h" +#include "absl/strings/string_view.h" +#include "base/status.h" + +namespace openmldb::base { + +constexpr const char* MEM_TOTAL = "MemTotal"; +constexpr const char* MEM_BUFFERS = "Buffers"; +constexpr const char* MEM_CACHED = "Cached"; +constexpr const char* MEM_FREE = "MemFree"; +constexpr const char* SRECLAIMABLE = "SReclaimable"; + +/* We calculate MemAvailable as follows + * MemAvailable = MemFree + Buffers + Cached + SReclaimable + * refer https://www.kernel.org/doc/Documentation/filesystems/proc.txt + * */ + +struct SysInfo { + uint64_t mem_total = 0; // unit is kB + uint64_t mem_used = 0; // unit is kB + uint64_t mem_free = 0; // unit is kB + uint64_t mem_buffers = 0; // unit is kB + uint64_t mem_cached = 0; // unit is kB +}; + +base::Status GetSysMem(SysInfo* info) { +#if defined(__linux__) + FILE *fd = fopen("/proc/meminfo", "r"); + if (fd == nullptr) { + return {ReturnCode::kError, "fail to open meminfo file"}; + } + char line[256]; + auto parse = [](absl::string_view str, absl::string_view key, uint64_t* val) -> base::Status { + str.remove_prefix(key.size() + 1); + str.remove_suffix(2); + str = absl::StripAsciiWhitespace(str); + if (!absl::SimpleAtoi(str, val)) { + return {ReturnCode::kError, absl::StrCat("fail to parse ", key)}; + } + return {}; + }; + int parse_cnt = 0; + uint64_t s_reclaimable = 0; + while (fgets(line, sizeof(line), fd)) { + absl::string_view str_view(line); + str_view = absl::StripAsciiWhitespace(str_view); + if (absl::StartsWith(str_view, MEM_TOTAL)) { + if (auto status = parse(str_view, MEM_TOTAL, &info->mem_total); !status.OK()) { + return status; + } + parse_cnt++; + } else if (absl::StartsWith(str_view, MEM_BUFFERS)) { + if (auto status = parse(str_view, MEM_BUFFERS, &info->mem_buffers); !status.OK()) 
{ + return status; + } + parse_cnt++; + } else if (absl::StartsWith(str_view, MEM_CACHED)) { + if (auto status = parse(str_view, MEM_CACHED, &info->mem_cached); !status.OK()) { + return status; + } + parse_cnt++; + } else if (absl::StartsWith(str_view, MEM_FREE)) { + if (auto status = parse(str_view, MEM_FREE, &info->mem_free); !status.OK()) { + return status; + } + parse_cnt++; + } else if (absl::StartsWith(str_view, SRECLAIMABLE)) { + if (auto status = parse(str_view, SRECLAIMABLE, &s_reclaimable); !status.OK()) { + return status; + } + parse_cnt++; + } + } + if (parse_cnt != 5) { + return {ReturnCode::kError, "fail to parse meminfo"}; + } + info->mem_cached += s_reclaimable; + info->mem_used = info->mem_total - info->mem_buffers - info->mem_cached - info->mem_free; + fclose(fd); +#endif + return {}; +} + +} // namespace openmldb::base + +#endif // SRC_BASE_SYS_INFO_H_ diff --git a/src/base/sys_info_test.cc b/src/base/sys_info_test.cc new file mode 100644 index 00000000000..4d5f5cc03c8 --- /dev/null +++ b/src/base/sys_info_test.cc @@ -0,0 +1,50 @@ +/* + * Copyright 2021 4Paradigm + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "gtest/gtest.h" +#include "base/sys_info.h" + +namespace openmldb { +namespace base { + +class SystemInfoTest : public ::testing::Test { + public: + SystemInfoTest() {} + ~SystemInfoTest() {} +}; + +TEST_F(SystemInfoTest, GetMemory) { + base::SysInfo info; + auto status = base::GetSysMem(&info); + ASSERT_TRUE(status.OK()); + ASSERT_GT(info.mem_total, 0); + ASSERT_GT(info.mem_used, 0); + ASSERT_GT(info.mem_free, 0); + ASSERT_EQ(info.mem_total, info.mem_used + info.mem_buffers + info.mem_free + info.mem_cached); + /*printf("total:%lu\n", info.mem_total); + printf("used:%lu\n", info.mem_used); + printf("free:%lu\n", info.mem_free); + printf("buffers:%lu\n", info.mem_buffers); + printf("cached:%lu\n", info.mem_cached);*/ +} + +} // namespace base +} // namespace openmldb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/client/tablet_client.cc b/src/client/tablet_client.cc index 878d2a5f3cc..f445cc1791c 100644 --- a/src/client/tablet_client.cc +++ b/src/client/tablet_client.cc @@ -201,36 +201,46 @@ bool TabletClient::UpdateTableMetaForAddField(uint32_t tid, const std::vector>& dimensions) { +base::Status TabletClient::Put(uint32_t tid, uint32_t pid, uint64_t time, const std::string& value, + const std::vector>& dimensions, + int memory_usage_limit, bool put_if_absent) { + ::google::protobuf::RepeatedPtrField<::openmldb::api::Dimension> pb_dimensions; for (size_t i = 0; i < dimensions.size(); i++) { ::openmldb::api::Dimension* d = pb_dimensions.Add(); d->set_key(dimensions[i].first); d->set_idx(dimensions[i].second); } - return Put(tid, pid, time, base::Slice(value), &pb_dimensions); + + return Put(tid, pid, time, base::Slice(value), &pb_dimensions, memory_usage_limit, put_if_absent); } -bool TabletClient::Put(uint32_t tid, uint32_t pid, uint64_t time, const base::Slice& 
value, - ::google::protobuf::RepeatedPtrField<::openmldb::api::Dimension>* dimensions) { +base::Status TabletClient::Put(uint32_t tid, uint32_t pid, uint64_t time, const base::Slice& value, + ::google::protobuf::RepeatedPtrField<::openmldb::api::Dimension>* dimensions, + int memory_usage_limit, bool put_if_absent) { ::openmldb::api::PutRequest request; + if (memory_usage_limit < 0 || memory_usage_limit > 100) { + return {base::ReturnCode::kError, absl::StrCat("invalid memory_usage_limit ", memory_usage_limit)}; + } else if (memory_usage_limit > 0) { + request.set_memory_limit(memory_usage_limit); + } request.set_time(time); request.set_value(value.data(), value.size()); request.set_tid(tid); request.set_pid(pid); request.mutable_dimensions()->Swap(dimensions); + request.set_put_if_absent(put_if_absent); ::openmldb::api::PutResponse response; - bool ok = - client_.SendRequest(&::openmldb::api::TabletServer_Stub::Put, &request, &response, FLAGS_request_timeout_ms, 1); - if (ok && response.code() == 0) { - return true; + auto st = client_.SendRequestSt(&::openmldb::api::TabletServer_Stub::Put, + &request, &response, FLAGS_request_timeout_ms, 1); + if (!st.OK()) { + return st; } - LOG(WARNING) << "fail to send write request for " << response.msg() << " and error code " << response.code(); - return false; + return {response.code(), response.msg()}; } -bool TabletClient::Put(uint32_t tid, uint32_t pid, const std::string& pk, uint64_t time, const std::string& value) { +base::Status TabletClient::Put(uint32_t tid, uint32_t pid, const std::string& pk, uint64_t time, + const std::string& value) { ::openmldb::api::PutRequest request; auto dim = request.add_dimensions(); dim->set_key(pk); @@ -240,14 +250,12 @@ bool TabletClient::Put(uint32_t tid, uint32_t pid, const std::string& pk, uint64 request.set_tid(tid); request.set_pid(pid); ::openmldb::api::PutResponse response; - - bool ok = - client_.SendRequest(&::openmldb::api::TabletServer_Stub::Put, &request, &response, FLAGS_request_timeout_ms, 1); - if (ok && response.code() == 0) { - return true; + auto st = client_.SendRequestSt(&::openmldb::api::TabletServer_Stub::Put, + &request, &response, FLAGS_request_timeout_ms, 1); + if (!st.OK()) { + return st; } - LOG(WARNING) << "fail to put for error " << response.msg(); - return false; + return {response.code(), response.msg()}; } bool TabletClient::MakeSnapshot(uint32_t tid, uint32_t pid, uint64_t offset, std::shared_ptr task_info) { diff --git a/src/client/tablet_client.h b/src/client/tablet_client.h index 9fee8e08392..19579f90c5c 100644 --- a/src/client/tablet_client.h +++ b/src/client/tablet_client.h @@ -72,23 +72,22 @@ class TabletClient : public Client { std::shared_ptr<::openmldb::sdk::SQLRequestRowBatch>, brpc::Controller* cntl, ::openmldb::api::SQLBatchRequestQueryResponse* response, const bool is_debug = false); - bool Put(uint32_t tid, uint32_t pid, const std::string& pk, uint64_t time, const std::string& value); + base::Status Put(uint32_t tid, uint32_t pid, const std::string& pk, uint64_t time, const std::string& value); - bool Put(uint32_t tid, uint32_t pid, uint64_t time, const std::string& value, - const std::vector>& dimensions); + base::Status Put(uint32_t tid, uint32_t pid, uint64_t time, const std::string& value, + const std::vector>& dimensions, + int memory_usage_limit = 0, bool put_if_absent = false); - bool Put(uint32_t tid, uint32_t pid, uint64_t time, const base::Slice& value, - ::google::protobuf::RepeatedPtrField<::openmldb::api::Dimension>* dimensions); + base::Status 
Put(uint32_t tid, uint32_t pid, uint64_t time, const base::Slice& value, + ::google::protobuf::RepeatedPtrField<::openmldb::api::Dimension>* dimensions, + int memory_usage_limit = 0, bool put_if_absent = false); bool Get(uint32_t tid, uint32_t pid, const std::string& pk, uint64_t time, std::string& value, // NOLINT uint64_t& ts, // NOLINT - std::string& msg); - ; // NOLINT + std::string& msg); // NOLINT bool Get(uint32_t tid, uint32_t pid, const std::string& pk, uint64_t time, const std::string& idx_name, - std::string& value, // NOLINT - uint64_t& ts, // NOLINT - std::string& msg); // NOLINT + std::string& value, uint64_t& ts, std::string& msg); // NOLINT bool Delete(uint32_t tid, uint32_t pid, const std::string& pk, const std::string& idx_name, std::string& msg); // NOLINT diff --git a/src/cmd/display.h b/src/cmd/display.h index 714a9ca6a73..34e1f851e39 100644 --- a/src/cmd/display.h +++ b/src/cmd/display.h @@ -586,6 +586,21 @@ __attribute__((unused)) static void PrintProcedureInfo( sql = boost::regex_replace(sql, boost::regex(pattern_sp), "DEPLOY"); std::string pattern_blank = "(.*)(\\(.*\\) )(BEGIN )(.*)( END;)"; sql = boost::regex_replace(sql, boost::regex(pattern_blank), "$1$4"); + if (!sp_info.GetOption()->empty()) { + std::stringstream ss; + ss << " OPTIONS("; + for (auto iter = sp_info.GetOption()->begin(); iter != sp_info.GetOption()->end(); iter++) { + if (iter != sp_info.GetOption()->begin()) { + ss << ", "; + } + ss << absl::AsciiStrToUpper(iter->first) << "=\"" << iter->second << "\""; + } + ss << ")"; + std::string prefix = absl::StrCat("DEPLOY ", sp_info.GetSpName()); + absl::string_view old_sql = sql; + old_sql.remove_prefix(prefix.size()); + sql = absl::StrCat(prefix, ss.str(), old_sql); + } } PrintItemTable({"DB", type_name}, {vec}, stream); diff --git a/src/cmd/openmldb.cc b/src/cmd/openmldb.cc index b4d12210cdf..53da31ad634 100644 --- a/src/cmd/openmldb.cc +++ b/src/cmd/openmldb.cc @@ -304,7 +304,7 @@ int PutData(uint32_t tid, const std::mapPut(tid, pid, ts, value, iter->second)) { + if (!clients[endpoint]->Put(tid, pid, ts, value, iter->second).OK()) { printf("put failed. tid %u pid %u endpoint %s ts %lu \n", tid, pid, endpoint.c_str(), ts); return -1; } @@ -438,7 +438,7 @@ std::shared_ptr<::openmldb::client::TabletClient> GetTabletClient(const ::openml void HandleNSClientSetTTL(const std::vector& parts, ::openmldb::client::NsClient* client) { if (parts.size() < 4) { - std::cout << "bad setttl format, eg settl t1 absolute 10" << std::endl; + std::cout << "bad setttl format, eg settl t1 absolute 10 [index0]" << std::endl; return; } std::string index_name; @@ -1307,14 +1307,14 @@ void HandleNSGet(const std::vector& parts, ::openmldb::client::NsCl if (parts.size() < 4) { std::cout << "get format error. eg: get table_name key ts | get " "table_name key idx_name ts | get table_name=xxx key=xxx " - "index_name=xxx ts=xxx ts_name=xxx " + "index_name=xxx ts=xxx" << std::endl; return; } std::map parameter_map; if (!GetParameterMap("table_name", parts, "=", parameter_map)) { std::cout << "get format error. 
eg: get table_name=xxx key=xxx " - "index_name=xxx ts=xxx ts_name=xxx " + "index_name=xxx ts=xxx" << std::endl; return; } @@ -1382,7 +1382,7 @@ void HandleNSGet(const std::vector& parts, ::openmldb::client::NsCl return; } ::openmldb::codec::SDKCodec codec(tables[0]); - bool no_schema = tables[0].column_desc_size() == 0 && tables[0].column_desc_size() == 0; + bool no_schema = tables[0].column_desc_size() == 0; if (no_schema) { std::string value; uint64_t ts = 0; @@ -2459,7 +2459,7 @@ void HandleNSClientHelp(const std::vector& parts, ::openmldb::clien printf("ex:man create\n"); } else if (parts[1] == "setttl") { printf("desc: set table ttl \n"); - printf("usage: setttl table_name ttl_type ttl [ts_name]\n"); + printf("usage: setttl table_name ttl_type ttl [index_name], abs ttl unit is minute\n"); printf("ex: setttl t1 absolute 10\n"); printf("ex: setttl t2 latest 5\n"); printf("ex: setttl t3 latest 5 ts1\n"); diff --git a/src/datacollector/data_collector.cc b/src/datacollector/data_collector.cc index 1af941226cf..1966f37c052 100644 --- a/src/datacollector/data_collector.cc +++ b/src/datacollector/data_collector.cc @@ -57,7 +57,7 @@ namespace openmldb::datacollector { std::string LogPartsToString(replica::LogParts* log_parts) { std::stringstream ss; ss << "["; - auto it = log_parts->NewIterator(); + std::unique_ptr it(log_parts->NewIterator()); it->SeekToFirst(); while (it->Valid()) { ss << "(" << it->GetKey() << ", " << it->GetValue() << "),"; @@ -259,7 +259,8 @@ void DataCollectorImpl::CreateTaskEnv(const datasync::AddSyncTaskRequest* reques auto tablet_client = tablet_client_map_[tablet_endpoint]; api::TableStatus table_status; if (auto st = tablet_client->GetTableStatus(tid, pid, table_status); !st.OK()) { - SET_RESP_AND_WARN(response, -1, "get table status from tablet server failed, maybe table doesn't exist: " + st.GetMsg()); + SET_RESP_AND_WARN(response, -1, "get table status from tablet server failed, maybe table doesn't exist: " + + st.GetMsg()); return; } if (!ValidateTableStatus(table_status)) { diff --git a/src/datacollector/data_collector_test.cc b/src/datacollector/data_collector_test.cc index d08cd4b71b8..cbd11b41fdd 100644 --- a/src/datacollector/data_collector_test.cc +++ b/src/datacollector/data_collector_test.cc @@ -14,12 +14,12 @@ * limitations under the License. */ -#include "datacollector/data_collector.h" +#include +#include "codec/sdk_codec.h" +#include "datacollector/data_collector.h" #include "gflags/gflags.h" #include "gtest/gtest.h" - -#include "codec/sdk_codec.h" #include "vm/engine.h" using openmldb::client::TabletClient; diff --git a/src/flags.cc b/src/flags.cc index 42e085781eb..b16af056095 100644 --- a/src/flags.cc +++ b/src/flags.cc @@ -34,6 +34,7 @@ DEFINE_string(zk_auth_schema, "digest", "config the id of authentication schema" DEFINE_string(zk_cert, "", "config the application credentials"); DEFINE_string(tablet, "", "config the endpoint of tablet"); DEFINE_string(nameserver, "", "config the endpoint of nameserver"); +DEFINE_int32(get_sys_mem_interval, 10000, "config the interval of get system memory. unit is milliseconds"); DEFINE_int32(zk_keep_alive_check_interval, 15000, "config the interval of keep alive check. 
unit is milliseconds"); DEFINE_uint32(zk_log_level, 0, "CLI: set level integer, DISABLE_LOGGING=0, " @@ -76,7 +77,8 @@ DEFINE_bool(enable_localtablet, true, "enable or disable local tablet opt when d DEFINE_string(bucket_size, "1d", "the default bucket size in pre-aggr table"); // scan configuration -DEFINE_uint32(scan_max_bytes_size, 2 * 1024 * 1024, "config the max size of scan bytes size"); +// max bytes size: write all even if scan result is too large, let it fail in client(receiver) +DEFINE_uint32(scan_max_bytes_size, 0, "config the max size of scan bytes size, 0 means unlimit"); DEFINE_uint32(scan_reserve_size, 1024, "config the size of vec reserve"); DEFINE_uint32(preview_limit_max_num, 1000, "config the max num of preview limit"); DEFINE_uint32(preview_default_limit, 100, "config the default limit of preview"); diff --git a/src/log/log_reader.cc b/src/log/log_reader.cc index e012d5680c3..e6752c5c52c 100644 --- a/src/log/log_reader.cc +++ b/src/log/log_reader.cc @@ -438,7 +438,7 @@ LogReader::LogReader(LogParts* logs, const std::string& log_path, bool compresse start_offset_ = 0; compressed_ = compressed; - auto it = logs_->NewIterator(); + std::unique_ptr it(logs_->NewIterator()); it->SeekToLast(); if (it->Valid()) { min_offset_ = it->GetValue(); diff --git a/src/nameserver/name_server_impl.cc b/src/nameserver/name_server_impl.cc index d9ce3aff439..d5cda3ae537 100644 --- a/src/nameserver/name_server_impl.cc +++ b/src/nameserver/name_server_impl.cc @@ -8521,7 +8521,7 @@ void NameServerImpl::AddIndex(RpcController* controller, const AddIndexRequest* for (const auto& column_key : column_key_vec) { if (schema::IndexUtil::IsExist(column_key, table_info->column_key())) { base::SetResponseStatus(ReturnCode::kIndexAlreadyExists, "index has already exist!", response); - LOG(WARNING) << "index" << column_key.index_name() << " has already exist! table " << name; + LOG(WARNING) << "index " << column_key.index_name() << " has already exist! table " << name; return; } } @@ -8640,7 +8640,10 @@ void NameServerImpl::AddIndex(RpcController* controller, const AddIndexRequest* } } } - AddIndexToTableInfo(name, db, column_key_vec, nullptr); + // no rollback now + if (!AddIndexToTableInfo(name, db, column_key_vec, nullptr)) { + base::SetResponseStatus(ReturnCode::kAddIndexFailed, "add to table info failed", response); + } } base::SetResponseOK(response); LOG(INFO) << "add index. table[" << name << "] index count[" << column_key_vec.size() << "]"; @@ -9362,14 +9365,9 @@ base::Status NameServerImpl::CreateProcedureOnTablet(const ::openmldb::api::Crea for (auto tb_client : tb_client_vec) { auto status = tb_client->CreateProcedure(sp_request); if (!status.OK()) { - std::string err_msg; - char temp_msg[100]; - snprintf(temp_msg, sizeof(temp_msg), - "create procedure on tablet failed. db_name[%s], sp_name[%s], endpoint[%s]. ", - sp_info.db_name().c_str(), sp_info.sp_name().c_str(), tb_client->GetEndpoint().c_str()); - absl::StrAppend(&err_msg, temp_msg, "msg: ", status.GetMsg()); - LOG(WARNING) << err_msg; - return {base::ReturnCode::kCreateProcedureFailedOnTablet, err_msg}; + return {base::ReturnCode::kCreateProcedureFailedOnTablet, + absl::StrCat("create procedure on tablet failed, sp ", sp_info.db_name(), ".", sp_info.sp_name(), + ", endpoint: ", tb_client->GetEndpoint(), ", msg: ", status.GetMsg())}; } DLOG(INFO) << "create procedure on tablet success. 
db_name: " << sp_info.db_name() << ", " << "sp_name: " << sp_info.sp_name() << ", " @@ -9908,7 +9906,7 @@ base::Status NameServerImpl::InitGlobalVarTable() { uint64_t cur_ts = ::baidu::common::timer::get_micros() / 1000; std::string endpoint = table_info->table_partition(0).partition_meta(meta_idx).endpoint(); auto table_ptr = GetTablet(endpoint); - if (!table_ptr->client_->Put(tid, pid, cur_ts, row, dimensions)) { + if (!table_ptr->client_->Put(tid, pid, cur_ts, row, dimensions).OK()) { return {ReturnCode::kPutFailed, "fail to make a put request to table"}; } break; diff --git a/src/proto/name_server.proto b/src/proto/name_server.proto index 219b83a0b73..e605692d3fe 100755 --- a/src/proto/name_server.proto +++ b/src/proto/name_server.proto @@ -98,7 +98,7 @@ message TableInfo { repeated openmldb.common.ColumnDesc column_desc = 9; repeated openmldb.common.ColumnKey column_key = 10; repeated openmldb.common.ColumnDesc added_column_desc = 11; - optional uint32 format_version = 12 [default = 1]; + optional uint32 format_version = 12 [default = 1, deprecated = true]; optional string db = 13 [default = ""]; repeated string partition_key = 14; repeated common.VersionPair schema_versions = 15; diff --git a/src/proto/tablet.proto b/src/proto/tablet.proto index ee0ec5beae1..a18714b2ae1 100755 --- a/src/proto/tablet.proto +++ b/src/proto/tablet.proto @@ -193,7 +193,9 @@ message PutRequest { optional uint32 pid = 5; repeated Dimension dimensions = 6; repeated TSDimension ts_dimensions = 7 [deprecated = true]; - optional uint32 format_version = 8 [default = 0]; + optional uint32 format_version = 8 [default = 0, deprecated = true]; + optional uint32 memory_limit = 9; + optional bool put_if_absent = 10 [default = false]; } message PutResponse { @@ -324,7 +326,7 @@ message TableMeta { repeated openmldb.common.ColumnKey column_key = 11; repeated openmldb.common.ColumnDesc added_column_desc = 12; // format_version 0 , the legacy format 1 ,the new one - optional uint32 format_version = 13 [default = 0]; + optional uint32 format_version = 13 [default = 0, deprecated = true]; optional string db = 14 [default = ""]; repeated common.VersionPair schema_versions = 15; repeated common.TablePartition table_partition = 16; diff --git a/src/replica/snapshot_replica_test.cc b/src/replica/snapshot_replica_test.cc index 05e9a9d01da..64b8c2565ff 100644 --- a/src/replica/snapshot_replica_test.cc +++ b/src/replica/snapshot_replica_test.cc @@ -130,7 +130,7 @@ TEST_P(SnapshotReplicaTest, LeaderAndFollower) { ASSERT_TRUE(status.OK()); uint64_t cur_time = ::baidu::common::timer::get_micros() / 1000; auto ret = client.Put(tid, pid, "testkey", cur_time, ::openmldb::test::EncodeKV("testkey", "value1")); - ASSERT_TRUE(ret); + ASSERT_TRUE(ret.OK()); uint32_t count = 0; while (count < 10) { @@ -185,7 +185,7 @@ TEST_P(SnapshotReplicaTest, LeaderAndFollower) { ASSERT_EQ(0, srp.code()); ret = client.Put(tid, pid, "newkey", cur_time, ::openmldb::test::EncodeKV("newkey", "value2")); - ASSERT_TRUE(ret); + ASSERT_TRUE(ret.OK()); sleep(2); sr.set_pk("newkey"); tablet1->Scan(NULL, &sr, &srp, &closure); @@ -240,7 +240,7 @@ TEST_P(SnapshotReplicaTest, SendSnapshot) { ASSERT_TRUE(status.OK()); uint64_t cur_time = ::baidu::common::timer::get_micros() / 1000; auto ret = client.Put(tid, pid, "testkey", cur_time, ::openmldb::test::EncodeKV("testkey", "value1")); - ASSERT_TRUE(ret); + ASSERT_TRUE(ret.OK()); uint32_t count = 0; while (count < 10) { @@ -351,7 +351,7 @@ TEST_P(SnapshotReplicaTest, IncompleteSnapshot) { 16, 0, 
::openmldb::type::CompressType::kNoCompress, storage_mode)); ASSERT_TRUE(status.OK()); auto ret = client.Put(tid, pid, "testkey", cur_time, ::openmldb::test::EncodeKV("testkey", "value1")); - ASSERT_TRUE(ret); + ASSERT_TRUE(ret.OK()); uint32_t count = 0; while (count < 10) { @@ -420,7 +420,7 @@ TEST_P(SnapshotReplicaTest, IncompleteSnapshot) { ASSERT_EQ(0, srp.code()); std::string key = "test2"; - ASSERT_TRUE(client.Put(tid, pid, key, cur_time, ::openmldb::test::EncodeKV(key, key))); + ASSERT_TRUE(client.Put(tid, pid, key, cur_time, ::openmldb::test::EncodeKV(key, key)).OK()); sr.set_tid(tid); sr.set_pid(pid); @@ -583,7 +583,7 @@ TEST_P(SnapshotReplicaTest, LeaderAndFollowerTS) { std::vector row = {"card0", "mcc0", "1.3", std::to_string(cur_time), std::to_string(cur_time - 100)}; std::string value; sdk_codec.EncodeRow(row, &value); - ASSERT_TRUE(client.Put(tid, pid, cur_time, value, dimensions)); + ASSERT_TRUE(client.Put(tid, pid, cur_time, value, dimensions).OK()); sleep(3); ::openmldb::test::TempPath temp_path; diff --git a/src/schema/index_test.cc b/src/schema/index_test.cc index 73365844f51..780e083c9be 100644 --- a/src/schema/index_test.cc +++ b/src/schema/index_test.cc @@ -65,11 +65,41 @@ TEST_F(IndexTest, CheckExist) { SchemaCodec::SetIndex(&test_index4, "index1", "aa", "ts2", ::openmldb::type::kAbsoluteTime, 0, 0); ASSERT_TRUE(IndexUtil::IsExist(test_index1, table_info.column_key())); + ASSERT_FALSE(IndexUtil::IsExist(test_index2, table_info.column_key())); + table_info.mutable_column_key(1)->set_flag(0); ASSERT_TRUE(IndexUtil::IsExist(test_index2, table_info.column_key())); ASSERT_FALSE(IndexUtil::IsExist(test_index3, table_info.column_key())); ASSERT_TRUE(IndexUtil::IsExist(test_index4, table_info.column_key())); } +TEST_F(IndexTest, GetPosition) { + openmldb::nameserver::TableInfo table_info; + SchemaCodec::SetColumnDesc(table_info.add_column_desc(), "card", ::openmldb::type::kString); + SchemaCodec::SetColumnDesc(table_info.add_column_desc(), "mcc", ::openmldb::type::kString); + SchemaCodec::SetColumnDesc(table_info.add_column_desc(), "ts1", ::openmldb::type::kBigInt); + SchemaCodec::SetColumnDesc(table_info.add_column_desc(), "ts2", ::openmldb::type::kBigInt); + SchemaCodec::SetIndex(table_info.add_column_key(), "index1", "card", "ts1", ::openmldb::type::kAbsoluteTime, 0, 0); + auto index2 = table_info.add_column_key(); + SchemaCodec::SetIndex(index2, "index2", "card", "ts2", ::openmldb::type::kAbsoluteTime, 0, 0); + index2->set_flag(1); + + ::openmldb::common::ColumnKey test_index1; + SchemaCodec::SetIndex(&test_index1, "test_index1", "card", "ts1", ::openmldb::type::kAbsoluteTime, 0, 0); + ::openmldb::common::ColumnKey test_index2; + SchemaCodec::SetIndex(&test_index2, "test_index2", "card", "ts2", ::openmldb::type::kAbsoluteTime, 0, 0); + ::openmldb::common::ColumnKey test_index3; + SchemaCodec::SetIndex(&test_index3, "test_index3", "mcc", "ts2", ::openmldb::type::kAbsoluteTime, 0, 0); + ::openmldb::common::ColumnKey test_index4; + SchemaCodec::SetIndex(&test_index4, "index1", "aa", "ts2", ::openmldb::type::kAbsoluteTime, 0, 0); + + ASSERT_EQ(IndexUtil::GetPosition(test_index1, table_info.column_key()), 0); + ASSERT_EQ(IndexUtil::GetPosition(test_index2, table_info.column_key()), 1); + table_info.mutable_column_key(1)->set_flag(0); + ASSERT_EQ(IndexUtil::GetPosition(test_index2, table_info.column_key()), 1); + ASSERT_EQ(IndexUtil::GetPosition(test_index3, table_info.column_key()), -1); + ASSERT_EQ(IndexUtil::GetPosition(test_index4, table_info.column_key()), -1); +} + 
TEST_F(IndexTest, CheckIndex) { PBSchema schema; SchemaCodec::SetColumnDesc(schema.Add(), "card", ::openmldb::type::kString); diff --git a/src/schema/index_util.cc b/src/schema/index_util.cc index fead40b5c70..097562d48e3 100644 --- a/src/schema/index_util.cc +++ b/src/schema/index_util.cc @@ -154,14 +154,11 @@ base::Status IndexUtil::CheckUnique(const PBIndex& index) { bool IndexUtil::IsExist(const ::openmldb::common::ColumnKey& column_key, const PBIndex& index) { std::string id_str = GetIDStr(column_key); for (int32_t index_pos = 0; index_pos < index.size(); index_pos++) { - if (index.Get(index_pos).index_name() == column_key.index_name()) { - if (index.Get(index_pos).flag() == 0) { + if (index.Get(index_pos).flag() == 0) { + if (index.Get(index_pos).index_name() == column_key.index_name() || + id_str == GetIDStr(index.Get(index_pos))) { return true; } - break; - } - if (id_str == GetIDStr(index.Get(index_pos))) { - return true; } } return false; @@ -170,12 +167,6 @@ bool IndexUtil::IsExist(const ::openmldb::common::ColumnKey& column_key, const P int IndexUtil::GetPosition(const ::openmldb::common::ColumnKey& column_key, const PBIndex& index) { std::string id_str = GetIDStr(column_key); for (int32_t index_pos = 0; index_pos < index.size(); index_pos++) { - if (index.Get(index_pos).index_name() == column_key.index_name()) { - if (index.Get(index_pos).flag() == 0) { - return index_pos; - } - break; - } if (id_str == GetIDStr(index.Get(index_pos))) { return index_pos; } diff --git a/src/sdk/mini_cluster_batch_bm.cc b/src/sdk/mini_cluster_batch_bm.cc index 1b5227f3367..8dc4e9e665e 100644 --- a/src/sdk/mini_cluster_batch_bm.cc +++ b/src/sdk/mini_cluster_batch_bm.cc @@ -96,7 +96,7 @@ static void BM_SimpleQueryFunction(benchmark::State& state) { // NOLINT uint32_t tid = sdk.GetTableId(db, name); { for (int32_t i = 0; i < 1000; i++) { - ok = tablet[0]->GetClient()->Put(tid, 0, pk, ts + i, value); + tablet[0]->GetClient()->Put(tid, 0, pk, ts + i, value); } } std::string sql = "select col1, col2 + 1, col3, col4, col5 from " + name + " ;"; diff --git a/src/sdk/node_adapter.cc b/src/sdk/node_adapter.cc index ef9de07a774..2a7960741a8 100644 --- a/src/sdk/node_adapter.cc +++ b/src/sdk/node_adapter.cc @@ -330,7 +330,7 @@ bool NodeAdapter::TransformToTableDef(::hybridse::node::CreatePlanNode* create_n status->code = hybridse::common::kTypeError; return false; } - auto val = TransformDataType(*dynamic_cast(default_val), + auto val = TransformDataType(*dynamic_cast(default_val), add_column_desc->data_type()); if (!val) { status->msg = "default value type mismatch"; diff --git a/src/sdk/node_adapter_test.cc b/src/sdk/node_adapter_test.cc index e09758b07cd..70c35ff7d9c 100644 --- a/src/sdk/node_adapter_test.cc +++ b/src/sdk/node_adapter_test.cc @@ -64,7 +64,7 @@ void CheckTablePartition(const ::openmldb::nameserver::TableInfo& table_info, if (table_partition.partition_meta(pos).is_leader()) { ASSERT_EQ(table_partition.partition_meta(pos).endpoint(), leader); } else { - ASSERT_EQ(follower.count(table_partition.partition_meta(pos).endpoint()), 1); + ASSERT_EQ(follower.count(table_partition.partition_meta(pos).endpoint()), (std::size_t)1); } } } diff --git a/src/sdk/sql_cache.h b/src/sdk/sql_cache.h index a326437c10f..1fe0b346fa2 100644 --- a/src/sdk/sql_cache.h +++ b/src/sdk/sql_cache.h @@ -54,26 +54,28 @@ class InsertSQLCache : public SQLCache { InsertSQLCache(const std::shared_ptr<::openmldb::nameserver::TableInfo>& table_info, const std::shared_ptr<::hybridse::sdk::Schema>& column_schema, DefaultValueMap 
default_map, - uint32_t str_length, std::vector hole_idx_arr) + uint32_t str_length, std::vector hole_idx_arr, bool put_if_absent) : SQLCache(table_info->db(), table_info->tid(), table_info->name()), table_info_(table_info), column_schema_(column_schema), default_map_(std::move(default_map)), str_length_(str_length), - hole_idx_arr_(std::move(hole_idx_arr)) {} + hole_idx_arr_(std::move(hole_idx_arr)), + put_if_absent_(put_if_absent) {} std::shared_ptr<::openmldb::nameserver::TableInfo> GetTableInfo() { return table_info_; } std::shared_ptr<::hybridse::sdk::Schema> GetSchema() const { return column_schema_; } uint32_t GetStrLength() const { return str_length_; } const DefaultValueMap& GetDefaultValue() const { return default_map_; } const std::vector& GetHoleIdxArr() const { return hole_idx_arr_; } - + const bool IsPutIfAbsent() const { return put_if_absent_; } private: std::shared_ptr<::openmldb::nameserver::TableInfo> table_info_; std::shared_ptr<::hybridse::sdk::Schema> column_schema_; const DefaultValueMap default_map_; const uint32_t str_length_; const std::vector hole_idx_arr_; + const bool put_if_absent_; }; class RouterSQLCache : public SQLCache { diff --git a/src/sdk/sql_cluster_router.cc b/src/sdk/sql_cluster_router.cc index 0e172322921..bdad16cfc8c 100644 --- a/src/sdk/sql_cluster_router.cc +++ b/src/sdk/sql_cluster_router.cc @@ -320,6 +320,7 @@ bool SQLClusterRouter::Init() { session_variables_.emplace("enable_trace", "false"); session_variables_.emplace("sync_job", "false"); session_variables_.emplace("job_timeout", "60000"); // rpc request timeout for taskmanager + session_variables_.emplace("insert_memory_usage_limit", "0"); session_variables_.emplace("spark_config", ""); } return true; @@ -454,39 +455,40 @@ std::shared_ptr SQLClusterRouter::GetInsertRow(const std::string& *status = {}; return std::make_shared(insert_cache->GetTableInfo(), insert_cache->GetSchema(), insert_cache->GetDefaultValue(), insert_cache->GetStrLength(), - insert_cache->GetHoleIdxArr()); + insert_cache->GetHoleIdxArr(), insert_cache->IsPutIfAbsent()); } } std::shared_ptr<::openmldb::nameserver::TableInfo> table_info; DefaultValueMap default_map; uint32_t str_length = 0; std::vector stmt_column_idx_arr; - if (!GetInsertInfo(db, sql, status, &table_info, &default_map, &str_length, &stmt_column_idx_arr)) { + bool put_if_absent = false; + if (!GetInsertInfo(db, sql, status, &table_info, &default_map, &str_length, &stmt_column_idx_arr, &put_if_absent)) { SET_STATUS_AND_WARN(status, StatusCode::kCmdError, "get insert information failed"); return {}; } auto schema = openmldb::schema::SchemaAdapter::ConvertSchema(table_info->column_desc()); - auto insert_cache = - std::make_shared(table_info, schema, default_map, str_length, - SQLInsertRow::GetHoleIdxArr(default_map, stmt_column_idx_arr, schema)); + auto insert_cache = std::make_shared( + table_info, schema, default_map, str_length, + SQLInsertRow::GetHoleIdxArr(default_map, stmt_column_idx_arr, schema), put_if_absent); SetCache(db, sql, hybridse::vm::kBatchMode, insert_cache); *status = {}; return std::make_shared(insert_cache->GetTableInfo(), insert_cache->GetSchema(), insert_cache->GetDefaultValue(), insert_cache->GetStrLength(), - insert_cache->GetHoleIdxArr()); + insert_cache->GetHoleIdxArr(), insert_cache->IsPutIfAbsent()); } bool SQLClusterRouter::GetMultiRowInsertInfo(const std::string& db, const std::string& sql, ::hybridse::sdk::Status* status, std::shared_ptr<::openmldb::nameserver::TableInfo>* table_info, std::vector* default_maps, - std::vector* 
str_lengths) { + std::vector* str_lengths, bool* put_if_absent) { RET_FALSE_IF_NULL_AND_WARN(status, "output status is nullptr"); // TODO(hw): return status? RET_FALSE_IF_NULL_AND_WARN(table_info, "output table_info is nullptr"); RET_FALSE_IF_NULL_AND_WARN(default_maps, "output default_maps is nullptr"); RET_FALSE_IF_NULL_AND_WARN(str_lengths, "output str_lengths is nullptr"); - + RET_FALSE_IF_NULL_AND_WARN(put_if_absent, "output put_if_absent is nullptr"); ::hybridse::node::NodeManager nm; ::hybridse::plan::PlanNodeList plans; bool ok = GetSQLPlan(sql, &nm, &plans); @@ -505,6 +507,7 @@ bool SQLClusterRouter::GetMultiRowInsertInfo(const std::string& db, const std::s SET_STATUS_AND_WARN(status, StatusCode::kPlanError, "insert stmt is null"); return false; } + *put_if_absent = insert_stmt->insert_mode_ == ::hybridse::node::InsertStmt::IGNORE; std::string db_name; if (!insert_stmt->db_name_.empty()) { db_name = insert_stmt->db_name_; @@ -575,7 +578,7 @@ bool SQLClusterRouter::GetMultiRowInsertInfo(const std::string& db, const std::s bool SQLClusterRouter::GetInsertInfo(const std::string& db, const std::string& sql, ::hybridse::sdk::Status* status, std::shared_ptr<::openmldb::nameserver::TableInfo>* table_info, DefaultValueMap* default_map, uint32_t* str_length, - std::vector* stmt_column_idx_in_table) { + std::vector* stmt_column_idx_in_table, bool* put_if_absent) { RET_FALSE_IF_NULL_AND_WARN(status, "output status is nullptr"); RET_FALSE_IF_NULL_AND_WARN(table_info, "output table_info is nullptr"); RET_FALSE_IF_NULL_AND_WARN(default_map, "output default_map is nullptr"); @@ -635,6 +638,7 @@ bool SQLClusterRouter::GetInsertInfo(const std::string& db, const std::string& s SET_STATUS_AND_WARN(status, StatusCode::kCmdError, "get default value map of " + sql + " failed"); return false; } + *put_if_absent = insert_stmt->insert_mode_ == ::hybridse::node::InsertStmt::IGNORE; return true; } @@ -770,23 +774,24 @@ std::shared_ptr SQLClusterRouter::GetInsertRows(const std::string status->SetOK(); return std::make_shared(insert_cache->GetTableInfo(), insert_cache->GetSchema(), insert_cache->GetDefaultValue(), insert_cache->GetStrLength(), - insert_cache->GetHoleIdxArr()); + insert_cache->GetHoleIdxArr(), insert_cache->IsPutIfAbsent()); } } std::shared_ptr<::openmldb::nameserver::TableInfo> table_info; DefaultValueMap default_map; uint32_t str_length = 0; std::vector stmt_column_idx_arr; - if (!GetInsertInfo(db, sql, status, &table_info, &default_map, &str_length, &stmt_column_idx_arr)) { + bool put_if_absent = false; + if (!GetInsertInfo(db, sql, status, &table_info, &default_map, &str_length, &stmt_column_idx_arr, &put_if_absent)) { return {}; } auto col_schema = openmldb::schema::SchemaAdapter::ConvertSchema(table_info->column_desc()); - insert_cache = - std::make_shared(table_info, col_schema, default_map, str_length, - SQLInsertRow::GetHoleIdxArr(default_map, stmt_column_idx_arr, col_schema)); + insert_cache = std::make_shared( + table_info, col_schema, default_map, str_length, + SQLInsertRow::GetHoleIdxArr(default_map, stmt_column_idx_arr, col_schema), put_if_absent); SetCache(db, sql, hybridse::vm::kBatchMode, insert_cache); return std::make_shared(table_info, insert_cache->GetSchema(), default_map, str_length, - insert_cache->GetHoleIdxArr()); + insert_cache->GetHoleIdxArr(), insert_cache->IsPutIfAbsent()); } bool SQLClusterRouter::ExecuteDDL(const std::string& db, const std::string& sql, hybridse::sdk::Status* status) { @@ -1302,7 +1307,8 @@ bool SQLClusterRouter::ExecuteInsert(const std::string& 
db, const std::string& s std::shared_ptr<::openmldb::nameserver::TableInfo> table_info; std::vector default_maps; std::vector str_lengths; - if (!GetMultiRowInsertInfo(db, sql, status, &table_info, &default_maps, &str_lengths)) { + bool put_if_absent; + if (!GetMultiRowInsertInfo(db, sql, status, &table_info, &default_maps, &str_lengths, &put_if_absent)) { CODE_PREPEND_AND_WARN(status, StatusCode::kCmdError, "Fail to get insert info"); return false; } @@ -1317,7 +1323,7 @@ bool SQLClusterRouter::ExecuteInsert(const std::string& db, const std::string& s } std::vector fails; for (size_t i = 0; i < default_maps.size(); i++) { - auto row = std::make_shared(table_info, schema, default_maps[i], str_lengths[i]); + auto row = std::make_shared(table_info, schema, default_maps[i], str_lengths[i], put_if_absent); if (!row) { LOG(WARNING) << "fail to parse row[" << i << "]"; fails.push_back(i); @@ -1366,13 +1372,20 @@ bool SQLClusterRouter::PutRow(uint32_t tid, const std::shared_ptr& if (client) { DLOG(INFO) << "put data to endpoint " << client->GetEndpoint() << " with dimensions size " << kv.second.size(); - bool ret = client->Put(tid, pid, cur_ts, row->GetRow(), kv.second); - if (!ret) { - SET_STATUS_AND_WARN(status, StatusCode::kCmdError, - "INSERT failed, tid " + std::to_string(tid) + - ". Note that data might have been partially inserted. " - "You are encouraged to perform DELETE to remove any partially " - "inserted data before trying INSERT again."); + auto ret = client->Put(tid, pid, cur_ts, row->GetRow(), kv.second, + insert_memory_usage_limit_.load(std::memory_order_relaxed), row->IsPutIfAbsent()); + if (!ret.OK()) { + if (RevertPut(row->GetTableInfo(), pid, dimensions, cur_ts, + base::Slice(row->GetRow()), tablets).IsOK()) { + SET_STATUS_AND_WARN(status, StatusCode::kCmdError, + absl::StrCat("INSERT failed, tid ", tid)); + } else { + SET_STATUS_AND_WARN(status, StatusCode::kCmdError, + "INSERT failed, tid " + std::to_string(tid) + + ". Note that data might have been partially inserted. " + "You are encouraged to perform DELETE to remove any partially " + "inserted data before trying INSERT again."); + } return false; } continue; @@ -1440,8 +1453,8 @@ bool SQLClusterRouter::ExecuteInsert(const std::string& db, const std::string& s } bool SQLClusterRouter::ExecuteInsert(const std::string& db, const std::string& name, int tid, int partition_num, - hybridse::sdk::ByteArrayPtr dimension, int dimension_len, - hybridse::sdk::ByteArrayPtr value, int len, hybridse::sdk::Status* status) { + hybridse::sdk::ByteArrayPtr dimension, int dimension_len, + hybridse::sdk::ByteArrayPtr value, int len, bool put_if_absent, hybridse::sdk::Status* status) { RET_FALSE_IF_NULL_AND_WARN(status, "output status is nullptr"); if (dimension == nullptr || dimension_len <= 0 || value == nullptr || len <= 0 || partition_num <= 0) { *status = {StatusCode::kCmdError, "invalid parameter"}; @@ -1482,13 +1495,30 @@ bool SQLClusterRouter::ExecuteInsert(const std::string& db, const std::string& n if (client) { DLOG(INFO) << "put data to endpoint " << client->GetEndpoint() << " with dimensions size " << kv.second.size(); - bool ret = client->Put(tid, pid, cur_ts, row_value, &kv.second); - if (!ret) { + auto ret = client->Put(tid, pid, cur_ts, row_value, &kv.second, + insert_memory_usage_limit_.load(std::memory_order_relaxed), put_if_absent); + if (!ret.OK()) { SET_STATUS_AND_WARN(status, StatusCode::kCmdError, - "INSERT failed, tid " + std::to_string(tid) + - ". Note that data might have been partially inserted. 
" - "You are encouraged to perform DELETE to remove any partially " - "inserted data before trying INSERT again."); + "INSERT failed, tid " + std::to_string(tid) + + ". Note that data might have been partially inserted. " + "You are encouraged to perform DELETE to remove any partially " + "inserted data before trying INSERT again."); + std::map>> dimensions; + for (const auto& val : dimensions_map) { + std::vector> vec; + for (const auto& data : val.second) { + vec.emplace_back(data.key(), data.idx()); + } + dimensions.emplace(val.first, std::move(vec)); + } + auto table_info = cluster_sdk_->GetTableInfo(db, name); + if (!table_info) { + return false; + } + if (RevertPut(*table_info, pid, dimensions, cur_ts, row_value, tablets).IsOK()) { + SET_STATUS_AND_WARN(status, StatusCode::kCmdError, + absl::StrCat("INSERT failed, tid ", tid)); + } return false; } continue; @@ -2721,6 +2751,7 @@ std::shared_ptr SQLClusterRouter::ExecuteSQL( return {}; } column_key.set_index_name(create_index_node->index_name_); + // no skip load data, so it's always be a async op if (ns_ptr->AddIndex(db_name, create_index_node->table_name_, column_key, nullptr, msg)) { *status = {::hybridse::common::StatusCode::kOk, "AddIndex is an asynchronous job. Run 'SHOW JOBS FROM NAMESERVER' to see the job status"}; @@ -2828,7 +2859,8 @@ std::shared_ptr SQLClusterRouter::ExecuteSQL( } if (!cluster_sdk_->IsClusterMode() || is_local.value()) { if (cluster_sdk_->IsClusterMode() && !IsOnlineMode()) { - *status = {::hybridse::common::StatusCode::kCmdError, "local load only supports loading data to online storage"}; + *status = {::hybridse::common::StatusCode::kCmdError, + "local load only supports loading data to online storage"}; return {}; } @@ -2859,6 +2891,8 @@ std::shared_ptr SQLClusterRouter::ExecuteSQL( ::openmldb::base::Status base_status; if (is_online_mode) { // Handle in online mode + config.emplace("insert_memory_usage_limit", + std::to_string(insert_memory_usage_limit_.load(std::memory_order_relaxed))); base_status = ImportOnlineData(sql, config, database, is_sync_job, offline_job_timeout, &job_info); } else { // Handle in offline mode @@ -3112,6 +3146,15 @@ ::hybridse::sdk::Status SQLClusterRouter::SetVariable(hybridse::node::SetPlanNod if (!absl::SimpleAtoi(value, &new_timeout)) { return {StatusCode::kCmdError, "Fail to parse value, can't set the request timeout"}; } + } else if (key == "insert_memory_usage_limit") { + int limit = 0; + if (!absl::SimpleAtoi(value, &limit)) { + return {StatusCode::kCmdError, "Fail to parse value, can't set the insert_memory_usage_limit"}; + } + if (limit < 0 || limit > 100) { + return {StatusCode::kCmdError, "Invalid value! 
The value must be between 0 and 100"}; + } + insert_memory_usage_limit_.store(limit, std::memory_order_relaxed); } else if (key == "spark_config") { if (!CheckSparkConfigString(value)) { return {StatusCode::kCmdError, @@ -3755,6 +3798,19 @@ hybridse::sdk::Status SQLClusterRouter::HandleDeploy(const std::string& db, if (!get_index_status.IsOK()) { return get_index_status; } + std::stringstream index_stream; + for (auto[db, db_map] : new_index_map) { + for (auto[table, index_list] : db_map) { + for (auto index : index_list) { + index_stream << db << "-" << table << "-"; + for (auto col : index.col_name()) { + index_stream << col << ","; + } + index_stream << "|" << index.ts_name() << ";"; + } + } + } + LOG(INFO) << "should create new indexs: " << index_stream.str(); if (!new_index_map.empty()) { if (cluster_sdk_->IsClusterMode() && record_cnt > 0) { @@ -4725,6 +4781,56 @@ std::shared_ptr SQLClusterRouter::GetNameServerJobResu return rs; } +::hybridse::sdk::Status SQLClusterRouter::RevertPut(const nameserver::TableInfo& table_info, + uint32_t end_pid, + const std::map>>& dimensions, + uint64_t ts, + const base::Slice& value, + const std::vector>& tablets) { + codec::RowView row_view(table_info.column_desc()); + std::map column_map; + for (int32_t i = 0; i < table_info.column_desc_size(); i++) { + column_map.emplace(table_info.column_desc(i).name(), i); + } + const int8_t* data = reinterpret_cast(value.data()); + for (const auto& kv : dimensions) { + if (static_cast(kv.first) > tablets.size()) { + return {StatusCode::kCmdError, absl::StrCat("pid ", kv.first, + " is greater than the tablets size ", tablets.size())}; + } + auto tablet = tablets[kv.first]; + if (!tablet) { + continue; + } + auto client = tablet->GetClient(); + for (const auto& val : kv.second) { + if (val.second >= static_cast(table_info.column_key_size())) { + return {StatusCode::kCmdError, absl::StrCat("invalid index pos ", val.second)}; + } + const auto& index = table_info.column_key(val.second); + if (index.flag()) { + continue; + } + int64_t cur_ts = ts; + if (!index.ts_name().empty()) { + if (auto it = column_map.find(index.ts_name()); it == column_map.end()) { + return {StatusCode::kCmdError, absl::StrCat("invalid ts name ", index.ts_name())}; + } else if (row_view.GetInteger(data, it->second, + table_info.column_desc(it->second).data_type(), &cur_ts) != 0) { + return {StatusCode::kCmdError, "get ts failed"}; + } + } + std::map index_val = { {val.second, val.first} }; + uint64_t end_ts = cur_ts > 0 ? 
cur_ts - 1 : 0; + client->Delete(table_info.tid(), kv.first, index_val, "", cur_ts, end_ts); + } + if (kv.first == end_pid) { + break; + } + } + return {}; +} + common::ColumnKey Bias::AddBias(const common::ColumnKey& index) const { if (!index.has_ttl()) { LOG(WARNING) << "index has no ttl, skip bias"; diff --git a/src/sdk/sql_cluster_router.h b/src/sdk/sql_cluster_router.h index 53d2389b575..0b9f6cca272 100644 --- a/src/sdk/sql_cluster_router.h +++ b/src/sdk/sql_cluster_router.h @@ -87,7 +87,7 @@ class SQLClusterRouter : public SQLRouter { bool ExecuteInsert(const std::string& db, const std::string& name, int tid, int partition_num, hybridse::sdk::ByteArrayPtr dimension, int dimension_len, - hybridse::sdk::ByteArrayPtr value, int len, hybridse::sdk::Status* status) override; + hybridse::sdk::ByteArrayPtr value, int len, bool put_if_absent, hybridse::sdk::Status* status) override; bool ExecuteDelete(std::shared_ptr row, hybridse::sdk::Status* status) override; @@ -316,10 +316,11 @@ class SQLClusterRouter : public SQLRouter { bool GetInsertInfo(const std::string& db, const std::string& sql, ::hybridse::sdk::Status* status, std::shared_ptr<::openmldb::nameserver::TableInfo>* table_info, DefaultValueMap* default_map, - uint32_t* str_length, std::vector* stmt_column_idx_in_table); + uint32_t* str_length, std::vector* stmt_column_idx_in_table, bool* put_if_absent); bool GetMultiRowInsertInfo(const std::string& db, const std::string& sql, ::hybridse::sdk::Status* status, std::shared_ptr<::openmldb::nameserver::TableInfo>* table_info, - std::vector* default_maps, std::vector* str_lengths); + std::vector* default_maps, std::vector* str_lengths, + bool* put_if_absent); DefaultValueMap GetDefaultMap(const std::shared_ptr<::openmldb::nameserver::TableInfo>& table_info, const std::map& column_map, ::hybridse::node::ExprListNode* row, @@ -423,6 +424,13 @@ class SQLClusterRouter : public SQLRouter { int64_t timeout_ms, const base::Slice& row, const std::string& router_col, hybridse::sdk::Status* status); + ::hybridse::sdk::Status RevertPut(const nameserver::TableInfo& table_info, + uint32_t end_pid, + const std::map>>& dimensions, + uint64_t ts, + const base::Slice& value, + const std::vector>& tablets); + private: std::shared_ptr options_; std::string db_; @@ -434,6 +442,7 @@ class SQLClusterRouter : public SQLRouter { input_lru_cache_; ::openmldb::base::SpinMutex mu_; ::openmldb::base::Random rand_; + std::atomic insert_memory_usage_limit_ = 0; // [0-100], the default value 0 means unlimited }; class Bias { diff --git a/src/sdk/sql_cluster_test.cc b/src/sdk/sql_cluster_test.cc index 9374841d71e..c3bb0f08e9d 100644 --- a/src/sdk/sql_cluster_test.cc +++ b/src/sdk/sql_cluster_test.cc @@ -121,13 +121,19 @@ TEST_F(SQLClusterDDLTest, TestShowAndDropDeployment) { router->ExecuteSQL(db, "deploy " + deploy_name + " select col1 from " + table_name + ";", &status); ASSERT_TRUE(status.IsOK()); - router->ExecuteSQL(db2, "deploy " + deploy_name + " select col1 from " + db + "." + table_name + ";", &status); + std::string sql = absl::StrCat("deploy ", deploy_name, + " OPTIONS(RANGE_BIAS=\"inf\", ROWS_BIAS=\"inf\") select col1 from ", db, ".", table_name, ";"); + router->ExecuteSQL(db2, sql, &status); ASSERT_TRUE(status.IsOK()); - router->ExecuteSQL(db, "show deployment " + deploy_name + ";", &status); + auto rs = router->ExecuteSQL(db, "show deployment " + deploy_name + ";", &status); ASSERT_TRUE(status.IsOK()); - router->ExecuteSQL(db, "show deployment " + db2 + "." 
+ deploy_name + ";", &status); + ASSERT_TRUE(rs->Next()); + ASSERT_TRUE(rs->GetStringUnsafe(0).find("OPTIONS") == std::string::npos); + rs = router->ExecuteSQL(db, "show deployment " + db2 + "." + deploy_name + ";", &status); ASSERT_TRUE(status.IsOK()); + ASSERT_TRUE(rs->Next()); + ASSERT_TRUE(rs->GetStringUnsafe(0).find("OPTIONS(RANGE_BIAS=\"inf\", ROWS_BIAS=\"inf\")") != std::string::npos); router->ExecuteSQL(db, "drop deployment " + deploy_name + ";", &status); ASSERT_TRUE(status.IsOK()); diff --git a/src/sdk/sql_insert_row.cc b/src/sdk/sql_insert_row.cc index e4da7c50669..492bb80e49b 100644 --- a/src/sdk/sql_insert_row.cc +++ b/src/sdk/sql_insert_row.cc @@ -29,33 +29,35 @@ namespace sdk { SQLInsertRows::SQLInsertRows(std::shared_ptr<::openmldb::nameserver::TableInfo> table_info, std::shared_ptr schema, DefaultValueMap default_map, - uint32_t default_str_length, const std::vector& hole_idx_arr) + uint32_t default_str_length, const std::vector& hole_idx_arr, bool put_if_absent) : table_info_(std::move(table_info)), schema_(std::move(schema)), default_map_(std::move(default_map)), default_str_length_(default_str_length), - hole_idx_arr_(hole_idx_arr) {} + hole_idx_arr_(hole_idx_arr), + put_if_absent_(put_if_absent) {} std::shared_ptr SQLInsertRows::NewRow() { if (!rows_.empty() && !rows_.back()->IsComplete()) { return {}; } - std::shared_ptr row = - std::make_shared(table_info_, schema_, default_map_, default_str_length_, hole_idx_arr_); + std::shared_ptr row = std::make_shared( + table_info_, schema_, default_map_, default_str_length_, hole_idx_arr_, put_if_absent_); rows_.push_back(row); return row; } SQLInsertRow::SQLInsertRow(std::shared_ptr<::openmldb::nameserver::TableInfo> table_info, std::shared_ptr schema, DefaultValueMap default_map, - uint32_t default_string_length) + uint32_t default_string_length, bool put_if_absent) : table_info_(table_info), schema_(std::move(schema)), default_map_(std::move(default_map)), default_string_length_(default_string_length), rb_(table_info->column_desc()), val_(), - str_size_(0) { + str_size_(0), + put_if_absent_(put_if_absent) { std::map column_name_map; for (int idx = 0; idx < table_info_->column_desc_size(); idx++) { column_name_map.emplace(table_info_->column_desc(idx).name(), idx); @@ -64,12 +66,16 @@ SQLInsertRow::SQLInsertRow(std::shared_ptr<::openmldb::nameserver::TableInfo> ta index_map_.clear(); raw_dimensions_.clear(); for (int idx = 0; idx < table_info_->column_key_size(); ++idx) { - for (const auto& column : table_info_->column_key(idx).col_name()) { + const auto& index = table_info_->column_key(idx); + if (index.flag()) { + continue; + } + for (const auto& column : index.col_name()) { index_map_[idx].push_back(column_name_map[column]); raw_dimensions_[column_name_map[column]] = hybridse::codec::NONETOKEN; } - if (!table_info_->column_key(idx).ts_name().empty()) { - ts_set_.insert(column_name_map[table_info_->column_key(idx).ts_name()]); + if (!index.ts_name().empty()) { + ts_set_.insert(column_name_map[index.ts_name()]); } } } @@ -77,8 +83,9 @@ SQLInsertRow::SQLInsertRow(std::shared_ptr<::openmldb::nameserver::TableInfo> ta SQLInsertRow::SQLInsertRow(std::shared_ptr<::openmldb::nameserver::TableInfo> table_info, std::shared_ptr schema, DefaultValueMap default_map, - uint32_t default_str_length, std::vector hole_idx_arr) - : SQLInsertRow(std::move(table_info), std::move(schema), std::move(default_map), default_str_length) { + uint32_t default_str_length, std::vector hole_idx_arr, bool put_if_absent) + : 
SQLInsertRow(std::move(table_info), std::move(schema), std::move(default_map), default_str_length, + put_if_absent) { hole_idx_arr_ = std::move(hole_idx_arr); } diff --git a/src/sdk/sql_insert_row.h b/src/sdk/sql_insert_row.h index ded1c824e19..af18891587f 100644 --- a/src/sdk/sql_insert_row.h +++ b/src/sdk/sql_insert_row.h @@ -103,12 +103,13 @@ class DefaultValueContainer { class SQLInsertRow { public: + // for raw insert sql(no hole) SQLInsertRow(std::shared_ptr<::openmldb::nameserver::TableInfo> table_info, std::shared_ptr schema, DefaultValueMap default_map, - uint32_t default_str_length); + uint32_t default_str_length, bool put_if_absent); SQLInsertRow(std::shared_ptr<::openmldb::nameserver::TableInfo> table_info, std::shared_ptr schema, DefaultValueMap default_map, - uint32_t default_str_length, std::vector hole_idx_arr); + uint32_t default_str_length, std::vector hole_idx_arr, bool put_if_absent); ~SQLInsertRow() = default; bool Init(int str_length); bool AppendBool(bool val); @@ -155,6 +156,10 @@ class SQLInsertRow { return *table_info_; } + bool IsPutIfAbsent() const { + return put_if_absent_; + } + private: bool MakeDefault(); void PackDimension(const std::string& val); @@ -175,13 +180,14 @@ class SQLInsertRow { ::openmldb::codec::RowBuilder rb_; std::string val_; uint32_t str_size_; + bool put_if_absent_; }; class SQLInsertRows { public: SQLInsertRows(std::shared_ptr<::openmldb::nameserver::TableInfo> table_info, std::shared_ptr schema, DefaultValueMap default_map, uint32_t str_size, - const std::vector& hole_idx_arr); + const std::vector& hole_idx_arr, bool put_if_absent); ~SQLInsertRows() = default; std::shared_ptr NewRow(); inline uint32_t GetCnt() { return rows_.size(); } @@ -200,6 +206,7 @@ class SQLInsertRows { DefaultValueMap default_map_; uint32_t default_str_length_; std::vector hole_idx_arr_; + bool put_if_absent_; std::vector> rows_; }; diff --git a/src/sdk/sql_router.cc b/src/sdk/sql_router.cc index 72e3adeaa3d..32c555a096b 100644 --- a/src/sdk/sql_router.cc +++ b/src/sdk/sql_router.cc @@ -15,11 +15,17 @@ */ #include "sdk/sql_router.h" + #include + +#include "absl/strings/substitute.h" #include "base/ddl_parser.h" #include "glog/logging.h" #include "schema/schema_adapter.h" #include "sdk/sql_cluster_router.h" +#include "zetasql/parser/parser.h" +#include "zetasql/public/error_helpers.h" +#include "zetasql/public/error_location.pb.h" namespace openmldb::sdk { @@ -274,4 +280,81 @@ std::vector> GetDependentTables( return tables; } +std::shared_ptr QueryToDAG(const zetasql::ASTQuery* query, absl::string_view name) { + std::vector> producers; + if (query->with_clause() != nullptr) { + for (auto with_entry : query->with_clause()->with()) { + producers.push_back(QueryToDAG(with_entry->query(), with_entry->alias()->GetAsStringView())); + } + } + + // SQL without WITH clause + std::string sql = zetasql::Unparse(query->query_expr()); + if (query->order_by() != nullptr) { + absl::StrAppend(&sql, zetasql::Unparse(query->order_by())); + } + if (query->limit_offset() != nullptr) { + absl::StrAppend(&sql, zetasql::Unparse(query->limit_offset())); + } + + return std::make_shared(name, sql, producers); +} + +std::shared_ptr SQLRouter::SQLToDAG(const std::string& query, hybridse::sdk::Status* status) { + std::unique_ptr parser_output; + zetasql::ParserOptions parser_opts; + zetasql::LanguageOptions language_opts; + language_opts.EnableLanguageFeature(zetasql::FEATURE_V_1_3_COLUMN_DEFAULT_VALUE); + parser_opts.set_language_options(&language_opts); + auto zetasql_status = 
zetasql::ParseStatement(query, parser_opts, &parser_output); + zetasql::ErrorLocation location; + if (!zetasql_status.ok()) { + zetasql::ErrorLocation location; + GetErrorLocation(zetasql_status, &location); + status->msg = zetasql::FormatError(zetasql_status); + status->code = hybridse::common::kSyntaxError; + return {}; + } + + auto stmt = parser_output->statement(); + if (stmt == nullptr) { + status->msg = "not a statement"; + status->code = hybridse::common::kSyntaxError; + return {}; + } + + if (stmt->node_kind() != zetasql::AST_QUERY_STATEMENT) { + status->msg = "not a query"; + status->code = hybridse::common::kSyntaxError; + return {}; + } + + auto const query_stmt = stmt->GetAsOrNull(); + if (query_stmt == nullptr) { + status->msg = "not a query"; + status->code = hybridse::common::kSyntaxError; + return {}; + } + + status->code = hybridse::common::kOk; + return QueryToDAG(query_stmt->query(), ""); +} + +bool DAGNode::operator==(const DAGNode& rhs) const noexcept { + return name == rhs.name && sql == rhs.sql && + absl::c_equal(producers, rhs.producers, + [](const std::shared_ptr& left, const std::shared_ptr& right) { + return left != nullptr && right != nullptr && *left == *right; + }); +} + +std::ostream& operator<<(std::ostream& os, const DAGNode& obj) { return os << obj.DebugString(); } + +std::string DAGNode::DebugString() const { + return absl::Substitute("{$0, $1, [$2]}", name, sql, + absl::StrJoin(producers, ",", [](std::string* out, const std::shared_ptr& e) { + absl::StrAppend(out, (e == nullptr ? "" : e->DebugString())); + })); +} + } // namespace openmldb::sdk diff --git a/src/sdk/sql_router.h b/src/sdk/sql_router.h index 68186a83b00..07b2e3b7734 100644 --- a/src/sdk/sql_router.h +++ b/src/sdk/sql_router.h @@ -80,6 +80,22 @@ class ExplainInfo { virtual const std::string& GetRequestDbName() = 0; }; +struct DAGNode { + DAGNode(absl::string_view name, absl::string_view sql) : name(name), sql(sql) {} + DAGNode(absl::string_view name, absl::string_view sql, const std::vector>& producers) + : name(name), sql(sql), producers(producers) {} + + std::string name; + std::string sql; + std::vector> producers; + + bool operator==(const DAGNode& op) const noexcept; + + std::string DebugString() const; + + friend std::ostream& operator<<(std::ostream& os, const DAGNode& obj); +}; + class QueryFuture { public: QueryFuture() {} @@ -114,7 +130,7 @@ class SQLRouter { virtual bool ExecuteInsert(const std::string& db, const std::string& name, int tid, int partition_num, hybridse::sdk::ByteArrayPtr dimension, int dimension_len, - hybridse::sdk::ByteArrayPtr value, int len, hybridse::sdk::Status* status) = 0; + hybridse::sdk::ByteArrayPtr value, int len, bool put_if_absent, hybridse::sdk::Status* status) = 0; virtual bool ExecuteDelete(std::shared_ptr row, hybridse::sdk::Status* status) = 0; @@ -234,6 +250,11 @@ class SQLRouter { virtual bool IsOnlineMode() = 0; virtual std::string GetDatabase() = 0; + + // parse SQL query into DAG representation + // + // Optional CONFIG clause from SQL query statement is skipped in output DAG + std::shared_ptr SQLToDAG(const std::string& query, hybridse::sdk::Status* status); }; std::shared_ptr NewClusterSQLRouter(const SQLRouterOptions& options); diff --git a/src/sdk/sql_router_sdk.i b/src/sdk/sql_router_sdk.i index 22ee63b3e6d..07bb3d5741b 100644 --- a/src/sdk/sql_router_sdk.i +++ b/src/sdk/sql_router_sdk.i @@ -69,6 +69,7 @@ %template(VectorUint32) std::vector; %template(VectorString) std::vector; +%shared_ptr(openmldb::sdk::DAGNode); %{ #include 
"sdk/sql_router.h" #include "sdk/result_set.h" @@ -117,3 +118,5 @@ using openmldb::sdk::DefaultValueContainer; %template(DBTable) std::pair; %template(DBTableVector) std::vector>; + +%template(DAGNodeList) std::vector>; diff --git a/src/sdk/sql_router_test.cc b/src/sdk/sql_router_test.cc index cc91375c6d7..daa2a1ed059 100644 --- a/src/sdk/sql_router_test.cc +++ b/src/sdk/sql_router_test.cc @@ -1226,6 +1226,69 @@ TEST_F(SQLRouterTest, DDLParseMethodsCombineIndex) { ddl_list.at(0)); } +TEST_F(SQLRouterTest, SQLToDAG) { + auto sql = R"(WITH q1 as ( + WITH q3 as (select * from t1 ORDER BY ts), + q4 as (select * from t2 LIMIT 10) + + select * from q3 left join q4 on q3.key = q4.key + ), + q2 as (select * from t3) + + select * from q1 last join q2 on q1.id = q2.id)"; + + + hybridse::sdk::Status status; + auto dag = router_->SQLToDAG(sql, &status); + ASSERT_TRUE(status.IsOK()); + + std::string_view q3 = R"(SELECT + * +FROM + t1 +ORDER BY ts +)"; + std::string_view q4 = R"(SELECT + * +FROM + t2 +LIMIT 10 +)"; + std::string_view q2 = R"(SELECT + * +FROM + t3 +)"; + std::string_view q1 = R"(SELECT + * +FROM + q3 + LEFT JOIN + q4 + ON q3.key = q4.key +)"; + std::string_view q = R"(SELECT + * +FROM + q1 + LAST JOIN + q2 + ON q1.id = q2.id +)"; + + std::shared_ptr dag_q3 = std::make_shared("q3", q3); + std::shared_ptr dag_q4 = std::make_shared("q4", q4); + + std::shared_ptr dag_q1 = + std::make_shared("q1", q1, std::vector>({dag_q3, dag_q4})); + std::shared_ptr dag_q2 = std::make_shared("q2", q2); + + std::shared_ptr expect = + std::make_shared("", q, std::vector>({dag_q1, dag_q2})); + + EXPECT_EQ(*dag, *expect); +} + } // namespace openmldb::sdk int main(int argc, char** argv) { diff --git a/src/storage/aggregator.cc b/src/storage/aggregator.cc index 7814c687be5..4615c87bc20 100644 --- a/src/storage/aggregator.cc +++ b/src/storage/aggregator.cc @@ -641,9 +641,9 @@ bool Aggregator::FlushAggrBuffer(const std::string& key, const std::string& filt auto dimension = entry.add_dimensions(); dimension->set_idx(aggr_index_pos_); dimension->set_key(key); - bool ok = aggr_table_->Put(time, entry.value(), entry.dimensions()); - if (!ok) { - PDLOG(ERROR, "Aggregator put failed"); + auto st = aggr_table_->Put(time, entry.value(), entry.dimensions()); + if (!st.ok()) { + LOG(ERROR) << "Aggregator put failed: " << st.ToString(); return false; } entry.set_pk(key); diff --git a/src/storage/disk_table.cc b/src/storage/disk_table.cc index b41c9f8fd3c..af35ab9a170 100644 --- a/src/storage/disk_table.cc +++ b/src/storage/disk_table.cc @@ -227,7 +227,8 @@ bool DiskTable::Put(const std::string& pk, uint64_t time, const char* data, uint } } -bool DiskTable::Put(uint64_t time, const std::string& value, const Dimensions& dimensions) { +absl::Status DiskTable::Put(uint64_t time, const std::string& value, const Dimensions& dimensions, bool put_if_absent) { + // disk table will update if key-time is the same, so no need to handle put_if_absent const int8_t* data = reinterpret_cast(value.data()); std::string uncompress_data; if (GetCompressType() == openmldb::type::kSnappy) { @@ -237,15 +238,14 @@ bool DiskTable::Put(uint64_t time, const std::string& value, const Dimensions& d uint8_t version = codec::RowView::GetSchemaVersion(data); auto decoder = GetVersionDecoder(version); if (decoder == nullptr) { - PDLOG(WARNING, "invalid schema version %u, tid %u pid %u", version, id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": invalid schema version ", version)); } rocksdb::WriteBatch batch; 
for (auto it = dimensions.begin(); it != dimensions.end(); ++it) { auto index_def = table_index_.GetIndex(it->idx()); if (!index_def || !index_def->IsReady()) { - PDLOG(WARNING, "failed putting key %s to dimension %u in table tid %u pid %u", it->key().c_str(), - it->idx(), id_, pid_); + PDLOG(WARNING, "failed putting key %s to dimension %u in table tid %u pid %u", it->key().c_str(), it->idx(), + id_, pid_); } int32_t inner_pos = table_index_.GetInnerIndexPos(it->idx()); auto inner_index = table_index_.GetInnerIndex(inner_pos); @@ -256,12 +256,10 @@ bool DiskTable::Put(uint64_t time, const std::string& value, const Dimensions& d if (ts_col->IsAutoGenTs()) { ts = time; } else if (decoder->GetInteger(data, ts_col->GetId(), ts_col->GetType(), &ts) != 0) { - PDLOG(WARNING, "get ts failed. tid %u pid %u", id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": get ts failed")); } if (ts < 0) { - PDLOG(WARNING, "ts %ld is negative. tid %u pid %u", ts, id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": ts is negative ", ts)); } if (inner_index->GetIndex().size() > 1) { combine_key = CombineKeyTs(it->key(), ts, ts_col->GetId()); @@ -275,10 +273,9 @@ bool DiskTable::Put(uint64_t time, const std::string& value, const Dimensions& d auto s = db_->Write(write_opts_, &batch); if (s.ok()) { offset_.fetch_add(1, std::memory_order_relaxed); - return true; + return absl::OkStatus(); } else { - DEBUGLOG("Put failed. tid %u pid %u msg %s", id_, pid_, s.ToString().c_str()); - return false; + return absl::InternalError(absl::StrCat(id_, ".", pid_, ": ", s.ToString())); } } diff --git a/src/storage/disk_table.h b/src/storage/disk_table.h index be549d0c2cd..7b471bac45e 100644 --- a/src/storage/disk_table.h +++ b/src/storage/disk_table.h @@ -21,6 +21,7 @@ #include #include #include + #include "base/slice.h" #include "base/status.h" #include "common/timer.h" @@ -102,7 +103,7 @@ class AbsoluteTTLCompactionFilter : public rocksdb::CompactionFilter { return false; } uint32_t ts_idx = *((uint32_t*)(key.data() + key.size() - TS_LEN - // NOLINT - TS_POS_LEN)); + TS_POS_LEN)); bool has_found = false; for (const auto& index : indexs) { auto ts_col = index->GetTsColumn(); @@ -110,7 +111,7 @@ class AbsoluteTTLCompactionFilter : public rocksdb::CompactionFilter { return false; } if (ts_col->GetId() == ts_idx && - index->GetTTL()->ttl_type == openmldb::storage::TTLType::kAbsoluteTime) { + index->GetTTL()->ttl_type == openmldb::storage::TTLType::kAbsoluteTime) { real_ttl = index->GetTTL()->abs_ttl; has_found = true; break; @@ -172,7 +173,8 @@ class DiskTable : public Table { bool Put(const std::string& pk, uint64_t time, const char* data, uint32_t size) override; - bool Put(uint64_t time, const std::string& value, const Dimensions& dimensions) override; + absl::Status Put(uint64_t time, const std::string& value, const Dimensions& dimensions, + bool put_if_absent) override; bool Get(uint32_t idx, const std::string& pk, uint64_t ts, std::string& value); // NOLINT @@ -183,8 +185,8 @@ class DiskTable : public Table { base::Status Truncate(); - bool Delete(uint32_t idx, const std::string& pk, - const std::optional& start_ts, const std::optional& end_ts) override; + bool Delete(uint32_t idx, const std::string& pk, const std::optional& start_ts, + const std::optional& end_ts) override; uint64_t GetExpireTime(const TTLSt& ttl_st) override; @@ -233,7 +235,7 @@ class DiskTable : public Table { uint64_t GetRecordByteSize() const override { return 0; } 
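// --- Illustrative sketch (not part of this patch) ---------------------------
// With Put() changed from bool to absl::Status and gaining put_if_absent, a
// caller can distinguish a duplicate row (kAlreadyExists, raised by the mem
// table path) from a real write failure. Hypothetical caller-side handling,
// assuming the include path "storage/table.h" and the Dimensions alias used by
// the overrides above:
#include <cstdint>
#include <string>
#include "absl/status/status.h"
#include "glog/logging.h"
#include "storage/table.h"

bool PutRowIgnoringDuplicates(::openmldb::storage::Table* table, uint64_t time,
                              const std::string& value,
                              const ::openmldb::storage::Dimensions& dims) {
    absl::Status st = table->Put(time, value, dims, /*put_if_absent=*/true);
    if (absl::IsAlreadyExists(st)) {
        return true;  // row already present: INSERT ... IGNORE treats this as success
    }
    if (!st.ok()) {
        LOG(WARNING) << "put failed: " << st.ToString();
        return false;
    }
    return true;
}
// -----------------------------------------------------------------------------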
uint64_t GetRecordIdxByteSize() override; - int GetCount(uint32_t index, const std::string& pk, uint64_t& count) override; // NOLINT + int GetCount(uint32_t index, const std::string& pk, uint64_t& count) override; // NOLINT private: base::Status Delete(uint32_t idx, const std::string& pk, uint64_t start_ts, const std::optional& end_ts); diff --git a/src/storage/disk_table_test.cc b/src/storage/disk_table_test.cc index 2a4e0d53c98..04a5d6edbb3 100644 --- a/src/storage/disk_table_test.cc +++ b/src/storage/disk_table_test.cc @@ -111,7 +111,7 @@ TEST_F(DiskTableTest, MultiDimensionPut) { mapping.insert(std::make_pair("idx1", 1)); mapping.insert(std::make_pair("idx2", 2)); std::string table_path = FLAGS_hdd_root_path + "/2_1"; - DiskTable* table = new DiskTable("yjtable2", 2, 1, mapping, 10, ::openmldb::type::TTLType::kAbsoluteTime, + Table* table = new DiskTable("yjtable2", 2, 1, mapping, 10, ::openmldb::type::TTLType::kAbsoluteTime, ::openmldb::common::StorageMode::kHDD, table_path); ASSERT_TRUE(table->Init()); ASSERT_EQ(3, (int64_t)table->GetIdxCnt()); @@ -136,7 +136,7 @@ TEST_F(DiskTableTest, MultiDimensionPut) { d2->set_idx(2); std::string value; ASSERT_EQ(0, sdk_codec.EncodeRow(row, &value)); - bool ok = table->Put(1, value, dimensions); + bool ok = table->Put(1, value, dimensions).ok(); ASSERT_TRUE(ok); // some functions in disk table need to be implemented. // refer to issue #1238 @@ -202,7 +202,7 @@ TEST_F(DiskTableTest, MultiDimensionPut) { row = {"valuea", "valueb", "valuec"}; ASSERT_EQ(0, sdk_codec.EncodeRow(row, &value)); - ASSERT_TRUE(table->Put(2, value, dimensions)); + ASSERT_TRUE(table->Put(2, value, dimensions).ok()); it = table->NewIterator(0, "key2", ticket); it->SeekToFirst(); @@ -223,7 +223,7 @@ TEST_F(DiskTableTest, MultiDimensionPut) { delete it; std::string val; - ASSERT_TRUE(table->Get(1, "key1", 2, val)); + ASSERT_TRUE(reinterpret_cast(table)->Get(1, "key1", 2, val)); data = reinterpret_cast(val.data()); version = codec::RowView::GetSchemaVersion(data); decoder = table->GetVersionDecoder(version); @@ -277,7 +277,7 @@ TEST_F(DiskTableTest, LongPut) { mapping.insert(std::make_pair("idx0", 0)); mapping.insert(std::make_pair("idx1", 1)); std::string table_path = FLAGS_ssd_root_path + "/3_1"; - DiskTable* table = new DiskTable("yjtable3", 3, 1, mapping, 10, ::openmldb::type::TTLType::kAbsoluteTime, + Table* table = new DiskTable("yjtable3", 3, 1, mapping, 10, ::openmldb::type::TTLType::kAbsoluteTime, ::openmldb::common::StorageMode::kSSD, table_path); auto meta = ::openmldb::test::GetTableMeta({"idx0", "idx1"}); ::openmldb::codec::SDKCodec sdk_codec(meta); @@ -297,7 +297,7 @@ TEST_F(DiskTableTest, LongPut) { std::string value; ASSERT_EQ(0, sdk_codec.EncodeRow(row, &value)); for (int k = 0; k < 10; k++) { - ASSERT_TRUE(table->Put(ts + k, value, dimensions)); + ASSERT_TRUE(table->Put(ts + k, value, dimensions).ok()); } } for (int idx = 0; idx < 10; idx++) { @@ -465,7 +465,7 @@ TEST_F(DiskTableTest, TraverseIterator) { } ASSERT_EQ(20, count); std::string val; - ASSERT_TRUE(table->Get(0, "test98", 9548, val)); + ASSERT_TRUE(reinterpret_cast(table)->Get(0, "test98", 9548, val)); ASSERT_EQ("valu8", val); delete it; delete table; @@ -733,7 +733,7 @@ TEST_F(DiskTableTest, CompactFilter) { std::map mapping; mapping.insert(std::make_pair("idx0", 0)); std::string table_path = FLAGS_hdd_root_path + "/10_1"; - DiskTable* table = new DiskTable("t1", 10, 1, mapping, 10, ::openmldb::type::TTLType::kAbsoluteTime, + Table* table = new DiskTable("t1", 10, 1, mapping, 10, 
::openmldb::type::TTLType::kAbsoluteTime, ::openmldb::common::StorageMode::kHDD, table_path); ASSERT_TRUE(table->Init()); uint64_t cur_time = ::baidu::common::timer::get_micros() / 1000; @@ -754,24 +754,24 @@ TEST_F(DiskTableTest, CompactFilter) { for (int k = 0; k < 5; k++) { std::string value; if (k > 2) { - ASSERT_TRUE(table->Get(key, ts - k - 10 * 60 * 1000, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(key, ts - k - 10 * 60 * 1000, value)); ASSERT_EQ("value9", value); } else { - ASSERT_TRUE(table->Get(key, ts - k, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(key, ts - k, value)); ASSERT_EQ("value", value); } } } - table->CompactDB(); + reinterpret_cast(table)->CompactDB(); for (int idx = 0; idx < 100; idx++) { std::string key = "test" + std::to_string(idx); uint64_t ts = cur_time; for (int k = 0; k < 5; k++) { std::string value; if (k > 2) { - ASSERT_FALSE(table->Get(key, ts - k - 10 * 60 * 1000, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(key, ts - k - 10 * 60 * 1000, value)); } else { - ASSERT_TRUE(table->Get(key, ts - k, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(key, ts - k, value)); ASSERT_EQ("value", value); } } @@ -794,7 +794,7 @@ TEST_F(DiskTableTest, CompactFilterMulTs) { SchemaCodec::SetIndex(table_meta.add_column_key(), "mcc", "mcc", "ts2", ::openmldb::type::kAbsoluteTime, 5, 0); std::string table_path = FLAGS_hdd_root_path + "/11_1"; - DiskTable* table = new DiskTable(table_meta, table_path); + Table* table = new DiskTable(table_meta, table_path); ASSERT_TRUE(table->Init()); codec::SDKCodec codec(table_meta); @@ -818,7 +818,7 @@ TEST_F(DiskTableTest, CompactFilterMulTs) { std::to_string(cur_time - i * 60 * 1000)}; std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - ASSERT_TRUE(table->Put(cur_time - i * 60 * 1000, value, dims)); + ASSERT_TRUE(table->Put(cur_time - i * 60 * 1000, value, dims).ok()); } } else { @@ -828,7 +828,7 @@ TEST_F(DiskTableTest, CompactFilterMulTs) { std::to_string(cur_time - i)}; std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - ASSERT_TRUE(table->Put(cur_time - i, value, dims)); + ASSERT_TRUE(table->Put(cur_time - i, value, dims).ok()); } } } @@ -860,11 +860,11 @@ TEST_F(DiskTableTest, CompactFilterMulTs) { std::string e_value; ASSERT_EQ(0, codec.EncodeRow(row, &e_value)); std::string value; - ASSERT_TRUE(table->Get(0, key, ts - i * 60 * 1000, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(0, key, ts - i * 60 * 1000, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(1, key, ts - i * 60 * 1000, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(1, key, ts - i * 60 * 1000, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(2, key1, ts - i * 60 * 1000, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(2, key1, ts - i * 60 * 1000, value)); } } else { @@ -874,15 +874,15 @@ TEST_F(DiskTableTest, CompactFilterMulTs) { std::string e_value; ASSERT_EQ(0, codec.EncodeRow(row, &e_value)); std::string value; - ASSERT_TRUE(table->Get(0, key, ts - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(0, key, ts - i, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(1, key, ts - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(1, key, ts - i, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(2, key1, ts - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(2, key1, ts - i, value)); } } } - table->CompactDB(); + reinterpret_cast(table)->CompactDB(); iter = table->NewIterator(0, "card0", ticket); iter->SeekToFirst(); while (iter->Valid()) { @@ 
-908,18 +908,18 @@ TEST_F(DiskTableTest, CompactFilterMulTs) { ASSERT_EQ(0, codec.EncodeRow(row, &e_value)); std::string value; if (i < 3) { - ASSERT_TRUE(table->Get(0, key, cur_ts, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(0, key, cur_ts, value)); ASSERT_EQ(e_value, value); } else { - ASSERT_FALSE(table->Get(0, key, cur_ts, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(0, key, cur_ts, value)); } if (i < 5) { - ASSERT_TRUE(table->Get(1, key, cur_ts, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(1, key, cur_ts, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(2, key1, cur_ts, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(2, key1, cur_ts, value)); } else { - ASSERT_FALSE(table->Get(1, key, cur_ts, value)); - ASSERT_FALSE(table->Get(2, key1, cur_ts, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(1, key, cur_ts, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(2, key1, cur_ts, value)); } } } else { @@ -929,11 +929,11 @@ TEST_F(DiskTableTest, CompactFilterMulTs) { std::string e_value; ASSERT_EQ(0, codec.EncodeRow(row, &e_value)); std::string value; - ASSERT_TRUE(table->Get(0, key, ts - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(0, key, ts - i, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(1, key, ts - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(1, key, ts - i, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(2, key1, ts - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(2, key1, ts - i, value)); } } } @@ -955,7 +955,8 @@ TEST_F(DiskTableTest, GcHeadMulTs) { SchemaCodec::SetIndex(table_meta.add_column_key(), "mcc", "mcc", "ts2", ::openmldb::type::kLatestTime, 0, 5); std::string table_path = FLAGS_hdd_root_path + "/12_1"; - DiskTable* table = new DiskTable(table_meta, table_path); + // Table base class doesn't have Get method, cast to DiskTable to call Get + Table* table = new DiskTable(table_meta, table_path); ASSERT_TRUE(table->Init()); codec::SDKCodec codec(table_meta); @@ -980,7 +981,7 @@ TEST_F(DiskTableTest, GcHeadMulTs) { std::to_string(cur_time - i), std::to_string(cur_time - i)}; std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - ASSERT_TRUE(table->Put(cur_time - i, value, dims)); + ASSERT_TRUE(table->Put(cur_time - i, value, dims).ok()); } } Ticket ticket; @@ -1006,15 +1007,15 @@ TEST_F(DiskTableTest, GcHeadMulTs) { ASSERT_EQ(0, codec.EncodeRow(row, &e_value)); std::string value; if (idx == 50 && i > 2) { - ASSERT_FALSE(table->Get(0, key, cur_time - i, value)); - ASSERT_FALSE(table->Get(1, key, cur_time - i, value)); - ASSERT_FALSE(table->Get(2, key1, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(0, key, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(1, key, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(2, key1, cur_time - i, value)); } else { - ASSERT_TRUE(table->Get(0, key, cur_time - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(0, key, cur_time - i, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(1, key, cur_time - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(1, key, cur_time - i, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(2, key1, cur_time - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(2, key1, cur_time - i, value)); } } } @@ -1041,24 +1042,24 @@ TEST_F(DiskTableTest, GcHeadMulTs) { ASSERT_EQ(0, codec.EncodeRow(row, &e_value)); std::string value; if (idx == 50 && i > 2) { - ASSERT_FALSE(table->Get(0, key, cur_time - i, value)); - 
ASSERT_FALSE(table->Get(1, key, cur_time - i, value)); - ASSERT_FALSE(table->Get(2, key1, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(0, key, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(1, key, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(2, key1, cur_time - i, value)); } else if (i < 3) { - ASSERT_TRUE(table->Get(0, key, cur_time - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(0, key, cur_time - i, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(1, key, cur_time - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(1, key, cur_time - i, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(2, key1, cur_time - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(2, key1, cur_time - i, value)); } else if (i < 5) { - ASSERT_FALSE(table->Get(0, key, cur_time - i, value)); - ASSERT_TRUE(table->Get(1, key, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(0, key, cur_time - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(1, key, cur_time - i, value)); ASSERT_EQ(e_value, value); - ASSERT_TRUE(table->Get(2, key1, cur_time - i, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(2, key1, cur_time - i, value)); } else { - ASSERT_FALSE(table->Get(0, key, cur_time - i, value)); - ASSERT_FALSE(table->Get(1, key, cur_time - i, value)); - ASSERT_FALSE(table->Get(2, key1, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(0, key, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(1, key, cur_time - i, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(2, key1, cur_time - i, value)); } } } @@ -1089,7 +1090,7 @@ TEST_F(DiskTableTest, GcHead) { uint64_t ts = 9537; for (int k = 0; k < 5; k++) { std::string value; - ASSERT_TRUE(table->Get(key, ts + k, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(key, ts + k, value)); if (idx == 10 && k == 2) { ASSERT_EQ("value8", value); } else { @@ -1104,9 +1105,9 @@ TEST_F(DiskTableTest, GcHead) { for (int k = 0; k < 5; k++) { std::string value; if (k < 2) { - ASSERT_FALSE(table->Get(key, ts + k, value)); + ASSERT_FALSE(reinterpret_cast(table)->Get(key, ts + k, value)); } else { - ASSERT_TRUE(table->Get(key, ts + k, value)); + ASSERT_TRUE(reinterpret_cast(table)->Get(key, ts + k, value)); if (idx == 10 && k == 2) { ASSERT_EQ("value8", value); } else { diff --git a/src/storage/key_entry.h b/src/storage/key_entry.h index e8969c9b832..1b5f4778f4f 100644 --- a/src/storage/key_entry.h +++ b/src/storage/key_entry.h @@ -49,11 +49,19 @@ struct DataBlock { delete[] data; data = nullptr; } + + bool EqualWithoutCnt(const DataBlock& other) const { + if (size != other.size) { + return false; + } + // you can improve it ref RowBuilder::InitBuffer header version + return memcmp(data, other.data, size) == 0; + } }; // the desc time comparator struct TimeComparator { - int operator() (uint64_t a, uint64_t b) const { + int operator()(uint64_t a, uint64_t b) const { if (a > b) { return -1; } else if (a == b) { @@ -86,7 +94,6 @@ class KeyEntry { std::atomic count_; }; - } // namespace storage } // namespace openmldb diff --git a/src/storage/mem_table.cc b/src/storage/mem_table.cc index a50e3c6dc82..3a57ffc4e93 100644 --- a/src/storage/mem_table.cc +++ b/src/storage/mem_table.cc @@ -25,6 +25,7 @@ #include "base/slice.h" #include "common/timer.h" #include "gflags/gflags.h" +#include "schema/index_util.h" #include "storage/record.h" #include "storage/mem_table_iterator.h" @@ -139,21 +140,21 @@ bool MemTable::Put(const std::string& 
pk, uint64_t time, const char* data, uint3 return true; } -bool MemTable::Put(uint64_t time, const std::string& value, const Dimensions& dimensions) { +absl::Status MemTable::Put(uint64_t time, const std::string& value, const Dimensions& dimensions, bool put_if_absent) { if (dimensions.empty()) { PDLOG(WARNING, "empty dimension. tid %u pid %u", id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": empty dimension")); } if (value.length() < codec::HEADER_LENGTH) { PDLOG(WARNING, "invalid value. tid %u pid %u", id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": invalid value")); } + // inner index pos: -1 means invalid, so it's positive in inner_index_key_map std::map inner_index_key_map; for (auto iter = dimensions.begin(); iter != dimensions.end(); iter++) { int32_t inner_pos = table_index_.GetInnerIndexPos(iter->idx()); if (inner_pos < 0) { - PDLOG(WARNING, "invalid dimension. dimension idx %u, tid %u pid %u", iter->idx(), id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": invalid dimension idx ", iter->idx())); } inner_index_key_map.emplace(inner_pos, iter->key()); } @@ -167,15 +168,13 @@ bool MemTable::Put(uint64_t time, const std::string& value, const Dimensions& di uint8_t version = codec::RowView::GetSchemaVersion(data); auto decoder = GetVersionDecoder(version); if (decoder == nullptr) { - PDLOG(WARNING, "invalid schema version %u, tid %u pid %u", version, id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": invalid schema version ", version)); } std::map> ts_value_map; for (const auto& kv : inner_index_key_map) { auto inner_index = table_index_.GetInnerIndex(kv.first); if (!inner_index) { - PDLOG(WARNING, "invalid inner index pos %d. tid %u pid %u", kv.first, id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": invalid inner index pos ", kv.first)); } std::map ts_map; for (const auto& index_def : inner_index->GetIndex()) { @@ -188,13 +187,12 @@ bool MemTable::Put(uint64_t time, const std::string& value, const Dimensions& di if (ts_col->IsAutoGenTs()) { ts = time; } else if (decoder->GetInteger(data, ts_col->GetId(), ts_col->GetType(), &ts) != 0) { - PDLOG(WARNING, "get ts failed. tid %u pid %u", id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": get ts failed")); } if (ts < 0) { - PDLOG(WARNING, "ts %ld is negative. tid %u pid %u", ts, id_, pid_); - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": ts is negative ", ts)); } + // TODO(hw): why uint32_t to int32_t? 
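// --- Illustrative sketch (not part of this patch) ---------------------------
// Each index resolves its own timestamp in the hunk above: an auto-generated
// ts column takes the Put() time argument, otherwise the value is decoded from
// the row, and negative timestamps are rejected. A simplified standalone
// version of that rule (ResolveTs and its optional argument are hypothetical)
// might be:
#include <cstdint>
#include <optional>
#include "absl/status/statusor.h"

absl::StatusOr<uint64_t> ResolveTs(bool is_auto_gen_ts, uint64_t put_time,
                                   std::optional<int64_t> decoded_ts) {
    if (is_auto_gen_ts) {
        return put_time;  // index has no real ts column: use the Put() call time
    }
    if (!decoded_ts.has_value()) {
        return absl::InvalidArgumentError("get ts failed");
    }
    if (*decoded_ts < 0) {
        return absl::InvalidArgumentError("ts is negative");
    }
    return static_cast<uint64_t>(*decoded_ts);
}
// -----------------------------------------------------------------------------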
ts_map.emplace(ts_col->GetId(), ts); real_ref_cnt++; } @@ -204,7 +202,7 @@ bool MemTable::Put(uint64_t time, const std::string& value, const Dimensions& di } } if (ts_value_map.empty()) { - return false; + return absl::InvalidArgumentError(absl::StrCat(id_, ".", pid_, ": empty ts value map")); } auto* block = new DataBlock(real_ref_cnt, value.c_str(), value.length()); for (const auto& kv : inner_index_key_map) { @@ -217,10 +215,12 @@ bool MemTable::Put(uint64_t time, const std::string& value, const Dimensions& di seg_idx = ::openmldb::base::hash(kv.second.data(), kv.second.size(), SEED) % seg_cnt_; } Segment* segment = segments_[kv.first][seg_idx]; - segment->Put(::openmldb::base::Slice(kv.second), iter->second, block); + if (!segment->Put(kv.second, iter->second, block, put_if_absent)) { + return absl::AlreadyExistsError("data exists"); // let caller know exists + } } record_byte_size_.fetch_add(GetRecordSize(value.length())); - return true; + return absl::OkStatus(); } bool MemTable::Delete(const ::openmldb::api::LogEntry& entry) { @@ -549,6 +549,7 @@ uint64_t MemTable::GetRecordIdxCnt() { if (!index_def || !index_def->IsReady()) { return record_idx_cnt; } + uint32_t inner_idx = index_def->GetInnerPos(); auto inner_index = table_index_.GetInnerIndex(inner_idx); int32_t ts_col_id = -1; @@ -626,13 +627,15 @@ bool MemTable::AddIndex(const ::openmldb::common::ColumnKey& column_key) { PDLOG(WARNING, "index %s is exist. tid %u pid %u", column_key.index_name().c_str(), id_, pid_); return false; } - new_table_meta->mutable_column_key(index_def->GetId())->CopyFrom(column_key); if (column_key.has_ttl()) { index_def->SetTTL(::openmldb::storage::TTLSt(column_key.ttl())); } + } + int index_pos = schema::IndexUtil::GetPosition(column_key, new_table_meta->column_key()); + if (index_pos >= 0) { + new_table_meta->mutable_column_key(index_pos)->CopyFrom(column_key); } else { - ::openmldb::common::ColumnKey* added_column_key = new_table_meta->add_column_key(); - added_column_key->CopyFrom(column_key); + new_table_meta->add_column_key()->CopyFrom(column_key); } if (!index_def) { auto cols = GetSchema(); @@ -662,7 +665,7 @@ bool MemTable::AddIndex(const ::openmldb::common::ColumnKey& column_key) { } ts_vec.push_back(ts_iter->second.GetId()); } else { - ts_vec.push_back(DEFUALT_TS_COL_ID); + ts_vec.push_back(DEFAULT_TS_COL_ID); } uint32_t inner_id = table_index_.GetAllInnerIndex()->size(); Segment** seg_arr = new Segment*[seg_cnt_]; @@ -682,7 +685,7 @@ bool MemTable::AddIndex(const ::openmldb::common::ColumnKey& column_key) { auto ts_iter = schema.find(column_key.ts_name()); index_def->SetTsColumn(std::make_shared(ts_iter->second)); } else { - index_def->SetTsColumn(std::make_shared(DEFUALT_TS_COL_NAME, DEFUALT_TS_COL_ID, + index_def->SetTsColumn(std::make_shared(DEFAULT_TS_COL_NAME, DEFAULT_TS_COL_ID, ::openmldb::type::kTimestamp, true)); } if (column_key.has_ttl()) { @@ -721,14 +724,14 @@ bool MemTable::DeleteIndex(const std::string& idx_name) { new_table_meta->mutable_column_key(index_def->GetId())->set_flag(1); } std::atomic_store_explicit(&table_meta_, new_table_meta, std::memory_order_release); - index_def->SetStatus(IndexStatus::kWaiting); + index_def->SetStatus(IndexStatus::kWaiting); // let gc do deletion return true; } ::hybridse::vm::WindowIterator* MemTable::NewWindowIterator(uint32_t index) { std::shared_ptr index_def = table_index_.GetIndex(index); if (!index_def || !index_def->IsReady()) { - LOG(WARNING) << "index id " << index << " not found. 
tid " << id_ << " pid " << pid_; + LOG(WARNING) << "index id " << index << " not found. tid " << id_ << " pid " << pid_; return nullptr; } uint64_t expire_time = 0; diff --git a/src/storage/mem_table.h b/src/storage/mem_table.h index 8ae1964e0ef..e85762a97dc 100644 --- a/src/storage/mem_table.h +++ b/src/storage/mem_table.h @@ -51,7 +51,8 @@ class MemTable : public Table { bool Put(const std::string& pk, uint64_t time, const char* data, uint32_t size) override; - bool Put(uint64_t time, const std::string& value, const Dimensions& dimensions) override; + absl::Status Put(uint64_t time, const std::string& value, const Dimensions& dimensions, + bool put_if_absent) override; bool GetBulkLoadInfo(::openmldb::api::BulkLoadInfoResponse* response); @@ -59,8 +60,8 @@ class MemTable : public Table { const ::google::protobuf::RepeatedPtrField<::openmldb::api::BulkLoadIndex>& indexes); bool Delete(const ::openmldb::api::LogEntry& entry) override; - bool Delete(uint32_t idx, const std::string& key, - const std::optional& start_ts, const std::optional& end_ts); + bool Delete(uint32_t idx, const std::string& key, const std::optional& start_ts, + const std::optional& end_ts); // use the first demission TableIterator* NewIterator(const std::string& pk, Ticket& ticket) override; diff --git a/src/storage/schema.cc b/src/storage/schema.cc index 943adc8de95..3250a047a8b 100644 --- a/src/storage/schema.cc +++ b/src/storage/schema.cc @@ -129,8 +129,6 @@ uint32_t InnerIndexSt::GetKeyEntryMaxHeight(uint32_t abs_max_height, uint32_t la return max_height; } -bool ColumnDefSortFunc(const ColumnDef& cd_a, const ColumnDef& cd_b) { return (cd_a.GetId() < cd_b.GetId()); } - TableIndex::TableIndex() { indexs_ = std::make_shared>>(); inner_indexs_ = std::make_shared>>(); @@ -218,7 +216,7 @@ int TableIndex::ParseFromMeta(const ::openmldb::api::TableMeta& table_meta) { index->SetTsColumn(col_map[ts_name]); } else { // set default ts col - index->SetTsColumn(std::make_shared(DEFUALT_TS_COL_NAME, DEFUALT_TS_COL_ID, + index->SetTsColumn(std::make_shared(DEFAULT_TS_COL_NAME, DEFAULT_TS_COL_ID, ::openmldb::type::kTimestamp, true)); } if (column_key.has_ttl()) { @@ -234,7 +232,7 @@ int TableIndex::ParseFromMeta(const ::openmldb::api::TableMeta& table_meta) { // add default dimension if (indexs_->empty()) { auto index = std::make_shared("idx0", 0); - index->SetTsColumn(std::make_shared(DEFUALT_TS_COL_NAME, DEFUALT_TS_COL_ID, + index->SetTsColumn(std::make_shared(DEFAULT_TS_COL_NAME, DEFAULT_TS_COL_ID, ::openmldb::type::kTimestamp, true)); if (AddIndex(index) < 0) { DLOG(WARNING) << "add index failed"; diff --git a/src/storage/schema.h b/src/storage/schema.h index c1b361999dc..39359761ed9 100644 --- a/src/storage/schema.h +++ b/src/storage/schema.h @@ -31,8 +31,8 @@ namespace openmldb::storage { static constexpr uint32_t MAX_INDEX_NUM = 200; -static constexpr uint32_t DEFUALT_TS_COL_ID = UINT32_MAX; -static constexpr const char* DEFUALT_TS_COL_NAME = "default_ts"; +static constexpr uint32_t DEFAULT_TS_COL_ID = UINT32_MAX; +static constexpr const char* DEFAULT_TS_COL_NAME = "default_ts"; enum TTLType { kAbsoluteTime = 1, kRelativeTime = 2, kLatestTime = 3, kAbsAndLat = 4, kAbsOrLat = 5 }; @@ -163,7 +163,7 @@ class ColumnDef { return false; } - inline bool IsAutoGenTs() const { return id_ == DEFUALT_TS_COL_ID; } + inline bool IsAutoGenTs() const { return id_ == DEFAULT_TS_COL_ID; } private: std::string name_; @@ -240,8 +240,6 @@ class InnerIndexSt { std::vector ts_; }; -bool ColumnDefSortFunc(const ColumnDef& cd_a, const ColumnDef& 
cd_b); - class TableIndex { public: TableIndex(); diff --git a/src/storage/schema_test.cc b/src/storage/schema_test.cc index 1c169697634..77840c13e93 100644 --- a/src/storage/schema_test.cc +++ b/src/storage/schema_test.cc @@ -233,9 +233,9 @@ TEST_F(SchemaTest, TsAndDefaultTs) { ::openmldb::storage::kAbsoluteTime); AssertIndex(*(table_index.GetIndex("key2")), "key2", "col1", "col7", 7, 10, 0, ::openmldb::storage::kAbsoluteTime); AssertIndex(*(table_index.GetIndex("key3")), "key3", "col2", "col6", 6, 10, 0, ::openmldb::storage::kAbsoluteTime); - AssertIndex(*(table_index.GetIndex("key4")), "key4", "col2", DEFUALT_TS_COL_NAME, DEFUALT_TS_COL_ID, + AssertIndex(*(table_index.GetIndex("key4")), "key4", "col2", DEFAULT_TS_COL_NAME, DEFAULT_TS_COL_ID, 10, 0, ::openmldb::storage::kAbsoluteTime); - AssertIndex(*(table_index.GetIndex("key5")), "key5", "col3", DEFUALT_TS_COL_NAME, DEFUALT_TS_COL_ID, + AssertIndex(*(table_index.GetIndex("key5")), "key5", "col3", DEFAULT_TS_COL_NAME, DEFAULT_TS_COL_ID, 10, 0, ::openmldb::storage::kAbsoluteTime); auto inner_index = table_index.GetAllInnerIndex(); ASSERT_EQ(inner_index->size(), 3u); @@ -243,10 +243,10 @@ TEST_F(SchemaTest, TsAndDefaultTs) { std::vector ts_vec0 = {6, 7}; AssertInnerIndex(*(table_index.GetInnerIndex(0)), 0, index0, ts_vec0); std::vector index1 = {"key3", "key4"}; - std::vector ts_vec1 = {6, DEFUALT_TS_COL_ID}; + std::vector ts_vec1 = {6, DEFAULT_TS_COL_ID}; AssertInnerIndex(*(table_index.GetInnerIndex(1)), 1, index1, ts_vec1); std::vector index2 = {"key5"}; - std::vector ts_vec2 = {DEFUALT_TS_COL_ID}; + std::vector ts_vec2 = {DEFAULT_TS_COL_ID}; AssertInnerIndex(*(table_index.GetInnerIndex(2)), 2, index2, ts_vec2); } diff --git a/src/storage/segment.cc b/src/storage/segment.cc index d79b6e85681..8255d27b7bd 100644 --- a/src/storage/segment.cc +++ b/src/storage/segment.cc @@ -15,7 +15,9 @@ */ #include "storage/segment.h" + #include + #include #include "base/glog_wrapper.h" @@ -64,9 +66,7 @@ Segment::Segment(uint8_t height, const std::vector& ts_idx_vec) } } -Segment::~Segment() { - delete entries_; -} +Segment::~Segment() { delete entries_; } void Segment::Release(StatisticsInfo* statistics_info) { std::unique_ptr it(entries_->NewIterator()); @@ -98,9 +98,7 @@ void Segment::Release(StatisticsInfo* statistics_info) { } } -void Segment::ReleaseAndCount(StatisticsInfo* statistics_info) { - Release(statistics_info); -} +void Segment::ReleaseAndCount(StatisticsInfo* statistics_info) { Release(statistics_info); } void Segment::ReleaseAndCount(const std::vector& id_vec, StatisticsInfo* statistics_info) { if (ts_cnt_ <= 1) { @@ -135,25 +133,28 @@ void Segment::ReleaseAndCount(const std::vector& id_vec, StatisticsInfo* } } -void Segment::Put(const Slice& key, uint64_t time, const char* data, uint32_t size) { +void Segment::Put(const Slice& key, uint64_t time, const char* data, uint32_t size, bool put_if_absent, + bool check_all_time) { if (ts_cnt_ > 1) { return; } auto* db = new DataBlock(1, data, size); - Put(key, time, db); + Put(key, time, db, put_if_absent, check_all_time); } -void Segment::Put(const Slice& key, uint64_t time, DataBlock* row) { +bool Segment::Put(const Slice& key, uint64_t time, DataBlock* row, bool put_if_absent, bool check_all_time) { if (ts_cnt_ > 1) { - return; + LOG(ERROR) << "wrong call"; + return false; } std::lock_guard lock(mu_); - PutUnlock(key, time, row); + return PutUnlock(key, time, row, put_if_absent, check_all_time); } -void Segment::PutUnlock(const Slice& key, uint64_t time, DataBlock* row) { +bool 
Segment::PutUnlock(const Slice& key, uint64_t time, DataBlock* row, bool put_if_absent, bool check_all_time) { void* entry = nullptr; uint32_t byte_size = 0; + // one key just one entry int ret = entries_->Get(key, entry); if (ret < 0 || entry == nullptr) { char* pk = new char[key.size()]; @@ -164,12 +165,17 @@ void Segment::PutUnlock(const Slice& key, uint64_t time, DataBlock* row) { uint8_t height = entries_->Insert(skey, entry); byte_size += GetRecordPkIdxSize(height, key.size(), key_entry_max_height_); pk_cnt_.fetch_add(1, std::memory_order_relaxed); + // no need to check if absent when first put + } else if (put_if_absent && ListContains(reinterpret_cast(entry), time, row, check_all_time)) { + return false; } + idx_cnt_vec_[0]->fetch_add(1, std::memory_order_relaxed); uint8_t height = reinterpret_cast(entry)->entries.Insert(time, row); reinterpret_cast(entry)->count_.fetch_add(1, std::memory_order_relaxed); byte_size += GetRecordTsIdxSize(height); idx_byte_size_.fetch_add(byte_size, std::memory_order_relaxed); + return true; } void Segment::BulkLoadPut(unsigned int key_entry_id, const Slice& key, uint64_t time, DataBlock* row) { @@ -201,16 +207,17 @@ void Segment::BulkLoadPut(unsigned int key_entry_id, const Slice& key, uint64_t } } -void Segment::Put(const Slice& key, const std::map& ts_map, DataBlock* row) { - uint32_t ts_size = ts_map.size(); - if (ts_size == 0) { - return; +bool Segment::Put(const Slice& key, const std::map& ts_map, DataBlock* row, bool put_if_absent) { + if (ts_map.empty()) { + return false; } if (ts_cnt_ == 1) { + bool ret = false; if (auto pos = ts_map.find(ts_idx_map_.begin()->first); pos != ts_map.end()) { - Put(key, pos->second, row); + // TODO(hw): why ts_map key is int32_t, default ts is uint32_t? + ret = Put(key, pos->second, row, put_if_absent, pos->first == DEFAULT_TS_COL_ID); } - return; + return ret; } void* entry_arr = nullptr; std::lock_guard lock(mu_); @@ -237,12 +244,16 @@ void Segment::Put(const Slice& key, const std::map& ts_map, D } } auto entry = reinterpret_cast(entry_arr)[pos->second]; + if (put_if_absent && ListContains(entry, kv.second, row, pos->first == DEFAULT_TS_COL_ID)) { + return false; + } uint8_t height = entry->entries.Insert(kv.second, row); entry->count_.fetch_add(1, std::memory_order_relaxed); byte_size += GetRecordTsIdxSize(height); idx_byte_size_.fetch_add(byte_size, std::memory_order_relaxed); idx_cnt_vec_[pos->second]->fetch_add(1, std::memory_order_relaxed); } + return true; } bool Segment::Delete(const std::optional& idx, const Slice& key) { @@ -289,8 +300,8 @@ bool Segment::Delete(const std::optional& idx, const Slice& key) { return true; } -bool Segment::Delete(const std::optional& idx, const Slice& key, - uint64_t ts, const std::optional& end_ts) { +bool Segment::Delete(const std::optional& idx, const Slice& key, uint64_t ts, + const std::optional& end_ts) { void* entry = nullptr; if (entries_->Get(key, entry) < 0 || entry == nullptr) { return true; @@ -347,7 +358,7 @@ bool Segment::Delete(const std::optional& idx, const Slice& key, } void Segment::FreeList(uint32_t ts_idx, ::openmldb::base::Node* node, - StatisticsInfo* statistics_info) { + StatisticsInfo* statistics_info) { while (node != nullptr) { statistics_info->IncrIdxCnt(ts_idx); ::openmldb::base::Node* tmp = node; @@ -365,7 +376,6 @@ void Segment::FreeList(uint32_t ts_idx, ::openmldb::base::Node it(entry->entries.NewIterator()); + if (check_all_time) { + it->SeekToFirst(); + while (it->Valid()) { + if (it->GetValue()->EqualWithoutCnt(*row)) { + return true; + 
} + it->Next(); + } + } else { + // Seek uses a descending time comparator, so it lands on the first entry whose key <= time + // (not valid if the list is empty or every key > time); Next() then moves to smaller keys + it->Seek(time); + while (it->Valid()) { + // key > time is just a protection, normally it should not happen + if (it->GetKey() < time || it->GetKey() > time) { + break; // no entry == time, or all entries == time have been checked + } + if (it->GetValue()->EqualWithoutCnt(*row)) { + return true; + } + it->Next(); + } + } + return false; +} + // fast gc with no global pause void Segment::Gc4TTL(const uint64_t time, StatisticsInfo* statistics_info) { uint64_t consumed = ::baidu::common::timer::get_micros(); @@ -606,8 +645,7 @@ void Segment::Gc4TTL(const uint64_t time, StatisticsInfo* statistics_info) { if (node == nullptr) { continue; } else if (node->GetKey() > time) { - DEBUGLOG("[Gc4TTL] segment gc with key %lu need not ttl, last node key %lu", - time, node->GetKey()); + DEBUGLOG("[Gc4TTL] segment gc with key %lu need not ttl, last node key %lu", time, node->GetKey()); continue; } node = nullptr; @@ -648,8 +686,7 @@ void Segment::Gc4TTLAndHead(const uint64_t time, const uint64_t keep_cnt, Statis if (node == nullptr) { continue; } else if (node->GetKey() > time) { - DEBUGLOG("[Gc4TTLAndHead] segment gc with key %lu need not ttl, last node key %lu", - time, node->GetKey()); + DEBUGLOG("[Gc4TTLAndHead] segment gc with key %lu need not ttl, last node key %lu", time, node->GetKey()); continue; } node = nullptr; @@ -663,8 +700,8 @@ void Segment::Gc4TTLAndHead(const uint64_t time, const uint64_t keep_cnt, Statis FreeList(0, node, statistics_info); entry->count_.fetch_sub(statistics_info->GetIdxCnt(0) - cur_idx_cnt, std::memory_order_relaxed); } - DEBUGLOG("[Gc4TTLAndHead] segment gc time %lu and keep cnt %lu consumed %lu, count %lu", - time, keep_cnt, (::baidu::common::timer::get_micros() - consumed) / 1000, statistics_info->GetIdxCnt(0) - old); + DEBUGLOG("[Gc4TTLAndHead] segment gc time %lu and keep cnt %lu consumed %lu, count %lu", time, keep_cnt, + (::baidu::common::timer::get_micros() - consumed) / 1000, statistics_info->GetIdxCnt(0) - old); idx_cnt_vec_[0]->fetch_sub(statistics_info->GetIdxCnt(0) - old, std::memory_order_relaxed); } @@ -709,8 +746,8 @@ void Segment::Gc4TTLOrHead(const uint64_t time, const uint64_t keep_cnt, Statist FreeList(0, node, statistics_info); entry->count_.fetch_sub(statistics_info->GetIdxCnt(0) - cur_idx_cnt, std::memory_order_relaxed); } - DEBUGLOG("[Gc4TTLAndHead] segment gc time %lu and keep cnt %lu consumed %lu, count %lu", - time, keep_cnt, (::baidu::common::timer::get_micros() - consumed) / 1000, statistics_info->GetIdxCnt(0) - old); + DEBUGLOG("[Gc4TTLAndHead] segment gc time %lu and keep cnt %lu consumed %lu, count %lu", time, keep_cnt, + (::baidu::common::timer::get_micros() - consumed) / 1000, statistics_info->GetIdxCnt(0) - old); idx_cnt_vec_[0]->fetch_sub(statistics_info->GetIdxCnt(0) - old, std::memory_order_relaxed); } @@ -754,8 +791,8 @@ MemTableIterator* Segment::NewIterator(const Slice& key, Ticket& ticket, type::C return new MemTableIterator(reinterpret_cast<KeyEntry*>(entry)->entries.NewIterator(), compress_type); } -MemTableIterator* Segment::NewIterator(const Slice& key, uint32_t idx, - Ticket& ticket, type::CompressType compress_type) { +MemTableIterator* Segment::NewIterator(const Slice& key, uint32_t idx, Ticket& ticket, + type::CompressType compress_type) { auto pos = ts_idx_map_.find(idx); if (pos == ts_idx_map_.end()) { return new MemTableIterator(nullptr, compress_type); diff --git
a/src/storage/segment.h b/src/storage/segment.h index fe58dd893a0..11322483832 100644 --- a/src/storage/segment.h +++ b/src/storage/segment.h @@ -70,20 +70,19 @@ class Segment { Segment(uint8_t height, const std::vector& ts_idx_vec); ~Segment(); - // Put time data - void Put(const Slice& key, uint64_t time, const char* data, uint32_t size); + // legacy interface called by memtable and ut + void Put(const Slice& key, uint64_t time, const char* data, uint32_t size, bool put_if_absent = false, + bool check_all_time = false); - void Put(const Slice& key, uint64_t time, DataBlock* row); - - void PutUnlock(const Slice& key, uint64_t time, DataBlock* row); + bool Put(const Slice& key, uint64_t time, DataBlock* row, bool put_if_absent = false, bool check_all_time = false); void BulkLoadPut(unsigned int key_entry_id, const Slice& key, uint64_t time, DataBlock* row); - - void Put(const Slice& key, const std::map& ts_map, DataBlock* row); + // main put method + bool Put(const Slice& key, const std::map& ts_map, DataBlock* row, bool put_if_absent = false); bool Delete(const std::optional& idx, const Slice& key); - bool Delete(const std::optional& idx, const Slice& key, - uint64_t ts, const std::optional& end_ts); + bool Delete(const std::optional& idx, const Slice& key, uint64_t ts, + const std::optional& end_ts); void Release(StatisticsInfo* statistics_info); @@ -97,12 +96,10 @@ class Segment { void GcAllType(const std::map& ttl_st_map, StatisticsInfo* statistics_info); MemTableIterator* NewIterator(const Slice& key, Ticket& ticket, type::CompressType compress_type); // NOLINT - MemTableIterator* NewIterator(const Slice& key, uint32_t idx, - Ticket& ticket, type::CompressType compress_type); // NOLINT + MemTableIterator* NewIterator(const Slice& key, uint32_t idx, Ticket& ticket, // NOLINT + type::CompressType compress_type); - uint64_t GetIdxCnt() const { - return idx_cnt_vec_[0]->load(std::memory_order_relaxed); - } + uint64_t GetIdxCnt() const { return idx_cnt_vec_[0]->load(std::memory_order_relaxed); } int GetIdxCnt(uint32_t ts_idx, uint64_t& ts_cnt) { // NOLINT uint32_t real_idx = 0; @@ -145,10 +142,14 @@ class Segment { void ReleaseAndCount(const std::vector& id_vec, StatisticsInfo* statistics_info); private: - void FreeList(uint32_t ts_idx, ::openmldb::base::Node* node, - StatisticsInfo* statistics_info); + void FreeList(uint32_t ts_idx, ::openmldb::base::Node* node, StatisticsInfo* statistics_info); void SplitList(KeyEntry* entry, uint64_t ts, ::openmldb::base::Node** node); + bool ListContains(KeyEntry* entry, uint64_t time, DataBlock* row, bool check_all_time); + + bool PutUnlock(const Slice& key, uint64_t time, DataBlock* row, bool put_if_absent = false, + bool check_all_time = false); + private: KeyEntries* entries_; std::mutex mu_; diff --git a/src/storage/segment_test.cc b/src/storage/segment_test.cc index c51c0984473..e43461c47e6 100644 --- a/src/storage/segment_test.cc +++ b/src/storage/segment_test.cc @@ -424,6 +424,82 @@ TEST_F(SegmentTest, TestDeleteRange) { CheckStatisticsInfo(CreateStatisticsInfo(20, 1012, 20 * (6 + sizeof(DataBlock))), gc_info); } +TEST_F(SegmentTest, PutIfAbsent) { + { + Segment segment(8); // so ts_cnt_ == 1 + // check all time == false + segment.Put("PK", 1, "test1", 5, true); + segment.Put("PK", 1, "test2", 5, true); // even key&time is the same, different value means different record + ASSERT_EQ(2, (int64_t)segment.GetIdxCnt()); + ASSERT_EQ(1, (int64_t)segment.GetPkCnt()); + segment.Put("PK", 2, "test3", 5, true); + segment.Put("PK", 2, "test4", 5, true); + 
segment.Put("PK", 3, "test5", 5, true); + segment.Put("PK", 3, "test6", 5, true); + ASSERT_EQ(6, (int64_t)segment.GetIdxCnt()); + // insert exists rows + segment.Put("PK", 2, "test3", 5, true); + segment.Put("PK", 1, "test1", 5, true); + segment.Put("PK", 1, "test2", 5, true); + segment.Put("PK", 3, "test6", 5, true); + ASSERT_EQ(6, (int64_t)segment.GetIdxCnt()); + // new rows + segment.Put("PK", 2, "test7", 5, true); + ASSERT_EQ(7, (int64_t)segment.GetIdxCnt()); + segment.Put("PK", 0, "test8", 5, true); // seek to last, next is empty + ASSERT_EQ(8, (int64_t)segment.GetIdxCnt()); + } + + { + // support when ts_cnt_ != 1 too + std::vector ts_idx_vec = {1, 3}; + Segment segment(8, ts_idx_vec); + ASSERT_EQ(2, (int64_t)segment.GetTsCnt()); + std::string key = "PK"; + uint64_t ts = 1669013677221000; + // the same ts + for (int j = 0; j < 2; j++) { + DataBlock* data = new DataBlock(2, key.c_str(), key.length()); + std::map ts_map = {{1, ts}, {3, ts}}; + segment.Put(Slice(key), ts_map, data, true); + } + ASSERT_EQ(1, GetCount(&segment, 1)); + ASSERT_EQ(1, GetCount(&segment, 3)); + } + + { + // put ts_map contains DEFAULT_TS_COL_ID + std::vector ts_idx_vec = {DEFAULT_TS_COL_ID}; + Segment segment(8, ts_idx_vec); + ASSERT_EQ(1, (int64_t)segment.GetTsCnt()); + std::string key = "PK"; + std::map ts_map = {{DEFAULT_TS_COL_ID, 100}}; // cur time == 100 + auto* block = new DataBlock(1, "test1", 5); + segment.Put(Slice(key), ts_map, block, true); + ASSERT_EQ(1, GetCount(&segment, DEFAULT_TS_COL_ID)); + ts_map = {{DEFAULT_TS_COL_ID, 200}}; + block = new DataBlock(1, "test1", 5); + segment.Put(Slice(key), ts_map, block, true); + ASSERT_EQ(1, GetCount(&segment, DEFAULT_TS_COL_ID)); + } + + { + // put ts_map contains DEFAULT_TS_COL_ID + std::vector ts_idx_vec = {DEFAULT_TS_COL_ID, 1, 3}; + Segment segment(8, ts_idx_vec); + ASSERT_EQ(3, (int64_t)segment.GetTsCnt()); + std::string key = "PK"; + std::map ts_map = {{DEFAULT_TS_COL_ID, 100}}; // cur time == 100 + auto* block = new DataBlock(1, "test1", 5); + segment.Put(Slice(key), ts_map, block, true); + ASSERT_EQ(1, GetCount(&segment, DEFAULT_TS_COL_ID)); + ts_map = {{DEFAULT_TS_COL_ID, 200}}; + block = new DataBlock(1, "test1", 5); + segment.Put(Slice(key), ts_map, block, true); + ASSERT_EQ(1, GetCount(&segment, DEFAULT_TS_COL_ID)); + } +} + } // namespace storage } // namespace openmldb diff --git a/src/storage/snapshot_test.cc b/src/storage/snapshot_test.cc index 910a8bc7724..e9dd679eafc 100644 --- a/src/storage/snapshot_test.cc +++ b/src/storage/snapshot_test.cc @@ -1085,7 +1085,7 @@ TEST_F(SnapshotTest, MakeSnapshotAbsOrLat) { SchemaCodec::SetColumnDesc(table_meta->add_column_desc(), "value", ::openmldb::type::kString); SchemaCodec::SetIndex(table_meta->add_column_key(), "index1", "card|merchant", "", ::openmldb::type::kAbsOrLat, 0, 1); - std::shared_ptr table = std::make_shared(*table_meta); + std::shared_ptr
table = std::make_shared(*table_meta); table->Init(); LogParts* log_part = new LogParts(12, 4, scmp); @@ -1119,7 +1119,7 @@ TEST_F(SnapshotTest, MakeSnapshotAbsOrLat) { google::protobuf::RepeatedPtrField<::openmldb::api::Dimension> d_list; ::openmldb::api::Dimension* d_ptr2 = d_list.Add(); d_ptr2->CopyFrom(dimensions); - ASSERT_EQ(table->Put(i + 1, *result, d_list), true); + ASSERT_EQ(table->Put(i + 1, *result, d_list).ok(), true); } table->SchedGc(); diff --git a/src/storage/table.h b/src/storage/table.h index 32a957c9db7..4c4a1f011f7 100644 --- a/src/storage/table.h +++ b/src/storage/table.h @@ -22,6 +22,7 @@ #include #include +#include "absl/status/status.h" #include "codec/codec.h" #include "proto/tablet.pb.h" #include "storage/iterator.h" @@ -50,17 +51,16 @@ class Table { int InitColumnDesc(); virtual bool Put(const std::string& pk, uint64_t time, const char* data, uint32_t size) = 0; + // DO NOT set different default value in derived class + virtual absl::Status Put(uint64_t time, const std::string& value, const Dimensions& dimensions, + bool put_if_absent = false) = 0; - virtual bool Put(uint64_t time, const std::string& value, const Dimensions& dimensions) = 0; - - bool Put(const ::openmldb::api::LogEntry& entry) { - return Put(entry.ts(), entry.value(), entry.dimensions()); - } + bool Put(const ::openmldb::api::LogEntry& entry) { return Put(entry.ts(), entry.value(), entry.dimensions()).ok(); } virtual bool Delete(const ::openmldb::api::LogEntry& entry) = 0; - virtual bool Delete(uint32_t idx, const std::string& key, - const std::optional& start_ts, const std::optional& end_ts) = 0; + virtual bool Delete(uint32_t idx, const std::string& key, const std::optional& start_ts, + const std::optional& end_ts) = 0; virtual TableIterator* NewIterator(const std::string& pk, Ticket& ticket) = 0; // NOLINT @@ -88,9 +88,7 @@ class Table { } return ""; } - inline ::openmldb::common::StorageMode GetStorageMode() const { - return storage_mode_; - } + inline ::openmldb::common::StorageMode GetStorageMode() const { return storage_mode_; } inline uint32_t GetId() const { return id_; } inline uint32_t GetIdxCnt() const { return table_index_.Size(); } @@ -153,14 +151,8 @@ class Table { std::shared_ptr GetIndex(const std::string& name) { return table_index_.GetIndex(name); } - std::shared_ptr GetIndex(const std::string& name, uint32_t ts_idx) { - return table_index_.GetIndex(name, ts_idx); - } - std::shared_ptr GetIndex(uint32_t idx) { return table_index_.GetIndex(idx); } - std::shared_ptr GetIndex(uint32_t idx, uint32_t ts_idx) { return table_index_.GetIndex(idx, ts_idx); } - std::shared_ptr GetPkIndex() { return table_index_.GetPkIndex(); } void SetTTL(const ::openmldb::storage::UpdateTTLMeta& ttl_meta); @@ -179,7 +171,7 @@ class Table { virtual uint64_t GetRecordByteSize() const = 0; virtual uint64_t GetRecordIdxByteSize() = 0; - virtual int GetCount(uint32_t index, const std::string& pk, uint64_t& count) = 0; // NOLINT + virtual int GetCount(uint32_t index, const std::string& pk, uint64_t& count) = 0; // NOLINT protected: void UpdateTTL(); diff --git a/src/storage/table_iterator_test.cc b/src/storage/table_iterator_test.cc index 7ba932422e1..3af20940266 100644 --- a/src/storage/table_iterator_test.cc +++ b/src/storage/table_iterator_test.cc @@ -450,7 +450,7 @@ TEST_P(TableIteratorTest, SeekNonExistent) { ASSERT_EQ(0, now - wit->GetKey()); } -INSTANTIATE_TEST_CASE_P(TestMemAndHDD, TableIteratorTest, +INSTANTIATE_TEST_SUITE_P(TestMemAndHDD, TableIteratorTest, 
::testing::Values(::openmldb::common::kMemory, ::openmldb::common::kHDD)); } // namespace storage diff --git a/src/storage/table_test.cc b/src/storage/table_test.cc index 8e9b7b09ca8..43b3508822e 100644 --- a/src/storage/table_test.cc +++ b/src/storage/table_test.cc @@ -198,7 +198,7 @@ TEST_P(TableTest, MultiDimissionPut0) { ::openmldb::codec::SDKCodec sdk_codec(meta); std::string result; sdk_codec.EncodeRow({"d0", "d1", "d2"}, &result); - bool ok = table->Put(1, result, dimensions); + bool ok = table->Put(1, result, dimensions).ok(); ASSERT_TRUE(ok); // some functions in disk table need to be implemented. // refer to issue #1238 @@ -808,7 +808,7 @@ TEST_P(TableTest, TableIteratorTS) { dim->set_key(row[1]); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - table->Put(0, value, request.dimensions()); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } TableIterator* it = table->NewTraverseIterator(0); it->SeekToFirst(); @@ -921,7 +921,7 @@ TEST_P(TableTest, TraverseIteratorCount) { dim->set_key(row[1]); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - table->Put(0, value, request.dimensions()); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } TableIterator* it = table->NewTraverseIterator(0); it->SeekToFirst(); @@ -1048,7 +1048,7 @@ TEST_P(TableTest, AbsAndLatSetGet) { dim->set_key("mcc"); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - table->Put(0, value, request.dimensions()); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } // test get and set ttl ASSERT_EQ(10, (int64_t)table->GetIndex(0)->GetTTL()->abs_ttl / (10 * 6000)); @@ -1149,7 +1149,7 @@ TEST_P(TableTest, AbsOrLatSetGet) { dim->set_key("mcc"); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - table->Put(0, value, request.dimensions()); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } // test get and set ttl ASSERT_EQ(10, (int64_t)table->GetIndex(0)->GetTTL()->abs_ttl / (10 * 6000)); @@ -1562,7 +1562,7 @@ TEST_P(TableTest, TraverseIteratorCountWithLimit) { dim->set_key(row[1]); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - table->Put(0, value, request.dimensions()); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } TableIterator* it = table->NewTraverseIterator(0); @@ -1669,7 +1669,7 @@ TEST_P(TableTest, TSColIDLength) { dim1->set_key(row[0]); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - table->Put(0, value, request.dimensions()); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } TableIterator* it = table->NewTraverseIterator(0); @@ -1727,7 +1727,7 @@ TEST_P(TableTest, MultiDimensionPutTS) { dim->set_key(row[1]); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - table->Put(0, value, request.dimensions()); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } TableIterator* it = table->NewTraverseIterator(0); it->SeekToFirst(); @@ -1781,7 +1781,7 @@ TEST_P(TableTest, MultiDimensionPutTS1) { dim->set_key(row[1]); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - table->Put(0, value, request.dimensions()); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } TableIterator* it = table->NewTraverseIterator(0); it->SeekToFirst(); @@ -1823,7 +1823,7 @@ TEST_P(TableTest, MultiDimissionPutTS2) { ::openmldb::codec::SDKCodec sdk_codec(meta); std::string result; sdk_codec.EncodeRow({"d0", "d1", "d2"}, &result); - bool ok = table->Put(100, result, dimensions); + bool ok = table->Put(100, 
result, dimensions).ok(); ASSERT_TRUE(ok); TableIterator* it = table->NewTraverseIterator(0); @@ -1885,7 +1885,7 @@ TEST_P(TableTest, AbsAndLat) { } std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - table->Put(0, value, request.dimensions()); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } for (int i = 0; i <= 5; i++) { @@ -1924,7 +1924,6 @@ TEST_P(TableTest, NegativeTs) { table_meta.set_seg_cnt(8); table_meta.set_mode(::openmldb::api::TableMode::kTableLeader); table_meta.set_key_entry_max_height(8); - table_meta.set_format_version(1); table_meta.set_storage_mode(storageMode); SchemaCodec::SetColumnDesc(table_meta.add_column_desc(), "card", ::openmldb::type::kString); SchemaCodec::SetColumnDesc(table_meta.add_column_desc(), "ts1", ::openmldb::type::kBigInt); @@ -1939,10 +1938,11 @@ TEST_P(TableTest, NegativeTs) { dim->set_key(row[0]); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - ASSERT_FALSE(table->Put(0, value, request.dimensions())); + auto st = table->Put(0, value, request.dimensions()); + ASSERT_TRUE(absl::IsInvalidArgument(st)) << st.ToString(); } -INSTANTIATE_TEST_CASE_P(TestMemAndHDD, TableTest, +INSTANTIATE_TEST_SUITE_P(TestMemAndHDD, TableTest, ::testing::Values(::openmldb::common::kMemory, ::openmldb::common::kHDD)); } // namespace storage diff --git a/src/tablet/tablet_impl.cc b/src/tablet/tablet_impl.cc index 2c506be510f..8b2b446e874 100644 --- a/src/tablet/tablet_impl.cc +++ b/src/tablet/tablet_impl.cc @@ -43,6 +43,7 @@ #include "base/proto_util.h" #include "base/status.h" #include "base/strings.h" +#include "base/sys_info.h" #include "brpc/controller.h" #include "butil/iobuf.h" #include "codec/codec.h" @@ -77,6 +78,7 @@ DECLARE_uint32(scan_max_bytes_size); DECLARE_uint32(scan_reserve_size); DECLARE_uint32(max_memory_mb); DECLARE_double(mem_release_rate); +DECLARE_int32(get_sys_mem_interval); DECLARE_string(db_root_path); DECLARE_string(ssd_root_path); DECLARE_string(hdd_root_path); @@ -135,7 +137,7 @@ TabletImpl::TabletImpl() replicators_(), snapshots_(), zk_client_(nullptr), - keep_alive_pool_(1), + trivial_task_pool_(1), task_pool_(FLAGS_task_pool_size), io_pool_(FLAGS_io_pool_size), snapshot_pool_(FLAGS_snapshot_pool_size), @@ -154,7 +156,7 @@ TabletImpl::TabletImpl() TabletImpl::~TabletImpl() { task_pool_.Stop(true); - keep_alive_pool_.Stop(true); + trivial_task_pool_.Stop(true); gc_pool_.Stop(true); io_pool_.Stop(true); snapshot_pool_.Stop(true); @@ -315,6 +317,9 @@ bool TabletImpl::Init(const std::string& zk_cluster, const std::string& zk_path, #ifdef TCMALLOC_ENABLE MallocExtension* tcmalloc = MallocExtension::instance(); tcmalloc->SetMemoryReleaseRate(FLAGS_mem_release_rate); +#endif +#if defined(__linux__) + trivial_task_pool_.DelayTask(FLAGS_get_sys_mem_interval, boost::bind(&TabletImpl::UpdateMemoryUsage, this)); #endif return true; } @@ -402,7 +407,7 @@ bool TabletImpl::RegisterZK() { LOG(WARNING) << "add notify watcher failed"; return false; } - keep_alive_pool_.DelayTask(FLAGS_zk_keep_alive_check_interval, boost::bind(&TabletImpl::CheckZkClient, this)); + trivial_task_pool_.DelayTask(FLAGS_zk_keep_alive_check_interval, boost::bind(&TabletImpl::CheckZkClient, this)); } return true; } @@ -437,6 +442,21 @@ bool TabletImpl::CheckGetDone(::openmldb::api::GetType type, uint64_t ts, uint64 return false; } +void TabletImpl::UpdateMemoryUsage() { + base::SysInfo info; + if (auto status = base::GetSysMem(&info); status.OK()) { + if (info.mem_total > 0) { + system_memory_usage_rate_.store(info.mem_used * 100 / 
info.mem_total, std::memory_order_relaxed); + DEBUGLOG("system_memory_usage_rate is %u", system_memory_usage_rate_.load(std::memory_order_relaxed)); + } else { + PDLOG(WARNING, "total memory is zero"); + } + } else { + PDLOG(WARNING, "GetSysMem run failed. error message %s", status.GetMsg().c_str()); + } + trivial_task_pool_.DelayTask(FLAGS_get_sys_mem_interval, boost::bind(&TabletImpl::UpdateMemoryUsage, this)); +} + int32_t TabletImpl::GetIndex(const ::openmldb::api::GetRequest* request, const ::openmldb::api::TableMeta& meta, const std::map>& vers_schema, CombineIterator* it, std::string* value, uint64_t* ts) { @@ -714,13 +734,22 @@ void TabletImpl::Put(RpcController* controller, const ::openmldb::api::PutReques response->set_msg("table is loading"); return; } - if (table->GetStorageMode() == ::openmldb::common::StorageMode::kMemory && - memory_used_.load(std::memory_order_relaxed) > FLAGS_max_memory_mb) { - PDLOG(WARNING, "current memory %lu MB exceed max memory limit %lu MB. tid %u, pid %u", - memory_used_.load(std::memory_order_relaxed), FLAGS_max_memory_mb, tid, pid); - response->set_code(::openmldb::base::ReturnCode::kExceedMaxMemory); - response->set_msg("exceed max memory"); - return; + if (table->GetStorageMode() == ::openmldb::common::StorageMode::kMemory) { + if (memory_used_.load(std::memory_order_relaxed) > FLAGS_max_memory_mb) { + PDLOG(WARNING, "current memory %lu MB exceed max memory limit %lu MB. tid %u, pid %u", + memory_used_.load(std::memory_order_relaxed), FLAGS_max_memory_mb, tid, pid); + response->set_code(base::ReturnCode::kExceedMaxMemory); + response->set_msg("exceed max memory"); + return; + } + if (request->has_memory_limit() && request->memory_limit() > 0 + && system_memory_usage_rate_.load(std::memory_order_relaxed) > request->memory_limit()) { + PDLOG(WARNING, "current system_memory_usage_rate %u exceed request memory limit %u. tid %u, pid %u", + system_memory_usage_rate_.load(std::memory_order_relaxed), request->memory_limit(), tid, pid); + response->set_code(base::ReturnCode::kExceedPutMemoryLimit); + response->set_msg("exceed memory limit"); + return; + } } ::openmldb::api::LogEntry entry; entry.set_pk(request->pk()); @@ -738,7 +767,8 @@ void TabletImpl::Put(RpcController* controller, const ::openmldb::api::PutReques if (request->ts_dimensions_size() > 0) { entry.mutable_ts_dimensions()->CopyFrom(request->ts_dimensions()); } - bool ok = false; + + absl::Status st; if (request->dimensions_size() > 0) { int32_t ret_code = CheckDimessionPut(request, table->GetIdxCnt()); if (ret_code != 0) { @@ -747,16 +777,27 @@ void TabletImpl::Put(RpcController* controller, const ::openmldb::api::PutReques return; } DLOG(INFO) << "put data to tid " << tid << " pid " << pid << " with key " << request->dimensions(0).key(); - ok = table->Put(entry.ts(), entry.value(), entry.dimensions()); + // 1. normal put: ok, invalid data + // 2. 
put if absent: ok, exists but ignore, invalid data + st = table->Put(entry.ts(), entry.value(), entry.dimensions(), request->put_if_absent()); } - if (!ok) { + + if (!st.ok()) { + if (request->put_if_absent() && absl::IsAlreadyExists(st)) { + // not a failure, but shouldn't write a log entry + response->set_code(::openmldb::base::ReturnCode::kOk); + response->set_msg("exists but ignore"); + return; + } + LOG(WARNING) << st.ToString(); response->set_code(::openmldb::base::ReturnCode::kPutFailed); - response->set_msg("put failed"); + response->set_msg(st.ToString()); return; } response->set_code(::openmldb::base::ReturnCode::kOk); std::shared_ptr<LogReplicator> replicator; + bool ok = false; do { replicator = GetReplicator(request->tid(), request->pid()); if (!replicator) { @@ -970,7 +1011,7 @@ int32_t TabletImpl::ScanIndex(const ::openmldb::api::ScanRequest* request, const total_block_size += data.size(); } record_count++; - if (total_block_size > FLAGS_scan_max_bytes_size) { + if (FLAGS_scan_max_bytes_size > 0 && total_block_size > FLAGS_scan_max_bytes_size) { *is_finish = false; break; } @@ -1642,7 +1683,7 @@ void TabletImpl::ProcessQuery(bool is_sub, RpcController* ctrl, const openmldb:: uint32_t byte_size = 0; uint32_t count = 0; for (auto& output_row : output_rows) { - if (byte_size > FLAGS_scan_max_bytes_size) { + if (FLAGS_scan_max_bytes_size > 0 && byte_size > FLAGS_scan_max_bytes_size) { LOG(WARNING) << "reach the max byte size " << FLAGS_scan_max_bytes_size << " truncate result"; response->set_schema(session.GetEncodedSchema()); response->set_byte_size(byte_size); @@ -2191,9 +2232,8 @@ void TabletImpl::AppendEntries(RpcController* controller, const ::openmldb::api: return; } if (entry.has_method_type() && entry.method_type() == ::openmldb::api::MethodType::kDelete) { - table->Delete(entry); - } - if (!table->Put(entry)) { + table->Delete(entry); // TODO(hw): error handle + } else if (!table->Put(entry)) { // put if type is not delete PDLOG(WARNING, "fail to put entry. tid %u pid %u", tid, pid); response->set_code(::openmldb::base::ReturnCode::kFailToAppendEntriesToReplicator); response->set_msg("fail to append entry to table"); @@ -2968,7 +3008,8 @@ void TabletImpl::LoadTable(RpcController* controller, const ::openmldb::api::Loa std::string db_path = GetDBPath(root_path, tid, pid); if (!::openmldb::base::IsExists(db_path)) { - PDLOG(WARNING, "table db path does not exist, but still load. tid %u, pid %u, path %s", tid, pid, db_path.c_str()); + PDLOG(WARNING, "table db path does not exist, but still load. tid %u, pid %u, path %s", + tid, pid, db_path.c_str()); } std::shared_ptr<Table>
table = GetTable(tid, pid); @@ -3468,12 +3509,6 @@ base::Status TabletImpl::TruncateTableInternal(uint32_t tid, uint32_t pid) { return {::openmldb::base::ReturnCode::kTableMetaIsIllegal, "fail to init table"}; } new_table->SetTableStat(::openmldb::storage::kNormal); - { - std::lock_guard spin_lock(spin_mutex_); - tables_[tid].insert_or_assign(pid, new_table); - } - auto mem_snapshot = std::dynamic_pointer_cast(snapshot); - mem_snapshot->Truncate(replicator->GetOffset(), replicator->GetLeaderTerm()); if (table_meta->mode() == ::openmldb::api::TableMode::kTableLeader) { if (catalog_->AddTable(*table_meta, new_table)) { LOG(INFO) << "add table " << table_meta->name() << " to catalog with db " << table_meta->db(); @@ -3483,6 +3518,14 @@ base::Status TabletImpl::TruncateTableInternal(uint32_t tid, uint32_t pid) { return {::openmldb::base::ReturnCode::kCatalogUpdateFailed, "fail to update catalog"}; } } + { + std::lock_guard spin_lock(spin_mutex_); + tables_[tid].insert_or_assign(pid, new_table); + } + auto mem_snapshot = std::dynamic_pointer_cast(snapshot); + mem_snapshot->Truncate(replicator->GetOffset(), replicator->GetLeaderTerm()); + // running ResetTable after this function return + task_pool_.DelayTask(10, boost::bind(&TabletImpl::ResetTable, this, table)); } else { auto disk_table = std::dynamic_pointer_cast(table); if (auto status = disk_table->Truncate(); !status.OK()) { @@ -4265,7 +4308,7 @@ void TabletImpl::CheckZkClient() { PDLOG(INFO, "registe zk ok"); } } - keep_alive_pool_.DelayTask(FLAGS_zk_keep_alive_check_interval, boost::bind(&TabletImpl::CheckZkClient, this)); + trivial_task_pool_.DelayTask(FLAGS_zk_keep_alive_check_interval, boost::bind(&TabletImpl::CheckZkClient, this)); } } diff --git a/src/tablet/tablet_impl.h b/src/tablet/tablet_impl.h index c24a253c9e9..833dbe5ff70 100644 --- a/src/tablet/tablet_impl.h +++ b/src/tablet/tablet_impl.h @@ -423,6 +423,8 @@ class TabletImpl : public ::openmldb::api::TabletServer { bool IsCollectDeployStatsEnabled() const; + void ResetTable(std::shared_ptr
t) { t.reset(); } + // collect deploy statistics into memory void TryCollectDeployStats(const std::string& db, const std::string& name, absl::Time start_time); @@ -435,6 +437,8 @@ class TabletImpl : public ::openmldb::api::TabletServer { // refresh the pre-aggr tables info bool RefreshAggrCatalog(); + void UpdateMemoryUsage(); + private: Tables tables_; std::mutex mu_; @@ -444,7 +448,7 @@ class TabletImpl : public ::openmldb::api::TabletServer { Snapshots snapshots_; Aggregators aggregators_; ZkClient* zk_client_; - ThreadPool keep_alive_pool_; + ThreadPool trivial_task_pool_; ThreadPool task_pool_; ThreadPool io_pool_; ThreadPool snapshot_pool_; @@ -474,6 +478,7 @@ class TabletImpl : public ::openmldb::api::TabletServer { std::unique_ptr deploy_collector_; std::atomic memory_used_ = 0; + std::atomic system_memory_usage_rate_ = 0; // [0, 100] }; } // namespace tablet diff --git a/src/tablet/tablet_impl_func_test.cc b/src/tablet/tablet_impl_func_test.cc index c84729f288d..c07084a396d 100644 --- a/src/tablet/tablet_impl_func_test.cc +++ b/src/tablet/tablet_impl_func_test.cc @@ -89,7 +89,7 @@ void CreateBaseTable(::openmldb::storage::Table*& table, // NOLINT dim->set_key(row[1]); std::string value; ASSERT_EQ(0, codec.EncodeRow(row, &value)); - ASSERT_TRUE(table->Put(0, value, request.dimensions())); + ASSERT_TRUE(table->Put(0, value, request.dimensions()).ok()); } return; } @@ -389,7 +389,7 @@ TEST_P(TabletFuncTest, GetTimeIndex_ts1_iterator) { RunGetTimeIndexAssert(&query_its, base_ts, base_ts - 100); } -INSTANTIATE_TEST_CASE_P(TabletMemAndHDD, TabletFuncTest, +INSTANTIATE_TEST_SUITE_P(TabletMemAndHDD, TabletFuncTest, ::testing::Values(::openmldb::common::kMemory, ::openmldb::common::kHDD, ::openmldb::common::kSSD)); diff --git a/test/integration-test/openmldb-test-python/ha_cases/test_addindex.py b/test/integration-test/openmldb-test-python/ha_cases/test_addindex.py index afef45d51bb..a1b9fae118b 100644 --- a/test/integration-test/openmldb-test-python/ha_cases/test_addindex.py +++ b/test/integration-test/openmldb-test-python/ha_cases/test_addindex.py @@ -24,6 +24,7 @@ from cluster_manager import ClusterManager from tool import Executor from tool import Status +from tool import SystemUtil import openmldb.dbapi class TestAddIndex: @@ -43,7 +44,6 @@ def setup_class(cls): @pytest.mark.parametrize("storage_mode, snapshot", [("memory", True), ("memory", False)]) def test_addindex(self, storage_mode, snapshot): database = "test" - tablets = self.manager.GetComponents("tablet") self.cursor.execute(f"create database if not exists {database}") self.cursor.execute(f"use {database}") table_name = "table" + str(random.randint(0, 10000)) @@ -92,3 +92,43 @@ def test_addindex(self, storage_mode, snapshot): else: assert status.OK() and len(result) == 1 and result[0][0] == "key1" + str(i) self.cursor.execute(f"drop table {table_name}") + + def test_add_deleted_index(self): + database = "test" + self.cursor.execute(f"create database if not exists {database}") + self.cursor.execute(f"use {database}") + table_name = "table" + str(random.randint(0, 10000)) + partition_num = 1 + ddl = f"create table if not exists {table_name} (col1 string, col2 string, col3 bigint, index(key=col1, ts=col3), index(key=col1)) options (partitionnum=1, replicanum=1);" + self.cursor.execute(ddl) + status, indexs = self.executor.GetIndexs(database, table_name) + assert status.OK() and len(indexs) == 2 + try: + result = self.cursor.execute(f"create index index2 on {table_name} (col1)") + except Exception as e: + assert True + index1 = 
indexs[1].GetName() + try: + result = self.cursor.execute(f"create index {index1} on {table_name} (col1)") + assert False + except Exception as e: + assert True + self.cursor.execute(f"drop index {table_name}.{index1}") + status, indexs = self.executor.GetIndexs(database, table_name) + assert status.OK() and len(indexs) == 1 + status, table = self.executor.GetTablePartition(database, table_name) + assert status.OK() + tid = table['0'][0].GetTid() + tablets = self.manager.GetComponents("tablet") + for tablet in tablets: + json = "{\"tid\": %d, \"pid\":0 }" % int(tid) + cmd = f"curl -d \'{json}\' http://{tablet}/TabletServer/ExecuteGc" + SystemUtil.ExecuteCmd(cmd) + time.sleep(1) + SystemUtil.ExecuteCmd(cmd) + result = self.cursor.execute(f"create index index2 on {table_name} (col1)") + assert self.executor.WaitingOP(database, table_name, 0).OK() + status, indexs = self.executor.GetIndexs(database, table_name) + assert status.OK() and len(indexs) == 2 + + self.cursor.execute(f"drop table {table_name}") diff --git a/third-party/cmake/FetchGlog.cmake b/third-party/cmake/FetchGlog.cmake index 8aec8d8c696..458d5052268 100644 --- a/third-party/cmake/FetchGlog.cmake +++ b/third-party/cmake/FetchGlog.cmake @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -set(GLOG_URL https://github.com/google/glog/archive/refs/tags/v0.4.0.tar.gz) +set(GLOG_URL https://github.com/google/glog/archive/refs/tags/v0.6.0.tar.gz) message(STATUS "build glog from ${GLOG_URL}") @@ -20,14 +20,13 @@ find_program(MAKE_EXE NAMES gmake nmake make) ExternalProject_Add( glog URL ${GLOG_URL} - URL_HASH SHA256=f28359aeba12f30d73d9e4711ef356dc842886968112162bc73002645139c39c + URL_HASH SHA256=8a83bf982f37bb70825df71a9709fa90ea9f4447fb3c099e1d720a439d88bad6 PREFIX ${DEPS_BUILD_DIR} DOWNLOAD_DIR ${DEPS_DOWNLOAD_DIR}/glog INSTALL_DIR ${DEPS_INSTALL_DIR} DEPENDS gflags BUILD_IN_SOURCE TRUE - CONFIGURE_COMMAND - ./autogen.sh - COMMAND CXXFLAGS=-fPIC ./configure --prefix= --enable-shared=no --with-gflags= - BUILD_COMMAND ${MAKE_EXE} - INSTALL_COMMAND ${MAKE_EXE} install) + CONFIGURE_COMMAND ${CMAKE_COMMAND} -H -B -DCMAKE_CXX_FLAGS=-fPIC + -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH= -DCMAKE_INSTALL_PREFIX= + BUILD_COMMAND ${CMAKE_COMMAND} --build -- ${MAKEOPTS} + INSTALL_COMMAND ${CMAKE_COMMAND} --build --target install) diff --git a/third-party/cmake/FetchZetasql.cmake b/third-party/cmake/FetchZetasql.cmake index b2b1d580593..bfe1e4c94a0 100644 --- a/third-party/cmake/FetchZetasql.cmake +++ b/third-party/cmake/FetchZetasql.cmake @@ -13,10 +13,10 @@ # limitations under the License. set(ZETASQL_HOME https://github.com/4paradigm/zetasql) -set(ZETASQL_VERSION 0.3.1) -set(ZETASQL_HASH_DARWIN 48bfdfe5fa91d414b0bf8383f116bc2a1f558c12fa286e49ea5ceede366dfbcf) -set(ZETASQL_HASH_LINUX_UBUNTU 3847ed7a60aeda1192adf7d702076d2db2bd49258992e2af67515a57b8f6f6a6) -set(ZETASQL_HASH_LINUX_CENTOS e73e6259ab2df3ae7289a9ae78600b69a8fbb6e4890d07a1031ccb1e37fa4281) +set(ZETASQL_VERSION 0.3.3) +set(ZETASQL_HASH_DARWIN f1c6a4f61b4a3f278dd46ace86f8b5e30780e596ef4af22f22cc12a4a7f83664) +set(ZETASQL_HASH_LINUX_UBUNTU bfe6ef8fd8221e5619dbb66b298ad767a4e1a1326b0c4ccfb75aa9ab872d1ce2) +set(ZETASQL_HASH_LINUX_CENTOS 8b63a149abf9d14fed9e63f465e74c2300d6de7404b859c48a94d4b579d080c2) set(ZETASQL_TAG v${ZETASQL_VERSION}) function(init_zetasql_urls)
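The Put path in this patch switches from a bool return to absl::Status: InvalidArgument covers bad input such as an empty ts value map, and AlreadyExists is reserved for put_if_absent hitting an existing row, which the tablet reports as OK without appending a log entry. A minimal sketch of that branching, where DoPut is a stand-in for the real table->Put(ts, value, dimensions, put_if_absent) call:

```cpp
// Sketch only: DoPut stands in for the absl::Status-returning Put in the patch.
#include <iostream>
#include <utility>
#include "absl/status/status.h"

absl::Status DoPut(bool duplicate, bool bad_input) {
    if (bad_input) return absl::InvalidArgumentError("empty ts value map");
    if (duplicate) return absl::AlreadyExistsError("data exists");
    return absl::OkStatus();
}

int main() {
    for (auto [dup, bad] : {std::pair{false, false}, std::pair{true, false}, std::pair{false, true}}) {
        absl::Status st = DoPut(dup, bad);
        if (st.ok()) {
            std::cout << "written, append to binlog\n";
        } else if (absl::IsAlreadyExists(st)) {
            // put_if_absent hit an existing row: report OK to the client but skip the log entry
            std::cout << "exists but ignore\n";
        } else {
            std::cout << "put failed: " << st.ToString() << "\n";
        }
    }
}
```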
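The put_if_absent check in Segment::ListContains only scans entries that share the timestamp being inserted (or the whole list for the auto-generated default ts) and compares payloads. The following is a simplified illustration of that idea using a descending std::multimap instead of the real skiplist; the types and helper names are stand-ins, not the patch's code:

```cpp
// Illustrative sketch of the "scan same-timestamp entries, compare payload" dedup check.
#include <cstdint>
#include <functional>
#include <iostream>
#include <map>
#include <string>

// Descending by ts, mirroring the segment's time-ordered list.
using TimeEntries = std::multimap<uint64_t, std::string, std::greater<uint64_t>>;

bool Contains(const TimeEntries& entries, uint64_t time, const std::string& row) {
    // With a descending comparator, lower_bound(time) is the first entry whose key <= time;
    // only entries with exactly the same timestamp are candidates for a duplicate.
    for (auto it = entries.lower_bound(time); it != entries.end() && it->first == time; ++it) {
        if (it->second == row) return true;  // same ts and same payload -> duplicate
    }
    return false;
}

bool PutIfAbsent(TimeEntries& entries, uint64_t time, const std::string& row) {
    if (Contains(entries, time, row)) return false;  // mirrors the AlreadyExists path
    entries.emplace(time, row);
    return true;
}

int main() {
    TimeEntries e;
    std::cout << PutIfAbsent(e, 1, "test1") << PutIfAbsent(e, 1, "test2")  // 1 1: same ts, new value
              << PutIfAbsent(e, 1, "test1") << "\n";                       // 0: exact duplicate ignored
}
```

Note that, as in the PutIfAbsent unit test above, a row with the same key and timestamp but a different value is still treated as a new record.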
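The tablet changes also add a periodic system-memory sampler (UpdateMemoryUsage) and a per-request memory_limit check on Put. A hedged sketch of the arithmetic only, with illustrative names; the real code reads the rate from base::GetSysMem and reschedules itself via trivial_task_pool_:

```cpp
// Assumed shapes: a [0, 100] usage rate updated in the background and a
// per-request percentage limit where 0 means "no limit", as in the patch.
#include <atomic>
#include <cstdint>
#include <iostream>

std::atomic<uint32_t> system_memory_usage_rate{0};  // [0, 100]

void UpdateUsage(uint64_t mem_used, uint64_t mem_total) {
    if (mem_total > 0) {  // guard the division, matching the "total memory is zero" warning
        system_memory_usage_rate.store(static_cast<uint32_t>(mem_used * 100 / mem_total),
                                       std::memory_order_relaxed);
    }
}

bool ExceedsLimit(uint32_t memory_limit) {
    return memory_limit > 0 &&
           system_memory_usage_rate.load(std::memory_order_relaxed) > memory_limit;
}

int main() {
    UpdateUsage(7 * 1024, 8 * 1024);         // ~87% of memory in use
    std::cout << ExceedsLimit(80) << " "     // 1: reject the write
              << ExceedsLimit(90) << " "     // 0: allow
              << ExceedsLimit(0) << "\n";    // 0: no limit requested
}
```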