From 84a1ffc4963c96f6c1d22d20c9aa5733d6a4be3e Mon Sep 17 00:00:00 2001 From: HuangWei Date: Thu, 16 Nov 2023 11:56:14 +0800 Subject: [PATCH] feat: catch error msgs in ns/tablet client (#3587) --- docs/zh/maintain/openmldb_ops.md | 3 +- src/base/status.h | 3 + src/client/ns_client.cc | 97 +++++++++++++--------------- src/client/ns_client.h | 28 ++++---- src/client/tablet_client.cc | 96 +++++++++++++-------------- src/client/tablet_client.h | 90 +++++++++++++------------- src/cmd/openmldb.cc | 83 ++++++++++++------------ src/datacollector/data_collector.cc | 4 +- src/nameserver/name_server_impl.cc | 4 +- src/replica/snapshot_replica_test.cc | 3 +- src/sdk/sql_cluster_router.cc | 20 ++++-- src/tablet/tablet_impl.cc | 5 +- tools/openmldb_ops.py | 5 +- tools/tool.py | 17 ++--- 14 files changed, 224 insertions(+), 234 deletions(-) diff --git a/docs/zh/maintain/openmldb_ops.md b/docs/zh/maintain/openmldb_ops.md index 591ae355a75..d96b23131b3 100644 --- a/docs/zh/maintain/openmldb_ops.md +++ b/docs/zh/maintain/openmldb_ops.md @@ -35,8 +35,9 @@ python tools/openmldb_ops.py --openmldb_bin_path=./bin/openmldb --zk_cluster=0.0 python tools/openmldb_ops.py --openmldb_bin_path=./bin/openmldb --zk_cluster=0.0.0.0:2181 --zk_root_path=/openmldb --cmd=recoverdata ``` -注:理论上openmldb_ops不要求版本匹配,高版本openmldb_ops可以操作低版本的openmldb集群。 +运行结果可以只关注是否存在ERROR级日志,如果存在,请保留完整的日志记录,便于技术人员查找问题。 ### 系统要求 - 要求python2.7及以上版本 +- 理论上openmldb_ops不要求与OpenMLDB集群的版本匹配,高版本openmldb_ops可以操作低版本的OpenMLDB集群。 - `showopstatus`和`showtablestatus`需要`prettytable`依赖 diff --git a/src/base/status.h b/src/base/status.h index a6854e287b6..5995138edd6 100644 --- a/src/base/status.h +++ b/src/base/status.h @@ -19,6 +19,8 @@ #include +#include "absl/strings/str_cat.h" + #include "base/slice.h" #include "version.h" // NOLINT @@ -189,6 +191,7 @@ struct Status { inline bool OK() const { return code == ReturnCode::kOk; } inline const std::string& GetMsg() const { return msg; } inline int GetCode() const { return code; } + inline std::string ToString() const { return absl::StrCat("ReturnCode[", code, "]", msg); } int code; std::string msg; }; diff --git a/src/client/ns_client.cc b/src/client/ns_client.cc index 2b3a4a4ad45..ebf2bca2416 100644 --- a/src/client/ns_client.cc +++ b/src/client/ns_client.cc @@ -221,17 +221,16 @@ base::Status NsClient::ShowOPStatus(const std::string& name, uint32_t pid, return {base::ReturnCode::kError, response->msg()}; } -bool NsClient::CancelOP(uint64_t op_id, std::string& msg) { +base::Status NsClient::CancelOP(uint64_t op_id) { ::openmldb::nameserver::CancelOPRequest request; ::openmldb::nameserver::GeneralResponse response; request.set_op_id(op_id); - bool ok = client_.SendRequest(&::openmldb::nameserver::NameServer_Stub::CancelOP, &request, &response, - FLAGS_request_timeout_ms, 1); - msg = response.msg(); - if (ok && response.code() == 0) { - return true; + auto st = client_.SendRequestSt(&::openmldb::nameserver::NameServer_Stub::CancelOP, &request, &response, + FLAGS_request_timeout_ms, 1); + if (st.OK()) { + return {response.code(), response.msg()}; } - return false; + return st; } bool NsClient::AddTableField(const std::string& table_name, const ::openmldb::common::ColumnDesc& column_desc, @@ -342,10 +341,10 @@ bool NsClient::SetSdkEndpoint(const std::string& server_name, const std::string& return false; } -bool NsClient::AddReplica(const std::string& name, const std::set& pid_set, const std::string& endpoint, - std::string& msg) { +base::Status NsClient::AddReplica(const std::string& name, const std::set& pid_set, + const std::string& endpoint) { if (pid_set.empty()) { - return false; + return {base::ReturnCode::kError, "arg pid set is empty"}; } ::openmldb::nameserver::AddReplicaNSRequest request; ::openmldb::nameserver::GeneralResponse response; @@ -358,13 +357,12 @@ bool NsClient::AddReplica(const std::string& name, const std::set& pid request.add_pid_group(pid); } } - bool ok = client_.SendRequest(&::openmldb::nameserver::NameServer_Stub::AddReplicaNS, &request, &response, - FLAGS_request_timeout_ms, 1); - msg = response.msg(); - if (ok && response.code() == 0) { - return true; + auto st = client_.SendRequestSt(&::openmldb::nameserver::NameServer_Stub::AddReplicaNS, &request, &response, + FLAGS_request_timeout_ms, 1); + if (st.OK()) { + return {response.code(), response.msg()}; } - return false; + return st; } bool NsClient::AddReplicaNS(const std::string& name, const std::vector& endpoint_vec, uint32_t pid, @@ -393,10 +391,10 @@ bool NsClient::AddReplicaNS(const std::string& name, const std::vector& pid_set, const std::string& endpoint, - std::string& msg) { +base::Status NsClient::DelReplica(const std::string& name, const std::set& pid_set, + const std::string& endpoint) { if (pid_set.empty()) { - return false; + return {base::ReturnCode::kError, "arg pid set is empty"}; } ::openmldb::nameserver::DelReplicaNSRequest request; ::openmldb::nameserver::GeneralResponse response; @@ -409,13 +407,12 @@ bool NsClient::DelReplica(const std::string& name, const std::set& pid request.add_pid_group(pid); } } - bool ok = client_.SendRequest(&::openmldb::nameserver::NameServer_Stub::DelReplicaNS, &request, &response, - FLAGS_request_timeout_ms, 1); - msg = response.msg(); - if (ok && response.code() == 0) { - return true; + auto st = client_.SendRequestSt(&::openmldb::nameserver::NameServer_Stub::DelReplicaNS, &request, &response, + FLAGS_request_timeout_ms, 1); + if (st.OK()) { + return {response.code(), response.msg()}; } - return false; + return st; } bool NsClient::ConfSet(const std::string& key, const std::string& value, std::string& msg) { @@ -458,7 +455,7 @@ bool NsClient::ConfGet(const std::string& key, std::map& pid_set, - const std::string& des_endpoint, std::string& msg) { +base::Status NsClient::Migrate(const std::string& src_endpoint, const std::string& name, + const std::set& pid_set, const std::string& des_endpoint) { ::openmldb::nameserver::MigrateRequest request; ::openmldb::nameserver::GeneralResponse response; request.set_src_endpoint(src_endpoint); @@ -503,13 +499,12 @@ bool NsClient::Migrate(const std::string& src_endpoint, const std::string& name, for (auto pid : pid_set) { request.add_pid(pid); } - bool ok = client_.SendRequest(&::openmldb::nameserver::NameServer_Stub::Migrate, &request, &response, + auto st = client_.SendRequestSt(&::openmldb::nameserver::NameServer_Stub::Migrate, &request, &response, FLAGS_request_timeout_ms, 1); - msg = response.msg(); - if (ok && response.code() == 0) { - return true; + if (st.OK()) { + return {response.code(), response.msg()}; } - return false; + return st; } bool NsClient::RecoverEndpoint(const std::string& endpoint, bool need_restore, uint32_t concurrency, std::string& msg) { @@ -529,20 +524,19 @@ bool NsClient::RecoverEndpoint(const std::string& endpoint, bool need_restore, u return false; } -bool NsClient::RecoverTable(const std::string& name, uint32_t pid, const std::string& endpoint, std::string& msg) { +base::Status NsClient::RecoverTable(const std::string& name, uint32_t pid, const std::string& endpoint) { ::openmldb::nameserver::RecoverTableRequest request; ::openmldb::nameserver::GeneralResponse response; request.set_name(name); request.set_pid(pid); request.set_endpoint(endpoint); request.set_db(GetDb()); - bool ok = client_.SendRequest(&::openmldb::nameserver::NameServer_Stub::RecoverTable, &request, &response, + auto st = client_.SendRequestSt(&::openmldb::nameserver::NameServer_Stub::RecoverTable, &request, &response, FLAGS_request_timeout_ms, 1); - msg = response.msg(); - if (ok && response.code() == 0) { - return true; + if (st.OK()) { + return {response.code(), response.msg()}; } - return false; + return st; } bool NsClient::ConnectZK(std::string& msg) { @@ -603,8 +597,8 @@ bool NsClient::GetTablePartition(const std::string& name, uint32_t pid, return false; } -bool NsClient::UpdateTableAliveStatus(const std::string& endpoint, std::string& name, uint32_t pid, bool is_alive, - std::string& msg) { +base::Status NsClient::UpdateTableAliveStatus(const std::string& endpoint, const std::string& name, uint32_t pid, + bool is_alive) { ::openmldb::nameserver::UpdateTableAliveRequest request; ::openmldb::nameserver::GeneralResponse response; request.set_endpoint(endpoint); @@ -614,13 +608,12 @@ bool NsClient::UpdateTableAliveStatus(const std::string& endpoint, std::string& if (pid < UINT32_MAX) { request.set_pid(pid); } - bool ok = client_.SendRequest(&::openmldb::nameserver::NameServer_Stub::UpdateTableAliveStatus, &request, &response, - FLAGS_request_timeout_ms, 1); - msg = response.msg(); - if (ok && response.code() == 0) { - return true; + auto st = client_.SendRequestSt(&::openmldb::nameserver::NameServer_Stub::UpdateTableAliveStatus, &request, + &response, FLAGS_request_timeout_ms, 1); + if (st.OK()) { + return {response.code(), response.msg()}; } - return false; + return st; } bool NsClient::UpdateTTL(const std::string& name, const ::openmldb::type::TTLType& type, uint64_t abs_ttl, diff --git a/src/client/ns_client.h b/src/client/ns_client.h index 467219f4ec8..eb26aca55d6 100644 --- a/src/client/ns_client.h +++ b/src/client/ns_client.h @@ -94,12 +94,11 @@ class NsClient : public Client { bool MakeSnapshot(const std::string& name, const std::string& db, uint32_t pid, uint64_t end_offset, std::string& msg); // NOLINT - base::Status ShowOPStatus(const std::string& name, uint32_t pid, - nameserver::ShowOPStatusResponse* response); + base::Status ShowOPStatus(const std::string& name, uint32_t pid, nameserver::ShowOPStatusResponse* response); base::Status ShowOPStatus(uint64_t op_id, ::openmldb::nameserver::ShowOPStatusResponse* response); - bool CancelOP(uint64_t op_id, std::string& msg); // NOLINT + base::Status CancelOP(uint64_t op_id); bool AddTableField(const std::string& table_name, const ::openmldb::common::ColumnDesc& column_desc, std::string& msg); // NOLINT @@ -146,14 +145,12 @@ class NsClient : public Client { const ::openmldb::nameserver::ZoneInfo& zone_info, std::string& msg); // NOLINT - bool AddReplica(const std::string& name, const std::set& pid_set, const std::string& endpoint, - std::string& msg); // NOLINT + base::Status AddReplica(const std::string& name, const std::set& pid_set, const std::string& endpoint); bool AddReplicaNS(const std::string& name, const std::vector& endpoint_vec, uint32_t pid, const ::openmldb::nameserver::ZoneInfo& zone_info, const ::openmldb::api::TaskInfo& task_info); - bool DelReplica(const std::string& name, const std::set& pid_set, const std::string& endpoint, - std::string& msg); // NOLINT + base::Status DelReplica(const std::string& name, const std::set& pid_set, const std::string& endpoint); bool ConfSet(const std::string& key, const std::string& value, std::string& msg); // NOLINT @@ -161,20 +158,19 @@ class NsClient : public Client { bool ConfGet(const std::string& key, std::map& conf_map, // NOLINT std::string& msg); // NOLINT - bool ChangeLeader(const std::string& name, uint32_t pid, - std::string& candidate_leader, // NOLINT - std::string& msg); // NOLINT + base::Status ChangeLeader(const std::string& name, uint32_t pid, + std::string& candidate_leader); // NOLINT bool OfflineEndpoint(const std::string& endpoint, uint32_t concurrency, std::string& msg); // NOLINT - bool Migrate(const std::string& src_endpoint, const std::string& name, const std::set& pid_set, - const std::string& des_endpoint, std::string& msg); // NOLINT + base::Status Migrate(const std::string& src_endpoint, const std::string& name, const std::set& pid_set, + const std::string& des_endpoint); bool RecoverEndpoint(const std::string& endpoint, bool need_restore, uint32_t concurrency, std::string& msg); // NOLINT - bool RecoverTable(const std::string& name, uint32_t pid, const std::string& endpoint, std::string& msg); // NOLINT + base::Status RecoverTable(const std::string& name, uint32_t pid, const std::string& endpoint); bool ConnectZK(std::string& msg); // NOLINT @@ -187,10 +183,8 @@ class NsClient : public Client { ::openmldb::nameserver::TablePartition& table_partition, // NOLINT std::string& msg); // NOLINT - bool UpdateTableAliveStatus(const std::string& endpoint, - std::string& name, // NOLINT - uint32_t pid, bool is_alive, - std::string& msg); // NOLINT + base::Status UpdateTableAliveStatus(const std::string& endpoint, const std::string& name, uint32_t pid, + bool is_alive); bool UpdateTTL(const std::string& name, const ::openmldb::type::TTLType& type, uint64_t abs_ttl, uint64_t lat_ttl, const std::string& ts_name, std::string& msg); // NOLINT diff --git a/src/client/tablet_client.cc b/src/client/tablet_client.cc index 2049279d17f..878d2a5f3cc 100644 --- a/src/client/tablet_client.cc +++ b/src/client/tablet_client.cc @@ -335,12 +335,13 @@ bool TabletClient::SendSnapshot(uint32_t tid, uint32_t remote_tid, uint32_t pid, return false; } -bool TabletClient::LoadTable(const std::string& name, uint32_t id, uint32_t pid, uint64_t ttl, uint32_t seg_cnt) { +base::Status TabletClient::LoadTable(const std::string& name, uint32_t id, uint32_t pid, uint64_t ttl, + uint32_t seg_cnt) { return LoadTable(name, id, pid, ttl, false, seg_cnt); } -bool TabletClient::LoadTable(const std::string& name, uint32_t tid, uint32_t pid, uint64_t ttl, bool leader, - uint32_t seg_cnt, std::shared_ptr task_info) { +base::Status TabletClient::LoadTable(const std::string& name, uint32_t tid, uint32_t pid, uint64_t ttl, bool leader, + uint32_t seg_cnt, std::shared_ptr task_info) { ::openmldb::api::TableMeta table_meta; table_meta.set_name(name); table_meta.set_tid(tid); @@ -351,10 +352,11 @@ bool TabletClient::LoadTable(const std::string& name, uint32_t tid, uint32_t pid } else { table_meta.set_mode(::openmldb::api::TableMode::kTableFollower); } - return LoadTable(table_meta, task_info); + return LoadTableInternal(table_meta, task_info); } -bool TabletClient::LoadTable(const ::openmldb::api::TableMeta& table_meta, std::shared_ptr task_info) { +base::Status TabletClient::LoadTableInternal(const ::openmldb::api::TableMeta& table_meta, + std::shared_ptr task_info) { ::openmldb::api::LoadTableRequest request; ::openmldb::api::TableMeta* cur_table_meta = request.mutable_table_meta(); cur_table_meta->CopyFrom(table_meta); @@ -362,28 +364,21 @@ bool TabletClient::LoadTable(const ::openmldb::api::TableMeta& table_meta, std:: request.mutable_task_info()->CopyFrom(*task_info); } ::openmldb::api::GeneralResponse response; - bool ok = client_.SendRequest(&::openmldb::api::TabletServer_Stub::LoadTable, &request, &response, - FLAGS_request_timeout_ms, 1); - if (ok && response.code() == 0) { - return true; + auto st = client_.SendRequestSt(&::openmldb::api::TabletServer_Stub::LoadTable, &request, &response, + FLAGS_request_timeout_ms, 1); + if (st.OK()) { + return {response.code(), response.msg()}; } - return false; + return st; } -bool TabletClient::LoadTable(uint32_t tid, uint32_t pid, std::string* msg) { - ::openmldb::api::LoadTableRequest request; - ::openmldb::api::TableMeta* table_meta = request.mutable_table_meta(); - table_meta->set_tid(tid); - table_meta->set_pid(pid); - table_meta->set_mode(::openmldb::api::TableMode::kTableLeader); - ::openmldb::api::GeneralResponse response; - bool ok = client_.SendRequest(&::openmldb::api::TabletServer_Stub::LoadTable, &request, &response, - FLAGS_request_timeout_ms, 1); - msg->swap(*response.mutable_msg()); - if (ok && response.code() == 0) { - return true; +bool TabletClient::LoadTable(const ::openmldb::api::TableMeta& table_meta, std::shared_ptr task_info) { + auto st = LoadTableInternal(table_meta, task_info); + // can't return msg, log here + if (!st.OK()) { + LOG(WARNING) << st.ToString(); } - return false; + return st.OK(); } bool TabletClient::ChangeRole(uint32_t tid, uint32_t pid, bool leader, uint64_t term) { @@ -513,37 +508,36 @@ bool TabletClient::GetManifest(uint32_t tid, uint32_t pid, ::openmldb::common::S return true; } -bool TabletClient::GetTableStatus(::openmldb::api::GetTableStatusResponse& response) { +base::Status TabletClient::GetTableStatus(::openmldb::api::GetTableStatusResponse& response) { ::openmldb::api::GetTableStatusRequest request; - bool ret = client_.SendRequest(&::openmldb::api::TabletServer_Stub::GetTableStatus, &request, &response, + auto st = client_.SendRequestSt(&::openmldb::api::TabletServer_Stub::GetTableStatus, &request, &response, FLAGS_request_timeout_ms, 1); - if (ret) { - return true; + if (st.OK()) { + return {response.code(), response.msg()}; } - return false; + return st; } -bool TabletClient::GetTableStatus(uint32_t tid, uint32_t pid, ::openmldb::api::TableStatus& table_status) { +base::Status TabletClient::GetTableStatus(uint32_t tid, uint32_t pid, ::openmldb::api::TableStatus& table_status) { return GetTableStatus(tid, pid, false, table_status); } -bool TabletClient::GetTableStatus(uint32_t tid, uint32_t pid, bool need_schema, +base::Status TabletClient::GetTableStatus(uint32_t tid, uint32_t pid, bool need_schema, ::openmldb::api::TableStatus& table_status) { ::openmldb::api::GetTableStatusRequest request; request.set_tid(tid); request.set_pid(pid); request.set_need_schema(need_schema); ::openmldb::api::GetTableStatusResponse response; - bool ret = client_.SendRequest(&::openmldb::api::TabletServer_Stub::GetTableStatus, &request, &response, + auto st = client_.SendRequestSt(&::openmldb::api::TabletServer_Stub::GetTableStatus, &request, &response, FLAGS_request_timeout_ms, 1); - if (!ret) { - return false; + if (!st.OK()) { + return st; } - if (response.all_table_status_size() > 0) { + if (response.code() == 0 && response.all_table_status_size() > 0) { table_status = response.all_table_status(0); - return true; } - return false; + return {response.code(), response.msg()}; } std::shared_ptr TabletClient::Scan(uint32_t tid, uint32_t pid, @@ -701,25 +695,27 @@ bool TabletClient::SetExpire(uint32_t tid, uint32_t pid, bool is_expire) { return true; } -bool TabletClient::GetTableFollower(uint32_t tid, uint32_t pid, uint64_t& offset, - std::map& info_map, std::string& msg) { +base::Status TabletClient::GetTableFollower(uint32_t tid, uint32_t pid, uint64_t& offset, + std::map& info_map) { ::openmldb::api::GetTableFollowerRequest request; ::openmldb::api::GetTableFollowerResponse response; request.set_tid(tid); request.set_pid(pid); - bool ok = client_.SendRequest(&::openmldb::api::TabletServer_Stub::GetTableFollower, &request, &response, - FLAGS_request_timeout_ms, 1); - if (response.has_msg()) { - msg = response.msg(); - } - if (!ok || response.code() != 0) { - return false; - } - for (int idx = 0; idx < response.follower_info_size(); idx++) { - info_map.insert(std::make_pair(response.follower_info(idx).endpoint(), response.follower_info(idx).offset())); + auto st = client_.SendRequestSt(&::openmldb::api::TabletServer_Stub::GetTableFollower, &request, &response, + FLAGS_request_timeout_ms, 1); + if (st.OK()) { + if (response.code() == 0) { + offset = response.offset(); + for (int idx = 0; idx < response.follower_info_size(); idx++) { + info_map.insert( + std::make_pair(response.follower_info(idx).endpoint(), response.follower_info(idx).offset())); + } + return {}; + } else { + return {response.code(), response.msg()}; + } } - offset = response.offset(); - return true; + return st; } bool TabletClient::Get(uint32_t tid, uint32_t pid, const std::string& pk, uint64_t time, std::string& value, diff --git a/src/client/tablet_client.h b/src/client/tablet_client.h index ec3ab346cc7..9fee8e08392 100644 --- a/src/client/tablet_client.h +++ b/src/client/tablet_client.h @@ -39,7 +39,6 @@ namespace sdk { class SQLRequestRowBatch; } // namespace sdk - namespace client { using ::openmldb::api::TaskInfo; const uint32_t INVALID_REMOTE_TID = UINT32_MAX; @@ -83,7 +82,8 @@ class TabletClient : public Client { bool Get(uint32_t tid, uint32_t pid, const std::string& pk, uint64_t time, std::string& value, // NOLINT uint64_t& ts, // NOLINT - std::string& msg); ; // NOLINT + std::string& msg); + ; // NOLINT bool Get(uint32_t tid, uint32_t pid, const std::string& pk, uint64_t time, const std::string& idx_name, std::string& value, // NOLINT @@ -94,21 +94,20 @@ class TabletClient : public Client { std::string& msg); // NOLINT base::Status Delete(uint32_t tid, uint32_t pid, const std::map& index_val, - const std::string& ts_name, const std::optional start_ts, const std::optional& end_ts); + const std::string& ts_name, const std::optional start_ts, + const std::optional& end_ts); bool Count(uint32_t tid, uint32_t pid, const std::string& pk, const std::string& idx_name, bool filter_expired_data, uint64_t& value, std::string& msg); // NOLINT + std::shared_ptr Scan(uint32_t tid, uint32_t pid, const std::string& pk, + const std::string& idx_name, uint64_t stime, uint64_t etime, + uint32_t limit, uint32_t skip_record_num, + std::string& msg); // NOLINT - std::shared_ptr Scan(uint32_t tid, uint32_t pid, - const std::string& pk, const std::string& idx_name, - uint64_t stime, uint64_t etime, - uint32_t limit, uint32_t skip_record_num, std::string& msg); // NOLINT - - std::shared_ptr Scan(uint32_t tid, uint32_t pid, - const std::string& pk, const std::string& idx_name, - uint64_t stime, uint64_t etime, - uint32_t limit, std::string& msg); // NOLINT + std::shared_ptr Scan(uint32_t tid, uint32_t pid, const std::string& pk, + const std::string& idx_name, uint64_t stime, uint64_t etime, + uint32_t limit, std::string& msg); // NOLINT bool Scan(const ::openmldb::api::ScanRequest& request, brpc::Controller* cntl, ::openmldb::api::ScanResponse* response); @@ -140,15 +139,14 @@ class TabletClient : public Client { bool RecoverSnapshot(uint32_t tid, uint32_t pid, std::shared_ptr task_info = std::shared_ptr()); - bool LoadTable(const std::string& name, uint32_t id, uint32_t pid, uint64_t ttl, uint32_t seg_cnt); + base::Status LoadTable(const std::string& name, uint32_t id, uint32_t pid, uint64_t ttl, uint32_t seg_cnt); - bool LoadTable(const std::string& name, uint32_t id, uint32_t pid, uint64_t ttl, bool leader, uint32_t seg_cnt, - std::shared_ptr task_info = std::shared_ptr()); + base::Status LoadTable(const std::string& name, uint32_t id, uint32_t pid, uint64_t ttl, bool leader, + uint32_t seg_cnt, std::shared_ptr task_info = std::shared_ptr()); + // for ns WrapTaskFun, must return bool bool LoadTable(const ::openmldb::api::TableMeta& table_meta, std::shared_ptr task_info); - bool LoadTable(uint32_t tid, uint32_t pid, std::string* msg); - bool ChangeRole(uint32_t tid, uint32_t pid, bool leader, uint64_t term); bool ChangeRole(uint32_t tid, uint32_t pid, bool leader, const std::vector& endpoints, uint64_t term, @@ -165,26 +163,25 @@ class TabletClient : public Client { bool GetTermPair(uint32_t tid, uint32_t pid, ::openmldb::common::StorageMode storage_mode, // NOLINT - uint64_t& term, // NOLINT - uint64_t& offset, bool& has_table, // NOLINT - bool& is_leader); // NOLINT + uint64_t& term, // NOLINT + uint64_t& offset, bool& has_table, // NOLINT + bool& is_leader); // NOLINT bool GetManifest(uint32_t tid, uint32_t pid, ::openmldb::common::StorageMode storage_mode, ::openmldb::api::Manifest& manifest); // NOLINT - bool GetTableStatus(::openmldb::api::GetTableStatusResponse& response); // NOLINT - bool GetTableStatus(uint32_t tid, uint32_t pid, - ::openmldb::api::TableStatus& table_status); // NOLINT - bool GetTableStatus(uint32_t tid, uint32_t pid, bool need_schema, - ::openmldb::api::TableStatus& table_status); // NOLINT + base::Status GetTableStatus(::openmldb::api::GetTableStatusResponse& response); // NOLINT + base::Status GetTableStatus(uint32_t tid, uint32_t pid, + ::openmldb::api::TableStatus& table_status); // NOLINT + base::Status GetTableStatus(uint32_t tid, uint32_t pid, bool need_schema, + ::openmldb::api::TableStatus& table_status); // NOLINT bool FollowOfNoOne(uint32_t tid, uint32_t pid, uint64_t term, uint64_t& offset); // NOLINT - bool GetTableFollower(uint32_t tid, uint32_t pid, - uint64_t& offset, // NOLINT - std::map& info_map, // NOLINT - std::string& msg); // NOLINT + base::Status GetTableFollower(uint32_t tid, uint32_t pid, + uint64_t& offset, // NOLINT + std::map& info_map); // NOLINT bool GetAllSnapshotOffset(std::map>& tid_pid_offset); // NOLINT @@ -193,8 +190,9 @@ class TabletClient : public Client { bool DisConnectZK(); std::shared_ptr Traverse(uint32_t tid, uint32_t pid, - const std::string& idx_name, const std::string& pk, uint64_t ts, - uint32_t limit, bool skip_current_pk, uint32_t ts_pos, uint32_t& count); // NOLINT + const std::string& idx_name, const std::string& pk, + uint64_t ts, uint32_t limit, bool skip_current_pk, + uint32_t ts_pos, uint32_t& count); // NOLINT bool SetMode(bool mode); @@ -203,9 +201,8 @@ class TabletClient : public Client { bool AddIndex(uint32_t tid, uint32_t pid, const ::openmldb::common::ColumnKey& column_key, std::shared_ptr task_info); - bool AddMultiIndex(uint32_t tid, uint32_t pid, - const std::vector<::openmldb::common::ColumnKey>& column_keys, - std::shared_ptr task_info); + bool AddMultiIndex(uint32_t tid, uint32_t pid, const std::vector<::openmldb::common::ColumnKey>& column_keys, + std::shared_ptr task_info); bool GetCatalog(uint64_t* version); @@ -215,8 +212,7 @@ class TabletClient : public Client { bool LoadIndexData(uint32_t tid, uint32_t pid, uint32_t partition_num, std::shared_ptr task_info); bool ExtractIndexData(uint32_t tid, uint32_t pid, uint32_t partition_num, - const std::vector<::openmldb::common::ColumnKey>& column_key, - uint64_t offset, bool dump_data, + const std::vector<::openmldb::common::ColumnKey>& column_key, uint64_t offset, bool dump_data, std::shared_ptr task_info); bool CancelOP(const uint64_t op_id); @@ -235,9 +231,9 @@ class TabletClient : public Client { uint64_t timeout_ms); base::Status CallSQLBatchRequestProcedure(const std::string& db, const std::string& sp_name, - const base::Slice& meta, const base::Slice& data, - bool is_debug, uint64_t timeout_ms, - brpc::Controller* cntl, openmldb::api::SQLBatchRequestQueryResponse* response); + const base::Slice& meta, const base::Slice& data, bool is_debug, + uint64_t timeout_ms, brpc::Controller* cntl, + openmldb::api::SQLBatchRequestQueryResponse* response); bool DropProcedure(const std::string& db_name, const std::string& sp_name); @@ -261,17 +257,19 @@ class TabletClient : public Client { uint64_t timeout_ms, openmldb::RpcCallback* callback); - base::Status CallSQLBatchRequestProcedure(const std::string& db, const std::string& sp_name, - const base::Slice& meta, const base::Slice& data, - bool is_debug, uint64_t timeout_ms, - openmldb::RpcCallback* callback); + base::Status CallSQLBatchRequestProcedure( + const std::string& db, const std::string& sp_name, const base::Slice& meta, const base::Slice& data, + bool is_debug, uint64_t timeout_ms, + openmldb::RpcCallback* callback); - bool CreateAggregator(const ::openmldb::api::TableMeta& base_table_meta, - uint32_t aggr_tid, uint32_t aggr_pid, uint32_t index_pos, - const ::openmldb::base::LongWindowInfo& window_info); + bool CreateAggregator(const ::openmldb::api::TableMeta& base_table_meta, uint32_t aggr_tid, uint32_t aggr_pid, + uint32_t index_pos, const ::openmldb::base::LongWindowInfo& window_info); bool GetAndFlushDeployStats(::openmldb::api::DeployStatsResponse* res); + private: + base::Status LoadTableInternal(const ::openmldb::api::TableMeta& table_meta, std::shared_ptr task_info); + private: ::openmldb::RpcClient<::openmldb::api::TabletServer_Stub> client_; }; diff --git a/src/cmd/openmldb.cc b/src/cmd/openmldb.cc index 328f4ff342b..b4d12210cdf 100644 --- a/src/cmd/openmldb.cc +++ b/src/cmd/openmldb.cc @@ -476,7 +476,8 @@ void HandleNSClientSetTTL(const std::vector& parts, ::openmldb::cli bool ok = client->UpdateTTL(parts[1], type, abs_ttl, lat_ttl, index_name, err); if (ok) { std::cout << "Set ttl ok ! Note that, " - "it will take effect after two garbage collection intervals (i.e. gc_interval)." << std::endl; + "it will take effect after two garbage collection intervals (i.e. gc_interval)." + << std::endl; } else { std::cout << "Set ttl failed! " << err << std::endl; } @@ -491,17 +492,16 @@ void HandleNSClientCancelOP(const std::vector& parts, ::openmldb::c return; } try { - std::string err; if (boost::lexical_cast(parts[1]) <= 0) { std::cout << "Invalid args. op_id should be large than zero" << std::endl; return; } uint64_t op_id = boost::lexical_cast(parts[1]); - bool ok = client->CancelOP(op_id, err); - if (ok) { - std::cout << "Cancel op ok!" << std::endl; + auto st = client->CancelOP(op_id); + if (st.OK()) { + std::cout << "Cancel op ok" << std::endl; } else { - std::cout << "Cancel op failed! " << err << std::endl; + std::cout << "Cancel op failed, error msg: " << st.ToString() << std::endl; } } catch (std::exception const& e) { std::cout << "Invalid args. op_id should be uint64_t" << std::endl; @@ -693,10 +693,10 @@ void HandleNSAddReplica(const std::vector& parts, ::openmldb::clien std::cout << "has not valid pid" << std::endl; return; } - std::string msg; - bool ok = client->AddReplica(parts[1], pid_set, parts[3], msg); - if (!ok) { - std::cout << "Fail to addreplica. error msg:" << msg << std::endl; + + auto st = client->AddReplica(parts[1], pid_set, parts[3]); + if (!st.OK()) { + std::cout << "Fail to addreplica. error msg:" << st.GetMsg() << std::endl; return; } std::cout << "AddReplica ok" << std::endl; @@ -716,10 +716,9 @@ void HandleNSDelReplica(const std::vector& parts, ::openmldb::clien std::cout << "has not valid pid" << std::endl; return; } - std::string msg; - bool ok = client->DelReplica(parts[1], pid_set, parts[3], msg); - if (!ok) { - std::cout << "Fail to delreplica. error msg:" << msg << std::endl; + auto st = client->DelReplica(parts[1], pid_set, parts[3]); + if (!st.OK()) { + std::cout << "Fail to delreplica. error msg:" << st.GetMsg() << std::endl; return; } std::cout << "DelReplica ok" << std::endl; @@ -896,17 +895,17 @@ void HandleNSClientChangeLeader(const std::vector& parts, ::openmld if (parts.size() > 3) { candidate_leader = parts[3]; } - bool ret = client->ChangeLeader(parts[1], pid, candidate_leader, msg); - if (!ret) { - std::cout << "failed to change leader. error msg: " << msg << std::endl; + auto st = client->ChangeLeader(parts[1], pid, candidate_leader); + if (!st.OK()) { + std::cout << "failed to change leader. error msg: " << st.GetMsg() << std::endl; return; } } catch (const std::exception& e) { std::cout << "Invalid args. pid should be uint32_t" << std::endl; return; } - std::cout << "change leader ok. " - "If there are writing operations while changing a leader, it may cause data loss." << std::endl; + std::cout << "change leader ok. If there are writing operations while changing a leader, it may cause data loss." + << std::endl; } void HandleNSClientOfflineEndpoint(const std::vector& parts, ::openmldb::client::NsClient* client) { @@ -957,9 +956,9 @@ void HandleNSClientMigrate(const std::vector& parts, ::openmldb::cl std::cout << "has not valid pid" << std::endl; return; } - bool ret = client->Migrate(parts[1], parts[2], pid_set, parts[4], msg); - if (!ret) { - std::cout << "failed to migrate partition. error msg: " << msg << std::endl; + auto st = client->Migrate(parts[1], parts[2], pid_set, parts[4]); + if (!st.OK()) { + std::cout << "failed to migrate partition. error msg: " << st.GetMsg() << std::endl; return; } std::cout << "partition migrate ok" << std::endl; @@ -1012,10 +1011,9 @@ void HandleNSClientRecoverTable(const std::vector& parts, ::openmld } try { uint32_t pid = boost::lexical_cast(parts[2]); - std::string msg; - bool ok = client->RecoverTable(parts[1], pid, parts[3], msg); - if (!ok) { - std::cout << "Fail to recover table. error msg:" << msg << std::endl; + auto st = client->RecoverTable(parts[1], pid, parts[3]); + if (!st.OK()) { + std::cout << "Fail to recover table. error msg:" << st.GetMsg() << std::endl; return; } std::cout << "recover table ok" << std::endl; @@ -2671,9 +2669,9 @@ void HandleNSClientUpdateTableAlive(const std::vector& parts, ::ope return; } } - std::string msg; - if (!client->UpdateTableAliveStatus(endpoint, name, pid, is_alive, msg)) { - std::cout << "Fail to update table alive. error msg: " << msg << std::endl; + + if (auto st = client->UpdateTableAliveStatus(endpoint, name, pid, is_alive); !st.OK()) { + std::cout << "Fail to update table alive. error msg: " << st.GetMsg() << std::endl; return; } std::cout << "update ok" << std::endl; @@ -3085,19 +3083,20 @@ void HandleClientGetTableStatus(const std::vector parts, ::openmldb if (parts.size() == 3) { ::openmldb::api::TableStatus table_status; try { - if (client->GetTableStatus(boost::lexical_cast(parts[1]), boost::lexical_cast(parts[2]), - table_status)) { + if (auto st = client->GetTableStatus(boost::lexical_cast(parts[1]), + boost::lexical_cast(parts[2]), table_status); + st.OK()) { status_vec.push_back(table_status); } else { - std::cout << "gettablestatus failed" << std::endl; + std::cout << "gettablestatus failed, error msg: " << st.GetMsg() << std::endl; } } catch (boost::bad_lexical_cast& e) { std::cout << "Bad gettablestatus format" << std::endl; } } else if (parts.size() == 1) { ::openmldb::api::GetTableStatusResponse response; - if (!client->GetTableStatus(response)) { - std::cout << "gettablestatus failed" << std::endl; + if (auto st = client->GetTableStatus(response); !st.OK()) { + std::cout << "gettablestatus failed, error msg: " << st.GetMsg() << std::endl; return; } for (int idx = 0; idx < response.all_table_status_size(); idx++) { @@ -3202,12 +3201,13 @@ void HandleClientLoadTable(const std::vector parts, ::openmldb::cli return; } } - bool ok = client->LoadTable(parts[1], boost::lexical_cast(parts[2]), + // TODO(): get status msg + auto st = client->LoadTable(parts[1], boost::lexical_cast(parts[2]), boost::lexical_cast(parts[3]), ttl, is_leader, seg_cnt); - if (ok) { + if (st.OK()) { std::cout << "LoadTable ok" << std::endl; } else { - std::cout << "Fail to LoadTable" << std::endl; + std::cout << "Fail to LoadTable: " << st.ToString() << std::endl; } } catch (boost::bad_lexical_cast& e) { std::cout << "Bad LoadTable format" << std::endl; @@ -3278,8 +3278,8 @@ void HandleClientPreview(const std::vector& parts, ::openmldb::clie return; } ::openmldb::api::TableStatus table_status; - if (!client->GetTableStatus(tid, pid, true, table_status)) { - std::cout << "Fail to get table status" << std::endl; + if (auto st = client->GetTableStatus(tid, pid, true, table_status); !st.OK()) { + std::cout << "Fail to get table status, error msg: " << st.GetMsg() << std::endl; return; } /*std::string schema = table_status.schema(); @@ -3369,9 +3369,8 @@ void HandleClientGetFollower(const std::vector& parts, ::openmldb:: } std::map info_map; uint64_t offset = 0; - std::string msg; - if (!client->GetTableFollower(tid, pid, offset, info_map, msg)) { - std::cout << "get failed. msg: " << msg << std::endl; + if (auto st = client->GetTableFollower(tid, pid, offset, info_map); !st.OK()) { + std::cout << "get failed, error msg: " << st.GetMsg() << std::endl; return; } std::vector header; diff --git a/src/datacollector/data_collector.cc b/src/datacollector/data_collector.cc index cb1a8f254e2..1af941226cf 100644 --- a/src/datacollector/data_collector.cc +++ b/src/datacollector/data_collector.cc @@ -258,8 +258,8 @@ void DataCollectorImpl::CreateTaskEnv(const datasync::AddSyncTaskRequest* reques } auto tablet_client = tablet_client_map_[tablet_endpoint]; api::TableStatus table_status; - if (!tablet_client->GetTableStatus(tid, pid, table_status)) { - SET_RESP_AND_WARN(response, -1, "get table status from tablet server failed, maybe table doesn't exist"); + if (auto st = tablet_client->GetTableStatus(tid, pid, table_status); !st.OK()) { + SET_RESP_AND_WARN(response, -1, "get table status from tablet server failed, maybe table doesn't exist: " + st.GetMsg()); return; } if (!ValidateTableStatus(table_status)) { diff --git a/src/nameserver/name_server_impl.cc b/src/nameserver/name_server_impl.cc index 06912ad9736..d9ce3aff439 100644 --- a/src/nameserver/name_server_impl.cc +++ b/src/nameserver/name_server_impl.cc @@ -5072,8 +5072,8 @@ void NameServerImpl::UpdateTableStatus() { pos_response.reserve(16); for (const auto& kv : tablet_ptr_map) { ::openmldb::api::GetTableStatusResponse tablet_status_response; - if (!kv.second->client_->GetTableStatus(tablet_status_response)) { - PDLOG(WARNING, "get table status failed! endpoint[%s]", kv.first.c_str()); + if (auto st = kv.second->client_->GetTableStatus(tablet_status_response); !st.OK()) { + PDLOG(WARNING, "get table status failed! endpoint[%s], %s", kv.first.c_str(), st.GetMsg()); continue; } for (int pos = 0; pos < tablet_status_response.all_table_status_size(); pos++) { diff --git a/src/replica/snapshot_replica_test.cc b/src/replica/snapshot_replica_test.cc index a9302050142..05e9a9d01da 100644 --- a/src/replica/snapshot_replica_test.cc +++ b/src/replica/snapshot_replica_test.cc @@ -93,7 +93,8 @@ TEST_P(SnapshotReplicaTest, AddReplicate) { sleep(1); ::openmldb::api::TableStatus table_status; - ASSERT_TRUE(client.GetTableStatus(tid, pid, table_status)); + auto st = client.GetTableStatus(tid, pid, table_status); + ASSERT_TRUE(st.OK()) << st.ToString(); ASSERT_EQ(::openmldb::api::kTableNormal, table_status.state()); ret = client.DelReplica(tid, pid, end_point); diff --git a/src/sdk/sql_cluster_router.cc b/src/sdk/sql_cluster_router.cc index d838870d65b..1a55e94fb2e 100644 --- a/src/sdk/sql_cluster_router.cc +++ b/src/sdk/sql_cluster_router.cc @@ -4370,10 +4370,12 @@ std::shared_ptr SQLClusterRouter::ExecuteShowTableStat std::shared_ptr tablet_client; if (tablet_accessor && (tablet_client = tablet_accessor->GetClient())) { ::openmldb::api::GetTableStatusResponse response; - if (tablet_client->GetTableStatus(response)) { + if (auto st = tablet_client->GetTableStatus(response); st.OK()) { for (const auto& table_status : response.all_table_status()) { table_statuses[table_status.tid()][table_status.pid()][tablet_client->GetEndpoint()] = table_status; } + } else { + LOG(WARNING) << "get table status from tablet failed: " << st.GetMsg(); } } } @@ -4503,14 +4505,18 @@ bool SQLClusterRouter::CheckTableStatus(const std::string& db, const std::string if (tablet_accessor && (tablet_client = tablet_accessor->GetClient())) { uint64_t offset = 0; std::map info_map; - std::string msg; - tablet_client->GetTableFollower(tid, pid, offset, info_map, msg); - for (auto& meta : partition_info.partition_meta()) { - if (meta.is_leader()) continue; + auto st = tablet_client->GetTableFollower(tid, pid, offset, info_map); + // no followers is fine if replicanum == 1 + if (st.OK() || st.GetCode() == ::openmldb::base::ReturnCode::kNoFollower) { + for (auto& meta : partition_info.partition_meta()) { + if (meta.is_leader()) continue; - if (info_map.count(meta.endpoint()) == 0) { - append_error_msg(error_msg, pid, false, meta.endpoint(), "not connected to leader"); + if (info_map.count(meta.endpoint()) == 0) { + append_error_msg(error_msg, pid, false, meta.endpoint(), "not connected to leader"); + } } + } else { + append_error_msg(error_msg, pid, -1, "", absl::StrCat("fail to get from tablet: ", st.GetMsg())); } } diff --git a/src/tablet/tablet_impl.cc b/src/tablet/tablet_impl.cc index 4b30036465c..2c506be510f 100644 --- a/src/tablet/tablet_impl.cc +++ b/src/tablet/tablet_impl.cc @@ -2968,10 +2968,7 @@ void TabletImpl::LoadTable(RpcController* controller, const ::openmldb::api::Loa std::string db_path = GetDBPath(root_path, tid, pid); if (!::openmldb::base::IsExists(db_path)) { - PDLOG(WARNING, "table db path does not exist. tid %u, pid %u, path %s", tid, pid, db_path.c_str()); - response->set_code(::openmldb::base::ReturnCode::kTableDbPathIsNotExist); - response->set_msg("table db path does not exist"); - break; + PDLOG(WARNING, "table db path does not exist, but still load. tid %u, pid %u, path %s", tid, pid, db_path.c_str()); } std::shared_ptr table = GetTable(tid, pid); diff --git a/tools/openmldb_ops.py b/tools/openmldb_ops.py index 543c0bbfbf9..f3069254a65 100644 --- a/tools/openmldb_ops.py +++ b/tools/openmldb_ops.py @@ -615,8 +615,9 @@ def PrettyPrint(data, header = None): sys.exit() executor = Executor(options.openmldb_bin_path, options.zk_cluster, options.zk_root_path) - if not executor.Connect().OK(): - log.error("connect OpenMLDB failed") + st = executor.Connect() + if not st.OK(): + log.error("connect OpenMLDB failed, {}".format(st.GetMsg())) sys.exit() if options.cmd in manage_ops: status, auto_failover = executor.GetAutofailover() diff --git a/tools/tool.py b/tools/tool.py index cff6eb1db98..98876b2cc3a 100644 --- a/tools/tool.py +++ b/tools/tool.py @@ -85,7 +85,7 @@ def Connect(self): cmd.append("--cmd=showns") status, output = self.RunWithRetuncode(cmd) if not status.OK() or status.GetMsg().find("zk client init failed") != -1: - return Status(-1, "get ns failed"), None + return Status(-1, "get ns failed") result = self.ParseResult(output) for record in result: if record[2] == "leader": @@ -98,7 +98,7 @@ def Connect(self): cmd.append("--cmd=showtablet") status, output = self.RunWithRetuncode(cmd) if not status.OK(): - return Status(-1, "get tablet failed"), None + return Status(-1, "get tablet failed") result = self.ParseResult(output) for record in result: if record[1] != '-': @@ -119,12 +119,13 @@ def RunWithRetuncode(self, command, useshell = USE_SHELL, env = os.environ): try: + log.info(" ".join(command)) p = subprocess.Popen(command, stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = useshell, universal_newlines = universal_newlines, env = env) - output = p.stdout.read() - p.wait() - errout = p.stderr.read() - p.stdout.close() - p.stderr.close() + output, errout = p.communicate() + # TODO(hw): the print from ns/tablet client are not standard, print it for debug + if output != "": + log.info(output) + # errout has glog output, don't print it if "error msg" in output: return Status(-1, output), output return Status(p.returncode, errout), output @@ -167,7 +168,7 @@ def GetAutofailover(self): return status, None if output.find("true") != -1: return Status(), True - return Status(), False; + return Status(), False def SetAutofailover(self, value): cmd = list(self.ns_base_cmd)