diff --git a/Makefile.am b/Makefile.am
index 1450d8a..9a7de19 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -5,12 +5,14 @@ if HAVE_UCG
-SUBDIRS = base builtin hicoll
+SUBDIRS = base builtin hicoll secure
+
+UCG_VERSION=2:0:1
 
 lib_LTLIBRARIES     = libucg.la
 libucg_la_CFLAGS    = $(BASE_CFLAGS)
 libucg_la_CPPFLAGS  = $(BASE_CPPFLAGS)
-libucg_la_LDFLAGS   = -ldl -version-info $(SOVERSION)
+libucg_la_LDFLAGS   = -ldl -version-info $(UCG_VERSION)
 libucg_ladir        = $(includedir)/ucg
 libucg_la_SOURCES   =
 libucg_la_LIBADD    = \
@@ -18,7 +20,8 @@ libucg_la_LIBADD = \
     ../uct/libuct.la \
     ../ucp/libucp.la \
     base/libucg_base.la \
-    builtin/libucg_builtin.la
+    builtin/libucg_builtin.la \
+    secure/libucg_secure.la
 
 nobase_dist_libucg_la_HEADERS = \
     api/ucg_def.h \
diff --git a/api/ucg.h b/api/ucg.h
index fbe2f0a..ba96eb5 100644
--- a/api/ucg.h
+++ b/api/ucg.h
@@ -1,6 +1,6 @@
 /*
- * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED.
- * See file LICENSE for terms.
+ * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. All rights reserved.
+ * Description: UCG common header file
 */
 
 #ifndef UCG_H_
@@ -78,6 +78,7 @@ enum ucg_collective_modifiers {
     UCG_GROUP_COLLECTIVE_MODIFIER_ALLTOALL         = UCS_BIT(13), /* MPI_ALLTOALL */
     UCG_GROUP_COLLECTIVE_MODIFIER_ALLGATHER        = UCS_BIT(14), /* MPI_ALLGATHER */
+    UCG_GROUP_COLLECTIVE_MODIFIER_ALLTOALLV        = UCS_BIT(15), /* MPI_ALLTOALLV */
 
     UCG_GROUP_COLLECTIVE_MODIFIER_MASK             = UCS_MASK(16)
 };
@@ -87,6 +88,18 @@
 typedef struct ucg_collective_type {
     ucg_group_member_index_t root :48;
 } ucg_collective_type_t;
 
+typedef enum {
+    COLL_TYPE_BARRIER,
+    COLL_TYPE_BCAST,
+    COLL_TYPE_ALLREDUCE,
+    COLL_TYPE_ALLTOALLV,
+    /*
+     * Only collective operations that are already
+     * supported should be added above.
+     */
+    COLL_TYPE_NUMS
+} coll_type_t;
+
 enum UCS_S_PACKED ucg_group_member_distance {
     UCG_GROUP_MEMBER_DISTANCE_SELF = 0,
     UCG_GROUP_MEMBER_DISTANCE_L3CACHE,
@@ -105,33 +118,48 @@ enum UCS_S_PACKED ucg_group_hierarchy_level {
 typedef int (*dt_convert_f)(void *dt_ext, ucp_datatype_t *ucp_datatype);
 typedef ptrdiff_t (*dt_span_f)(void *dt_ext, int count, ptrdiff_t *gap);
 
+typedef struct inc_params {
+    uint16_t comm_id;             /* INC comm id */
+    uint8_t  switch_info_got;     /* indicates whether the switch supports INC under the current parameters */
+    uint8_t  feature_used;        /* indicates whether the current collective operation is supported */
+    uint32_t spine_select;        /* selected spine IP in 2-layer networking */
+    uint8_t  coll_operation_type; /* supported collective operation */
+    uint16_t data_operation_type; /* supported allreduce operation type */
+    uint16_t data_type;           /* supported data type */
+    uint16_t max_data_size;       /* max data size in INC without padding */
+    int      node_under_tor;      /* number of nodes/sockets under the ToR */
+    unsigned header_under_tor;    /* for now, the minimum rank under the ToR */
+    uint8_t  req_id;              /* the Nth collective operation in INC, 1-255, must be continuously incremented */
+    /* rank id in MPI_COMM_WORLD; together with job_id, comm_id and cid it uniquely identifies a task and communication */
+    int      world_rank;
+    unsigned ppn;
+} inc_params_t;
+
+typedef enum ucg_group_member_distance (*rank_dist_f)(void *comm, int rank1, int rank2);
+
+typedef struct {
+    uint16_t ppn_local;                  /* number of processes on my node */
+    uint16_t pps_local;                  /* number of processes on my socket */
+    uint16_t ppn_max;                    /* max number of processes per node over all nodes */
+    uint16_t node_nums;
+    uint16_t ppn_unbalance : 1;
+    uint16_t pps_unbalance : 1;
+    uint16_t nrank_uncontinue : 1;
+    uint16_t srank_uncontinue : 1;
+    uint16_t bind_to_none : 1;
+    uint16_t reserved : 7;
+    uint16_t rank_continuous_in_node : 1;
+    uint16_t rank_continuous_in_sock : 1;
+    uint16_t rank_balance_in_node : 1;
+    uint16_t rank_balance_in_sock : 1;
+} ucg_topo_args_t;
+
 typedef struct ucg_group_params {
     ucg_group_member_index_t member_count; /* number of group members */
+    ucg_group_member_index_t member_index; /* My member index within the group */
     uint32_t cid;                          /* Assign value to group_id */
-    char **topo_map; /* Global topology map, topo_map[i][j] means Distance between rank i and rank j. */
-
-    /*
-     * This array contains information about the process placement of different
-     * group members, which is used to select the best topology for collectives.
-     *
-     *
-     * For example, for 2 nodes, 3 sockets each, 4 cores per socket, each member
-     * should be passed the distance array contents as follows:
-     *   1st group member distance array:  0111222222223333333333333333
-     *   2nd group member distance array:  1011222222223333333333333333
-     *   3rd group member distance array:  1101222222223333333333333333
-     *   4th group member distance array:  1110222222223333333333333333
-     *   5th group member distance array:  2222011122223333333333333333
-     *   6th group member distance array:  2222101122223333333333333333
-     *   7th group member distance array:  2222110122223333333333333333
-     *   8th group member distance array:  2222111022223333333333333333
-     *   ...
-     *   12th group member distance array: 3333333333333333011122222222
-     *   13th group member distance array: 3333333333333333101122222222
-     *   ...
-     */
-    enum ucg_group_member_distance *distance;
+    ucg_topo_args_t topo_args;
 
     /* node index */
     uint16_t *node_index;
@@ -156,14 +184,19 @@ typedef struct ucg_group_params {
     /* Callback function for get rank in MPI_COMM_WORLD */
     ucg_group_member_index_t (*mpi_global_idx_f) (void *cb_group_obj,
                                                   ucg_group_member_index_t index);
-
+    rank_dist_f mpi_rank_distance;
     dt_span_f mpi_datatype_span;
+
+    int (*get_operate_param_f)(void *mpi_op, void *mpi_dt, int *op, int *dt);
+
+    /* INC params */
+    inc_params_t inc_param;
 
     char is_socket_balance;
 } ucg_group_params_t;
 
 typedef struct ucg_collective {
     ucg_collective_type_t type;        /* the type (and root) of the collective */
-    ucg_hash_index_t plan_cache_index; /* the index of collective type in plan cache. */
+    coll_type_t coll_type;
 
     struct {
         void *buf;                     /* buffer location to use */
@@ -260,7 +293,13 @@ unsigned ucg_worker_progress(ucg_worker_h worker);
  * @param [in] group       Group object to query.
  */
 const ucg_group_params_t* ucg_group_get_params(ucg_group_h group);
-
+/**
+ * @ingroup UCG_GROUP
+ * @brief Get the group member count.
+ *
+ * @param [in] group       Group object to query.
+ */
+ucg_group_member_index_t ucg_group_get_member_count(ucg_group_h group);
 
 /**
  * @ingroup UCG_GROUP
@@ -292,7 +331,9 @@ ucs_status_t ucg_collective_create(ucg_group_h group,
  * @return otherwise      - Operation was scheduled for send and can be
  *                          completed in any point in time. The request handle
  *                          is returned to the application in order to track
- *                          progress of the message.
+ *                          progress of the message. The application is
+ *                          responsible for releasing the handle using the
+ *                          @ref ucg_request_free routine.
  */
 ucs_status_ptr_t ucg_collective_start_nb(ucg_coll_h coll);
@@ -345,9 +386,27 @@ void ucg_collective_destroy(ucg_coll_h coll);
  * @return Error code as defined by @ref ucs_status_t
  */
 ucs_status_t ucg_request_check_status(void *request);
-
+/**
+ * @ingroup UCG_GROUP
+ * @brief Cancel an outstanding communication request.
+ *
+ * @param [in] worker       UCG worker.
+ * @param [in] request      Non-blocking request to cancel.
+ *
+ * This routine tries to cancel an outstanding communication request.
+ */
 void ucg_request_cancel(ucg_worker_h worker, void *request);
-
+/**
+ * @ingroup UCG_GROUP
+ * @brief Release a communication request.
+ *
+ * @param [in] request      Non-blocking request to release.
+ *
+ * This routine releases the non-blocking request back to the library, regardless
+ * of its current state. Communication operations associated with this request
+ * will make progress internally; however, no further notifications or callbacks
+ * will be invoked for this request.
+ */
 void ucg_request_free(void *request);
@@ -365,6 +424,11 @@ ucs_status_t ucg_worker_create(ucp_context_h context,
                                const ucp_worker_params_t *params,
                                ucp_worker_h *worker_p);
 
+ucs_status_t ucg_collective_check_input(ucg_group_h group,
+                                        const ucg_collective_params_t *params,
+                                        const ucg_coll_h *coll);
+
+
 END_C_DECLS
 
 #endif
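Taken together, the request routines documented above define the non-blocking lifecycle: start, poll, free. A minimal caller-side sketch of that lifecycle (assuming a coll handle already built via ucg_collective_create, a progressing worker, and the usual UCS convention that a NULL pointer return means immediate completion; UCS_PTR_IS_ERR/UCS_PTR_STATUS are the standard UCS pointer-status macros):

    /* Sketch: run one collective to completion and release its request. */
    ucs_status_ptr_t req = ucg_collective_start_nb(coll);
    if (UCS_PTR_IS_ERR(req)) {
        return UCS_PTR_STATUS(req);      /* could not schedule the operation */
    }
    if (req != NULL) {                   /* non-NULL: operation is in flight */
        ucs_status_t status;
        do {
            ucg_worker_progress(worker);
            status = ucg_request_check_status(req);
        } while (status == UCS_INPROGRESS);
        ucg_request_free(req);           /* caller owns the handle, see @ref above */
        return status;
    }
    return UCS_OK;                       /* completed immediately, no request */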
diff --git a/api/ucg_def.h b/api/ucg_def.h
index cc1af17..171c5b3 100644
--- a/api/ucg_def.h
+++ b/api/ucg_def.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED.
+ * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED.
  * See file LICENSE for terms.
 */
@@ -66,12 +66,5 @@ typedef uint64_t ucg_group_member_index_t;
 */
 typedef void (*ucg_collective_callback_t)(void *request, ucs_status_t status);
 
-/**
- * @ingroup ucg_collective
- * @brief Hash index for each hash table.
- *
- * This type is used as index of hash array.
- */
-typedef uint32_t ucg_hash_index_t;
 
 #endif
\ No newline at end of file
diff --git a/api/ucg_mpi.h b/api/ucg_mpi.h
index e9bf742..0593a16 100644
--- a/api/ucg_mpi.h
+++ b/api/ucg_mpi.h
@@ -1,6 +1,6 @@
 /*
- * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED.
- * See file LICENSE for terms.
+ * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. All rights reserved.
+ * Description: Init functions for MPI collective operations
 */
 
 #ifndef UCG_MPI_H_
@@ -24,6 +24,7 @@ enum ucg_predefined {
     UCG_PRIMITIVE_SCATTER,
     UCG_PRIMITIVE_ALLREDUCE,
     UCG_PRIMITIVE_ALLTOALL,
+    UCG_PRIMITIVE_ALLTOALLV,
     UCG_PRIMITIVE_REDUCE_SCATTER,
     UCG_PRIMITIVE_ALLGATHER,
     UCG_PRIMITIVE_ALLGATHERV,
@@ -45,6 +46,8 @@ static enum ucg_collective_modifiers ucg_predefined_modifiers[] = {
     [UCG_PRIMITIVE_ALLREDUCE]      = UCG_GROUP_COLLECTIVE_MODIFIER_AGGREGATE |
                                      UCG_GROUP_COLLECTIVE_MODIFIER_BROADCAST,
     [UCG_PRIMITIVE_ALLTOALL]       = UCG_GROUP_COLLECTIVE_MODIFIER_ALLTOALL,
+    [UCG_PRIMITIVE_ALLTOALLV]      = UCG_GROUP_COLLECTIVE_MODIFIER_ALLTOALLV |
+                                     UCG_GROUP_COLLECTIVE_MODIFIER_VARIABLE_LENGTH,
     [UCG_PRIMITIVE_REDUCE_SCATTER] = UCG_GROUP_COLLECTIVE_MODIFIER_AGGREGATE |
                                      UCG_GROUP_COLLECTIVE_MODIFIER_SINGLE_SOURCE,
     [UCG_PRIMITIVE_ALLGATHER]      = UCG_GROUP_COLLECTIVE_MODIFIER_BROADCAST |
@@ -58,11 +61,6 @@ static enum ucg_collective_modifiers ucg_predefined_modifiers[] = {
                                      UCG_GROUP_COLLECTIVE_MODIFIER_VARIABLE_DATATYPE,
 };
 
-static ucg_hash_index_t UCS_F_ALWAYS_INLINE ucg_mpi_coll_hash(enum ucg_predefined mpi_coll_type)
-{
-    return (ucg_hash_index_t)mpi_coll_type;
-}
-
 #define UCG_COLL_PARAMS_BUF_R(_buf, _count, _dt_len, _dt_ext) \
     .buf    = (_buf),   \
     .count  = (_count), \
@@ -96,7 +94,7 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_coll_##_lname##_init(__VA_ARGS__, \
             .modifiers = flags, \
             .root      = root,  \
         }, \
-        .plan_cache_index = ucg_mpi_coll_hash(UCG_PRIMITIVE_##_uname), \
+        .coll_type = COLL_TYPE_##_uname, \
         .send = { \
             UCG_COLL_PARAMS_BUF##_stype _sargs \
         }, \
@@ -132,6 +130,14 @@ UCG_COLL_INIT_FUNC(_lname, _uname, \
                    int *rcounts, size_t len_rdtype, \
                    void *mpi_rdtype, int *rdispls)
 
+#define UCG_COLL_INIT_FUNC_SVN_RVN(_lname, _uname) \
+UCG_COLL_INIT_FUNC(_lname, _uname, \
+                   _V, ((char*)sbuf, scounts, len_sdtype, mpi_sdtype, sdispls), \
+                   _V, (rbuf, rcounts, len_rdtype, mpi_rdtype, rdispls), \
+                   const void *sbuf, int *scounts, size_t len_sdtype, \
+                   void *mpi_sdtype, int *sdispls, void *rbuf, int *rcounts, \
+                   size_t len_rdtype, void *mpi_rdtype, int *rdispls)
+
 #define UCG_COLL_INIT_FUNC_SWN_RWN(_lname, _uname) \
 UCG_COLL_INIT_FUNC(_lname, _uname, \
                    _W, ((char*)sbuf, scounts, len_sdtypes, mpi_sdtypes, sdispls), \
@@ -141,11 +147,13 @@ UCG_COLL_INIT_FUNC(_lname, _uname,
                    int *rcounts, size_t *len_rdtypes, void **mpi_rdtypes, \
                    int *rdispls)
 
-
 UCG_COLL_INIT_FUNC_SR1_RR1(allreduce, ALLREDUCE)
-UCG_COLL_INIT_FUNC_SR1_RR1(reduce, REDUCE)
 UCG_COLL_INIT_FUNC_SR1_RR1(bcast, BCAST)
+UCG_COLL_INIT_FUNC(barrier, BARRIER, _R, (0, 0, 0, 0), _R, (0, 0, 0, 0), int ign)
+UCG_COLL_INIT_FUNC_SVN_RVN(alltoallv, ALLTOALLV)
+
+#ifdef UCG_COLL_ALREADY_SUPPORTED
+UCG_COLL_INIT_FUNC_SR1_RR1(reduce, REDUCE)
 UCG_COLL_INIT_FUNC_SR1_RRN(gather, GATHER)
 UCG_COLL_INIT_FUNC_SR1_RRN(scatter, SCATTER)
 UCG_COLL_INIT_FUNC_SR1_RRN(allgather, ALLGATHER)
@@ -153,8 +161,8 @@ UCG_COLL_INIT_FUNC_SR1_RVN(allgatherv, ALLGATHERV)
 UCG_COLL_INIT_FUNC_SR1_RRN(alltoall, ALLTOALL)
 UCG_COLL_INIT_FUNC_SWN_RWN(alltoallw, ALLTOALLW)
 UCG_COLL_INIT_FUNC_SWN_RWN(neighbor_alltoallw, NEIGHBOR_ALLTOALLW)
-UCG_COLL_INIT_FUNC(barrier, BARRIER, _R, (0, 0, 0, 0), _R, (0, 0, 0, 0), int ign)
+#endif /* UCG_COLL_ALREADY_SUPPORTED */
 
 END_C_DECLS
 
-#endif
\ No newline at end of file
+#endif
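For orientation, this is roughly the initializer that UCG_COLL_INIT_FUNC_SVN_RVN(alltoallv, ALLTOALLV) generates. It is a hedged sketch only: the trailing parameter list and the params-struct assembly come from UCG_COLL_INIT_FUNC and UCG_COLL_PARAMS_BUF_V, whose bodies are only partially visible in this hunk, so the names group, cb, root and coll below are assumptions:

    /* Assumed shape of the generated ucg_coll_alltoallv_init (not a verbatim expansion). */
    static UCS_F_ALWAYS_INLINE ucs_status_t
    ucg_coll_alltoallv_init(const void *sbuf, int *scounts, size_t len_sdtype,
                            void *mpi_sdtype, int *sdispls,
                            void *rbuf, int *rcounts, size_t len_rdtype,
                            void *mpi_rdtype, int *rdispls,
                            ucg_group_h group, ucg_collective_callback_t cb,
                            enum ucg_collective_modifiers flags,
                            ucg_group_member_index_t root, ucg_coll_h *coll)
    {
        ucg_collective_params_t params = {
            .type      = { .modifiers = flags, .root = root },
            .coll_type = COLL_TYPE_ALLTOALLV, /* replaces the old plan_cache_index hash */
            /* .send/.recv filled by UCG_COLL_PARAMS_BUF_V with the buffers above */
            .comp_cb   = cb,
        };
        return ucg_collective_create(group, &params, coll);
    }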
diff --git a/api/ucg_plan_component.h b/api/ucg_plan_component.h
index 963a715..d61fa55 100644
--- a/api/ucg_plan_component.h
+++ b/api/ucg_plan_component.h
@@ -1,6 +1,6 @@
 /*
- * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED.
- * See file LICENSE for terms.
+ * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. All rights reserved.
+ * Description: UCG plan component
 */
 
 #ifndef UCG_PLAN_COMPONENT_H_
@@ -101,10 +101,11 @@ typedef struct ucg_base_plan {
     /* Plan lookup - caching mechanism */
     ucg_collective_type_t    type;
     ucs_list_link_t          op_head;   /**< List of requests following this plan */
-
+    int                      op_cnt;
     /* Plan progress */
     ucg_plan_component_t    *planner;
     ucg_group_id_t           group_id;
+    short                    up_offset; /* In the allreduce-tree algorithm, my position in my parent's reduce buffer */
     ucg_group_member_index_t my_index;
     ucg_group_h              group;
     ucs_mpool_t             *am_mp;
@@ -113,14 +114,13 @@ typedef struct ucg_base_plan {
     /* Attribute */
     int                      support_non_commutative;
     int                      support_large_datatype;
-    int                      is_noncontig_allreduce;
-    int                      is_ring_plan_topo_type;
+
 } ucg_plan_t;
 
 enum ucg_request_common_flags {
     UCG_REQUEST_COMMON_FLAG_COMPLETED = UCS_BIT(0),
-
-    UCG_REQUEST_COMMON_FLAG_MASK = UCS_MASK(1)
+    UCG_REQUEST_COMMON_FLAG_INC_FAIL  = UCS_BIT(1),
+    UCG_REQUEST_COMMON_FLAG_MASK      = UCS_MASK(2)
 };
 
 typedef struct ucg_request {
@@ -164,10 +164,8 @@ struct ucg_plan_component {
     unsigned (*progress)(ucg_group_h group);
 
     /* plan a collective operation with this component */
-    ucs_status_t (*plan) (ucg_plan_component_t *plan_component,
-                          const ucg_collective_type_t *coll_type,
-                          const size_t msg_size,
-                          ucg_group_h group,
+    ucs_status_t (*plan) (ucg_group_h group,
+                          int algo_id,
                           ucg_collective_params_t *coll_params,
                           ucg_plan_t **plan_p);
 
     /* Prepare an operation to follow the given plan */
@@ -268,18 +266,6 @@ ucs_status_t ucg_plan_select(ucg_group_h group, const char* planner_name,
 /* Start pending operations after a barrier has been completed */
 ucs_status_t ucg_collective_release_barrier(ucg_group_h group);
 
-/* Check if the plan support non commutative operation. */
-static inline int ucg_plan_support_non_commutative(ucg_plan_t *plan)
-{
-    return plan->support_non_commutative;
-}
-
-/* Check if the plan support large datatype. */
-static inline int ucg_plan_support_large_datatype(ucg_plan_t *plan)
-{
-    return plan->support_large_datatype;
-}
-
 END_C_DECLS
 
 #endif
\ No newline at end of file
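The reworked plan() hook above now receives the group and a pre-selected algorithm id instead of the component, collective type and message size. A hedged sketch of a component implementing the new signature; the function and variable names here are illustrative, not taken from this diff:

    /* Illustrative component-side implementation of the new plan() hook. */
    static ucs_status_t example_plan(ucg_group_h group, int algo_id,
                                     ucg_collective_params_t *coll_params,
                                     ucg_plan_t **plan_p)
    {
        /* algo_id arrives from the decision stage (ucg_builtin_algo_decision
         * in base/ucg_group.c below), so the component no longer re-derives
         * it from the message size on every call. */
        return UCS_ERR_UNSUPPORTED;  /* plan construction elided in this sketch */
    }

    static ucg_plan_component_t example_planner = {
        .plan = example_plan,        /* .progress, .prepare, ... omitted */
    };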
diff --git a/base/ucg_group.c b/base/ucg_group.c
index 42b63b7..a0aeba2 100644
--- a/base/ucg_group.c
+++ b/base/ucg_group.c
@@ -1,12 +1,14 @@
 /*
- * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED.
- * See file LICENSE for terms.
+ * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. All rights reserved.
+ * Description: UCG group
 */
 
-#include "ucg_group.h"
-#include "../builtin/plan/builtin_plan.h"
-#include
+#include
+#include
+#include
+#include
+#include
 #include
 #include
 #include
@@ -16,10 +18,10 @@
 #include
 #include /* for @ref ucp_proxy_ep_test */
 
+#include "ucg_group.h"
+
 #if ENABLE_STATS
-/**
- * UCG group statistics counters
- */
+/* UCG group statistics counters */
 enum {
     UCG_GROUP_STAT_PLANS_CREATED,
     UCG_GROUP_STAT_PLANS_USED,
@@ -57,10 +59,6 @@
     } \
 }
 
-__KHASH_IMPL(ucg_groups_ep, static UCS_F_MAYBE_UNUSED inline,
-             ucg_group_member_index_t, ucp_ep_h, 1, kh_int64_hash_func,
-             kh_int64_hash_equal);
-
 unsigned ucg_worker_progress(ucg_worker_h worker)
 {
     unsigned idx;
@@ -93,40 +91,6 @@ unsigned ucg_group_progress(ucg_group_h group)
 unsigned ucg_base_am_id;
 size_t ucg_ctx_worker_offset;
 
-void ucg_init_group_cache(struct ucg_group *new_group)
-{
-    unsigned idx, rank, algo_idx;
-    for (algo_idx = 0; algo_idx < UCG_GROUP_MSG_SIZE_LEVEL; algo_idx++) {
-        for (rank = 0; rank < UCG_GROUP_MAX_ROOT_PARAM; rank++) {
-            for (idx = 0; idx < UCG_GROUP_MAX_COLL_TYPE_BUCKETS; idx++) {
-                new_group->cache[algo_idx][rank][idx] = NULL;
-            }
-        }
-    }
-}
-
-void ucg_init_group_root_used(struct ucg_group *new_group)
-{
-    unsigned rank;
-    /* Initalization of root_used */
-    for (rank = 0; rank < UCG_GROUP_MAX_ROOT_PARAM; rank++) {
-        new_group->root_used[rank] = (unsigned) -1;
-    }
-}
-
-static void ucg_group_clean_topo_map(ucg_group_params_t *params, unsigned index)
-{
-    unsigned i;
-    for (i = 0; i <= index; i++) {
-        if(params->topo_map[i] != NULL) {
-            ucs_free(params->topo_map[i]);
-            params->topo_map[i] = NULL;
-        }
-    }
-    ucs_free(params->topo_map);
-    params->topo_map = NULL;
-}
-
 ucs_status_t ucg_init_group(ucg_worker_h worker,
                             const ucg_group_params_t *params,
                             ucg_groups_t *ctx,
@@ -142,32 +106,18 @@ ucs_status_t ucg_init_group(ucg_worker_h worker,
     new_group->iface_cnt = 0;
 
     ucs_queue_head_init(&new_group->pending);
-    memcpy((ucg_group_params_t*)&new_group->params, params, sizeof(*params));
-    new_group->params.distance = (typeof(params->distance))((char*)(new_group
-            + 1) + ctx->total_planner_sizes);
-    memcpy(new_group->params.distance, params->distance, distance_size);
+    new_group->params = *params;
     new_group->params.node_index = (typeof(params->node_index))((char*)(new_group
             + 1) + ctx->total_planner_sizes + distance_size);
-    memcpy(new_group->params.node_index, params->node_index, nodenumber_size);
-    memset(new_group + 1, 0, ctx->total_planner_sizes);
-
-    ucg_init_group_cache(new_group);
-    ucg_init_group_root_used(new_group);
-    new_group->params.topo_map = NULL;
-    if (params->topo_map) {
-        new_group->params.topo_map = UCS_ALLOC_CHECK(sizeof(char*) * params->member_count, "topo map");
-        unsigned i;
-        for (i = 0; i < params->member_count; i++) {
-            unsigned topo_size = sizeof(char) * params->member_count;
-            new_group->params.topo_map[i] = (char*)malloc(topo_size);
-            if (new_group->params.topo_map[i] == NULL) {
-                ucg_group_clean_topo_map(&new_group->params, i);
-                return UCS_ERR_NO_MEMORY;
-            }
-            memcpy(new_group->params.topo_map[i], params->topo_map[i], topo_size);
-        }
+    errno_t status = memcpy_s(new_group->params.node_index, nodenumber_size,
+                              params->node_index, nodenumber_size);
+    if (status != EOK) {
+        return UCS_ERR_INVALID_PARAM;
     }
-
+    status = memset_s(new_group + 1, ctx->total_planner_sizes, 0, ctx->total_planner_sizes);
+    if (status != EOK) {
+        return UCS_ERR_INVALID_PARAM;
+    }
+    new_group->params.topo_args = params->topo_args;
     return UCS_OK;
 }
 
@@ -180,7 +130,6 @@ static void ucg_group_clean_planners(ucg_groups_t *ctx,
         planner->destroy((void*)new_group);
     }
     ucs_free(new_group);
-    new_group = NULL;
 }
 
 static ucs_status_t ucg_group_planner_create(ucg_groups_t *ctx,
@@ -188,7 +137,7 @@ static ucs_status_t ucg_group_planner_create(ucg_groups_t *ctx,
                                              struct ucg_group *new_group,
                                              int *idx)
 {
-    ucs_status_t status = UCS_OK;
+    ucs_status_t status;
     for (*idx = 0; *idx < ctx->num_planners; (*idx)++) {
         /* Create the per-planner per-group context */
         ucg_plan_component_t *planner = ctx->planners[*idx].plan_component;
@@ -213,17 +162,16 @@ ucs_status_t ucg_group_create(ucg_worker_h worker,
     UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker);
 
     /* allocate a new group */
-    size_t distance_size = sizeof(*params->distance) * params->member_count;
     size_t nodenumber_size = sizeof(*params->node_index) * params->member_count;
     struct ucg_group *new_group = ucs_malloc(sizeof(struct ucg_group) +
-            ctx->total_planner_sizes + distance_size + nodenumber_size, "communicator group");
+            ctx->total_planner_sizes + nodenumber_size, "communicator group");
     if (new_group == NULL) {
        status = UCS_ERR_NO_MEMORY;
        goto cleanup_none;
    }
 
     int idx = 0;
-    status = ucg_init_group(worker, params, ctx, distance_size, nodenumber_size, new_group);
+    status = ucg_init_group(worker, params, ctx, 0, nodenumber_size, new_group);
     if (status != UCS_OK) {
         ucs_free(new_group);
         new_group = NULL;
@@ -235,12 +183,24 @@ ucs_status_t ucg_group_create(ucg_worker_h worker,
         goto cleanup_planners;
     }
 
+#if ENABLE_UCG_HICOLL
+    /*
+     * INC initialization: generate a random comm_id;
+     * the subroot sends a query to the root, and the root replies with a notify.
+     */
+    ucg_builtin_config_t *config;
+    config = (ucg_builtin_config_t *)ctx->planners[0].plan_component->plan_config;
+    init_inc_params(new_group);
+    if (inc_enable(config)) {
+        (void)inc_create(new_group, config, params);
+    }
+#endif
+
     status = UCS_STATS_NODE_ALLOC(&new_group->stats,
                                   &ucg_group_stats_class,
                                   worker->stats, "-%p", new_group);
     if (status != UCS_OK) {
         goto cleanup_planners;
     }
-    new_group->params.is_socket_balance = params->is_socket_balance;
+
     ucs_list_add_head(&ctx->groups_head, &new_group->list);
     UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker);
     *group_p = new_group;
@@ -249,7 +209,7 @@ ucs_status_t ucg_group_create(ucg_worker_h worker,
 
 cleanup_planners:
     ucg_group_clean_planners(ctx, idx, new_group);
-
+    new_group = NULL;
 cleanup_none:
     UCP_WORKER_THREAD_CS_EXIT_CONDITIONAL(worker);
     return status;
@@ -263,6 +223,15 @@ const ucg_group_params_t* ucg_group_get_params(ucg_group_h group)
     return &group->params;
 }
 
+ucg_group_member_index_t ucg_group_get_member_count(ucg_group_h group)
+{
+    const ucg_group_params_t *group_params = ucg_group_get_params(group);
+    if (group_params == NULL) {
+        return 0;
+    }
+    return group_params->member_count;
+}
+
 void ucg_group_planner_destroy(ucg_group_h group)
 {
     unsigned idx;
@@ -283,7 +252,19 @@ void ucg_group_destroy(ucg_group_h group)
     while (!ucs_queue_is_empty(&group->pending)) {
         ucg_group_progress(group);
     }
-
+#if ENABLE_UCG_HICOLL
+    /*
+     * INC finalize, clear the switch:
+     * the subroot sends a kill to the root, and the root replies with a kill
+     * to the subroot to clear the switch.
+     */
+    ucs_status_t status;
+    if (inc_available(group)) {
+        status = inc_destroy(group, 0);
+        if (status != UCS_OK) {
+            ucs_info("INC failed: INC destroy failed\n");
+        }
+    }
+#endif
 #if ENABLE_MT
     ucg_worker_h worker = group->worker;
     UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(worker);
@@ -305,6 +286,9 @@ ucs_status_t ucg_request_check_status(void *request)
     ucg_request_t *req = (ucg_request_t*)request - 1;
 
     if (req->flags & UCG_REQUEST_COMMON_FLAG_COMPLETED) {
+        if (req->flags & UCG_REQUEST_COMMON_FLAG_INC_FAIL) {
+            return UCS_ERR_INVALID_PARAM;
+        }
         ucs_assert(req->status != UCS_INPROGRESS);
         return req->status;
     }
@@ -324,146 +308,99 @@ ucs_status_t ucg_plan_select(ucg_group_h group, const char* planner_name,
                              planner_name, &group->params, params, planc_p);
 }
 
-static int ucg_chk_noncontig_allreduce_plan(const ucg_collective_params_t *coll_params,
-                                            const ucg_group_params_t *group_params,
-                                            const ucg_plan_t *plan)
+void ucg_log_coll_params(const ucg_collective_params_t *params)
 {
-    int noncontig_allreduce;
-
-    if (coll_params->type.modifiers != ucg_predefined_modifiers[UCG_PRIMITIVE_ALLREDUCE]) {
-        return 0;
-    }
-
-    noncontig_allreduce = ucg_is_noncontig_allreduce(group_params, coll_params);
-    if (plan->is_noncontig_allreduce) {
-        return !noncontig_allreduce;
-    } else {
-        return noncontig_allreduce;
-    }
+    ucs_debug("ucg_collective_create OP: "
+              "params={type=%u, root=%lu, send=[%p,%i,%lu,%p,%p], "
+              "recv=[%p,%i,%lu,%p,%p], cb=%p, op=%p}",
+              (unsigned)params->type.modifiers, (uint64_t)params->type.root,
+              params->send.buf, params->send.count, params->send.dt_len,
+              params->send.dt_ext, params->send.displs,
+              params->recv.buf, params->recv.count, params->recv.dt_len,
+              params->recv.dt_ext, params->recv.displs,
+              params->comp_cb, params->recv.op_ext);
 }
 
-void ucg_get_cache_plan(unsigned int message_size_level, unsigned int coll_root,
-    ucg_group_h group, ucg_collective_params_t *params, ucg_plan_t **cache_plan, unsigned root)
+static inline ucs_status_t ucg_collective_check_const_length(const ucg_collective_params_t *coll_params)
 {
-    ucg_plan_t *plan = group->cache[message_size_level][coll_root][params->plan_cache_index];
-    if (plan == NULL) {
-        *cache_plan = NULL;
-        return;
-    }
-
-    if (params->send.op_ext && !group->params.op_is_commute_f(params->send.op_ext) && !plan->support_non_commutative) {
-        *cache_plan = NULL;
-        return;
-    }
-
-    if (params->send.op_ext && !group->params.op_is_commute_f(params->send.op_ext) && params->send.count > 1
-        && plan->is_ring_plan_topo_type) {
-        *cache_plan = NULL;
-        return;
-    }
-
-    ucg_builtin_config_t *config = (ucg_builtin_config_t *)plan->planner->plan_config;
-    if (params->send.dt_len > config->large_datatype_threshold && !plan->support_large_datatype) {
-        *cache_plan = NULL;
-        return;
-    }
-
-    if (ucg_chk_noncontig_allreduce_plan(params, &group->params, plan)) {
-        *cache_plan = NULL;
-        return;
-    }
-
-    if (plan->is_ring_plan_topo_type && ucg_is_segmented_allreduce(params)) {
-        *cache_plan = NULL;
-        return;
-    }
-
-    if (plan != NULL && root != plan->type.root) {
-        *cache_plan = NULL;
-        return;
+    if (coll_params->send.count < 0) {
+        ucs_error("The send count cannot be less than 0.");
+        return UCS_ERR_INVALID_PARAM;
     }
-
-    ucs_debug("select plan from cache: %p", plan);
-    *cache_plan = plan;
+    return UCS_OK;
 }
 
-void ucg_update_group_cache(ucg_group_h group,
-                            unsigned int message_size_level,
-                            unsigned int coll_root,
-                            ucg_collective_params_t *params,
-                            ucg_plan_t *plan)
+static inline ucs_status_t ucg_collective_check_counts(const int *counts,
+                                                       ucg_group_member_index_t member_count)
 {
-    if (group->cache[message_size_level][coll_root][params->plan_cache_index] != NULL) {
-        ucg_builtin_plan_t *builtin_plan =
            ucs_derived_of(group->cache[message_size_level][coll_root][params->plan_cache_index], ucg_builtin_plan_t);
-        (void)ucg_builtin_destroy_plan(builtin_plan, group);
-        group->cache[message_size_level][coll_root][params->plan_cache_index] = NULL;
+    ucg_group_member_index_t i;
+    for (i = 0; i < member_count; i++) {
+        if (counts[i] < 0) {
+            return UCS_ERR_INVALID_PARAM;
+        }
     }
-    group->cache[message_size_level][coll_root][params->plan_cache_index] = plan;
+    return UCS_OK;
 }
 
-void ucg_log_coll_params(ucg_collective_params_t *params)
+STATIC_GTEST ucs_status_t ucg_collective_check_variable_length(ucg_group_h group,
                                                                const ucg_collective_params_t *coll_params)
 {
-    ucs_debug("ucg_collective_create OP: "
-              "params={type=%u, root=%lu, send=[%p,%i,%lu,%p,%p], "
-              "recv=[%p,%i,%lu,%p,%p], cb=%p, op=%p}",
-              (unsigned)params->type.modifiers, (uint64_t)params->type.root,
-              params->send.buf, params->send.count, params->send.dt_len,
-              params->send.dt_ext, params->send.displs,
-              params->recv.buf, params->recv.count, params->recv.dt_len,
-              params->recv.dt_ext, params->recv.displs,
-              params->comp_cb, params->recv.op_ext);
+    ucs_status_t status;
+    ucg_group_member_index_t member_count = ucg_group_get_member_count(group);
+    status = ucg_collective_check_counts(coll_params->send.counts, member_count);
+    if (status != UCS_OK) {
+        ucs_error("The send counts cannot be less than 0.");
+        return status;
+    }
+    status = ucg_collective_check_counts(coll_params->recv.counts, member_count);
+    if (status != UCS_OK) {
+        ucs_error("The receive counts cannot be less than 0.");
+        return status;
+    }
+    return status;
 }
 
-void ucg_collective_create_choose_algorithm(unsigned msg_size, unsigned *message_size_level)
+static inline ucs_status_t ucg_collective_check_params(ucg_group_h group,
                                                        const ucg_collective_params_t *coll_params)
 {
-    /* choose algorithm due to message size */
-    if (msg_size < UCG_GROUP_MED_MSG_SIZE) {
-        *message_size_level = 0;
+    ucs_status_t status;
+    uint32_t is_variable_len = (uint32_t)coll_params->type.modifiers & UCG_GROUP_COLLECTIVE_MODIFIER_VARIABLE_LENGTH;
+    if (is_variable_len) {
+        status = ucg_collective_check_variable_length(group, coll_params);
     } else {
-        *message_size_level = 1;
+        status = ucg_collective_check_const_length(coll_params);
     }
+    return status;
 }
 
+ucs_status_t ucg_collective_check_input(ucg_group_h group,
+                                        const ucg_collective_params_t *params,
+                                        const ucg_coll_h *coll)
+{
+    if (group == NULL || params == NULL || coll == NULL) {
+        return UCS_ERR_INVALID_PARAM;
+    }
+    ucs_status_t status = ucg_collective_check_params(group, params);
+    return status;
+}
 
 UCS_PROFILE_FUNC(ucs_status_t, ucg_collective_create,
                  (group, params, coll),
                  ucg_group_h group, ucg_collective_params_t *params, ucg_coll_h *coll)
 {
-    UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(group->worker);
-
-    /* check the recycling/cache for this collective */
+    ucg_plan_t *plan = NULL;
     ucg_op_t *op = NULL;
     ucs_status_t status;
-    if (group == NULL || params == NULL || coll == NULL || params->send.count < 0) {
-        status = UCS_ERR_INVALID_PARAM;
-        goto out;
-    }
-
-    /* find the plan of current root whether has been established */
-    ucg_group_member_index_t root = UCG_ROOT_RANK(params);
-    unsigned msg_size = params->send.count * params->send.dt_len;
-    unsigned coll_root;
-    unsigned message_size_level;
-    unsigned is_coll_root_found = 1;
+    int algo;
+    UCP_WORKER_THREAD_CS_ENTER_CONDITIONAL(group->worker);
 
-    if (root >= group->params.member_count) {
-        status = UCS_ERR_INVALID_PARAM;
-        ucs_error("Invalid root[%ld] for communication group size[%ld]",
                  root, group->params.member_count);
+    status = ucg_collective_check_input(group, params, coll);
+    if (status != UCS_OK) {
        goto out;
    }
 
-    /* root cache has been not found */
-    if (root != group->root_used[root % UCG_GROUP_MAX_ROOT_PARAM]) {
-        group->root_used[root % UCG_GROUP_MAX_ROOT_PARAM] = root;
-        is_coll_root_found = 0;
-    }
-    coll_root = root % UCG_GROUP_MAX_ROOT_PARAM;
+    algo = ucg_builtin_algo_decision(&group->params, params);
 
-    ucg_collective_create_choose_algorithm(msg_size, &message_size_level);
-
-    ucg_plan_t *plan = NULL;
-    if (is_coll_root_found) {
-        ucg_get_cache_plan(message_size_level, coll_root, group, params, &plan, root);
-    }
+    plan = ucg_builtin_pcache_find(group, algo, params);
 
     if (ucs_likely(plan != NULL)) {
         ucs_list_for_each(op, &plan->op_head, list) {
@@ -489,7 +426,7 @@ UCS_PROFILE_FUNC(ucs_status_t, ucg_collective_create,
     UCS_PROFILE_CODE("ucg_plan") {
         ucs_trace_req("ucg_collective_create PLAN: planc=%s type=%x root=%lu",
                       &planc->name[0], params->type.modifiers, (uint64_t)params->type.root);
-        status = ucg_plan(planc, &params->type, params->send.count * params->send.dt_len, group, params, &plan);
+        status = ucg_plan(planc, group, algo, params, &plan);
     }
     if (status != UCS_OK) {
         goto out;
@@ -500,7 +437,8 @@ UCS_PROFILE_FUNC(ucs_status_t, ucg_collective_create,
     plan->type = params->type;
     plan->group_id = group->group_id;
     plan->am_mp = &group->worker->am_mp;
-    ucg_update_group_cache(group, message_size_level, coll_root, params, plan);
+    plan->op_cnt = 0;
+    ucg_builtin_pcache_update(group, plan, algo, params);
     ucs_list_head_init(&plan->op_head);
 
     UCS_STATS_UPDATE_COUNTER(group->stats, UCG_GROUP_STAT_PLANS_CREATED, 1);
@@ -512,9 +450,15 @@ UCS_PROFILE_FUNC(ucs_status_t, ucg_collective_create,
     if (status != UCS_OK) {
         goto out;
     }
-
+    /* limit the length of the op list in a plan to avoid a huge cost in the reuse check. */
+    while (plan->op_cnt >= UCG_GROUP_MAX_OPS_IN_PLAN) {
+        ucg_op_t *op_head = ucs_list_extract_head(&plan->op_head, ucg_op_t, list);
+        ucg_discard(op_head);
+        plan->op_cnt--;
+    }
     ucs_list_add_head(&plan->op_head, &op->list);
-    memcpy(&op->params, params, sizeof(*params));
+    plan->op_cnt++;
+    op->params = *params;
     op->plan = plan;
 
 op_found:
@@ -550,7 +494,7 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_collective_trigger(ucg_group_h group
 ucs_status_t ucg_collective_release_barrier(ucg_group_h group)
 {
     if (group->is_barrier_outstanding == 0) {
-        // current operation is not barrier.
+        /* current operation is not barrier. */
         return UCS_OK;
     }
     group->is_barrier_outstanding = 0;
@@ -720,7 +664,11 @@ ucs_status_t ucg_plan_connect(ucg_group_h group, ucg_group_member_index_t index,
     ucg_groups_t *gctx = UCG_WORKER_TO_GROUPS_CTX(group->worker);
     int ret = 0;
     khiter_t iter = kh_get(ucg_groups_ep, &gctx->eps, global_index);
-    if (iter != kh_end(&gctx->eps)) {
+    int reuse_ep = 1;
+#if ENABLE_UCG_HICOLL
+    reuse_ep = (inc_available(group) == 0 || inc_used(&group->params) == 0);
+#endif
+    if (iter != kh_end(&gctx->eps) && reuse_ep) {
         /* Use the cached connection */
         ucp_ep = kh_value(&gctx->eps, iter);
     } else {
@@ -812,4 +760,4 @@ ucs_status_t ucg_worker_create(ucp_context_h context,
     am_handler->tracer = ucg_builtin_msg_dump;
     am_handler->flags = 0;
     return ucp_worker_create(context, params, worker_p);
-}
\ No newline at end of file
+}
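Compared with the old root-bucket and message-size-level cache walk, the create path above now has three explicit stages. A condensed orientation sketch, assembled only from the code in this file (not verbatim; error paths abbreviated):

    /* Orientation sketch of the new ucg_collective_create flow. */
    static ucs_status_t create_path_sketch(ucg_group_h group,
                                           ucg_collective_params_t *params,
                                           ucg_coll_h *coll,
                                           ucg_plan_component_t *planc)
    {
        /* stage 1: validate inputs, incl. per-rank counts for v-collectives */
        ucs_status_t status = ucg_collective_check_input(group, params, coll);
        if (status != UCS_OK) {
            return status;
        }
        /* stage 2: decide the algorithm once, then look up the plan cache */
        int algo = ucg_builtin_algo_decision(&group->params, params);
        ucg_plan_t *plan = ucg_builtin_pcache_find(group, algo, params);
        /* stage 3: on a miss, build a plan via the component hook and publish it */
        if (plan == NULL) {
            status = ucg_plan(planc, group, algo, params, &plan);
            if (status != UCS_OK) {
                return status;
            }
            plan->op_cnt = 0;
            ucg_builtin_pcache_update(group, plan, algo, params);
        }
        /* op reuse/preparation against plan->op_head follows, bounded by
         * UCG_GROUP_MAX_OPS_IN_PLAN (see the trimming loop above). */
        return UCS_OK;
    }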
diff --git a/base/ucg_group.h b/base/ucg_group.h
index ab8cad1..d5fe00e 100644
--- a/base/ucg_group.h
+++ b/base/ucg_group.h
@@ -1,6 +1,6 @@
 /*
- * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED.
- * See file LICENSE for terms.
+ * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. All rights reserved.
+ * Description: UCG group
 */
 
 #ifndef UCG_GROUP_H_
@@ -18,13 +18,18 @@
 #define UCG_GROUP_MSG_SIZE_LEVEL 2
 
 /* threshold message size to switch algorithm */
-#define UCG_GROUP_MED_MSG_SIZE 16384
+#define UCG_GROUP_MED_MSG_SIZE 8192
 
 /* max number of actual root rank used */
 #define UCG_GROUP_MAX_ROOT_PARAM 96
 
 /* max number of collective type in the plan cache. */
 #define UCG_GROUP_MAX_COLL_TYPE_BUCKETS 16
 
+/* 1 for INC available and 0 for unavailable */
+#define UCG_GROUP_INC_STATUS_NUM 2
+
+/* max number of ops stored in a plan */
+#define UCG_GROUP_MAX_OPS_IN_PLAN 200
+
 extern size_t ucg_ctx_worker_offset;
 #define UCG_WORKER_TO_GROUPS_CTX(worker) \
@@ -37,7 +42,9 @@ extern size_t ucg_ctx_worker_offset;
     ((params)->type.root)
 
 __KHASH_TYPE(ucg_groups_ep, ucg_group_member_index_t, ucp_ep_h)
-
+__KHASH_IMPL(ucg_groups_ep, static UCS_F_MAYBE_UNUSED inline,
+             ucg_group_member_index_t, ucp_ep_h, 1, kh_int64_hash_func,
+             kh_int64_hash_equal);
 /*
  * To enable the "Groups" feature in UCX - it's registered as part of the UCX
  * context - and allocated a context slot in each UCP Worker at a certain offset.
@@ -75,18 +82,7 @@ struct ucg_group {
     unsigned iface_cnt;
     uct_iface_h ifaces[UCG_GROUP_MAX_IFACES];
 
-    /* per-group cache of previous plans/operations, arranged as follows:
-     * for each collective type (e.g. Allreduce) there is a plan with a list of
-     * operations. To re-use a past operation it must be available and match the
-     * requested collective parameters.
-     */
-    ucg_plan_t *cache[UCG_GROUP_MSG_SIZE_LEVEL][UCG_GROUP_MAX_ROOT_PARAM][UCG_GROUP_MAX_COLL_TYPE_BUCKETS];
-
-    /*
-     * for root collective operations(e.g. Bcast), the parameter of root should be
-     * the criterion to decide whether plan has been found.
-     */
-    unsigned root_used[UCG_GROUP_MAX_ROOT_PARAM];
+    ucg_plan_t **builtin_pcache[COLL_TYPE_NUMS];
 
     /* Below this point - the private per-planner data is allocated/stored */
 };
@@ -94,12 +90,5 @@ struct ucg_group {
 int ucg_builtin_op_can_reuse(const ucg_plan_t *plan, const ucg_op_t *op,
                              const ucg_collective_params_t *params);
 
-void ucg_builtin_update_op(const ucg_plan_t *plan, ucg_op_t *op,
-                           const ucg_collective_params_t *params);
-
-int ucg_is_segmented_allreduce(const ucg_collective_params_t *coll_params);
-
-int ucg_is_noncontig_allreduce(const ucg_group_params_t *group_params,
-                               const ucg_collective_params_t *coll_params);
 
 #endif /* UCG_GROUP_H_ */
\ No newline at end of file
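The per-group plan cache thus shrinks from a three-dimensional array (message-size level x root slot x hash bucket) to one table per collective type. The new helpers are implemented in plan/builtin_plan_cache.c, which this PR adds to the build but does not include in the diff, so the following lookup is only a plausible sketch; the inner indexing by algorithm id is an assumption:

    /* Hedged sketch of a lookup over the new builtin_pcache layout. */
    static ucg_plan_t *pcache_find_sketch(ucg_group_h group, int algo_id,
                                          const ucg_collective_params_t *params)
    {
        ucg_plan_t **bucket = group->builtin_pcache[params->coll_type];
        if (bucket == NULL) {
            return NULL;                    /* nothing cached for this coll type */
        }
        ucg_plan_t *plan = bucket[algo_id]; /* assumed: one slot per algorithm id */
        if ((plan != NULL) && (plan->type.root != params->type.root)) {
            return NULL;                    /* rooted collectives must match roots */
        }
        return plan;
    }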
diff --git a/base/ucg_plan.c b/base/ucg_plan.c
index c2e5006..d8647c2 100644
--- a/base/ucg_plan.c
+++ b/base/ucg_plan.c
@@ -1,9 +1,9 @@
 /*
- * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED.
- * See file LICENSE for terms.
+ * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. All rights reserved.
+ * Description: UCG plan
 */
 
-#include "ucg_plan.h"
+#include
 
 #include
 #include
@@ -15,9 +15,10 @@
 #include
 #include
 
+#include "ucg_plan.h"
 
 UCS_LIST_HEAD(ucg_plan_components_list);
-/**
+/*
  * Keeps information about allocated configuration structure, to be used when
  * releasing the options.
 */
@@ -127,8 +128,12 @@ ucs_status_t ucg_plan_query(ucg_plan_desc_t **resources_p, unsigned *nums_p)
     }
     resources = tmp;
 
-    memcpy(resources + nums, planners,
-           sizeof(*planners) * num_plans);
+    if (memcpy_s(resources + nums, sizeof(*planners) * num_plans, planners,
+                 sizeof(*planners) * num_plans) != EOK) {
+        status = UCS_ERR_OUT_OF_RANGE;
+        goto err;
+    }
+
     nums += num_plans;
 
     ucg_plan_free((void **)&planners);
@@ -164,9 +169,8 @@ ucs_status_t ucg_plan_single(ucg_plan_component_t *planc,
                              ucg_plan_desc_t **resources_p,
                              unsigned *nums_p)
 {
-    ucg_plan_desc_t *resource;
+    ucg_plan_desc_t *resource = ucs_malloc(sizeof(*resource), "planner description");
 
-    resource = ucs_malloc(sizeof(*resource), "planner description");
     if (resource == NULL) {
         return UCS_ERR_NO_MEMORY;
     }
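ucg_plan_query above shows the error-handling convention this PR applies everywhere it swaps libc memcpy/memset for bounds-checked variants: call memcpy_s/memset_s, compare the errno_t result against EOK, and map failure to a ucs_status_t. A minimal sketch of that convention (memcpy_s and EOK are assumed to come from the securec dependency implied by the new "secure" subdirectory in Makefile.am):

    /* The recurring pattern: bounds-checked copy, errno_t result, UCS status out. */
    static ucs_status_t copy_checked(void *dst, size_t dst_size,
                                     const void *src, size_t len)
    {
        if (memcpy_s(dst, dst_size, src, len) != EOK) {
            return UCS_ERR_INVALID_PARAM;  /* same mapping as ucg_init_group uses */
        }
        return UCS_OK;
    }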
diff --git a/base/ucg_plan.h b/base/ucg_plan.h
index fcca7df..c98f289 100644
--- a/base/ucg_plan.h
+++ b/base/ucg_plan.h
@@ -1,6 +1,6 @@
 /*
- * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED.
- * See file LICENSE for terms.
+ * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. All rights reserved.
+ * Description: UCG plan
 */
 
 #ifndef UCG_PLAN_H_
@@ -19,8 +19,8 @@ ucs_status_t ucg_plan_select_component(ucg_plan_desc_t *planners,
                                        ucg_plan_component_t **planc_p);
 
 /* Functions on a specific component */
-#define ucg_plan(planc, group_ctx, msg_size, coll_group, coll_params, plan_p) \
-    ((planc)->plan(planc, group_ctx, msg_size, coll_group, coll_params, plan_p))
+#define ucg_plan(planc, group, algo_id, coll_params, plan_p) \
+    ((planc)->plan(group, algo_id, coll_params, plan_p))
 #define ucg_prepare(plan, params, op)     ((plan)->planner->prepare(plan, params, op))
 #define ucg_trigger(op, cid, req)         ((op)->plan->planner->trigger(op, cid, req))
 #define ucg_discard(op)                   ((op)->plan->planner->discard(op))
diff --git a/builtin/Makefile.am b/builtin/Makefile.am
index 433d155..94428c7 100644
--- a/builtin/Makefile.am
+++ b/builtin/Makefile.am
@@ -18,12 +18,23 @@ libucg_builtin_la_CFLAGS = $(BASE_CFLAGS)
 noinst_HEADERS = \
     ops/builtin_ops.h \
     ops/builtin_cb.inl \
-    plan/builtin_plan.h
+    plan/builtin_plan.h \
+    plan/builtin_algo_decision.h \
+    plan/builtin_plan_cache.h \
+    plan/builtin_topo.h
 
 libucg_builtin_la_SOURCES = \
     builtin.c \
     ops/builtin_ops.c \
+    plan/builtin_algo_select.c \
+    plan/builtin_algo_check.c \
+    plan/builtin_algo_decision.c \
+    plan/builtin_plan_cache.c \
     plan/builtin_binomial_tree.c \
     plan/builtin_recursive.c \
     plan/builtin_ring.c \
-    plan/builtin_topo_info.c
\ No newline at end of file
+    plan/builtin_topo_info.c \
+    plan/builtin_trees.c \
+    plan/builtin_topo_aware.c \
+    plan/builtin_topo.c \
+    plan/builtin_binary_block.c
diff --git a/builtin/builtin.c b/builtin/builtin.c
index 575834e..3e8ea57 100644
--- a/builtin/builtin.c
+++ b/builtin/builtin.c
@@ -1,17 +1,20 @@
 /*
- * Copyright (C) Huawei Technologies Co., Ltd. 2019. ALL RIGHTS RESERVED.
- * See file LICENSE for terms.
+ * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED.
+ * Description: Algorithm acceleration component architecture of UCG
+ * Notes: See file LICENSE for terms.
 */
 
+#include
 #include
 #include
-#include
 #include
+#include
+#include
+#include
 
 #include "ops/builtin_ops.h"
 #include "plan/builtin_plan.h"
-#include
-#include
+#include "plan/builtin_plan_cache.h"
 
 #define CACHE_SIZE 1000
 #define RECURSIVE_FACTOR 2
@@ -26,7 +29,16 @@ static ucs_config_field_t ucg_builtin_config_table[] = {
     {"BMTREE_", "", NULL, ucs_offsetof(ucg_builtin_config_t, bmtree),
      UCS_CONFIG_TYPE_TABLE(ucg_builtin_binomial_tree_config_table)},
 
+#if ENABLE_UCG_HICOLL
+    {"INC_", "", NULL, ucs_offsetof(ucg_builtin_config_t, inc),
+     UCS_CONFIG_TYPE_TABLE(ucg_inc_config_table)},
+
+    {"NAP_", "", NULL, ucs_offsetof(ucg_builtin_config_t, NAP),
+     UCS_CONFIG_TYPE_TABLE(ucg_builtin_NAP_config_table)},
+
+    {"LADD_THEROTTLED_FACTOR", "0", "throttle factor",
+     ucs_offsetof(ucg_builtin_config_t, throttle_factor), UCS_CONFIG_TYPE_UINT},
+#endif
     {"BCAST_ALGORITHM", "0", "Bcast algorithm",
      ucs_offsetof(ucg_builtin_config_t, bcast_algorithm), UCS_CONFIG_TYPE_DOUBLE},
@@ -36,6 +48,12 @@ static ucs_config_field_t ucg_builtin_config_table[] = {
     {"BARRIER_ALGORITHM", "0", "Barrier algorithm",
      ucs_offsetof(ucg_builtin_config_t, barrier_algorithm), UCS_CONFIG_TYPE_DOUBLE},
 
+    {"ALLTOALLV_ALGORITHM", "0", "Alltoallv algorithm",
+     ucs_offsetof(ucg_builtin_config_t, alltoallv_algorithm), UCS_CONFIG_TYPE_DOUBLE},
+
+    {"TREES_", "", NULL, ucs_offsetof(ucg_builtin_config_t, trees),
+     UCS_CONFIG_TYPE_TABLE(ucg_builtin_trees_config_table)},
+
     {"MAX_MSG_LIST_SIZE", "40", "Largest loop count of msg process function",
      ucs_offsetof(ucg_builtin_config_t, max_msg_list_size), UCS_CONFIG_TYPE_UINT},
@@ -58,6 +76,11 @@ static ucs_config_field_t ucg_builtin_config_table[] = {
     {"LARGE_DATATYPE_THRESHOLD", "32", "Large datatype threshold",
      ucs_offsetof(ucg_builtin_config_t, large_datatype_threshold), UCS_CONFIG_TYPE_UINT},
 
+    /* To ensure consistency of allreduce calculation results, you need to enable this flag.
+       By default, this function is disabled. If this flag is enabled, the performance of the
+       allreduce tree algorithm interface decreases by 5%. */
+    {"REDUCE_CONSISTENCY", "n", "reduce consistency flag",
+     ucs_offsetof(ucg_builtin_config_t, reduce_consistency), UCS_CONFIG_TYPE_BOOL},
+
     {NULL}
 };
@@ -70,8 +93,12 @@ struct ucg_builtin_algorithm ucg_algo = {
     .topo       = 0,
     .topo_level = UCG_GROUP_HIERARCHY_LEVEL_NODE,
     .ring       = 0,
+    .NAP        = 0,
     .pipeline   = 0,
     .feature_flag = UCG_ALGORITHM_SUPPORT_COMMON_FEATURE,
+    .binary_block = 0,
+    .ladd       = 0,
+    .plummer    = 0,
 };
 
 struct ucg_builtin_group_ctx {
@@ -84,26 +111,127 @@ struct ucg_builtin_group_ctx {
     ucs_list_link_t           plan_head;    /* for resource release */
     ucg_builtin_config_t     *config;
 
-    ucg_builtin_comp_slot_t   slots[UCG_BUILTIN_MAX_CONCURRENT_OPS];
+    ucg_builtin_comp_slot_t  *slots;
 };
 
-typedef struct ucg_builtin_am_buffer {
-    int group_id;
-    char used;
-    void *data;
-    size_t length;
-    unsigned am_flags;
-} ucg_builtin_am_buffer_t;
-
 typedef struct ucg_builtin_ctx {
     unsigned slots_total;
-    unsigned slots_used;
-    ucg_builtin_am_buffer_t buffer;
-    ucg_builtin_comp_slot_t *slots[];
+    ucg_builtin_comp_slot_t **slots;
 } ucg_builtin_ctx_t;
 
+static ucg_builtin_comp_slot_t *ucg_builtin_alloc_slot()
+{
+    ucg_builtin_comp_slot_t *slot =
+        ucs_malloc(sizeof(ucg_builtin_comp_slot_t) * UCG_BUILTIN_MAX_CONCURRENT_OPS, "ucg_msg_slot");
+    if (slot == NULL) {
+        return NULL;
+    }
+
+    unsigned i;
+    for (i = 0; i < UCG_BUILTIN_MAX_CONCURRENT_OPS; i++) {
+        ucs_list_head_init(&slot[i].msg_head);
+        slot[i].mp       = NULL;
+        slot[i].cb       = NULL;
+        slot[i].coll_id  = 0;
+        slot[i].step_idx = 0;
+    }
+    return slot;
+}
+
+static void ucg_builtin_free_slot(ucg_builtin_comp_slot_t *slot)
+{
+    if (!ucs_list_is_empty(&slot->msg_head)) {
+        ucs_warn("message head is not empty!");
+    }
+    ucs_free(slot);
+}
+
+static ucs_status_t ucg_builtin_init_ctx(ucg_builtin_ctx_t **ctx)
+{
+    /* The allocated memory is reclaimed by the operating system. */
+    (*ctx) = UCS_ALLOC_CHECK(sizeof(ucg_builtin_ctx_t), "alloc ucg_builtin_ctx_t");
+
+    (*ctx)->slots_total = 0;
+    (*ctx)->slots = NULL;
+    return UCS_OK;
+}
+
+static ucs_status_t ucg_builtin_extend_slots(ucg_builtin_ctx_t *ctx, unsigned max_size)
+{
+    if (ctx->slots_total >= max_size) {
+        return UCS_OK;
+    }
+
+    size_t slots_size = max_size * sizeof(ucg_builtin_comp_slot_t *);
+    ucg_builtin_comp_slot_t **new_slots = ucs_realloc(ctx->slots, slots_size, "ucg_msg_slots");
+    if (new_slots == NULL) {
+        return UCS_ERR_NO_MEMORY;
+    }
+    ctx->slots = new_slots;
+
+    unsigned i;
+    for (i = ctx->slots_total; i < max_size; i++) {
+        ctx->slots[i] = ucg_builtin_alloc_slot();
+        if (ctx->slots[i] == NULL) {
+            goto cleanup;
+        }
+    }
+    ctx->slots_total = max_size;
+    return UCS_OK;
+
+cleanup:
+    while ((i--) > ctx->slots_total) {
+        ucg_builtin_free_slot(ctx->slots[i]);
+        ctx->slots[i] = NULL;
+    }
+    return UCS_ERR_NO_MEMORY;
+}
+
+static ucg_builtin_ctx_t *ucg_builtin_get_ctx(ucg_worker_h worker)
+{
+    ucg_builtin_ctx_t **ctx = UCG_WORKER_TO_COMPONENT_CTX(ucg_builtin_component, worker);
+    if (*ctx == NULL) {
+        ucs_status_t status = ucg_builtin_init_ctx(ctx);
+        if (status != UCS_OK) {
+            return NULL;
+        }
+    }
+    return (*ctx);
+}
+
+static ucg_builtin_comp_slot_t *ucg_builtin_get_slot(ucg_worker_h worker, unsigned group_id)
+{
+    ucg_builtin_ctx_t *ctx = ucg_builtin_get_ctx(worker);
+    if (ctx == NULL) {
+        return NULL;
+    }
+    if (ctx->slots_total <= group_id) {
+        return NULL;
+    }
+    return ctx->slots[group_id];
+}
+
+static ucg_builtin_comp_slot_t *ucg_builtin_set_slot(ucg_worker_h worker, unsigned group_id, ucs_mpool_t *group_am_mp)
+{
+    ucg_builtin_ctx_t *ctx = ucg_builtin_get_ctx(worker);
+    if (ctx == NULL) {
+        return NULL;
+    }
+    if (ctx->slots_total <= group_id) {
+        ucs_status_t status = ucg_builtin_extend_slots(ctx, group_id + 1);
+        if (status != UCS_OK) {
+            return NULL;
+        }
+    }
+
+    unsigned i;
+    for (i = 0; i < UCG_BUILTIN_MAX_CONCURRENT_OPS; i++) {
+        ucg_builtin_comp_slot_t *slot = &ctx->slots[group_id][i];
+        slot->mp = group_am_mp;
+    }
+    return ctx->slots[group_id];
+}
 
 /*
- *
+ * fix white-box review
 */
 void ucg_builtin_free(void **p)
 {
@@ -140,6 +268,10 @@ enum ucg_builtin_plan_topology_type ucg_builtin_choose_type(enum ucg_collective_
         return UCG_PLAN_RECURSIVE;
     } else if (ucg_algo.ring) {
         return UCG_PLAN_RING;
+    } else if (ucg_algo.NAP) {
+        return UCG_PLAN_NAP;
+    } else if (ucg_algo.binary_block) {
+        return UCG_PLAN_BINARY_BLOCK;
     } else {
         return UCG_PLAN_TREE_FANIN_FANOUT;
     }
@@ -149,17 +281,22 @@ enum ucg_builtin_plan_topology_type ucg_builtin_choose_type(enum ucg_collective_
         return UCG_PLAN_BRUCK;
     }
 
+    if (flags & ucg_predefined_modifiers[UCG_PRIMITIVE_ALLTOALLV]) {
+        return (ucg_algo.plummer) ? UCG_PLAN_ALLTOALLV_PLUMMER : UCG_PLAN_ALLTOALLV_LADD;
+    }
+
     if (flags & UCG_GROUP_COLLECTIVE_MODIFIER_ALLGATHER) {
-        if (ucg_algo.bruck) {
-            return UCG_PLAN_BRUCK;
-        } else {
-            return UCG_PLAN_RECURSIVE;
-        }
+        return (ucg_algo.bruck) ? UCG_PLAN_BRUCK : UCG_PLAN_RECURSIVE;
     }
 
     return UCG_PLAN_TREE_FANIN_FANOUT;
 }
 
+static inline void ucg_builtin_release_desc_self(void *desc)
+{
+    ucs_free(desc);
+}
+
 static ucs_status_t ucg_builtin_am_process(ucg_builtin_comp_slot_t *slot, void *data, size_t length,
                                            unsigned am_flags)
@@ -173,9 +310,11 @@ static ucs_status_t ucg_builtin_am_process(ucg_builtin_comp_slot_t *slot, void *
     if ((slot->req.step->flags & UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_ZCOPY) &&
         (slot->req.step->flags & UCG_BUILTIN_OP_STEP_FLAG_RECV_AFTER_SEND)) {
+        /* receive from "multiple" EPs with "multiple" fragments */
+        unsigned recv_zcopy_cnt = slot->req.step->fragments_recv * slot->req.step->phase->ep_cnt;
         /* Zcopy recv before sending finished, store msg */
-        if (slot->req.pending > slot->req.step->fragments_recv) {
-            if (++slot->req.step->zcopy.num_store > slot->req.step->fragments_recv) {
+        if (slot->req.pending > recv_zcopy_cnt) {
+            if (++slot->req.step->zcopy.num_store > recv_zcopy_cnt) {
                 /* recv msg from step - step index = step now index + 256, store msg without count */
                 slot->req.step->zcopy.num_store--;
             }
@@ -214,14 +353,35 @@ static ucs_status_t ucg_builtin_am_process(ucg_builtin_comp_slot_t *slot, void *
     if (am_flags & UCT_CB_PARAM_FLAG_DESC) {
         desc = (ucg_builtin_comp_desc_t*)((char*)data -
                 offsetof(ucg_builtin_comp_desc_t, header));
+        desc->release = uct_iface_release_desc;
         ret = UCS_INPROGRESS;
     } else {
-        /* Cannot use existing descriptor - must allocate my own... */
-        desc = (ucg_builtin_comp_desc_t*)ucs_mpool_get_inline(slot->mp);
-        if (desc == NULL) {
-            return UCS_ERR_NO_MEMORY;
+        if (slot->mp == NULL) {
+            desc = ucs_malloc(sizeof(ucg_builtin_comp_desc_t) + (length - sizeof(ucg_builtin_header_t)),
+                              "alloc builtin comp desc");
+            if (desc == NULL) {
+                /* The UCT layer does not check other error status codes; it only
+                   checks whether the status is UCS_INPROGRESS and then processes it.
+                   We do not need a UCT desc, so just return UCS_OK. */
+                return UCS_OK;
+            }
+            desc->release = ucg_builtin_release_desc_self;
+        } else {
+            /* Cannot use existing descriptor - must allocate my own... */
+            desc = (ucg_builtin_comp_desc_t*)ucs_mpool_get_inline(slot->mp);
+            if (desc == NULL) {
+                /* The UCT layer does not check other error status codes; it only
+                   checks whether the status is UCS_INPROGRESS and then processes it.
+                   We do not need a UCT desc, so just return UCS_OK. */
+                return UCS_OK;
+            }
+            desc->release = ucs_mpool_put_inline;
+        }
+        errno_t error_status = memcpy_s(&desc->header, length, data, length);
+        if (error_status != EOK) {
+            ret = UCS_ERR_INVALID_PARAM;
+        } else {
+            ret = UCS_OK;
         }
-        memcpy(&desc->header, data, length);
     }
@@ -238,28 +398,20 @@ UCS_PROFILE_FUNC(ucs_status_t, ucg_builtin_am_handler,
                  (arg, data, length, am_flags),
                  void *arg, void *data, size_t length, unsigned am_flags)
 {
+    ucg_worker_h worker = (ucg_worker_h)arg;
     ucg_builtin_header_t *header = data;
-    ucg_builtin_ctx_t **ctx = UCG_WORKER_TO_COMPONENT_CTX(ucg_builtin_component, arg);
     ucg_builtin_comp_slot_t *slot = NULL;
     ucg_group_id_t group_id = header->group_id;
     ucs_assert(length >= sizeof(header));
-    if ((*ctx)->slots_total > group_id) {
-        slot = &(*ctx)->slots[group_id][header->coll_id % UCG_BUILTIN_MAX_CONCURRENT_OPS];
-        if (slot != NULL) {
-            return ucg_builtin_am_process(slot, data, length, am_flags);
+
+    slot = ucg_builtin_get_slot(worker, group_id);
+    if (slot == NULL) {
+        slot = ucg_builtin_set_slot(worker, group_id, NULL);
+        if (slot == NULL) {
+            ucs_fatal("Message abandoned, collective operation cannot be performed.");
         }
     }
-    /* rank A and rank B both creating a new group, This is creates a "race condition",
-       where A maybe sends a message to B before B finished creating the group.
-       At this point, we will encounter the situation that slots_total and group_id are equal.
-       Therefore, we need to store the message and process it when B creates the group. */
-    ucg_builtin_am_buffer_t *buffer = &(*ctx)->buffer;
-    buffer->data = data;
-    buffer->group_id = group_id;
-    buffer->length = length;
-    buffer->am_flags = am_flags;
-    buffer->used = 1;
-    return (am_flags & UCT_CB_PARAM_FLAG_DESC) ? UCS_INPROGRESS : UCS_OK;
+    return ucg_builtin_am_process(&slot[header->coll_id % UCG_BUILTIN_MAX_CONCURRENT_OPS], data, length, am_flags);
 }
 
 void ucg_builtin_msg_dump(ucp_worker_h worker, uct_am_trace_type_t type,
@@ -272,7 +424,6 @@ void ucg_builtin_msg_dump(ucp_worker_h worker, uct_am_trace_type_t type,
              (uint64_t)header->remote_offset, length - sizeof(*header));
 }
 
-
 static ucs_status_t ucg_builtin_init_plan_config(ucg_plan_component_t *plan_component)
 {
     ucg_builtin_config_t *config = (ucg_builtin_config_t*)plan_component->plan_config;
@@ -280,20 +431,20 @@ static ucs_status_t ucg_builtin_init_plan_config(ucg_plan_component_t *plan_comp
     config->pipelining = 0;
     config->recursive.factor = RECURSIVE_FACTOR;
 
-    /* K-nomial tree algorithm require all K vaule is bigger than 1 */
+    /* K-nomial tree algorithms require every K value to be bigger than 1 */
     if (config->bmtree.degree_inter_fanout <= 1 || config->bmtree.degree_inter_fanin <= 1 ||
         config->bmtree.degree_intra_fanout <= 1 || config->bmtree.degree_intra_fanin <= 1) {
-        ucs_info("K-nomial tree algorithm require all K vaule is bigger than one, switch to default parameter sets");
+        ucs_info("K-nomial tree algorithms require every K value to be bigger than one, switching to the default parameter set");
         config->bmtree.degree_inter_fanout = DEFAULT_INTER_KVALUE;
         config->bmtree.degree_inter_fanin  = DEFAULT_INTER_KVALUE;
         config->bmtree.degree_intra_fanout = DEFAULT_INTRA_KVALUE;
         config->bmtree.degree_intra_fanin  = DEFAULT_INTRA_KVALUE;
     }
 
-    ucs_info("plan %s bcast %u allreduce %u barrier %u "
+    ucs_info("plan %s bcast %u allreduce %u barrier %u alltoallv %u "
              "inter_fanout %u inter_fanin %u intra_fanout %u intra_fanin %u",
              plan_component->name, (unsigned)config->bcast_algorithm, (unsigned)config->allreduce_algorithm,
-             (unsigned)config->barrier_algorithm, config->bmtree.degree_inter_fanout, config->bmtree.degree_inter_fanin,
+             (unsigned)config->alltoallv_algorithm, (unsigned)config->barrier_algorithm, config->bmtree.degree_inter_fanout, config->bmtree.degree_inter_fanin,
             config->bmtree.degree_intra_fanout, config->bmtree.degree_intra_fanin);
 
     return UCS_OK;
@@ -307,23 +458,6 @@ static ucs_status_t ucg_builtin_create(ucg_plan_component_t *plan_component,
                                        ucs_mpool_t *group_am_mp,
                                        const ucg_group_params_t *group_params)
 {
-    /* Create or expand the per-worker context - for the AM-handler's sake */
-    ucg_builtin_ctx_t **bctx =
-            UCG_WORKER_TO_COMPONENT_CTX(ucg_builtin_component, worker);
-    if ((ucs_unlikely(*bctx == NULL)) ||
-        (ucs_likely((*bctx)->slots_total <= group_id))) {
-        void *temp = *bctx;
-        size_t bctx_size = sizeof(**bctx) + ((group_id + 1) * sizeof(void*));
-        *bctx = ucs_realloc(temp, bctx_size, "builtin_context");
-        if (ucs_unlikely(*bctx == NULL)) {
-            *bctx = temp;
-            return UCS_ERR_NO_MEMORY;
-        }
-        (*bctx)->slots_total = group_id + 1;
-        (*bctx)->slots_used = (temp == NULL) ? 0 : (*bctx)->slots_used;
-    } else {
-        (*bctx)->slots_used++;
-    }
 
     /* Fill in the information in the per-group context */
     ucg_builtin_group_ctx_t *gctx =
             UCG_GROUP_TO_COMPONENT_CTX(ucg_builtin_component, group);
@@ -336,24 +470,14 @@ static ucs_status_t ucg_builtin_create(ucg_plan_component_t *plan_component,
     ucs_list_head_init(&gctx->send_head);
     ucs_list_head_init(&gctx->plan_head);
 
-    int i;
-    for (i = 0; i < UCG_BUILTIN_MAX_CONCURRENT_OPS; i++) {
-        ucs_list_head_init(&gctx->slots[i].msg_head);
-        gctx->slots[i].mp       = group_am_mp;
-        gctx->slots[i].cb       = NULL;
-        gctx->slots[i].coll_id  = i;
-        gctx->slots[i].step_idx = 0;
+    gctx->slots = ucg_builtin_set_slot(worker, group_id, group_am_mp);
+    if (gctx->slots == NULL) {
+        return UCS_ERR_NO_RESOURCE;
     }
 
-    /* Link the two contexts */
-    (*bctx)->slots[group_id] = gctx->slots;
-
-    if ((*bctx)->buffer.used == 1 && (*bctx)->buffer.group_id == group_id) {
-        ucg_builtin_am_buffer_t *buffer = &(*bctx)->buffer;
-        ucg_builtin_header_t *header = buffer->data;
-        (void)ucg_builtin_am_process(&gctx->slots[header->coll_id], buffer->data,
-                                     buffer->length, buffer->am_flags);
-        buffer->used = 0;
+    if (ucg_builtin_pcache_init(group)) {
+        ucs_error("plan cache init failed");
+        return UCS_ERR_NO_MEMORY;
     }
 
     return ucg_builtin_init_plan_config(plan_component);
@@ -365,6 +489,7 @@ static void ucg_builtin_clean_phases(ucg_builtin_plan_t *plan)
     for (i = 0; i < plan->phs_cnt; i++) {
         ucg_builtin_free((void **)&plan->phss[i].recv_cache_buffer);
         ucg_builtin_free((void **)&plan->phss[i].ucp_eps);
+        ucg_builtin_free((void **)&plan->phss[i].ep_thresh);
     }
 
 #if ENABLE_DEBUG_DATA
@@ -410,21 +535,11 @@ ucs_status_t ucg_builtin_destroy_plan(ucg_builtin_plan_t *plan, ucg_group_h grou
     return UCS_OK;
 }
 
-void ucg_builtin_release_comp_desc(ucg_builtin_comp_desc_t *desc)
-{
-    if (desc->super.flags == UCT_CB_PARAM_FLAG_DESC) {
-        uct_iface_release_desc(desc);
-    } else {
-        ucs_mpool_put_inline(desc);
-    }
-}
-
 static void ucg_builtin_destroy(ucg_group_h group)
 {
     ucg_builtin_group_ctx_t *gctx = UCG_GROUP_TO_COMPONENT_CTX(ucg_builtin_component, group);
-    ucg_builtin_ctx_t **bctx = UCG_WORKER_TO_COMPONENT_CTX(ucg_builtin_component, group->worker);
-    (*bctx)->slots[group->group_id] = NULL;
     unsigned i;
+    ucg_builtin_pcache_destroy(group);
     for (i = 0; i < UCG_BUILTIN_MAX_CONCURRENT_OPS; i++) {
         if (gctx->slots[i].cb != NULL) {
             ucs_debug("Collective operation #%u has been left incomplete (Group #%u)",
@@ -437,15 +552,9 @@ static void ucg_builtin_destroy(ucg_group_h group)
ucg_builtin_comp_desc_t, super.tag_list[0]); ucs_debug("Collective operation #%u has %u bytes left pending for step #%u (Group #%u)", desc->header.coll_id, desc->super.length, desc->header.step_idx, desc->header.group_id); - ucg_builtin_release_comp_desc(desc); - } - } - - if (group->params.topo_map) { - for (i = 0; i < group->params.member_count; i++) { - ucg_builtin_free((void **)&group->params.topo_map[i]); + desc->release(desc); + desc = NULL; } - ucg_builtin_free((void **)&group->params.topo_map); } while (!ucs_list_is_empty(&gctx->plan_head)) { @@ -493,173 +602,15 @@ ucs_mpool_ops_t ucg_builtin_plan_mpool_ops = { .obj_cleanup = ucs_empty_function }; -void ucg_builtin_plan_decision_in_unsupport_allreduce_case_check_msg_size(const size_t msg_size) -{ - if (msg_size < UCG_GROUP_MED_MSG_SIZE) { - /* Node-aware Recursive */ - ucg_builtin_allreduce_algo_switch(UCG_ALGORITHM_ALLREDUCE_NODE_AWARE_RECURSIVE_AND_BMTREE, &ucg_algo); - } else { - /* Ring */ - ucg_builtin_allreduce_algo_switch(UCG_ALGORITHM_ALLREDUCE_RING, &ucg_algo); - } -} - -void ucg_builtin_plan_decision_in_unsupport_allreduce_case(const size_t msg_size, - const ucg_group_params_t *group_params, - const enum ucg_collective_modifiers modifiers, - const ucg_collective_params_t *coll_params) -{ - if (modifiers == ucg_predefined_modifiers[UCG_PRIMITIVE_ALLREDUCE]) { - if (coll_params->send.op_ext && !group_params->op_is_commute_f(coll_params->send.op_ext)) { - /* Ring */ - ucg_builtin_allreduce_algo_switch(UCG_ALGORITHM_ALLREDUCE_RING, &ucg_algo); - ucs_debug("non-commutative operation, select Ring."); - } else { - ucg_builtin_plan_decision_in_unsupport_allreduce_case_check_msg_size(msg_size); - } - } -} - -void ucg_builtin_plan_decision_in_unsupport_bcast_case(const size_t msg_size, - const ucg_group_params_t *group_params, - const enum ucg_collective_modifiers modifiers, - const ucg_collective_params_t *coll_params) -{ - if (modifiers == ucg_predefined_modifiers[UCG_PRIMITIVE_BCAST]) { - /* Node-aware Binomial tree (DEFAULT) */ - ucg_builtin_bcast_algo_switch(UCG_ALGORITHM_BCAST_NODE_AWARE_BMTREE, &ucg_algo); - } -} - -void ucg_builtin_plan_decision_in_unsupport_barrier_case(const size_t msg_size, - const ucg_group_params_t *group_params, - const enum ucg_collective_modifiers modifiers, - const ucg_collective_params_t *coll_params) -{ - if (modifiers == ucg_predefined_modifiers[UCG_PRIMITIVE_BARRIER]) { - /* Node-aware Recursive (DEFAULT) */ - ucg_builtin_barrier_algo_switch(UCG_ALGORITHM_BARRIER_NODE_AWARE_RECURSIVE_AND_BMTREE, &ucg_algo); - } -} - -/* change algorithm in unsupport case */ -void ucg_builtin_plan_decision_in_unsupport_case(const size_t msg_size, - const ucg_group_params_t *group_params, - const enum ucg_collective_modifiers modifiers, - const ucg_collective_params_t *coll_params) -{ - /* choose algorithm due to message size */ - ucg_builtin_plan_decision_in_unsupport_allreduce_case(msg_size, group_params, modifiers, coll_params); - ucg_builtin_plan_decision_in_unsupport_bcast_case(msg_size, group_params, modifiers, coll_params); - ucg_builtin_plan_decision_in_unsupport_barrier_case(msg_size, group_params, modifiers, coll_params); -} - -void ucg_builtin_plan_decision_in_noncommutative_largedata_case_recusive(const size_t msg_size, enum ucg_builtin_allreduce_algorithm *allreduce_algo_decision) -{ - /* Recusive */ - if (allreduce_algo_decision != NULL) { - *allreduce_algo_decision = UCG_ALGORITHM_ALLREDUCE_RECURSIVE; - } - ucg_builtin_allreduce_algo_switch(UCG_ALGORITHM_ALLREDUCE_RECURSIVE, &ucg_algo); - 
ucs_debug("non-commutative operation, select recurisive"); -} - -void ucg_builtin_plan_decision_in_noncommutative_largedata_case_ring(const size_t msg_size, enum ucg_builtin_allreduce_algorithm *allreduce_algo_decision) -{ - /* Ring */ - if (allreduce_algo_decision != NULL) { - *allreduce_algo_decision = UCG_ALGORITHM_ALLREDUCE_RING; - } - ucg_builtin_allreduce_algo_switch(UCG_ALGORITHM_ALLREDUCE_RING, &ucg_algo); - ucs_debug("non-commutative operation, select Ring."); -} - -void ucg_builtin_plan_decision_in_noncommutative_largedata_case(const size_t msg_size, enum ucg_builtin_allreduce_algorithm *allreduce_algo_decision) -{ - if (msg_size < UCG_GROUP_MED_MSG_SIZE) { - ucg_builtin_plan_decision_in_noncommutative_largedata_case_recusive(msg_size, allreduce_algo_decision); - } else { - ucg_builtin_plan_decision_in_noncommutative_largedata_case_ring(msg_size, allreduce_algo_decision); - } -} - -void ucg_builtin_plan_decision_in_noncommutative_many_counts_case() -{ - ucg_builtin_allreduce_algo_switch(UCG_ALGORITHM_ALLREDUCE_RECURSIVE, &ucg_algo); - ucs_debug("non-commutative operation with more than one send count, select recurisive"); -} - -void ucg_builtin_allreduce_decision_fixed(const size_t msg_size, - const ucg_group_params_t *group_params, - const ucg_collective_params_t *coll_params, - const unsigned large_datatype_threshold, - const int is_unbalanced_ppn, - enum ucg_builtin_allreduce_algorithm *allreduce_algo_decision) -{ - unsigned is_large_datatype = (coll_params->send.dt_len > large_datatype_threshold); - unsigned is_non_commutative = coll_params->send.op_ext - && !group_params->op_is_commute_f(coll_params->send.op_ext); - if (is_large_datatype || is_non_commutative) { - ucg_builtin_plan_decision_in_noncommutative_largedata_case(msg_size, allreduce_algo_decision); - } else if (msg_size >= UCG_GROUP_MED_MSG_SIZE) { - /* Ring */ - *allreduce_algo_decision = UCG_ALGORITHM_ALLREDUCE_RING; - ucg_builtin_allreduce_algo_switch(*allreduce_algo_decision, &ucg_algo); - } else if (is_unbalanced_ppn) { - /* Node-aware Recursive */ - *allreduce_algo_decision = UCG_ALGORITHM_ALLREDUCE_NODE_AWARE_RECURSIVE_AND_BMTREE; - ucg_builtin_allreduce_algo_switch(*allreduce_algo_decision, &ucg_algo); - } else { - /* Node-aware Kinomial tree (DEFAULT) */ - *allreduce_algo_decision = UCG_ALGORITHM_ALLREDUCE_NODE_AWARE_KMTREE; - ucg_builtin_allreduce_algo_switch(*allreduce_algo_decision, &ucg_algo); - } -} - -void plan_decision_fixed(const size_t msg_size, - const ucg_group_params_t *group_params, - const enum ucg_collective_modifiers modifiers, - const ucg_collective_params_t *coll_params, - const unsigned large_datatype_threshold, - const int is_unbalanced_ppn, - enum ucg_builtin_bcast_algorithm *bcast_algo_decision, - enum ucg_builtin_allreduce_algorithm *allreduce_algo_decision, - enum ucg_builtin_barrier_algorithm *barrier_algo_decision) -{ - *bcast_algo_decision = UCG_ALGORITHM_BCAST_AUTO_DECISION; - *allreduce_algo_decision = UCG_ALGORITHM_ALLREDUCE_AUTO_DECISION; - *barrier_algo_decision = UCG_ALGORITHM_BARRIER_AUTO_DECISION; - /* choose algorithm due to message size */ - if (modifiers == ucg_predefined_modifiers[UCG_PRIMITIVE_ALLREDUCE]) { - ucg_builtin_allreduce_decision_fixed(msg_size, group_params, coll_params, large_datatype_threshold, - is_unbalanced_ppn, allreduce_algo_decision); - } - if (modifiers == ucg_predefined_modifiers[UCG_PRIMITIVE_BCAST]) { - /* Node-aware Binomial tree (DEFAULT) */ - *bcast_algo_decision = UCG_ALGORITHM_BCAST_NODE_AWARE_KMTREE; - 
ucg_builtin_bcast_algo_switch(*bcast_algo_decision, &ucg_algo); - } - if (modifiers == ucg_predefined_modifiers[UCG_PRIMITIVE_BARRIER]) { - /* Node-aware Recursive (DEFAULT) */ - if (is_unbalanced_ppn) { - /* Node-aware Recursive */ - *barrier_algo_decision = UCG_ALGORITHM_BARRIER_NODE_AWARE_RECURSIVE_AND_BMTREE; - ucg_builtin_barrier_algo_switch(*barrier_algo_decision, &ucg_algo); - } else { - /* Node-aware Kinomial tree (DEFAULT) */ - *barrier_algo_decision = UCG_ALGORITHM_BARRIER_NODE_AWARE_KMTREE; - ucg_builtin_barrier_algo_switch(*barrier_algo_decision, &ucg_algo); - } - } -} - void ucg_builtin_fillin_algo(struct ucg_builtin_algorithm *algo, unsigned bmtree, unsigned kmtree, unsigned kmtree_intra, unsigned recursive, unsigned topo, - unsigned ring) + unsigned ring, + unsigned NAP, + unsigned binary_block) { algo->bmtree = bmtree; algo->kmtree = kmtree; @@ -667,18 +618,23 @@ void ucg_builtin_fillin_algo(struct ucg_builtin_algorithm *algo, algo->recursive = recursive; algo->topo = topo; algo->ring = ring; + algo->NAP = NAP; + algo->binary_block = binary_block; } static void ucg_builtin_init_algo(struct ucg_builtin_algorithm *algo) { - ucg_builtin_fillin_algo(algo, 1, 0, 0, 1, 0, 0); + ucg_builtin_fillin_algo(algo, 1, 0, 0, 1, 0, 0, 0, 0); algo->bruck = 1, algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_NODE, algo->pipeline = 0; algo->feature_flag = UCG_ALGORITHM_SUPPORT_COMMON_FEATURE; + algo->inc = 0; + algo->ladd = 0; + algo->plummer = 0; } -ucs_status_t ucg_builtin_bcast_algo_switch(const enum ucg_builtin_bcast_algorithm bcast_algo_decision, +void ucg_builtin_bcast_algo_switch(const enum ucg_builtin_bcast_algorithm bcast_algo_decision, struct ucg_builtin_algorithm *algo) { algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_NODE; @@ -686,277 +642,206 @@ ucs_status_t ucg_builtin_bcast_algo_switch(const enum ucg_builtin_bcast_algorith algo->bruck = 1; switch (bcast_algo_decision) { case UCG_ALGORITHM_BCAST_BMTREE: - ucg_builtin_fillin_algo(algo, 1, 0, 0, 0, 0, 0); + ucg_builtin_fillin_algo(algo, 1, 0, 0, 0, 0, 0, 0, 0); algo->feature_flag |= UCG_ALGORITHM_SUPPORT_RANK_FEATURE; break; case UCG_ALGORITHM_BCAST_NODE_AWARE_BMTREE: - ucg_builtin_fillin_algo(algo, 1, 0, 0, 0, 1, 0); + ucg_builtin_fillin_algo(algo, 1, 0, 0, 0, 1, 0, 0, 0); algo->feature_flag |= UCG_ALGORITHM_SUPPORT_RANK_FEATURE; break; case UCG_ALGORITHM_BCAST_NODE_AWARE_KMTREE_AND_BMTREE: - ucg_builtin_fillin_algo(algo, 1, 1, 0, 0, 1, 0); + ucg_builtin_fillin_algo(algo, 1, 1, 0, 0, 1, 0, 0, 0); break; case UCG_ALGORITHM_BCAST_NODE_AWARE_KMTREE: - ucg_builtin_fillin_algo(algo, 1, 1, 1, 0, 1, 0); + ucg_builtin_fillin_algo(algo, 1, 1, 1, 0, 1, 0, 0, 0); + break; + case UCG_ALGORITHM_BCAST_NODE_AWARE_INC: + ucg_builtin_fillin_algo(algo, 1, 0, 0, 0, 1, 0, 0, 0); + algo->inc = 1; break; default: - ucg_builtin_bcast_algo_switch(UCG_ALGORITHM_BCAST_NODE_AWARE_KMTREE, algo); + ucg_builtin_bcast_algo_switch(UCG_ALGORITHM_BCAST_NODE_AWARE_KMTREE_AND_BMTREE, algo); break; } - return UCS_OK; } -ucs_status_t ucg_builtin_barrier_algo_switch(const enum ucg_builtin_barrier_algorithm barrier_algo_decision, +void ucg_builtin_barrier_algo_switch(const enum ucg_builtin_barrier_algorithm barrier_algo_decision, struct ucg_builtin_algorithm *algo) { algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_NODE; algo->bruck = 1; switch (barrier_algo_decision) { case UCG_ALGORITHM_BARRIER_RECURSIVE: - ucg_builtin_fillin_algo(algo, 0, 0, 0, 1, 0, 0); + ucg_builtin_fillin_algo(algo, 0, 0, 0, 1, 0, 0, 0, 0); algo->feature_flag |= UCG_ALGORITHM_SUPPORT_RANK_FEATURE; 
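With the two extra parameters added above, ucg_builtin_fillin_algo() now takes nine positional 0/1 flags, which is easy to misread at call sites. A hedged, self-contained alternative using C99 designated initializers; demo_* names are illustrative and the real struct ucg_builtin_algorithm carries more members (bruck, inc, topo_level, ...):

    #include <stdio.h>

    /* Demo mirror of the flag fields ucg_builtin_fillin_algo() sets. */
    typedef struct demo_algo {
        unsigned bmtree, kmtree, kmtree_intra, recursive, topo, ring;
        unsigned nap, binary_block;
    } demo_algo_t;

    int main(void)
    {
        /* Each flag is named at the call site, so appending a ninth
         * parameter cannot silently shift the meaning of the earlier ones. */
        demo_algo_t barrier_nap         = { .topo = 1, .nap = 1 };
        demo_algo_t allreduce_bin_block = { .binary_block = 1 };

        printf("nap=%u binary_block=%u\n",
               barrier_nap.nap, allreduce_bin_block.binary_block);
        return 0;
    }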
algo->feature_flag |= UCG_ALGORITHM_SUPPORT_BIND_TO_NONE; break; case UCG_ALGORITHM_BARRIER_NODE_AWARE_RECURSIVE_AND_BMTREE: - ucg_builtin_fillin_algo(algo, 1, 0, 0, 0, 1, 0); + ucg_builtin_fillin_algo(algo, 1, 0, 0, 0, 1, 0, 0, 0); algo->feature_flag |= UCG_ALGORITHM_SUPPORT_RANK_FEATURE; algo->feature_flag |= UCG_ALGORITHM_SUPPORT_BIND_TO_NONE; break; case UCG_ALGORITHM_BARRIER_SOCKET_AWARE_RECURSIVE_AND_BMTREE: - ucg_builtin_fillin_algo(algo, 1, 0, 0, 0, 1, 0); + ucg_builtin_fillin_algo(algo, 1, 0, 0, 0, 1, 0, 0, 0); algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_SOCKET; break; case UCG_ALGORITHM_BARRIER_NODE_AWARE_RECURSIVE_AND_KMTREE: - ucg_builtin_fillin_algo(algo, 1, 0, 1, 0, 1, 0); + ucg_builtin_fillin_algo(algo, 1, 0, 1, 0, 1, 0, 0, 0); algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_NODE; algo->feature_flag |= UCG_ALGORITHM_SUPPORT_BIND_TO_NONE; break; case UCG_ALGORITHM_BARRIER_SOCKET_AWARE_RECURSIVE_AND_KMTREE: - ucg_builtin_fillin_algo(algo, 1, 0, 1, 0, 1, 0); + ucg_builtin_fillin_algo(algo, 1, 0, 1, 0, 1, 0, 0, 0); algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_SOCKET; break; case UCG_ALGORITHM_BARRIER_NODE_AWARE_KMTREE: - ucg_builtin_fillin_algo(algo, 1, 1, 1, 0, 1, 0); + ucg_builtin_fillin_algo(algo, 1, 1, 1, 0, 1, 0, 0, 0); algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_NODE; algo->feature_flag |= UCG_ALGORITHM_SUPPORT_BIND_TO_NONE; break; case UCG_ALGORITHM_BARRIER_SOCKET_AWARE_KMTREE: - ucg_builtin_fillin_algo(algo, 1, 1, 1, 0, 1, 0); + ucg_builtin_fillin_algo(algo, 1, 1, 1, 0, 1, 0, 0, 0); + algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_SOCKET; + break; + case UCG_ALGORITHM_BARRIER_NODE_AWARE_INC: + ucg_builtin_fillin_algo(algo, 1, 0, 0, 0, 1, 0, 0, 0); + algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_NODE; + algo->inc = 1; + break; + case UCG_ALGORITHM_BARRIER_SOCKET_AWARE_INC: + ucg_builtin_fillin_algo(algo, 1, 0, 0, 0, 1, 0, 0, 0); algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_SOCKET; + algo->inc = 1; + break; + case UCG_ALGORITHM_BARRIER_NAP: + ucg_builtin_fillin_algo(algo, 0, 0, 0, 0, 1, 0, 1, 0); + algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_NODE; break; default: ucg_builtin_barrier_algo_switch(UCG_ALGORITHM_BARRIER_NODE_AWARE_KMTREE, algo); break; } - return UCS_OK; } -ucs_status_t ucg_builtin_allreduce_algo_switch(const enum ucg_builtin_allreduce_algorithm allreduce_algo_decision, +void ucg_builtin_allreduce_algo_switch(const enum ucg_builtin_allreduce_algorithm allreduce_algo_decision, struct ucg_builtin_algorithm *algo) { algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_NODE; algo->bruck = 1; switch (allreduce_algo_decision) { case UCG_ALGORITHM_ALLREDUCE_RECURSIVE: - ucg_builtin_fillin_algo(algo, 0, 0, 0, 1, 0, 0); - algo->feature_flag |= UCG_ALGORITHM_SUPPORT_RANK_FEATURE; - algo->feature_flag |= UCG_ALGORITHM_SUPPORT_ALLREDUCE_RARE_FEATURE; - algo->feature_flag |= UCG_ALGORITHM_SUPPORT_BIND_TO_NONE; + ucg_builtin_fillin_algo(algo, 0, 0, 0, 1, 0, 0, 0, 0); + algo->feature_flag |= UCG_ALGORITHM_SUPPORT_RANK_FEATURE | + UCG_ALGORITHM_SUPPORT_ALLREDUCE_RARE_FEATURE | + UCG_ALGORITHM_SUPPORT_BIND_TO_NONE; break; case UCG_ALGORITHM_ALLREDUCE_NODE_AWARE_RECURSIVE_AND_BMTREE: - ucg_builtin_fillin_algo(algo, 1, 0, 0, 0, 1, 0); + ucg_builtin_fillin_algo(algo, 1, 0, 0, 0, 1, 0, 0, 0); algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_NODE; - algo->feature_flag |= UCG_ALGORITHM_SUPPORT_RANK_FEATURE; - algo->feature_flag |= UCG_ALGORITHM_SUPPORT_BIND_TO_NONE; + algo->feature_flag |= UCG_ALGORITHM_SUPPORT_RANK_FEATURE | + UCG_ALGORITHM_SUPPORT_BIND_TO_NONE; break; case 
UCG_ALGORITHM_ALLREDUCE_SOCKET_AWARE_RECURSIVE_AND_BMTREE: - ucg_builtin_fillin_algo(algo, 1, 0, 0, 0, 1, 0); + ucg_builtin_fillin_algo(algo, 1, 0, 0, 0, 1, 0, 0, 0); algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_SOCKET; break; case UCG_ALGORITHM_ALLREDUCE_RING: - ucg_builtin_fillin_algo(algo, 0, 0, 0, 0, 0, 1); - algo->feature_flag |= UCG_ALGORITHM_SUPPORT_RANK_FEATURE; - algo->feature_flag |= UCG_ALGORITHM_SUPPORT_ALLREDUCE_RARE_FEATURE; - algo->feature_flag |= UCG_ALGORITHM_SUPPORT_BIND_TO_NONE; + ucg_builtin_fillin_algo(algo, 0, 0, 0, 0, 0, 1, 0, 0); + algo->feature_flag |= UCG_ALGORITHM_SUPPORT_RANK_FEATURE | + UCG_ALGORITHM_SUPPORT_ALLREDUCE_RARE_FEATURE | + UCG_ALGORITHM_SUPPORT_BIND_TO_NONE; break; case UCG_ALGORITHM_ALLREDUCE_NODE_AWARE_RECURSIVE_AND_KMTREE: - ucg_builtin_fillin_algo(algo, 1, 0, 1, 0, 1, 0); + ucg_builtin_fillin_algo(algo, 1, 0, 1, 0, 1, 0, 0, 0); algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_NODE; algo->feature_flag |= UCG_ALGORITHM_SUPPORT_BIND_TO_NONE; break; case UCG_ALGORITHM_ALLREDUCE_SOCKET_AWARE_RECURSIVE_AND_KMTREE: - ucg_builtin_fillin_algo(algo, 1, 0, 1, 0, 1, 0); + ucg_builtin_fillin_algo(algo, 1, 0, 1, 0, 1, 0, 0, 0); algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_SOCKET; break; case UCG_ALGORITHM_ALLREDUCE_NODE_AWARE_KMTREE: - ucg_builtin_fillin_algo(algo, 1, 1, 1, 0, 1, 0); + ucg_builtin_fillin_algo(algo, 1, 1, 1, 0, 1, 0, 0, 0); algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_NODE; algo->feature_flag |= UCG_ALGORITHM_SUPPORT_BIND_TO_NONE; break; case UCG_ALGORITHM_ALLREDUCE_SOCKET_AWARE_KMTREE: - ucg_builtin_fillin_algo(algo, 1, 1, 1, 0, 1, 0); + ucg_builtin_fillin_algo(algo, 1, 1, 1, 0, 1, 0, 0, 0); + algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_SOCKET; + break; + case UCG_ALGORITHM_ALLREDUCE_NODE_AWARE_INC: + ucg_builtin_fillin_algo(algo, 1, 0, 0, 0, 1, 0, 0, 0); + algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_NODE; + algo->inc = 1; + break; + case UCG_ALGORITHM_ALLREDUCE_SOCKET_AWARE_INC: + ucg_builtin_fillin_algo(algo, 1, 0, 0, 0, 1, 0, 0, 0); + algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_SOCKET; + algo->inc = 1; + break; + case UCG_ALGORITHM_ALLREDUCE_NAP: + ucg_builtin_fillin_algo(algo, 0, 0, 0, 0, 1, 0, 1, 0); + algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_NODE; + break; + case UCG_ALGORITHM_ALLREDUCE_RABENSEIFNER_BINARY_BLOCK: + ucg_builtin_fillin_algo(algo, 0, 0, 0, 0, 0, 0, 0, 1); + algo->feature_flag |= UCG_ALGORITHM_SUPPORT_RANK_FEATURE; + break; + case UCG_ALGORITHM_ALLREDUCE_NODE_AWARE_RABENSEIFNER_BINARY_BLOCK: + ucg_builtin_fillin_algo(algo, 0, 0, 0, 0, 1, 0, 0, 1); + algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_NODE; + break; + case UCG_ALGORITHM_ALLREDUCE_SOCKET_AWARE_RABENSEIFNER_BINARY_BLOCK: + ucg_builtin_fillin_algo(algo, 0, 0, 0, 0, 1, 0, 0, 1); algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_SOCKET; break; default: ucg_builtin_allreduce_algo_switch(UCG_ALGORITHM_ALLREDUCE_NODE_AWARE_KMTREE, algo); break; } - return UCS_OK; -} - -void ucg_builtin_check_algorithm_param_size(ucg_builtin_config_t *config) -{ - if (((int)config->allreduce_algorithm >= UCG_ALGORITHM_ALLREDUCE_LAST) || ((int)config->allreduce_algorithm < UCG_ALGORITHM_ALLREDUCE_AUTO_DECISION)) { - ucs_info("Param UCX_BUILTIN_ALLREDUCE_ALGORITHM=%d is invalid parameter, switch to default algorithm.", (int)config->allreduce_algorithm); - } - if (((int)config->bcast_algorithm >= UCG_ALGORITHM_BCAST_LAST) || ((int)config->bcast_algorithm < UCG_ALGORITHM_BCAST_AUTO_DECISION)) { - ucs_info("Param UCX_BUILTIN_BCAST_ALGORITHM=%d is invalid parameter, switch to default algorithm.", 
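The UCG_ALGORITHM_ALLREDUCE_*RABENSEIFNER_BINARY_BLOCK cases above enable a Rabenseifner-style allreduce (reduce-scatter followed by allgather); the "binary block" part handles non-power-of-two group sizes by splitting the ranks into descending power-of-two blocks. A self-contained sketch of that decomposition, assuming the standard scheme (the actual plan is built by ucg_builtin_binary_block_create(), which is not shown in this hunk):

    #include <stdio.h>

    /* Decompose n ranks into descending power-of-two blocks, the usual
     * first step of a binary-block Rabenseifner allreduce. */
    static void demo_binary_blocks(unsigned n)
    {
        unsigned base = 0;
        while (n > 0) {
            unsigned block = 1;
            while ((block << 1) <= n) {
                block <<= 1;            /* largest power of two <= n */
            }
            printf("block of %u ranks starting at rank %u\n", block, base);
            base += block;
            n    -= block;
        }
    }

    int main(void)
    {
        demo_binary_blocks(22);          /* -> blocks of 16, 4 and 2 */
        return 0;
    }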
(int)config->bcast_algorithm); - } - if (((int)config->barrier_algorithm >= UCG_ALGORITHM_BARRIER_LAST) || ((int)config->barrier_algorithm < UCG_ALGORITHM_BARRIER_AUTO_DECISION)) { - ucs_info("Param UCX_BUILTIN_BARRIER_ALGORITHM=%d is invalid parameter, switch to default algorithm.", (int)config->barrier_algorithm); - } } -void ucg_builtin_check_algorithm_param_type(ucg_builtin_config_t *config) +void ucg_builtin_alltoallv_algo_switch(const enum ucg_builtin_alltoallv_algorithm alltoallv_algo_decision, + struct ucg_builtin_algorithm *algo) { - if ((config->allreduce_algorithm - (int)config->allreduce_algorithm) != 0) { - ucs_info("Param UCX_BUILTIN_ALLREDUCE_ALGORITHM=%lf is not unsigned integer, switch to unsigned integer '%d'.", config->allreduce_algorithm, (int)config->allreduce_algorithm); - } - if ((config->bcast_algorithm - (int)config->bcast_algorithm) != 0) { - ucs_info("Param UCX_BUILTIN_BCAST_ALGORITHM=%lf is not unsigned integer, switch to unsigned integer '%d'.", config->bcast_algorithm, (int)config->bcast_algorithm); - } - if ((config->barrier_algorithm - (int)config->barrier_algorithm) != 0) { - ucs_info("Param UCX_BUILTIN_BARRIER_ALGORITHM=%lf is not unsigned integer, switch to unsigned integer '%d'.", config->barrier_algorithm, (int)config->barrier_algorithm); - } -} - -enum choose_ops_mask ucg_builtin_plan_choose_ops(ucg_plan_component_t *plan_component, enum ucg_collective_modifiers ops_type_choose) -{ - ucg_builtin_config_t *config = (ucg_builtin_config_t *)plan_component->plan_config; - ucg_builtin_check_algorithm_param_type(config); - ucg_builtin_check_algorithm_param_size(config); - - enum ucg_builtin_bcast_algorithm bcast_algo_decision = (enum ucg_builtin_bcast_algorithm)config->bcast_algorithm; - enum ucg_builtin_allreduce_algorithm allreduce_algo_decision = (enum ucg_builtin_allreduce_algorithm) - config->allreduce_algorithm; - enum ucg_builtin_barrier_algorithm barrier_algo_decision = (enum ucg_builtin_barrier_algorithm) - config->barrier_algorithm; - enum choose_ops_mask result = OPS_AUTO_DECISION; - - if (!(bcast_algo_decision | allreduce_algo_decision | barrier_algo_decision)) { - return OPS_AUTO_DECISION; - } - - if (ops_type_choose == ucg_predefined_modifiers[UCG_PRIMITIVE_BCAST]) { - if (bcast_algo_decision >= UCG_ALGORITHM_BCAST_LAST || bcast_algo_decision <= UCG_ALGORITHM_BCAST_AUTO_DECISION) { - return OPS_AUTO_DECISION; - } - result = OPS_BCAST; - } - - if (ops_type_choose == ucg_predefined_modifiers[UCG_PRIMITIVE_ALLREDUCE]) { - if (allreduce_algo_decision >= UCG_ALGORITHM_ALLREDUCE_LAST || allreduce_algo_decision <= UCG_ALGORITHM_ALLREDUCE_AUTO_DECISION) { - return OPS_AUTO_DECISION; - } - result = OPS_ALLREDUCE; - } - - if (ops_type_choose == ucg_predefined_modifiers[UCG_PRIMITIVE_BARRIER]) { - if (barrier_algo_decision >= UCG_ALGORITHM_BARRIER_LAST || barrier_algo_decision <= UCG_ALGORITHM_BARRIER_AUTO_DECISION) { - return OPS_AUTO_DECISION; - } - result = OPS_BARRIER; - } - - return result; -} - -void ucg_builtin_check_continuous_number_by_sort(ucg_group_member_index_t *array, - unsigned array_len, - unsigned *discont_flag) -{ - ucg_group_member_index_t member_idx; - unsigned idx, idx2; - /* bubble sort */ - for (idx = 0; idx < array_len - 1; idx++) { - for (idx2 = 0; idx2 < array_len - 1 - idx; idx2++) { - if (array[idx2] > array[idx2 + 1]) { - member_idx = array[idx2 + 1]; - array[idx2 + 1] = array[idx2]; - array[idx2] = member_idx; - } - } - } - /* discontinous or not */ - for (idx = 0; idx < array_len - 1; idx++) { - if (array[idx + 1] - 
array[idx] != 1) {
-            *discont_flag = 1;
+    algo->topo_level = UCG_GROUP_HIERARCHY_LEVEL_NODE;
+    algo->feature_flag |= UCG_ALGORITHM_SUPPORT_RANK_FEATURE;
+    algo->bruck = 0;
+    switch (alltoallv_algo_decision) {
+    case UCG_ALGORITHM_ALLTOALLV_LADD:
+        ucg_builtin_fillin_algo(algo, 0, 0, 0, 0, 0, 0, 0, 0);
+        algo->ladd = 1;
+        break;
+    case UCG_ALGORITHM_ALLTOALLV_NODE_AWARE_PLUMMER:
+        ucg_builtin_fillin_algo(algo, 0, 0, 0, 0, 1, 0, 0, 0);
+        algo->plummer = 1;
+        break;
+    default:
+        ucg_builtin_alltoallv_algo_switch(UCG_ALGORITHM_ALLTOALLV_LADD, algo);
            break;
-        }
-    }
-}
-
-static void ucg_builtin_prepare_rank_same_unit(const ucg_group_params_t *group_params,
-                                               enum ucg_group_member_distance domain_distance,
-                                               ucg_group_member_index_t *rank_same_unit)
-{
-    unsigned idx, member_idx;
-    enum ucg_group_member_distance next_distance;
-    for (idx = 0, member_idx = 0; member_idx < group_params->member_count; member_idx++) {
-        next_distance = group_params->distance[member_idx];
-        if (ucs_likely(next_distance <= domain_distance)) {
-            rank_same_unit[idx++] = member_idx;
-        }
    }
}

-ucs_status_t ucg_builtin_check_continuous_number_no_topo_map(const ucg_group_params_t *group_params,
-                                                             enum ucg_group_member_distance domain_distance,
-                                                             unsigned *discont_flag)
+enum ucg_group_member_distance ucg_builtin_get_distance(const ucg_group_params_t *group_params,
+                                                        ucg_group_member_index_t rank1,
+                                                        ucg_group_member_index_t rank2)
 {
-    unsigned ppx = ucg_builtin_calculate_ppx(group_params, domain_distance);
-
-    /* store rank number in same unit */
-    size_t alloc_size = ppx * sizeof(ucg_group_member_index_t);
-    ucg_group_member_index_t *rank_same_unit = (ucg_group_member_index_t*)UCS_ALLOC_CHECK(alloc_size, "rank number");
-    memset(rank_same_unit, 0, alloc_size);
-    ucg_builtin_prepare_rank_same_unit(group_params, domain_distance, rank_same_unit);
-
-    ucg_builtin_check_continuous_number_by_sort(rank_same_unit, ppx, discont_flag);
-    ucg_builtin_free((void **)&rank_same_unit);
-    return UCS_OK;
+    return group_params->mpi_rank_distance(group_params->cb_group_obj, rank1, rank2);
 }

 ucs_status_t ucg_builtin_check_continuous_number(const ucg_group_params_t *group_params,
                                                  enum ucg_group_member_distance domain_distance,
                                                  unsigned *discont_flag)
 {
-    if (group_params->topo_map == NULL) {
-        return ucg_builtin_check_continuous_number_no_topo_map(group_params, domain_distance, discont_flag);
-    }
-
-    char domain_distance_ch = (char)domain_distance;
-    /* Check the topo distance in each line and find all ranks in the same node
-       Make sure the ranks in the same node are continuous.
*/ - for (unsigned i = 0; i < group_params->member_count; i++) { - int last_same_unit_rank = -1; - for (unsigned j = 0; j < group_params->member_count; j++) { - if (group_params->topo_map[i][j] > domain_distance_ch) { - continue; - } - - if (last_same_unit_rank != -1 && j - last_same_unit_rank != 1) { - *discont_flag = 1; - return UCS_OK; - } - last_same_unit_rank = j; - } + if (domain_distance == UCG_GROUP_MEMBER_DISTANCE_SOCKET) { + *discont_flag = group_params->topo_args.srank_uncontinue; + } else { + *discont_flag = group_params->topo_args.nrank_uncontinue; } - *discont_flag = 0; return UCS_OK; } -ucs_status_t choose_distance_from_topo_aware_level(enum ucg_group_member_distance *domain_distance) +void choose_distance_from_topo_aware_level(enum ucg_group_member_distance *domain_distance) { switch (ucg_algo.topo_level) { case UCG_GROUP_HIERARCHY_LEVEL_NODE: @@ -971,21 +856,6 @@ ucs_status_t choose_distance_from_topo_aware_level(enum ucg_group_member_distanc default: break; } - return UCS_OK; -} - -void ucg_builtin_non_commutative_operation(const ucg_group_params_t *group_params, const ucg_collective_params_t *coll_params, struct ucg_builtin_algorithm *algo, const size_t msg_size) -{ - if (coll_params->send.op_ext && !group_params->op_is_commute_f(coll_params->send.op_ext) && - !(algo->feature_flag & UCG_ALGORITHM_SUPPORT_NON_COMMUTATIVE_OPS)) { - if (coll_params->send.count > 1) { - ucg_builtin_plan_decision_in_noncommutative_many_counts_case(); - ucs_info("Current algorithm does not support many counts non-commutative operation, and switch to Recursive doubling which may have unexpected performance"); - } else { - ucg_builtin_plan_decision_in_noncommutative_largedata_case(msg_size, NULL); - ucs_info("Current algorithm does not support non commutative operation, and switch to Recursive doubling or Ring Algorithm which may have unexpected performance"); - } - } } int ucg_builtin_op_can_reuse(const ucg_plan_t *plan, const ucg_op_t *op, @@ -999,6 +869,10 @@ int ucg_builtin_op_can_reuse(const ucg_plan_t *plan, const ucg_op_t *op, return 0; } + /* Alltoallv does not consider op reuse. 
*/ + if (params->type.modifiers == ucg_predefined_modifiers[UCG_PRIMITIVE_ALLTOALLV]) { + return 0; + } if (params->send.count > 0) { builtin_plan->convert_f(params->send.dt_ext, &send_dtype); if (!UCG_DT_IS_CONTIG(params, send_dtype)) { @@ -1009,258 +883,66 @@ int ucg_builtin_op_can_reuse(const ucg_plan_t *plan, const ucg_op_t *op, return 1; } -void ucg_builtin_update_op(const ucg_plan_t *plan, ucg_op_t *op, - const ucg_collective_params_t *params) -{ - ucp_datatype_t send_dtype = UCP_DATATYPE_CONTIG; - ucp_datatype_t recv_dtype = UCP_DATATYPE_CONTIG; - ucg_builtin_plan_t *builtin_plan = (ucg_builtin_plan_t *)plan; - ucg_builtin_op_t *builtin_op = (ucg_builtin_op_t *)op; - - builtin_op->send_dt = NULL; - builtin_op->recv_dt = NULL; - if (params->send.count > 0 && params->send.dt_len > 0) { - builtin_plan->convert_f(params->send.dt_ext, &send_dtype); - if (!UCG_DT_IS_CONTIG(params, send_dtype)) { - builtin_op->send_dt = ucp_dt_generic(send_dtype); - } - } - - if (params->recv.count > 0 && params->recv.dt_len > 0) { - builtin_plan->convert_f(params->recv.dt_ext, &recv_dtype); - if (!UCG_DT_IS_CONTIG(params, recv_dtype)) { - builtin_op->recv_dt = ucp_dt_generic(recv_dtype); - } - } -} - -int ucg_is_noncontig_allreduce(const ucg_group_params_t *group_params, - const ucg_collective_params_t *coll_params) -{ - ucp_datatype_t ucp_datatype; - - if (coll_params->type.modifiers == ucg_predefined_modifiers[UCG_PRIMITIVE_ALLREDUCE] && - coll_params->send.count > 0 && coll_params->send.dt_len > 0) { - group_params->mpi_dt_convert(coll_params->send.dt_ext, &ucp_datatype); - if (!UCP_DT_IS_CONTIG(ucp_datatype)) { - ucs_debug("allreduce non-contiguous datatype"); - return 1; - } - } - - return 0; -} - -int ucg_is_noncommutative_allreduce(const ucg_group_params_t *group_params, - const ucg_collective_params_t *coll_params) -{ - return coll_params->type.modifiers == ucg_predefined_modifiers[UCG_PRIMITIVE_ALLREDUCE] && - coll_params->send.op_ext && !group_params->op_is_commute_f(coll_params->send.op_ext); -} - -#define UCT_MIN_SHORT_ONE_LEN 80 -#define UCT_MIN_BCOPY_ONE_LEN 1000 -int ucg_is_segmented_allreduce(const ucg_collective_params_t *coll_params) -{ - int count = coll_params->send.count; - size_t dt_len = coll_params->send.dt_len; - - if (coll_params->type.modifiers == ucg_predefined_modifiers[UCG_PRIMITIVE_ALLREDUCE]) { - if (dt_len > UCT_MIN_BCOPY_ONE_LEN) { - return 1; - } - - if (dt_len > UCT_MIN_SHORT_ONE_LEN && (dt_len * count) < UCG_GROUP_MED_MSG_SIZE) { - return 1; - } - } - - return 0; -} - -/* - Deal with all unsupport special case. 
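The reuse gate added above means cached op objects are never reused for ALLTOALLV, whose per-call counts and displacements would make cached state stale. A self-contained sketch of the gate's shape; demo_* names are illustrative:

    #include <stdbool.h>

    /* Collectives whose geometry is carried per call (alltoallv
     * counts/displs) must always rebuild the operation object. */
    typedef enum {
        DEMO_BARRIER, DEMO_BCAST, DEMO_ALLREDUCE, DEMO_ALLTOALLV
    } demo_coll_t;

    static bool demo_op_can_reuse(demo_coll_t type, bool send_contig, bool recv_contig)
    {
        if (type == DEMO_ALLTOALLV) {
            return false;   /* per-call counts/displs invalidate any cached op */
        }
        return send_contig && recv_contig;  /* non-contig dtypes also rebuild */
    }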
-*/ -ucs_status_t ucg_builtin_change_unsupport_algo(struct ucg_builtin_algorithm *algo, - const ucg_group_params_t *group_params, - const size_t msg_size, - const ucg_collective_params_t *coll_params, - const enum ucg_collective_modifiers ops_type_choose, - enum choose_ops_mask ops_choose, - ucg_builtin_config_t *config) +void ucg_builtin_log_algo() { - ucs_status_t status; - - /* Currently, only algorithm 1 supports non-contiguous datatype for allreduce */ - if (ucg_is_noncontig_allreduce(group_params, coll_params)) { - ucg_builtin_allreduce_algo_switch(UCG_ALGORITHM_ALLREDUCE_RECURSIVE, &ucg_algo); - ucs_info("allreduce non-contiguous datatype, select algo%d:recursive", UCG_ALGORITHM_ALLREDUCE_RECURSIVE); - return UCS_OK; - } - - /* Currently, only algorithm 1 supports non-commutative op for allreduce */ - if (ucg_is_noncommutative_allreduce(group_params, coll_params)) { - ucg_builtin_allreduce_algo_switch(UCG_ALGORITHM_ALLREDUCE_RECURSIVE, &ucg_algo); - ucs_info("non-commutative allreduce, select algo%d:recursive", UCG_ALGORITHM_ALLREDUCE_RECURSIVE); - return UCS_OK; - } - - /* Special Case 1 : bind-to none */ - if (!(algo->feature_flag & UCG_ALGORITHM_SUPPORT_BIND_TO_NONE) && (group_params->is_bind_to_none)) { - ucg_builtin_plan_decision_in_unsupport_case(msg_size, group_params, ops_type_choose, coll_params); - ucs_info("Current algorithm don't support bind-to none case, switch to default algorithm"); - } - - /* Special Case 2 : unbalance ppn */ - unsigned is_ppn_unbalance = 0; - status = ucg_builtin_check_ppn(group_params, &is_ppn_unbalance); - if (status != UCS_OK) { - return status; - } - - if (is_ppn_unbalance && (!(algo->feature_flag & UCG_ALGORITHM_SUPPORT_UNBALANCE_PPN))) { - ucg_builtin_plan_decision_in_unsupport_case(msg_size, group_params, ops_type_choose, coll_params); - ucs_info("Current algorithm don't support ppn unbalance case, switch to default algorithm"); - } - - /* Special Case 3 : discontinuous rank */ - unsigned is_discontinuous_rank = 0; - enum ucg_group_member_distance domain_distance = UCG_GROUP_MEMBER_DISTANCE_HOST; - status = choose_distance_from_topo_aware_level(&domain_distance); - if (status != UCS_OK) { - return status; - } - status = ucg_builtin_check_continuous_number(group_params, domain_distance, &is_discontinuous_rank); - if (status != UCS_OK) { - return status; - } - - if (is_discontinuous_rank && (!(algo->feature_flag & UCG_ALGORITHM_SUPPORT_DISCONTINOUS_RANK))) { - ucg_builtin_plan_decision_in_unsupport_case(msg_size, group_params, ops_type_choose, coll_params); - ucs_info("Current algorithm demand rank number is continous. 
Switch default algorithm whose performance may be not the best"); - } - - if (ops_choose == OPS_ALLREDUCE) { - /* Special Case 4 : non-commutative operation */ - ucg_builtin_non_commutative_operation(group_params, coll_params, algo, msg_size); - - /* Special Case 5 : large datatype */ - if (coll_params->send.dt_len > config->large_datatype_threshold && - !(algo->feature_flag & UCG_ALGORITHM_SUPPORT_LARGE_DATATYPE)) { - ucg_builtin_plan_decision_in_noncommutative_largedata_case(msg_size, NULL); - ucs_info("Current algorithm does not support large datatype, and switch to Recursive doubling or Ring Algorithm which may have unexpected performance"); - } - } - - /* The allreduce result is wrong when phase->segmented=1 and using ring algorithm, must avoid it */ - if (ucg_algo.ring && ucg_is_segmented_allreduce(coll_params)) { - ucg_builtin_allreduce_algo_switch(UCG_ALGORITHM_ALLREDUCE_RECURSIVE, &ucg_algo); - ucs_info("ring algorithm does not support segmented phase, select recursive algorithm"); - return UCS_OK; - } - - return status; + ucs_info("bmtree %u kmtree %u kmtree_intra %u recur %u bruck %u topo %u " + "level %u ring %u pipe %u nap %u binary_block %u ladd %u plummer %u ",ucg_algo.bmtree, ucg_algo.kmtree, + ucg_algo.kmtree_intra, ucg_algo.recursive, ucg_algo.bruck, ucg_algo.topo, (unsigned)ucg_algo.topo_level, + ucg_algo.ring, ucg_algo.pipeline, ucg_algo.NAP, ucg_algo.binary_block, ucg_algo.ladd, ucg_algo.plummer); } -void ucg_builtin_log_algo() +static void ucg_builtin_plan_create(ucg_builtin_plan_t *plan, + enum ucg_builtin_plan_topology_type plan_topo_type, + ucg_collective_params_t *coll_params, + ucg_builtin_group_ctx_t *builtin_ctx) { - ucs_info("bmtree %u kmtree %u kmtree_intra %u recur %u bruck %u topo %u level %u ring %u pipe %u", - ucg_algo.bmtree, ucg_algo.kmtree, ucg_algo.kmtree_intra, ucg_algo.recursive, ucg_algo.bruck, - ucg_algo.topo, (unsigned)ucg_algo.topo_level, ucg_algo.ring, ucg_algo.pipeline); + plan->convert_f = builtin_ctx->group_params->mpi_dt_convert; + plan->dtspan_f = builtin_ctx->group_params->mpi_datatype_span; + plan->resend = &builtin_ctx->send_head; + plan->slots = &builtin_ctx->slots[0]; + plan->am_id = builtin_ctx->am_id; } -ucs_status_t ucg_builtin_algorithm_decision(const ucg_collective_type_t *coll_type, - const size_t msg_size, - const ucg_group_params_t *group_params, - const ucg_collective_params_t *coll_params, - ucg_plan_component_t *plan_component) +STATIC_GTEST void ucg_builtin_set_algo(coll_type_t ctype, int algo_id, ucg_builtin_algo_t *algo) { - ucg_collective_type_t *coll = (ucg_collective_type_t *)coll_type; - enum ucg_collective_modifiers ops_type_choose = coll->modifiers; - ucg_builtin_config_t *config = (ucg_builtin_config_t *)plan_component->plan_config; - enum ucg_builtin_bcast_algorithm bcast_algo_decision = (enum ucg_builtin_bcast_algorithm)config->bcast_algorithm; - enum ucg_builtin_allreduce_algorithm allreduce_algo_decision = (enum ucg_builtin_allreduce_algorithm) - config->allreduce_algorithm; - enum ucg_builtin_barrier_algorithm barrier_algo_decision = (enum ucg_builtin_barrier_algorithm) - config->barrier_algorithm; - - ucs_status_t status; - - /* default algorithm choosen: - Bcast : 3 - Allreduce : small message : 2 - big message : 4 - Barrier : 2 - */ - enum choose_ops_mask ops_choose = ucg_builtin_plan_choose_ops(plan_component, ops_type_choose); - ucs_info("choose ops: %d, bcast mode: %u, allreduce mode: %u, barrier mode: %u", - ops_choose, bcast_algo_decision, allreduce_algo_decision, barrier_algo_decision); - - /* 
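The old modifier-mask decision chain removed here is replaced by a direct per-collective dispatch, ucg_builtin_set_algo(), defined just below. A hedged usage sketch; the pairing of coll_type and a numeric algorithm id is an assumption based on ucg_builtin_plan() later in this patch:

    /* Route a configured algorithm id through the per-collective dispatcher.
     * ucg_builtin_set_algo() and ucg_algo come from this patch. */
    static void demo_choose(ucg_collective_params_t *coll_params, int algo_id)
    {
        ucg_builtin_set_algo(coll_params->coll_type, algo_id, &ucg_algo);
    }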
unblanced ppn or not */ - unsigned is_ppn_unbalance = 0; - status = ucg_builtin_check_ppn(group_params, &is_ppn_unbalance); - if (status != UCS_OK) { - ucs_error("Error in check ppn"); - return status; - } - ucs_info("ppn unbalance: %u", is_ppn_unbalance); - - switch (ops_choose) { - case OPS_AUTO_DECISION: - /* Auto algorithm decision: according to is_ppn_unbalance/data/msg_size etc */ - plan_decision_fixed(msg_size, group_params, ops_type_choose, coll_params, config->large_datatype_threshold, is_ppn_unbalance, - &bcast_algo_decision, &allreduce_algo_decision, &barrier_algo_decision); + ucg_builtin_init_algo(algo); + switch (ctype) { + case COLL_TYPE_BARRIER: + ucg_builtin_barrier_algo_switch(algo_id, algo); break; - case OPS_BCAST: - ucg_builtin_bcast_algo_switch(bcast_algo_decision, &ucg_algo); - allreduce_algo_decision = UCG_ALGORITHM_ALLREDUCE_AUTO_DECISION; - barrier_algo_decision = UCG_ALGORITHM_BARRIER_AUTO_DECISION; + case COLL_TYPE_BCAST: + ucg_builtin_bcast_algo_switch(algo_id, algo); break; - case OPS_ALLREDUCE: - ucg_builtin_allreduce_algo_switch(allreduce_algo_decision, &ucg_algo); - bcast_algo_decision = UCG_ALGORITHM_BCAST_AUTO_DECISION; - barrier_algo_decision = UCG_ALGORITHM_BARRIER_AUTO_DECISION; + case COLL_TYPE_ALLREDUCE: + ucg_builtin_allreduce_algo_switch(algo_id, algo); break; - case OPS_BARRIER: - ucg_builtin_barrier_algo_switch(barrier_algo_decision, &ucg_algo); - bcast_algo_decision = UCG_ALGORITHM_BCAST_AUTO_DECISION; - allreduce_algo_decision = UCG_ALGORITHM_ALLREDUCE_AUTO_DECISION; + case COLL_TYPE_ALLTOALLV: + ucg_builtin_alltoallv_algo_switch(algo_id, algo); break; default: + ucs_error("invalid collective type %d", ctype); break; } - /* One API to deal with all special case */ - status = ucg_builtin_change_unsupport_algo(&ucg_algo, group_params, msg_size, coll_params, ops_type_choose, ops_choose, config); ucg_builtin_log_algo(); - - return UCS_OK; } -static ucs_status_t ucg_builtin_plan(ucg_plan_component_t *plan_component, - const ucg_collective_type_t *coll_type, - const size_t msg_size, - ucg_group_h group, +static ucs_status_t ucg_builtin_plan(ucg_group_h group, int algo_id, ucg_collective_params_t *coll_params, ucg_plan_t **plan_p) { - ucs_status_t status; - ucg_builtin_plan_t *plan = NULL; ucg_builtin_group_ctx_t *builtin_ctx = UCG_GROUP_TO_COMPONENT_CTX(ucg_builtin_component, group); - - ucg_builtin_init_algo(&ucg_algo); - - status = ucg_builtin_algorithm_decision(coll_type, msg_size, builtin_ctx->group_params, coll_params, plan_component); - - if (status != UCS_OK) { - return status; - } - - enum ucg_builtin_plan_topology_type plan_topo_type = ucg_builtin_choose_type(coll_type->modifiers); + ucg_plan_component_t *plan_component = &ucg_builtin_component; + ucg_collective_type_t *coll_type = &coll_params->type; + enum ucg_builtin_plan_topology_type plan_topo_type; + ucg_builtin_plan_t *plan = NULL; + ucs_status_t status; + ucg_builtin_set_algo(coll_params->coll_type, algo_id, &ucg_algo); + plan_topo_type = ucg_builtin_choose_type(coll_type->modifiers); ucs_debug("plan topo type: %d", plan_topo_type); @@ -1276,6 +958,26 @@ static ucs_status_t ucg_builtin_plan(ucg_plan_component_t *plan_component, builtin_ctx->group_params, coll_type, &plan); break; + case UCG_PLAN_BINARY_BLOCK: + status = ucg_builtin_binary_block_create(builtin_ctx, plan_topo_type, plan_component->plan_config, + builtin_ctx->group_params, coll_type, &plan); + break; +#if ENABLE_UCG_HICOLL + case UCG_PLAN_NAP: + status = ucg_builtin_NAP_create(builtin_ctx, plan_topo_type, 
plan_component->plan_config, + builtin_ctx->group_params, coll_type, &plan); + break; + + case UCG_PLAN_ALLTOALLV_LADD: + status = ucg_builtin_throttled_scatter_create(builtin_ctx, plan_topo_type, plan_component->plan_config, + builtin_ctx->group_params, coll_params, &plan); + break; + + case UCG_PLAN_ALLTOALLV_PLUMMER: + status = ucg_builtin_Plummer_create(builtin_ctx, plan_topo_type, plan_component->plan_config, + builtin_ctx->group_params, coll_type, coll_params, &plan); + break; +#endif default: status = ucg_builtin_binomial_tree_create(builtin_ctx, plan_topo_type, plan_component->plan_config, builtin_ctx->group_params, coll_type, &plan); @@ -1299,19 +1001,13 @@ static ucs_status_t ucg_builtin_plan(ucg_plan_component_t *plan_component, } ucs_list_add_head(&builtin_ctx->plan_head, &plan->list); - plan->super.is_noncontig_allreduce = (plan_topo_type != UCG_PLAN_RECURSIVE) ? 0 : - ucg_is_noncontig_allreduce(builtin_ctx->group_params, coll_params); - plan->super.is_ring_plan_topo_type = (plan_topo_type == UCG_PLAN_RING); - plan->convert_f = builtin_ctx->group_params->mpi_dt_convert; - plan->dtspan_f = builtin_ctx->group_params->mpi_datatype_span; - plan->resend = &builtin_ctx->send_head; - plan->slots = &builtin_ctx->slots[0]; - plan->am_id = builtin_ctx->am_id; + ucg_builtin_plan_create(plan, plan_topo_type, coll_params, builtin_ctx); + plan->ucg_algo = ucg_algo; *plan_p = (ucg_plan_t*)plan; return UCS_OK; } -static void ucg_builtin_print(ucg_plan_t *plan, const ucg_collective_params_t *coll_params) +STATIC_GTEST void ucg_builtin_print(ucg_plan_t *plan, const ucg_collective_params_t *coll_params) { unsigned major_version, minor_version, release_number; ucp_get_version(&major_version, &minor_version, &release_number); @@ -1320,20 +1016,14 @@ static void ucg_builtin_print(ucg_plan_t *plan, const ucg_collective_params_t *c printf("plan name: %s\n", plan->planner->name); } -void ucg_builtin_set_phase_thresh_max_short(ucg_builtin_group_ctx_t *ctx, +STATIC_GTEST void ucg_builtin_set_phase_thresh_max_short(ucg_builtin_group_ctx_t *ctx, ucg_builtin_plan_phase_t *phase) { - if (phase->ep_attr->cap.am.max_short < sizeof(ucg_builtin_header_t)) { - phase->send_thresh.max_short_one = 0; - } else { - phase->send_thresh.max_short_one = phase->ep_attr->cap.am.max_short - sizeof(ucg_builtin_header_t); - } - if (phase->send_thresh.max_short_one == 0) { - phase->send_thresh.max_short_max = 0; - } else { - phase->send_thresh.max_short_max = ctx->config->short_max_tx; - } + phase->send_thresh.max_short_one = (phase->ep_attr->cap.am.max_short < sizeof(ucg_builtin_header_t)) ? + 0 : (phase->ep_attr->cap.am.max_short - sizeof(ucg_builtin_header_t)); + + phase->send_thresh.max_short_max = (phase->send_thresh.max_short_one == 0) ? 
0 : ctx->config->short_max_tx; if (phase->send_thresh.max_short_one > phase->send_thresh.max_short_max) { phase->send_thresh.max_short_one = phase->send_thresh.max_short_max; @@ -1370,19 +1060,14 @@ void ucg_builtin_set_phase_thresholds(ucg_builtin_group_ctx_t *ctx, phase->send_thresh.initialized = 1; if (!phase->recv_thresh.initialized) { - phase->recv_thresh.max_short_one = phase->send_thresh.max_short_one; - phase->recv_thresh.max_short_max = phase->send_thresh.max_short_max; - phase->recv_thresh.max_bcopy_one = phase->send_thresh.max_bcopy_one; - phase->recv_thresh.max_bcopy_max = phase->send_thresh.max_bcopy_max; - phase->recv_thresh.max_zcopy_one = phase->send_thresh.max_zcopy_one; - phase->recv_thresh.md_attr_cap_max_reg = phase->send_thresh.md_attr_cap_max_reg; + phase->recv_thresh = phase->send_thresh; phase->recv_thresh.initialized = 1; } } void ucg_builtin_log_phase_info(ucg_builtin_plan_phase_t *phase, ucg_group_member_index_t idx) { - ucs_debug("phase create: %p, dest %" PRIu64 ", short_one %zu, short_max %zu, bcopy_one %zu, bcopy_max %zu, zcopy_one %zu, max_reg %zu", + ucs_info("phase create: %p, dest %" PRIu64 ", short_one %zu, short_max %zu, bcopy_one %zu, bcopy_max %zu, zcopy_one %zu, max_reg %zu", phase, idx, phase->send_thresh.max_short_one, phase->send_thresh.max_short_max, phase->send_thresh.max_bcopy_one, phase->send_thresh.max_bcopy_max, phase->send_thresh.max_zcopy_one, phase->md_attr->cap.max_reg); } @@ -1392,13 +1077,17 @@ ucs_status_t ucg_builtin_connect(ucg_builtin_group_ctx_t *ctx, { uct_ep_h ep; ucp_ep_h ucp_ep; + unsigned alloc_cnt; ucs_status_t status = ucg_plan_connect(ctx->group, idx, &ep, &phase->ep_attr, &phase->md, &phase->md_attr, &ucp_ep); if (ucs_unlikely(status != UCS_OK)) { return status; } if (phase->ucp_eps == NULL) { - phase->ucp_eps = UCS_ALLOC_CHECK(sizeof(ucp_ep_h) * phase->ep_cnt, "ucp_eps"); + alloc_cnt = (phase_ep_index != UCG_BUILTIN_CONNECT_SINGLE_EP && phase_ep_index >= phase->ep_cnt) ? + (phase_ep_index + 1) : phase->ep_cnt; + phase->ucp_eps = UCS_ALLOC_CHECK(sizeof(ucp_ep_h) * alloc_cnt, "ucp_eps"); + phase->ep_thresh = UCS_ALLOC_CHECK(sizeof(ucg_builtin_tl_threshold_t) * alloc_cnt, "ucp_ep thresh"); } phase->ucp_eps[(phase_ep_index == UCG_BUILTIN_CONNECT_SINGLE_EP) ? 0 : phase_ep_index] = ucp_ep; @@ -1423,6 +1112,8 @@ ucs_status_t ucg_builtin_connect(ucg_builtin_group_ctx_t *ctx, if (phase->method != UCG_PLAN_METHOD_ALLGATHER_BRUCK && phase->method != UCG_PLAN_METHOD_ALLTOALL_BRUCK && phase->method != UCG_PLAN_METHOD_REDUCE_SCATTER_RING && + phase->method != UCG_PLAN_METHOD_INC && + phase->method != UCG_PLAN_METHOD_ALLTOALLV_LADD && phase->method != UCG_PLAN_METHOD_ALLGATHER_RING) { ucs_assert(phase_ep_index < phase->ep_cnt); } @@ -1431,11 +1122,34 @@ ucs_status_t ucg_builtin_connect(ucg_builtin_group_ctx_t *ctx, /* Set the thresholds */ ucg_builtin_set_phase_thresholds(ctx, phase); + phase->ep_thresh[(phase_ep_index != UCG_BUILTIN_CONNECT_SINGLE_EP) ? 
                                        phase_ep_index : 0] = phase->send_thresh;
     ucg_builtin_log_phase_info(phase, idx);
     return status;
 }

+ucg_group_member_index_t ucg_builtin_get_local_index(ucg_group_member_index_t global_index,
+                                                     const ucg_group_member_index_t *local_members,
+                                                     ucg_group_member_index_t member_cnt)
+{
+    ucg_group_member_index_t local_index = 0;
+
+    ucg_group_member_index_t i;
+    for (i = 0; i < member_cnt; i++) {
+        if (local_members[i] == global_index) {
+            local_index = i;
+            break;
+        }
+    }
+    return local_index;
+}
+
+/* Get the -x UCX_BUILTIN_REDUCE_CONSISTENCY config value */
+int ucg_is_allreduce_consistency(const ucg_builtin_group_ctx_t *ctx)
+{
+    return ctx->config->reduce_consistency;
+}
+
 UCG_PLAN_COMPONENT_DEFINE(ucg_builtin_component, "builtin",
                           sizeof(ucg_builtin_group_ctx_t),
                           ucg_builtin_query, ucg_builtin_create, ucg_builtin_destroy,
diff --git a/builtin/ops/builtin_cb.inl b/builtin/ops/builtin_cb.inl
index 6206302..5434156 100644
--- a/builtin/ops/builtin_cb.inl
+++ b/builtin/ops/builtin_cb.inl
@@ -1,19 +1,20 @@
 /*
- * Copyright (C) Huawei Technologies Co., Ltd. 2019. ALL RIGHTS RESERVED.
+ * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED.
 * See file LICENSE for terms.
 */

 #include "builtin_ops.h"
 #include
+#include
 #include
 #include
 #include
-
+#include
 /*
 * Below is a list of possible callback/helper functions for an incoming message.
 * Upon arrival, a message is typically copied or reduced to its collective's
- * final recieve buffer, though there are some complex collectives which are
+ * final receive buffer, though there are some complex collectives which are
 * handled otherwise (using intermediate buffers).
 */

@@ -100,15 +101,46 @@ void ucg_builtin_mpi_reduce_partial(ucg_builtin_request_t *req, size_t offset, v
         gen_dt->ops.finish(gen_state);
         data = reduce_buf - gap;
         offset = (offset / dt_len) * params->recv.dt_len;
+        ucg_builtin_mpi_reduce(params->recv.op_ext, data, req->step->recv_buffer + offset, length / dt_len,
+                               params->recv.dt_ext);
+
+        if (reduce_buf != NULL) {
+            ucs_free(reduce_buf);
+        }
+        return;
     }

-    ucs_debug("mpi_reduce_partial, data:%p, length:%lu, recv_buffer:%p, offset:%lu, dt_len:%lu",
+    /* only the tree algorithms need the reduce data to be buffered */
+    if (req->step->phase->method != UCG_PLAN_METHOD_REDUCE_TERMINAL
+        && req->step->phase->method != UCG_PLAN_METHOD_REDUCE_WAYPOINT) {
+        ucs_debug("mpi_reduce_partial, data:%p, length:%lu, recv_buffer:%p, offset:%lu, dt_len:%lu",
               data, length, req->step->recv_buffer, offset, dt_len);
-    ucg_builtin_mpi_reduce(params->recv.op_ext, data, req->step->recv_buffer + offset,
+        ucg_builtin_mpi_reduce(params->recv.op_ext, data, req->step->recv_buffer + offset,
                            length / dt_len, params->recv.dt_ext);
+        return;
+    }
+
+    /* fragmented messages do not use the reduce data buffer */
+    if (req->step->reduce_buff == NULL) {
+        ucg_builtin_mpi_reduce(params->recv.op_ext, data, req->step->recv_buffer + offset, length / dt_len, params->recv.dt_ext);
+        return;
+    }

-    if (reduce_buf != NULL) {
-        ucs_free(reduce_buf);
+    if (offset > req->step->rbuf_count) {
+        ucs_error("Illegal offset:%lu, method:%u", offset, (int)req->step->phase->method);
+        return;
+    }
+
+    int tLen = offset * length;
+    memcpy((char *)req->step->reduce_buff + tLen, data, length);
+    if (req->pending > 1) {
+        return;
+    }
+    uint32_t loop = 0;
+    while (loop < req->step->rbuf_count) {
+        ucg_builtin_mpi_reduce(params->recv.op_ext, (char *)req->step->reduce_buff + loop * length,
+                               req->step->recv_buffer, length / dt_len, params->recv.dt_ext);
+        loop++;
     }
 }
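The hunk above changes ucg_builtin_mpi_reduce_partial() so that, for the tree reduction methods, each incoming block is staged in step->reduce_buff and all blocks are folded into the receive buffer only once the last one arrives. A self-contained sketch of that buffer-then-reduce pattern; demo_reduce() stands in for ucg_builtin_mpi_reduce() and all demo_* names are illustrative:

    #include <string.h>

    /* Stand-in for the MPI reduction: elementwise sum. */
    static void demo_reduce(double *dst, const double *src, size_t count)
    {
        for (size_t i = 0; i < count; i++) {
            dst[i] += src[i];
        }
    }

    /* Called once per arriving block; 'pending' counts blocks still missing.
     * 'result' already holds the local contribution, as recv_buffer does. */
    static void demo_on_block(double *stage, double *result, size_t block_len,
                              unsigned block_idx, unsigned n_blocks,
                              const double *data, unsigned *pending)
    {
        memcpy(stage + block_idx * block_len, data, block_len * sizeof(*data));
        if (--(*pending) > 0) {
            return;                 /* wait for the remaining blocks */
        }
        for (unsigned b = 0; b < n_blocks; b++) {
            demo_reduce(result, stage + b * block_len, block_len);
        }
    }

Deferring the reduction this way keeps the combining order deterministic across peers, at the cost of one staging buffer per step.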
@@ -165,6 +197,7 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_comp_step_cb(ucg_builtin_req
     ucg_builtin_op_step_t *next_step = ++req->step;
     req->pending = next_step->fragments_recv * next_step->phase->ep_cnt;
     req->recv_comp = 0;
+    req->is_send_cb_called = 0;
     ucs_container_of(req, ucg_builtin_comp_slot_t, req)->step_idx = next_step->am_header.step_idx;
     ucs_debug("slot next step: %u", next_step->am_header.step_idx);

@@ -213,6 +246,30 @@ static UCS_F_ALWAYS_INLINE int ucg_builtin_comp_send_check_frag_cb(ucg_builtin_r
     return step->iter_offset != UCG_BUILTIN_OFFSET_PIPELINE_READY;
 }

+#if ENABLE_UCG_HICOLL
+static int ucg_builtin_inc_comp_recv_one_cb(ucg_builtin_request_t *req,
+                                            uint64_t offset, void *data, size_t length)
+{
+    int ret = 1;
+    ucs_status_t status = inc_comp_recv_one(req, offset, data, length);
+    if (status != UCS_OK) {
+        ret = 0;
+    }
+    (void)ucg_builtin_comp_step_cb(req, NULL);
+    return ret;
+}
+
+static int ucg_builtin_inc_comp_recv_many_cb(ucg_builtin_request_t *req,
+                                             uint64_t offset, void *data, size_t length)
+{
+    ucs_status_t status = inc_comp_recv_many(req, offset, data, length);
+    if (status != UCS_OK) {
+        return 0;
+    }
+    return ucg_builtin_comp_step_check_cb(req);
+}
+#endif
+
 static UCS_F_ALWAYS_INLINE void ucg_builtin_comp_zcopy_check_cb(ucg_builtin_request_t *req)
 {
     uint32_t num_store = req->step->zcopy.num_store;
@@ -235,6 +292,23 @@ static int ucg_builtin_comp_recv_one_cb(ucg_builtin_request_t *req,
     return 1;
 }

+/* recv_cb will parse the rank and the "actual" data */
+static int ucg_builtin_comp_recv_var_one_cb(ucg_builtin_request_t *req,
+                                            uint64_t offset, void *data, size_t length)
+{
+    /* the first sizeof(ucg_group_member_index_t) bytes of the payload carry the source rank */
+    ucg_group_member_index_t src_rank = *((ucg_group_member_index_t *)data);
+
+    size_t recv_dt_len = req->op->super.params.recv.dt_len;
+    ucg_builtin_coll_params_t *recv_coll_params = req->step->recv_coll_params;
+
+    int64_t recv_buffer_displ = recv_coll_params->displs[src_rank] * recv_dt_len;
+    int8_t *recv_buffer = recv_coll_params->init_buf + recv_buffer_displ + offset;
+    memcpy(recv_buffer, (int8_t *)data + sizeof(src_rank), length - sizeof(src_rank));
+    (void)ucg_builtin_comp_step_cb(req, NULL);
+    return 1;
+}
+
 static int ucg_builtin_comp_recv_noncontig_one_cb(ucg_builtin_request_t *req,
                                                   uint64_t offset, void *data, size_t length)
 {
@@ -278,6 +352,22 @@ static int ucg_builtin_comp_recv_noncontig_many_cb(ucg_builtin_request_t *req,
     return ucg_builtin_comp_step_check_cb(req);
 }

+/* recv_cb will parse the rank and the "actual" data */
+static int ucg_builtin_comp_recv_var_many_cb(ucg_builtin_request_t *req,
+                                             uint64_t offset, void *data, size_t length)
+{
+    /* the first sizeof(ucg_group_member_index_t) bytes of the payload carry the source rank */
+    ucg_group_member_index_t src_rank = *((ucg_group_member_index_t *)data);
+
+    size_t recv_dt_len = req->op->super.params.recv.dt_len;
+    ucg_builtin_coll_params_t *recv_coll_params = req->step->recv_coll_params;
+
+    int64_t recv_buffer_displ = recv_coll_params->displs[src_rank] * recv_dt_len;
+    int8_t *recv_buffer = recv_coll_params->init_buf + recv_buffer_displ + offset;
+    memcpy(recv_buffer, (int8_t *)data + sizeof(src_rank), length - sizeof(src_rank));
+    return ucg_builtin_comp_step_check_cb(req);
+}
+
 static int ucg_builtin_comp_recv_many_then_send_pipe_cb(ucg_builtin_request_t *req,
                                                         uint64_t offset, void *data, size_t length)
 {
@@ -444,6 +534,12 @@ static int ucg_builtin_comp_last_barrier_step_many_cb(ucg_builtin_request_t *req
     return 0;
 }
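The *_recv_var_* callbacks above assume a small header on every variable-length fragment: the sender's member index comes first, then the payload, which is scattered to that rank's displacement in the receive buffer. A self-contained sketch of the unpack step; demo_* names are illustrative:

    #include <stdint.h>
    #include <string.h>

    typedef uint64_t demo_rank_t;   /* matches a 64-bit member index */

    /* payload = [ demo_rank_t src_rank | data... ]; 'offset' is the
     * fragment offset within that rank's region. */
    static void demo_unpack(const void *payload, size_t length, size_t offset,
                            int8_t *recv_base, const int *displs, size_t dt_len)
    {
        demo_rank_t src_rank;
        memcpy(&src_rank, payload, sizeof(src_rank));        /* leading rank */
        memcpy(recv_base + (size_t)displs[src_rank] * dt_len + offset,
               (const int8_t *)payload + sizeof(src_rank),
               length - sizeof(src_rank));                   /* actual data */
    }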
+/* For variable-length buffers, the value is calculated based on the pending value. */
+void ucg_builtin_step_var_callbacks(unsigned pending, ucg_builtin_comp_recv_cb_t *recv_cb)
+{
+    *recv_cb = (pending == 1 ? ucg_builtin_comp_recv_var_one_cb : ucg_builtin_comp_recv_var_many_cb);
+}
+
 static ucs_status_t ucg_builtin_step_select_callbacks(ucg_builtin_plan_phase_t *phase, int is_contig_recv,
                                                       ucg_builtin_comp_recv_cb_t *recv_cb, int nonzero_length, int flags)
 {
@@ -453,8 +549,10 @@
     int is_last_step = flags & UCG_BUILTIN_OP_STEP_FLAG_LAST_STEP;
     int is_zcopy = flags & UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_ZCOPY;
     int is_segmented = phase->segmented;
-    unsigned is_single_msg = ((is_single_ep) && (!is_fragmented) && (!is_segmented));
+    int is_partial = phase->ex_attr.is_partial;
+    unsigned is_single_msg = (is_single_ep && (!is_fragmented) && (!is_segmented) && (!is_partial));
     int is_waypoint_fanout = 0; /* special flag for waypoint bcast/scatter, only receive once */
+    const int cnt_num = 2;

     ucs_debug("step select callback, method:%d, flags:0x%x, is_segmented:%d, nonzero_length:%d, recv_contig:%d",
               phase->method, flags, is_segmented, nonzero_length, is_contig_recv);
@@ -482,8 +580,8 @@
         }
         break;

-    case UCG_PLAN_METHOD_RECV_TERMINAL:
     case UCG_PLAN_METHOD_SEND_TERMINAL:
+    case UCG_PLAN_METHOD_RECV_TERMINAL:
         if (!is_contig_recv) {
             *recv_cb = is_single_msg ? ucg_builtin_comp_recv_noncontig_one_cb :
                                        ucg_builtin_comp_recv_noncontig_many_cb;
@@ -495,11 +593,12 @@
         break;

     case UCG_PLAN_METHOD_REDUCE_WAYPOINT:
-        is_single_msg |= ((phase->ep_cnt == 2) && (!is_fragmented));
+        is_single_msg |= ((phase->ep_cnt == cnt_num) && (!is_fragmented));
         if (is_single_msg) {
             *recv_cb = nonzero_length ? ucg_builtin_comp_reduce_one_then_send_cb :
                                         ucg_builtin_comp_wait_one_then_send_cb;
-        } if (is_segmented && nonzero_length){
+        }
+        if (is_segmented && nonzero_length) {
             *recv_cb = ucg_builtin_comp_reduce_full_then_send_cb;
         } else {
             *recv_cb = nonzero_length ? (is_pipelined ? ucg_builtin_comp_reduce_many_then_send_pipe_cb :
@@ -510,6 +609,7 @@

     case UCG_PLAN_METHOD_REDUCE_TERMINAL:
     case UCG_PLAN_METHOD_REDUCE_RECURSIVE:
+    case UCG_PLAN_METHOD_REDUCE_SCATTER_RECURSIVE:
         if (is_single_msg && !is_zcopy) {
             *recv_cb = nonzero_length ? ucg_builtin_comp_reduce_one_cb :
                                         ucg_builtin_comp_wait_one_cb;
@@ -520,7 +620,17 @@
                                         ucg_builtin_comp_wait_many_cb;
         }
         break;
-
+#if ENABLE_UCG_HICOLL
+    case UCG_PLAN_METHOD_INC:
+        if (is_single_msg && !is_zcopy) {
+            *recv_cb = nonzero_length ? ucg_builtin_inc_comp_recv_one_cb :
+                                        ucg_builtin_comp_wait_one_cb;
+        } else {
+            *recv_cb = nonzero_length ? ucg_builtin_inc_comp_recv_many_cb :
+                                        ucg_builtin_comp_wait_many_cb;
+        }
+        break;
+#endif
     case UCG_PLAN_METHOD_ALLGATHER_BRUCK:
         *recv_cb = nonzero_length ? ucg_builtin_comp_recv_many_cb :
                                     ucg_builtin_comp_wait_many_cb;
@@ -544,6 +654,16 @@
         *recv_cb = ucg_builtin_comp_recv_many_cb;
         break;

+    case UCG_PLAN_METHOD_EXCHANGE:
+        if (is_single_msg && !is_zcopy) {
+            *recv_cb = nonzero_length ? ucg_builtin_comp_recv_one_cb :
+                                        ucg_builtin_comp_wait_one_cb;
+        } else {
+            *recv_cb = nonzero_length ? ucg_builtin_comp_recv_many_cb :
+                                        ucg_builtin_comp_wait_many_cb;
+        }
+        break;
+
     default:
         ucs_error("Invalid method for a collective operation.");
         return UCS_ERR_INVALID_PARAM;
@@ -562,7 +682,15 @@
 * Below is a list of possible callback functions for pretreatment before sending.
 */

-/* send_cb for alltoall to sned discrete elements */
+/* send_cb for INC */
+void ucg_builtin_send_inc(ucg_builtin_request_t *req)
+{
+#if ENABLE_UCG_HICOLL
+    inc_send_cb(req);
+#endif
+}
+
+/* send_cb for alltoall to send discrete elements */
 static void ucg_builtin_send_alltoall(ucg_builtin_request_t *req)
 {
     unsigned i, k;
@@ -581,6 +709,375 @@
     }
 }

+ucs_status_t ucg_builtin_plummer_check_data_size(size_t dtype_size, int count)
+{
+    /* Offsets are signed, so the maximum supported memory size is 2 GBytes. */
+    static const uint64_t max_size = 2147483647;
+    uint64_t total_size = dtype_size * count;
+    if (total_size > max_size) {
+        ucs_error("The buffer limit supported by the alltoallv plummer algorithm is exceeded.");
+        return UCS_ERR_OUT_OF_RANGE;
+    }
+    return UCS_OK;
+}
+
+ucs_status_t ucg_builtin_plummer_check_overflow(int left, int right)
+{
+    if (right < left) {
+        ucs_error("The buffer limit supported by the alltoallv plummer algorithm is exceeded.");
+        return UCS_ERR_OUT_OF_RANGE;
+    }
+    return UCS_OK;
+}
+
+#define PLUMMER_CHECK_DATA_SIZE(dtype_size, count) do { \
+    status = ucg_builtin_plummer_check_data_size(dtype_size, count); \
+    if (status != UCS_OK) { \
+        req->plummer_req_status = status; \
+        return; \
+    } \
+} while (0)
+
+#define PLUMMER_CHECK_OVERFLOW(left, right) do { \
+    status = ucg_builtin_plummer_check_overflow(left, right); \
+    if (status != UCS_OK) { \
+        req->plummer_req_status = status; \
+        return; \
+    } \
+} while (0)
+
+#define PLUMMER_REQ_STATUS_NO_MEMORY do { \
+    req->plummer_req_status = UCS_ERR_NO_MEMORY; \
+    return; \
+} while (0)
+
+void ucg_builtin_plummer_gather_send_counts_cb(ucg_builtin_request_t *req)
+{
+    ucg_builtin_op_step_t *step = req->step;
+    if (step->phase->ex_attr.is_node_leader) {
+        unsigned ppn = step->phase->ex_attr.ppn;
+        size_t buffer_size = ppn * step->buf_len_unit;
+        step->recv_buffer = (int8_t *)ucs_malloc(buffer_size, "allocate gather send counts buffers");
+        if (step->recv_buffer == NULL) {
+            PLUMMER_REQ_STATUS_NO_MEMORY;
+        }
+        unsigned local_index = step->phase->ex_attr.recv_start_block;
+        memcpy(step->recv_buffer + local_index * step->buf_len_unit, step->send_buffer, step->buf_len_unit);
+        req->op->temp_data_buffer = step->recv_buffer; /* Save for future use */
+        /* single ep remote offset = 0 */
+        if (step->phase->ep_cnt == 1) {
+            step->recv_buffer += step->buf_len_unit;
+        }
+    }
+}
+
+void ucg_builtin_plummer_gather_send_buffers_cb(ucg_builtin_request_t *req)
+{
+    ucs_status_t status = UCS_OK;
+    ucg_collective_params_t *params = &(req->op->super.params);
+    ucg_builtin_op_step_t *step = req->step;
+    ucg_builtin_plan_phase_t *phase = step->phase;
+    ucg_builtin_coll_params_t *recv_coll_params = step->recv_coll_params;
+    ucg_builtin_coll_params_t *send_coll_params = step->send_coll_params;
+    size_t dt_len = params->send.dt_len;
+    unsigned member_cnt = phase->ex_attr.member_cnt;
+    unsigned ppn = phase->ex_attr.ppn;
+
+    /* initialize step recv coll parameters */
+    if (step->phase->ex_attr.is_node_leader) {
+        int *temp_send_counts = (int *)req->op->temp_data_buffer;
+        int8_t *init_send_buf = params->send.buf ==
MPI_IN_PLACE ? (int8_t *)params->recv.buf + : (int8_t *)params->send.buf; + unsigned i, j, k; + for (i =0; i < ppn; i++) { + k = i * member_cnt; + for (j =0; j < member_cnt; j++) { + recv_coll_params->counts[i] += temp_send_counts[k++]; + } + } + + for (i =0; i < (ppn-1); i++) { + recv_coll_params->displs[i+1] = recv_coll_params->displs[i] + recv_coll_params->counts[i]; + PLUMMER_CHECK_OVERFLOW(recv_coll_params->displs[i], recv_coll_params->displs[i+1]); + } + + int total_recv_count = recv_coll_params->counts[ppn-1] + recv_coll_params->displs[ppn-1]; + PLUMMER_CHECK_DATA_SIZE(dt_len, total_recv_count); + + size_t total_recv_buffer = total_recv_count * dt_len; + req->op->temp_exchange_buffer = (int8_t *)ucs_malloc(total_recv_buffer, "allocate send buffer"); + if (req->op->temp_exchange_buffer== NULL) { + PLUMMER_REQ_STATUS_NO_MEMORY; + } + recv_coll_params->init_buf = req->op->temp_exchange_buffer; + + memcpy(recv_coll_params->init_buf, init_send_buf, + dt_len * (params->send.counts[member_cnt-1] + params->send.displs[member_cnt-1])); + + } else { + send_coll_params->init_buf = params->send.buf == MPI_IN_PLACE ? (int8_t *)params->recv.buf + : (int8_t *)params->send.buf; + + send_coll_params->counts[0] = params->send.counts[member_cnt-1] + params->send.displs[member_cnt-1]; + send_coll_params->displs[0] = 0; + /* initialize step other parameters */ + step->send_buffer = send_coll_params->init_buf; + step->buffer_length = send_coll_params->counts[0] * dt_len; + status = ucg_builtin_step_alloc_pack_rank_buffer(step, send_coll_params->counts[0] * dt_len); + if (status!= UCS_OK) { + req->plummer_req_status = status; + } + } +} + +void ucg_builtin_plummer_gather_recv_counts_cb(ucg_builtin_request_t *req) +{ + ucg_builtin_op_step_t *step = req->step; + if (step->phase->ex_attr.is_node_leader) { + unsigned ppn = step->phase->ex_attr.ppn; + size_t buffer_size = ppn * step->buf_len_unit; + step->recv_buffer = (int8_t *)ucs_malloc(buffer_size, "allocate gather send counts buffers"); + if (step->recv_buffer == NULL) { + PLUMMER_REQ_STATUS_NO_MEMORY; + } + unsigned local_index = step->phase->ex_attr.recv_start_block; + memcpy(step->recv_buffer + local_index * step->buf_len_unit, step->send_buffer, step->buf_len_unit); + req->op->temp_data_buffer1 = step->recv_buffer; /* Save for future use */ + /* single ep remote offset = 0 */ + if ( step->phase->ep_cnt == 1) { + step->recv_buffer += step->buf_len_unit; + } + } +} + +void ucg_builtin_plummer_inter_alltoallv_cb(ucg_builtin_request_t *req) +{ + ucs_status_t status = UCS_OK; + ucg_collective_params_t *params = &(req->op->super.params); + ucg_builtin_op_t *op = req->op; + ucg_builtin_op_step_t *step = req->step; + ucg_builtin_plan_phase_t *phase = step->phase; + ucg_builtin_coll_params_t *recv_coll_params = step->recv_coll_params; + ucg_builtin_coll_params_t *send_coll_params = step->send_coll_params; + size_t send_dt_len = params->send.dt_len; + unsigned node_cnt = phase->ex_attr.member_cnt; + unsigned ppn = phase->ex_attr.ppn; + + /* init send and recv counts */ + int *temp_send_counts = (int *)req->op->temp_data_buffer; + int *temp_recv_counts = (int *)req->op->temp_data_buffer1; + + unsigned i, j, k; + unsigned counter = 0; + for (i = 0; i < ppn; i++) { + for (j = 0; j < node_cnt; j++) { + for(k = 0; k < ppn; k++) { + send_coll_params->counts[j] += temp_send_counts[counter]; + recv_coll_params->counts[j] += temp_recv_counts[counter]; + counter++; + } + } + } + + /* init send and recv displs */ + for (j = 0; j < (node_cnt-1); j++) { + 
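The loop continuing below builds displs[] as the exclusive prefix sum of counts[], guarding every step with PLUMMER_CHECK_OVERFLOW. The same pattern in a self-contained form; demo_* names are illustrative, and a wrapped signed sum is detected, as in the macro, by comparing against the left operand:

    /* displs[i] = counts[0] + ... + counts[i-1]; returns -1 on overflow. */
    static int demo_build_displs(const int *counts, int *displs, unsigned n)
    {
        displs[0] = 0;
        for (unsigned i = 0; i + 1 < n; i++) {
            displs[i + 1] = displs[i] + counts[i];
            if (displs[i + 1] < displs[i]) {
                return -1;      /* buffers exceed the 2 GB cap checked above */
            }
        }
        return 0;
    }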
send_coll_params->displs[j+1] = send_coll_params->counts[j] + send_coll_params->displs[j]; + recv_coll_params->displs[j+1] = recv_coll_params->counts[j] + recv_coll_params->displs[j]; + PLUMMER_CHECK_OVERFLOW(send_coll_params->displs[j], send_coll_params->displs[j+1]); + PLUMMER_CHECK_OVERFLOW(recv_coll_params->displs[j], recv_coll_params->displs[j+1]); + } + + /* init send and recv buffer, memory redistribution */ + int *temp_send_displs = (int *)ucs_malloc(counter * sizeof(int), "allocate temp displs"); + if (temp_send_displs == NULL) { + PLUMMER_REQ_STATUS_NO_MEMORY; + } + memset(temp_send_displs, 0, counter * sizeof(int)); + + for (i = 0; i < (ppn*node_cnt*ppn-1); i++) { + temp_send_displs[i+1] = temp_send_displs[i] + temp_send_counts[i]; + } + + PLUMMER_CHECK_DATA_SIZE(send_dt_len, send_coll_params->counts[node_cnt-1]+send_coll_params->displs[node_cnt-1]); + + send_coll_params->init_buf = (int8_t *)ucs_malloc((send_coll_params->counts[node_cnt-1] + + send_coll_params->displs[node_cnt-1]) * send_dt_len, "allocate init buffer"); + if (send_coll_params->init_buf == NULL) { + ucg_builtin_free((void **)&temp_send_displs); + PLUMMER_REQ_STATUS_NO_MEMORY; + } + + int8_t *temp_init_buf = send_coll_params->init_buf; + + unsigned count, disp, idx; + for (j = 0; j < node_cnt; j++) { + for (k = 0; k < ppn; k++) { + for (i = 0; i < ppn; i++) { + idx = i * ppn * node_cnt + j * ppn + k; + count = temp_send_counts[idx] * send_dt_len; + disp = temp_send_displs[idx] * send_dt_len; + if (count > 0) { + memcpy(temp_init_buf, req->op->temp_exchange_buffer+disp, count); + temp_init_buf += count; + } + } + } + } + ucg_builtin_free((void **)&temp_send_displs); + PLUMMER_CHECK_DATA_SIZE(send_dt_len, (recv_coll_params->counts[node_cnt-1]+recv_coll_params->displs[node_cnt-1])); + + recv_coll_params->init_buf = (int8_t *)ucs_malloc((recv_coll_params->counts[node_cnt-1] + + recv_coll_params->displs[node_cnt-1]) * send_dt_len, "allocate init buffer"); + if (recv_coll_params->init_buf == NULL) { + ucg_builtin_free((void **)&send_coll_params->init_buf); + PLUMMER_REQ_STATUS_NO_MEMORY; + } + /* copy to myself */ + unsigned local_index = phase->ex_attr.packed_rank; + memcpy(recv_coll_params->init_buf+recv_coll_params->displs[local_index]*send_dt_len, + send_coll_params->init_buf+send_coll_params->displs[local_index]*send_dt_len, + send_coll_params->counts[local_index]*send_dt_len); + + /* release old buffer, use redistribute buffer */ + ucg_builtin_free((void **)&op->temp_exchange_buffer); + op->temp_exchange_buffer = send_coll_params->init_buf; + op->temp_exchange_buffer1 = recv_coll_params->init_buf; + step->send_buffer = send_coll_params->init_buf; + + unsigned send_start_block = phase->ex_attr.start_block; + unsigned send_num_blocks = phase->ex_attr.num_blocks; + unsigned member_cnt = node_cnt; + unsigned phase_send_buffer_length = 0; + + unsigned block_idx = send_start_block; + while (block_idx < (send_start_block + send_num_blocks)) { + int real_block_idx = block_idx % member_cnt; + phase_send_buffer_length += step->send_coll_params->counts[real_block_idx]; + block_idx++; + } + phase_send_buffer_length *= send_dt_len; + + status = ucg_builtin_step_alloc_pack_rank_buffer(step, phase_send_buffer_length); + if (status != UCS_OK) { + PLUMMER_REQ_STATUS_NO_MEMORY; + } +} + +void ucg_builtin_plummer_scatter_recv_buffers_cb(ucg_builtin_request_t *req) +{ + ucg_collective_params_t *params = &(req->op->super.params); + ucg_builtin_op_step_t *step = req->step; + ucg_builtin_plan_phase_t *phase = step->phase; + 
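+ /*
+ * Scatter is the last of the three Plummer stages defined above: node
+ * leaders first gather counts and payloads from their local ranks, then
+ * exchange them with the other leaders (inter-node alltoallv), and here
+ * finally redistribute the received blocks back to the local ranks.
+ */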
ucg_builtin_coll_params_t *recv_coll_params = step->recv_coll_params;
+ ucg_builtin_coll_params_t *send_coll_params = step->send_coll_params;
+ size_t send_dt_len = params->send.dt_len;
+ unsigned member_cnt = phase->ex_attr.member_cnt;
+ unsigned ppn = phase->ex_attr.ppn;
+
+ /* initialize send coll parameters */
+ if (phase->ex_attr.is_node_leader) {
+ /* temp recv counts */
+ int *temp_recv_counts = (int *)req->op->temp_data_buffer1;
+
+ /* init send counts and displs */
+ unsigned i, j, k;
+ for (i = 0; i < ppn; i++) {
+ k = i * member_cnt;
+ for (j = 0; j < member_cnt; j++) {
+ send_coll_params->counts[i] += temp_recv_counts[k++];
+ }
+ }
+ for (i = 0; i < (ppn-1); i++) {
+ send_coll_params->displs[i+1] = send_coll_params->displs[i] + send_coll_params->counts[i];
+ }
+
+ /* init send buffers, first memory redistribution */
+ int *temp_send_counts_new = (int *)ucs_malloc(member_cnt * sizeof(int), "allocate temp counts");
+ if (temp_send_counts_new == NULL) {
+ PLUMMER_REQ_STATUS_NO_MEMORY;
+ }
+ memset(temp_send_counts_new, 0, member_cnt * sizeof(int));
+
+ unsigned idx1;
+ unsigned node_cnt = member_cnt / ppn;
+ for (k = 0; k < node_cnt; k++) {
+ idx1 = k * ppn;
+ for (i = 0; i < ppn; i++) {
+ for (j = 0; j < ppn; j++) {
+ temp_send_counts_new[idx1+i] += temp_recv_counts[i*member_cnt+idx1+j];
+ }
+ }
+ }
+
+ int *temp_send_displs_new = (int *)ucs_malloc(member_cnt * sizeof(int), "allocate temp displs");
+ if (temp_send_displs_new == NULL) {
+ ucg_builtin_free((void **)&temp_send_counts_new);
+ PLUMMER_REQ_STATUS_NO_MEMORY;
+ }
+ memset(temp_send_displs_new, 0, member_cnt * sizeof(int));
+
+ for (i = 0; i < (member_cnt-1); i++) {
+ temp_send_displs_new[i+1] = temp_send_displs_new[i] + temp_send_counts_new[i];
+ }
+ send_coll_params->init_buf = (int8_t *)ucs_malloc((temp_send_displs_new[member_cnt-1] +
+ temp_send_counts_new[member_cnt-1]) * send_dt_len, "allocate init buffer");
+ if (send_coll_params->init_buf == NULL) {
+ ucg_builtin_free((void **)&temp_send_counts_new);
+ ucg_builtin_free((void **)&temp_send_displs_new);
+ PLUMMER_REQ_STATUS_NO_MEMORY;
+ }
+
+ int8_t *temp_init_buf = send_coll_params->init_buf;
+ for (j = 0; j < ppn; j++) {
+ for (k = 0; k < node_cnt; k++) {
+ idx1 = k * ppn + j;
+ unsigned count = temp_send_counts_new[idx1] * send_dt_len;
+ unsigned disp = temp_send_displs_new[idx1] * send_dt_len;
+ memcpy(temp_init_buf, req->op->temp_exchange_buffer1 + disp, count);
+ temp_init_buf += count;
+ }
+ }
+ ucg_builtin_free((void **)&temp_send_counts_new);
+ ucg_builtin_free((void **)&temp_send_displs_new);
+ ucg_builtin_free((void **)&req->op->temp_exchange_buffer1);
+
+ req->op->temp_exchange_buffer1 = send_coll_params->init_buf;
+
+ memcpy((int8_t *)params->recv.buf, send_coll_params->init_buf, send_dt_len * send_coll_params->counts[0]);
+
+ unsigned send_start_block = phase->ex_attr.start_block;
+ unsigned send_num_blocks = phase->ex_attr.num_blocks;
+ unsigned block_idx = send_start_block;
+ unsigned phase_send_buffer_length = 0;
+
+ while (block_idx < (send_start_block + send_num_blocks)) {
+ unsigned real_block_idx = block_idx % ppn;
+ phase_send_buffer_length += send_coll_params->counts[real_block_idx];
+ block_idx++;
+ }
+ phase_send_buffer_length *= send_dt_len;
+
+ ucs_status_t status = ucg_builtin_step_alloc_pack_rank_buffer(step, phase_send_buffer_length);
+ if (status != UCS_OK) {
+ req->plummer_req_status = status;
+ }
+ } else {
+ /* initialize recv coll parameters */
+ recv_coll_params->init_buf = (int8_t *)params->recv.buf;
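+ /*
+ * Non-leaders receive their final data straight into the user-visible
+ * recv buffer; only the node leader needs the staging buffers above.
+ */
+ recv_coll_params->counts[0]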
= params->recv.counts[member_cnt-1] + params->recv.displs[member_cnt-1];
+ recv_coll_params->displs[0] = 0;
+ }
+}
+
+/* send cb for reduce-waypoint */
+void ucg_builtin_send_reduce(ucg_builtin_request_t *req)
+{
+ ucg_builtin_op_step_t *step = req->step;
+ if (!(step->flags & UCG_BUILTIN_OP_STEP_FLAG_FIRST_STEP)) {
+ /* copy reduced data to allocated send buffer */
+ memcpy(step->send_buffer, req->op->super.params.recv.buf, step->buffer_length);
+ }
+}
/* * Below is a list of possible callback functions for operation initialization. */ @@ -604,6 +1101,24 @@ static void ucg_builtin_init_reduce(ucg_builtin_op_t *op) } }
+static void ucg_builtin_init_rabenseifner(ucg_builtin_op_t *op)
+{
+ ucg_builtin_op_step_t *step = &op->steps[0];
+ unsigned step_idx;
+ size_t len = op->super.params.send.count * op->super.params.send.dt_len;
+ if (op->super.params.send.buf == MPI_IN_PLACE) {
+ memcpy(step->recv_buffer, op->super.params.recv.buf, len);
+ } else {
+ if (step->recv_buffer != op->super.params.send.buf) {
+ memcpy(step->recv_buffer, op->super.params.send.buf, len);
+ }
+ }
+ /* Prevent remote_offset from being set to 0 by multiple calls */
+ for (step_idx = 0; step_idx < ((ucg_builtin_plan_t *)op->super.plan)->phs_cnt; step_idx++) {
+ (&op->steps[step_idx])->am_header.remote_offset = (&op->steps[step_idx])->remote_offset;
+ }
+}
+
static void ucg_builtin_init_ring(ucg_builtin_op_t *op) { ucg_builtin_op_step_t *step = &op->steps[0]; @@ -616,6 +1131,19 @@ static void ucg_builtin_init_ring(ucg_builtin_op_t *op) memcpy(step->recv_buffer, step->send_buffer - step->am_header.remote_offset, len); }
+void ucg_builtin_init_inc(ucg_builtin_op_t *op)
+{
+ ucg_builtin_op_step_t *step = &op->steps[0];
+ unsigned buf_size;
+ buf_size = op->super.params.send.count * op->super.params.send.dt_len;
+ if (step->recv_buffer != NULL && op->super.params.send.buf != NULL && buf_size > 0) {
+ errno_t status = memcpy_s(step->recv_buffer, buf_size, op->super.params.send.buf, buf_size);
+ if (status != EOK) {
+ op->inc_init_status = UCS_ERR_INVALID_PARAM;
+ }
+ }
+}
+
/* for allgather, add initial step for first element storage */ static void ucg_builtin_init_allgather(ucg_builtin_op_t *op) { @@ -636,7 +1164,7 @@ static void ucg_builtin_init_allgather(ucg_builtin_op_t *op) static void ucg_builtin_init_allgather_recursive(ucg_builtin_op_t *op) { ucg_builtin_op_step_t *step = &op->steps[0];
- size_t init_offset = 0;
+ size_t init_offset;
init_offset = op->super.plan->my_index * op->super.params.send.count * op->super.params.send.dt_len; memcpy(step->recv_buffer + init_offset, step->send_buffer, step->buffer_length); } @@ -657,7 +1185,17 @@ static void ucg_builtin_init_alltoall(ucg_builtin_op_t *op) } }
+/* for UCG_PLAN_METHOD_EXCHANGE, pairwise at initial step */
+static void ucg_builtin_init_pairwise(ucg_builtin_op_t *op)
+{
+ ucg_builtin_op_step_t *step = &op->steps[0];
+ const ucg_group_params_t *params = ucg_group_get_params(op->super.plan->group);
+ size_t proc_count = params->member_count;
+ if (op->super.params.send.buf != MPI_IN_PLACE) {
+ memcpy(step->recv_buffer, op->super.params.send.buf, step->buffer_length * proc_count);
+ }
+}
/* local shift for allgather at final step */ static void ucg_builtin_final_allgather(ucg_builtin_request_t *req) @@ -701,35 +1239,35 @@ static void ucg_builtin_final_alltoall(ucg_builtin_request_t *req) } static UCS_F_ALWAYS_INLINE void
-ucg_builtin_init_state(ucg_builtin_op_step_t *step, int option,
+ucg_builtin_init_dt_state(ucg_builtin_op_step_t *step, int option, ucp_dt_generic_t
*dt_gen, const ucg_collective_params_t *params) { - void *state_gen; + void *state_gen = NULL; /* send or recv count is 0 */ if (dt_gen == NULL) { return; } - ucs_debug("ucg_builtin_init_state, option:%d", option); + ucs_debug("ucg_builtin_init_dt_state, option:%d", option); switch (option) { - case 0: + case UCG_BUILTIN_OP_DT_RECV: state_gen = dt_gen->ops.start_unpack(dt_gen->context, step->recv_buffer, params->recv.count); step->non_contig.unpack_state = state_gen; break; - case 1: + case UCG_BUILTIN_OP_DT_SEND: state_gen = dt_gen->ops.start_pack(dt_gen->context, step->send_buffer, params->send.count); step->non_contig.pack_state = state_gen; break; - case 2: + case UCG_BUILTIN_OP_DT_SWAP: state_gen = dt_gen->ops.start_pack(dt_gen->context, step->recv_buffer, params->recv.count); @@ -737,13 +1275,13 @@ ucg_builtin_init_state(ucg_builtin_op_step_t *step, int option, break; default: - ucs_debug("ucg_builtin_init_state, invalid option:%d", option); + ucs_debug("ucg_builtin_init_dt_state, invalid option:%d", option); break; } } static UCS_F_ALWAYS_INLINE void -ucg_builtin_finalize_state(ucg_builtin_op_step_t *step, int option, +ucg_builtin_finalize_dt_state(ucg_builtin_op_step_t *step, int option, ucp_dt_generic_t *dt_gen) { /* send or recv count is 0 */ @@ -751,23 +1289,23 @@ ucg_builtin_finalize_state(ucg_builtin_op_step_t *step, int option, return; } - ucs_debug("ucg_builtin_finalize_state, option:%d", option); + ucs_debug("ucg_builtin_finalize_dt_state, option:%d", option); switch (option) { - case 0: + case UCG_BUILTIN_OP_DT_RECV: dt_gen->ops.finish(step->non_contig.unpack_state); break; - case 1: + case UCG_BUILTIN_OP_DT_SEND: dt_gen->ops.finish(step->non_contig.pack_state); break; - case 2: + case UCG_BUILTIN_OP_DT_SWAP: dt_gen->ops.finish(step->non_contig.pack_state_recv); break; default: - ucs_debug("ucg_builtin_finalize_state, invalid option:%d", option); + ucs_debug("ucg_builtin_finalize_dt_state, invalid option:%d", option); break; } } @@ -776,7 +1314,7 @@ static void ucg_builtin_init_pack(ucg_builtin_op_t *op) { ucg_builtin_op_step_t *step = &op->steps[0]; do { - ucg_builtin_init_state(step, 1, op->send_dt, &op->super.params); + ucg_builtin_init_dt_state(step, UCG_BUILTIN_OP_DT_SEND, op->send_dt, &op->super.params); } while (!((step++)->flags & UCG_BUILTIN_OP_STEP_FLAG_LAST_STEP)); } @@ -784,7 +1322,7 @@ static void ucg_builtin_init_unpack(ucg_builtin_op_t *op) { ucg_builtin_op_step_t *step = &op->steps[0]; do { - ucg_builtin_init_state(step, 0, op->recv_dt, &op->super.params); + ucg_builtin_init_dt_state(step, UCG_BUILTIN_OP_DT_RECV, op->recv_dt, &op->super.params); } while (!((step++)->flags & UCG_BUILTIN_OP_STEP_FLAG_LAST_STEP)); } @@ -792,10 +1330,10 @@ static void ucg_builtin_init_pack_and_unpack(ucg_builtin_op_t *op) { ucg_builtin_op_step_t *step = &op->steps[0]; do { - ucg_builtin_init_state(step, 1, op->send_dt, &op->super.params); - ucg_builtin_init_state(step, 0, op->recv_dt, &op->super.params); + ucg_builtin_init_dt_state(step, UCG_BUILTIN_OP_DT_SEND, op->send_dt, &op->super.params); + ucg_builtin_init_dt_state(step, UCG_BUILTIN_OP_DT_RECV, op->recv_dt, &op->super.params); if (step->phase->is_swap) { - ucg_builtin_init_state(step, 2, op->recv_dt, &op->super.params); + ucg_builtin_init_dt_state(step, UCG_BUILTIN_OP_DT_SWAP, op->recv_dt, &op->super.params); } } while (!((step++)->flags & UCG_BUILTIN_OP_STEP_FLAG_LAST_STEP)); } @@ -817,7 +1355,7 @@ static void ucg_builtin_finalize_pack(ucg_builtin_request_t *req) ucg_builtin_op_t *op = req->op; ucg_builtin_op_step_t 
*step = &op->steps[0]; do { - ucg_builtin_finalize_state(step, 1, op->send_dt); + ucg_builtin_finalize_dt_state(step, UCG_BUILTIN_OP_DT_SEND, op->send_dt); } while (!((step++)->flags & UCG_BUILTIN_OP_STEP_FLAG_LAST_STEP)); } @@ -826,7 +1364,7 @@ static void ucg_builtin_finalize_unpack(ucg_builtin_request_t *req) ucg_builtin_op_t *op = req->op; ucg_builtin_op_step_t *step = &op->steps[0]; do { - ucg_builtin_finalize_state(step, 0, op->recv_dt); + ucg_builtin_finalize_dt_state(step, UCG_BUILTIN_OP_DT_RECV, op->recv_dt); } while (!((step++)->flags & UCG_BUILTIN_OP_STEP_FLAG_LAST_STEP)); } @@ -835,23 +1373,154 @@ static void ucg_builtin_finalize_pack_and_unpack(ucg_builtin_request_t *req) ucg_builtin_op_t *op = req->op; ucg_builtin_op_step_t *step = &op->steps[0]; do { - ucg_builtin_finalize_state(step, 1, op->send_dt); - ucg_builtin_finalize_state(step, 0, op->recv_dt); + ucg_builtin_finalize_dt_state(step, UCG_BUILTIN_OP_DT_SEND, op->send_dt); + ucg_builtin_finalize_dt_state(step, UCG_BUILTIN_OP_DT_RECV, op->recv_dt); if (step->phase->is_swap) { - ucg_builtin_finalize_state(step, 2, op->recv_dt); + ucg_builtin_finalize_dt_state(step, UCG_BUILTIN_OP_DT_SWAP, op->recv_dt); } } while (!((step++)->flags & UCG_BUILTIN_OP_STEP_FLAG_LAST_STEP)); } +/* alltoallv throttled scatter algorithm at initial step */ +void ucg_builtin_init_throttled_scatter(ucg_builtin_op_t *op) +{ + ucg_collective_params_t *params = &(op->super.params); + if (params->send.buf != MPI_IN_PLACE) { + size_t my_index = op->super.plan->my_index; + int send_count = params->send.counts[my_index]; + int send_displ = params->send.displs[my_index]; + int recv_displ = params->recv.displs[my_index]; + + if (send_count > 0) { + uint64_t buffer_len = send_count * params->send.dt_len; + uint64_t send_buffer_displ = send_displ * params->send.dt_len; + uint64_t recv_buffer_displ = recv_displ * params->recv.dt_len; + + memcpy(((int8_t *)params->recv.buf) + recv_buffer_displ, + ((int8_t *)params->send.buf) + send_buffer_displ, buffer_len); + } + } + + unsigned step_idx; + for (step_idx = 0; step_idx < ((ucg_builtin_plan_t *)op->super.plan)->phs_cnt; step_idx++) { + (&op->steps[step_idx])->am_header.remote_offset = (&op->steps[step_idx])->remote_offset = 0; + } +} + +void ucg_builtin_final_throttled_scatter(ucg_builtin_request_t *req) +{ + ucg_builtin_op_t *op = req->op; + + unsigned step_idx; + for (step_idx = 0; step_idx < ((ucg_builtin_plan_t *)op->super.plan)->phs_cnt; step_idx++) { + ucg_builtin_op_step_t *step = &(op->steps[step_idx]); + ucg_builtin_step_free_pack_rank_buffer(step); + ucg_builtin_free((void **)&step->send_coll_params); + ucg_builtin_free((void **)&step->recv_coll_params); + } +} + +void ucg_builtin_throttled_scatter_alltoallv_cb(ucg_builtin_request_t *req) +{ + ucg_collective_params_t *params = &(req->op->super.params); + ucg_builtin_op_step_t *step = req->step; + ucg_builtin_plan_phase_t *phase = step->phase; + ucg_builtin_coll_params_t *recv_coll_params = step->recv_coll_params; + ucg_builtin_coll_params_t *send_coll_params = step->send_coll_params; + + /* initialize step send coll parameters */ + send_coll_params->init_buf = params->send.buf == MPI_IN_PLACE ? 
(int8_t *)params->recv.buf
+ : (int8_t *)params->send.buf;
+ send_coll_params->counts = params->send.counts;
+ send_coll_params->displs = params->send.displs;
+
+ /* initialize step recv coll parameter */
+ recv_coll_params->init_buf = (int8_t *)params->recv.buf;
+ recv_coll_params->counts = params->recv.counts;
+ recv_coll_params->displs = params->recv.displs;
+
+ /* allocate pack rank buffer */
+ unsigned send_dt_len = params->send.dt_len;
+ unsigned send_start_block = phase->ex_attr.start_block;
+ unsigned send_num_blocks = phase->ex_attr.num_blocks;
+ unsigned member_cnt = phase->ex_attr.member_cnt;
+ unsigned block_idx = send_start_block;
+ unsigned phase_send_buffer_length = 0;
+
+ while (block_idx < (send_start_block + send_num_blocks)) {
+ unsigned real_block_idx = block_idx % member_cnt;
+ phase_send_buffer_length += step->send_coll_params->counts[real_block_idx];
+ block_idx++;
+ }
+ phase_send_buffer_length *= send_dt_len;
+
+ ucs_status_t status = ucg_builtin_step_alloc_pack_rank_buffer(step, phase_send_buffer_length);
+ if (status != UCS_OK) {
+ req->ladd_req_status = status;
+ }
+}
+
+void ucg_builtin_init_plummer(ucg_builtin_op_t *op)
+{
+ ucg_collective_params_t *params = &(op->super.params);
+ if (params->send.buf != MPI_IN_PLACE) {
+ /* Copy this rank's own data from the send buffer to the recv buffer. */
+ ucg_group_member_index_t my_index = op->super.plan->my_index;
+ int send_count = params->send.counts[my_index];
+ int send_displ = params->send.displs[my_index];
+ int recv_displ = params->recv.displs[my_index];
+
+ if (send_count > 0) {
+ uint64_t buffer_len = send_count * params->send.dt_len;
+ uint64_t send_buffer_displ = send_displ * params->send.dt_len;
+ uint64_t recv_buffer_displ = recv_displ * params->recv.dt_len;
+
+ memcpy(((int8_t *)params->recv.buf) + recv_buffer_displ,
+ ((int8_t *)params->send.buf) + send_buffer_displ, buffer_len);
+ }
+ }
+ /* In alltoallv, both remote_offset and am_header.remote_offset initial values are 0.
+ * The value is calculated dynamically during message sending and receiving.
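+ * Re-initializing the header offset from the step's cached value keeps a
+ * reused operation object from inheriting offsets left over from a
+ * previous trigger of the same collective.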
+ */ + unsigned step_idx; + for (step_idx = 0; step_idx < ((ucg_builtin_plan_t *)op->super.plan)->phs_cnt; step_idx++) { + (&op->steps[step_idx])->am_header.remote_offset = (&op->steps[step_idx])->remote_offset; + } +} + +void ucg_builtin_final_plummer(ucg_builtin_request_t *req) +{ + ucg_builtin_op_t *op = req->op; + + unsigned step_idx; + for (step_idx = 0; step_idx < ((ucg_builtin_plan_t *)op->super.plan)->phs_cnt; step_idx++) { + ucg_builtin_op_step_t *step = &(op->steps[step_idx]); + ucg_builtin_step_free_pack_rank_buffer(step); + + if (step->phase->ex_attr.is_variable_len) { + if (step->phase->send_ep_cnt > 0) { + ucg_builtin_free_coll_params(&(step->send_coll_params)); + } + if (step->phase->recv_ep_cnt > 0) { + ucg_builtin_free_coll_params(&(step->recv_coll_params)); + } + } + } + ucg_builtin_free((void **)&(op->temp_data_buffer)); + ucg_builtin_free((void **)&(op->temp_data_buffer1)); + ucg_builtin_free((void **)&(op->temp_exchange_buffer)); + ucg_builtin_free((void **)&(op->temp_exchange_buffer1)); +} + static ucs_status_t ucg_builtin_op_select_callback(ucg_builtin_plan_t *plan, int is_send_contig, int is_recv_contig, ucg_builtin_op_init_cb_t *init_cb, ucg_builtin_op_final_cb_t *final_cb) { - ucs_debug("op select callback, method:%d, send_contig:%d, recv_contig:%d", + ucs_info("op select callback, method:%d, send_contig:%d, recv_contig:%d", plan->phss[0].method, is_send_contig, is_recv_contig); - + unsigned is_allgather = plan->super.type.modifiers & UCG_GROUP_COLLECTIVE_MODIFIER_ALLGATHER; switch (plan->phss[0].method) { case UCG_PLAN_METHOD_REDUCE_WAYPOINT: case UCG_PLAN_METHOD_REDUCE_TERMINAL: @@ -873,6 +1542,11 @@ static ucs_status_t ucg_builtin_op_select_callback(ucg_builtin_plan_t *plan, } break; + case UCG_PLAN_METHOD_REDUCE_SCATTER_RECURSIVE: + *init_cb = ucg_builtin_init_rabenseifner; + *final_cb = NULL; + break; + case UCG_PLAN_METHOD_ALLGATHER_RECURSIVE: *init_cb = ucg_builtin_init_allgather_recursive; *final_cb = NULL; @@ -898,7 +1572,18 @@ static ucs_status_t ucg_builtin_op_select_callback(ucg_builtin_plan_t *plan, *init_cb = ucg_builtin_init_ring; *final_cb = NULL; break; - + case UCG_PLAN_METHOD_INC: + *init_cb = ucg_builtin_init_inc; + *final_cb = NULL; + break; + case UCG_PLAN_METHOD_EXCHANGE: + *init_cb = is_allgather ? 
ucg_builtin_init_gather : ucg_builtin_init_pairwise; + *final_cb = NULL; + break; + case UCG_PLAN_METHOD_ALLTOALLV_LADD: + *init_cb = ucg_builtin_init_throttled_scatter; + *final_cb = ucg_builtin_final_throttled_scatter; + break; default: if (!is_send_contig) { if (!is_recv_contig) { @@ -918,6 +1603,10 @@ static ucs_status_t ucg_builtin_op_select_callback(ucg_builtin_plan_t *plan, break; } + if (plan->ucg_algo.plummer && (plan->phss[0].method != UCG_PLAN_METHOD_ALLTOALLV_LADD)) { + *init_cb = ucg_builtin_init_plummer; + *final_cb = ucg_builtin_final_plummer; + } return UCS_OK; } @@ -957,6 +1646,7 @@ static inline ucs_status_t ucg_builtin_step_zcopy_prep(ucg_builtin_op_step_t *st ucs_status_t status = uct_md_mem_reg(step->uct_md, step->send_buffer, step->buffer_length, UCT_MD_MEM_ACCESS_ALL, &step->zcopy.memh); if (status != UCS_OK) { + ucs_error("failed to register memory %p, length %ld", step->send_buffer, step->buffer_length); ucs_free(zcomp); zcomp = NULL; return status; @@ -964,6 +1654,45 @@ static inline ucs_status_t ucg_builtin_step_zcopy_prep(ucg_builtin_op_step_t *st return UCS_OK; } +static inline ucs_status_t ucg_builtin_dynamic_zcopy_prep(ucg_builtin_op_step_t *step, unsigned ep_index) +{ + /* Allocate callback context for zero-copy sends */ + ucg_builtin_zcopy_info_t *zcopy = &step->zcopys[ep_index]; + if ((!zcopy->memh) && (!zcopy->zcomp)) { + uint32_t zcomp_cnt = step->fragments; + zcopy->zcopy_pending = zcomp_cnt; + zcopy->memh = NULL; /* - in case the allocation fails... */ + zcopy->num_store = 0; + ucg_builtin_zcomp_t *zcomp = + zcopy->zcomp = (ucg_builtin_zcomp_t *)UCS_ALLOC_CHECK(zcomp_cnt * + sizeof(*zcomp), "ucg_zcopys_completion"); + ucp_ep_h ucp_ep = step->phase->ucp_eps[ep_index]; + zcopy->uct_md = ucp_ep_get_am_uct_md(ucp_ep); + + /* Initialize all the zero-copy send completion structures */ + while(zcomp_cnt--) { + zcomp->comp.func = ucg_builtin_step_am_zcopy_comp_step_check_cb; + zcomp->comp.count = 1; + zcomp++; + } + + /* Register the buffer, creating a memory handle used in zero-copy sends */ + ucs_status_t status = uct_md_mem_reg(zcopy->uct_md, step->send_buffer, + step->buffer_length, UCT_MD_MEM_ACCESS_ALL, &zcopy->memh); + + if (status != UCS_OK) { + ucs_error("failed to register memory %p, length %ld", step->send_buffer, step->buffer_length); + ucg_builtin_free((void **)&zcomp); + return status; + } + + /* set "current" step->zcopy point to step->zcopys[ep_index] for sending */ + step->zcopy.memh = step->zcopys[ep_index].memh; + step->zcopy.num_store = step->zcopys[ep_index].num_store; + step->zcopy.zcomp = step->zcopys[ep_index].zcomp; + } + return UCS_OK; +} static ucs_status_t ucg_builtin_optimize_bcopy_to_zcopy(ucg_builtin_op_t *op) { /* This function was called because we want to "upgrade" a bcopy-send to @@ -994,11 +1723,7 @@ static ucs_status_t ucg_builtin_optimize_bcopy_to_zcopy(ucg_builtin_op_t *op) bcopy_to_zcopy_cleanup: while (step_idx--) { - step = &op->steps[step_idx]; - if (step->zcopy.zcomp != NULL) { - ucs_free(step->zcopy.zcomp); - step->zcopy.zcomp = NULL; - } + ucg_builtin_free((void **)op->steps[step_idx].zcopy.zcomp); } return status; } @@ -1020,7 +1745,12 @@ static ucs_status_t ucg_builtin_op_consider_optimization(ucg_builtin_op_t *op, ucg_builtin_op_step_t *step = NULL; ucg_step_idx_ext_t step_idx = 0; unsigned opt_flag = config->bcopy_to_zcopy_opt; - + /* Currently, this function is shielded in the + * alltoallv scenario because the buffer length changes. 
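+ * The bcopy-to-zcopy upgrade caches a single memory registration per
+ * step, which only pays off when the same buffer and length are reused;
+ * with variable-length (LADD) steps every peer gets a different slice,
+ * so the dynamic path registers memory per endpoint instead (see
+ * ucg_builtin_dynamic_zcopy_prep above).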
+ */
+ if (op->steps[0].phase->method == UCG_PLAN_METHOD_ALLTOALLV_LADD) {
+ opt_flag = 0;
+ }
if (opt_flag && !op->send_dt) { do { step = &op->steps[step_idx++]; diff --git a/builtin/ops/builtin_ops.c b/builtin/ops/builtin_ops.c index 98cee21..ef4374c 100644 --- a/builtin/ops/builtin_ops.c +++ b/builtin/ops/builtin_ops.c @@ -1,10 +1,12 @@ /*
- * Copyright (C) Huawei Technologies Co., Ltd. 2019. ALL RIGHTS RESERVED.
+ * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED.
* See file LICENSE for terms. */ #include - +#include +#include +#include #include #include #include @@ -19,12 +21,15 @@ ucg_group_member_index_t g_myidx = 0; unsigned num_procs = 0;
+/* To keep the ucg_builtin_step_create interface unchanged, these globals pass the extra values. */
+short g_myposition = 0;
+int g_reduce_coinsidency = 0;
/****************************************************************************** * * * Operation Execution * * * ******************************************************************************/
-void ucg_builtin_step_assert(ucg_builtin_op_step_t *step, enum ucg_builtin_op_step_flags step_flag)
+static void ucg_builtin_step_assert(ucg_builtin_op_step_t *step, enum ucg_builtin_op_step_flags step_flag)
{ ucs_assert((step->flags & (UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_SHORT | UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_BCOPY | @@ -34,11 +39,106 @@ void ucg_builtin_step_assert(ucg_builtin_op_st ucs_assert(step->iter_offset != UCG_BUILTIN_OFFSET_PIPELINE_PENDING); }
+/*
+ * calculate the fragments for the current buffer length
+ * INPUT: length, dt_len, ep_thresh
+ * OUTPUT: fragment_length, fragments, flag
+ */
+static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_calc_fragment(unsigned length, size_t dt_len,
+ ucg_builtin_tl_threshold_t *ep_thresh,
+ size_t *fragment_length,
+ uint32_t *fragments,
+ uint16_t *flag)
+{
+ size_t max_short_one = ep_thresh->max_short_one;
+ size_t max_short_max = ep_thresh->max_short_max;
+ size_t max_bcopy_one = ep_thresh->max_bcopy_one;
+ size_t max_bcopy_max = ep_thresh->max_bcopy_max;
+ size_t max_zcopy_one = ep_thresh->max_zcopy_one;
+ size_t md_attr_cap_max_reg = ep_thresh->md_attr_cap_max_reg;
+ size_t extra_frag = 0;
+ /*
+ * Short messages (e.g. RDMA "inline")
+ */
+ if ((length <= max_short_one) && (max_short_one != 0)) {
+ /* Short send - single message */
+ *fragments = 1;
+ *flag = UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_SHORT;
+ } else if ((length <= max_short_max) && (max_short_max != 0)) {
+ if (dt_len <= max_short_one) {
+ /* Short send - multiple messages */
+ *fragment_length = max_short_one - (max_short_one % dt_len);
+ } else {
+ *fragment_length = max_short_one;
+ }
+ ucs_assert(*fragment_length > 0);
+ extra_frag = (length % (*fragment_length)) > 0;
+ *fragments = ((*fragment_length) ? (length / (*fragment_length)) : 0) + extra_frag;
+ *flag = UCG_BUILTIN_OP_STEP_FLAG_FRAGMENTED | UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_SHORT;
+ /*
+ * Large messages, if supported (e.g. RDMA "zero-copy")
+ */
+ } else if ((length > max_bcopy_max) && (length <= md_attr_cap_max_reg)) {
+ if ((length < max_zcopy_one) && (max_zcopy_one != 0)) {
+ /* ZCopy send - single message */
+ *fragments = 1;
+ *flag = UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_ZCOPY;
+ } else {
+ /* ZCopy send - multiple messages */
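+ /*
+ * Worked example with hypothetical numbers: length = 10MB,
+ * max_zcopy_one = 4MB and dt_len = 8 give fragment_length = 4MB
+ * (already a multiple of dt_len), so fragments = 2 full + 1 partial = 3.
+ */
+ *fragment_length = (dt_len > max_zcopy_one) ?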
+ max_zcopy_one: + (max_zcopy_one - (max_zcopy_one % dt_len)) ; + ucs_assert(*fragment_length > 0); + extra_frag = (length % (*fragment_length)) > 0; + *fragments = length / (*fragment_length) + extra_frag; + *flag = UCG_BUILTIN_OP_STEP_FLAG_FRAGMENTED | UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_ZCOPY; + } + /* + * Medium messages + */ + } else if ((length <= max_bcopy_one) && (max_bcopy_one != 0)) { + /* BCopy send - single message */ + *fragments = 1; + *flag = UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_BCOPY; + } else if ((length <= max_bcopy_max) && (max_bcopy_max != 0)) { + /* BCopy send - multiple messages */ + if (dt_len > max_bcopy_one) { + *fragment_length = max_bcopy_one; + } else { + *fragment_length = max_bcopy_one - (max_bcopy_one % dt_len); + } + extra_frag = (length % (*fragment_length)) > 0; + *fragments = ((*fragment_length) ? (length / (*fragment_length)) : 0) + extra_frag; + *flag = UCG_BUILTIN_OP_STEP_FLAG_FRAGMENTED | UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_BCOPY; + } else { + return UCS_ERR_INVALID_PARAM; + } + return UCS_OK; +} + +/* Add rank id to the front of variable-length operation */ +static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_ep_am_short_pack_rank(uct_ep_h ep, uint8_t id, uint64_t header, + void *payload, unsigned length, + ucg_builtin_op_step_t *step) +{ + ucg_builtin_pack_rank_cb_t pack_rank_func = step->variable_length.pack_rank_func; + if (pack_rank_func != NULL) { + size_t new_length = 0; + void *packed_rank_payload = pack_rank_func(step, payload, length, &new_length); + if (packed_rank_payload == NULL) { + return UCS_ERR_NO_MEMORY; + } + payload = packed_rank_payload; + length = new_length; + } + return uct_ep_am_short(ep, id, header, payload, length); +} + static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_dummy_send(ucg_builtin_request_t *req, ucg_builtin_op_step_t *step, uct_ep_h ep, int is_single_send) { - ucg_builtin_step_assert(step, 0); + ucg_builtin_step_assert(step, UCG_BUILTIN_OP_STEP_FLAG_RECV_AFTER_SEND); return UCS_OK; } @@ -47,7 +147,7 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_short_one(ucg_builti uct_ep_h ep, int is_single_send) { ucg_builtin_step_assert(step, UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_SHORT); - ucs_debug("am_short_one step %u length %zu", step->am_header.step_idx, step->buffer_length); + ucs_info("am_short_one step %u length %zu", step->am_header.step_idx, step->buffer_length); int8_t *send_buffer = step->send_buffer; void *dt_state = step->non_contig.pack_state; @@ -55,8 +155,8 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_short_one(ucg_builti req->op->send_dt->ops.pack(dt_state, 0, step->non_contig.contig_buffer, step->buffer_length); send_buffer = step->non_contig.contig_buffer; } - return step->uct_iface->ops.ep_am_short(ep, step->am_id, - step->am_header.header, send_buffer, step->buffer_length); + return ucg_builtin_ep_am_short_pack_rank(ep, step->am_id, + step->am_header.header, send_buffer, step->buffer_length, step); } static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_short_max(ucg_builtin_request_t *req, @@ -80,15 +180,13 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_short_max(ucg_builti am_iter.remote_offset = (is_single_send) ? 
step->iter_offset : am_iter.remote_offset + step->iter_offset; - ucs_status_t (*ep_am_short)(uct_ep_h, uint8_t, uint64_t, const void*, unsigned) = - step->uct_iface->ops.ep_am_short; ucg_builtin_step_assert(step, UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_SHORT); /* send every fragment but the last */ if (ucs_likely(buffer_iter < buffer_iter_limit)) { do { ucs_debug("am_short_max step %u offset %" PRIu32 " length %u", step->am_header.step_idx, am_iter.remote_offset, frag_size); - status = ep_am_short(ep, am_id, am_iter.header, buffer_iter, frag_size); + status = ucg_builtin_ep_am_short_pack_rank(ep, am_id, am_iter.header, buffer_iter, frag_size, step); if (is_single_send) { return status; @@ -108,7 +206,7 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_short_max(ucg_builti } ucs_debug("am_short_max step: %u; offset: %" PRIu32 "", step->am_header.step_idx, am_iter.remote_offset); - status = ep_am_short(ep, am_id, am_iter.header, buffer_iter, send_buffer + step->buffer_length - buffer_iter); + status = ucg_builtin_ep_am_short_pack_rank(ep, am_id, am_iter.header, buffer_iter, send_buffer + step->buffer_length - buffer_iter, step); /* iter_offset can not set to be zero for pipelining */ if (!is_single_send) { step->iter_offset = (status == UCS_OK) ? 0 : buffer_iter - send_buffer; @@ -117,36 +215,58 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_short_max(ucg_builti return status; } +static size_t ucg_builtin_step_fill_bcopy_header(void *dest, ucg_builtin_request_t *req) +{ + ucg_builtin_op_step_t *step = req->step; + unsigned is_rank_tx = step->phase->ex_attr.is_variable_len; + if (is_rank_tx) { + ucg_builtin_header_ext_t *header_ext_ptr = (ucg_builtin_header_ext_t *)dest; + header_ext_ptr->header = step->am_header; + header_ext_ptr->src_rank = step->phase->ex_attr.packed_rank; + return sizeof(ucg_builtin_header_ext_t); + } else { + ucg_builtin_header_t *header_ptr = (ucg_builtin_header_t *)dest; + header_ptr->header = step->am_header.header; + return sizeof(ucg_builtin_header_t); + } +} + static size_t ucg_builtin_step_am_bcopy_single_frag_packer(void *dest, void *arg) { ucg_builtin_request_t *req = (ucg_builtin_request_t*)arg; ucg_builtin_op_step_t *step = req->step; - ucg_builtin_header_t *header_ptr = (ucg_builtin_header_t*)dest; + size_t header_len = ucg_builtin_step_fill_bcopy_header(dest, req); + int8_t *header_ptr = (int8_t*)dest; void *dt_state = step->non_contig.pack_state; - header_ptr->header = step->am_header.header; if (dt_state != NULL) { - req->op->send_dt->ops.pack(dt_state, 0, header_ptr + 1, step->buffer_length); + req->op->send_dt->ops.pack(dt_state, 0, header_ptr + header_len, step->buffer_length); } else { - memcpy(header_ptr + 1, step->send_buffer, step->buffer_length); + errno_t error_status = memcpy_s(header_ptr + header_len, step->buffer_length, step->send_buffer, step->buffer_length); + if (error_status != EOK) { + ucs_fatal("memcpy_s encountered an error. 
The error code is %d.", error_status); + } } - return sizeof(*header_ptr) + step->buffer_length; + return header_len + step->buffer_length; } static size_t ucg_builtin_step_am_bcopy_full_frag_packer(void *dest, void *arg) { ucg_builtin_request_t *req = (ucg_builtin_request_t*)arg; ucg_builtin_op_step_t *step = req->step; - ucg_builtin_header_t *header_ptr = (ucg_builtin_header_t*)dest; + size_t header_len = ucg_builtin_step_fill_bcopy_header(dest, req); + int8_t *header_ptr = (int8_t*)dest; void *dt_state = step->non_contig.pack_state; - header_ptr->header = step->am_header.header; if (dt_state != NULL) { - req->op->send_dt->ops.pack(dt_state, step->iter_offset, header_ptr + 1, step->fragment_length); + req->op->send_dt->ops.pack(dt_state, step->iter_offset, header_ptr + header_len, step->fragment_length); } else { - memcpy(header_ptr + 1, step->send_buffer + step->iter_offset, step->fragment_length); + errno_t error_status = memcpy_s(header_ptr + header_len, step->fragment_length, step->send_buffer + step->iter_offset, step->fragment_length); + if (error_status != EOK) { + ucs_fatal("memcpy_s encountered an error. The error code is %d.", error_status); + } } - return sizeof(*header_ptr) + step->fragment_length; + return header_len + step->fragment_length; } static size_t ucg_builtin_step_am_bcopy_partial_frag_packer(void *dest, void *arg) @@ -154,16 +274,19 @@ static size_t ucg_builtin_step_am_bcopy_partial_frag_packer(void *dest, void *ar ucg_builtin_request_t *req = (ucg_builtin_request_t*)arg; ucg_builtin_op_step_t *step = req->step; ucg_offset_t last_frag_length = step->buffer_length - step->iter_offset; - ucg_builtin_header_t *header_ptr = (ucg_builtin_header_t*)dest; + size_t header_len = ucg_builtin_step_fill_bcopy_header(dest, req); + int8_t *header_ptr = (int8_t*)dest; void *dt_state = step->non_contig.pack_state; - header_ptr->header = step->am_header.header; if (dt_state != NULL) { - req->op->send_dt->ops.pack(dt_state, step->iter_offset, header_ptr + 1, last_frag_length); + req->op->send_dt->ops.pack(dt_state, step->iter_offset, header_ptr + header_len, last_frag_length); } else { - memcpy(header_ptr + 1, step->send_buffer + step->iter_offset, last_frag_length); + errno_t error_status = memcpy_s(header_ptr + header_len, last_frag_length, step->send_buffer + step->iter_offset, last_frag_length); + if (error_status != EOK) { + ucs_fatal("memcpy_s encountered an error. The error code is %d.", error_status); + } } - return sizeof(*header_ptr) + last_frag_length; + return header_len + last_frag_length; } static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_bcopy_one(ucg_builtin_request_t *req, @@ -174,7 +297,7 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_bcopy_one(ucg_builti /* send active message to remote endpoint */ ucs_debug("am_bcopy_one step %u length %zu", step->am_header.step_idx, step->buffer_length); - ssize_t len = step->uct_iface->ops.ep_am_bcopy(ep, step->am_id, + ssize_t len = uct_ep_am_bcopy(ep, step->am_id, ucg_builtin_step_am_bcopy_single_frag_packer, req, 0); return (ucs_unlikely(len < 0)) ? (ucs_status_t)len : UCS_OK; } @@ -190,8 +313,6 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_bcopy_max(ucg_builti step->am_header.remote_offset = (is_single_send) ? 
step->iter_offset : step->am_header.remote_offset; - ssize_t (*ep_am_bcopy)(uct_ep_h, uint8_t, uct_pack_callback_t, void*, unsigned) = - step->uct_iface->ops.ep_am_bcopy; ucg_builtin_step_assert(step, UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_BCOPY); @@ -200,7 +321,7 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_bcopy_max(ucg_builti /* send every fragment but the last */ do { ucs_debug("am_bcopy_max step %u offset %" PRIu32 " length %u", step->am_header.step_idx, step->am_header.remote_offset, frag_size); - len = ep_am_bcopy(ep, am_id, ucg_builtin_step_am_bcopy_full_frag_packer, req, 0); + len = uct_ep_am_bcopy(ep, am_id, ucg_builtin_step_am_bcopy_full_frag_packer, req, 0); if (is_single_send) { return ucs_unlikely(len < 0) ? (ucs_status_t)len : UCS_OK; @@ -219,7 +340,7 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_bcopy_max(ucg_builti /* Send last fragment of the message */ ucs_debug("am_bcopy_max step: %u; offset: %" PRIu32 "", step->am_header.step_idx, step->am_header.remote_offset); - len = ep_am_bcopy(ep, am_id, ucg_builtin_step_am_bcopy_partial_frag_packer, req, 0); + len = uct_ep_am_bcopy(ep, am_id, ucg_builtin_step_am_bcopy_partial_frag_packer, req, 0); if (ucs_unlikely(len < 0)) { return (ucs_status_t)len; } @@ -231,10 +352,33 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_bcopy_max(ucg_builti return UCS_OK; } +static ucs_status_t ucg_builtin_step_am_zcopy_pack_rank(uct_ep_h ep, + ucg_builtin_op_step_t *step, + const uct_iov_t *iov, size_t iovcnt, + unsigned flags, + uct_completion_t *comp) +{ + ucs_status_t status; + unsigned is_rank_tx = step->phase->ex_attr.is_variable_len; + if (is_rank_tx) { + step->am_header_ext.header = step->am_header; + step->am_header_ext.src_rank = step->phase->ex_attr.packed_rank; + status = uct_ep_am_zcopy(ep, step->am_id, + &step->am_header_ext, sizeof(step->am_header_ext), + iov, iovcnt, flags, comp); + } else { + status = uct_ep_am_zcopy(ep, step->am_id, + &step->am_header, sizeof(step->am_header), + iov, iovcnt, flags, comp); + } + return status; +} + static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_zcopy_one(ucg_builtin_request_t *req, ucg_builtin_op_step_t *step, uct_ep_h ep, int is_single_send) { + ucs_status_t status = UCS_OK; int8_t *send_buffer = step->send_buffer; void *dt_state = step->non_contig.pack_state; @@ -256,9 +400,7 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_zcopy_one(ucg_builti zcomp->req = req; ucs_debug("am_zcopy_one step %u length %zu", step->am_header.step_idx, step->buffer_length); - ucs_status_t status = step->uct_iface->ops.ep_am_zcopy(ep, step->am_id, - &step->am_header, sizeof(step->am_header), - &iov, 1, 0, &zcomp->comp); + status = ucg_builtin_step_am_zcopy_pack_rank(ep, step, &iov, 1, 0, &zcomp->comp); return ucs_unlikely(status != UCS_INPROGRESS) ? status : UCS_OK; } @@ -267,7 +409,6 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_zcopy_max(ucg_builti uct_ep_h ep, int is_single_send) { ucs_status_t status; - unsigned am_id = step->am_id; step->am_header.remote_offset = (is_single_send) ? 
step->iter_offset : step->am_header.remote_offset; int8_t *send_buffer = step->send_buffer; @@ -279,12 +420,14 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_zcopy_max(ucg_builti ucg_offset_t frag_size = step->fragment_length; void* iov_buffer_limit = send_buffer + step->buffer_length - frag_size; - unsigned zcomp_index = step->iter_ep * step->fragments + - step->iter_offset / step->fragment_length; - ucg_builtin_zcomp_t *zcomp = &step->zcopy.zcomp[zcomp_index]; - ucs_status_t (*ep_am_zcopy)(uct_ep_h, uint8_t, const void*, unsigned, - const uct_iov_t*, size_t, unsigned, uct_completion_t*) = - step->uct_iface->ops.ep_am_zcopy; + unsigned zcomp_index; + if (step->phase->ex_attr.is_variable_len) { + step->iter_offset = step->am_header.remote_offset; + zcomp_index = step->iter_offset / step->fragment_length; + } else { + zcomp_index = step->iter_ep * step->fragments + step->iter_offset / step->fragment_length; + } + ucg_builtin_zcomp_t *zcomp = &step->zcopy.zcomp[zcomp_index]; uct_iov_t iov = { .buffer = send_buffer + step->iter_offset, @@ -301,8 +444,7 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_zcopy_max(ucg_builti /* send every fragment but the last */ do { ucs_debug("am_zcopy_max step %u offset %" PRIu32 " length %u", step->am_header.step_idx, step->am_header.remote_offset, frag_size); - status = ep_am_zcopy(ep, am_id, &step->am_header, - sizeof(step->am_header), &iov, + status = ucg_builtin_step_am_zcopy_pack_rank(ep, step, &iov, 1, 0, &zcomp->comp); (zcomp++)->req = req; @@ -327,9 +469,8 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_zcopy_max(ucg_builti zcomp->req = req; iov.length = send_buffer + step->buffer_length - (int8_t*)iov.buffer; ucs_debug("am_zcopy_max step %u offset %" PRIu32 " length %zu", step->am_header.step_idx, step->am_header.remote_offset, iov.length); - status = ep_am_zcopy(ep, am_id, &step->am_header, - sizeof(step->am_header), - &iov, 1, 0, &zcomp->comp); + status = ucg_builtin_step_am_zcopy_pack_rank(ep, step, &iov, 1, 0, &zcomp->comp); + if (ucs_unlikely(status != UCS_INPROGRESS)) { step->iter_offset = (!is_single_send) ? 
(int8_t*)iov.buffer - send_buffer : step->iter_offset; @@ -344,16 +485,172 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_zcopy_max(ucg_builti return UCS_OK; } +static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_dynamic(ucg_builtin_request_t *req, + ucg_builtin_op_step_t *step, + uct_ep_h ep, + unsigned is_single_send) +{ + ucs_status_t status = UCS_OK; + int is_short = step->flags & UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_SHORT; + int is_bcopy = step->flags & UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_BCOPY; + int is_zcopy = step->flags & UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_ZCOPY; + int is_fragmented = step->flags & UCG_BUILTIN_OP_STEP_FLAG_FRAGMENTED; + + if (step->resend_flag & UCG_BUILTIN_OP_STEP_FIRST_SEND) { + step->am_header.remote_offset = 0; + step->iter_offset = 0; + } + + /* Single-send operations (only one fragment passed to UCT) */ + if (!is_fragmented) { + if (is_short) { + status = ucg_builtin_step_am_short_one(req, step, ep, is_single_send); + } else if (is_bcopy) { + status = ucg_builtin_step_am_bcopy_one(req, step, ep, is_single_send); + } else if (is_zcopy) { + status = ucg_builtin_step_am_zcopy_one(req, step, ep, is_single_send); + } + } else { + if (is_short) { + status = ucg_builtin_step_am_short_max(req, step, ep, is_single_send); + } else if (is_bcopy) { + status = ucg_builtin_step_am_bcopy_max(req, step, ep, is_single_send); + } else if (is_zcopy) { + status = ucg_builtin_step_am_zcopy_max(req, step, ep, is_single_send); + } + } + return status; +} + +static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_update_pending(ucg_builtin_request_t *req, unsigned recv_ep_index) +{ + ucs_status_t status = UCS_OK; + ucg_collective_params_t *params = &(req->op->super.params); + ucg_builtin_op_step_t *step = req->step; + ucg_builtin_plan_phase_t *phase = step->phase; + unsigned recv_buffer_length = step->buffer_length_recv; + ucg_builtin_tl_threshold_t *ep_thresh = &phase->ep_thresh[recv_ep_index]; + + size_t fragment_length = 0; + uint32_t fragments = 0; + uint16_t recv_flag = 0; + + status = ucg_builtin_step_calc_fragment(recv_buffer_length, params->recv.dt_len, ep_thresh, + &fragment_length, &fragments, &recv_flag); + if (status != UCS_OK) { + return status; + } + + /* update the pending related to receive count */ + req->pending += fragments; + return status; +} + +static void ucg_builtin_dynamic_calc_pending(ucg_builtin_request_t *req, ucg_request_t **user_req) +{ + ucg_collective_params_t *params = &(req->op->super.params); + ucg_builtin_op_step_t *step = req->step; + ucg_builtin_plan_phase_t *phase = step->phase; + ucg_builtin_coll_params_t *recv_coll_params = step->recv_coll_params; + + unsigned recv_start_block = phase->ex_attr.recv_start_block; + unsigned local_member_cnt = phase->ex_attr.member_cnt; + + unsigned block_idx; + while (step->iter_ep < phase->ep_cnt) { + uct_ep_h *ep_iter = phase->multi_eps + step->iter_ep; + if (*ep_iter) { + block_idx = (recv_start_block + local_member_cnt + step->iter_ep - phase->send_ep_cnt) % local_member_cnt; + if (recv_coll_params->counts[block_idx] > 0) { + step->buffer_length_recv = recv_coll_params->counts[block_idx] * params->recv.dt_len; + ucg_builtin_step_update_pending(req, step->iter_ep); + } + } + step->iter_ep++; + } +} + +static ucs_status_t ucg_builtin_dynamic_send_recv(ucg_builtin_request_t *req, ucg_request_t **user_req) +{ + ucs_status_t status = UCS_OK; + + ucg_builtin_op_step_t *step = req->step; + ucg_builtin_plan_phase_t *phase = step->phase; + ucg_collective_params_t *params = &(req->op->super.params); 
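+ /*
+ * Outline of the loop below: each send endpoint is treated as a
+ * stand-alone transmission. This peer's slice of init_buf is selected
+ * via counts/displs, classified (short/bcopy/zcopy) with
+ * ucg_builtin_step_calc_fragment(), and sent; the saved step->flags are
+ * then restored so the next peer starts from a clean state.
+ */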
+ ucg_builtin_coll_params_t *send_coll_params = step->send_coll_params;
+
+ /* step dynamic send flag */
+ step->flags |= UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_DYNAMIC;
+
+ /* save the original step->flags */
+ uint16_t orig_flags = step->flags;
+
+ unsigned send_start_block = phase->ex_attr.start_block;
+ unsigned local_member_cnt = phase->ex_attr.member_cnt;
+
+ /* initialize the pending before both sending/receiving */
+ if (step->resend_flag & UCG_BUILTIN_OP_STEP_FIRST_SEND) {
+ req->pending = 0;
+ }
+
+ int block_idx;
+ while (step->iter_ep < phase->send_ep_cnt) {
+ uct_ep_h *ep_iter = phase->multi_eps + step->iter_ep;
+ if (*ep_iter) {
+ block_idx = (send_start_block + step->iter_ep) % local_member_cnt;
+ if (step->send_coll_params->counts[block_idx] > 0) {
+ int send_buffer_length = send_coll_params->counts[block_idx] * params->send.dt_len;
+ int send_buffer_displ = send_coll_params->displs[block_idx] * params->send.dt_len;
+ step->send_buffer = send_coll_params->init_buf + send_buffer_displ;
+ step->buffer_length = send_buffer_length;
+ uint16_t send_flag = 0;
+ /* calculate the flag and fragments for sender */
+ status = ucg_builtin_step_calc_fragment(send_buffer_length, params->send.dt_len,
+ &step->phase->ep_thresh[step->iter_ep], &step->fragment_length, &step->fragments, &send_flag);
+ if (status != UCS_OK) {
+ step->resend_flag = UCG_BUILTIN_OP_STEP_RESEND;
+ step->flags = orig_flags;
+ return status;
+ }
+
+ step->flags |= send_flag;
+ /* register memory for zero-copy */
+ if (step->flags & UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_ZCOPY) {
+ status = ucg_builtin_dynamic_zcopy_prep(step, step->iter_ep);
+ if (status != UCS_OK) {
+ step->resend_flag = UCG_BUILTIN_OP_STEP_RESEND;
+ step->flags = orig_flags;
+ return status;
+ }
+ }
+
+ status = ucg_builtin_step_am_dynamic(req, step, *ep_iter, 0);
+ if (status != UCS_OK) {
+ step->resend_flag = UCG_BUILTIN_OP_STEP_RESEND;
+ step->flags = orig_flags;
+ return status;
+ }
+
+ /* restore the original state of step->flags */
+ step->flags = orig_flags;
+ }
+ }
+ step->iter_ep++;
+ }
+ return status;
+}
+
/* * Below is a set of macros, generating most bit-field combinations of * step->flags inside @ref ucg_builtin_step_execute() . */
-#define case_send(req, ureq, step, phase, _send_func) { \
+#define case_send(req, ureq, step, phase, _send_func) do { \
if ((is_rs1 || is_r1s) && ((step)->iter_ep == 0)) { \ uint32_t new_cnt = (step)->iter_ep = is_r1s ? 1 : (phase)->ep_cnt - 1; \ ucs_assert(new_cnt > 0); \ if (is_pipelined) { \
- memset((void*)(step)->fragment_pending, new_cnt, (step)->fragments); \
+ memset_s((void*)(step)->fragment_pending, (step)->fragments, \
+ new_cnt, (step)->fragments); \
} \ (req)->pending = new_cnt * (step)->fragments_recv; \ /* Beyond the case we fall-back to receiving */ \ @@ -472,7 +769,7 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_am_zcopy_max(ucg_builti return ucg_builtin_comp_step_cb(req, ureq); \ } \ } \
- } \
+ } while (0) \
#define INIT_USER_REQUEST_IF_GIVEN(user_req, req) { \ if (ucs_unlikely((user_req) != NULL)) { \ @@ -533,8 +830,21 @@ UCS_PROFILE_FUNC(ucs_status_t, ucg_builtin_step_execute, (req, user_req), * For some operations, like MPI_Alltoall, the * discrete data should be packed before sending (e.g. Bruck algorithms).
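* The callback runs at most once per request: is_send_cb_called guards
* against repacking the data when the step is re-executed after a
* partial send.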
*/ - if (req->step->send_cb != NULL) { + if (req->step->send_cb != NULL && !req->is_send_cb_called) { req->step->send_cb(req); + req->is_send_cb_called = 1; + if (req->inc_req_status != UCS_OK) { + status = req->inc_req_status; + goto step_execute_error; + } + if (req->ladd_req_status != UCS_OK) { + status = req->ladd_req_status; + goto step_execute_error; + } + if (req->plummer_req_status != UCS_OK) { + status = req->plummer_req_status; + goto step_execute_error; + } } is_scatter = step->flags & UCG_BUILTIN_OP_STEP_FLAG_LENGTH_PER_REQUEST; @@ -552,24 +862,49 @@ UCS_PROFILE_FUNC(ucs_status_t, ucg_builtin_step_execute, (req, user_req), is_zcopy = step->flags & UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_ZCOPY; is_fragmented = step->flags & UCG_BUILTIN_OP_STEP_FLAG_FRAGMENTED; - /* for recv-only step */ - if (is_dummy) case_send(req, user_req, step, phase, ucg_builtin_step_dummy_send); + if (phase->ex_attr.is_variable_len) { + status = ucg_builtin_dynamic_send_recv(req, user_req); + if (status != UCS_OK) { + goto step_execute_error; + } + /* calculate zcopy pending */ + unsigned ep_iter; + for (ep_iter = 0; ep_iter < phase->send_ep_cnt; ep_iter++) { + req->pending += step->zcopys[ep_iter].zcopy_pending; + } - if (!is_fragmented) { /* Single-send operations (only one fragment passed to UCT) */ - if (is_short) { - case_send(req, user_req, step, phase, ucg_builtin_step_am_short_one); - } else if (is_bcopy) { - case_send(req, user_req, step, phase, ucg_builtin_step_am_bcopy_one); - } else if (is_zcopy) { - case_send(req, user_req, step, phase, ucg_builtin_step_am_zcopy_one); + ucg_builtin_dynamic_calc_pending(req, user_req); + if (req->pending == 0) { + if (is_last) { + if (!user_req) { + ucg_builtin_comp_last_step_cb(req, UCS_OK); + } + return UCS_OK; + } else { + return ucg_builtin_comp_step_cb(req, user_req); + } + } else { + ucg_builtin_step_var_callbacks(req->pending, &step->recv_cb); } - } else { /* Multi-send operations (using iter_ep and iter_offset for context) */ - if (is_short) { - case_send(req, user_req, step, phase, ucg_builtin_step_am_short_max); - } else if (is_bcopy) { - case_send(req, user_req, step, phase, ucg_builtin_step_am_bcopy_max); - } else if (is_zcopy) { - case_send(req, user_req, step, phase, ucg_builtin_step_am_zcopy_max); + } else { + ucs_debug("is_dummy:%d is_fragmented:%d is_short:%d is_bcopy:%d is_zcopy:%d", is_dummy, is_fragmented,is_short, is_bcopy, is_zcopy); + if (is_dummy) case_send(req, user_req, step, phase, ucg_builtin_step_dummy_send); + if (!is_fragmented) { + if (is_short) { + case_send(req, user_req, step, phase, ucg_builtin_step_am_short_one); + } else if (is_bcopy) { + case_send(req, user_req, step, phase, ucg_builtin_step_am_bcopy_one); + } else if (is_zcopy) { + case_send(req, user_req, step, phase, ucg_builtin_step_am_zcopy_one); + } + } else { + if (is_short) { + case_send(req, user_req, step, phase, ucg_builtin_step_am_short_max); + } else if (is_bcopy) { + case_send(req, user_req, step, phase, ucg_builtin_step_am_bcopy_max); + } else if (is_zcopy) { + case_send(req, user_req, step, phase, ucg_builtin_step_am_zcopy_max); + } } } finish_send: @@ -587,12 +922,14 @@ UCS_PROFILE_FUNC(ucs_status_t, ucg_builtin_step_execute, (req, user_req), if (is_zcopy && is_recv) { /* Count pre-arrived zcopy msg to req->step->zcopy.num_store */ local_id = slot->local_id; + /* receive from "multiple" EPs with "multiple" fragments */ + unsigned recv_zcopy_cnt = step->fragments_recv * step->phase->ep_cnt; ucg_builtin_comp_desc_t *desc = NULL; ucg_builtin_comp_desc_t *iter = 
NULL; ucs_list_for_each_safe(desc, iter, &slot->msg_head, super.tag_list[0]) { if (ucs_likely(desc->header.local_id == local_id)) { /* The number of store will not bigger than recv fragments */ - if (++step->zcopy.num_store >= step->fragments_recv) { + if (++step->zcopy.num_store >= recv_zcopy_cnt) { break; } } @@ -623,16 +960,6 @@ UCS_PROFILE_FUNC(ucs_status_t, ucg_builtin_step_execute, (req, user_req), return status; } -void ucg_builtin_dispose_packet(ucg_builtin_comp_desc_t *desc) -{ - /* Dispose of the packet, according to its allocation */ - if (desc->super.flags == UCT_CB_PARAM_FLAG_DESC) { - uct_iface_release_desc(desc); - } else { - ucs_mpool_put_inline(desc); - } -} - ucs_status_t ucg_builtin_msg_process(ucg_builtin_comp_slot_t *slot, ucg_builtin_request_t *req) { static unsigned loop_cnt = 0; @@ -671,8 +998,8 @@ ucs_status_t ucg_builtin_msg_process(ucg_builtin_comp_slot_t *slot, ucg_builtin_ int is_step_done = step->recv_cb(&slot->req, desc->header.remote_offset, &desc->data[0], desc->super.length); - ucg_builtin_dispose_packet(desc); - + desc->release(desc); + desc = NULL; loop_cnt--; /* If the step has indeed completed - check the entire op */ @@ -692,6 +1019,51 @@ ucs_status_t ucg_builtin_msg_process(ucg_builtin_comp_slot_t *slot, ucg_builtin_ return UCS_INPROGRESS; } +void *ucg_builtin_pack_rank(void *step, const void *send_buffer, size_t buffer_len, size_t *new_buffer_len) +{ + ucg_builtin_op_step_t *temp_step = (ucg_builtin_op_step_t *)step; + ucg_group_member_index_t my_idx = temp_step->phase->ex_attr.packed_rank; + int8_t *temp_buffer = (int8_t *)temp_step->variable_length.pack_rank_buffer; + + errno_t error_status = memcpy_s(temp_buffer, sizeof(ucg_group_member_index_t), + (int8_t *)&my_idx, sizeof(ucg_group_member_index_t)); + if (error_status != EOK) { + ucs_fatal("memcpy_s encountered an error. The error code is %d.", error_status); + } + error_status = memcpy_s(temp_buffer + sizeof(ucg_group_member_index_t), buffer_len, + (int8_t *)send_buffer, buffer_len); + if (error_status != EOK) { + ucs_fatal("memcpy_s encountered an error. 
The error code is %d.", error_status); + } + *new_buffer_len = buffer_len + sizeof(ucg_group_member_index_t); + return temp_buffer; +} + +ucg_group_member_index_t ucg_builtin_unpack_rank(const void *send_buffer, size_t buffer_length) +{ + ucs_assert(buffer_length >= sizeof(ucg_group_member_index_t)); + return *(ucg_group_member_index_t *)send_buffer; +} + +ucs_status_t ucg_builtin_step_alloc_pack_rank_buffer(ucg_builtin_op_step_t *step, size_t buffer_length) +{ + if (step->variable_length.pack_rank_buffer == NULL) { + step->variable_length.pack_rank_buffer = + (int8_t *)UCS_ALLOC_CHECK(buffer_length + sizeof(ucg_group_member_index_t) * step->phase->send_ep_cnt, + "pack rank buffer"); + step->variable_length.pack_rank_func = ucg_builtin_pack_rank; + step->variable_length.unpack_rank_func = ucg_builtin_unpack_rank; + } + return UCS_OK; +} + +void ucg_builtin_step_free_pack_rank_buffer(ucg_builtin_op_step_t *step) +{ + ucg_builtin_free((void **)&step->variable_length.pack_rank_buffer); + step->variable_length.pack_rank_func = NULL; + step->variable_length.unpack_rank_func = NULL; +} + ucs_status_t ucg_builtin_step_set_contig(ucg_builtin_op_step_t *step, int is_contig) { @@ -708,7 +1080,7 @@ ucs_status_t ucg_builtin_step_set_contig(ucg_builtin_op_step_t *step, if (step->flags & UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_ZCOPY) { /* The send buffer changed, reregister it */ uct_md_mem_dereg(step->uct_md, step->zcopy.memh); - status = uct_md_mem_reg(step->uct_md, step->non_contig.contig_buffer, + status = uct_md_mem_reg(step->uct_md, step->non_contig.contig_buffer, step->buffer_length, UCT_MD_MEM_ACCESS_ALL, &step->zcopy.memh); if (status != UCS_OK) { if (step->zcopy.zcomp != NULL) { @@ -725,9 +1097,32 @@ ucs_status_t ucg_builtin_step_set_contig(ucg_builtin_op_step_t *step, void ucg_builtin_step_release_contig(ucg_builtin_op_step_t *step) { - if (step->non_contig.contig_buffer != NULL) { - ucs_free(step->non_contig.contig_buffer); - step->non_contig.contig_buffer = NULL; + ucg_builtin_free((void **)&step->non_contig.contig_buffer); +} + +static void free_zcomp(ucg_builtin_op_step_t *step) +{ + ucg_builtin_free((void **)&step->zcopy.zcomp); +} + +static void free_zcopy_info(ucg_builtin_op_step_t *step) +{ + if (step->zcopys != NULL) { + unsigned i; + for (i = 0; i < step->phase->send_ep_cnt; i++) { + if (step->zcopys[i].zcomp != NULL) { + uct_md_mem_dereg(step->zcopys[i].uct_md, step->zcopys[i].memh); + ucs_free(step->zcopys[i].zcomp); + } + } + ucg_builtin_free((void **)&step->zcopys); + } +} + +static void free_fragment_pending(ucg_builtin_op_step_t *step) +{ + if (step->zcopy.zcomp != NULL) { + ucg_builtin_free((void **)&step->fragment_pending); } } @@ -736,24 +1131,25 @@ void ucg_builtin_op_discard(ucg_op_t *op) ucg_builtin_op_t *builtin_op = (ucg_builtin_op_t*)op; ucg_builtin_op_step_t *step = &builtin_op->steps[0]; do { - if (step->flags & UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_ZCOPY) { + if ((step->flags & UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_ZCOPY) && (step->phase->ex_attr.is_variable_len == 0)) { uct_md_mem_dereg(step->uct_md, step->zcopy.memh); - if (step->zcopy.zcomp != NULL) { - ucs_free(step->zcopy.zcomp); - step->zcopy.zcomp = NULL; - } + free_zcomp(step); + } else { + free_zcopy_info(step); //for dynamic sending, dereg for all zcopys } if (step->flags & UCG_BUILTIN_OP_STEP_FLAG_PIPELINED) { - if (step->zcopy.zcomp != NULL) { - ucs_free((void*)step->fragment_pending); - step->fragment_pending = NULL; - } + free_fragment_pending(step); + } + /* Free the allreduce buffer */ + if (step->reduce_buff != 
NULL) { + free(step->reduce_buff); + step->rbuf_count = 0; + step->reduce_buff = NULL; } - ucg_builtin_step_release_contig(step); } while (!((step++)->flags & UCG_BUILTIN_OP_STEP_FLAG_LAST_STEP)); - + ucg_builtin_free((void **)&builtin_op->temp_data_buffer); ucs_mpool_put_inline(op); } @@ -777,6 +1173,10 @@ ucs_status_t ucg_builtin_op_trigger(ucg_op_t *op, ucg_coll_id_t coll_id, ucg_req builtin_req->pending = first_step->fragments_recv * first_step->phase->ep_cnt; builtin_req->recv_comp = 0; + builtin_req->is_send_cb_called = 0; + builtin_req->inc_req_status = UCS_OK; + builtin_req->ladd_req_status = UCS_OK; + builtin_req->plummer_req_status = UCS_OK; slot->step_idx = first_step->am_header.step_idx; ucs_debug("op trigger: step idx %u coll id %u", slot->step_idx, coll_id); @@ -791,7 +1191,15 @@ ucs_status_t ucg_builtin_op_trigger(ucg_op_t *op, ucg_coll_id_t coll_id, ucg_req * some shuffle is required once before starting (e.g. Bruck algorithms). */ if (builtin_op->init_cb != NULL) { +#if ENABLE_UCG_HICOLL + builtin_op->inc_init_status = UCS_OK; +#endif builtin_op->init_cb(builtin_op); +#if ENABLE_UCG_HICOLL + if (builtin_op->inc_init_status != UCS_OK) { + return builtin_op->inc_init_status; + } +#endif } /* Consider optimization, if this operation is used often enough */ @@ -800,13 +1208,30 @@ ucs_status_t ucg_builtin_op_trigger(ucg_op_t *op, ucg_coll_id_t coll_id, ucg_req if (ucs_unlikely(UCS_STATUS_IS_ERR(optm_status))) { return optm_status; } - /* Need to return original status, becuase it can be OK or INPROGRESS */ + /* Need to return original status, because it can be OK or INPROGRESS */ } /* Start the first step, which may actually complete the entire operation */ return ucg_builtin_step_execute(builtin_req, request); } +static size_t ucg_builtin_get_inc_data_length(const ucg_collective_params_t *params) +{ + unsigned inc_header_size = 0; +#if ENABLE_UCG_HICOLL + inc_header_size += inc_get_header_size(); +#endif + enum ucg_collective_modifiers modifiers = params->type.modifiers; + if (modifiers == ucg_predefined_modifiers[UCG_PRIMITIVE_ALLREDUCE] || + modifiers == ucg_predefined_modifiers[UCG_PRIMITIVE_BCAST]) { + unsigned data_size = params->send.count * params->send.dt_len; + return data_size + inc_header_size; + } else if (modifiers == ucg_predefined_modifiers[UCG_PRIMITIVE_BARRIER]) { + return 1 + inc_header_size; + } + return params->send.count * params->send.dt_len; +} + /****************************************************************************** * * * Operation Creation * @@ -818,7 +1243,8 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_send_flags(ucg_builtin_ size_t dt_len, enum ucg_builtin_op_step_flags *send_flag) { - size_t length = step->buffer_length; + size_t length = phase->method == UCG_PLAN_METHOD_INC ? 
ucg_builtin_get_inc_data_length(params) : + step->buffer_length; unsigned partial_length = 0; /* Flag whether to go error and resend data */ @@ -867,17 +1293,6 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_send_flags(ucg_builtin_ step->fragments = length / step->fragment_length + partial_length; } - if (phase->method != UCG_PLAN_METHOD_RECV_TERMINAL && phase->method != UCG_PLAN_METHOD_REDUCE_TERMINAL) { - /* memory registration (using the memory registration cache) */ - ucs_status_t status = ucg_builtin_step_zcopy_prep(step); - if (ucs_unlikely(status != UCS_OK)) { - return status; - } - } else { - /* recv only method */ - return UCS_OK; - } - /* * Medium messages */ @@ -942,7 +1357,9 @@ static UCS_F_ALWAYS_INLINE ucs_status_t ucg_builtin_step_recv_flags(ucg_builtin_ enum ucg_builtin_op_step_flags *recv_flag) { *recv_flag = (enum ucg_builtin_op_step_flags)0; - size_t length = step->buffer_length; + size_t length = phase->ex_attr.is_inequal ? step->buffer_length_recv : step->buffer_length; + length = phase->method == UCG_PLAN_METHOD_INC ? ucg_builtin_get_inc_data_length(params) : + length; size_t fragment_length = 0; unsigned partial_length = 0; @@ -1018,7 +1435,43 @@ size_t ucg_builtin_get_dt_len(ucp_dt_generic_t *dt_gen) return len; } -ucs_status_t ucg_builtin_step_create(ucg_builtin_plan_phase_t *phase, +/* ucg_builtin_coll_params_t init_buf malloc elsewhere and release elsewhere */ +ucg_builtin_coll_params_t *ucg_builtin_allocate_coll_params(unsigned local_member_cnt) +{ + ucg_builtin_coll_params_t *params = + ucs_malloc(sizeof(ucg_builtin_coll_params_t), "allocate variable length params"); + if (params == NULL) { + return NULL; + } + + params->counts = ucs_malloc(local_member_cnt * sizeof(int), "allocate variable length counts"); + if (params->counts == NULL) { + ucg_builtin_free((void **)¶ms); + return NULL; + } + (void)memset_s(params->counts, local_member_cnt * sizeof(int), 0, local_member_cnt * sizeof(int)); + + params->displs = ucs_malloc(local_member_cnt * sizeof(int), "allocate variable length displs"); + if (params->displs == NULL) { + ucg_builtin_free((void **)¶ms->counts); + ucg_builtin_free((void **)¶ms); + return NULL; + } + (void)memset_s(params->displs, local_member_cnt * sizeof(int), 0, local_member_cnt * sizeof(int)); + return params; +} + +void ucg_builtin_free_coll_params(ucg_builtin_coll_params_t **params) +{ + if (*params != NULL) { + ucg_builtin_free((void **)&(*params)->displs); + ucg_builtin_free((void **)&(*params)->counts); + ucg_builtin_free((void **)params); + } +} + +ucs_status_t ucg_builtin_step_create(ucg_builtin_op_t *op, + ucg_builtin_plan_phase_t *phase, ucp_datatype_t send_dtype, ucp_datatype_t recv_dtype, unsigned extra_flags, @@ -1028,6 +1481,7 @@ ucs_status_t ucg_builtin_step_create(ucg_builtin_plan_phase_t *phase, int8_t **current_data_buffer, ucg_builtin_op_step_t *step) { + ucs_status_t status; /* Set the parameters determining the send-flags later on */ int is_send_contig = UCG_DT_IS_CONTIG(params, send_dtype); int is_recv_contig = UCG_DT_IS_CONTIG(params, recv_dtype); @@ -1035,12 +1489,10 @@ ucs_status_t ucg_builtin_step_create(ucg_builtin_plan_phase_t *phase, ucg_builtin_get_dt_len(ucp_dt_generic(send_dtype)); size_t recv_dt_len = is_recv_contig ? 
params->recv.dt_len : ucg_builtin_get_dt_len(ucp_dt_generic(recv_dtype)); + (void)memset_s(step, sizeof(ucg_builtin_op_step_t), 0, sizeof(ucg_builtin_op_step_t)); step->buffer_length = send_dt_len * params->send.count; step->uct_md = phase->md; - if (phase->md) { - step->uct_iface = (phase->ep_cnt == 1) ? phase->single_ep->iface : - phase->multi_eps[0]->iface; - } + /* Note: we assume all the UCT endpoints have the same interface */ step->phase = phase; step->am_id = base_am_id; @@ -1049,16 +1501,108 @@ ucs_status_t ucg_builtin_step_create(ucg_builtin_plan_phase_t *phase, step->iter_ep = 0; step->iter_offset = 0; step->fragment_pending = NULL; + + step->send_coll_params = NULL; + step->recv_coll_params = NULL; + step->variable_length.pack_rank_buffer = NULL; + step->variable_length.pack_rank_func = NULL; + step->variable_length.unpack_rank_func = NULL; + /* allocate the zcopy_info array for dynamic sending */ + step->zcopys = (ucg_builtin_zcopy_info_t *)UCS_ALLOC_CHECK(phase->ep_cnt * sizeof(ucg_builtin_zcopy_info_t), + "ucg_zcopys_info"); + (void)memset_s(step->zcopys, phase->ep_cnt * sizeof(ucg_builtin_zcopy_info_t), + 0, phase->ep_cnt * sizeof(ucg_builtin_zcopy_info_t)); step->recv_buffer = (int8_t*)params->recv.buf; step->send_buffer = ((params->send.buf == MPI_IN_PLACE) || !(extra_flags & UCG_BUILTIN_OP_STEP_FLAG_FIRST_STEP)) ? (int8_t*)params->recv.buf : (int8_t*)params->send.buf; step->send_cb = NULL; - + if (phase->init_phase_cb != NULL) { + status = phase->init_phase_cb(phase, params); + if (status != UCS_OK) { + return status; + } + } step->non_contig.contig_buffer = NULL; step->non_contig.pack_state = NULL; step->non_contig.unpack_state = NULL; step->non_contig.pack_state_recv = NULL; + step->reduce_buff = NULL; + + ucg_builtin_plan_t *builtin_plan = (ucg_builtin_plan_t*)op->super.plan; + + if (phase->method == UCG_PLAN_METHOD_ALLTOALLV_LADD) { + step->send_coll_params = + (ucg_builtin_coll_params_t *)ucs_malloc(sizeof(ucg_builtin_coll_params_t), "allocate var_len_params"); + if (step->send_coll_params == NULL) { + return UCS_ERR_NO_MEMORY; + } + + step->recv_coll_params = + (ucg_builtin_coll_params_t *)ucs_malloc(sizeof(ucg_builtin_coll_params_t), "allocate var_len_params"); + if (step->recv_coll_params == NULL) { + ucg_builtin_free((void **)&step->send_coll_params); + return UCS_ERR_NO_MEMORY; + } + + step->flags |= extra_flags; + step->resend_flag = UCG_BUILTIN_OP_STEP_FIRST_SEND; + step->am_header.remote_offset = 0; + step->remote_offset = step->am_header.remote_offset; + step->send_cb = ucg_builtin_throttled_scatter_alltoallv_cb; + + return UCS_OK; + } + + if (phase->ex_attr.is_plummer) { + if (phase->ex_attr.is_variable_len == 0) { + step->buf_len_unit = phase->ex_attr.member_cnt * sizeof(int); + step->buffer_length = step->buf_len_unit; + step->am_header.remote_offset = phase->ex_attr.packed_rank * step->buffer_length; + /* The remote offset is an absolute value */ + step->remote_offset = step->am_header.remote_offset; + if (phase->step_index == UCG_PLAN_PLUMMER_STEP_INTRA_GATHER_SEND_COUNTS) { + step->send_buffer = (int8_t *)params->send.counts; + step->send_cb = ucg_builtin_plummer_gather_send_counts_cb; + } else { + step->send_buffer = (int8_t *)params->recv.counts; + step->send_cb = ucg_builtin_plummer_gather_recv_counts_cb; + } + } else { + if (phase->send_ep_cnt > 0) { + step->send_coll_params = ucg_builtin_allocate_coll_params(phase->ep_cnt); + if (step->send_coll_params == NULL) { + return UCS_ERR_NO_MEMORY; + } + } + if (phase->recv_ep_cnt > 0) { + 
step->recv_coll_params = ucg_builtin_allocate_coll_params(phase->ep_cnt); + if (step->recv_coll_params == NULL) { + if (step->send_coll_params) { + ucg_builtin_free_coll_params(&(step->send_coll_params)); + } + return UCS_ERR_NO_MEMORY; + } + } + step->flags |= extra_flags; + step->resend_flag = UCG_BUILTIN_OP_STEP_FIRST_SEND; + step->am_header.remote_offset = 0; + step->remote_offset = step->am_header.remote_offset; + + int is_plummer_phase = 0; +#if ENABLE_UCG_HICOLL + is_plummer_phase = phase->method == UCG_PLAN_METHOD_ALLTOALLV_PLUMMER; +#endif + if (phase->step_index == UCG_PLAN_PLUMMER_STEP_INTRA_GATHER_SEND_BUFFERS) { + step->send_cb = ucg_builtin_plummer_gather_send_buffers_cb; + } else if (is_plummer_phase) { + step->send_cb = ucg_builtin_plummer_inter_alltoallv_cb; + } else { + step->send_cb = ucg_builtin_plummer_scatter_recv_buffers_cb; + } + return UCS_OK; + } + } /* special parameter of buffer length should be set for allgather with bruck plan */ if (phase->method == UCG_PLAN_METHOD_ALLGATHER_BRUCK) { @@ -1088,14 +1632,6 @@ ucs_status_t ucg_builtin_step_create(ucg_builtin_plan_phase_t *phase, step->send_cb = ucg_builtin_send_alltoall; } - if (phase->method != UCG_PLAN_METHOD_BCAST_WAYPOINT) { - if (*current_data_buffer) { - step->send_buffer = *current_data_buffer; - } else { - *current_data_buffer = step->recv_buffer; - } - } - if (phase->method == UCG_PLAN_METHOD_REDUCE_SCATTER_RING || phase->method == UCG_PLAN_METHOD_ALLGATHER_RING) { int num_offset_blocks; @@ -1123,7 +1659,7 @@ ucs_status_t ucg_builtin_step_create(ucg_builtin_plan_phase_t *phase, if (phase->method == UCG_PLAN_METHOD_ALLGATHER_RECURSIVE) { size_t power = 1UL << (phase->step_index - 1); - size_t base_index = 0; + size_t base_index; base_index = (g_myidx / power) * power; step->am_header.remote_offset = base_index * params->send.count * params->send.dt_len; @@ -1133,6 +1669,17 @@ ucs_status_t ucg_builtin_step_create(ucg_builtin_plan_phase_t *phase, } step->buffer_length *= power; } + if (phase->ex_attr.is_partial) { + if (builtin_plan->ucg_algo.binary_block == 1) { + step->buffer_length = phase->ex_attr.num_blocks * params->send.dt_len; + step->buf_len_unit = step->buffer_length; + step->am_header.remote_offset = phase->ex_attr.start_block * params->send.dt_len; + step->send_buffer += step->am_header.remote_offset; + step->buffer_length_recv = phase->ex_attr.peer_block * params->send.dt_len; + step->remote_offset = step->am_header.remote_offset; + } + } + ucs_assert(base_am_id < UCP_AM_ID_MAX); /* Decide how the messages are sent (regardless of my role) */ @@ -1140,7 +1687,7 @@ ucs_status_t ucg_builtin_step_create(ucg_builtin_plan_phase_t *phase, recv_flag = (enum ucg_builtin_op_step_flags) 0; send_flag = (enum ucg_builtin_op_step_flags) 0; /* Note: in principle, step->send_buffer should not be changed after this function */ - ucs_status_t status = ucg_builtin_step_send_flags(step, phase, params, send_dt_len, &send_flag); + status = ucg_builtin_step_send_flags(step, phase, params, send_dt_len, &send_flag); extra_flags |= (send_flag & UCG_BUILTIN_OP_STEP_FLAG_FRAGMENTED); if (ucs_unlikely(status != UCS_OK)) { return status; @@ -1168,60 +1715,42 @@ ucs_status_t ucg_builtin_step_create(ucg_builtin_plan_phase_t *phase, extra_flags |= UCG_BUILTIN_OP_STEP_FLAG_LENGTH_PER_REQUEST; /* no break */ case UCG_PLAN_METHOD_REDUCE_WAYPOINT: - extra_flags = ((send_flag & UCG_BUILTIN_OP_STEP_FLAG_FRAGMENTED) && ucg_algo.pipeline) ? 
+ if (phase->ex_attr.is_partial) { + step->buffer_length = params->send.dt_len * params->send.count; + } + extra_flags = ((send_flag & UCG_BUILTIN_OP_STEP_FLAG_FRAGMENTED) && builtin_plan->ucg_algo.pipeline) ? (extra_flags | UCG_BUILTIN_OP_STEP_FLAG_PIPELINED) : extra_flags; extra_flags |= UCG_BUILTIN_OP_STEP_FLAG_RECV_BEFORE_SEND1; step->flags = send_flag | extra_flags; - *current_data_buffer = (int8_t*)ucs_calloc(1, step->buffer_length, "ucg_fanin_waypoint_buffer"); - if (*current_data_buffer == NULL) { - return UCS_ERR_NO_MEMORY; - } - step->send_buffer = *current_data_buffer; - step->recv_buffer = step->send_buffer; - - if (params->send.buf == MPI_IN_PLACE) { - memcpy(step->send_buffer, params->recv.buf, step->buffer_length); - } else { - memcpy(step->send_buffer, params->send.buf, step->buffer_length); - } - - if (send_flag & UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_ZCOPY) { - /* The send buffer changed, reregister it */ - uct_md_mem_dereg(step->uct_md, step->zcopy.memh); - status = uct_md_mem_reg(step->uct_md, step->send_buffer, - step->buffer_length, UCT_MD_MEM_ACCESS_ALL, &step->zcopy.memh); - if (status != UCS_OK) { - if (step->zcopy.zcomp != NULL) { - ucs_free(step->zcopy.zcomp); - step->zcopy.zcomp = NULL; - } - return status; - } - } - - if (!step->recv_buffer) { - return UCS_ERR_NO_MEMORY; + ucs_assert(params->type.modifiers != ucg_predefined_modifiers[UCG_PRIMITIVE_REDUCE]); + step->send_buffer = step->recv_buffer; + if (phase->ex_attr.is_partial) { + unsigned block_length = params->send.count * params->send.dt_len / phase->ex_attr.total_num_blocks; + step->buffer_length = phase->ex_attr.num_blocks * block_length; + step->send_buffer += step->am_header.remote_offset; } break; /* Recv-one, Send-all */ case UCG_PLAN_METHOD_BCAST_WAYPOINT: - extra_flags = ((send_flag & UCG_BUILTIN_OP_STEP_FLAG_FRAGMENTED) && ucg_algo.pipeline) ? + extra_flags = ((send_flag & UCG_BUILTIN_OP_STEP_FLAG_FRAGMENTED) && builtin_plan->ucg_algo.pipeline) ? (extra_flags | UCG_BUILTIN_OP_STEP_FLAG_PIPELINED) : extra_flags; extra_flags |= UCG_BUILTIN_OP_STEP_FLAG_RECV1_BEFORE_SEND; step->flags = send_flag | extra_flags; break; case UCG_PLAN_METHOD_SCATTER_WAYPOINT: - extra_flags = ((send_flag & UCG_BUILTIN_OP_STEP_FLAG_FRAGMENTED) && ucg_algo.pipeline) ? + extra_flags = ((send_flag & UCG_BUILTIN_OP_STEP_FLAG_FRAGMENTED) &&builtin_plan->ucg_algo.pipeline) ? 
(extra_flags | UCG_BUILTIN_OP_STEP_FLAG_PIPELINED) : extra_flags; extra_flags |= UCG_BUILTIN_OP_STEP_FLAG_RECV1_BEFORE_SEND; extra_flags |= UCG_BUILTIN_OP_STEP_FLAG_LENGTH_PER_REQUEST; step->flags = send_flag | extra_flags; - *current_data_buffer = (int8_t*)ucs_calloc(1, step->buffer_length, "ucg_fanout_waypoint_buffer"); if (*current_data_buffer == NULL) { - return UCS_ERR_NO_MEMORY; + *current_data_buffer = (int8_t *)ucs_calloc(1, step->buffer_length, "ucg_fanout_waypoint_buffer"); + if (*current_data_buffer == NULL) { + return UCS_ERR_NO_MEMORY; + } } step->send_buffer = *current_data_buffer; step->recv_buffer = step->send_buffer; @@ -1233,6 +1762,7 @@ ucs_status_t ucg_builtin_step_create(ucg_builtin_plan_phase_t *phase, /* Recursive patterns */ case UCG_PLAN_METHOD_REDUCE_RECURSIVE: case UCG_PLAN_METHOD_ALLGATHER_RECURSIVE: + case UCG_PLAN_METHOD_REDUCE_SCATTER_RECURSIVE: extra_flags |= UCG_BUILTIN_OP_STEP_FLAG_RECV_AFTER_SEND; step->flags = send_flag | extra_flags; break; @@ -1255,15 +1785,36 @@ ucs_status_t ucg_builtin_step_create(ucg_builtin_plan_phase_t *phase, case UCG_PLAN_METHOD_REDUCE_SCATTER_RING: case UCG_PLAN_METHOD_ALLGATHER_RING: + case UCG_PLAN_METHOD_EXCHANGE: extra_flags |= UCG_BUILTIN_OP_STEP_FLAG_RECV_AFTER_SEND; step->flags = send_flag | extra_flags; break; +#if ENABLE_UCG_HICOLL + case UCG_PLAN_METHOD_INC: + extra_flags |= UCG_BUILTIN_OP_STEP_FLAG_RECV_AFTER_SEND; + step->flags = send_flag | extra_flags; + step->send_cb = ucg_builtin_send_inc; + break; +#endif default: ucs_error("Invalid method for a collective operation."); return UCS_ERR_INVALID_PARAM; } + if (send_flag & UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_ZCOPY) { + if (phase->method != UCG_PLAN_METHOD_RECV_TERMINAL && + phase->method != UCG_PLAN_METHOD_REDUCE_TERMINAL) { + /* memory registration (using the memory registration cache) */ + status = ucg_builtin_step_zcopy_prep(step); + if (ucs_unlikely(status != UCS_OK)) { + ucs_error("Failed to register the buffer in zcopy"); + return status; + } + } + } + + status = ucg_builtin_step_recv_flags(step, phase, params, recv_dt_len, &recv_flag); if (status != UCS_OK) { return status; @@ -1276,13 +1827,33 @@ ucs_status_t ucg_builtin_step_create(ucg_builtin_plan_phase_t *phase, if (step->flags & send_flag) { if (phase->method != UCG_PLAN_METHOD_ALLGATHER_RECURSIVE && phase->method != UCG_PLAN_METHOD_REDUCE_SCATTER_RING && - phase->method != UCG_PLAN_METHOD_ALLGATHER_RING) { + phase->method != UCG_PLAN_METHOD_ALLGATHER_RING && + !phase->ex_attr.is_partial) { step->am_header.remote_offset = 0; } } + /* create allreduce buffer */ + if ((phase->method == UCG_PLAN_METHOD_REDUCE_TERMINAL || phase->method == UCG_PLAN_METHOD_REDUCE_WAYPOINT) + && !(send_flag & UCG_BUILTIN_OP_STEP_FLAG_FRAGMENTED) && g_reduce_coinsidency + && (extra_flags & UCG_BUILTIN_OP_STEP_FLAG_FIRST_STEP)) { + step->rbuf_count = step->fragments_recv + * (phase->ep_cnt - ((phase->method == UCG_PLAN_METHOD_REDUCE_TERMINAL) ? 
0 : 1)); + step->reduce_buff = (void *)UCS_ALLOC_CHECK(step->buffer_length * step->rbuf_count, "reduce buffer for child"); + ucs_debug("rb count:%d, ep count:%u", step->rbuf_count, phase->ep_cnt); + } + if (!(send_flag & UCG_BUILTIN_OP_STEP_FLAG_FRAGMENTED) + && ((phase->method == UCG_PLAN_METHOD_REDUCE_WAYPOINT) || (phase->method == UCG_PLAN_METHOD_SEND_TERMINAL)) + && g_reduce_coinsidency) { + unsigned is_allreduce = (ucg_builtin_get_coll_type(&params->type) == COLL_TYPE_ALLREDUCE); + if (is_allreduce) { + ucs_debug("my position %d", g_myposition); + step->am_header.remote_offset = g_myposition; + } + } + /* Pipelining preparation */ - if ((step->flags & UCG_BUILTIN_OP_STEP_FLAG_PIPELINED) && ucg_algo.pipeline) { + if ((step->flags & UCG_BUILTIN_OP_STEP_FLAG_PIPELINED) && builtin_plan->ucg_algo.pipeline) { step->fragment_pending = (uint8_t*)UCS_ALLOC_CHECK(step->fragments * sizeof(uint8_t*), "ucg_builtin_step_pipelining"); } @@ -1290,7 +1861,8 @@ ucs_status_t ucg_builtin_step_create(ucg_builtin_plan_phase_t *phase, if (phase->method != UCG_PLAN_METHOD_ALLGATHER_BRUCK && phase->method != UCG_PLAN_METHOD_ALLTOALL_BRUCK && phase->method != UCG_PLAN_METHOD_REDUCE_SCATTER_RING && - phase->method != UCG_PLAN_METHOD_ALLGATHER_RING) { + phase->method != UCG_PLAN_METHOD_ALLGATHER_RING && + !phase->ex_attr.is_inequal) { recv_flag = (enum ucg_builtin_op_step_flags)step->flags; step->fragments_recv = step->fragments; } @@ -1309,7 +1881,7 @@ ucs_status_t ucg_builtin_step_create(ucg_builtin_plan_phase_t *phase, params->send.count > 0, recv_flag); } -static inline int ucg_builtin_convert_datatype(ucg_builtin_plan_t *builtin_plan, +static inline ucs_status_t ucg_builtin_convert_datatype(ucg_builtin_plan_t *builtin_plan, void *param_datatype, ucp_datatype_t *ucp_datatype) { @@ -1344,7 +1916,10 @@ void ucg_builtin_swap_net_recv(char *netdata, size_t length, size_t offset, ucs_fatal("no memory for malloc, length:%lu", length); } - memcpy(tmp_buffer, netdata, length); + errno_t error_status = memcpy_s(tmp_buffer, length, netdata, length); + if (error_status != EOK) { + ucs_fatal("memcpy_s encountered an error. The error code is %d.", error_status); + } if (gen_dt != NULL) { if (step->recv_cb == ucg_builtin_comp_reduce_full_cb) { ucs_debug("large non-contiguous datatype can not swap here"); @@ -1353,8 +1928,14 @@ gen_dt->ops.unpack(state_unpack, offset, tmp_buffer, length); } } else { - memcpy(netdata, recv_buffer + offset, length); - memcpy(recv_buffer + offset, tmp_buffer, length); + error_status = memcpy_s(netdata, length, recv_buffer + offset, length); + if (error_status != EOK) { + ucs_fatal("memcpy_s encountered an error. The error code is %d.", error_status); + } + error_status = memcpy_s(recv_buffer + offset, length, tmp_buffer, length); + if (error_status != EOK) { + ucs_fatal("memcpy_s encountered an error. The error code is %d.", error_status); + } } free(tmp_buffer); @@ -1373,18 +1954,23 @@ ucs_status_t ucg_builtin_op_create(ucg_plan_t *plan, ucg_builtin_op_t *op = (ucg_builtin_op_t*) ucs_mpool_get_inline(&builtin_plan->op_mp); + ucg_builtin_group_ctx_t *builtin_ctx = UCG_GROUP_TO_COMPONENT_CTX(ucg_builtin_component, plan->group); if (op == NULL) { return UCS_ERR_NO_MEMORY; } ucg_builtin_op_step_t *next_step = &op->steps[0]; unsigned am_id = builtin_plan->am_id; - int8_t *current_data_buffer = NULL; + op->super.plan = plan; /* obtain UCX datatypes corresponding to the external datatypes passed */ op->dtspan_f = 
builtin_plan->dtspan_f; op->send_dt = NULL; op->recv_dt = NULL; + op->temp_data_buffer = NULL; + op->temp_data_buffer1 = NULL; + op->temp_exchange_buffer = NULL; + op->temp_exchange_buffer1 = NULL; if (params->send.count > 0 && params->send.dt_len > 0) { status = ucg_builtin_convert_datatype(builtin_plan, params->send.dt_ext, &send_dtype); if (ucs_unlikely(status != UCS_OK)) { @@ -1406,6 +1992,8 @@ ucs_status_t ucg_builtin_op_create(ucg_plan_t *plan, /* get number of processes */ num_procs = (unsigned)(ucg_group_get_params(plan->group))->member_count; g_myidx = plan->my_index; + g_myposition = plan->up_offset; + g_reduce_coinsidency = ucg_is_allreduce_consistency(builtin_ctx); ucs_debug("ucg rank: %" PRIu64 " phase cnt %u", g_myidx, phase_count); /* Select the right initialization callback */ status = ucg_builtin_op_select_callback(builtin_plan, UCG_DT_IS_CONTIG(params, send_dtype), UCG_DT_IS_CONTIG(params, recv_dtype), &op->init_cb, &op->final_cb); @@ -1416,32 +2004,32 @@ ucs_status_t ucg_builtin_op_create(ucg_plan_t *plan, /* Create a step in the op for each phase in the topology */ if (phase_count == 1) { /* The only step in the plan */ - status = ucg_builtin_step_create(next_phase, send_dtype, recv_dtype, + status = ucg_builtin_step_create(op, next_phase, send_dtype, recv_dtype, UCG_BUILTIN_OP_STEP_FLAG_FIRST_STEP | UCG_BUILTIN_OP_STEP_FLAG_LAST_STEP, am_id, plan->group_id, params, - ¤t_data_buffer, next_step); + &(op->temp_data_buffer), next_step); } else { /* First step of many */ - status = ucg_builtin_step_create(next_phase, send_dtype, recv_dtype, + status = ucg_builtin_step_create(op, next_phase, send_dtype, recv_dtype, UCG_BUILTIN_OP_STEP_FLAG_FIRST_STEP, am_id, plan->group_id, - params, ¤t_data_buffer, next_step); + params, &(op->temp_data_buffer), next_step); if (ucs_unlikely(status != UCS_OK)) { goto op_cleanup; } ucg_step_idx_ext_t step_cnt; for (step_cnt = 1; step_cnt < phase_count - 1; step_cnt++) { - status = ucg_builtin_step_create(++next_phase, send_dtype, recv_dtype, 0, am_id, - plan->group_id, params, ¤t_data_buffer, ++next_step); + status = ucg_builtin_step_create(op, ++next_phase, send_dtype, recv_dtype, 0, am_id, + plan->group_id, params, &(op->temp_data_buffer), ++next_step); if (ucs_unlikely(status != UCS_OK)) { goto op_cleanup; } } /* Last step gets a special flag */ - status = ucg_builtin_step_create(++next_phase, send_dtype, recv_dtype, + status = ucg_builtin_step_create(op, ++next_phase, send_dtype, recv_dtype, UCG_BUILTIN_OP_STEP_FLAG_LAST_STEP, am_id, plan->group_id, - params, ¤t_data_buffer, ++next_step); + params, &(op->temp_data_buffer), ++next_step); } if (ucs_unlikely(status != UCS_OK)) { goto op_cleanup; diff --git a/builtin/ops/builtin_ops.h b/builtin/ops/builtin_ops.h index b719bdc..6b5d96d 100644 --- a/builtin/ops/builtin_ops.h +++ b/builtin/ops/builtin_ops.h @@ -1,16 +1,15 @@ /* * Copyright (C) Huawei Technologies Co., Ltd. 2019-2020. ALL RIGHTS RESERVED. - * See file LICENSE for terms. + * Notes: See file LICENSE for terms. */ #ifndef UCG_BUILTIN_OPS_H_ #define UCG_BUILTIN_OPS_H_ -BEGIN_C_DECLS - #include "../plan/builtin_plan.h" #include +BEGIN_C_DECLS /* * The built-in collective operations are composed of one or more steps. * In each step, we apply a method to a subgroup of peer processes. 
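The ops changes above repeatedly walk an operation's step chain with a flag-terminated loop rather than a stored count: ucg_builtin_op_t ends in a flexible steps[] array (declared in this header) whose final entry carries UCG_BUILTIN_OP_STEP_FLAG_LAST_STEP, which is how ucg_builtin_op_discard() knows where to stop. A minimal sketch of that traversal pattern, assuming only the types in this header (the visit callback is hypothetical, not part of the patch):

    /* Sketch: visit each step of a builtin op up to and including the entry
     * flagged UCG_BUILTIN_OP_STEP_FLAG_LAST_STEP -- the same loop shape as
     * ucg_builtin_op_discard() above. visit() is a hypothetical callback. */
    static void ucg_builtin_for_each_step(ucg_builtin_op_t *op,
                                          void (*visit)(ucg_builtin_op_step_t *step))
    {
        ucg_builtin_op_step_t *step = &op->steps[0];
        do {
            visit(step);
        } while (!((step++)->flags & UCG_BUILTIN_OP_STEP_FLAG_LAST_STEP));
    }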
@@ -38,10 +37,6 @@ extern unsigned builtin_base_am_id; extern ucg_group_member_index_t g_myidx; extern unsigned num_procs; -#ifndef MPI_IN_PLACE -#define MPI_IN_PLACE ((void*)0x1) -#endif - typedef union ucg_builtin_header { struct { ucg_group_id_t group_id; @@ -57,6 +52,11 @@ typedef union ucg_builtin_header { uint64_t header; } ucg_builtin_header_t; +typedef struct ucg_builtin_header_ext { + ucg_builtin_header_t header; + ucg_group_member_index_t src_rank; +} UCS_S_PACKED ucg_builtin_header_ext_t; + /* * The builtin operation */ @@ -77,6 +77,7 @@ enum ucg_builtin_op_step_flags { UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_SHORT = UCS_BIT(9), UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_BCOPY = UCS_BIT(10), UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_ZCOPY = UCS_BIT(11), + UCG_BUILTIN_OP_STEP_FLAG_SEND_AM_DYNAMIC = UCS_BIT(12), }; enum ucg_builtin_op_step_displs_rule { @@ -90,6 +91,21 @@ enum ucg_builtin_op_step_resend_flag { UCG_BUILTIN_OP_STEP_RESEND, }; +enum ucg_builtin_op_dt_option { + /* Option for non-contig dt */ + UCG_BUILTIN_OP_DT_RECV, + UCG_BUILTIN_OP_DT_SEND, + UCG_BUILTIN_OP_DT_SWAP, +}; + +enum Plummer_step_modifier { + UCG_PLAN_PLUMMER_STEP_INTRA_GATHER_SEND_COUNTS = 0, + UCG_PLAN_PLUMMER_STEP_INTRA_GATHER_SEND_BUFFERS = 1, + UCG_PLAN_PLUMMER_STEP_INTRA_GATHER_RECV_COUNTS = 2, + UCG_PLAN_PLUMMER_STEP_INTER_ALLTOALLV = 3, + UCG_PLAN_PLUMMER_STEP_INTRA_SCATTER_RECV_BUFFERS = 4, +}; + /* Definitions of several callback functions, used during an operation */ typedef struct ucg_builtin_op ucg_builtin_op_t; typedef struct ucg_builtin_request ucg_builtin_request_t; @@ -107,9 +123,30 @@ typedef struct ucg_builtin_zcomp { ucg_builtin_request_t *req; } ucg_builtin_zcomp_t; +typedef struct ucg_builtin_coll_params { + int8_t *init_buf; + int *counts; + int *displs; +} ucg_builtin_coll_params_t; + +ucg_builtin_coll_params_t *ucg_builtin_allocate_coll_params(unsigned local_member_cnt); +void ucg_builtin_free_coll_params(ucg_builtin_coll_params_t **params); + +typedef struct ucg_builtin_zcopy_info { + uct_md_h uct_md; + uct_mem_h memh; + ucg_builtin_zcomp_t *zcomp; + uint32_t num_store; /* < number of step's store zcopy messages */ + uint32_t zcopy_pending; +} ucg_builtin_zcopy_info_t; + +typedef void *(*ucg_builtin_pack_rank_cb_t)(void *step, const void *send_buffer, size_t buffer_len, + size_t *new_buffer_len); +typedef ucg_group_member_index_t (*ucg_builtin_unpack_rank_cb_t)(const void *send_buffer, size_t buffer_len); + typedef struct ucg_builtin_op_step { uint16_t flags; /* @ref enum ucg_builtin_op_step_flags */ - uint8_t iter_ep; /* iterator, somewhat volatile */ + uint32_t iter_ep; /* iterator, somewhat volatile */ ucg_offset_t iter_offset; /* iterator, somewhat volatile */ ucg_offset_t remote_offset; /* for algorithm like ring */ #define UCG_BUILTIN_OFFSET_PIPELINE_READY ((ucg_offset_t)-1) @@ -119,11 +156,15 @@ typedef struct ucg_builtin_op_step { uct_md_h uct_md; ucg_builtin_plan_phase_t *phase; + ucg_builtin_coll_params_t *send_coll_params; + ucg_builtin_coll_params_t *recv_coll_params; + int8_t *send_buffer; int8_t *recv_buffer; size_t buffer_length; size_t buffer_length_recv; ucg_builtin_header_t am_header; + ucg_builtin_header_ext_t am_header_ext; /* extended header with rank */ uint32_t am_id; size_t buf_len_unit; /* only for discrete buffer sending */ @@ -146,6 +187,13 @@ typedef struct ucg_builtin_op_step { ucg_builtin_comp_send_cb_t send_cb; ucg_builtin_comp_recv_cb_t recv_cb; + /* Fields intended for send and receive variable length */ + struct { + int8_t *pack_rank_buffer; + ucg_builtin_pack_rank_cb_t 
pack_rank_func; + ucg_builtin_unpack_rank_cb_t unpack_rank_func; + } variable_length; + /* Fields intended for non-contig datatypes */ struct { int8_t *contig_buffer; @@ -160,6 +208,14 @@ typedef struct ucg_builtin_op_step { ucg_builtin_zcomp_t *zcomp; uint32_t num_store; /* < number of step's store zcopy messages */ } zcopy; + + /* for dynamic sending, the array of zcopy is used */ + ucg_builtin_zcopy_info_t *zcopys; + + /* Terminal or Waypoint node of the allreduce tree-algo needs to + alloc the buffer to save the child rank value */ + void *reduce_buff; + uint32_t rbuf_count; /* element count of the reduce_buff */ } ucg_builtin_op_step_t; typedef struct ucg_builtin_comp_slot ucg_builtin_comp_slot_t; @@ -174,6 +230,11 @@ struct ucg_builtin_op { dt_span_f dtspan_f; ucg_builtin_comp_slot_t *slots; /**< slots pointer, for faster initialization */ ucs_list_link_t *resend; /**< resend pointer, for faster resend */ + ucs_status_t inc_init_status; + int8_t *temp_data_buffer; /**< temp buffer for reduce and scatter way-point */ + int8_t *temp_data_buffer1; /**< temp buffer for reduce and scatter way-point */ + int8_t *temp_exchange_buffer; /**< temp buffer for exchanging data */ + int8_t *temp_exchange_buffer1; /**< temp buffer for exchanging data */ ucg_builtin_op_step_t steps[]; /**< steps required to complete the operation */ }; @@ -189,9 +250,14 @@ struct ucg_builtin_request { ucg_request_t *comp_req; /**< completion status is written here */ ucs_list_link_t send_list; /**< membership in progress list */ unsigned recv_comp; /**< if recv is complete, only use in r1s */ + ucs_status_t inc_req_status; + ucs_status_t ladd_req_status; + ucs_status_t plummer_req_status; + unsigned is_send_cb_called; /**< whether send_cb has been called */ }; -ucs_status_t ucg_builtin_step_create (ucg_builtin_plan_phase_t *phase, +ucs_status_t ucg_builtin_step_create (ucg_builtin_op_t *op, + ucg_builtin_plan_phase_t *phase, ucp_datatype_t send_dtype, ucp_datatype_t recv_dtype, unsigned extra_flags, @@ -217,6 +283,9 @@ void ucg_builtin_swap_net_recv(char *netdata, size_t length, size_t offset, size_t ucg_builtin_get_dt_len(ucp_dt_generic_t *dt_gen); +ucs_status_t ucg_builtin_step_alloc_pack_rank_buffer(ucg_builtin_op_step_t *step, + size_t buffer_length); +void ucg_builtin_step_free_pack_rank_buffer(ucg_builtin_op_step_t *step); /* * Incoming messages are processed for one of the collective operations * currently outstanding - arranged in as a window (think: TCP) of slots. * @@ -226,9 +295,13 @@ size_t ucg_builtin_get_dt_len(ucp_dt_generic_t *dt_gen); * a "header", a.k.a. "immediate value" (see UCT API), which refers to the * location to apply (write or reduce) the payload within the local buffer. */ + +typedef void (*ucg_desc_release_func_t)(void *); + typedef struct ucg_builtin_comp_desc { ucp_recv_desc_t super; - char padding[UCP_WORKER_HEADROOM_PRIV_SIZE]; + ucg_desc_release_func_t release; + char padding[UCP_WORKER_HEADROOM_PRIV_SIZE - sizeof(ucg_desc_release_func_t)]; ucg_builtin_header_t header; char data[0]; } ucg_builtin_comp_desc_t; @@ -265,7 +338,7 @@ struct ucg_builtin_comp_slot { * is 0 but expected be 1. So we use UCG_DT_IS_CONTIG instead. */ #define UCG_DT_IS_CONTIG(_params, _datatype) \ - ((_params->send.dt_len) ? (UCP_DT_IS_CONTIG(_datatype)) : 1) + (((_params)->send.dt_len) ? 
(UCP_DT_IS_CONTIG(_datatype)) : 1) END_C_DECLS diff --git a/builtin/plan/builtin_algo_check.c b/builtin/plan/builtin_algo_check.c new file mode 100644 index 0000000..7182942 --- /dev/null +++ b/builtin/plan/builtin_algo_check.c @@ -0,0 +1,573 @@ +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. + * Description: Algorithm decision for collective operation + */ + + +#include +#include + +#include "builtin_plan.h" +#include "builtin_algo_decision.h" + +typedef enum { + CHECK_NON_CONTIG_DATATYPE, + CHECK_NON_COMMUTATIVE, + CHECK_NAP_UNSUPPORT, + CHECK_RABEN_UNSUPPORT, + CHECK_NAWARE_RABEN_UNSUPPORT, + CHECK_SAWARE_RABEN_UNSUPPORT, + CHECK_BIND_TO_NONE, + CHECK_PPN_UNBALANCE, + CHECK_NRANK_UNCONTINUE, + CHECK_PPS_UNBALANCE, + CHECK_SRANK_UNCONTINUE, + CHECK_LARGE_DATATYPE, + CHECK_PHASE_SEGMENT, + CHECK_INC_UNSUPPORT, + CHECK_MPI_IN_PLACE, + CHECK_PLUMMER_UNSUPPORT, + /* The new check item must be added above */ + CHECK_ITEM_NUMS +} check_item_t; + +static const char *check_item_str_array[CHECK_ITEM_NUMS] = { + "non_contig_datatype", + "non_commutative", + "nap_unsupport", + "raben_unsupport", + "naware_raben_unsupport", + "saware_raben_unsupport", + "bind_to_none", + "ppn_unbalance", + "nrank_uncontinue", + "pps_unbalance", + "srank_uncontinue", + "large_datatype", + "phase_segment", + "inc_unsupport", + "mpi_in_place", + "plummer_unsupport", +}; + +static int ucg_builtin_check_non_contig_datatype(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + ucp_datatype_t ucp_datatype; + if (coll_params->send.count <= 0 || coll_params->send.dt_len <= 0) { + return 0; + } + + group_params->mpi_dt_convert(coll_params->send.dt_ext, &ucp_datatype); + return !UCP_DT_IS_CONTIG(ucp_datatype); +} + +static int ucg_builtin_check_non_commutative(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + return !group_params->op_is_commute_f(coll_params->send.op_ext); +} + +static int ucg_builtin_check_nap_unsupport(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + unsigned node_nums; + int ppn_local; + ppn_local = group_params->topo_args.ppn_local; + node_nums = group_params->member_count / group_params->topo_args.ppn_local; + return (group_params->topo_args.ppn_unbalance || ppn_local <= 1 || (node_nums & (node_nums - 1))); +} + +static int ucg_builtin_check_raben_unsupport(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + /* Raben does not support odd number of processes */ + const int even_number = 2; + return (group_params->member_count % even_number); +} + +static int ucg_builtin_check_naware_raben_unsupport(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + int ppn_local; + const int even_number = 2; + + ppn_local = group_params->topo_args.ppn_local; + return (group_params->topo_args.ppn_unbalance || (ppn_local % even_number == 1 && ppn_local != 1)); +} + +static int ucg_builtin_check_saware_raben_unsupport(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + int pps_local; + const int even_number = 2; + + pps_local = group_params->topo_args.pps_local; + return (group_params->topo_args.pps_unbalance || (pps_local % even_number == 1 && pps_local != 1)); +} + +static int ucg_builtin_check_bind_to_none(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + return group_params->topo_args.bind_to_none; +} + +static 
int ucg_builtin_check_ppn_unbalance(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + return group_params->topo_args.ppn_unbalance; +} + +static int ucg_builtin_check_nrank_uncontinue(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + return group_params->topo_args.nrank_uncontinue; +} + +static int ucg_builtin_check_pps_unbalance(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + return group_params->topo_args.pps_unbalance; +} + +static int ucg_builtin_check_srank_uncontinue(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + return group_params->topo_args.srank_uncontinue || group_params->topo_args.nrank_uncontinue; +} + +static int ucg_builtin_check_large_datatype(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + const int large_datatype_threshold = 32; + return (coll_params->send.dt_len > large_datatype_threshold); +} + +static int ucg_builtin_check_phase_segment(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + int count = coll_params->send.count; + size_t dt_len = coll_params->send.dt_len; + +#define UCG_SHORT_THRESHHOLD 176 +#define UCT_MIN_SHORT_ONE_LEN 80 +#define UCT_MIN_BCOPY_ONE_LEN 1000 + if (dt_len > UCT_MIN_BCOPY_ONE_LEN) { + return 1; + } + if (dt_len > UCT_MIN_SHORT_ONE_LEN && (dt_len * count) <= UCG_SHORT_THRESHHOLD) { + return 1; + } +#undef UCG_SHORT_THRESHHOLD +#undef UCT_MIN_SHORT_ONE_LEN +#undef UCT_MIN_BCOPY_ONE_LEN + return 0; +} + +static int ucg_builtin_check_inc_unsupport(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + int enable_inc = 0; +#if ENABLE_UCG_HICOLL + enable_inc = inc_used(group_params); +#endif + return !enable_inc; +} + +static int ucg_builtin_check_mpi_in_place(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + return coll_params->send.buf == MPI_IN_PLACE; +} + +STATIC_GTEST int ucg_builtin_check_plummer_unsupport(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + ucg_group_member_index_t i; + for (i = 0; i < group_params->member_count; i++) { + if ((coll_params->send.displs[i] < 0) || (coll_params->recv.displs[i] < 0)) { + return 1; + } + } + if ((coll_params->send.displs[0] != 0) || (coll_params->recv.displs[0] != 0)) { + return 1; + } + for (i = 0; i < (group_params->member_count - 1); i++) { + if ((coll_params->send.displs[i + 1] != (coll_params->send.displs[i] + coll_params->send.counts[i])) || + (coll_params->recv.displs[i + 1] != (coll_params->recv.displs[i] + coll_params->recv.counts[i]))) { + return 1; + } + } + return 0; +} + +typedef int (*check_f)(const ucg_group_params_t *group_params, const ucg_collective_params_t *coll_params); + +static check_f check_fun_array[CHECK_ITEM_NUMS] = { + ucg_builtin_check_non_contig_datatype, + ucg_builtin_check_non_commutative, + ucg_builtin_check_nap_unsupport, + ucg_builtin_check_raben_unsupport, + ucg_builtin_check_naware_raben_unsupport, + ucg_builtin_check_saware_raben_unsupport, + ucg_builtin_check_bind_to_none, + ucg_builtin_check_ppn_unbalance, + ucg_builtin_check_nrank_uncontinue, + ucg_builtin_check_pps_unbalance, + ucg_builtin_check_srank_uncontinue, + ucg_builtin_check_large_datatype, + ucg_builtin_check_phase_segment, + ucg_builtin_check_inc_unsupport, + ucg_builtin_check_mpi_in_place, + ucg_builtin_check_plummer_unsupport, +}; + 
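The predicates above are consumed through per-algorithm fallback tables (defined next): each row pairs a check item with the algorithm to retry if that check fires, and ucg_builtin_algo_check_fallback() at the end of this file chases those pairs until an algorithm passes every check in its row. A toy standalone version of that resolution loop, assuming nothing beyond the shapes shown in this patch (the hard-coded table below is illustrative, not from the code):

    /* Toy fallback resolution: keep chasing (check, fallback) pairs until an
     * algorithm passes all of its checks. Mirrors the loop shape of
     * ucg_builtin_algo_check_fallback() below; the data here is made up. */
    #include <stdio.h>

    typedef struct { int fires; int algo_fb; } toy_chk_t;

    static int toy_resolve(int algo, const toy_chk_t *tbl[], const int sizes[])
    {
        int prev = 0;
        while (algo != prev) {
            prev = algo;
            for (int i = 0; i < sizes[algo]; i++) {
                if (tbl[algo][i].fires) {        /* check predicate fired */
                    algo = tbl[algo][i].algo_fb; /* fall back and re-check */
                    break;
                }
            }
        }
        return algo;
    }

    int main(void)
    {
        /* algo 3 falls back to 2 (e.g. unbalanced PPN), algo 2 to 1, and
         * algo 1 has no firing checks, so the chain settles on 1. */
        toy_chk_t a3[] = {{1, 2}}, a2[] = {{1, 1}}, a1[] = {{0, 0}};
        const toy_chk_t *tbl[] = {NULL, a1, a2, a3};
        const int sizes[] = {0, 1, 1, 1};
        printf("%d\n", toy_resolve(3, tbl, sizes)); /* prints 1 */
        return 0;
    }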
+typedef struct { + check_item_t chk_item; + int algo_fb; +} check_fallback_t; + +typedef struct { + check_fallback_t *chkfb; + int chkfb_size; +} chkfb_tbl_t; + +#define CHKFB_BARRIER(n) \ + chkfb_barrier_algo##n + +#define CHKFB_SIZE_BARRIER(n) \ + (sizeof(chkfb_barrier_algo##n) / sizeof(chkfb_barrier_algo##n[0])) + +#define CHKFB_BCAST(n) \ + chkfb_bcast_algo##n + +#define CHKFB_SIZE_BCAST(n) \ + (sizeof(chkfb_bcast_algo##n) / sizeof(chkfb_bcast_algo##n[0])) + +#define CHKFB_ALLREDUCE(n) \ + chkfb_allreduce_algo##n + +#define CHKFB_SIZE_ALLREDUCE(n) \ + (sizeof(chkfb_allreduce_algo##n) / sizeof(chkfb_allreduce_algo##n[0])) + +#define CHKFB_ALLTOALLV(n) \ + chkfb_alltoallv_algo##n + +#define CHKFB_SIZE_ALLTOALLV(n) \ + (sizeof(chkfb_alltoallv_algo##n) / sizeof(chkfb_alltoallv_algo##n[0])) + +static check_fallback_t chkfb_allreduce_algo2[] = { + {CHECK_NON_CONTIG_DATATYPE, 1}, + {CHECK_NON_COMMUTATIVE, 1}, + {CHECK_LARGE_DATATYPE, 1}, +}; + +static check_fallback_t chkfb_allreduce_algo3[] = { + {CHECK_NON_CONTIG_DATATYPE, 1}, + {CHECK_NON_COMMUTATIVE, 1}, + {CHECK_BIND_TO_NONE, 2}, + {CHECK_PPN_UNBALANCE, 2}, + {CHECK_PPS_UNBALANCE, 2}, + {CHECK_SRANK_UNCONTINUE, 2}, + {CHECK_LARGE_DATATYPE, 1}, +}; + +static check_fallback_t chkfb_allreduce_algo4[] = { + {CHECK_NON_CONTIG_DATATYPE, 1}, + {CHECK_NON_COMMUTATIVE, 1}, + {CHECK_PHASE_SEGMENT, 1}, +}; + +static check_fallback_t chkfb_allreduce_algo5[] = { + {CHECK_NON_CONTIG_DATATYPE, 1}, + {CHECK_NON_COMMUTATIVE, 1}, + {CHECK_PPN_UNBALANCE, 2}, + {CHECK_NRANK_UNCONTINUE, 2}, + {CHECK_LARGE_DATATYPE, 1}, +}; + +static check_fallback_t chkfb_allreduce_algo6[] = { + {CHECK_NON_CONTIG_DATATYPE, 1}, + {CHECK_NON_COMMUTATIVE, 1}, + {CHECK_BIND_TO_NONE, 5}, + {CHECK_PPN_UNBALANCE, 2}, + {CHECK_PPS_UNBALANCE, 5}, + {CHECK_SRANK_UNCONTINUE, 5}, + {CHECK_LARGE_DATATYPE, 1}, +}; + +static check_fallback_t chkfb_allreduce_algo7[] = { + {CHECK_NON_CONTIG_DATATYPE, 1}, + {CHECK_NON_COMMUTATIVE, 1}, + {CHECK_PPN_UNBALANCE, 2}, + {CHECK_NRANK_UNCONTINUE, 2}, + {CHECK_LARGE_DATATYPE, 1}, +}; + +static check_fallback_t chkfb_allreduce_algo8[] = { + {CHECK_NON_CONTIG_DATATYPE, 1}, + {CHECK_NON_COMMUTATIVE, 1}, + {CHECK_BIND_TO_NONE, 7}, + {CHECK_PPN_UNBALANCE, 2}, + {CHECK_PPS_UNBALANCE, 7}, + {CHECK_SRANK_UNCONTINUE, 7}, + {CHECK_LARGE_DATATYPE, 1}, +}; + +static check_fallback_t chkfb_allreduce_algo9[] = { + {CHECK_NON_CONTIG_DATATYPE, 1}, + {CHECK_NON_COMMUTATIVE, 1}, + {CHECK_PPN_UNBALANCE, 2}, + {CHECK_NRANK_UNCONTINUE, 2}, + {CHECK_LARGE_DATATYPE, 1}, + {CHECK_INC_UNSUPPORT, 3}, +}; + +static check_fallback_t chkfb_allreduce_algo10[] = { + {CHECK_NON_CONTIG_DATATYPE, 1}, + {CHECK_NON_COMMUTATIVE, 1}, + {CHECK_BIND_TO_NONE, 2}, + {CHECK_PPN_UNBALANCE, 2}, + {CHECK_PPS_UNBALANCE, 2}, + {CHECK_SRANK_UNCONTINUE, 2}, + {CHECK_LARGE_DATATYPE, 1}, +}; + +static check_fallback_t chkfb_allreduce_algo11[] = { + {CHECK_NON_CONTIG_DATATYPE, 1}, + {CHECK_NON_COMMUTATIVE, 1}, + {CHECK_NAP_UNSUPPORT, 2}, + {CHECK_BIND_TO_NONE, 2}, + {CHECK_NRANK_UNCONTINUE, 2}, + {CHECK_LARGE_DATATYPE, 1}, +}; + +static check_fallback_t chkfb_allreduce_algo12[] = { + {CHECK_NON_CONTIG_DATATYPE, 1}, + {CHECK_NON_COMMUTATIVE, 1}, + {CHECK_RABEN_UNSUPPORT, 4}, + {CHECK_BIND_TO_NONE, 2}, + {CHECK_LARGE_DATATYPE, 4}, +}; + +static check_fallback_t chkfb_allreduce_algo13[] = { + {CHECK_NON_CONTIG_DATATYPE, 1}, + {CHECK_NON_COMMUTATIVE, 1}, + {CHECK_RABEN_UNSUPPORT, 4}, + {CHECK_NAWARE_RABEN_UNSUPPORT, 12}, + {CHECK_BIND_TO_NONE, 2}, + {CHECK_PPN_UNBALANCE, 12}, + {CHECK_NRANK_UNCONTINUE, 
12}, + {CHECK_LARGE_DATATYPE, 4}, +}; + +static check_fallback_t chkfb_allreduce_algo14[] = { + {CHECK_NON_CONTIG_DATATYPE, 1}, + {CHECK_NON_COMMUTATIVE, 1}, + {CHECK_RABEN_UNSUPPORT, 4}, + {CHECK_SAWARE_RABEN_UNSUPPORT, 12}, + {CHECK_BIND_TO_NONE, 2}, + {CHECK_PPN_UNBALANCE, 12}, + {CHECK_PPS_UNBALANCE, 12}, + {CHECK_SRANK_UNCONTINUE, 12}, + {CHECK_LARGE_DATATYPE, 4}, +}; + +static check_fallback_t chkfb_barrier_algo3[] = { + {CHECK_BIND_TO_NONE, 2}, + {CHECK_PPN_UNBALANCE, 2}, + {CHECK_PPS_UNBALANCE, 2}, + {CHECK_SRANK_UNCONTINUE, 2}, +}; + +static check_fallback_t chkfb_barrier_algo4[] = { + {CHECK_PPN_UNBALANCE, 2}, + {CHECK_NRANK_UNCONTINUE, 2}, +}; + +static check_fallback_t chkfb_barrier_algo5[] = { + {CHECK_BIND_TO_NONE, 4}, + {CHECK_PPN_UNBALANCE, 2}, + {CHECK_PPS_UNBALANCE, 4}, + {CHECK_SRANK_UNCONTINUE, 4}, +}; + +static check_fallback_t chkfb_barrier_algo6[] = { + {CHECK_PPN_UNBALANCE, 2}, + {CHECK_NRANK_UNCONTINUE, 2}, +}; + +static check_fallback_t chkfb_barrier_algo7[] = { + {CHECK_BIND_TO_NONE, 6}, + {CHECK_PPN_UNBALANCE, 2}, + {CHECK_PPS_UNBALANCE, 6}, + {CHECK_SRANK_UNCONTINUE, 6}, +}; + +static check_fallback_t chkfb_barrier_algo8[] = { + {CHECK_PPN_UNBALANCE, 2}, + {CHECK_NRANK_UNCONTINUE, 2}, + {CHECK_INC_UNSUPPORT, 2}, +}; + +static check_fallback_t chkfb_barrier_algo9[] = { + {CHECK_BIND_TO_NONE, 2}, + {CHECK_PPN_UNBALANCE, 2}, + {CHECK_PPS_UNBALANCE, 2}, + {CHECK_SRANK_UNCONTINUE, 2}, + {CHECK_INC_UNSUPPORT, 3}, +}; + +static check_fallback_t chkfb_barrier_algo10[] = { + {CHECK_NAP_UNSUPPORT, 2}, + {CHECK_BIND_TO_NONE, 2}, + {CHECK_NRANK_UNCONTINUE, 2}, +}; + +static check_fallback_t chkfb_bcast_algo3[] = { + {CHECK_PPN_UNBALANCE, 2}, + {CHECK_NRANK_UNCONTINUE, 2}, +}; + +static check_fallback_t chkfb_bcast_algo4[] = { + {CHECK_PPN_UNBALANCE, 2}, + {CHECK_NRANK_UNCONTINUE, 2}, +}; + +static check_fallback_t chkfb_bcast_algo5[] = { + {CHECK_PPN_UNBALANCE, 2}, + {CHECK_NRANK_UNCONTINUE, 2}, + {CHECK_INC_UNSUPPORT, 2}, +}; + +static check_fallback_t chkfb_alltoallv_algo2[] = { + {CHECK_PPN_UNBALANCE, 1}, + {CHECK_NRANK_UNCONTINUE, 1}, + {CHECK_MPI_IN_PLACE, 1}, + {CHECK_PLUMMER_UNSUPPORT, 1}, +}; + +chkfb_tbl_t chkfb_barrier[UCG_ALGORITHM_BARRIER_LAST] = { + {NULL, 0}, + {NULL, 0}, + {NULL, 0}, + {CHKFB_BARRIER(3), CHKFB_SIZE_BARRIER(3)}, + {CHKFB_BARRIER(4), CHKFB_SIZE_BARRIER(4)}, + {CHKFB_BARRIER(5), CHKFB_SIZE_BARRIER(5)}, + {CHKFB_BARRIER(6), CHKFB_SIZE_BARRIER(6)}, + {CHKFB_BARRIER(7), CHKFB_SIZE_BARRIER(7)}, + {CHKFB_BARRIER(8), CHKFB_SIZE_BARRIER(8)}, + {CHKFB_BARRIER(9), CHKFB_SIZE_BARRIER(9)}, + {CHKFB_BARRIER(10), CHKFB_SIZE_BARRIER(10)}, +}; + +chkfb_tbl_t chkfb_bcast[UCG_ALGORITHM_BCAST_LAST] = { + {NULL, 0}, + {NULL, 0}, + {NULL, 0}, + {CHKFB_BCAST(3), CHKFB_SIZE_BCAST(3)}, + {CHKFB_BCAST(4), CHKFB_SIZE_BCAST(4)}, + {CHKFB_BCAST(5), CHKFB_SIZE_BCAST(5)}, +}; + +chkfb_tbl_t chkfb_allreduce[UCG_ALGORITHM_ALLREDUCE_LAST] = { + {NULL, 0}, + {NULL, 0}, + {CHKFB_ALLREDUCE(2), CHKFB_SIZE_ALLREDUCE(2)}, + {CHKFB_ALLREDUCE(3), CHKFB_SIZE_ALLREDUCE(3)}, + {CHKFB_ALLREDUCE(4), CHKFB_SIZE_ALLREDUCE(4)}, + {CHKFB_ALLREDUCE(5), CHKFB_SIZE_ALLREDUCE(5)}, + {CHKFB_ALLREDUCE(6), CHKFB_SIZE_ALLREDUCE(6)}, + {CHKFB_ALLREDUCE(7), CHKFB_SIZE_ALLREDUCE(7)}, + {CHKFB_ALLREDUCE(8), CHKFB_SIZE_ALLREDUCE(8)}, + {CHKFB_ALLREDUCE(9), CHKFB_SIZE_ALLREDUCE(9)}, + {CHKFB_ALLREDUCE(10), CHKFB_SIZE_ALLREDUCE(10)}, + {CHKFB_ALLREDUCE(11), CHKFB_SIZE_ALLREDUCE(11)}, + {CHKFB_ALLREDUCE(12), CHKFB_SIZE_ALLREDUCE(12)}, + {CHKFB_ALLREDUCE(13), CHKFB_SIZE_ALLREDUCE(13)}, + {CHKFB_ALLREDUCE(14), 
CHKFB_SIZE_ALLREDUCE(14)}, +}; + +chkfb_tbl_t chkfb_alltoallv[UCG_ALGORITHM_ALLTOALLV_LAST] = { + {NULL, 0}, + {NULL, 0}, + {CHKFB_ALLTOALLV(2), CHKFB_SIZE_ALLTOALLV(2)}, +}; + +#undef CHKFB_BARRIER +#undef CHKFB_SIZE_BARRIER + +#undef CHKFB_BCAST +#undef CHKFB_SIZE_BCAST + +#undef CHKFB_ALLREDUCE +#undef CHKFB_SIZE_ALLREDUCE + +#undef CHKFB_ALLTOALLV +#undef CHKFB_SIZE_ALLTOALLV + +static inline check_fallback_t *ucg_builtin_barrier_check_fallback_array(int algo, int *arr_size) +{ + *arr_size = chkfb_barrier[algo].chkfb_size; + return chkfb_barrier[algo].chkfb; +} + +static inline check_fallback_t *ucg_builtin_bcast_check_fallback_array(int algo, int *arr_size) +{ + *arr_size = chkfb_bcast[algo].chkfb_size; + return chkfb_bcast[algo].chkfb; +} + +static inline check_fallback_t *ucg_builtin_allreduce_check_fallback_array(int algo, int *arr_size) +{ + *arr_size = chkfb_allreduce[algo].chkfb_size; + return chkfb_allreduce[algo].chkfb; +} + +static inline check_fallback_t *ucg_builtin_alltoallv_check_fallback_array(int algo, int *arr_size) +{ + *arr_size = chkfb_alltoallv[algo].chkfb_size; + return chkfb_alltoallv[algo].chkfb; +} + +typedef check_fallback_t *(*chk_fb_arr_f)(int algo, int *arr_size); + +static chk_fb_arr_f check_fallback[COLL_TYPE_NUMS] = { + ucg_builtin_barrier_check_fallback_array, + ucg_builtin_bcast_check_fallback_array, + ucg_builtin_allreduce_check_fallback_array, + ucg_builtin_alltoallv_check_fallback_array, +}; + +static check_fallback_t *ucg_builtin_get_check_fallback_array(coll_type_t coll_type, int algo, int *arr_size) +{ + return check_fallback[coll_type](algo, arr_size); +} + +int ucg_builtin_algo_check_fallback(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params, + int algo) +{ + check_fallback_t *chk_fb = NULL; + check_item_t chk_item; + check_f chk_fun; + int i, size, algo_fb; + algo_fb = algo; + algo = 0; + while (algo != algo_fb) { + algo = algo_fb; + size = 0; + chk_fb = ucg_builtin_get_check_fallback_array(coll_params->coll_type, algo, &size); + for (i = 0; i < size; i++) { + chk_item = chk_fb[i].chk_item; + chk_fun = check_fun_array[chk_item]; + if (chk_fun(group_params, coll_params)) { + algo_fb = chk_fb[i].algo_fb; + ucs_info("current algo is %d, check item is %s, fallback to algo %d", + algo, check_item_str_array[chk_item], algo_fb); + break; + } + } + } + return algo_fb; +} diff --git a/builtin/plan/builtin_algo_decision.c b/builtin/plan/builtin_algo_decision.c new file mode 100644 index 0000000..704e538 --- /dev/null +++ b/builtin/plan/builtin_algo_decision.c @@ -0,0 +1,96 @@ +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. + * See file LICENSE for terms. 
+ */ + +#include +#include +#include +#include + +#include "builtin_algo_decision.h" + +static const char *coll_type_str_array[COLL_TYPE_NUMS] = { + "barrier", + "bcast", + "allreduce", + "alltoallv", +}; + +typedef struct { + int low; + int up; +} boundary_t; + +boundary_t boundary[COLL_TYPE_NUMS] = { + {UCG_ALGORITHM_BARRIER_AUTO_DECISION, UCG_ALGORITHM_BARRIER_LAST}, + {UCG_ALGORITHM_BCAST_BMTREE, UCG_ALGORITHM_BCAST_LAST}, + {UCG_ALGORITHM_ALLREDUCE_AUTO_DECISION, UCG_ALGORITHM_ALLREDUCE_LAST}, + {UCG_ALGORITHM_ALLTOALLV_AUTO_DECISION, UCG_ALGORITHM_ALLTOALLV_LAST}, +}; + +static inline int ucg_builtin_get_valid_algo(int algo, int lb, int ub) +{ + if (algo > lb && algo < ub) { + return algo; + } + return 0; +} + +static int ucg_builtin_get_custom_algo(coll_type_t coll_type) +{ + ucg_builtin_config_t *config = (ucg_builtin_config_t *)ucg_builtin_component.plan_config; + int algo = 0; + switch (coll_type) { + case COLL_TYPE_BARRIER: + algo = (int)config->barrier_algorithm; + break; + case COLL_TYPE_BCAST: + algo = (int)config->bcast_algorithm; + break; + case COLL_TYPE_ALLREDUCE: + algo = (int)config->allreduce_algorithm; + break; + case COLL_TYPE_ALLTOALLV: + algo = (int)config->alltoallv_algorithm; + break; + default: + break; + } + return ucg_builtin_get_valid_algo(algo, boundary[coll_type].low, boundary[coll_type].up); +} +STATIC_GTEST coll_type_t ucg_builtin_get_coll_type(const ucg_collective_type_t *coll_type) +{ + if (coll_type->modifiers == ucg_predefined_modifiers[UCG_PRIMITIVE_BARRIER]) { + return COLL_TYPE_BARRIER; + } + if (coll_type->modifiers == ucg_predefined_modifiers[UCG_PRIMITIVE_BCAST]) { + return COLL_TYPE_BCAST; + } + if (coll_type->modifiers == ucg_predefined_modifiers[UCG_PRIMITIVE_ALLREDUCE]) { + return COLL_TYPE_ALLREDUCE; + } + if (coll_type->modifiers == ucg_predefined_modifiers[UCG_PRIMITIVE_ALLTOALLV]) { + return COLL_TYPE_ALLTOALLV; + } + return COLL_TYPE_NUMS; +} + +int ucg_builtin_algo_decision(const ucg_group_params_t *group_params, const ucg_collective_params_t *coll_params) +{ + int algo, algo_final; + algo = ucg_builtin_get_custom_algo(coll_params->coll_type); + ucs_info("current coll_type is %s", coll_type_str_array[coll_params->coll_type]); + if (algo) { + ucs_info("custom algorithm is %d", algo); + } else { + algo = ucg_builtin_algo_auto_select(group_params, coll_params); + ucs_info("auto select algorithm is %d", algo); + } + /* Check whether this algorithm can be used; fall back if not */ + algo_final = ucg_builtin_algo_check_fallback(group_params, coll_params, algo); + ucs_info("final algorithm is %d", algo_final); + return algo_final; +} diff --git a/builtin/plan/builtin_algo_decision.h b/builtin/plan/builtin_algo_decision.h new file mode 100644 index 0000000..96ecdb9 --- /dev/null +++ b/builtin/plan/builtin_algo_decision.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. 
+ * Description: Algorithm decision for collective operation + */ + +#ifndef UCG_BUILTIN_ALGO_DECISION_H +#define UCG_BUILTIN_ALGO_DECISION_H + +#include +#include + +BEGIN_C_DECLS + +coll_type_t ucg_builtin_get_coll_type(const ucg_collective_type_t *coll_type); + +int ucg_builtin_algo_auto_select(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params); + +int ucg_builtin_algo_check_fallback(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params, + int algo); + +int ucg_builtin_algo_decision(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params); + +END_C_DECLS +#endif \ No newline at end of file diff --git a/builtin/plan/builtin_algo_select.c b/builtin/plan/builtin_algo_select.c new file mode 100644 index 0000000..461103c --- /dev/null +++ b/builtin/plan/builtin_algo_select.c @@ -0,0 +1,353 @@ +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + + +#include +#include +#include "src/ucg/builtin/ops/builtin_ops.h" + +#include "builtin_algo_decision.h" +#include "builtin_plan.h" +typedef enum { + SIZE_LEVEL_4B, + SIZE_LEVEL_8B, + SIZE_LEVEL_16B, + SIZE_LEVEL_32B, + SIZE_LEVEL_64B, + SIZE_LEVEL_128B, + SIZE_LEVEL_256B, + SIZE_LEVEL_512B, + SIZE_LEVEL_1KB, + SIZE_LEVEL_2KB, + SIZE_LEVEL_4KB, + SIZE_LEVEL_LG, + SIZE_LEVEL_NUMS +} size_level_t; + +typedef enum { + PPN_LEVEL_4, + PPN_LEVEL_8, + PPN_LEVEL_16, + PPN_LEVEL_32, + PPN_LEVEL_64, + PPN_LEVEL_LG, + PPN_LEVEL_NUMS +} ppn_level_t; + +typedef enum { + NODE_LEVEL_4, + NODE_LEVEL_8, + NODE_LEVEL_16, + NODE_LEVEL_32, + NODE_LEVEL_LG, + NODE_LEVEL_NUMS +} node_level_t; + +const static int barrier_algo_tbl[PPN_LEVEL_NUMS][NODE_LEVEL_NUMS] = { + {10, 7, 2, 6, 4}, + {10, 10, 2, 6, 7}, + {10, 10, 10, 7, 7}, + {10, 10, 10, 10, 6}, + {10, 10, 10, 10, 6}, + {10, 10, 10, 10, 5}, +}; +const static int bcast_algo_tbl[SIZE_LEVEL_NUMS][PPN_LEVEL_NUMS][NODE_LEVEL_NUMS] = { + { + {3, 3, 3, 3, 3}, + {4, 4, 3, 3, 3}, + {4, 4, 4, 3, 3}, + {3, 3, 4, 4, 3}, + {3, 3, 3, 3, 3}, + {3, 3, 4, 3, 3}, + }, { /* SIZE_LEVEL_8B*/ + {3, 3, 3, 3, 3}, + {4, 4, 4, 3, 3}, + {4, 4, 4, 3, 3}, + {3, 3, 4, 4, 3}, + {3, 3, 3, 4, 3}, + {3, 3, 4, 3, 3}, + }, { /* SIZE_LEVEL_16B*/ + {3, 3, 4, 3, 3}, + {3, 4, 3, 3, 3}, + {4, 4, 4, 3, 3}, + {3, 3, 4, 4, 3}, + {3, 3, 3, 4, 3}, + {3, 3, 4, 3, 3}, + }, {/* SIZE_LEVEL_32B*/ + {3, 3, 3, 3, 3}, + {4, 4, 3, 3, 3}, + {3, 4, 4, 3, 3}, + {3, 3, 4, 4, 3}, + {3, 3, 3, 4, 3}, + {3, 3, 3, 3, 3}, + }, { /* SIZE_LEVEL_64B*/ + {3, 3, 3, 3, 3}, + {4, 3, 3, 3, 3}, + {4, 4, 4, 3, 3}, + {3, 3, 4, 4, 3}, + {3, 3, 3, 4, 3}, + {3, 3, 3, 3, 3}, + }, { /* SIZE_LEVEL_128B*/ + {3, 3, 3, 3, 3}, + {3, 4, 3, 3, 3}, + {3, 4, 4, 3, 3}, + {3, 3, 3, 4, 3}, + {3, 3, 3, 4, 3}, + {3, 3, 3, 4, 3}, + }, {/* SIZE_LEVEL_256B*/ + {3, 3, 3, 3, 3}, + {4, 3, 3, 3, 3}, + {3, 4, 4, 4, 3}, + {3, 3, 3, 4, 3}, + {3, 3, 4, 4, 3}, + {3, 3, 4, 3, 3}, + }, { /* SIZE_LEVEL_512B*/ + {4, 3, 4, 3, 3}, + {4, 3, 3, 3, 3}, + {3, 3, 4, 4, 4}, + {4, 3, 4, 4, 3}, + {3, 3, 3, 4, 3}, + {3, 3, 4, 4, 3}, + }, { /* SIZE_LEVEL_1KB*/ + {3, 3, 3, 3, 3}, + {3, 3, 3, 3, 3}, + {4, 4, 4, 3, 3}, + {3, 3, 3, 4, 3}, + {3, 3, 4, 4, 3}, + {3, 3, 4, 3, 3}, + }, {/* SIZE_LEVEL_2KB*/ + {4, 3, 3, 3, 3}, + {4, 3, 3, 4, 3}, + {4, 4, 4, 3, 4}, + {4, 3, 4, 4, 4}, + {3, 4, 3, 4, 3}, + {3, 4, 4, 4, 3}, + }, { /* SIZE_LEVEL_4KB*/ + {3, 3, 3, 3, 3}, + {4, 4, 4, 4, 3}, + {4, 4, 4, 3, 3}, + {3, 4, 4, 4, 3}, + {4, 4, 4, 4, 4}, + {4, 4, 4, 4, 4}, + }, {/* SIZE_LEVEL_LG*/ + 
{4, 3, 3, 3, 3}, + {4, 3, 3, 4, 3}, + {4, 4, 4, 4, 4}, + {4, 4, 4, 4, 4}, + {4, 4, 4, 4, 4}, + {4, 4, 4, 4, 4}, + } +}; +const static int allreduce_algo_tbl[SIZE_LEVEL_NUMS][PPN_LEVEL_NUMS][NODE_LEVEL_NUMS] = { + { + {11, 8, 8, 8, 7}, + {11, 11, 8, 8, 7}, + {11, 11, 11, 8, 7}, + {11, 11, 11, 11, 7}, + {11, 11, 11, 11, 7}, + {11, 11, 11, 11, 7}, + }, { /* SIZE_LEVEL_8B*/ + {11, 8, 8, 8, 7}, + {11, 11, 8, 8, 7}, + {11, 11, 11, 8, 7}, + {11, 11, 11, 11, 7}, + {11, 11, 11, 11, 7}, + {11, 11, 11, 11, 7}, + }, { /* SIZE_LEVEL_16B*/ + {11, 8, 8, 8, 7}, + {11, 11, 8, 8, 7}, + {11, 11, 11, 8, 7}, + {11, 11, 11, 11, 7}, + {11, 11, 11, 11, 7}, + {11, 11, 11, 11, 7}, + }, { /* SIZE_LEVEL_32B*/ + {11, 8, 8, 8, 7}, + {11, 11, 8, 8, 7}, + {11, 11, 11, 8, 7}, + {11, 11, 11, 11, 7}, + {11, 11, 11, 11, 7}, + {11, 11, 11, 11, 7}, + }, { /* SIZE_LEVEL_64B*/ + {11, 8, 8, 8, 7}, + {11, 11, 8, 8, 7}, + {11, 11, 11, 8, 7}, + {11, 11, 11, 11, 7}, + {11, 11, 11, 11, 7}, + {11, 11, 11, 11, 11}, + }, { /* SIZE_LEVEL_128B*/ + {14, 8, 8, 8, 7}, + {11, 11, 8, 8, 8}, + {11, 11, 11, 8, 7}, + {11, 11, 11, 11, 8}, + {11, 11, 11, 11, 7}, + {11, 11, 11, 11, 7}, + }, { /* SIZE_LEVEL_256B*/ + {14, 13, 13, 13, 13}, + {11, 11, 13, 13, 13}, + {11, 11, 11, 14, 14}, + {11, 11, 11, 11, 7}, + {11, 11, 11, 8, 7}, + {11, 11, 11, 7, 7}, + }, { /* SIZE_LEVEL_512B*/ + {13, 13, 13, 13, 13}, + {13, 13, 13, 13, 13}, + {14, 14, 11, 13, 13}, + {14, 11, 14, 14, 14}, + {11, 11, 11, 8, 7}, + {11, 11, 7, 8, 7}, + }, { /* SIZE_LEVEL_1KB*/ + {13, 8, 8, 8, 8}, + {13, 13, 13, 13, 13}, + {14, 14, 14, 14, 14}, + {13, 13, 13, 13, 13}, + {14, 14, 14, 14, 7}, + {11, 11, 7, 8, 8}, + }, { /* SIZE_LEVEL_2KB*/ + {13, 13, 13, 13, 8}, + {13, 13, 12, 12, 12}, + {13, 13, 13, 13, 13}, + {14, 14, 14, 14, 14}, + {13, 13, 13, 13, 13}, + {14, 11, 7, 7, 7}, + }, { /* SIZE_LEVEL_4KB*/ + {13, 13, 13, 13, 12}, + {13, 13, 13, 12, 12}, + {12, 12, 12, 12, 12}, + {13, 13, 13, 13, 13}, + {14, 14, 13, 14, 14}, + {13, 13, 13, 13, 7}, + }, { /* SIZE_LEVEL_LG*/ + {13, 13, 13, 13, 12}, + {13, 13, 13, 12, 12}, + {13, 12, 12, 12, 12}, + {12, 12, 12, 12, 12}, + {13, 13, 13, 13, 13}, + {12, 12, 5, 5, 6}, + } +}; + +static inline int log2_n(unsigned int n, unsigned int begin) +{ + int index = 0; + while (n >= begin) { + n = n >> 1; + index++; + } + return index; +} + +static size_level_t ucg_builtin_get_size_level(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + const int size_lev_small = 4; + const int size_lev_large = 4096; + int size; + int dt_len; + ucp_datatype_t ucp_datatype; + group_params->mpi_dt_convert(coll_params->send.dt_ext, &ucp_datatype); + dt_len = UCP_DT_IS_CONTIG(ucp_datatype) ? 
coll_params->send.dt_len : + ucg_builtin_get_dt_len(ucp_dt_generic(ucp_datatype)); + size = dt_len * coll_params->send.count; + ucs_info("The SIZE parameter of auto select algorithm is %d", size); + if (size <= size_lev_small) { + return SIZE_LEVEL_4B; + } + if (size > size_lev_large) { + return SIZE_LEVEL_LG; + } + size--; + return (size_level_t)log2_n(size, size_lev_small); +} + +static ppn_level_t ucg_builtin_get_ppn_level(const ucg_group_params_t *group_params) +{ + const int ppn_lev_small = 4; + const int ppn_lev_large = 64; + int ppn_max; + ppn_max = group_params->topo_args.ppn_max; + ucs_info("The PPN parameter of auto select algorithm is %d", ppn_max); + if (ppn_max <= ppn_lev_small) { + return PPN_LEVEL_4; + } + if (ppn_max > ppn_lev_large) { + return PPN_LEVEL_LG; + } + ppn_max--; + return (ppn_level_t)log2_n(ppn_max, ppn_lev_small); +} + +static node_level_t ucg_builtin_get_node_level(const ucg_group_params_t *group_params) +{ + const int node_lev_small = 4; + const int node_lev_large = 32; + int node_nums; + node_nums = group_params->topo_args.node_nums; + ucs_info("The NODE parameter of auto select algorithm is %d", node_nums); + if (node_nums <= node_lev_small) { + return NODE_LEVEL_4; + } + if (node_nums > node_lev_large) { + return NODE_LEVEL_LG; + } + node_nums--; + return (node_level_t)log2_n(node_nums, node_lev_small); +} + +static int ucg_builtin_barrier_algo_select(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + ppn_level_t ppn_lev; + node_level_t node_lev; + + ppn_lev = ucg_builtin_get_ppn_level(group_params); + node_lev = ucg_builtin_get_node_level(group_params); + return barrier_algo_tbl[ppn_lev][node_lev]; +} + +static int ucg_builtin_bcast_algo_select(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + size_level_t size_lev; + ppn_level_t ppn_lev; + node_level_t node_lev; + + size_lev = ucg_builtin_get_size_level(group_params, coll_params); + ppn_lev = ucg_builtin_get_ppn_level(group_params); + node_lev = ucg_builtin_get_node_level(group_params); + return bcast_algo_tbl[size_lev][ppn_lev][node_lev]; +} + +static int ucg_builtin_allreduce_algo_select(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + size_level_t size_lev; + ppn_level_t ppn_lev; + node_level_t node_lev; + + size_lev = ucg_builtin_get_size_level(group_params, coll_params); + ppn_lev = ucg_builtin_get_ppn_level(group_params); + node_lev = ucg_builtin_get_node_level(group_params); + return allreduce_algo_tbl[size_lev][ppn_lev][node_lev]; +} + +static int ucg_builtin_alltoallv_algo_select(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + return UCG_ALGORITHM_ALLTOALLV_NODE_AWARE_PLUMMER; +} + +typedef int (*algo_select_f)(const ucg_group_params_t *group_params, const ucg_collective_params_t *coll_params); +static algo_select_f algo_select[COLL_TYPE_NUMS] = { + ucg_builtin_barrier_algo_select, + ucg_builtin_bcast_algo_select, + ucg_builtin_allreduce_algo_select, + ucg_builtin_alltoallv_algo_select, +}; + +int ucg_builtin_algo_auto_select(const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params) +{ + return algo_select[coll_params->coll_type](group_params, coll_params); +} diff --git a/builtin/plan/builtin_binary_block.c b/builtin/plan/builtin_binary_block.c new file mode 100644 index 0000000..1f926a7 --- /dev/null +++ b/builtin/plan/builtin_binary_block.c @@ -0,0 +1,1159 @@ +/* + * Copyright (c) Huawei Technologies 
Co., Ltd. 2020-2021. All rights reserved. + * Description: Rabenseifner algorithm for MPI_Allreduce + */ + +#include +#include +#include +#include +#include +#include +#include +#include "builtin_plan.h" + +#define MAX_PEERS (100) +#define MAX_PHASES (128) + +typedef struct ucg_builtin_binary_block_params { + ucg_builtin_base_params_t super; +} ucg_builtin_binary_block_params_t; + +/**************************************************************************** + * * + * Binary block's Algorithm * + * allreduce = reduce_scatter + allgather * + * reduce_scatter phase 0: Recursive Halving, phase 1: tree * + * allgather phase 0: Recursive Doubling * + * * + * **************************************************************************/ + +/* + * @brief Only keep the lowest 1 in the binary + * e.g. 5(101) -> 1(001) + */ +STATIC_GTEST unsigned ucg_builtin_keep_lowest_1_bit(unsigned num) +{ + return num & (~num + 1); +} + +/* + * @brief Only keep the highest 1 in the binary + * e.g. 5(101) -> 4(100) + */ +STATIC_GTEST unsigned ucg_builtin_keep_highest_1_bit(unsigned num) +{ + unsigned high = 0x0; + while (num) { + high = num & (~num + 1); + num = num & (~high); + } + return high; +} + +/* + * @brief Get the number of 1s in the binary + * e.g. 5(101) -> 2 + */ +STATIC_GTEST unsigned ucg_builtin_get_1bit_cnt(unsigned num) +{ + unsigned cnt = 0; + while (num > 0) { + num &= (num - 1); // clear the lowest 1 + cnt++; + } + return cnt; +} + +/* + * @brief Keep the least bits of the CNT that are the same as those of the NUM. + * e.g. NUM: 0010 0100 + * CNT: 1011 1110 + * RET: 0011 1110 + */ +STATIC_GTEST unsigned ucg_builtin_get_low_all(unsigned num, unsigned cnt) +{ + unsigned high = ucg_builtin_keep_highest_1_bit(num); + unsigned lowMask = (high - 1) | high; + unsigned ret = cnt & lowMask; + return ret; +} + +/* + * @brief Get the process_cnt and begin_index of the previous group + * @param member_cnt the total number of processes + * @param previous_group_process_cnt the number of processes in the previous group + * @param previous_group_begin_index the begin process index of the previous group + * e.g. my_index:6, member_cnt:15:(1) (2 3) (4 5 6 7) (8 9 ... 15) + * ==> previous_group_process_cnt:2, previous_group_begin_index:2 + */ +STATIC_GTEST void ucg_builtin_get_binaryblocks_previous_group(unsigned my_index, + unsigned member_cnt, + unsigned *previous_group_process_cnt, + unsigned *previous_group_begin_index) +{ + unsigned ret = ucg_builtin_get_low_all(my_index, member_cnt); + if (my_index > ret) { + *previous_group_begin_index = ret; + *previous_group_process_cnt = ucg_builtin_keep_highest_1_bit(ret); + } else { + *previous_group_begin_index = (~ucg_builtin_keep_highest_1_bit(ret)) & ret; + *previous_group_process_cnt = ucg_builtin_keep_highest_1_bit(*previous_group_begin_index); + } + *previous_group_begin_index = (~ucg_builtin_keep_highest_1_bit(*previous_group_begin_index)) & + (*previous_group_begin_index); +} + +/* + * @brief Get the process_cnt and begin_index of the current group + * @param member_cnt the total number of processes + * @param current_group_process_cnt the number of processes in the current group + * @param current_group_begin_index the begin process index of the current group + * e.g. my_index:6, member_cnt:15:(1) (2 3) (4 5 6 7) (8 9 ...
15) + * ==> current_group_process_cnt:4, current_group_begin_index:4 + */ +STATIC_GTEST void ucg_builtin_get_binaryblocks_current_group(unsigned my_index, + unsigned member_cnt, + unsigned *current_group_process_cnt, + unsigned *current_group_begin_index) +{ + unsigned ret = ucg_builtin_get_low_all(my_index, member_cnt); + if (my_index > ret) { + *current_group_process_cnt = ucg_builtin_keep_lowest_1_bit(member_cnt - ret); + *current_group_begin_index = ret; + } else { + *current_group_process_cnt = ucg_builtin_keep_highest_1_bit(ret); + *current_group_begin_index = (~ucg_builtin_keep_highest_1_bit(ret)) & ret; + } +} + +/* + * @brief Get the process_cnt and begin_index of the next group + * @param member_cnt the total number of processes + * @param next_group_process_cnt the number of processes in the next group + * @param next_group_begin_index the begin process index of the next group + * e.g. my_index:6, member_cnt:15:(1) (2 3) (4 5 6 7) (8 9 ... 15) + * ==> next_group_process_cnt:8, next_group_begin_index:8 + */ +STATIC_GTEST void ucg_builtin_get_binaryblocks_next_group(unsigned my_index, + unsigned member_cnt, + unsigned *next_group_process_cnt, + unsigned *next_group_begin_index) +{ + unsigned ret = ucg_builtin_get_low_all(my_index, member_cnt); + if (my_index <= ret) { + *next_group_process_cnt = ucg_builtin_keep_lowest_1_bit(member_cnt - ret); + *next_group_begin_index = ret; + } else { + *next_group_process_cnt = ucg_builtin_keep_lowest_1_bit(member_cnt - ret); + *next_group_process_cnt = ucg_builtin_keep_lowest_1_bit(member_cnt - ret - *next_group_process_cnt); + *next_group_begin_index = (*next_group_process_cnt - 1) & member_cnt; + } +} + +/* + * @brief Get the number of groups before the current group + * @param current_group_begin_index the begin process index of the current group + * @param ahead_group_cnt the number of groups in front of the current group + * e.g. current_group_begin_index:4, member_cnt:15:(1) (2 3) (4 5 6 7) (8 9 ... 15) + * ==> ahead_group_cnt:2 + */ +STATIC_GTEST void ucg_builtin_get_binaryblocks_ahead_group_cnt(unsigned member_cnt, + unsigned current_group_begin_index, + unsigned *ahead_group_cnt) +{ + if (current_group_begin_index == 0) { + *ahead_group_cnt = 0; + } else { + unsigned previous_sum_group_process_cnt = member_cnt - (~current_group_begin_index & member_cnt); + *ahead_group_cnt = ucg_builtin_get_1bit_cnt(previous_sum_group_process_cnt); + } +} + +/* + * @brief Get the number of groups after the current group + * @param next_group_begin_index the begin process index of the next group + * @param behind_group_cnt the number of groups behind the current group + * e.g. next_group_begin_index:4, member_cnt:15:(1) (2 3) (4 5 6 7) (8 9 ...
15) + * ==> behind_group_cnt:2 + */ +STATIC_GTEST void ucg_builtin_get_binaryblocks_behind_group_cnt(unsigned member_cnt, + unsigned next_group_begin_index, + unsigned *behind_group_cnt) +{ + if (next_group_begin_index >= member_cnt) { + *behind_group_cnt = 0; + } else { + unsigned after_sum_group_process_cnt = member_cnt - next_group_begin_index; + *behind_group_cnt = ucg_builtin_get_1bit_cnt(after_sum_group_process_cnt); + } +} + +/* + * @brief Get the peer process index in extra_reduction phase + */ +STATIC_GTEST void ucg_builtin_get_extra_reduction_peer_index(unsigned my_index, + unsigned member_cnt, + unsigned *peer_index) +{ + unsigned previous_group_process_cnt, previous_group_begin_index; + ucg_builtin_get_binaryblocks_previous_group(my_index, member_cnt, + &previous_group_process_cnt, &previous_group_begin_index); + if (previous_group_process_cnt == 0) { + *peer_index = my_index - 1; + } else { + unsigned offset = (my_index - previous_group_begin_index) % previous_group_process_cnt; + offset = (offset == 0) ? previous_group_process_cnt : offset; + *peer_index = previous_group_begin_index + offset - 1; + } +} + +/* + * @brief Obtains the index of the block to be sent + * @param ep_cnt the number of endpoint + * @param ep_idx the index of endpoint + * @return the index of the block to be sent + * e.g. ep_cnt:8, ep_idx:3 + * ==> arr:[0 4 2 6 1 5 3 7], ret:a[3-1]=6 + */ +STATIC_GTEST ucs_status_t ucg_builtin_get_recv_block_index(unsigned ep_cnt, unsigned ep_idx, unsigned* ret) +{ + size_t alloc_size = ep_cnt * sizeof(unsigned); + unsigned *arr = (unsigned *)ucs_malloc(alloc_size, "arr"); + if (arr == NULL) { + ucs_error("no memory for malloc"); + return UCS_ERR_NO_MEMORY; + } + errno_t res = memset_s(arr, alloc_size, 0, alloc_size); + if (res != EOK) { + ucs_free(arr); + arr = NULL; + return UCS_ERR_INVALID_PARAM; + } + unsigned i; + const unsigned power_of_two = 2; + for (i = 0; i < ep_cnt; i++) { + if (!(i & (i - 1))) { + arr[i] = i ? 
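+ /* power-of-two slots get ep_cnt / 2 / i here; the fill loop below completes the remaining slots, yielding the bit-reversed order from the doc comment (e.g. 8 endpoints -> 0 4 2 6 1 5 3 7) */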
(ep_cnt / power_of_two / i) : 0; + } + } + for (i = 0; i < ep_cnt; i++) { + unsigned distance = 1; + unsigned value = ep_cnt / power_of_two; + while (i + distance < ep_cnt && i + distance < i * power_of_two && !arr[i + distance]) { + arr[i + distance] = arr[i] + value; + distance *= power_of_two; + value /= power_of_two; + } + } + *ret = arr[ep_idx]; + ucs_free(arr); + arr = NULL; + return UCS_OK; +} + +STATIC_GTEST void ucg_builtin_block_buffer(unsigned buffer_cnt, + unsigned block_cnt, + unsigned * const block_buffer) +{ + unsigned idx; + unsigned tmp = block_cnt; + for (idx = 0; idx < block_cnt; idx++, tmp--) { + block_buffer[idx] = buffer_cnt / tmp; + if (buffer_cnt % tmp) { + block_buffer[idx]++; + } + buffer_cnt -= block_buffer[idx]; + } +} + +STATIC_GTEST UCS_F_ALWAYS_INLINE unsigned ucg_builtin_calc_disp(unsigned *block_num, unsigned start, unsigned cnt) +{ + unsigned idx; + unsigned sum = 0; + for (idx = start; idx < start + cnt; idx++) { + sum += block_num[idx]; + } + return sum; +} + +STATIC_GTEST ucs_status_t ucg_builtin_divide_block_buffers(unsigned block_cnt, + unsigned total_group_process_cnt, + unsigned total_group_cnt, + unsigned **block_buffers) +{ + unsigned previous_group_process_cnt = 0; + unsigned previous_group_begin_index = 0; + unsigned current_group_process_cnt, current_group_begin_index; + unsigned group_idx, temp; + ucs_status_t status = UCS_OK; + for (group_idx = 0; group_idx < total_group_cnt; group_idx++) { + if (group_idx == 0) { + ucg_builtin_get_binaryblocks_current_group(1, total_group_process_cnt, + &current_group_process_cnt, &current_group_begin_index); + unsigned *block_buffer = (unsigned *)ucs_malloc(sizeof(unsigned) * + current_group_process_cnt, "allocate block"); + if (block_buffer == NULL) { + status = UCS_ERR_NO_MEMORY; + ucs_error("no memory for malloc"); + goto cleanup_buffer; + } + ucg_builtin_block_buffer(block_cnt, current_group_process_cnt, block_buffer); + block_buffers[group_idx] = block_buffer; + } else { + ucg_builtin_get_binaryblocks_next_group(previous_group_begin_index + 1, total_group_process_cnt, + &current_group_process_cnt, &current_group_begin_index); + unsigned *block_buffer = (unsigned *)ucs_malloc(sizeof(unsigned) * + current_group_process_cnt, "allocate block"); + if (block_buffer == NULL) { + status = UCS_ERR_NO_MEMORY; + ucs_error("no memory for malloc"); + goto cleanup_buffer; + } + unsigned idx; + unsigned step = current_group_process_cnt / previous_group_process_cnt; + for (idx = 0; idx < previous_group_process_cnt; idx++) { + unsigned previous_group_block_cnt = block_buffers[group_idx - 1][idx]; + ucg_builtin_block_buffer(previous_group_block_cnt, step, &block_buffer[idx * step]); + } + block_buffers[group_idx] = block_buffer; + } + previous_group_process_cnt = current_group_process_cnt; + previous_group_begin_index = current_group_begin_index; + } + return status; + +cleanup_buffer: + for (temp = 0; temp < group_idx; temp++) { + ucs_free(block_buffers[temp]); + block_buffers[temp] = NULL; + } + return status; +} + +STATIC_GTEST void ucg_builtin_destory_block_buffers(unsigned total_group_cnt, unsigned **block_buffers) +{ + unsigned group_idx; + for (group_idx = 0; group_idx < total_group_cnt; group_idx++) { + ucs_free(block_buffers[group_idx]); + block_buffers[group_idx] = NULL; + } + ucs_free(block_buffers); + block_buffers = NULL; +} + +STATIC_GTEST ucs_status_t ucg_builtin_init_block_buffers(unsigned block_cnt, + unsigned total_group_process_cnt, + unsigned total_group_cnt, + unsigned ***block_buffers) +{ + *block_buffers =
(unsigned**)ucs_malloc(sizeof(unsigned *)*total_group_cnt, "allocate blocks"); + if (*block_buffers == NULL) { + ucs_error("no memory for malloc"); + return UCS_ERR_NO_MEMORY; + } + + ucs_status_t status = ucg_builtin_divide_block_buffers(block_cnt, total_group_process_cnt, + total_group_cnt, *block_buffers); + if (status != UCS_OK) { + ucs_free(*block_buffers); + *block_buffers = NULL; + } + return status; +} + +STATIC_GTEST ucs_status_t ucg_builtin_reduce_scatter_phase_cb(ucg_builtin_plan_phase_t *phase, + const ucg_collective_params_t *coll_params) +{ + ucs_assert(phase != NULL && coll_params != NULL); + + static unsigned next_start_block = 0; + // first phase: static variable reset + if (phase->raben_extend.first_step_flag) { + next_start_block = 0; + } + + unsigned block_cnt = coll_params->send.count; + unsigned total_group_cnt = phase->raben_extend.index_group.total_group_cnt; + unsigned total_group_process_cnt = phase->raben_extend.index_group.total_group_process_cnt; + unsigned **block_buffers = NULL; + ucs_status_t status = ucg_builtin_init_block_buffers(block_cnt, total_group_process_cnt, + total_group_cnt, &block_buffers); + if (status != UCS_OK) { + return status; + } + + unsigned cur_group_begin_index = phase->raben_extend.index_group.cur_group_begin_index; + unsigned cur_group_process_cnt = phase->raben_extend.index_group.cur_group_process_cnt; + unsigned ahead_group_cnt = phase->raben_extend.index_group.ahead_group_cnt; + const unsigned factor = 2; + /* local peer index */ + unsigned local_group_index = phase->raben_extend.index_group.local_group_index - cur_group_begin_index; + unsigned step_size = 1 << phase->raben_extend.step_index; + unsigned step_base = local_group_index - local_group_index % (step_size * factor); + unsigned local_group_peer = step_base + (local_group_index - step_base + step_size) % (step_size * factor); + + /* send && receive blocks index */ + unsigned send_num_blocks = (cur_group_process_cnt / factor) >> phase->raben_extend.step_index; + unsigned send_start_block = next_start_block + ((local_group_index < local_group_peer) ? send_num_blocks : 0); + unsigned recv_num_blocks = send_num_blocks; + unsigned recv_start_block = next_start_block + ((local_group_index < local_group_peer) ? 0 : send_num_blocks); + next_start_block += ((local_group_index < local_group_peer) ? 
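+ /* each rank keeps the half it receives: the lower-indexed peer keeps the lower half (cursor unchanged), while the higher-indexed peer advances the cursor past the lower half it just sent away */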
0 : send_num_blocks); + + /* send && receive real blocks */ + phase->ex_attr.start_block = ucg_builtin_calc_disp(block_buffers[ahead_group_cnt], 0, send_start_block); + phase->ex_attr.num_blocks = ucg_builtin_calc_disp(block_buffers[ahead_group_cnt], + send_start_block, send_num_blocks); + phase->ex_attr.peer_start_block = ucg_builtin_calc_disp(block_buffers[ahead_group_cnt], 0, recv_start_block); + phase->ex_attr.peer_block = ucg_builtin_calc_disp(block_buffers[ahead_group_cnt], + recv_start_block, recv_num_blocks); + phase->ex_attr.total_num_blocks = block_cnt; + phase->ex_attr.is_inequal = 1; + phase->ex_attr.is_partial = 1; + /* free */ + ucg_builtin_destory_block_buffers(total_group_cnt, block_buffers); + return UCS_OK; +} + +STATIC_GTEST ucs_status_t ucg_builtin_intra_reduce_scatter(ucg_builtin_index_group_t *index_group, + ucg_builtin_plan_phase_t **phase, + ucg_builtin_plan_t *binary_block, + ucg_builtin_group_ctx_t *ctx, + ucg_step_idx_t *step_idx) +{ + ucs_status_t status = UCS_OK; + unsigned idx; + unsigned step_size = 1; + const unsigned factor = 2; + ucg_group_member_index_t local_group_index = index_group->local_group_index - index_group->cur_group_begin_index; + unsigned step_cnt = ucs_ilog2(index_group->cur_group_process_cnt); + unsigned high = ucg_builtin_keep_highest_1_bit(index_group->total_group_process_cnt); + for (idx = 0; idx < step_cnt && status == UCS_OK; idx++, (*phase)++, step_size *= factor) { + (*phase)->step_index = *step_idx + idx; + (*phase)->method = UCG_PLAN_METHOD_REDUCE_SCATTER_RECURSIVE; + (*phase)->ep_cnt = 1; + #if ENABLE_DEBUG_DATA + (*phase)->indexes = UCS_ALLOC_CHECK(sizeof(ucg_group_member_index_t), "binary block indexes"); + #endif + binary_block->ep_cnt++; + binary_block->phs_cnt++; + + /* reduce-scatter phase peer index */ + unsigned step_base = local_group_index-local_group_index % (step_size *factor); + unsigned local_group_peer_index = step_base + (local_group_index - step_base + step_size) % + (step_size * factor); + /* Calculate relative real process ID using local index, only continuous process IDs are supported. */ + ucg_group_member_index_t real_peer_index = index_group->my_index + local_group_peer_index - local_group_index; + status = ucg_builtin_connect(ctx, real_peer_index, *phase, UCG_BUILTIN_CONNECT_SINGLE_EP); + (*phase)->raben_extend.step_index = idx; + (*phase)->raben_extend.first_step_flag = (idx ? 
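+ /* flag only step 0, so ucg_builtin_reduce_scatter_phase_cb can reset its static block cursor at the start of a new collective */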
0 : 1); + (*phase)->raben_extend.index_group = *index_group; + (*phase)->init_phase_cb = ucg_builtin_reduce_scatter_phase_cb; + } + *step_idx += ucs_ilog2(high); + return status; +} + +STATIC_GTEST ucs_status_t ucg_builtin_extra_reduce_receive_cb(ucg_builtin_plan_phase_t *phase, + const ucg_collective_params_t *coll_params) +{ + if (phase == NULL || coll_params == NULL) { + return UCS_ERR_INVALID_PARAM; + } + unsigned block_cnt = coll_params->send.count; + unsigned total_group_cnt = phase->raben_extend.index_group.total_group_cnt; + unsigned total_group_process_cnt = phase->raben_extend.index_group.total_group_process_cnt; + unsigned **block_buffers = NULL; + ucs_status_t status = ucg_builtin_init_block_buffers(block_cnt, total_group_process_cnt, + total_group_cnt, &block_buffers); + if (status != UCS_OK) { + return status; + } + + unsigned ahead_group_cnt = phase->raben_extend.index_group.ahead_group_cnt; + unsigned recv_block_index = phase->raben_extend.index_group.recv_block_index; + + phase->ex_attr.start_block = ucg_builtin_calc_disp(block_buffers[ahead_group_cnt], 0, recv_block_index); + phase->ex_attr.num_blocks = block_buffers[ahead_group_cnt][recv_block_index]; + phase->ex_attr.peer_start_block = phase->ex_attr.start_block; + phase->ex_attr.peer_block = phase->ex_attr.num_blocks; + phase->ex_attr.total_num_blocks = coll_params->send.count; + phase->ex_attr.is_partial = 1; + + ucg_builtin_destory_block_buffers(total_group_cnt, block_buffers); + return UCS_OK; +} + +STATIC_GTEST ucs_status_t ucg_builtin_extra_reduce_send_cb(ucg_builtin_plan_phase_t *phase, + const ucg_collective_params_t *coll_params) +{ + ucs_assert(phase != NULL && coll_params != NULL); + + unsigned block_cnt = coll_params->send.count; + unsigned total_group_cnt = phase->raben_extend.index_group.total_group_cnt; + unsigned total_group_process_cnt = phase->raben_extend.index_group.total_group_process_cnt; + unsigned **block_buffers = NULL; + ucs_status_t status = ucg_builtin_init_block_buffers(block_cnt, total_group_process_cnt, + total_group_cnt, &block_buffers); + if (status != UCS_OK) { + return status; + } + + unsigned cur_group_process_cnt = phase->raben_extend.index_group.cur_group_process_cnt; + unsigned next_group_process_cnt = phase->raben_extend.index_group.next_group_process_cnt; + unsigned next_group_idx = phase->raben_extend.index_group.ahead_group_cnt + 1; + unsigned recv_block_index = phase->raben_extend.index_group.recv_block_index; + unsigned ep_cnt = next_group_process_cnt / cur_group_process_cnt; + unsigned idx = recv_block_index * ep_cnt; + + phase->ex_attr.peer_start_block = ucg_builtin_calc_disp(block_buffers[phase->raben_extend.index_group.ahead_group_cnt], + 0, recv_block_index); + phase->ex_attr.peer_block = block_buffers[phase->raben_extend.index_group.ahead_group_cnt][recv_block_index]; + phase->ex_attr.start_block = phase->ex_attr.peer_start_block; + phase->ex_attr.start_block += ucg_builtin_calc_disp(block_buffers[next_group_idx], idx, + phase->raben_extend.step_index); + phase->ex_attr.num_blocks = block_buffers[next_group_idx][idx + phase->raben_extend.step_index]; + phase->ex_attr.total_num_blocks = block_cnt; + phase->ex_attr.is_partial = 1; + + ucg_builtin_destory_block_buffers(total_group_cnt, block_buffers); + return UCS_OK; +} + +STATIC_GTEST ucs_status_t ucg_builtin_intra_extra_reduction(ucg_builtin_index_group_t *index_group, + ucg_builtin_plan_phase_t **phase, + ucg_builtin_plan_t *binary_block, + ucg_builtin_group_ctx_t *ctx, + ucg_step_idx_t *step_idx) +{ + ucs_status_t 
status = UCS_OK; + + /* receive first, intra socket or node */ + if (index_group->local_group_index != index_group->local_peer_ahead_group) { + (*phase)->step_index = *step_idx + (index_group->ahead_group_cnt - 1); + (*phase)->method = UCG_PLAN_METHOD_REDUCE_TERMINAL; + (*phase)->ep_cnt = 1; +#if ENABLE_DEBUG_DATA + (*phase)->indexes = UCS_ALLOC_CHECK(sizeof(ucg_group_member_index_t), "binary block indexes"); +#endif + binary_block->ep_cnt++; + binary_block->phs_cnt++; + (*phase)->raben_extend.index_group = *index_group; + (*phase)->init_phase_cb = ucg_builtin_extra_reduce_receive_cb; + ucg_group_member_index_t real_peer_index = index_group->my_index + + index_group->local_peer_ahead_group - + index_group->local_group_index; + status = ucg_builtin_connect(ctx, real_peer_index, *phase, UCG_BUILTIN_CONNECT_SINGLE_EP); + (*phase)++; + } + + /* then send, intra socket or node */ + if (status == UCS_OK && index_group->next_group_process_cnt > 0) { + unsigned idx; + unsigned ep_cnt = index_group->next_group_process_cnt / index_group->cur_group_process_cnt; + for (idx = 0; idx < ep_cnt && status == UCS_OK; ++idx) { + (*phase)->step_index = *step_idx + index_group->ahead_group_cnt; + (*phase)->method = UCG_PLAN_METHOD_SEND_TERMINAL; + (*phase)->ep_cnt = 1; +#if ENABLE_DEBUG_DATA + (*phase)->indexes = UCS_ALLOC_CHECK(sizeof(ucg_group_member_index_t), "binary block indexes"); +#endif + binary_block->ep_cnt++; + binary_block->phs_cnt++; + unsigned recv_block_index; + status = ucg_builtin_get_recv_block_index(ep_cnt, idx, &recv_block_index); + if (status != UCS_OK) { + return status; + } + unsigned peer_index = index_group->next_group_begin_index + index_group->local_group_index - + index_group->cur_group_begin_index + recv_block_index * + index_group->cur_group_process_cnt; + (*phase)->raben_extend.step_index = idx; + (*phase)->raben_extend.index_group = *index_group; + (*phase)->init_phase_cb = ucg_builtin_extra_reduce_send_cb; + ucg_group_member_index_t real_peer_index = index_group->my_index + peer_index - + index_group->local_group_index; + status = ucg_builtin_connect(ctx, real_peer_index, *phase, UCG_BUILTIN_CONNECT_SINGLE_EP); + (*phase)++; + } + } + *step_idx += index_group->total_group_cnt - 1; + return status; +} + +STATIC_GTEST ucs_status_t ucg_builtin_intra_node_allreduce_cb(ucg_builtin_plan_phase_t *phase, + const ucg_collective_params_t *coll_params) +{ + ucs_assert(phase != NULL && coll_params != NULL); + + unsigned block_cnt = coll_params->send.count; + unsigned total_group_cnt = phase->raben_extend.index_group.total_group_cnt; + unsigned total_group_process_cnt = phase->raben_extend.index_group.total_group_process_cnt; + unsigned **block_buffers = NULL; + ucs_status_t status = ucg_builtin_init_block_buffers(block_cnt, total_group_process_cnt, + total_group_cnt, &block_buffers); + if (status != UCS_OK) { + return status; + } + + unsigned ahead_group_cnt = phase->raben_extend.index_group.ahead_group_cnt; + unsigned recv_block_index = phase->raben_extend.index_group.recv_block_index; + + phase->ex_attr.start_block = ucg_builtin_calc_disp(block_buffers[ahead_group_cnt], 0, recv_block_index); + phase->ex_attr.num_blocks = block_buffers[ahead_group_cnt][recv_block_index]; + phase->ex_attr.peer_start_block = phase->ex_attr.start_block; + phase->ex_attr.peer_block = phase->ex_attr.num_blocks; + phase->ex_attr.total_num_blocks = block_cnt; + phase->ex_attr.is_partial = 1; + + ucg_builtin_destory_block_buffers(total_group_cnt, block_buffers); + return UCS_OK; +} + +STATIC_GTEST ucs_status_t 
ucg_builtin_extra_receive_bcast_cb(ucg_builtin_plan_phase_t *phase, + const ucg_collective_params_t *coll_params) +{ + ucs_assert(phase != NULL && coll_params != NULL); + + unsigned block_cnt = coll_params->send.count; + unsigned total_group_cnt = phase->raben_extend.index_group.total_group_cnt; + unsigned total_group_process_cnt = phase->raben_extend.index_group.total_group_process_cnt; + unsigned **block_buffers = NULL; + ucs_status_t status = ucg_builtin_init_block_buffers(block_cnt, total_group_process_cnt, + total_group_cnt, &block_buffers); + if (status != UCS_OK) { + return status; + } + + unsigned current_group_process_cnt = phase->raben_extend.index_group.cur_group_process_cnt; + unsigned next_group_process_cnt = phase->raben_extend.index_group.next_group_process_cnt; + unsigned next_group_idx = phase->raben_extend.index_group.ahead_group_cnt + 1; + unsigned idx = phase->raben_extend.index_group.recv_block_index * + (next_group_process_cnt / current_group_process_cnt); + + /* receive previous phase address */ + phase->ex_attr.start_block = ucg_builtin_calc_disp(block_buffers[next_group_idx], + 0, idx + phase->raben_extend.step_index); + phase->ex_attr.num_blocks = block_buffers[next_group_idx][idx + phase->raben_extend.step_index]; + phase->ex_attr.peer_start_block = phase->ex_attr.start_block; + phase->ex_attr.peer_block = phase->ex_attr.num_blocks; + phase->ex_attr.total_num_blocks = block_cnt; + phase->ex_attr.is_partial = 1; + + ucg_builtin_destory_block_buffers(total_group_cnt, block_buffers); + return UCS_OK; +} + +STATIC_GTEST ucs_status_t ucg_builtin_extra_send_bcast_cb(ucg_builtin_plan_phase_t *phase, + const ucg_collective_params_t *coll_params) +{ + ucs_assert(phase != NULL && coll_params != NULL); + + unsigned block_cnt = coll_params->send.count; + unsigned total_group_cnt = phase->raben_extend.index_group.total_group_cnt; + unsigned total_group_process_cnt = phase->raben_extend.index_group.total_group_process_cnt; + unsigned **block_buffers = NULL; + ucs_status_t status = ucg_builtin_init_block_buffers(block_cnt, total_group_process_cnt, + total_group_cnt, &block_buffers); + if (status != UCS_OK) { + return status; + } + + unsigned ahead_group_cnt = phase->raben_extend.index_group.ahead_group_cnt; + unsigned start_block = phase->raben_extend.index_group.recv_block_index; + + phase->ex_attr.start_block = ucg_builtin_calc_disp(block_buffers[ahead_group_cnt], 0, start_block); + phase->ex_attr.num_blocks = block_buffers[ahead_group_cnt][start_block]; + phase->ex_attr.peer_start_block = phase->ex_attr.start_block; + phase->ex_attr.peer_block = phase->ex_attr.num_blocks; + phase->ex_attr.total_num_blocks = block_cnt; + phase->ex_attr.is_partial = 1; + + ucg_builtin_destory_block_buffers(total_group_cnt, block_buffers); + return UCS_OK; +} + +STATIC_GTEST ucs_status_t ucg_builtin_intra_bcast(ucg_builtin_index_group_t *index_group, + ucg_builtin_plan_phase_t **phase, + ucg_builtin_plan_t *binary_block, + ucg_builtin_group_ctx_t *ctx, + ucg_step_idx_t *step_idx) +{ + ucs_status_t status = UCS_OK; + + /* receive first */ + if (index_group->next_group_process_cnt > 0) { + unsigned idx, peer_idx; + unsigned step_cnt = index_group->next_group_process_cnt / index_group->cur_group_process_cnt; + for (idx = 0; idx < step_cnt && status == UCS_OK; idx++) { + (*phase)->step_index = *step_idx + (index_group->behind_group_cnt - 1); + (*phase)->method = UCG_PLAN_METHOD_RECV_TERMINAL; + (*phase)->ep_cnt = 1; +#if ENABLE_DEBUG_DATA + (*phase)->indexes = 
UCS_ALLOC_CHECK(sizeof(ucg_group_member_index_t), "binary block indexes"); +#endif + binary_block->ep_cnt++; + binary_block->phs_cnt++; + unsigned recv_block_index; + status = ucg_builtin_get_recv_block_index(step_cnt, idx, &recv_block_index); + if (status != UCS_OK) { + return status; + } + peer_idx = index_group->next_group_begin_index + index_group->local_group_index - + index_group->cur_group_begin_index + recv_block_index * + index_group->cur_group_process_cnt; + (*phase)->raben_extend.step_index = idx; + (*phase)->raben_extend.index_group = *index_group; + (*phase)->init_phase_cb = ucg_builtin_extra_receive_bcast_cb; + ucg_group_member_index_t real_peer_index = index_group->my_index + peer_idx - + index_group->local_group_index; + status = ucg_builtin_connect(ctx, real_peer_index, *phase, UCG_BUILTIN_CONNECT_SINGLE_EP); + (*phase)++; + } + } + + /* then send */ + if (status == UCS_OK && index_group->local_peer_ahead_group != index_group->local_group_index) { + (*phase)->step_index = *step_idx + index_group->behind_group_cnt; + (*phase)->method = UCG_PLAN_METHOD_SEND_TERMINAL; + (*phase)->ep_cnt = 1; +#if ENABLE_DEBUG_DATA + (*phase)->indexes = UCS_ALLOC_CHECK(sizeof(ucg_group_member_index_t), "binary block indexes"); +#endif + binary_block->ep_cnt++; + binary_block->phs_cnt++; + + (*phase)->raben_extend.index_group = *index_group; + (*phase)->init_phase_cb = ucg_builtin_extra_send_bcast_cb; + ucg_group_member_index_t real_peer_index = index_group->my_index + + index_group->local_peer_ahead_group - + index_group->local_group_index; + status = ucg_builtin_connect(ctx, real_peer_index, *phase, UCG_BUILTIN_CONNECT_SINGLE_EP); + (*phase)++; + } + *step_idx += index_group->total_group_cnt - 1; + return status; +} + +STATIC_GTEST ucs_status_t ucg_builtin_extra_allgather_cb(ucg_builtin_plan_phase_t *phase, + const ucg_collective_params_t *coll_params) +{ + ucs_assert(phase != NULL && coll_params != NULL); + + unsigned block_cnt = coll_params->send.count; + unsigned total_group_cnt = phase->raben_extend.index_group.total_group_cnt; + unsigned total_group_process_cnt = phase->raben_extend.index_group.total_group_process_cnt; + unsigned **block_buffers = NULL; + ucs_status_t status = ucg_builtin_init_block_buffers(block_cnt, total_group_process_cnt, + total_group_cnt, &block_buffers); + if (status != UCS_OK) { + return status; + } + const unsigned factor = 2; + unsigned cur_group_begin_index = phase->raben_extend.index_group.cur_group_begin_index; + unsigned cur_group_process_cnt = phase->raben_extend.index_group.cur_group_process_cnt; + unsigned ahead_group_cnt = phase->raben_extend.index_group.ahead_group_cnt; + unsigned local_group_idx = phase->raben_extend.index_group.local_group_index - cur_group_begin_index; + unsigned step_size = (cur_group_process_cnt / factor) >> phase->raben_extend.step_index; + unsigned step_base = local_group_idx -local_group_idx % (step_size * factor); + unsigned local_group_peer = step_base + (local_group_idx - step_base + step_size) % (step_size * factor); + + static unsigned send_start_block = 0; + if (phase->raben_extend.step_index == 0) { + send_start_block = phase->raben_extend.index_group.recv_block_index; + } + + /* send && receive block */ + unsigned recv_start_block = send_start_block; + unsigned num_blocks = 1 << phase->raben_extend.step_index; + recv_start_block = (local_group_idx < local_group_peer) ? 
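+ /* allgather mirrors the reduce-scatter exchange: the lower-indexed peer receives the block run just above its own, the higher-indexed peer the run just below */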
(recv_start_block + num_blocks) : + (recv_start_block - num_blocks); + + phase->ex_attr.start_block = ucg_builtin_calc_disp(block_buffers[ahead_group_cnt], + 0, send_start_block); + phase->ex_attr.num_blocks = ucg_builtin_calc_disp(block_buffers[ahead_group_cnt], + send_start_block, num_blocks); + phase->ex_attr.peer_start_block = ucg_builtin_calc_disp(block_buffers[ahead_group_cnt], + 0, recv_start_block); + phase->ex_attr.peer_block = ucg_builtin_calc_disp(block_buffers[ahead_group_cnt], + recv_start_block, num_blocks); + phase->ex_attr.total_num_blocks = block_cnt; + phase->ex_attr.is_inequal = 1; + phase->ex_attr.is_partial = 1; + + if (local_group_idx > local_group_peer) { + send_start_block = recv_start_block; + } + + ucg_builtin_destory_block_buffers(total_group_cnt, block_buffers); + return UCS_OK; +} + +STATIC_GTEST ucs_status_t ucg_builtin_intra_allgather(ucg_builtin_index_group_t *index_group, + ucg_builtin_plan_phase_t **phase, + ucg_builtin_plan_t *binary_block, + ucg_builtin_group_ctx_t *ctx, + ucg_step_idx_t *step_idx) +{ + ucs_status_t status = UCS_OK; + + unsigned idx; + const unsigned factor = 2; + unsigned step_cnt = ucs_ilog2(index_group->cur_group_process_cnt); + unsigned step_size = index_group->cur_group_process_cnt / factor; + unsigned high = ucg_builtin_keep_highest_1_bit(index_group->total_group_process_cnt); + ucg_group_member_index_t local_group_index = index_group->local_group_index - index_group->cur_group_begin_index; + for (idx = 0; idx < step_cnt && status == UCS_OK; idx++, step_size/= factor) { + (*phase)->step_index = *step_idx + idx; + (*phase)->method = UCG_PLAN_METHOD_EXCHANGE; + (*phase)->ep_cnt = 1; +#if ENABLE_DEBUG_DATA + (*phase)->indexes = UCS_ALLOC_CHECK(sizeof(ucg_group_member_index_t), "binary block indexes"); +#endif + binary_block->ep_cnt++; + binary_block->phs_cnt++; + + unsigned step_base = local_group_index - local_group_index % (step_size *factor); + unsigned local_group_peer = step_base + (local_group_index - step_base + step_size) % + (step_size * factor); + ucg_group_member_index_t real_peer_index = index_group->my_index + local_group_peer - local_group_index; + status = ucg_builtin_connect(ctx, real_peer_index, *phase, UCG_BUILTIN_CONNECT_SINGLE_EP); + + (*phase)->raben_extend.step_index = idx; + (*phase)->raben_extend.index_group = *index_group; + (*phase)->init_phase_cb = ucg_builtin_extra_allgather_cb; + (*phase)++; + } + *step_idx += ucs_ilog2(high); + return status; +} + +STATIC_GTEST ucs_status_t ucg_builtin_binary_block_init(unsigned local_idx, + unsigned total_group_process_cnt, + ucg_builtin_plan_t *binary_block, + ucg_builtin_plan_phase_t **phase, + ucg_builtin_group_ctx_t *ctx, + ucg_step_idx_t *step_idx, + ucg_builtin_index_group_t *index_group) +{ + unsigned ahead_group_cnt, behind_group_cnt; + unsigned cur_group_process_cnt, cur_group_begin_index; + unsigned next_group_process_cnt, next_group_begin_index; + unsigned local_ahead_peer; + ucs_status_t status; + + ucg_builtin_get_binaryblocks_current_group(local_idx + 1, total_group_process_cnt, + &cur_group_process_cnt, &cur_group_begin_index); + ucg_builtin_get_binaryblocks_ahead_group_cnt(total_group_process_cnt, cur_group_begin_index, + &ahead_group_cnt); + ucg_builtin_get_binaryblocks_next_group(local_idx + 1, total_group_process_cnt, + &next_group_process_cnt, &next_group_begin_index); + ucg_builtin_get_binaryblocks_behind_group_cnt(total_group_process_cnt, next_group_begin_index, + &behind_group_cnt); + ucg_builtin_get_extra_reduction_peer_index(local_idx + 1, 
total_group_process_cnt, + &local_ahead_peer); + + index_group->my_index = binary_block->super.my_index; + index_group->cur_group_begin_index = cur_group_begin_index; + index_group->cur_group_process_cnt = cur_group_process_cnt; + index_group->next_group_begin_index = next_group_begin_index; + index_group->next_group_process_cnt = next_group_process_cnt; + index_group->total_group_process_cnt = total_group_process_cnt; + index_group->ahead_group_cnt = ahead_group_cnt; + index_group->behind_group_cnt = behind_group_cnt; + index_group->total_group_cnt = ucg_builtin_get_1bit_cnt(total_group_process_cnt); + index_group->local_group_index = local_idx; + index_group->local_peer_ahead_group = local_ahead_peer; + index_group->recv_block_index = 0; + + status = ucg_builtin_get_recv_block_index(cur_group_process_cnt, local_idx - cur_group_begin_index, + &index_group->recv_block_index); + if (status != UCS_OK) { + return status; + } + *phase = &binary_block->phss[binary_block->phs_cnt]; + *step_idx = binary_block->step_cnt; + + /* 1st part: intra_node/socket reduce-scatter, use local index */ + status = ucg_builtin_intra_reduce_scatter(index_group, phase, binary_block, ctx, step_idx); + return status; +} + +STATIC_GTEST ucs_status_t ucg_builtin_binary_block_build(ucg_builtin_plan_t *binary_block, + ucg_builtin_group_ctx_t *ctx, + const ucg_builtin_config_t *config, + ucg_builtin_topo_aware_params_t *params, + const ucg_group_member_index_t member_cnt) +{ + ucs_status_t status; + ucg_step_idx_t step_idx = 0; + unsigned total_group_process_cnt = member_cnt; + unsigned local_idx = binary_block->super.my_index; + ucg_builtin_plan_phase_t *phase = NULL; + ucg_builtin_index_group_t index_group; + + /* 1st part: intra_node/socket reduce-scatter, use local index */ + status = ucg_builtin_binary_block_init(local_idx, total_group_process_cnt, binary_block, + &phase, ctx, &step_idx, &index_group); + if (status != UCS_OK) { + ucs_error("Binary-blocks algorithm failed in intra reduce_scatter phase"); + return status; + } + binary_block->step_cnt = step_idx; + + /* 2nd part: intra_node/socket rabenseifner extra reduction */ + phase = &binary_block->phss[binary_block->phs_cnt]; + status = ucg_builtin_intra_extra_reduction(&index_group, &phase, binary_block, ctx, &step_idx); + if (status != UCS_OK) { + ucs_error("Binary-blocks algorithm failed in intra reduction phase"); + return status; + } + binary_block->step_cnt = step_idx; + + /* 5th part: intra_node broadcast */ + phase = &binary_block->phss[binary_block->phs_cnt]; + status = ucg_builtin_intra_bcast(&index_group, &phase, binary_block, ctx, &step_idx); + if (status != UCS_OK) { + ucs_error("Binary-blocks algorithm failed in intra-node broadcast phase"); + return status; + } + binary_block->step_cnt = step_idx; + + /* 6th part: intra_node allgather */ + phase = &binary_block->phss[binary_block->phs_cnt]; + status = ucg_builtin_intra_allgather(&index_group, &phase, binary_block, ctx, &step_idx); + if (status != UCS_OK) { + ucs_error("Binary-blocks algorithm failed in intra-node allgather phase"); + return status; + } + binary_block->step_cnt = step_idx; + return status; +} + +STATIC_GTEST void ucg_builtin_modify_buffer(ucg_builtin_plan_t *binary_block, + unsigned phs_start, + ucg_builtin_index_group_t *index_group) +{ + unsigned phs_end = binary_block->phs_cnt; + + ucg_builtin_plan_phase_t *local_phase = &binary_block->phss[phs_start]; + /* modify buffer */ + unsigned phs_idx; + for (phs_idx = phs_start; phs_idx < phs_end; phs_idx++, local_phase++) { + 
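+ /* retarget every phase added by the recursive build at this rank's partial block: same index_group, intra-node allreduce callback, partial-buffer flag */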
local_phase->raben_extend.index_group = *index_group; + local_phase->init_phase_cb = ucg_builtin_intra_node_allreduce_cb; + local_phase->ex_attr.is_partial = 1; + } +} + +STATIC_GTEST ucs_status_t ucg_builtin_topo_aware_binary_block_build(ucg_builtin_plan_t *binary_block, + ucg_builtin_group_ctx_t *ctx, + const ucg_builtin_config_t *config, + ucg_builtin_topo_aware_params_t *params, + const ucg_group_member_index_t member_cnt) +{ + ucs_status_t status; + + /* Ensure that process IDs within a node are continuous */ + unsigned ppn = params->topo_params->num_local_procs; + unsigned node_cnt = params->topo_params->node_cnt; + ucg_group_member_index_t* node_leaders = params->topo_params->node_leaders; + unsigned node_leaders_shift = binary_block->super.my_index - + *(params->topo_params->local_members); + + unsigned node_idx; + for (node_idx = 0; node_idx < node_cnt; ++node_idx) { + node_leaders[node_idx] += node_leaders_shift; + } + + /* Ensure that process IDs within a socket are continuous */ + unsigned pps = params->topo_params->local.socket.member_cnt; + unsigned local_socket_cnt = params->topo_params->local.socket.num; + ucg_group_member_index_t* socket_leaders = params->topo_params->local.socket.leaders; + unsigned socket_leaders_shift = binary_block->super.my_index - + *(params->topo_params->local.socket.members); + + ucg_step_idx_t step_idx = 0; + ucg_builtin_plan_phase_t *phase = NULL; + ucg_builtin_index_group_t index_group; + unsigned total_group_process_cnt = 0; + unsigned local_idx = 0; + switch (ucg_algo.topo_level) { + case UCG_GROUP_HIERARCHY_LEVEL_NODE: + total_group_process_cnt = ppn; + local_idx = node_leaders_shift; + break; + + case UCG_GROUP_HIERARCHY_LEVEL_SOCKET: + total_group_process_cnt = pps; + local_idx = socket_leaders_shift; + break; + + case UCG_GROUP_HIERARCHY_LEVEL_L3CACHE: + break; + + default: + ucs_error("The current topo level is not supported"); + break; + } + + /* 1st part: intra-node/socket reduce-scatter, use local index */ + status = ucg_builtin_binary_block_init(local_idx, total_group_process_cnt, binary_block, &phase, + ctx, &step_idx, &index_group); + if (status != UCS_OK) { + ucs_error("Binary-blocks topology-aware algorithm failed in intra reduce_scatter phase"); + return status; + } + binary_block->step_cnt = step_idx; + + /* 2nd part: intra-node/socket rabenseifner extra reduction */ + phase = &binary_block->phss[binary_block->phs_cnt]; + status = ucg_builtin_intra_extra_reduction(&index_group, &phase, binary_block, ctx, &step_idx); + if (status != UCS_OK) { + ucs_error("Binary-blocks topology-aware algorithm failed in intra reduction phase"); + return status; + } + binary_block->step_cnt = step_idx; + + if (ucg_algo.topo_level == UCG_GROUP_HIERARCHY_LEVEL_SOCKET) { + if (index_group.next_group_begin_index == total_group_process_cnt && local_socket_cnt > 1) { + /* 3rd part: intra-socket allreduce */ + /* Only the largest group in the socket participates in the recursive operation.
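+ * (a group is "largest" when its next_group_begin_index equals total_group_process_cnt, i.e. no bigger binary block follows it)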
*/ + unsigned phs_start = binary_block->phs_cnt; + status = ucg_builtin_recursive_binary_build(binary_block, params->super.ctx, config, + socket_leaders, local_socket_cnt, UCG_PLAN_BUILD_PARTIAL, + UCG_PLAN_RECURSIVE_TYPE_ALLREDUCE); + if (status != UCS_OK) { + ucs_error("Binary-blocks topology-aware algorithm failed in intra reduction phase"); + return status; + } + ucg_builtin_modify_buffer(binary_block, phs_start, &index_group); + } + /*update step idx */ + step_idx += local_socket_cnt; + binary_block->step_cnt = step_idx; + } + + /* 4th part: inter-node allreduce */ + /* Only the largest group in socket participates in the recursive operation. */ + if (index_group.next_group_begin_index == total_group_process_cnt && node_cnt > 1) { + unsigned phs_start = binary_block->phs_cnt; + status = ucg_builtin_recursive_binary_build(binary_block, params->super.ctx, config, + node_leaders, node_cnt, UCG_PLAN_BUILD_PARTIAL, + UCG_PLAN_RECURSIVE_TYPE_ALLREDUCE); + if (status != UCS_OK) { + ucs_error("Binary-blocks topology-aware algorithm failed in inter reduce phase"); + return status; + } + ucg_builtin_modify_buffer(binary_block, phs_start, &index_group); + } + /*update step idx */ + step_idx += node_cnt; + binary_block->step_cnt = step_idx; + + /* 5th part: intra-node broadcast */ + phase = &binary_block->phss[binary_block->phs_cnt]; + status = ucg_builtin_intra_bcast(&index_group, &phase, binary_block, ctx, &step_idx); + if (status != UCS_OK) { + ucs_error("Binary-blocks topology-aware algorithm failed in intra-node broadcast phase"); + return status; + } + binary_block->step_cnt = step_idx; + + /* 6th part: intra-node allgather */ + phase = &binary_block->phss[binary_block->phs_cnt]; + status = ucg_builtin_intra_allgather(&index_group, &phase, binary_block, ctx, &step_idx); + if (status != UCS_OK) { + ucs_error("Binary-blocks topology-aware algorithm failed in intra-node allgather phase"); + return status; + } + binary_block->step_cnt = step_idx; + + return status; +} + +ucs_status_t ucg_builtin_binary_block_create(ucg_builtin_group_ctx_t *ctx, + enum ucg_builtin_plan_topology_type plan_topo_type, + const ucg_builtin_config_t *config, + const ucg_group_params_t *group_params, + const ucg_collective_type_t *coll_type, + ucg_builtin_plan_t **plan_p) +{ + ucs_status_t status; + + /* Allocate worst-case memory footprint, resized down later */ + size_t alloc_size = sizeof(ucg_builtin_plan_t) + + MAX_PHASES * (sizeof(ucg_builtin_plan_phase_t) + (MAX_PEERS * sizeof(uct_ep_h))); + + ucg_builtin_plan_t *binary_block = (ucg_builtin_plan_t*)UCS_ALLOC_CHECK(alloc_size, "rabenseifner algorithm"); + + /* Initialize all variables of current plan */ + errno_t res = memset_s(binary_block, alloc_size, 0, alloc_size); + if (res != EOK) { + ucs_free(binary_block); + binary_block = NULL; + return UCS_ERR_INVALID_PARAM; + } + /** + * set my_index firstly, + * it should be set in ucg_collective_create after plan is created + */ + binary_block->super.my_index = group_params->member_index; + + /* topology information obtain from ompi layer */ + ucg_builtin_topo_params_t *topo_params = + (ucg_builtin_topo_params_t *)UCS_ALLOC_CHECK(sizeof(ucg_builtin_topo_params_t), "topo params"); + + status = ucg_builtin_query_topo(group_params, topo_params); + if (status != UCS_OK) { + ucs_error("query topo failed"); + ucs_free(topo_params); + topo_params = NULL; + return status; + } + + ucg_builtin_base_params_t base = { + .ctx = ctx, + .coll_type = coll_type, + .topo_type = plan_topo_type, + .group_params = group_params, + }; + + 
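+ /* ucg_algo.topo selects below between the flat binary-block plan (0) and the node/socket topology-aware variant (1); any other value is rejected */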
ucg_builtin_topo_aware_params_t params = { + .super = base, + .root = base.coll_type->root, + .topo_params = topo_params, + }; + + if (ucg_algo.topo == 0) { + status = ucg_builtin_binary_block_build(binary_block, params.super.ctx, config, + &params, group_params->member_count); + if (status != UCS_OK) { + ucs_error("binary blocks method failed"); + goto err; + } + } else if (ucg_algo.topo == 1) { + status = ucg_builtin_topo_aware_binary_block_build(binary_block, params.super.ctx, config, + &params, group_params->member_count); + if (status != UCS_OK) { + ucs_error("Topo-aware binary blocks method failed"); + goto err; + } + } else { + ucs_error("Invalid parameters for binary blocks method"); + status = UCS_ERR_INVALID_PARAM; + goto err; + } + + if (binary_block->phs_cnt > MAX_PHASES) { + ucs_error("Please increase the MAX_PHASES!\n"); + } + if (binary_block->ep_cnt > MAX_PEERS) { + ucs_error("Please increase the MAX_PEERS!\n"); + } + *plan_p = (ucg_builtin_plan_t*)binary_block; + return status; +err: + ucg_builtin_destroy_topo(topo_params); + return status; +} diff --git a/builtin/plan/builtin_binomial_tree.c b/builtin/plan/builtin_binomial_tree.c index 1e13720..8464143 100644 --- a/builtin/plan/builtin_binomial_tree.c +++ b/builtin/plan/builtin_binomial_tree.c @@ -1,8 +1,9 @@ /* - * Copyright (C) Huawei Technologies Co., Ltd. 2019. ALL RIGHTS RESERVED. - * See file LICENSE for terms. + * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. + * Description: Binomial-tree and K-tree algorithm */ +#include #include "builtin_plan.h" #include #include @@ -24,18 +25,6 @@ */ #define MAX_PHASES 10 /* till now, binomial tree can only support 2^MAX_PHASES process at most */ -typedef struct ucg_builtin_binomial_tree_params { - ucg_builtin_group_ctx_t *ctx; - const ucg_group_params_t *group_params; - const ucg_collective_type_t *coll_type; - enum ucg_builtin_plan_topology_type topo_type; - ucg_group_member_index_t root; - int tree_degree_inter_fanout; - int tree_degree_inter_fanin; - int tree_degree_intra_fanout; - int tree_degree_intra_fanin; -} ucg_builtin_binomial_tree_params_t; - ucs_config_field_t ucg_builtin_binomial_tree_config_table[] = { {"DEGREE_INTER_FANOUT", "8", "k-nomial tree degree for inter node with fanout process.\n", ucs_offsetof(ucg_builtin_binomial_tree_config_t, degree_inter_fanout), UCS_CONFIG_TYPE_UINT}, @@ -54,16 +43,11 @@ ucs_config_field_t ucg_builtin_binomial_tree_config_table[] = { unsigned ucg_builtin_calculate_ppx(const ucg_group_params_t *group_params, enum ucg_group_member_distance domain_distance) { - unsigned member_idx; - unsigned ppx = 0; - for (member_idx = 0; member_idx < group_params->member_count; member_idx++) { - enum ucg_group_member_distance next_distance = group_params->distance[member_idx]; - ucs_assert(next_distance < UCG_GROUP_MEMBER_DISTANCE_LAST); - if (ucs_likely(next_distance <= domain_distance)) { - ppx++; - } + if (domain_distance == UCG_GROUP_MEMBER_DISTANCE_SOCKET) { + return group_params->topo_args.pps_local; + } else { + return group_params->topo_args.ppn_local; } - return ppx; } /* @@ -75,7 +59,7 @@ enum ucg_builtin_tree_direction { UCG_PLAN_LEFT_MOST_TREE, UCG_PLAN_RIGHT_MOST_TREE }; -static ucs_status_t ucg_builtin_bmtree_algo_build_left(unsigned rank, +static void ucg_builtin_bmtree_algo_build_left(unsigned rank, unsigned root, unsigned size, ucg_group_member_index_t *up, unsigned *up_cnt, ucg_group_member_index_t *down, unsigned *down_cnt) @@ -113,10 +97,9 @@ static ucs_status_t ucg_builtin_bmtree_algo_build_left(unsigned rank, } *down_cnt = num_child; - return UCS_OK; } -static ucs_status_t
ucg_builtin_bmtree_algo_build_right(unsigned rank, +static void ucg_builtin_bmtree_algo_build_right(unsigned rank, unsigned root, unsigned size, ucg_group_member_index_t *up, unsigned *up_cnt, @@ -149,7 +132,6 @@ static ucs_status_t ucg_builtin_bmtree_algo_build_right(unsigned rank, } *down_cnt = num_child; - return UCS_OK; } @@ -177,7 +159,7 @@ static ucs_status_t ucg_builtin_get_rank(const ucg_group_member_index_t *member_ /* if and only if one myrank or root */ if (root_num != 1 || rank_num != 1) { - ucs_error("Invaild member list: has %u myself and %u root/subroot", rank_num, root_num); + ucs_error("Invalid member list: has %u myself and %u root/subroot", rank_num, root_num); return UCS_ERR_INVALID_PARAM; } @@ -200,15 +182,15 @@ ucs_status_t ucg_builtin_bmtree_algo_build(const ucg_group_member_index_t *membe return status; } - /* Notes: rank & root both correpsonds index in member_list */ + /* Notes: rank & root both corresponds index in member_list */ if (direction == UCG_PLAN_LEFT_MOST_TREE) { /* left-most Binomial Tree */ - (void)ucg_builtin_bmtree_algo_build_left(rank, root, size, up, up_cnt, down, down_cnt); + ucg_builtin_bmtree_algo_build_left(rank, root, size, up, up_cnt, down, down_cnt); } else if (direction == UCG_PLAN_RIGHT_MOST_TREE) { /* right-most Binomial Tree */ - (void)ucg_builtin_bmtree_algo_build_right(rank, root, size, up, up_cnt, down, down_cnt); + ucg_builtin_bmtree_algo_build_right(rank, root, size, up, up_cnt, down, down_cnt); } else { - ucs_error("Invaild tree direction"); + ucs_error("Invalid tree direction"); return UCS_ERR_INVALID_PARAM; } @@ -246,8 +228,7 @@ static ucs_status_t ucg_builtin_kmtree_algo_build_left(unsigned rank, while (mask < size) { if (vrank % (degree * mask)) { - up[0] = vrank / (degree * mask) * (degree * mask); - up[0] = (up[0] + root) % size; + up[0] = (vrank / (degree * mask) * (degree * mask) + root) % size; *up_cnt = 1; break; } @@ -296,8 +277,7 @@ static ucs_status_t ucg_builtin_kmtree_algo_build_right(unsigned rank, /* find parent */ while (mask < size) { if (vrank % (degree * mask)) { - up[0] = vrank / (degree * mask) * (degree * mask); - up[0] = (up[0] + root) % size; + up[0] = (vrank / (degree * mask) * (degree * mask) + root) % size; *up_cnt = 1; break; } @@ -349,15 +329,17 @@ ucs_status_t ucg_builtin_kmtree_algo_build(const ucg_group_member_index_t *membe if (direction == UCG_PLAN_LEFT_MOST_TREE) { /* leftmost k-nomial tree for fanout */ - (void)ucg_builtin_kmtree_algo_build_left(rank, root, size, degree, up, up_cnt, down, down_cnt); + status = ucg_builtin_kmtree_algo_build_left(rank, root, size, degree, up, up_cnt, down, down_cnt); } else if (direction == UCG_PLAN_RIGHT_MOST_TREE) { /* right-most k-nomial tree for fanin */ - (void)ucg_builtin_kmtree_algo_build_right(rank, root, size, degree, up, up_cnt, down, down_cnt); + status = ucg_builtin_kmtree_algo_build_right(rank, root, size, degree, up, up_cnt, down, down_cnt); } else { ucs_error("Invaild tree direction"); return UCS_ERR_INVALID_PARAM; } - + if (status != UCS_OK) { + return status; + } unsigned idx; /* convert index to real rank */ for (idx = 0; idx < *up_cnt; idx++) { @@ -378,8 +360,9 @@ ucs_status_t ucg_builtin_connect_leader(ucg_group_member_index_t my_index, unsig ucg_group_member_index_t *down_fanin, unsigned *down_fanin_cnt) { ucs_status_t status = UCS_OK; + ucs_assert(ppx > 0); if (ppx_last_level % ppx != 0) { - ucs_error("cheak ppn and ppx in last topo level"); + ucs_error("check ppn and ppx in last topo level"); return UCS_ERR_INVALID_PARAM; } @@ -508,7 
+491,7 @@ static void ucg_builtin_get_node_leaders_normal_level(ucg_group_member_index_t m } } -static ucs_status_t ucg_builtin_get_node_leaders(const uint16_t *node_index, ucg_group_member_index_t member_count, +static void ucg_builtin_get_node_leaders(const uint16_t *node_index, ucg_group_member_index_t member_count, enum ucg_group_hierarchy_level level, unsigned ppx, ucg_group_member_index_t *leaders) { @@ -517,8 +500,6 @@ static ucs_status_t ucg_builtin_get_node_leaders(const uint16_t *node_index, ucg } else { ucg_builtin_get_node_leaders_normal_level(member_count, ppx, leaders); } - - return UCS_OK; } static ucs_status_t ucg_builtin_tree_inter_fanin_connect(const ucg_builtin_binomial_tree_params_t *params, @@ -555,9 +536,9 @@ static ucs_status_t ucg_builtin_tree_inter_fanin_connect(const ucg_builtin_binom for (member_idx = down_fanin_cnt; member_idx < down_fanin_cnt + up_fanin_cnt; member_idx++) { down_fanin[member_idx] = up_fanin[member_idx - down_fanin_cnt]; } - down_fanin_cnt = down_fanin_cnt + up_fanin_cnt; + status = ucg_builtin_binomial_tree_connect_phase((*phase)++, params, tree->phs_cnt, eps, down_fanin, - down_fanin_cnt, fanin_method); + down_fanin_cnt + up_fanin_cnt, fanin_method); } return status; } @@ -600,8 +581,8 @@ static ucs_status_t ucg_builtin_tree_inter_fanout_connect(const ucg_builtin_bino for (member_idx = up_cnt; member_idx < up_cnt + down_cnt; member_idx++) { up[member_idx] = down[member_idx - up_cnt]; } - up_cnt = up_cnt + down_cnt; - status = ucg_builtin_binomial_tree_connect_phase(phase, params, tree->phs_cnt + 1, eps, up, up_cnt, + + status = ucg_builtin_binomial_tree_connect_phase(phase, params, tree->phs_cnt + 1, eps, up, up_cnt + down_cnt, fanout_method); } @@ -620,6 +601,7 @@ static ucs_status_t ucg_builtin_tree_inter_fanin_fanout_create(const ucg_builtin ucg_builtin_plan_t *tree) { ucs_status_t status = UCS_OK; + ucs_assert(ppx > 0); /* Calculate the number of binomial tree steps for inter-node only */ if (my_index % ppx == local_root && node_count > 1) { ucg_group_member_index_t up[MAX_PEERS] = { 0 }; @@ -638,7 +620,13 @@ size_t alloc_size = sizeof(ucg_group_member_index_t) * size; ucg_group_member_index_t *member_list = (ucg_group_member_index_t *)(UCS_ALLOC_CHECK(alloc_size, "member list")); - memset(member_list, 0, alloc_size); + errno_t res = memset_s(member_list, alloc_size, 0, alloc_size); + if (res != EOK) { + ucs_free(member_list); + member_list = NULL; + return UCS_ERR_INVALID_PARAM; + } + for (idx = 0; idx < size; idx++) { member_list[idx] = (params->root % ppx) + ppx * idx; } @@ -689,7 +675,6 @@ static ucs_status_t ucg_builtin_binomial_tree_inter_fanout_connect(const ucg_bui /* Receive from parents */ if (up_cnt == 1 && down_cnt == 0) { /* Connect this phase to its peers */ - ucs_assert(up_cnt == 1); /* sanity check: not multi-root */ status = ucg_builtin_binomial_tree_connect_phase(phase, params, 0, eps, up, up_cnt, fanout_method); } @@ -707,8 +692,8 @@ static ucs_status_t ucg_builtin_binomial_tree_inter_fanout_connect(const ucg_bui for (member_idx = up_cnt; member_idx < up_cnt + down_cnt; member_idx++) { up[member_idx] = down[member_idx - up_cnt]; } - up_cnt = up_cnt + down_cnt; - status = ucg_builtin_binomial_tree_connect_phase(phase, params, 0, eps, up, up_cnt, fanout_method); + + status = ucg_builtin_binomial_tree_connect_phase(phase, params, 0, eps, up, up_cnt + down_cnt, fanout_method); } return status; } @@ -728,7 +713,13 @@ static ucs_status_t ucg_builtin_prepare_inter_fanout_member_idx(const
@@ -728,7 +713,11 @@ static ucs_status_t ucg_builtin_prepare_inter_fanout_member_idx(const ucg_builti size_t alloc_size = sizeof(ucg_group_member_index_t) * size; ucg_group_member_index_t *member_list = (ucg_group_member_index_t *)(UCS_ALLOC_CHECK(alloc_size, "member list")); - memset(member_list, 0, alloc_size); + errno_t res = memset_s(member_list, alloc_size, 0, alloc_size); + if (res != EOK) { + return UCS_ERR_INVALID_PARAM; + } + for (idx = 0; idx < size; idx++) { if (is_use_topo_info) { member_list[idx] = topo_params->subroot_array[idx]; @@ -845,14 +834,14 @@ static ucs_status_t ucg_builtin_binomial_tree_inter_create(enum ucg_builtin_plan unsigned *step_inc_cnt) { ucs_status_t status = UCS_OK; - unsigned factor = 2; + const unsigned factor = 2; switch (topo_type) { case UCG_PLAN_RECURSIVE: if (is_subroot && node_count > 1) { unsigned phs_cnt = tree->phs_cnt; ucg_group_member_index_t *node_leaders = UCS_ALLOC_CHECK(node_count * sizeof(ucg_group_member_index_t), "recursive ranks"); - (void)ucg_builtin_get_node_leaders(params->group_params->node_index, + ucg_builtin_get_node_leaders(params->group_params->node_index, params->group_params->member_count, ucg_algo.topo_level, ppx, node_leaders); ucg_builtin_recursive_connect(params->ctx, my_index, node_leaders, node_count, factor, 0, tree); @@ -862,7 +851,7 @@ static ucs_status_t ucg_builtin_binomial_tree_inter_create(enum ucg_builtin_plan } else { *phs_inc_cnt = 0; } - (void)ucg_builtin_recursive_compute_steps(my_index_local, node_count, factor, step_inc_cnt); + ucg_builtin_recursive_compute_steps(my_index_local, node_count, factor, step_inc_cnt); ucs_debug("phase inc: %d step inc: %d", *phs_inc_cnt, *step_inc_cnt); break; case UCG_PLAN_TREE_FANIN_FANOUT: /* for inter allreduce, another choice is reduce+bcast with k-nominal tree */ @@ -916,13 +905,14 @@ static ucs_status_t ucg_builtin_binomial_tree_add_inter( enum ucg_collective_modifiers mod = params->coll_type->modifiers; unsigned node_idx; + unsigned member_idx; unsigned is_subroot = 0; - /* node-aware: using subroot_array and node_cnt to support unblance ppn case */ + /* node-aware: using subroot_array and node_cnt to support the unbalanced ppn case */ size_t alloc_size = sizeof(ucg_group_member_index_t) * topo_params->node_cnt; ucg_group_member_index_t *subroot_array = (ucg_group_member_index_t *)UCS_ALLOC_CHECK(alloc_size, "subroot array"); - for (unsigned member_idx = 0; member_idx < topo_params->node_cnt; member_idx++) { + for (member_idx = 0; member_idx < topo_params->node_cnt; member_idx++) { subroot_array[member_idx] = topo_params->subroot_array[member_idx]; } @@ -944,8 +934,8 @@ static ucs_status_t ucg_builtin_binomial_tree_add_inter( } if (node_count == 1) { - phs_inc_cnt = 0; - step_inc_cnt = 0; + *phs_inc_cnt = 0; + *step_inc_cnt = 0; ucs_free(subroot_array); subroot_array = NULL; return UCS_OK; @@ -1087,12 +1077,22 @@ static ucs_status_t ucg_builtin_binomial_tree_connect_fanin_fanout(ucg_builtin_p if (params->topo_type == UCG_PLAN_TREE_FANIN_FANOUT) { inter_node_topo_type = (ucg_algo.kmtree == 1) ? UCG_PLAN_TREE_FANIN_FANOUT : UCG_PLAN_RECURSIVE; +#if ENABLE_UCG_HICOLL + if (ucg_algo.inc && inc_used(params->group_params)) { + inter_node_topo_type = UCG_PLAN_INC; + } +#endif /* For fanin-fanout (e.g.
allreduce) - copy existing connections */ /* recursive or k-nomial tree for inter-nodes */ /* especially for k-nomial tree, socket-aware algorithm (topo_level) ppx should be replaced by real ppn */ if (inter_node_topo_type == UCG_PLAN_RECURSIVE && ucg_algo.topo_level == UCG_GROUP_HIERARCHY_LEVEL_L3CACHE) { status = ucg_builtin_binomial_tree_add_inter(tree, &tree->phss[(ppx > 1) ? 1 : 0], params, eps, inter_node_topo_type, &phs_inc_cnt, &step_inc_cnt, (ppx != 1) ? pps : ppx, topo_params); + } else if (inter_node_topo_type == UCG_PLAN_INC) { +#if ENABLE_UCG_HICOLL + status = ucg_builtin_add_inc(tree, &tree->phss[ppx > 1 ? 1 : 0], params, eps, &phs_inc_cnt, &step_inc_cnt, + (ucg_algo.kmtree == 1 && ucg_algo.topo_level && ppx != 1) ? ppx * SPN : ppx, ucg_algo.topo_level); +#endif } else { status = ucg_builtin_binomial_tree_add_inter(tree, &tree->phss[(ppx > 1) ? 1 : 0], params, eps, inter_node_topo_type, &phs_inc_cnt, &step_inc_cnt, @@ -1132,9 +1132,21 @@ static ucs_status_t ucg_builtin_topo_tree_connect_fanout(ucg_builtin_plan_t *tre unsigned step_inc_cnt = 0; enum ucg_builtin_plan_topology_type inter_node_topo_type; inter_node_topo_type = UCG_PLAN_TREE_FANOUT; - /* binomial tree for inter-nodes */ +#if ENABLE_UCG_HICOLL + if (ucg_algo.inc && inc_used(params->group_params)) { + inter_node_topo_type = UCG_PLAN_INC; + status = ucg_builtin_add_inc(tree, &tree->phss[0], params, eps, &phs_inc_cnt, &step_inc_cnt, ppx, + ucg_algo.topo_level); + } else { + /* binomial tree for inter-nodes */ + status = ucg_builtin_binomial_tree_add_inter(tree, &tree->phss[0], params, eps, inter_node_topo_type, + &phs_inc_cnt, &step_inc_cnt, ppx, topo_params); + } +#else status = ucg_builtin_binomial_tree_add_inter(tree, &tree->phss[0], params, eps, inter_node_topo_type, - &phs_inc_cnt, &step_inc_cnt, ppx, topo_params); + &phs_inc_cnt, &step_inc_cnt, ppx, topo_params); +#endif + if (status != UCS_OK) { return status; } @@ -1150,7 +1162,7 @@ static ucs_status_t ucg_builtin_topo_tree_connect_fanout(ucg_builtin_plan_t *tre return status; } -static ucs_status_t ucg_builtin_non_topo_tree_connect_fanout(ucg_builtin_plan_t *tree, +STATIC_GTEST ucs_status_t ucg_builtin_non_topo_tree_connect_fanout(ucg_builtin_plan_t *tree, const ucg_builtin_binomial_tree_params_t *params, ucg_group_member_index_t *up, unsigned up_cnt, @@ -1336,6 +1348,14 @@ static ucs_status_t ucg_builtin_kinomial_tree_build_intra(const ucg_builtin_bino up_fanin, up_fanin_cnt, down_fanin, down_fanin_cnt); } + if (ucg_builtin_need_calate_position(params->coll_type, *up_fanin_cnt, params->ctx, UCG_PLAN_TREE_FANIN)) { + int degree = ((params->group_params->node_index[rank] == params->group_params->node_index[up_fanin[0]]) + ? params->tree_degree_intra_fanin : params->tree_degree_inter_fanin); + short upOffset = ucg_get_tree_buffer_pos(rank, up_fanin[0], root, *ppx, degree, member_list); + tree->super.up_offset = ((upOffset == -1) ? 
*down_fanin_cnt : upOffset); + ucs_debug("degree:%d, myrank:%lu, uprank:%lu, down_fanin_cnt:%u, offset:%d, root:%u, size:%u", + degree, rank, up_fanin[0], tree->super.up_offset, *down_fanin_cnt, root, *ppx); + } return status; } @@ -1366,7 +1386,8 @@ ucs_status_t ucg_builtin_bmtree_algo_build_fanin_fanout(const ucg_group_member_i return status; } -static ucs_status_t ucg_builtin_binomial_tree_build_intra(ucg_group_member_index_t *member_list, +static ucs_status_t ucg_builtin_binomial_tree_build_intra(const ucg_builtin_binomial_tree_params_t *params, + ucg_group_member_index_t *member_list, unsigned root, ucg_group_member_index_t rank, ucg_group_member_index_t *up, @@ -1402,6 +1423,14 @@ static ucs_status_t ucg_builtin_binomial_tree_build_intra(ucg_group_member_index status = ucg_builtin_connect_leader(tree->super.my_index, *ppx, *pps, up, up_cnt, down, down_cnt, up_fanin, up_fanin_cnt, down_fanin, down_fanin_cnt); } + + if (ucg_builtin_need_calate_position(params->coll_type, *up_fanin_cnt, params->ctx, UCG_PLAN_TREE_FANIN)) { + int degree = ((params->group_params->node_index[rank] == params->group_params->node_index[up_fanin[0]]) + ? params->tree_degree_intra_fanin : params->tree_degree_inter_fanin); + tree->super.up_offset = ucg_get_tree_buffer_pos(rank, up_fanin[0], root, *ppx, degree, member_list); + ucs_debug("degree:%d, myrank:%lu, uprank:%lu, offset:%d, root:%u, size:%u", + degree, rank, up_fanin[0], tree->super.up_offset, root, *ppx); + } return status; } @@ -1432,7 +1461,8 @@ static void ucg_builtin_prepare_member_idx(const ucg_builtin_binomial_tree_param k = 0; ucg_group_member_index_t member_idx; for (member_idx = 0; member_idx < params->group_params->member_count; member_idx++) { - if (ucs_likely(params->group_params->distance[member_idx] <= domain_distance)) { + if (ucg_builtin_get_distance(params->group_params, params->group_params->member_index, + member_idx) <= domain_distance) { member_list[k++] = member_idx; } } @@ -1468,28 +1498,15 @@ static ucs_status_t ucg_builtin_topo_tree_build(const ucg_builtin_binomial_tree_ /* Socket-aware algorithm: - special case 1: socket unbalance case in socket-aware algorithm; ( ppn = 2 * pps or ppn = pps is OK for socket-aware) + special case 1: socket unbalance case in socket-aware algorithm; + ( ppn = 2 * pps or ppn = pps is OK for socket-aware) special case 2: allreduce algo(8) ppx = 1 or pps = ppn; solution: change topo-aware level: socket -> node. */ - /* case 1 */ - if (ucg_algo.topo_level == UCG_GROUP_HIERARCHY_LEVEL_SOCKET) { - if (!params->group_params->is_socket_balance) { - ucs_info("Warning: process number in every socket must be same in socket-aware algorithm, please make sure ppn " - "must be even and '--map-by socket' included. 
Switch to corresponding node-aware algorithm already."); - ucg_algo.topo_level = UCG_GROUP_HIERARCHY_LEVEL_NODE; - status = choose_distance_from_topo_aware_level(&domain_distance); - if (status != UCS_OK) { - return status; - } - *ppx = *ppn; - } - } - /* case 2 */ if (ucg_algo.topo_level == UCG_GROUP_HIERARCHY_LEVEL_SOCKET && ucg_algo.kmtree && (*ppx == 1 || *pps == *ppn)) { ucg_algo.topo_level = UCG_GROUP_HIERARCHY_LEVEL_NODE; - status = choose_distance_from_topo_aware_level(&domain_distance); + choose_distance_from_topo_aware_level(&domain_distance); *ppx = *ppn; } @@ -1500,7 +1517,10 @@ static ucs_status_t ucg_builtin_topo_tree_build(const ucg_builtin_binomial_tree_ /* construct member list when topo_aware */ size_t alloc_size = sizeof(ucg_group_member_index_t) * (*ppx); ucg_group_member_index_t *member_list = (ucg_group_member_index_t *)(UCS_ALLOC_CHECK(alloc_size, "member list")); - memset(member_list, 0, alloc_size); + errno_t res = memset_s(member_list, alloc_size, 0, alloc_size); + if (res != EOK) { + return UCS_ERR_INVALID_PARAM; + } ucg_builtin_prepare_member_idx(params, topo_params, domain_distance, ppx, rank, member_list); @@ -1510,7 +1530,7 @@ static ucs_status_t ucg_builtin_topo_tree_build(const ucg_builtin_binomial_tree_ up_cnt, down, down_cnt, up_fanin, up_fanin_cnt, down_fanin, down_fanin_cnt, ppx, ppn, tree); } else { - status = ucg_builtin_binomial_tree_build_intra(member_list, root, rank, up, + status = ucg_builtin_binomial_tree_build_intra(params, member_list, root, rank, up, up_cnt, down, down_cnt, up_fanin, up_fanin_cnt, down_fanin, down_fanin_cnt, ppx, pps, ppn, tree); } @@ -1546,7 +1566,8 @@ static ucs_status_t ucg_builtin_binomial_tree_algo_build(ucg_group_member_index_ return status; } - status = ucg_builtin_bmtree_algo_build(member_list, size, rank, root, UCG_PLAN_RIGHT_MOST_TREE, up_fanin, up_fanin_cnt, down_fanin, down_fanin_cnt); + status = ucg_builtin_bmtree_algo_build(member_list, size, rank, root, UCG_PLAN_RIGHT_MOST_TREE, up_fanin, + up_fanin_cnt, down_fanin, down_fanin_cnt); ucs_free(member_list); member_list = NULL; return status; @@ -1577,7 +1598,7 @@ static ucs_status_t ucg_builtin_tree_build(const ucg_builtin_binomial_tree_param /* socket-aware: ppx = pps (processes per socket) */ /* L3cache-aware: ppx = ppl (processes per L3cache) */ enum ucg_group_member_distance domain_distance = UCG_GROUP_MEMBER_DISTANCE_HOST; - status = choose_distance_from_topo_aware_level(&domain_distance); + choose_distance_from_topo_aware_level(&domain_distance); *ppx = ucg_builtin_calculate_ppx(params->group_params, domain_distance); *ppn = ucg_builtin_calculate_ppx(params->group_params, UCG_GROUP_MEMBER_DISTANCE_HOST); *pps = ucg_builtin_calculate_ppx(params->group_params, UCG_GROUP_MEMBER_DISTANCE_SOCKET); @@ -1589,7 +1610,10 @@ static ucs_status_t ucg_builtin_tree_build(const ucg_builtin_binomial_tree_param /* create member_list for un-topo */ size_t alloc_size = sizeof(ucg_group_member_index_t) * size; ucg_group_member_index_t *member_list = (ucg_group_member_index_t *)(UCS_ALLOC_CHECK(alloc_size, "member list")); - memset(member_list, 0, alloc_size); + errno_t res = memset_s(member_list, alloc_size, 0, alloc_size); + if (res != EOK) { + return UCS_ERR_INVALID_PARAM; + } ucg_group_member_index_t member_idx; for (member_idx = 0; member_idx < params->group_params->member_count; member_idx++) { @@ -1668,7 +1692,8 @@ static ucs_status_t ucg_builtin_binomial_tree_build(const ucg_builtin_binomial_t tree->super.my_index = rank; /* topology information obtain from ompi layer 
*/ - ucg_builtin_topology_info_params_t *topo_params = (ucg_builtin_topology_info_params_t *)UCS_ALLOC_CHECK(sizeof(ucg_builtin_topology_info_params_t), "topo params"); + ucg_builtin_topology_info_params_t *topo_params = (ucg_builtin_topology_info_params_t *)UCS_ALLOC_CHECK( + sizeof(ucg_builtin_topology_info_params_t), "topo params"); status = ucg_builtin_topology_info_create(topo_params, params->group_params, params->root); if (status != UCS_OK) { ucg_builtin_binomial_tree_free_topo_info(&topo_params); @@ -1691,13 +1716,14 @@ static ucs_status_t ucg_builtin_binomial_tree_build(const ucg_builtin_binomial_t /* fill in the tree phases while establishing the connections */ status = ucg_builtin_binomial_tree_connect(tree, params, - alloc_size, up, up_cnt, down, down_cnt, up_fanin, up_fanin_cnt, down_fanin, down_fanin_cnt, ppx, pps, topo_params); + alloc_size, up, up_cnt, down, down_cnt, up_fanin, up_fanin_cnt, + down_fanin, down_fanin_cnt, ppx, pps, topo_params); ucg_builtin_binomial_tree_free_topo_info(&topo_params); return status; } -static UCS_F_NOINLINE void ucg_calcu_node_cnt(unsigned *node_cnt, const ucg_group_params_t *group_params) +static void ucg_calcu_node_cnt(unsigned *node_cnt, const ucg_group_params_t *group_params) { ucg_group_member_index_t member_idx; unsigned node_idx; @@ -1711,15 +1737,6 @@ static UCS_F_NOINLINE void ucg_calcu_node_cnt(unsigned *node_cnt, const ucg_grou (*node_cnt)++; } -static unsigned ucg_tree_degree_set(unsigned param, unsigned config) -{ - if (param > config) { - return param; - } else { - return config; - } -} - ucs_status_t ucg_builtin_binomial_tree_create(ucg_builtin_group_ctx_t *ctx, enum ucg_builtin_plan_topology_type plan_topo_type, const ucg_builtin_config_t *config, @@ -1731,7 +1748,10 @@ ucs_status_t ucg_builtin_binomial_tree_create(ucg_builtin_group_ctx_t *ctx, size_t alloc_size = sizeof(ucg_builtin_plan_t) + MAX_PHASES * sizeof(ucg_builtin_plan_phase_t) + MAX_PEERS * sizeof(uct_ep_h); ucg_builtin_plan_t *tree = (ucg_builtin_plan_t*)UCS_ALLOC_CHECK(alloc_size, "tree topology"); - memset(tree, 0, alloc_size); + errno_t res = memset_s(tree, alloc_size, 0, alloc_size); + if (res != EOK) { + return UCS_ERR_INVALID_PARAM; + } tree->phs_cnt = 0; /* will be incremented with usage */ unsigned node_cnt = 0; @@ -1745,8 +1765,8 @@ ucs_status_t ucg_builtin_binomial_tree_create(ucg_builtin_group_ctx_t *ctx, .topo_type = plan_topo_type, .group_params = group_params, .root = coll_type->root, - .tree_degree_inter_fanout = ucg_tree_degree_set(node_cnt - 1, config->bmtree.degree_inter_fanout), - .tree_degree_inter_fanin = ucg_tree_degree_set(node_cnt - 1, config->bmtree.degree_inter_fanin), + .tree_degree_inter_fanout = config->bmtree.degree_inter_fanout, + .tree_degree_inter_fanin = config->bmtree.degree_inter_fanin, .tree_degree_intra_fanout = config->bmtree.degree_intra_fanout, .tree_degree_intra_fanin = config->bmtree.degree_intra_fanin }; @@ -1762,7 +1782,5 @@ ucs_status_t ucg_builtin_binomial_tree_create(ucg_builtin_group_ctx_t *ctx, /* Reduce the allocation size according to actual usage */ *plan_p = tree; ucs_assert(*plan_p != NULL); /* only reduces size - should never fail */ - (*plan_p)->super.support_non_commutative = 0; - (*plan_p)->super.support_large_datatype = 0; return UCS_OK; -} \ No newline at end of file +}
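ucg_builtin_binomial_tree_create() above, like the recursive builders later in this patch, carves the plan header, a fixed-size phase array, and the endpoint handles out of a single allocation. A simplified sketch of that layout with stand-in types (the demo_* names are illustrative); the next_ep expression matches the one ucg_builtin_recursive_binary_build() uses further down:

#include <stdio.h>
#include <stdlib.h>

enum { MAX_PHASES = 16, MAX_PEERS = 100 };

/* simplified stand-ins for the UCG types, just to show the layout */
typedef void *uct_ep_h;
typedef struct { int dummy; } demo_phase_t;
typedef struct demo_plan {
    unsigned     phs_cnt;
    unsigned     ep_cnt;
    demo_phase_t phss[];   /* MAX_PHASES phases, then MAX_PEERS endpoints */
} demo_plan_t;

int main(void)
{
    /* one allocation covers header + phases + endpoint handles, exactly
     * like the alloc_size computation in the hunk above */
    size_t alloc_size = sizeof(demo_plan_t) +
                        MAX_PHASES * sizeof(demo_phase_t) +
                        MAX_PEERS * sizeof(uct_ep_h);
    demo_plan_t *plan = calloc(1, alloc_size);
    if (plan == NULL) {
        return 1;
    }

    /* the endpoint region starts right past the fixed-size phase array;
     * next_ep shifts forward as ep_cnt grows */
    uct_ep_h *next_ep = (uct_ep_h *)&plan->phss[MAX_PHASES] + plan->ep_cnt;

    printf("plan=%p first free ep slot=%p\n", (void *)plan, (void *)next_ep);
    free(plan);
    return 0;
}

Keeping phases and endpoints in one buffer lets the builders hand out phase and endpoint slots with plain pointer arithmetic and free the whole plan in one call.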
diff --git a/builtin/plan/builtin_plan.h b/builtin/plan/builtin_plan.h index befe696..423d0e7 100644 --- a/builtin/plan/builtin_plan.h +++ b/builtin/plan/builtin_plan.h @@ -1,5 +1,5 @@ /* - * Copyright (C) Huawei Technologies Co., Ltd. 2019. ALL RIGHTS RESERVED. + * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ @@ -9,11 +9,25 @@ #include #include #include +#include +#include + +BEGIN_C_DECLS + +#ifndef ENABLE_GTEST + #define STATIC_GTEST static +#else + #define STATIC_GTEST +#endif + +#ifndef MPI_IN_PLACE +#define MPI_IN_PLACE ((void *)0x1) +#endif enum UCS_S_PACKED ucg_builtin_algorithm_feature { UCG_ALGORITHM_SUPPORT_COMMON_FEATURE = UCS_BIT(0), /* support common feature */ UCG_ALGORITHM_SUPPORT_UNBALANCE_PPN = UCS_BIT(1), /* support unbalanced ppn */ - UCG_ALGORITHM_SUPPORT_DISCONTINOUS_RANK = UCS_BIT(2), /* suport discontinuous rank */ + UCG_ALGORITHM_SUPPORT_DISCONTINUOUS_RANK = UCS_BIT(2), /* support discontinuous rank */ UCG_ALGORITHM_SUPPORT_RANK_FEATURE = (UCS_BIT(1) | UCS_BIT(2)), /* support discontinuous rank and unbalanced ppn */ UCG_ALGORITHM_SUPPORT_NON_COMMUTATIVE_OPS = UCS_BIT(3), /* support non-commutative operation (e.g. matrix muliplication) */ UCG_ALGORITHM_SUPPORT_LARGE_DATATYPE = UCS_BIT(4), /* support large datatype */ @@ -21,39 +35,32 @@ enum UCS_S_PACKED ucg_builtin_algorithm_feature { UCG_ALGORITHM_SUPPORT_BIND_TO_NONE = UCS_BIT(5), /* suport bind-to none */ }; -/************** Algorithm selection related varibales **************/ -struct ucg_builtin_algorithm { - unsigned bmtree; /* bmtree 0: builtin tree 1: binomial tree */ - unsigned kmtree; /* kmtree for inter communication 0: buildin tree 1: k-momial tree */ - unsigned kmtree_intra; /* kmtree for intra communication 0: buildin tree 1: k-momial tree */ - unsigned recursive; /* recursive 0: recursive 1: topo-aware recursive */ - unsigned bruck; /* recursive 0: recursive 1: allgather bruck */ - unsigned topo; /* topo 0: standard tree 1: topo-aware tree */ - enum ucg_group_hierarchy_level topo_level; +/************** Algorithm selection related variables **************/ +typedef struct ucg_builtin_algorithm { + uint16_t bmtree : 1; /* bmtree 0: builtin tree 1: binomial tree */ + uint16_t kmtree : 1; /* kmtree for inter communication 0: builtin tree 1: k-nomial tree */ + uint16_t kmtree_intra : 1; /* kmtree for intra communication 0: builtin tree 1: k-nomial tree */ + uint16_t recursive : 1; /* recursive 0: recursive 1: topo-aware recursive */ + uint16_t bruck : 1; /* recursive 0: recursive 1: allgather bruck */ + uint16_t ring : 1; /* ring 0: recursive 1: ring */ + uint16_t NAP : 1; /* NAP 0: recursive 1: Node Aware Parallel */ + uint16_t pipeline : 1; /* pipeline 0: normal send 1: pipelining send for waypoint */ + uint16_t inc : 1; /* inc 0: normal algo 1: in-network computing */ + uint16_t binary_block : 1; /* binary block 0: false 1: yes */ + uint16_t ladd : 1; /* ladd 0: false 1: yes */ + uint16_t plummer : 1; /* plummer 0: false 1: yes */ + uint16_t topo : 1; /* topo 0: standard tree 1: topo-aware tree */ /* topo_level = */ /* UCG_GROUP_HIERARCHY_LEVEL_NODE: node-aware */ /* UCG_GROUP_HIERARCHY_LEVEL_SOCKET: socket-aware */ /* UCG_GROUP_HIERARCHY_LEVEL_L3CACHE: L3cache-aware */ - unsigned ring; /* ring 0: recursive 1: ring */ - unsigned pipeline; /* pipeline 0: normal send 1: pipelining send for waypoint */ + uint16_t topo_level : 2; /**/ + uint16_t reserved : 2; /**/ uint8_t feature_flag; /* @ref enum ucg_builtin_algorithm_feature */ -}; +} ucg_builtin_algo_t; extern struct ucg_builtin_algorithm ucg_algo; -enum choose_ops_mask { - OPS_AUTO_DECISION, - OPS_BCAST, - OPS_ALLREDUCE, - OPS_BARRIER -}; -
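The STATIC_GTEST macro introduced above keeps internal functions file-local in release builds while giving them external linkage when the tree is compiled with -DENABLE_GTEST, so the gtest binary can call them directly (a later hunk in this patch applies it to ucg_builtin_non_topo_tree_connect_fanout). A small sketch of the pattern; the demo function and the test-side declaration are illustrative, not from the patch:

#ifndef ENABLE_GTEST
#define STATIC_GTEST static
#else
#define STATIC_GTEST
#endif

/* in a library .c file: file-local in release builds, externally
 * visible when built with -DENABLE_GTEST */
STATIC_GTEST int demo_pick_factor(int member_cnt, int config_factor)
{
    return (member_cnt < config_factor) ? member_cnt : config_factor;
}

#ifdef ENABLE_GTEST
/* in the unit-test binary the symbol can simply be re-declared and
 * exercised without exposing it in the public headers */
extern int demo_pick_factor(int member_cnt, int config_factor);
#endif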
-enum ucg_change_algo { - NONE_CASE = 0, - UNSUPPORT_CASE = 1, - NONCOMMUTATIVE_LARGEDATA_CASE = 2, -}; - /************** Algorithm selection related varibales **************/ enum ucg_builtin_plan_topology_type { UCG_PLAN_RECURSIVE, @@ -61,10 +68,39 @@ enum ucg_builtin_plan_topology_type { UCG_PLAN_TREE_FANOUT, UCG_PLAN_TREE_FANIN_FANOUT, UCG_PLAN_ALLTOALL_AGGREGATION, - UCG_PLAN_ALLTOALL_BRCUK, + UCG_PLAN_ALLTOALL_BRUCK, UCG_PLAN_BRUCK, - UCG_PLAN_LAST, UCG_PLAN_RING, + UCG_PLAN_INC, + UCG_PLAN_BMTREE, + UCG_PLAN_KMTREE, + UCG_PLAN_NAP, + UCG_PLAN_BINARY_BLOCK, + UCG_PLAN_ALLTOALLV_LADD, + UCG_PLAN_ALLTOALLV_PLUMMER, + UCG_PLAN_LAST +}; + +enum ucg_builtin_plan_recursive_type { + UCG_PLAN_RECURSIVE_TYPE_ALLREDUCE }; + +/* connection pattern for different collective operations */ +enum ucg_builtin_plan_connect_pattern { + UCG_PLAN_PATTERN_ONE_TO_MANY = UCS_BIT(0), + UCG_PLAN_PATTERN_MANY_TO_ONE = UCS_BIT(1), + UCG_PLAN_PATTERN_MANY_TO_MANY = UCS_BIT(2), + UCG_PLAN_PATTERN_COLLECT = UCS_BIT(3), + UCG_PLAN_PATTERN_DISTRIBUTE = UCS_BIT(4), +}; + +/* plan build type + * FULL: all members will participate in plan creation + * PARTIAL: only partial members will create the plan (e.g. topo-aware algorithm) + */ +enum ucg_builtin_plan_build_type { + UCG_PLAN_BUILD_FULL, + UCG_PLAN_BUILD_PARTIAL }; enum UCS_S_PACKED ucg_builtin_plan_method_type { @@ -77,6 +113,7 @@ enum UCS_S_PACKED ucg_builtin_plan_method_type { UCG_PLAN_METHOD_REDUCE_TERMINAL, /* receive and reduce from each peer */ UCG_PLAN_METHOD_REDUCE_WAYPOINT, /* receive, reduce, and pass onwards */ UCG_PLAN_METHOD_REDUCE_RECURSIVE, /* send+receive and reduce (RD) */ + UCG_PLAN_METHOD_REDUCE_SCATTER_RECURSIVE, /* send + receive in half and reduce (RH) */ UCG_PLAN_METHOD_NEIGHBOR, /* "halo exchange", for neighborhood ops */ UCG_PLAN_METHOD_ALLGATHER_BRUCK, /* send+receive for allgather (BRUCK) */ @@ -84,6 +121,14 @@ enum UCS_S_PACKED ucg_builtin_plan_method_type { UCG_PLAN_METHOD_ALLTOALL_BRUCK, /* send+receive for alltoall (BRUCK) */ UCG_PLAN_METHOD_REDUCE_SCATTER_RING, UCG_PLAN_METHOD_ALLGATHER_RING, + UCG_PLAN_METHOD_INC, + UCG_PLAN_METHOD_EXCHANGE, /* exchange messages between peers */ + UCG_PLAN_METHOD_ALLTOALLV_LADD, /* alltoallv for ladd */ + UCG_PLAN_METHOD_SEND_V_TERMINAL, /* send operation for gatherv */ + UCG_PLAN_METHOD_RECV_V_TERMINAL, /* recv operation for scatter */ + UCG_PLAN_METHOD_SCATTER_V_TERMINAL,/* scatterv operation for fanout */ + UCG_PLAN_METHOD_GATHER_V_TERMINAL, /* gatherv operation for fanin */ + UCG_PLAN_METHOD_ALLTOALLV_PLUMMER, /* inter node alltoallv for plummer */ }; enum ucg_builtin_bcast_algorithm { @@ -92,6 +137,7 @@ enum ucg_builtin_bcast_algorithm { UCG_ALGORITHM_BCAST_NODE_AWARE_BMTREE = 2, /* Topo-aware tree (Binomial tree + Binomial tree) */ UCG_ALGORITHM_BCAST_NODE_AWARE_KMTREE_AND_BMTREE = 3, /* Topo-aware tree (K-nomial tree + Binomial tree) */ UCG_ALGORITHM_BCAST_NODE_AWARE_KMTREE = 4, /* Topo-aware tree (K-nomial tree + K-nomial tree) */ + UCG_ALGORITHM_BCAST_NODE_AWARE_INC = 5, /* Node-aware In Network Computing (INC) */ UCG_ALGORITHM_BCAST_LAST, }; @@ -105,6 +151,12 @@ enum ucg_builtin_allreduce_algorithm { UCG_ALGORITHM_ALLREDUCE_SOCKET_AWARE_RECURSIVE_AND_KMTREE = 6, /* Topo-aware Recursive (with K-nomial tree for intra node, ppn inside socket) */ UCG_ALGORITHM_ALLREDUCE_NODE_AWARE_KMTREE = 7, /* Topo-aware FANIN-FANOUT (with K-nomial tree for intra node, ppn inside node) */ UCG_ALGORITHM_ALLREDUCE_SOCKET_AWARE_KMTREE = 8, /* Topo-aware FANIN-FANOUT (with K-nomial tree for intra node, ppn inside socket) */ + UCG_ALGORITHM_ALLREDUCE_NODE_AWARE_INC = 9, +
UCG_ALGORITHM_ALLREDUCE_SOCKET_AWARE_INC = 10, + UCG_ALGORITHM_ALLREDUCE_NAP = 11, + UCG_ALGORITHM_ALLREDUCE_RABENSEIFNER_BINARY_BLOCK = 12, + UCG_ALGORITHM_ALLREDUCE_NODE_AWARE_RABENSEIFNER_BINARY_BLOCK = 13, + UCG_ALGORITHM_ALLREDUCE_SOCKET_AWARE_RABENSEIFNER_BINARY_BLOCK = 14, UCG_ALGORITHM_ALLREDUCE_LAST, }; @@ -117,9 +169,19 @@ enum ucg_builtin_barrier_algorithm { UCG_ALGORITHM_BARRIER_SOCKET_AWARE_RECURSIVE_AND_KMTREE = 5, /* Topo-aware Recursive (with K-nomial tree for intra node, ppn inside socket) */ UCG_ALGORITHM_BARRIER_NODE_AWARE_KMTREE = 6, /* Topo-aware FANIN-FANOUT (with K-nomial tree for intra node, ppn inside node) */ UCG_ALGORITHM_BARRIER_SOCKET_AWARE_KMTREE = 7, /* Topo-aware FANIN-FANOUT (with K-nomial tree for intra node, ppn inside socket) */ + UCG_ALGORITHM_BARRIER_NODE_AWARE_INC = 8, + UCG_ALGORITHM_BARRIER_SOCKET_AWARE_INC = 9, + UCG_ALGORITHM_BARRIER_NAP = 10, UCG_ALGORITHM_BARRIER_LAST, }; +enum ucg_builtin_alltoallv_algorithm { + UCG_ALGORITHM_ALLTOALLV_AUTO_DECISION = 0, + UCG_ALGORITHM_ALLTOALLV_LADD = 1, /* Throttled scattered destination */ + UCG_ALGORITHM_ALLTOALLV_NODE_AWARE_PLUMMER = 2, /* gather+alltoallv+scatterv */ + UCG_ALGORITHM_ALLTOALLV_LAST, +}; + typedef struct ucg_builtin_tl_threshold { int initialized; size_t max_short_one; /* max single short message */ @@ -130,6 +192,54 @@ typedef struct ucg_builtin_tl_threshold { size_t md_attr_cap_max_reg; } ucg_builtin_tl_threshold_t; +/* special features for some algorithms (rabenseifner, bruck), + * e.g. first rank, start position of buffer, sending buffer period + */ +typedef struct ucg_builtin_plan_extra_attr { + unsigned total_num_blocks; /* total number of blocks across all phases */ + unsigned num_blocks; /* number of blocks in the buffer for the current phase */ + unsigned start_block; /* send start block for current phase */ + unsigned recv_start_block; /* recv start block for current phase */ + unsigned peer_block; /* start block for peer */ + unsigned peer_start_block; /* peer's start block for current phase */ + unsigned is_partial; /* send or recv partially */ + unsigned not_shift_send_buffer; /* whether the send buffer should not be shifted */ + unsigned member_cnt; /* number of members for local plan */ + unsigned local_first_idx; /* first step index for the local plan */ + unsigned is_inequal; /* block layout differs between local and peer side */ + unsigned packed_rank; /* local rank information */ + unsigned is_node_leader; /* indicates whether this member is the node leader */ + unsigned is_variable_len; /* indicates whether the length is variable */ + unsigned is_plummer; /* indicates whether the plummer algorithm is used */ + unsigned ppn; /* number of processes on a node */ +} ucg_builtin_plan_extra_attr_t; +struct ucg_builtin_plan_phase; +typedef ucs_status_t (*ucg_builtin_init_phase_by_step_cb_t)(struct ucg_builtin_plan_phase *phase, + const ucg_collective_params_t *coll_params); + +/* the binary block rabenseifner algorithm requires group information */ +typedef struct ucg_builtin_index_group { + ucg_group_member_index_t my_index; + unsigned cur_group_begin_index; + unsigned cur_group_process_cnt; + unsigned next_group_begin_index; + unsigned next_group_process_cnt; + unsigned total_group_process_cnt; + unsigned ahead_group_cnt; + unsigned behind_group_cnt; + unsigned total_group_cnt; + + unsigned local_group_index; + unsigned local_peer_ahead_group; + unsigned recv_block_index; /* block index received from the ahead group */ +} ucg_builtin_index_group_t; + +typedef struct ucg_builtin_plan_raben_phase_extend { + unsigned
step_index; /* step index in each phase of algorithm */ + unsigned first_step_flag; + ucg_builtin_index_group_t index_group; +} ucg_builtin_plan_raben_phase_extend_t; + typedef struct ucg_builtin_plan_phase { /* Parameters for buffer send/recv action */ union { @@ -137,6 +247,8 @@ typedef struct ucg_builtin_plan_phase { uct_ep_h single_ep; /* single endpoint handle */ }; uint32_t ep_cnt; /* Number of endpoints (below) */ + uint32_t send_ep_cnt; /* Number of send endpoints (below) */ + uint32_t recv_ep_cnt; /* Number of recv endpoints (below) */ enum ucg_builtin_plan_method_type method; /* how to apply this map */ ucg_step_idx_ext_t step_index; /* determines step index */ @@ -146,17 +258,20 @@ typedef struct ucg_builtin_plan_phase { uct_md_h md; /* memory (registration) domain */ const uct_md_attr_t *md_attr; /* memory domain attributes */ const uct_iface_attr_t *ep_attr; /* endpoint attributes */ - + ucg_builtin_plan_extra_attr_t ex_attr; /* plan extra attributes */ /* flag for swap recv buffer and data when op is non commutative */ unsigned is_swap; int segmented; /* 1: message to receive is segmented;0: message to receive is not segmented. */ int8_t *recv_cache_buffer; /* temp buffer to receive segmented messages. */ ucp_ep_h *ucp_eps; /* ucp_ep related with this phase(used for release) */ - + /* layout for multi_eps : s s s | r r */ + ucg_builtin_tl_threshold_t *ep_thresh; /* threshold for every uct_ep */ #if ENABLE_DEBUG_DATA ucg_group_member_index_t *indexes; /* array corresponding to EPs */ #endif + ucg_builtin_init_phase_by_step_cb_t init_phase_cb; /* callback function: init phase params by step */ + ucg_builtin_plan_raben_phase_extend_t raben_extend; /* extended attribute of Rabenseifner, used by the binary block algorithm */ } ucg_builtin_plan_phase_t; typedef struct ucg_builtin_group_ctx ucg_builtin_group_ctx_t; @@ -171,11 +286,10 @@ typedef struct ucg_builtin_plan { ucg_step_idx_ext_t step_cnt; /* number of steps in the normal flow */ ucg_step_idx_ext_t ep_cnt; /* total endpoint count */ uint16_t am_id; /* active message ID */ - size_t non_power_of_two; /* number of processes is power of two or not */ + ucg_builtin_algo_t ucg_algo; dt_convert_f convert_f; /* convert datatypes */ dt_span_f dtspan_f; ucg_builtin_plan_phase_t phss[]; /* topology's phases */ -/* uct_ep_h eps[]; * logically located here */ } ucg_builtin_plan_t; #define UCG_BUILTIN_CONNECT_SINGLE_EP ((unsigned)-1) @@ -185,6 +299,20 @@ ucs_status_t ucg_builtin_connect(ucg_builtin_group_ctx_t *ctx, typedef struct ucg_builtin_config ucg_builtin_config_t; +/* NAP Algorithm related functions */ +typedef struct ucg_builtin_NAP_config { + unsigned num_NAP_group; + unsigned init_allreduce_method; + unsigned final_allreduce_method; +} ucg_builtin_NAP_config_t; +extern ucs_config_field_t ucg_builtin_NAP_config_table[]; +ucs_status_t ucg_builtin_NAP_create(ucg_builtin_group_ctx_t *ctx, + enum ucg_builtin_plan_topology_type plan_topo_type, + const ucg_builtin_config_t *config, + const ucg_group_params_t *group_params, + const ucg_collective_type_t *coll_type, + ucg_builtin_plan_t **plan_p); + typedef struct ucg_builtin_binomial_tree_config { unsigned degree_inter_fanout; unsigned degree_inter_fanin; @@ -192,6 +320,19 @@ typedef struct ucg_builtin_binomial_tree_config { unsigned degree_intra_fanin; } ucg_builtin_binomial_tree_config_t; extern ucs_config_field_t ucg_builtin_binomial_tree_config_table[]; + +typedef struct ucg_builtin_binomial_tree_params { + ucg_builtin_group_ctx_t *ctx; + const ucg_group_params_t *group_params; + const
ucg_collective_type_t *coll_type; + enum ucg_builtin_plan_topology_type topo_type; + ucg_group_member_index_t root; + int tree_degree_inter_fanout; + int tree_degree_inter_fanin; + int tree_degree_intra_fanout; + int tree_degree_intra_fanin; +} ucg_builtin_binomial_tree_params_t; + ucs_status_t ucg_builtin_binomial_tree_create(ucg_builtin_group_ctx_t *ctx, enum ucg_builtin_plan_topology_type plan_topo_type, const ucg_builtin_config_t *config, @@ -218,9 +359,72 @@ ucs_status_t ucg_builtin_recursive_connect(ucg_builtin_group_ctx_t *ctx, unsigned check_swap, ucg_builtin_plan_t *recursive); -ucs_status_t ucg_builtin_recursive_compute_steps(ucg_group_member_index_t my_index_local, +void ucg_builtin_recursive_compute_steps(ucg_group_member_index_t my_index_local, unsigned rank_count, unsigned factor, unsigned *steps); +/* Binary block Algorithm related functions */ +typedef struct ucg_builtin_binary_block_config { + unsigned inter_allreduce_method; +} ucg_builtin_binary_block_config_t; + +ucs_status_t ucg_builtin_binary_block_create(ucg_builtin_group_ctx_t *ctx, + enum ucg_builtin_plan_topology_type plan_topo_type, + const ucg_builtin_config_t *config, + const ucg_group_params_t *group_params, + const ucg_collective_type_t *coll_type, + ucg_builtin_plan_t **plan_p); + +void ucg_builtin_free(void **p); + +typedef struct throttled_scatter_params { + unsigned max_phs_cnt; + unsigned max_ep_cnt; + unsigned throttle_factor; +} throttled_scatter_params_t; + +/* Throttled Scattered Destination Algorithm related functions */ +void ucg_builtin_throttled_scatter_get_max_phase_cnt(const ucg_builtin_config_t *config, + ucg_group_member_index_t member_count, + throttled_scatter_params_t *params); + +void ucg_builtin_ladd_modify_ep_thresholds(ucg_builtin_plan_phase_t *phase, unsigned phase_ep_index); + +ucs_status_t ucg_builtin_throttled_scatter_build(ucg_builtin_group_ctx_t *ctx, + const throttled_scatter_params_t *ladd_params, + unsigned ppn, + enum ucg_builtin_plan_build_type plan_build_type, + ucg_group_member_index_t member_cnt, + const ucg_group_member_index_t *member_list, + const ucg_collective_params_t *coll_params, + ucg_builtin_plan_t *throttled_scatter, + uct_ep_h **next_ep); + +ucs_status_t ucg_builtin_throttled_scatter_create(ucg_builtin_group_ctx_t *ctx, + enum ucg_builtin_plan_topology_type plan_topo_type, + const ucg_builtin_config_t *config, + const ucg_group_params_t *group_params, + const ucg_collective_params_t *coll_params, + ucg_builtin_plan_t **plan_p); + +/* Plummer algorithm related functions */ +ucs_status_t ucg_builtin_Plummer_create(ucg_builtin_group_ctx_t *ctx, + const enum ucg_builtin_plan_topology_type plan_topo_type, + const ucg_builtin_config_t *config, + const ucg_group_params_t *group_params, + const ucg_collective_type_t *coll_type, + const ucg_collective_params_t *coll_params, + ucg_builtin_plan_t **plan_p); + +/* configuration for tree family */ +typedef struct ucg_builtin_trees_config { + unsigned inter_tree_type; + unsigned intra_tree_type; + unsigned inter_degree_fanout; + unsigned inter_degree_fanin; + unsigned intra_degree_fanout; + unsigned intra_degree_fanin; +} ucg_builtin_trees_config_t; +extern ucs_config_field_t ucg_builtin_trees_config_table[]; typedef struct ucg_builtin_bruck_config { unsigned factor; @@ -230,6 +434,22 @@ typedef struct ucg_builtin_ring_config { unsigned factor; } ucg_builtin_ring_config_t; +typedef struct ucg_inc_config { + int enable; + uint16_t comm_id_control; + uint16_t tag; + uint16_t tag_low32; + uint8_t
query_hop; + uint8_t notify_hop; + uint32_t kill_hop; + uint32_t tag_high32; + int max_data_size; + int node_under_tor; + int socket_count; + unsigned header_under_tor; + uint64_t job_id; +} ucg_inc_config_t; + ucs_status_t ucg_builtin_ring_create(ucg_builtin_group_ctx_t *ctx, enum ucg_builtin_plan_topology_type plan_topo_type, const ucg_builtin_config_t *config, @@ -244,12 +464,19 @@ ucs_status_t ucg_topo_neighbor_create(ucg_builtin_group_ctx_t *ctx, const ucg_collective_type_t *coll_type, ucg_builtin_plan_t **plan_p); +extern ucs_config_field_t ucg_inc_config_table[]; /* INC configuration table */ + struct ucg_builtin_config { ucg_plan_config_t super; ucg_builtin_binomial_tree_config_t bmtree; ucg_builtin_recursive_config_t recursive; - + ucg_builtin_trees_config_t trees; +#if ENABLE_UCG_HICOLL + ucg_inc_config_t inc; +#endif + ucg_builtin_NAP_config_t NAP; + ucg_builtin_binary_block_config_t binary_block; unsigned cache_size; size_t short_max_tx; size_t bcopy_max_tx; @@ -260,13 +487,15 @@ struct ucg_builtin_config { double bcast_algorithm; double allreduce_algorithm; double barrier_algorithm; - + double alltoallv_algorithm; unsigned pipelining; unsigned max_msg_list_size; + unsigned throttle_factor; + int reduce_consistency; /* reduce operation result consistency flag, default is 'n' */ }; -ucs_status_t choose_distance_from_topo_aware_level(enum ucg_group_member_distance *domain_distance); +void choose_distance_from_topo_aware_level(enum ucg_group_member_distance *domain_distance); /***************************** Topology information *****************************/ typedef struct ucg_builtin_topology_info_params { @@ -276,6 +505,66 @@ typedef struct ucg_builtin_topology_info_params { ucg_group_member_index_t *subroot_array; } ucg_builtin_topology_info_params_t; +/* base parameters for all plans */ +typedef struct ucg_builtin_base_params { + ucg_builtin_group_ctx_t *ctx; + const ucg_group_params_t *group_params; + const ucg_collective_type_t *coll_type; + enum ucg_builtin_plan_topology_type topo_type; +} ucg_builtin_base_params_t; + +/* Topology-Aware Algorithm related functions */ +typedef struct ucg_builtin_topo_aware_params { + ucg_builtin_base_params_t super; + ucg_group_member_index_t root; + ucg_builtin_topo_params_t *topo_params; +} ucg_builtin_topo_aware_params_t; + +ucs_status_t ucg_builtin_topo_aware_add_intra(ucg_builtin_plan_t *topo_aware, + const ucg_builtin_config_t *config, + ucg_builtin_topo_aware_params_t *params, + const ucg_group_member_index_t *member_list, + const ucg_group_member_index_t member_cnt, + enum ucg_builtin_plan_topology_type topo_type, + enum ucg_group_hierarchy_level topo_level, + enum ucg_builtin_plan_connect_pattern pattern); + +/* base plan build */ +ucs_status_t ucg_builtin_bmtree_build(ucg_builtin_plan_t *bmtree, + ucg_builtin_base_params_t *params, + const ucg_builtin_config_t *config, + const ucg_group_member_index_t *member_list, + const ucg_group_member_index_t member_cnt, + const ucg_group_member_index_t member_root, + enum ucg_builtin_plan_build_type build_type, + enum ucg_builtin_plan_connect_pattern pattern); + +ucs_status_t ucg_builtin_kmtree_build(ucg_builtin_plan_t *kmtree, + ucg_builtin_base_params_t *params, + const ucg_builtin_config_t *config, + const ucg_group_member_index_t *member_list, + const ucg_group_member_index_t member_cnt, + const ucg_group_member_index_t member_root, + const unsigned degree, + enum ucg_builtin_plan_build_type build_type, + enum ucg_builtin_plan_connect_pattern pattern); + +ucs_status_t
ucg_builtin_recursive_build(ucg_builtin_plan_t *recursive, + ucg_builtin_group_ctx_t *ctx, + const ucg_builtin_config_t *config, + const ucg_group_member_index_t *member_list, + const ucg_group_member_index_t member_cnt, + enum ucg_builtin_plan_build_type build_type, + enum ucg_builtin_plan_recursive_type recursive_type); + +ucs_status_t ucg_builtin_recursive_binary_build(ucg_builtin_plan_t *recursive, + ucg_builtin_group_ctx_t *ctx, + const ucg_builtin_config_t *config, + const ucg_group_member_index_t *member_list, + const ucg_group_member_index_t member_cnt, + enum ucg_builtin_plan_build_type build_type, + enum ucg_builtin_plan_recursive_type recursive_type); + ucs_status_t ucg_builtin_topology_info_create(ucg_builtin_topology_info_params_t *topo_params, const ucg_group_params_t *group_params, ucg_group_member_index_t root); @@ -286,52 +575,73 @@ void ucg_builtin_msg_dump(ucp_worker_h worker, uct_am_trace_type_t type, uint8_t id, const void *data, size_t length, char *buffer, size_t max); -ucs_status_t ucg_builtin_bcast_algo_switch(const enum ucg_builtin_bcast_algorithm bcast_algo_decision, struct ucg_builtin_algorithm *algo); +void ucg_builtin_bcast_algo_switch(const enum ucg_builtin_bcast_algorithm bcast_algo_decision, struct ucg_builtin_algorithm *algo); + +void ucg_builtin_barrier_algo_switch(const enum ucg_builtin_barrier_algorithm barrier_algo_decision, struct ucg_builtin_algorithm *algo); -ucs_status_t ucg_builtin_barrier_algo_switch(const enum ucg_builtin_barrier_algorithm barrier_algo_decision, struct ucg_builtin_algorithm *algo); +void ucg_builtin_allreduce_algo_switch(const enum ucg_builtin_allreduce_algorithm allreduce_algo_decision, struct ucg_builtin_algorithm *algo); -ucs_status_t ucg_builtin_allreduce_algo_switch(const enum ucg_builtin_allreduce_algorithm allreduce_algo_decision, struct ucg_builtin_algorithm *algo); +void ucg_builtin_alltoallv_algo_switch(const enum ucg_builtin_alltoallv_algorithm alltoallv_algo_decision, + struct ucg_builtin_algorithm *algo); ucs_status_t ucg_builtin_check_ppn(const ucg_group_params_t *group_params, unsigned *unequal_ppn); +ucs_status_t ucg_builtin_check_nap(const ucg_group_params_t *group_params); + ucs_status_t ucg_builtin_find_myself(const ucg_group_params_t *group_params, ucg_group_member_index_t *myrank); +enum ucg_group_member_distance ucg_builtin_get_distance(const ucg_group_params_t *group_params, + ucg_group_member_index_t rank1, + ucg_group_member_index_t rank2); + ucs_status_t ucg_builtin_check_continuous_number(const ucg_group_params_t *group_params, enum ucg_group_member_distance domain_distance, unsigned *discont_flag); enum ucg_builtin_plan_topology_type ucg_builtin_choose_type(enum ucg_collective_modifiers flags); -void ucg_builtin_plan_decision_in_discontinuous_case(const size_t msg_size, - const ucg_group_params_t *group_params, - const enum ucg_collective_modifiers modifiers, - const ucg_collective_params_t *coll_params); - -void plan_decision_fixed(const size_t msg_size, - const ucg_group_params_t *group_params, - const enum ucg_collective_modifiers modifiers, - const ucg_collective_params_t *coll_params, - const unsigned large_datatype_threshold, - const int is_unbalanced_ppn, - enum ucg_builtin_bcast_algorithm *bcast_algo_decision, - enum ucg_builtin_allreduce_algorithm *allreduce_algo_decision, - enum ucg_builtin_barrier_algorithm *barrier_algo_decision); - -enum choose_ops_mask ucg_builtin_plan_choose_ops(ucg_plan_component_t *plan_component, enum ucg_collective_modifiers ops_type_choose); - -ucs_status_t
ucg_builtin_algorithm_decision(const ucg_collective_type_t *coll_type, - const size_t msg_size, - const ucg_group_params_t *group_params, - const ucg_collective_params_t *coll_params, - ucg_plan_component_t *plan_component); - unsigned ucg_builtin_calculate_ppx(const ucg_group_params_t *group_params, enum ucg_group_member_distance domain_distance); ucs_status_t ucg_builtin_destroy_plan(ucg_builtin_plan_t *plan, ucg_group_h group); +ucs_status_t ucg_builtin_check_non_aware_Raben(const ucg_group_params_t *group_params); + +ucg_group_member_index_t ucg_builtin_get_local_index(ucg_group_member_index_t global_index, + const ucg_group_member_index_t *local_members, + ucg_group_member_index_t member_cnt); + +int ucg_is_allreduce_consistency(const ucg_builtin_group_ctx_t *ctx); + +short ucg_get_tree_buffer_pos(ucg_group_member_index_t myrank, + ucg_group_member_index_t uprank, + ucg_group_member_index_t root, + unsigned size, + unsigned degree, + const ucg_group_member_index_t *member_list); + +int ucg_builtin_need_calate_position(const ucg_collective_type_t *coll, + unsigned up_cnt, + const ucg_builtin_group_ctx_t *ctx, + enum ucg_builtin_plan_topology_type tree_topo); + +int inc_get_header_size(void); +void init_inc_params(void *ucg_group); +ucs_status_t inc_create(void *ucg_group, void *ucg_config, const void *ucg_params); +ucs_status_t inc_destroy(void *ucg_group, uint8_t fail_cause); +ucs_status_t inc_check_set_packet_para(void *group, void *params); +ucs_status_t ucg_builtin_add_inc(void *tree, void *phase, const void *params, uct_ep_h **eps, + unsigned *phs_inc_cnt, unsigned *step_inc_cnt, unsigned ppx, enum ucg_group_hierarchy_level topo_level); +void inc_send_cb(void *builtin_req); +ucs_status_t inc_comp_recv_one(void *req, uint64_t offset, const void *data, size_t length); +ucs_status_t inc_comp_recv_many(void *req, uint64_t offset, const void *data, size_t length); +size_t inc_enable(void *builtin_config); +size_t inc_available(const void *group); +size_t inc_used(const void *ucg_params); + +END_C_DECLS #endif diff --git a/builtin/plan/builtin_plan_cache.c b/builtin/plan/builtin_plan_cache.c new file mode 100644 index 0000000..36e6ef9 --- /dev/null +++ b/builtin/plan/builtin_plan_cache.c @@ -0,0 +1,137 @@ +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. + * See file LICENSE for terms.
+ */ + +#include +#include + +#include "builtin_plan.h" +#include "builtin_plan_cache.h" + +#define ROOT_NUMS 96 + +static const int cache_size[COLL_TYPE_NUMS] = { + UCG_ALGORITHM_BARRIER_LAST - 1, + (UCG_ALGORITHM_BCAST_LAST - 1) * ROOT_NUMS, + UCG_ALGORITHM_ALLREDUCE_LAST - 1, + UCG_ALGORITHM_ALLTOALLV_LAST - 1, +}; + +static inline unsigned ucg_collective_compare_basic_coll_params(const ucg_collective_params_t *left, + const ucg_collective_params_t *right) +{ + return !memcmp(left, right, sizeof(ucg_collective_params_t)); +} + +static inline unsigned ucg_collective_compare_full_coll_params(ucg_group_h group, + const ucg_collective_params_t *left, + const ucg_collective_params_t *right) +{ + ucg_group_member_index_t member_count = ucg_group_get_member_count(group); + int send_counts_len = member_count * sizeof(int); + + unsigned is_same = ucg_collective_compare_basic_coll_params(left, right) && + (!memcmp(left->send.counts, right->send.counts, send_counts_len)) && + (!memcmp(left->send.displs, right->send.displs, send_counts_len)) && + (!memcmp(left->recv.counts, right->recv.counts, send_counts_len)) && + (!memcmp(left->recv.displs, right->recv.displs, send_counts_len)); + return is_same; +} + +ucs_status_t ucg_builtin_pcache_init(ucg_group_h group) +{ + coll_type_t coll_type; + size_t alloc_size; + + for (coll_type = 0; coll_type < COLL_TYPE_NUMS; coll_type++) { + group->builtin_pcache[coll_type] = NULL; + alloc_size = sizeof(ucg_plan_t *) * cache_size[coll_type]; + group->builtin_pcache[coll_type] = (ucg_plan_t **)UCS_ALLOC_CHECK(alloc_size, "builtin_pcache"); + memset_s(group->builtin_pcache[coll_type], alloc_size, 0, alloc_size); + } + return UCS_OK; +} + +void ucg_builtin_pcache_destroy(ucg_group_h group) +{ + coll_type_t coll_type; + for (coll_type = 0; coll_type < COLL_TYPE_NUMS; coll_type++) { + if (group->builtin_pcache[coll_type]) { + ucs_free(group->builtin_pcache[coll_type]); + group->builtin_pcache[coll_type] = NULL; + } + } } + +static ucg_plan_t *ucg_builtin_alltoallv_pcache_find(const ucg_group_h group, int algo, + const ucg_collective_params_t *coll_params) +{ + coll_type_t coll_type = coll_params->coll_type; + ucg_plan_t *plan = group->builtin_pcache[coll_type][algo - 1]; + ucg_op_t *op = NULL; + + if (ucs_likely(plan != NULL)) { + if (algo == UCG_ALGORITHM_ALLTOALLV_LADD) { + ucs_list_for_each(op, &plan->op_head, list) { + if (ucg_collective_compare_full_coll_params(group, &op->params, coll_params)) { + return plan; + } + } + } else if (algo == UCG_ALGORITHM_ALLTOALLV_NODE_AWARE_PLUMMER) { + ucs_list_for_each(op, &plan->op_head, list) { + if (ucg_collective_compare_basic_coll_params(&op->params, coll_params)) { + return plan; + } + } + } + } + return NULL; +} +
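cache_size above reserves one slot per algorithm for barrier, allreduce and alltoallv, but (UCG_ALGORITHM_BCAST_LAST - 1) slots for each of ROOT_NUMS root buckets for bcast, since bcast plans are root-specific. A worked sketch of the slot arithmetic; BCAST_ALGOS stands in for UCG_ALGORITHM_BCAST_LAST - 1, which is 5 with the INC entry added earlier in this patch. It also shows why the lookup below re-checks plan->type.root, as two roots can collide modulo 96:

#include <stdio.h>

#define ROOT_NUMS   96
#define BCAST_ALGOS 5   /* stand-in for UCG_ALGORITHM_BCAST_LAST - 1 */

/* same slot arithmetic as ucg_builtin_pcache_find()/_update() for bcast */
static int bcast_slot(int root, int algo)
{
    return (root % ROOT_NUMS) * BCAST_ALGOS + algo - 1;
}

int main(void)
{
    /* roots 2 and 98 land in the same bucket (98 % 96 == 2), so the
     * cached plan's type.root must be compared before it is reused */
    printf("root=2,  algo=3 -> slot %d\n", bcast_slot(2, 3));
    printf("root=98, algo=3 -> slot %d\n", bcast_slot(98, 3));
    return 0;
}

Both calls print slot 12, which is exactly the collision the root comparison in the find path guards against.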
+ucg_plan_t *ucg_builtin_pcache_find(const ucg_group_h group, int algo, + const ucg_collective_params_t *coll_params) +{ + coll_type_t coll_type = coll_params->coll_type; + ucg_plan_t *plan = NULL; + int pos; + + switch (coll_type) { + case COLL_TYPE_BCAST: + pos = (coll_params->type.root % ROOT_NUMS) * (UCG_ALGORITHM_BCAST_LAST - 1) + algo - 1; + plan = group->builtin_pcache[coll_type][pos]; + return (plan != NULL && plan->type.root != coll_params->type.root) ? NULL : plan; + + case COLL_TYPE_ALLTOALLV: + return ucg_builtin_alltoallv_pcache_find(group, algo, coll_params); + + default: + return group->builtin_pcache[coll_type][algo - 1]; + } +} + +void ucg_builtin_pcache_update(ucg_group_h group, ucg_plan_t *plan, int algo, + const ucg_collective_params_t *coll_params) +{ + coll_type_t coll_type = coll_params->coll_type; + ucg_builtin_plan_t *builtin_plan = NULL; + ucg_plan_t *plan_old = NULL; + int pos; + + switch (coll_type) { + case COLL_TYPE_BCAST: + pos = (coll_params->type.root % ROOT_NUMS) * (UCG_ALGORITHM_BCAST_LAST - 1) + algo - 1; + plan_old = group->builtin_pcache[coll_type][pos]; + group->builtin_pcache[coll_type][pos] = plan; + break; + default: + plan_old = group->builtin_pcache[coll_type][algo - 1]; + group->builtin_pcache[coll_type][algo - 1] = plan; + break; + } + if (plan_old) { + builtin_plan = ucs_derived_of(plan_old, ucg_builtin_plan_t); + ucg_builtin_destroy_plan(builtin_plan, group); + } +} diff --git a/builtin/plan/builtin_plan_cache.h b/builtin/plan/builtin_plan_cache.h new file mode 100644 index 0000000..fd294f2 --- /dev/null +++ b/builtin/plan/builtin_plan_cache.h @@ -0,0 +1,24 @@ +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#ifndef UCG_BUILTIN_PLAN_CACHE_H +#define UCG_BUILTIN_PLAN_CACHE_H + +#include +#include + +BEGIN_C_DECLS + +ucs_status_t ucg_builtin_pcache_init(ucg_group_h group); + +void ucg_builtin_pcache_destroy(ucg_group_h group); + +ucg_plan_t *ucg_builtin_pcache_find(const ucg_group_h group, int algo, + const ucg_collective_params_t *coll_params); + +void ucg_builtin_pcache_update(ucg_group_h group, ucg_plan_t *plan, int algo, + const ucg_collective_params_t *coll_params); +END_C_DECLS +#endif \ No newline at end of file diff --git a/builtin/plan/builtin_recursive.c b/builtin/plan/builtin_recursive.c index 0f89474..366a8b9 100644 --- a/builtin/plan/builtin_recursive.c +++ b/builtin/plan/builtin_recursive.c @@ -3,16 +3,431 @@ * See file LICENSE for terms.
*/ +#include #include "builtin_plan.h" #include #include #include +#include #include #include #define MAX_PEERS 100 #define MAX_PHASES 16 #define NUM_TWO 2 +#define FACTOR 2 + +static void ucg_builtin_check_swap(unsigned factor, ucg_step_idx_t step_idx, + ucg_group_member_index_t my_index, ucg_builtin_plan_phase_t *phase) +{ + if (factor == 0) { + return; + } + /* The condition does not use peer_idx, to cover the communicator split and dup cases */ + unsigned current_scale = 1; + unsigned i; + for (i = 0; i < step_idx + 1; i++) { + current_scale *= factor; + } + if (my_index % current_scale < current_scale / factor) { + phase->is_swap = 1; + } else { + phase->is_swap = 0; + } +}
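ucg_builtin_check_swap() above decides, per step, whether a rank sits in the lower half of its current block and therefore swaps operands, which keeps the reduction order stable for non-commutative operations. A standalone sketch that reproduces the same arithmetic and prints the swap pattern for 8 ranks with factor 2 (the parameters are illustrative):

#include <stdio.h>

/* Same arithmetic as ucg_builtin_check_swap() above: at step s the group
 * splits into blocks of factor^(s+1) ranks, and the lower part of each
 * block swaps its receive buffer with the incoming data. */
static int is_swap(unsigned factor, unsigned step_idx, unsigned my_index)
{
    unsigned current_scale = 1;
    unsigned i;
    for (i = 0; i < step_idx + 1; i++) {
        current_scale *= factor;
    }
    return (my_index % current_scale) < (current_scale / factor);
}

int main(void)
{
    unsigned size = 8, factor = 2, steps = 3, s, r;
    for (s = 0; s < steps; s++) {
        printf("step %u:", s);
        for (r = 0; r < size; r++) {
            printf(" %d", is_swap(factor, s, r));
        }
        printf("\n");
    }
    return 0;
}

For factor 2 this prints 1 0 1 0 1 0 1 0 at step 0, then 1 1 0 0 1 1 0 0, then 1 1 1 1 0 0 0 0: at each step exactly half of every block keeps the left-operand role.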
+ +static ucs_status_t ucg_builtin_recursive_build_power_factor(ucg_builtin_plan_t *recursive, + ucg_builtin_group_ctx_t *ctx, + const ucg_builtin_config_t *config, + const ucg_group_member_index_t *member_list, + const ucg_group_member_index_t member_cnt, + enum ucg_builtin_plan_build_type build_type, + enum ucg_builtin_plan_recursive_type recursive_type, + ucg_builtin_plan_phase_t **phase, + uct_ep_h **next_ep, + ucg_step_idx_ext_t *step_idx, + ucg_group_member_index_t my_index, + ucg_step_idx_ext_t step_cnt, + unsigned factor, + unsigned extra_indexs) +{ + ucs_status_t status = UCS_OK; + ucg_step_idx_ext_t local_step_idx; + unsigned step_size = 1; + for (local_step_idx = 0; ((local_step_idx < step_cnt) && (status == UCS_OK)); + local_step_idx++, (*phase)++, step_size = step_size * factor) { + (*phase)->ep_cnt = factor - 1; + (*phase)->step_index = (*step_idx) + local_step_idx; + (*phase)->multi_eps = (*next_ep); +#if ENABLE_DEBUG_DATA || ENABLE_FAULT_TOLERANCE + (*phase)->indexes = UCS_ALLOC_CHECK((factor - 1) * sizeof(my_index), "recursive topology indexes"); +#endif + /* In each step, there are one or more peers */ + unsigned step_peer_idx; + unsigned step_base = my_index - (my_index % (step_size * factor)); + for (step_peer_idx = 1; ((step_peer_idx < factor) && (status == UCS_OK)); step_peer_idx++) { + ucg_group_member_index_t peer_index = step_base + ((my_index - step_base + step_size * step_peer_idx) + % (step_size * factor)); + ucs_info("%lu's peer #%u/%u (step #%u/%u): %lu ", my_index, step_peer_idx, + factor - 1, local_step_idx + 1, step_cnt, peer_index); + recursive->ep_cnt++; + + /* extra attributes */ + switch (recursive_type) { + case UCG_PLAN_RECURSIVE_TYPE_ALLREDUCE: + (*phase)->method = UCG_PLAN_METHOD_REDUCE_RECURSIVE; + /* To support non-commutative operations */ + ucg_builtin_check_swap(factor, local_step_idx, my_index, (*phase)); + break; + + default: { + ucs_error("invalid recursive type!"); + return UCS_ERR_INVALID_PARAM; + break; + } + } + + /* Calculate the real rank number */ + if (extra_indexs != 0) { + if (peer_index < extra_indexs) { + peer_index = NUM_TWO * peer_index + 1; + } else { + peer_index += extra_indexs; + } + } + if (build_type == UCG_PLAN_BUILD_PARTIAL) { + peer_index = member_list[peer_index]; + } + status = ucg_builtin_connect(ctx, peer_index, *phase, (factor != FACTOR) ? + (step_peer_idx - 1) : UCG_BUILTIN_CONNECT_SINGLE_EP); + } + } + + /* update the count of phase and step */ + if (extra_indexs == 0) { + *step_idx += step_cnt; + } + recursive->phs_cnt += step_cnt; + return status; +} + +static ucs_status_t ucg_builtin_recursive_build_non_power_factor_post(ucg_builtin_plan_t *recursive, + ucg_builtin_group_ctx_t *ctx, + const ucg_group_member_index_t *member_list, + enum ucg_builtin_plan_build_type build_type, + enum ucg_builtin_plan_recursive_type recursive_type, + uct_ep_h *next_ep, + ucg_builtin_plan_phase_t *phase, + ucg_step_idx_ext_t *step_idx, + ucg_group_member_index_t my_index, + unsigned factor) +{ + ucs_status_t status; + ucg_group_member_index_t peer_index; + unsigned is_even_rank = (my_index % NUM_TWO == 0); + switch (recursive_type) { + case UCG_PLAN_RECURSIVE_TYPE_ALLREDUCE: { + phase->method = is_even_rank ? UCG_PLAN_METHOD_RECV_TERMINAL : UCG_PLAN_METHOD_SEND_TERMINAL; + peer_index = is_even_rank ? (my_index + 1) : (my_index - 1); + break; + } + default: { + ucs_error("invalid recursive type!"); + return UCS_ERR_INVALID_PARAM; + } + } + phase->ep_cnt = factor - 1; + phase->step_index = (*step_idx); +#if ENABLE_DEBUG_DATA + phase->indexes = UCS_ALLOC_CHECK((factor - 1) * sizeof(my_index), "recursive topology indexes"); +#endif + phase->multi_eps = next_ep; + phase->is_swap = 0; + + /* Calculate the real rank number */ + if (build_type == UCG_PLAN_BUILD_PARTIAL) { + peer_index = member_list[peer_index]; + } + status = ucg_builtin_connect(ctx, peer_index, phase, UCG_BUILTIN_CONNECT_SINGLE_EP); + return status; +} + +static ucs_status_t ucg_builtin_recursive_build_non_power_factor_pre(ucg_builtin_plan_t *recursive, + ucg_builtin_group_ctx_t *ctx, + const ucg_group_member_index_t *member_list, + enum ucg_builtin_plan_build_type build_type, + enum ucg_builtin_plan_recursive_type recursive_type, + uct_ep_h *next_ep, + ucg_builtin_plan_phase_t *phase, + ucg_step_idx_ext_t *step_idx, + ucg_group_member_index_t my_index, + unsigned factor) +{ + ucs_status_t status; + ucg_group_member_index_t peer_index; + unsigned is_even_rank = (my_index % NUM_TWO == 0); + switch (recursive_type) { + case UCG_PLAN_RECURSIVE_TYPE_ALLREDUCE: { + phase->method = is_even_rank ? UCG_PLAN_METHOD_SEND_TERMINAL : UCG_PLAN_METHOD_REDUCE_TERMINAL; + peer_index = is_even_rank ?
(my_index + 1) : (my_index - 1); + break; + } + default: { + ucs_error("invalid recursive type!"); + return UCS_ERR_INVALID_PARAM; + } + } + phase->ep_cnt = factor - 1; + phase->step_index = (*step_idx); +#if ENABLE_DEBUG_DATA + phase->indexes = UCS_ALLOC_CHECK((factor - 1) * sizeof(my_index), "recursive topology indexes"); +#endif + phase->multi_eps = next_ep; + phase->is_swap = 0; + + /* Calculate the real rank number */ + if (build_type == UCG_PLAN_BUILD_PARTIAL) { + peer_index = member_list[peer_index]; + } + status = ucg_builtin_connect(ctx, peer_index, phase, UCG_BUILTIN_CONNECT_SINGLE_EP); + return status; + +} + +static ucs_status_t ucg_builtin_recursive_build_non_power_factor(ucg_builtin_plan_t *recursive, + ucg_builtin_group_ctx_t *ctx, + const ucg_builtin_config_t *config, + const ucg_group_member_index_t *member_list, + const ucg_group_member_index_t member_cnt, + enum ucg_builtin_plan_build_type build_type, + enum ucg_builtin_plan_recursive_type recursive_type, + ucg_builtin_plan_phase_t **phase, + uct_ep_h **next_ep, + ucg_step_idx_ext_t *step_idx, + ucg_group_member_index_t my_index, + ucg_step_idx_t step_cnt, + unsigned factor, + unsigned extra_indexs) +{ + ucs_status_t status = UCS_OK; + ucg_group_member_index_t new_my_index; + if (my_index < NUM_TWO * extra_indexs && my_index % NUM_TWO == 0) { + new_my_index = (ucg_group_member_index_t)-1; + } else if (my_index < NUM_TWO * extra_indexs && my_index % NUM_TWO != 0) { + new_my_index = my_index / NUM_TWO; + } else { + new_my_index = my_index - extra_indexs; + } + /* + To support non-commutative operations, such as matrix multiplication, the modified recursive doubling differs slightly: + - Even ranks less than 2 * extra_indexs have only pre- and post-processing steps; + - Odd ranks less than 2 * extra_indexs participate in all processing steps; + - Ranks no less than 2 * extra_indexs perform pure recursive doubling.
+ + An example: 0 1 2 3 4 5 + pre- 0 -> 1 2 -> 3 4 5 + RD: 1 3 4 5 + + recursive 1 <-> 3 4 <-> 5 + 1 <-> 4 3 <-> 5 + + post- 0 <- 1 2 <- 3 4 5 + */ + /* 1st: pre-processing steps for the non-power-of-two process count case */ + if (my_index < NUM_TWO * extra_indexs) { + status = ucg_builtin_recursive_build_non_power_factor_pre(recursive, ctx, member_list, build_type, + recursive_type, *next_ep, *phase, + step_idx, my_index, factor); + (*phase)++; + (*next_ep)++; + recursive->phs_cnt++; + recursive->ep_cnt++; + } + (*step_idx)++; + + /* 2nd: calculate the peers for each step */ + if (new_my_index != ((ucg_group_member_index_t)-1)) { + status = ucg_builtin_recursive_build_power_factor(recursive, ctx, config, member_list, + member_cnt, build_type, recursive_type, + phase, next_ep, step_idx, new_my_index, + step_cnt, factor, extra_indexs); + } + (*step_idx) += step_cnt; + + /* 3rd: post-processing steps for the non-power-of-two process count case */ + if (my_index < NUM_TWO * extra_indexs) { + status = ucg_builtin_recursive_build_non_power_factor_post(recursive, ctx, member_list, build_type, + recursive_type, *next_ep, *phase, + step_idx, my_index, factor); + (*phase)++; + (*next_ep)++; + recursive->phs_cnt++; + recursive->ep_cnt++; + } + (*step_idx)++; + return status; +} + +static void ucg_builtin_cal_my_index(const enum ucg_builtin_plan_build_type build_type, + ucg_group_member_index_t *my_index, + ucg_builtin_plan_t *recursive, + ucg_group_member_index_t *member_idx, + const ucg_group_member_index_t member_cnt, + const ucg_group_member_index_t *member_list) +{ + if (build_type == UCG_PLAN_BUILD_FULL) { + *my_index = recursive->super.my_index; + *member_idx = 0; + } else { + /* find my own local index */ + ucg_group_member_index_t index; + for (index = 0; index < member_cnt; index++) { + if (member_list[index] == recursive->super.my_index) { + *my_index = index; + *member_idx = index; + break; + } + } + } +}
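The comment block above walks through the 6-rank example; the mapping that makes it work is the new_my_index computation at the top of ucg_builtin_recursive_build_non_power_factor(). A standalone sketch reproducing that mapping for member_cnt = 6 and factor = 2, so step_size = 4 and extra_indexs = 2 (the names are illustrative):

#include <stdio.h>

#define INVALID (-1)

/* Same mapping as in ucg_builtin_recursive_build_non_power_factor() above:
 * even ranks below 2*extra_indexs drop out of the doubling (pre/post steps
 * only), odd ones below that bound take virtual index my_index/2, and the
 * remaining ranks shift down by extra_indexs. */
static int new_index(int my_index, int extra_indexs)
{
    if (my_index < 2 * extra_indexs && my_index % 2 == 0) {
        return INVALID;              /* even extras: pre/post steps only */
    } else if (my_index < 2 * extra_indexs) {
        return my_index / 2;         /* odd extras join the doubling */
    }
    return my_index - extra_indexs;  /* the rest shift down */
}

int main(void)
{
    int extra_indexs = 2, rank; /* 6 ranks, nearest power of two is 4 */
    for (rank = 0; rank < 6; rank++) {
        printf("rank %d -> virtual %d\n", rank, new_index(rank, extra_indexs));
    }
    return 0;
}

This prints -1, 0, -1, 1, 2, 3 for ranks 0 through 5: ranks 1, 3, 4 and 5 run the 4-way recursive doubling under virtual indices 0..3, matching the diagram above.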
step_cnt : (step_cnt + NUM_TWO); + return UCS_OK; + } + /* first phase */ + ucg_builtin_plan_phase_t *phase = &recursive->phss[recursive->phs_cnt]; + /* next_ep shifts as ep_cnt grows */ + uct_ep_h *next_ep = (uct_ep_h *)(&recursive->phss[MAX_PHASES]) + recursive->ep_cnt; + /* Record the step of the current plan */ + ucg_step_idx_ext_t step_idx = recursive->step_cnt; + + if (extra_indexs == 0) { + /* case: power-of-factor number of processes */ + status = ucg_builtin_recursive_build_power_factor(recursive, ctx, config, member_list, + member_cnt, build_type, recursive_type, &phase, + &next_ep, &step_idx, my_index, step_cnt, + factor, extra_indexs); + } else { + /* case: non-power-of-factor number of processes */ + status = ucg_builtin_recursive_build_non_power_factor(recursive, ctx, config, member_list, + member_cnt, build_type, recursive_type, &phase, + &next_ep, &step_idx, my_index, step_cnt, + factor, extra_indexs); + } + recursive->step_cnt = step_idx; + return status; +} + +ucs_status_t ucg_builtin_recursive_build(ucg_builtin_plan_t *recursive, + ucg_builtin_group_ctx_t *ctx, + const ucg_builtin_config_t *config, + const ucg_group_member_index_t *member_list, + const ucg_group_member_index_t member_cnt, + enum ucg_builtin_plan_build_type build_type, + enum ucg_builtin_plan_recursive_type recursive_type) +{ + ucs_status_t status = UCS_OK; + /* next_ep shifts as ep_cnt grows */ + uct_ep_h *next_ep = (uct_ep_h *)(&recursive->phss[MAX_PHASES]) + recursive->ep_cnt; + + /* my_index is always "local" */ + ucg_group_member_index_t my_index = 0; + ucg_group_member_index_t member_idx; + + if (ucs_popcount(member_cnt) > 1) { + ucs_error("A non-power-of-two number of processes is not supported currently!"); + return UCS_ERR_INVALID_PARAM; + } + + /* number of steps for recursive */ + unsigned step_cnt = ucs_ilog2(member_cnt); + + if (build_type == UCG_PLAN_BUILD_FULL) { + my_index = recursive->super.my_index; + } else if (build_type == UCG_PLAN_BUILD_PARTIAL) { + /* find my own local index */ + for (member_idx = 0; member_idx < member_cnt; member_idx++) { + if (member_list[member_idx] == recursive->super.my_index) { + my_index = member_idx; + break; + } + } + /* do nothing for a rank that is not in member_list */ + if (member_idx == member_cnt) { + /* step_cnt is updated while phs_cnt is not */ + recursive->step_cnt += step_cnt; + return UCS_OK; + } + } + + /* first phase */ + ucg_builtin_plan_phase_t *phase = &recursive->phss[recursive->phs_cnt]; + + ucg_step_idx_t step_idx; + unsigned step_size = 1; + unsigned factor = config->recursive.factor; + for (step_idx = 0; ((step_idx < step_cnt) && (status == UCS_OK)); + step_idx++, phase++, step_size = step_size * factor) { + phase->method = UCG_PLAN_METHOD_REDUCE_RECURSIVE; + phase->ep_cnt = factor - 1; + phase->step_index = recursive->step_cnt++; +#if ENABLE_DEBUG_DATA || ENABLE_FAULT_TOLERANCE + phase->indexes = UCS_ALLOC_CHECK((factor - 1) * sizeof(my_index), "recursive topology indexes"); +#endif + /* In each step, there are one or more peers */ + unsigned step_peer_idx; + unsigned step_base = my_index - (my_index % (step_size * factor)); + for (step_peer_idx = 1; ((step_peer_idx < factor) && (status == UCS_OK)); step_peer_idx++) { + ucg_group_member_index_t peer_index = step_base + ((my_index - step_base + step_size * step_peer_idx) + % (step_size * factor)); + ucs_info("%lu's peer #%u/%u (step #%u/%u): %lu ", my_index, step_peer_idx, + factor - 1, step_idx + 1, step_cnt, peer_index); + phase->multi_eps = next_ep++; + recursive->ep_cnt++; + + /* the
real rank number */ + if (build_type == UCG_PLAN_BUILD_PARTIAL) { + peer_index = member_list[peer_index]; + } + status = ucg_builtin_connect(ctx, peer_index, phase, (factor != NUM_TWO) ? + (step_peer_idx - 1) : UCG_BUILTIN_CONNECT_SINGLE_EP); + } + } + /* update the count of phase and step */ + recursive->phs_cnt += step_cnt; + return status; +} static ucs_status_t ucg_builtin_recursive_non_pow_two_pre(ucg_builtin_group_ctx_t *ctx, uct_ep_h *next_ep, @@ -90,24 +505,6 @@ static ucs_status_t ucg_builtin_recursive_non_pow_two_post(ucg_builtin_group_ctx return status; } -static ucs_status_t ucg_builtin_check_swap(unsigned factor, ucg_step_idx_t step_idx, - ucg_group_member_index_t my_index, ucg_builtin_plan_phase_t *phase) -{ - /* The condition which don't use peer_idx as considering communicator split and dup case */ - unsigned current_scale = 1; - unsigned i; - for (i = 0; i < step_idx + 1; i++) { - current_scale *= factor; - } - if ((my_index % current_scale) < (current_scale / factor)) { - phase->is_swap = 1; - } else { - phase->is_swap = 0; - } - - return UCS_OK; -} - static ucs_status_t ucg_builtin_recursive_non_pow_two_inter(ucg_builtin_group_ctx_t *ctx, ucg_group_member_index_t new_my_index, ucg_group_member_index_t *member_list, @@ -355,7 +752,7 @@ ucs_status_t ucg_builtin_recursive_connect(ucg_builtin_group_ctx_t *ctx, return status; } -ucs_status_t ucg_builtin_recursive_compute_steps(ucg_group_member_index_t my_index_local, unsigned rank_count, +void ucg_builtin_recursive_compute_steps(ucg_group_member_index_t my_index_local, unsigned rank_count, unsigned factor, unsigned *steps) { unsigned step_size = 1; @@ -390,8 +787,6 @@ ucs_status_t ucg_builtin_recursive_compute_steps(ucg_group_member_index_t my_ind } *steps = (step_size != rank_count) ? (near_power_of_two_step + NUM_TWO) : step_idx; - - return UCS_OK; } void ucg_builtin_recursive_init_member_list(ucg_group_member_index_t member_cnt, ucg_group_member_index_t *member_list) @@ -407,12 +802,7 @@ ucs_status_t ucg_builtin_recursive_create(ucg_builtin_group_ctx_t *ctx, const ucg_group_params_t *group_params, const ucg_collective_type_t *coll_type, ucg_builtin_plan_t **plan_p) { /* Find my own index */ - ucg_group_member_index_t my_rank = 0; - while ((my_rank < group_params->member_count) && - (group_params->distance[my_rank] != - UCG_GROUP_MEMBER_DISTANCE_SELF)) { - my_rank++; - } + ucg_group_member_index_t my_rank = group_params->member_index; ucg_group_member_index_t member_cnt = group_params->member_count; ucg_group_member_index_t *member_list = UCS_ALLOC_CHECK(member_cnt * sizeof(ucg_group_member_index_t), "member list"); @@ -438,7 +828,10 @@ ucs_status_t ucg_builtin_recursive_create(ucg_builtin_group_ctx_t *ctx, member_list = NULL; return UCS_ERR_NO_MEMORY; } - memset(recursive, 0, alloc_size); + errno_t res = memset_s(recursive, alloc_size, 0, alloc_size); + if (res != EOK) { + return UCS_ERR_INVALID_PARAM; + } ucs_status_t status = ucg_builtin_recursive_connect(ctx, my_rank, member_list, member_cnt, factor, 1, recursive); if (status != UCS_OK) { goto out; @@ -452,4 +845,4 @@ ucs_status_t ucg_builtin_recursive_create(ucg_builtin_group_ctx_t *ctx, ucs_free(member_list); member_list = NULL; return status; -} \ No newline at end of file +} diff --git a/builtin/plan/builtin_ring.c b/builtin/plan/builtin_ring.c index b876913..7c37d44 100644 --- a/builtin/plan/builtin_ring.c +++ b/builtin/plan/builtin_ring.c @@ -1,5 +1,5 @@ /* - * Copyright (C) Huawei Technologies Co., Ltd. 2019. ALL RIGHTS RESERVED. 
+ * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ @@ -7,6 +7,7 @@ #include #include #include +#include #include "builtin_plan.h" @@ -34,16 +35,19 @@ ucs_status_t ucg_builtin_ring_connect(ucg_builtin_group_ctx_t *ctx, ucs_status_t status; uct_ep_h *next_ep = (uct_ep_h*)(phase + step_idx); if (peer_index_src != peer_index_dst) { - phase->ep_cnt = 1; /* 1 sender and 1 receiver ,the phase->ep_cnt is the number of receiver */ - unsigned phase_ep_index = 1; /* index: 0 for sender and 1 for receiver */ + /* when np > 2, each phase of every rank in the ring algorithm has two endpoints: 1 sender and 1 receiver. + * ep_cnt = 2 is for storing two ucp_eps in ucg_builtin_connect() + */ + const unsigned peer_cnt = 2; + phase->ep_cnt = peer_cnt; phase->multi_eps = next_ep++; - /* connected to src process for second EP, recv */ - status = ucg_builtin_connect(ctx, peer_index_src, phase, phase_ep_index); + /* connected to src process for second EP, receiver stored in phase->ucp_eps[1] */ + status = ucg_builtin_connect(ctx, peer_index_src, phase, 1); if (status != UCS_OK) { return status; } - phase_ep_index--; + next_ep++; /* set threshold for receiver @@ -51,16 +55,17 @@ */ ucg_builtin_ring_assign_recv_thresh(phase); - /* connected to dst process for first EP, send */ - status = ucg_builtin_connect(ctx, peer_index_dst, phase, phase_ep_index); + /* connected to dst process for first EP, sender stored in phase->ucp_eps[0] */ + status = ucg_builtin_connect(ctx, peer_index_dst, phase, 0); if (status != UCS_OK) { return status; } /* - * while phase->ep_cnt is set to be 1. So phase->single_ep should - * point to multi_eps[0]. + * phase->ep_cnt affects the number of iterations in case_send(). + * "1" is written here because there is only one sender.
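+ * The receiver endpoint is still held in phase->ucp_eps[1] by the connect call above, so it is not lost when ep_cnt is set back to 1.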
*/ + phase->ep_cnt = 1; phase->single_ep = phase->multi_eps[0]; } else { phase->ep_cnt = 1; @@ -81,12 +86,7 @@ ucs_status_t ucg_builtin_ring_connect(ucg_builtin_group_ctx_t *ctx, void ucg_builtin_ring_find_my_index(const ucg_group_params_t *group_params, unsigned proc_count, ucg_group_member_index_t *my_index) { - while ((*my_index < proc_count) && - (group_params->distance[*my_index] != - UCG_GROUP_MEMBER_DISTANCE_SELF)) { - (*my_index)++; - } - ucs_assert(*my_index != proc_count); + *my_index = group_params->member_index; } ucs_status_t ucg_builtin_ring_create(ucg_builtin_group_ctx_t *ctx, enum ucg_builtin_plan_topology_type plan_topo_type, @@ -110,7 +110,11 @@ ucs_status_t ucg_builtin_ring_create(ucg_builtin_group_ctx_t *ctx, + (INDEX_DOUBLE * step_idx * sizeof(uct_ep_h))); ucg_builtin_plan_t *ring = (ucg_builtin_plan_t*)UCS_ALLOC_CHECK(alloc_size, "ring topology"); - memset(ring, 0, alloc_size); + errno_t res = memset_s(ring, alloc_size, 0, alloc_size); + if (res != EOK) { + return UCS_ERR_INVALID_PARAM; + } + ucg_builtin_plan_phase_t *phase = &ring->phss[0]; ring->ep_cnt = step_idx * INDEX_DOUBLE; /* the number of endpoints each step is always 2 for ring */ ring->phs_cnt = step_idx; @@ -153,6 +157,7 @@ ucs_status_t ucg_builtin_ring_create(ucg_builtin_group_ctx_t *ctx, /* the following endpoint is the same as phase(0) */ *phase = phase_zero; phase->ucp_eps = NULL; + phase->ep_thresh = NULL; /* modify method and step_index in phase */ if (step_idx < proc_count - 1) { @@ -167,8 +172,6 @@ } ring->super.my_index = my_index; - ring->super.support_non_commutative = 0; - ring->super.support_large_datatype = 1; *plan_p = ring; return status; } \ No newline at end of file diff --git a/builtin/plan/builtin_topo.c b/builtin/plan/builtin_topo.c new file mode 100644 index 0000000..43062e4 --- /dev/null +++ b/builtin/plan/builtin_topo.c @@ -0,0 +1,290 @@ +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#include +#include +#include +#include + +#include "builtin_plan.h" +#include "builtin_topo.h" + +static inline void ucg_builtin_topo_set_nodecnt(ucg_builtin_topo_params_t *topo_params, + ucg_group_member_index_t max_node_idx) +{ + topo_params->node_cnt = ++max_node_idx; +} +static inline void ucg_builtin_topo_set_localprocs(ucg_builtin_topo_params_t *topo_params, + ucg_group_member_index_t proc_cnt) +{ + topo_params->num_local_procs = proc_cnt; +} + +static inline void ucg_builtin_topo_set_membercnt(ucg_builtin_topo_params_t *topo_params, + ucg_group_member_index_t pps) +{ + topo_params->local.socket.member_cnt = pps; +} + +static inline void ucg_builtin_topo_set_socketnum(ucg_builtin_topo_params_t *topo_params, + ucg_group_member_index_t proc_cnt, + ucg_group_member_index_t pps) +{ + topo_params->local.socket.num = (pps == 0) ?
0 : (proc_cnt / pps); +} + +static inline void ucg_builtin_topo_set(ucg_builtin_topo_params_t *topo_params, + ucg_group_member_index_t proc_cnt, + ucg_group_member_index_t max_node_idx, + ucg_group_member_index_t pps) +{ + ucg_builtin_topo_set_nodecnt(topo_params, max_node_idx); + ucg_builtin_topo_set_localprocs(topo_params, proc_cnt); + ucg_builtin_topo_set_membercnt(topo_params, pps); + ucg_builtin_topo_set_socketnum(topo_params, proc_cnt, pps); +} + +static inline void ucg_builtin_topo_init_local_leaders(ucg_builtin_topo_params_t *topo_params, + ucg_group_member_index_t init_member_idx, + ucg_group_member_index_t socket_idx) +{ + topo_params->local.socket.leaders[socket_idx] = init_member_idx; +} + +static inline void ucg_builtin_topo_init_local_members(ucg_builtin_topo_params_t *topo_params, + ucg_group_member_index_t init_member_idx, + ucg_group_member_index_t proc_cnt) +{ + topo_params->local_members[proc_cnt] = init_member_idx; +} + +static inline void ucg_builtin_topo_init_socket_members(ucg_builtin_topo_params_t *topo_params, + ucg_group_member_index_t init_member_idx, + ucg_group_member_index_t pps) +{ + topo_params->local.socket.members[pps] = init_member_idx; +} + +static void ucg_builtin_topo_init_local(ucg_builtin_topo_params_t *topo_params, + ucg_group_member_index_t init_member_idx, + ucg_group_member_index_t proc_cnt, + ucg_group_member_index_t pps, + ucg_group_member_index_t socket_idx) +{ + for (socket_idx = 0; socket_idx < topo_params->local.socket.num; socket_idx++) { + ucg_builtin_topo_init_local_leaders(topo_params, init_member_idx, socket_idx); + } + + for (proc_cnt = 0; proc_cnt < topo_params->num_local_procs; proc_cnt++) { + ucg_builtin_topo_init_local_members(topo_params, init_member_idx, proc_cnt); + } + + for (pps = 0; pps < topo_params->local.socket.member_cnt; pps++) { + ucg_builtin_topo_init_socket_members(topo_params, init_member_idx, pps); + } +} + +static inline void ucg_builtin_topo_init_node_leaders(ucg_builtin_topo_params_t *topo_params, + ucg_group_member_index_t init_member_idx, + ucg_group_member_index_t node_idx) +{ + topo_params->node_leaders[node_idx] = init_member_idx; +} + +static void ucg_builtin_topo_init(const ucg_group_params_t *group_params, + ucg_group_member_index_t member_idx, + ucg_builtin_topo_params_t *topo_params, + ucg_group_member_index_t init_member_idx, + ucg_group_member_index_t node_idx, + ucg_group_member_index_t proc_cnt, + ucg_group_member_index_t my_rank, + ucg_group_member_index_t pps, + ucg_group_member_index_t socket_idx) +{ + for (node_idx = 0; node_idx < topo_params->node_cnt; node_idx++) { + ucg_builtin_topo_init_node_leaders(topo_params, init_member_idx, node_idx); + } + ucg_builtin_topo_init_local(topo_params, init_member_idx, proc_cnt, pps, socket_idx); + + /* rank list on local node */ + for (member_idx = 0, proc_cnt = 0; member_idx < group_params->member_count; member_idx++) { + if (group_params->node_index[member_idx] == group_params->node_index[my_rank]) { + topo_params->local_members[proc_cnt++] = member_idx; + } + } + ucs_assert(proc_cnt == topo_params->num_local_procs); +} + +static void ucg_builtin_topo_leader(const ucg_group_params_t *group_params, + ucg_group_member_index_t member_idx, + ucg_builtin_topo_params_t *topo_params, + ucg_group_member_index_t node_idx) +{ + topo_params->node_leaders[0] = 0; + for (member_idx = 1, node_idx = 0; member_idx < group_params->member_count; member_idx++) { + if (group_params->node_index[member_idx] > node_idx) { + node_idx++; + topo_params->node_leaders[node_idx] =
member_idx; + } + } + ucs_assert(node_idx + 1 == topo_params->node_cnt); +} + +static void ucg_builtin_topo_list(const ucg_group_params_t *group_params, + ucg_builtin_topo_params_t *topo_params) +{ + ucg_group_member_index_t member_idx; + ucg_group_member_index_t pps; + enum ucg_group_member_distance next_distance; + for (member_idx = 0, pps = 0; member_idx < group_params->member_count; member_idx++) { + next_distance = ucg_builtin_get_distance(group_params, group_params->member_index, member_idx); + if (next_distance <= UCG_GROUP_MEMBER_DISTANCE_SOCKET) { + topo_params->local.socket.members[pps++] = member_idx; + } + } + ucs_assert(pps == topo_params->local.socket.member_cnt); +} + +static inline void ucg_builtin_topo_get(const ucg_group_params_t *group_params, + ucg_builtin_topo_params_t *topo_params) +{ + /* set my own index */ + topo_params->my_index = group_params->member_index; + /* set total number of processes */ + topo_params->num_procs = group_params->member_count; +} + +static void ucg_builtin_topo_subroot(ucg_group_member_index_t socket_idx, + ucg_builtin_topo_params_t *topo_params, + ucg_group_member_index_t pps) +{ + if (pps != 0) { + for (socket_idx = 0; socket_idx < topo_params->local.socket.num; socket_idx++) { + topo_params->local.socket.leaders[socket_idx] = + topo_params->my_index % pps + socket_idx * pps + topo_params->local_members[0]; + } + /* set my socket index */ + topo_params->local.socket.idx = (topo_params->my_index - topo_params->local_members[0]) / pps; + } +} + +static inline void free_local_members(ucg_builtin_topo_params_t *topo_params) +{ + ucs_free(topo_params->local_members); + topo_params->local_members = NULL; +} + +static inline void free_node_leaders(ucg_builtin_topo_params_t *topo_params) +{ + ucs_free(topo_params->node_leaders); + topo_params->node_leaders = NULL; +} + +static inline void free_socket_members(ucg_builtin_topo_params_t *topo_params) +{ + ucs_free(topo_params->local.socket.members); + topo_params->local.socket.members = NULL; +} + +ucs_status_t ucg_builtin_query_topo(const ucg_group_params_t *group_params, + ucg_builtin_topo_params_t *topo_params) +{ + ucg_group_member_index_t member_idx; + ucg_group_member_index_t node_idx = 0; + ucg_group_member_index_t socket_idx = 0; + ucg_group_member_index_t proc_cnt = 0; + ucg_group_member_index_t pps = 0; + ucg_group_member_index_t max_node_idx = 0; + ucg_group_member_index_t my_rank = group_params->member_index; + ucg_group_member_index_t init_member_idx = (ucg_group_member_index_t)-1; + enum ucg_group_member_distance next_distance; + + ucg_builtin_topo_get(group_params, topo_params); + for (member_idx = 0; member_idx < group_params->member_count; member_idx++) { + if (max_node_idx < group_params->node_index[member_idx]) { + max_node_idx = group_params->node_index[member_idx]; + } + if (group_params->node_index[member_idx] == group_params->node_index[my_rank]) { + proc_cnt++; + } + /* calculate the number of processes per socket (pps) */ + next_distance = ucg_builtin_get_distance(group_params, group_params->member_index, member_idx); + if (next_distance <= UCG_GROUP_MEMBER_DISTANCE_SOCKET) { + pps++; + } + } + + ucg_builtin_topo_set(topo_params, proc_cnt, max_node_idx, pps); + + /* allocate local_members & node_leaders */ + size_t alloc_size = sizeof(ucg_group_member_index_t) * topo_params->num_local_procs; + topo_params->local_members = (ucg_group_member_index_t *)UCS_ALLOC_CHECK(alloc_size, "rank in same node"); + + alloc_size = sizeof(ucg_group_member_index_t) * topo_params->node_cnt; +
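/* node_leaders: one subroot rank per node; released via free_node_leaders() if a later allocation fails */ +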
topo_params->node_leaders = (ucg_group_member_index_t *)ucs_malloc(alloc_size, "node leaders list"); + if (topo_params->node_leaders == NULL) { + free_local_members(topo_params); + return UCS_ERR_NO_MEMORY; + } + + /* allocate socket member and socket leader lists */ + alloc_size = sizeof(ucg_group_member_index_t) * topo_params->local.socket.member_cnt; + topo_params->local.socket.members = (ucg_group_member_index_t *)ucs_malloc(alloc_size, "rank in same socket"); + if (topo_params->local.socket.members == NULL) { + free_local_members(topo_params); + free_node_leaders(topo_params); + return UCS_ERR_NO_MEMORY; + } + + alloc_size = sizeof(ucg_group_member_index_t) * topo_params->local.socket.num; + topo_params->local.socket.leaders = (ucg_group_member_index_t *)ucs_malloc(alloc_size, "socket leaders list"); + if (topo_params->local.socket.leaders == NULL) { + free_local_members(topo_params); + free_node_leaders(topo_params); + free_socket_members(topo_params); + return UCS_ERR_NO_MEMORY; + } + + /* Initialization */ + ucg_builtin_topo_init(group_params, member_idx, topo_params, init_member_idx, + node_idx, proc_cnt, my_rank, pps, socket_idx); + + /* node leaders: pick the first rank number as subroot in the same node */ + ucg_builtin_topo_leader(group_params, member_idx, topo_params, node_idx); + + /* rank list on own socket */ + ucg_builtin_topo_list(group_params, topo_params); + + /** + * socket leaders: pick the first rank number as subroot in the same socket, + * with the strong assumption that pps is uniform! + */ + ucg_builtin_topo_subroot(socket_idx, topo_params, pps); + return UCS_OK; +} + +void ucg_builtin_destroy_topo(ucg_builtin_topo_params_t *topo_params) +{ + if (topo_params->local_members) { + ucs_free(topo_params->local_members); + topo_params->local_members = NULL; + } + + if (topo_params->node_leaders) { + ucs_free(topo_params->node_leaders); + topo_params->node_leaders = NULL; + } + + if (topo_params->local.socket.members) { + ucs_free(topo_params->local.socket.members); + topo_params->local.socket.members = NULL; + } + + if (topo_params->local.socket.leaders) { + ucs_free(topo_params->local.socket.leaders); + topo_params->local.socket.leaders = NULL; + } + ucg_builtin_free((void **)&topo_params); +} diff --git a/builtin/plan/builtin_topo.h b/builtin/plan/builtin_topo.h new file mode 100644 index 0000000..9b64db3 --- /dev/null +++ b/builtin/plan/builtin_topo.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. + * See file LICENSE for terms.
+ */ + +#ifndef UCG_BUILTIN_TOPO_H +#define UCG_BUILTIN_TOPO_H + +#include + +typedef struct ucg_builtin_topo_org { + unsigned num; + unsigned idx; + /* "leader_cnt" need not be the same as "num" */ + unsigned leader_cnt; + unsigned member_cnt; + ucg_group_member_index_t *leaders; + ucg_group_member_index_t *members; +} ucg_builtin_topo_org_t; + +typedef struct ucg_builtin_topo_local { + ucg_builtin_topo_org_t socket; + ucg_builtin_topo_org_t L3cache; +} ucg_builtin_topo_local_t; + +typedef struct ucg_builtin_topo_params { + unsigned num_local_procs; + unsigned node_cnt; + ucg_group_member_index_t my_index; + ucg_group_member_index_t num_procs; + ucg_group_member_index_t *local_members; + ucg_group_member_index_t *node_leaders; + ucg_builtin_topo_local_t local; +} ucg_builtin_topo_params_t; + +ucs_status_t ucg_builtin_query_topo(const ucg_group_params_t *group_params, + ucg_builtin_topo_params_t *topo_params); + +void ucg_builtin_destroy_topo(ucg_builtin_topo_params_t *topo_params); + +#endif \ No newline at end of file diff --git a/builtin/plan/builtin_topo_aware.c b/builtin/plan/builtin_topo_aware.c new file mode 100644 index 0000000..bb96173 --- /dev/null +++ b/builtin/plan/builtin_topo_aware.c @@ -0,0 +1,71 @@ +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. + * Description: Topo-aware algorithm + */ + + +#include +#include +#include +#include +#include +#include + +#include "builtin_plan.h" + +#define MAX_PEERS 100 +#define MAX_PHASES 16 + +ucs_status_t ucg_builtin_topo_aware_add_intra(ucg_builtin_plan_t *topo_aware, + const ucg_builtin_config_t *config, + ucg_builtin_topo_aware_params_t *params, + const ucg_group_member_index_t *member_list, + const ucg_group_member_index_t member_cnt, + enum ucg_builtin_plan_topology_type topo_type, + enum ucg_group_hierarchy_level topo_level, + enum ucg_builtin_plan_connect_pattern pattern) +{ + ucs_status_t status = UCS_OK; + unsigned num_group = 1; + unsigned leader_shift = 0; + ucs_assert(member_cnt > 0); + + if (member_cnt == 1) { + ucs_debug("member_cnt is 1, skip adding intra-phase"); + return status; + } + switch (topo_type) { + case UCG_PLAN_RECURSIVE: + status = ucg_builtin_recursive_build(topo_aware, params->super.ctx, config, + member_list + leader_shift, member_cnt / num_group, + UCG_PLAN_BUILD_PARTIAL, UCG_PLAN_RECURSIVE_TYPE_ALLREDUCE); + break; + case UCG_PLAN_BMTREE: + status = ucg_builtin_bmtree_build(topo_aware, &params->super, config, + member_list + leader_shift, member_cnt / num_group, + member_list[leader_shift], UCG_PLAN_BUILD_PARTIAL, pattern); + break; + case UCG_PLAN_KMTREE: { + unsigned degree; + if (pattern == UCG_PLAN_PATTERN_MANY_TO_ONE) { + degree = config->trees.intra_degree_fanin; + } else if (pattern == UCG_PLAN_PATTERN_ONE_TO_MANY) { + degree = config->trees.intra_degree_fanout; + } else { + ucs_error("Plan pattern should be either ONE_TO_MANY or MANY_TO_ONE for tree!"); + return UCS_ERR_INVALID_PARAM; + } + status = ucg_builtin_kmtree_build(topo_aware, &params->super, config, + member_list + leader_shift, member_cnt / num_group, + member_list[leader_shift], degree, UCG_PLAN_BUILD_PARTIAL, pattern); + break; + } + default: + break; + } + return status; +} + diff --git a/builtin/plan/builtin_topo_info.c b/builtin/plan/builtin_topo_info.c index 480477c..e9dca69 100644 --- a/builtin/plan/builtin_topo_info.c +++ b/builtin/plan/builtin_topo_info.c @@ -1,5 +1,5 @@ /* - * Copyright (C) Huawei Technologies Co., Ltd. 2019. ALL RIGHTS RESERVED.
+ * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ @@ -8,6 +8,7 @@ #include #include #include +#include #include "builtin_plan.h" @@ -16,22 +17,7 @@ ucs_status_t ucg_builtin_find_myself(const ucg_group_params_t *group_params, ucg_group_member_index_t *myrank) { - /* find my own rank */ - unsigned member_idx; - ucg_group_member_index_t init_myrank = (ucg_group_member_index_t) - 1; - *myrank = init_myrank; - for (member_idx = 0; member_idx < group_params->member_count; member_idx++) { - ucs_assert(group_params->distance[member_idx] < UCG_GROUP_MEMBER_DISTANCE_LAST); - if (group_params->distance[member_idx] == UCG_GROUP_MEMBER_DISTANCE_SELF) { - *myrank = member_idx; - break; - } - } - - if (*myrank == init_myrank) { - ucs_error("No member with distance==UCP_GROUP_MEMBER_DISTANCE_SELF found"); - return UCS_ERR_INVALID_PARAM; - } + *myrank = group_params->member_index; return UCS_OK; } @@ -115,7 +101,7 @@ ucs_status_t ucg_builtin_topology_info_create(ucg_builtin_topology_info_params_t unsigned node_idx; unsigned ppn_idx = 0; ucg_group_member_index_t myrank = 0; - /* initalization */ + /* initialization */ topo_params->node_cnt = 0; topo_params->ppn_cnt = 0; @@ -180,10 +166,13 @@ ucs_status_t ucg_builtin_check_ppn(const ucg_group_params_t *group_params, } node_cnt++; - /* ppn array: record ppn vaule in every single node */ + /* ppn array: record ppn value in every single node */ size_t alloc_size = sizeof(unsigned) * node_cnt; unsigned *ppn_array = (unsigned *)UCS_ALLOC_CHECK(alloc_size, "ppn array"); - memset(ppn_array, 0, alloc_size); + errno_t res = memset_s(ppn_array, alloc_size, 0, alloc_size); + if (res != EOK) { + return UCS_ERR_INVALID_PARAM; + } for (member_idx = 0; member_idx < group_params->member_count; member_idx++) { node_idx = group_params->node_index[member_idx]; ppn_array[node_idx]++; @@ -200,4 +189,68 @@ ucs_status_t ucg_builtin_check_ppn(const ucg_group_params_t *group_params, ucs_free(ppn_array); ppn_array = NULL; return UCS_OK; +} + +/* check whether nap is supported or not */ +/* nap can be used only when ppn > 1, ppn is balanced, and the node count is a power of two */ +ucs_status_t ucg_builtin_check_nap(const ucg_group_params_t *group_params) +{ + ucg_group_member_index_t member_idx; + volatile unsigned node_cnt = 0; + unsigned node_idx; + unsigned ppn; + /* node count */ + for (member_idx = 0; member_idx < group_params->member_count; member_idx++) { + node_idx = group_params->node_index[member_idx]; + if (node_cnt < node_idx) { + node_cnt = node_idx; + } + } + node_cnt++; + + /* ppn array: record ppn value in every single node */ + size_t alloc_size = sizeof(unsigned) * node_cnt; + unsigned *ppn_array = (unsigned *)UCS_ALLOC_CHECK(alloc_size, "ppn array"); + errno_t res = memset_s(ppn_array, alloc_size, 0, alloc_size); + if (res != EOK) { + return UCS_ERR_INVALID_PARAM; + } + for (member_idx = 0; member_idx < group_params->member_count; member_idx++) { + node_idx = group_params->node_index[member_idx]; + ppn_array[node_idx]++; + } + + /* check whether ppn is balanced or not */ + for (node_idx = 0; node_idx < (node_cnt - 1); node_idx++) { + if (ppn_array[node_idx] != ppn_array[node_idx + 1]) { + ucs_free(ppn_array); + ppn_array = NULL; + return UCS_ERR_UNSUPPORTED; + } + } + + /* ppn must be greater than 1 */ + ppn = ppn_array[0]; + ucs_free(ppn_array); + ppn_array = NULL; + if (ppn <= 1) { + return UCS_ERR_UNSUPPORTED; + } + + /* check whether the node count is a power of 2 */ + if (!(node_cnt & (node_cnt - 1))) { + return UCS_OK; + } else { + return UCS_ERR_UNSUPPORTED; + }
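+ /* note: node_cnt & (node_cnt - 1) clears the lowest set bit, so the test above accepts exactly the powers of two */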
+} + +ucs_status_t ucg_builtin_check_non_aware_Raben(const ucg_group_params_t *group_params) +{ + const unsigned even_factor = 2; + /* Rabenseifner's algorithm ("Raben") does not support an odd number of processes */ + if (group_params->member_count % even_factor != 0) { + return UCS_ERR_UNSUPPORTED; + } + return UCS_OK; } \ No newline at end of file diff --git a/builtin/plan/builtin_trees.c b/builtin/plan/builtin_trees.c new file mode 100644 index 0000000..8b40127 --- /dev/null +++ b/builtin/plan/builtin_trees.c @@ -0,0 +1,569 @@ +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. + * See file LICENSE for terms. + */ + +#include "builtin_plan.h" + +#include +#include +#include +#include + +#define MAX_PEERS (100) +#define MAX_PHASES (16) + +ucs_config_field_t ucg_builtin_trees_config_table[] = { + {"INTER_TREE_TYPE", "1", "inter-node tree type. (0: binomial tree, 1: k-nomial tree)\n", + ucs_offsetof(ucg_builtin_trees_config_t, inter_tree_type), UCS_CONFIG_TYPE_UINT}, + + {"INTRA_TREE_TYPE", "0", "intra-node tree type. (0: binomial tree, 1: k-nomial tree)\n", + ucs_offsetof(ucg_builtin_trees_config_t, intra_tree_type), UCS_CONFIG_TYPE_UINT}, + + {"INTER_DEGREE_FANOUT", "8", "k-nomial tree degree for inter node with fanout process.\n", + ucs_offsetof(ucg_builtin_trees_config_t, inter_degree_fanout), UCS_CONFIG_TYPE_UINT}, + + {"INTER_DEGREE_FANIN", "8", "k-nomial tree degree for inter node with fanin process.\n", + ucs_offsetof(ucg_builtin_trees_config_t, inter_degree_fanin), UCS_CONFIG_TYPE_UINT}, + + {"INTRA_DEGREE_FANOUT", "2", "k-nomial tree degree for intra node with fanout process.\n", + ucs_offsetof(ucg_builtin_trees_config_t, intra_degree_fanout), UCS_CONFIG_TYPE_UINT}, + + {"INTRA_DEGREE_FANIN", "2", "k-nomial tree degree for intra node with fanin process.\n", + ucs_offsetof(ucg_builtin_trees_config_t, intra_degree_fanin), UCS_CONFIG_TYPE_UINT}, + {NULL} +}; +static inline void ucg_builtin_phase_init(ucg_builtin_plan_phase_t *phase, + ucg_step_idx_t step_index, + unsigned peer_cnt, + enum ucg_builtin_plan_method_type method) +{ + phase->method = method; + phase->ep_cnt = peer_cnt; + phase->step_index = step_index; +} + +ucs_status_t ucg_builtin_treenode_connect_to_phase(ucg_builtin_plan_phase_t *phase, + ucg_builtin_group_ctx_t *ctx, + ucg_step_idx_t step_index, + uct_ep_h **eps, + ucg_group_member_index_t *peers, + unsigned peer_cnt, + enum ucg_builtin_plan_method_type method) +{ + /* Initialization */ + ucs_assert(peer_cnt > 0); + ucs_status_t status = UCS_OK; + ucg_builtin_phase_init(phase, step_index, peer_cnt, method); +#if ENABLE_DEBUG_DATA || ENABLE_FAULT_TOLERANCE + phase->indexes = UCS_ALLOC_CHECK(peer_cnt * sizeof(*peers), + "binomial tree topology indexes"); +#endif + if (peer_cnt == 1) { + status = ucg_builtin_connect(ctx, peers[0], phase, UCG_BUILTIN_CONNECT_SINGLE_EP); + } else { + phase->multi_eps = *eps; + *eps += peer_cnt; + + /* connect every endpoint, by group member index */ + unsigned idx; + for (idx = 0; (idx < peer_cnt) && (status == UCS_OK); idx++, peers++) { + status = ucg_builtin_connect(ctx, *peers, phase, idx); + } + } + return status; +} + +ucs_status_t ucg_builtin_treenode_connect(ucg_builtin_plan_t *tree, + ucg_builtin_group_ctx_t *ctx, + const ucg_builtin_config_t *config, + enum ucg_collective_modifiers mod, + uct_ep_h *next_ep, + ucg_group_member_index_t *up, + ucg_group_member_index_t up_cnt, + ucg_group_member_index_t *down, + ucg_group_member_index_t down_cnt, + enum ucg_builtin_plan_topology_type tree_topo) +{ +
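/* choose the phase method from the collective modifiers and the FANIN/FANOUT tree_topo, then wire the up/down peers into a single phase */ +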
ucs_status_t status = UCS_OK; + ucg_builtin_plan_phase_t *phase = &tree->phss[tree->phs_cnt]; + + enum ucg_builtin_plan_method_type method; + switch (tree_topo) { + case UCG_PLAN_TREE_FANIN: + if (down_cnt) { + method = ((unsigned)mod & UCG_GROUP_COLLECTIVE_MODIFIER_AGGREGATE) ? + (up_cnt ? UCG_PLAN_METHOD_REDUCE_WAYPOINT : UCG_PLAN_METHOD_REDUCE_TERMINAL) : + (up_cnt ? UCG_PLAN_METHOD_GATHER_WAYPOINT : UCG_PLAN_METHOD_RECV_TERMINAL); + } else { + method = UCG_PLAN_METHOD_SEND_TERMINAL; + } + break; + case UCG_PLAN_TREE_FANOUT: + if (down_cnt) { + method = ((unsigned)mod & UCG_GROUP_COLLECTIVE_MODIFIER_BROADCAST) ? + (up_cnt ? UCG_PLAN_METHOD_BCAST_WAYPOINT : UCG_PLAN_METHOD_SEND_TERMINAL) : + (up_cnt ? UCG_PLAN_METHOD_SCATTER_WAYPOINT : UCG_PLAN_METHOD_SCATTER_TERMINAL); + } else { + method = UCG_PLAN_METHOD_RECV_TERMINAL; + } + break; + default: + ucs_error("Tree should be either FANIN or FANOUT!"); + return UCS_ERR_INVALID_PARAM; + } + + /* connect to phase */ + /* Leaf */ + if (up_cnt == 1 && down_cnt == 0) { + /* Connect this phase to its peers */ + status = ucg_builtin_treenode_connect_to_phase(phase, ctx, tree->step_cnt, + &next_ep, up, up_cnt, method); + } + /* root */ + if (up_cnt == 0 && down_cnt > 0) { + /* Connect this phase to its peers */ + status = ucg_builtin_treenode_connect_to_phase(phase, ctx, tree->step_cnt, + &next_ep, down, down_cnt, method); + } + /* Waypoint */ + /** + * layout of peers which need to be connected: + * FANIN: [down][down][down][up] + * FANOUT: [up][down][down][down] + */ + if (up_cnt == 1 && down_cnt > 0) { + /* Connect this phase to its peers */ + ucg_group_member_index_t member_idx; + if (tree_topo == UCG_PLAN_TREE_FANIN) { + for (member_idx = down_cnt; member_idx < down_cnt + up_cnt; member_idx++) { + down[member_idx] = up[member_idx - down_cnt]; + } + status = ucg_builtin_treenode_connect_to_phase(phase, ctx, tree->step_cnt, + &next_ep, down, down_cnt + up_cnt, method); + } else if (tree_topo == UCG_PLAN_TREE_FANOUT) { + for (member_idx = up_cnt; member_idx < down_cnt + up_cnt; member_idx++) { + up[member_idx] = down[member_idx - up_cnt]; + } + status = ucg_builtin_treenode_connect_to_phase(phase, ctx, tree->step_cnt, + &next_ep, up, up_cnt + down_cnt, method); + } + } + return status; +} + +static inline void ucg_builtin_tree_updata(ucg_builtin_plan_t *tree, + ucg_group_member_index_t up_cnt, + ucg_group_member_index_t down_cnt) +{ + tree->phs_cnt++; + tree->step_cnt++; + tree->ep_cnt += (up_cnt + down_cnt); +} + +static inline void ucg_builtin_tree_step_cnt(ucg_builtin_plan_t *tree) +{ + tree->step_cnt += 1; +} + +static ucs_status_t ucg_builtin_get_tree_topo(enum ucg_builtin_plan_connect_pattern pattern, + enum ucg_builtin_plan_topology_type *tree_topo) +{ + switch (pattern) { + case UCG_PLAN_PATTERN_MANY_TO_ONE: + *tree_topo = UCG_PLAN_TREE_FANIN; + break; + case UCG_PLAN_PATTERN_ONE_TO_MANY: + *tree_topo = UCG_PLAN_TREE_FANOUT; + break; + default: + ucs_error("Many-to-many pattern is not supported for trees!"); + return UCS_ERR_INVALID_PARAM; + } + return UCS_OK; +} +ucs_status_t ucg_builtin_bmtree_build(ucg_builtin_plan_t *bmtree, + ucg_builtin_base_params_t *params, + const ucg_builtin_config_t *config, + const ucg_group_member_index_t *member_list, + const ucg_group_member_index_t member_cnt, + const ucg_group_member_index_t member_root, + enum ucg_builtin_plan_build_type build_type, + enum ucg_builtin_plan_connect_pattern pattern) +{ + ucs_status_t status; + ucg_builtin_group_ctx_t *ctx = params->ctx; + enum ucg_collective_modifiers
mod = params->coll_type->modifiers; + enum ucg_builtin_plan_topology_type tree_topo = UCG_PLAN_LAST; + /* next_ep shifts as ep_cnt grows */ + uct_ep_h *next_ep = (uct_ep_h *)(&bmtree->phss[MAX_PHASES]) + bmtree->ep_cnt; + + status = ucg_builtin_get_tree_topo(pattern, &tree_topo); + if (status) { + return status; + } + ucg_group_member_index_t member_idx, rank_shift, peer, value; + /* my_index is always "local" */ + ucg_group_member_index_t my_index = 0; + ucg_group_member_index_t root = 0; + ucg_group_member_index_t up[MAX_PEERS] = {0}; + ucg_group_member_index_t down[MAX_PEERS] = {0}; + ucg_group_member_index_t up_cnt = 0; + ucg_group_member_index_t down_cnt = 0; + ucg_group_member_index_t num_child = 0; + ucg_group_member_index_t tree_mask = 1; + + if (build_type == UCG_PLAN_BUILD_FULL) { + my_index = bmtree->super.my_index; + root = member_root; + } else if (build_type == UCG_PLAN_BUILD_PARTIAL) { + /* find my own local index */ + for (member_idx = 0; member_idx < member_cnt; member_idx++) { + if (member_list[member_idx] == bmtree->super.my_index) { + my_index = member_idx; + break; + } + } + + if (member_idx == member_cnt) { + /* step_cnt is updated by one for trees while phs_cnt is not */ + ucg_builtin_tree_step_cnt(bmtree); + return UCS_OK; + } + + for (member_idx = 0; member_idx < member_cnt; member_idx++) { + if (member_list[member_idx] == member_root) { + root = member_idx; + break; + } + } + /* for trees, the root should be in the member list */ + if (member_idx == member_cnt) { + ucs_error("The root is not in the member list for binomial tree build!"); + return UCS_ERR_INVALID_PARAM; + } + } + +/* + left-most tree for FANOUT + right-most tree for FANIN +*/ + if (tree_topo == UCG_PLAN_TREE_FANIN) { + /* right-most tree */ + rank_shift = (my_index - root + member_cnt) % member_cnt; + if (root == my_index) { + up_cnt = 0; + } + while (tree_mask < member_cnt) { + peer = rank_shift ^ tree_mask; + if (peer < rank_shift) { + up[0] = (peer + root) % member_cnt; + up_cnt = 1; + break; + } else if (peer < member_cnt) { + down[num_child] = (peer + root) % member_cnt; + num_child++; + } + tree_mask <<= 1; + } + down_cnt = num_child; + } else if (tree_topo == UCG_PLAN_TREE_FANOUT) { + /* left-most tree */ + rank_shift = (my_index - root + member_cnt) % member_cnt; + value = rank_shift; + for (tree_mask = 1; value > 0; value >>= 1, tree_mask <<= 1) { + } + if (root == my_index) { + up_cnt = 0; + } else { + peer = rank_shift ^ (tree_mask >> 1); + up[0] = (peer + root) % member_cnt; + up_cnt = 1; + } + /* find children */ + while (tree_mask < member_cnt) { + peer = rank_shift ^ tree_mask; + if (peer >= member_cnt) { + break; + } + down[num_child] = (peer + root) % member_cnt; + num_child++; + tree_mask <<= 1; + } + down_cnt = num_child; + } else { + ucs_error("Tree should be either FANIN or FANOUT!"); + return UCS_ERR_INVALID_PARAM; + } + + if (build_type == UCG_PLAN_BUILD_PARTIAL) { + /* convert index to real rank */ + for (member_idx = 0; member_idx < up_cnt; member_idx++) { + up[member_idx] = member_list[up[member_idx]]; + } + + for (member_idx = 0; member_idx < down_cnt; member_idx++) { + down[member_idx] = member_list[down[member_idx]]; + } + + if (ucg_builtin_need_calate_position(params->coll_type, up_cnt, params->ctx, tree_topo)) { + bmtree->super.up_offset = ucg_get_tree_buffer_pos(bmtree->super.my_index, up[0], root, member_cnt, + config->bmtree.degree_intra_fanin, member_list); + ucs_debug("up_offset:%u, degree_intra_fanin=%u, up[0]=%lu, myrank:%lu, root:%lu, size:%lu", +
bmtree->super.up_offset, config->bmtree.degree_intra_fanin, up[0], + bmtree->super.my_index, root, member_cnt); + } + } + status = ucg_builtin_treenode_connect(bmtree, ctx, config, mod, next_ep, + up, up_cnt, down, down_cnt, tree_topo); + ucg_builtin_tree_updata(bmtree, up_cnt, down_cnt); + return status; +} + +ucs_status_t ucg_builtin_kmtree_build(ucg_builtin_plan_t *kmtree, + ucg_builtin_base_params_t *params, + const ucg_builtin_config_t *config, + const ucg_group_member_index_t *member_list, + const ucg_group_member_index_t member_cnt, + const ucg_group_member_index_t member_root, + const unsigned degree, + enum ucg_builtin_plan_build_type build_type, + enum ucg_builtin_plan_connect_pattern pattern) +{ + if (degree == 0) { + return UCS_ERR_INVALID_PARAM; + } + ucs_status_t status; + ucg_builtin_group_ctx_t *ctx = params->ctx; + enum ucg_collective_modifiers mod = params->coll_type->modifiers; + enum ucg_builtin_plan_topology_type tree_topo = UCG_PLAN_LAST; + /* next_ep shifts as ep_cnt grows */ + uct_ep_h *next_ep = (uct_ep_h *)(&kmtree->phss[MAX_PHASES]) + kmtree->ep_cnt; + + status = ucg_builtin_get_tree_topo(pattern, &tree_topo); + if (status) { + return status; + } + ucg_group_member_index_t member_idx, rank_shift, orig_mask, peer; + /* my_index is always "local" */ + ucg_group_member_index_t my_index = 0; + ucg_group_member_index_t root = 0; + ucg_group_member_index_t up[MAX_PEERS] = {0}; + ucg_group_member_index_t down[MAX_PEERS] = {0}; + ucg_group_member_index_t up_cnt = 0; + ucg_group_member_index_t down_cnt = 0; + ucg_group_member_index_t num_child = 0; + ucg_group_member_index_t tree_mask = 1; + unsigned k; + + if (build_type == UCG_PLAN_BUILD_FULL) { + my_index = kmtree->super.my_index; + root = member_root; + } else if (build_type == UCG_PLAN_BUILD_PARTIAL) { + /* find my own local index */ + for (member_idx = 0; member_idx < member_cnt; member_idx++) { + if (member_list[member_idx] == kmtree->super.my_index) { + my_index = member_idx; + break; + } + } + + if (member_idx == member_cnt) { + /* step_cnt is updated by one for trees while phs_cnt is not */ + ucg_builtin_tree_step_cnt(kmtree); + return UCS_OK; + } + + for (member_idx = 0; member_idx < member_cnt; member_idx++) { + if (member_list[member_idx] == member_root) { + root = member_idx; + break; + } + } + /* for trees, the root should be in the member list */ + if (member_idx == member_cnt) { + ucs_error("The root is not in the member list for k-nomial tree build!"); + return UCS_ERR_INVALID_PARAM; + } + } + +/* + left-most tree for FANOUT + right-most tree for FANIN +*/ + if (tree_topo == UCG_PLAN_TREE_FANIN) { + /* right-most tree */ + rank_shift = (my_index - root + member_cnt) % member_cnt; + while (tree_mask < member_cnt) { + if (rank_shift % (degree * tree_mask)) { + peer = rank_shift / (degree * tree_mask) * (degree * tree_mask); + up[0] = (peer + root) % member_cnt; + up_cnt = 1; + break; + } + tree_mask *= degree; + } + tree_mask /= degree; + orig_mask = tree_mask; + while (tree_mask > 0) { + for (k = 1; k < degree; k++) { + peer = rank_shift + tree_mask * k; + if (peer < member_cnt) { + num_child++; + } + } + tree_mask /= degree; + } + down_cnt = num_child; + tree_mask = orig_mask; + while (tree_mask > 0) { + for (k = 1; k < degree; k++) { + peer = rank_shift + tree_mask * k; + if (peer < member_cnt) { + peer = (peer + root) % member_cnt; + down[--num_child] = peer; + } + } + tree_mask /= degree; + } + } else if (tree_topo == UCG_PLAN_TREE_FANOUT) { + /* left-most tree */ + rank_shift = (my_index - root +
member_cnt) % member_cnt; + while (tree_mask < member_cnt) { + if (rank_shift % (degree * tree_mask)) { + peer = rank_shift / (degree * tree_mask) * (degree * tree_mask); + up[0] = (peer + root) % member_cnt; + up_cnt = 1; + break; + } + tree_mask *= degree; + } + /* find children */ + tree_mask /= degree; + while (tree_mask > 0) { + for (k = 1; k < degree; k++) { + peer = rank_shift + tree_mask * k; + if (peer < member_cnt) { + peer = (peer + root) % member_cnt; + down[num_child] = peer; + num_child++; + } + } + tree_mask /= degree; + } + down_cnt = num_child; + } else { + ucs_error("Tree should be either FANIN or FANOUT!"); + return UCS_ERR_INVALID_PARAM; + } + + if (build_type == UCG_PLAN_BUILD_PARTIAL) { + /* convert index to real rank */ + for (member_idx = 0; member_idx < up_cnt; member_idx++) { + up[member_idx] = member_list[up[member_idx]]; + } + + for (member_idx = 0; member_idx < down_cnt; member_idx++) { + down[member_idx] = member_list[down[member_idx]]; + } + + if (ucg_builtin_need_calate_position(params->coll_type, up_cnt, params->ctx, tree_topo)) { + kmtree->super.up_offset = ucg_get_tree_buffer_pos(kmtree->super.my_index, up[0], root, member_cnt, + config->bmtree.degree_intra_fanin, member_list); + ucs_debug("up_offset:%u, degree_intra_fanin=%u, up[0]=%lu, myrank:%lu, root:%lu, size:%lu", + kmtree->super.up_offset, config->bmtree.degree_intra_fanin, up[0], + kmtree->super.my_index, root, member_cnt); + } + } + status = ucg_builtin_treenode_connect(kmtree, ctx, config, mod, next_ep, + up, up_cnt, down, down_cnt, tree_topo); + ucg_builtin_tree_updata(kmtree, up_cnt, down_cnt); + return status; +} + +int ucg_builtin_kmtree_get_child_ranks(unsigned rank, + unsigned root, + unsigned size, + unsigned degree, + int *downBuff, + int *pDownCnt) +{ + if (degree == 0 || size == 0) { + return -1; + } + unsigned numChild = 0; + unsigned mask = 1; + unsigned localRank = (rank - root + size) % size; + + /* find max mask */ + while (mask < size) { + if (localRank % (degree * mask)) { + break; + } + mask *= degree; + } + + /* find children */ + mask /= degree; + unsigned k; + unsigned i = 1; + while (mask >= i) { + for (k = 1; k < degree; k++) { + unsigned childRank = localRank + i * k; + if (childRank < size) { + childRank = (childRank + root) % size; + downBuff[numChild] = childRank; + numChild++; + } + } + i *= degree; + } + *pDownCnt = numChild; + return 0; +} + +short ucg_get_tree_buffer_pos(ucg_group_member_index_t myrank, + ucg_group_member_index_t uprank, + ucg_group_member_index_t root, + unsigned size, unsigned degree, + const ucg_group_member_index_t *member_list) +{ + int down[MAX_PEERS] = {0}; + int downCnt; + short ret = -1; + int idx; + if (ucg_builtin_kmtree_get_child_ranks(uprank, root, size, degree, down, &downCnt) == -1) { + return -1; + } + + /* convert index to real rank */ + for (idx = 0; idx < downCnt; idx++) { + down[idx] = member_list[down[idx]]; + } + for (idx = 0; idx < downCnt; idx++) { + if (down[idx] == myrank) { + ret = idx; + break; + } + } + ucs_debug("myrank:%lu, up:%lu, root:%lu, size:%u, down_cnt:%d, pos:%d", + myrank, uprank, root, size, downCnt, ret); + return ((ret == -1) ? 
downCnt : ret); +} + +int ucg_builtin_need_calate_position(const ucg_collective_type_t *coll, + unsigned up_cnt, + const ucg_builtin_group_ctx_t *ctx, + enum ucg_builtin_plan_topology_type tree_topo) +{ + if ((ucg_builtin_get_coll_type(coll) == COLL_TYPE_ALLREDUCE) + && (up_cnt == 1) + && (ucg_is_allreduce_consistency(ctx) == 1) + && (tree_topo == UCG_PLAN_TREE_FANIN)) { + return 1; + } + return 0; +} diff --git a/configure.m4 b/configure.m4 index 0e4b98d..50bbb65 100644 --- a/configure.m4 +++ b/configure.m4 @@ -17,6 +17,19 @@ AS_IF([test "x$enable_fault_tolerance" = xyes], AC_DEFINE([ENABLE_FAULT_TOLERANCE], [1], [Enable fault-tolerance])], [:]) +# +# Enable ucg-hicoll +# +AC_ARG_ENABLE([ucg-hicoll], + [AS_HELP_STRING([--enable-ucg-hicoll], + [Enable ucg-hicoll, default: NO])], + [], + [enable_ucg_hicoll=no]) + +AS_IF([test "x$enable_ucg_hicoll" = xyes], + [AS_MESSAGE([enabling with ucg-hicoll]) + AC_DEFINE([ENABLE_UCG_HICOLL], [1], [Enable ucg-hicoll])], + [:]) # Set special flags for API incompatibility detection (below) SAVE_CPPFLAGS="$CPPFLAGS" @@ -50,9 +63,9 @@ ucg_modules=":builtin" m4_include([src/ucg/base/configure.m4]) m4_include([src/ucg/builtin/configure.m4]) m4_include([src/ucg/hicoll/configure.m4]) +m4_include([src/ucg/secure/configure.m4]) AC_DEFINE_UNQUOTED([ucg_MODULES], ["${ucg_modules}"], [UCG loadable modules]) AC_CONFIG_FILES([src/ucg/Makefile src/ucg/api/ucg_version.h src/ucg/base/ucg_version.c]) - diff --git a/secure/Makefile.am b/secure/Makefile.am new file mode 100644 index 0000000..0ca6378 --- /dev/null +++ b/secure/Makefile.am @@ -0,0 +1,25 @@ + +# +# Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. +# See file LICENSE for terms +# + +if HAVE_OMPI_SRC +override CC = $(MPICC) +override CXX = $(MPICXX) +else +CXXFLAGS += "-fpermissive" +endif + +noinst_LTLIBRARIES = libucg_secure.la +libucg_secure_la_CFLAGS = $(BASE_CFLAGS) +libucg_secure_la_CPPFLAGS = $(BASE_CPPFLAGS) + +noinst_HEADERS = \ include/securec.h \ include/securectype.h + +libucg_secure_la_SOURCES = \ src/memcpy_s.c \ src/memmove_s.c \ src/memset_s.c diff --git a/secure/configure.m4 b/secure/configure.m4 new file mode 100644 index 0000000..d168e6b --- /dev/null +++ b/secure/configure.m4 @@ -0,0 +1,6 @@ +# +# Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. +# See file LICENSE for terms +# + +AC_CONFIG_FILES([src/ucg/secure/Makefile]) diff --git a/secure/include/securec.h b/secure/include/securec.h new file mode 100644 index 0000000..5deafca --- /dev/null +++ b/secure/include/securec.h @@ -0,0 +1,203 @@ +/******************************************************************************* +* Copyright @ Huawei Technologies Co., Ltd. 1998-2014. All rights reserved. +* File name: securec.h +* Description: +* the user of this secure C library should include this header file +* in your source code. This header file declares all supported API +* prototypes of the library, such as memcpy_s, strcpy_s, wcscpy_s, +* strcat_s, strncat_s, sprintf_s, scanf_s, and so on. +* History: +* 1. Date: +* Author: +* Modification: +******************************************************************************** +*/ + +#ifndef __SECUREC_H__5D13A042_DC3F_4ED9_A8D1_882811274C27 +#define __SECUREC_H__5D13A042_DC3F_4ED9_A8D1_882811274C27 + +/* If you need high performance, enable the WITH_PERFORMANCE_ADDONS macro!
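 (when enabled, the *_sp macros below use __builtin_constant_p to inline a fast path for compile-time-constant sizes, falling back to the checked *_s functions otherwise)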
*/ +#define WITH_PERFORMANCE_ADDONS + +#include "securectype.h" /*lint !e537*/ +#include + +/* If the stack size on some embedded platform is limited, you can define the following macro, +* which will put some variables on the heap instead of the stack. +#define STACK_SIZE_LESS_THAN_1K +*/ + +/* for performance considerations, the following macro will call the corresponding API +* of libc for memcpy, memmove and memset +*/ +#define CALL_LIBC_COR_API + +/* define error codes */ +#ifndef errno_t +typedef int errno_t; +#endif + +/* success */ +#define EOK (0) + +/* invalid parameter */ +#ifdef EINVAL +#undef EINVAL +#endif +#define EINVAL (22) +#define EINVAL_AND_RESET (22 | 0X80) +/* invalid parameter range */ +#ifdef ERANGE +#undef ERANGE /* to avoid redefinition */ +#endif +#define ERANGE (34) +#define ERANGE_AND_RESET (34 | 0X80) + +/* A wide-character code has been detected that does not correspond to a +* valid character, or a byte sequence does not form a valid wide-character code +*/ +#ifdef EILSEQ +#undef EILSEQ +#endif +#define EILSEQ (42) + +#ifdef EOVERLAP_AND_RESET +#undef EOVERLAP_AND_RESET +#endif +/* Once a buffer overlap is detected, the dest buffer must be reset! */ +#define EOVERLAP_AND_RESET (54 | 0X80) + +/* if you need to export the functions of this library in a Win32 DLL, use __declspec(dllexport) */ + +#ifdef __cplusplus +extern "C" +{ +#endif + /* return SecureC Version */ + void getHwSecureCVersion(char* verStr, int bufSize, unsigned short* verNumber); + + /* wmemcpy */ + errno_t wmemcpy_s(wchar_t* dest, size_t destMax, const wchar_t* src, size_t count); + + /* memmove */ + errno_t memmove_s(void* dest, size_t destMax, const void* src, size_t count); + + errno_t wmemmove_s(wchar_t* dest, size_t destMax, const wchar_t* src, size_t count); + + errno_t wcscpy_s(wchar_t* strDest, size_t destMax, const wchar_t* strSrc); + + errno_t wcsncpy_s(wchar_t* strDest, size_t destMax, const wchar_t* strSrc, size_t count); + + errno_t wcscat_s(wchar_t* strDest, size_t destMax, const wchar_t* strSrc); + + errno_t wcsncat_s(wchar_t* strDest, size_t destMax, const wchar_t* strSrc, size_t count); + + /* strtok */ + char* strtok_s(char* strToken, const char* strDelimit, char** context); + + wchar_t* wcstok_s(wchar_t* strToken, const wchar_t* strDelimit, wchar_t** context); + + /* sprintf */ + int sprintf_s(char* strDest, size_t destMax, const char* format, ...); + + int swprintf_s(wchar_t* strDest, size_t destMax, const wchar_t* format, ...); + + /* vsprintf */ + int vsprintf_s(char* strDest, size_t destMax, const char* format, va_list argptr); + + int vswprintf_s(wchar_t* strDest, size_t destMax, const wchar_t* format, va_list argptr); + + int vsnprintf_s(char* strDest, size_t destMax, size_t count, const char* format, va_list arglist); + + /* snprintf */ + int snprintf_s(char* strDest, size_t destMax, size_t count, const char* format, ...); + + /* scanf */ + int scanf_s(const char* format, ...); + + int wscanf_s(const wchar_t* format, ...); + + /* vscanf */ + int vscanf_s(const char* format, va_list arglist); + + int vwscanf_s(const wchar_t* format, va_list arglist); + + /* fscanf */ + int fscanf_s(FILE* stream, const char* format, ...); + + int fwscanf_s(FILE* stream, const wchar_t* format, ...); + + /* vfscanf */ + int vfscanf_s(FILE* stream, const char* format, va_list arglist); + + int vfwscanf_s(FILE* stream, const wchar_t* format, va_list arglist); + + /* sscanf */ + int sscanf_s(const char* buffer, const char* format, ...); + + int swscanf_s(const wchar_t* buffer, const wchar_t* format, ...); + +
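/* Illustrative usage (caller code, not part of this header): every *_s function returns errno_t, which is EOK on success, e.g. char buf[8]; if (strcpy_s(buf, sizeof(buf), "hi") != EOK) { handle the error } */ +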
/* vsscanf */ + int vsscanf_s(const char* buffer, const char* format, va_list argptr); + + int vswscanf_s(const wchar_t* buffer, const wchar_t* format, va_list arglist); + + /* gets */ + char* gets_s(char* buffer, size_t destMax); + + /* memset function */ + errno_t memset_s(void* dest, size_t destMax, int c, size_t count); + /* memcpy function */ + errno_t memcpy_s(void* dest, size_t destMax, const void* src, size_t count); + + /* strcpy */ + errno_t strcpy_s(char* strDest, size_t destMax, const char* strSrc); + /* strncpy */ + errno_t strncpy_s(char* strDest, size_t destMax, const char* strSrc, size_t count); + + /* strcat */ + errno_t strcat_s(char* strDest, size_t destMax, const char* strSrc); + /* strncat */ + errno_t strncat_s(char* strDest, size_t destMax, const char* strSrc, size_t count); + + errno_t strncpy_error(char* strDest, size_t destMax, const char* strSrc, size_t count); + errno_t strcpy_error(char* strDest, size_t destMax, const char* strSrc); + +#if defined(WITH_PERFORMANCE_ADDONS) + /* these functions are used by the *_sp macros below */ + errno_t memset_sOptAsm(void *dest, size_t destMax, int c, size_t count); + errno_t memset_sOptTc(void* dest, size_t destMax, int c, size_t count); + errno_t memcpy_sOptAsm(void* dest, size_t destMax, const void* src, size_t count); + errno_t memcpy_sOptTc(void* dest, size_t destMax, const void* src, size_t count); + + /* strcpy_sp is a macro, NOT a function, in performance optimization mode. */ +#define strcpy_sp(dest, destMax, src) /*lint -save -e506 -e1055 */ (( __builtin_constant_p((destMax)) && __builtin_constant_p((src))) ? \ STRCPY_SM((dest), (destMax), (src)) : strcpy_s((dest), (destMax), (src)) ) /*lint -restore */ + + /* strncpy_sp is a macro, NOT a function, in performance optimization mode. */ +#define strncpy_sp(dest, destMax, src, count) /*lint -save -e506 -e1055 */ ((__builtin_constant_p((count)) && __builtin_constant_p((destMax)) && __builtin_constant_p((src))) ? \ STRNCPY_SM((dest), (destMax), (src), (count)) : strncpy_s((dest), (destMax), (src), (count)) ) /*lint -restore */ + + /* strcat_sp is a macro, NOT a function, in performance optimization mode. */ +#define strcat_sp(dest, destMax, src) /*lint -save -e506 -e1055 */ (( __builtin_constant_p((destMax)) && __builtin_constant_p((src))) ? \ STRCAT_SM((dest), (destMax), (src)) : strcat_s((dest), (destMax), (src)) ) /*lint -restore */ + + /* strncat_sp is a macro, NOT a function, in performance optimization mode. */ +#define strncat_sp(dest, destMax, src, count) /*lint -save -e506 -e1055 */ ((__builtin_constant_p((count)) && __builtin_constant_p((destMax)) && __builtin_constant_p((src))) ? \ STRNCAT_SM((dest), (destMax), (src), (count)) : strncat_s((dest), (destMax), (src), (count)) ) /*lint -restore */ + + /* memcpy_sp is a macro, NOT a function, in performance optimization mode. */ +#define memcpy_sp(dest, destMax, src, count) /*lint -save -e506 -e1055 */ (__builtin_constant_p((count)) ? (MEMCPY_SM((dest), (destMax), (src), (count))) : \ (__builtin_constant_p((destMax)) ? (((size_t)(destMax) > 0 && (((UINT64T)(destMax) << 1) >> 1) < SECUREC_MEM_MAX_LEN)) ? memcpy_sOptTc((dest), (destMax), (src), (count)) : ERANGE ) : memcpy_sOptAsm((dest), (destMax), (src), (count)))) /*lint -restore */ + + /* memset_sp is a macro, NOT a function, in performance optimization mode. */ +#define memset_sp(dest, destMax, c, count) /*lint -save -e506 -e1055 */ (__builtin_constant_p((count)) ? (MEMSET_SM((dest), (destMax), (c), (count))) : \ (__builtin_constant_p((destMax)) ?
(((size_t)(destMax) > 0 && (((UINT64T)(destMax) << 1) >> 1) < SECUREC_MEM_MAX_LEN)) ? memset_sOptTc((dest), (destMax), (c), (count)) : ERANGE ) : memset_sOptAsm((dest), (destMax), (c), (count)))) /*lint -restore */ + +#endif + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif/* __SECUREC_H__5D13A042_DC3F_4ED9_A8D1_882811274C27 */ diff --git a/secure/include/securectype.h b/secure/include/securectype.h new file mode 100644 index 0000000..5de7337 --- /dev/null +++ b/secure/include/securectype.h @@ -0,0 +1,251 @@ +/**** + * Copyright (C) Huawei Technologies Co., Ltd. 2019-2021. ALL RIGHTS RESERVED. + * File name: securectype.h + * Description: + * defines internally used macros and data types. The macro SECUREC_ON_64BITS + * will be determined in this header file, which is a switch for part + * of the code. Some macros are used to suppress warnings from the MS compiler. + * Note: + * users can change the value of the SECUREC_STRING_MAX_LEN and SECUREC_MEM_MAX_LEN + * macros to meet their special needs. + * History: + * 1. Date: 2014/4/10 + * Author: LiShunda + * Modification: add error detection macro. If the pointer size of the dest system is NOT + * 4 bytes or 8 bytes, use #error "unsupported system..." to report the compiling error. + * 2. Date: 2014/5/16 + * Author: LiShunda + * Modification: on the HP-UX system, UINT8T is defined, so add a macro (_hpux) to detect + * whether we are on this system. + * 3. Date: 2014/6/3 + * Author: LiShunda + * Modification: remove , for pclint will give a warning on including this file. + * 4. Date: 2014/6/10 + * Author: LiShunda + * Modification: change uint8_t to UINT8T, which can avoid type redefinition. + ****************************************************************************** + */ + +#ifndef __SECURECTYPE_H__A7BBB686_AADA_451B_B9F9_44DACDAE18A7 +#define __SECURECTYPE_H__A7BBB686_AADA_451B_B9F9_44DACDAE18A7 + + +/* Shield the VC symbol redefinition warning */ +#if defined(_MSC_VER) && (_MSC_VER >= 1400) +#ifdef __STDC_WANT_SECURE_LIB__ + #undef __STDC_WANT_SECURE_LIB__ +#endif + #define __STDC_WANT_SECURE_LIB__ 0 +#ifdef _CRTIMP_ALTERNATIVE + #undef _CRTIMP_ALTERNATIVE +#endif + #define _CRTIMP_ALTERNATIVE //comment out the Microsoft *_s functions +#endif + +#include +#include +#include +/* #include this file is used to define some macros, such as INT_MAX and SIZE_MAX */ + +#if (defined(_WIN32) || defined(_WIN64)) +/* on the Windows platform, the optimized functions can't be used because there is no __builtin_constant_p-like function */ + +#ifdef WITH_PERFORMANCE_ADDONS +#undef WITH_PERFORMANCE_ADDONS +#endif +/* If the optimized macros are needed, you can define: #define __builtin_constant_p(x) 1 */ +#endif + +#if defined(__GNUC__) && ((__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3 /*above 3.4*/ )) ) + long __builtin_expect(long exp, long c); +#define LIKELY(x) __builtin_expect(!!(x), 1) +#define UNLIKELY(x) __builtin_expect(!!(x), 0) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif + +#ifndef TWO_MIN +#define TWO_MIN(a, b) ((a) < (b) ?
+
+#define WCHAR_SIZE sizeof(wchar_t)
+
+/* ref //sourceforge.net/p/predef/wiki/OperatingSystems/
+#if !(defined(__hpux) || defined(_AIX) || defined(__VXWORKS__) || defined(__vxworks) || defined(__ANDROID__) || defined(__WRLINUX__) || defined(_TYPE_uint8_t))
+typedef unsigned char uint8_t;
+#endif
+*/
+typedef char INT8T;
+typedef unsigned char UINT8T;
+
+/* define the max length of a string */
+#define SECUREC_STRING_MAX_LEN (0x7fffffffUL)
+#define SECUREC_WCHAR_STRING_MAX_LEN (SECUREC_STRING_MAX_LEN / WCHAR_SIZE)
+
+/* LSD: SECUREC_MEM_MAX_LEN for memcpy and memmove */
+#define SECUREC_MEM_MAX_LEN (0x7fffffffUL)
+#define SECUREC_WCHAR_MEM_MAX_LEN (SECUREC_MEM_MAX_LEN / WCHAR_SIZE)
+
+#if SECUREC_STRING_MAX_LEN > 0x7fffffff
+#error "the max string length is 2G, or you may remove this macro"
+#endif
+
+#define IN_REGISTER register
+
+#if defined(WITH_PERFORMANCE_ADDONS)
+    /* for strncpy_s performance optimization */
+#define STRNCPY_SM(dest, destMax, src, count) \
+    ((dest != NULL && src != NULL && (size_t)destMax > 0 && (size_t)destMax <= SECUREC_STRING_MAX_LEN && (TWO_MIN(count, strlen(src)) + 1) <= (size_t)destMax) ? ((count < strlen(src)) ? (memcpy(dest, src, count), *((char*)dest + count) = '\0', EOK) : (memcpy(dest, src, strlen(src) + 1), EOK)) : (strncpy_error(dest, destMax, src, count)))
+
+#define STRCPY_SM(dest, destMax, src) \
+    ((NULL != dest && NULL != src && (size_t)destMax > 0 && (size_t)destMax <= SECUREC_STRING_MAX_LEN && (strlen(src) + 1) <= (size_t)destMax) ? (memcpy(dest, src, strlen(src) + 1), EOK) : (strcpy_error(dest, destMax, src)))
+
+    /* for strcat_s performance optimization */
+#if defined(__GNUC__)
+#define STRCAT_SM(dest, destMax, src) \
+    ({ int catRet = EOK;\
+    if ((dest) != NULL && (src) != NULL && (size_t)(destMax) > 0 && (size_t)(destMax) <= SECUREC_STRING_MAX_LEN) {\
+        char* pCatTmpDst = (dest);\
+        size_t catRestSz = (destMax);\
+        do{\
+            while(catRestSz > 0 && *pCatTmpDst) {\
+                ++pCatTmpDst;\
+                --catRestSz;\
+            }\
+            if (catRestSz == 0) {\
+                catRet = EINVAL;\
+                break;\
+            }\
+            if ((strlen(src) + 1) <= catRestSz) {\
+                memcpy(pCatTmpDst, (src), strlen(src) + 1);\
+                catRet = EOK;\
+            }else{\
+                catRet = ERANGE;\
+            }\
+        }while(0);\
+        if (EOK != catRet) catRet = strcat_s((dest), (destMax), (src));\
+    }else{\
+        catRet = strcat_s((dest), (destMax), (src));\
+    }\
+    catRet;})
+#else
+#define STRCAT_SM(dest, destMax, src) strcat_s(dest, destMax, src)
+#endif
+
+    /* for strncat_s performance optimization */
+#if defined(__GNUC__)
+#define STRNCAT_SM(dest, destMax, src, count) \
+    ({ int ncatRet = EOK;\
+    if ((dest) != NULL && (src) != NULL && (size_t)destMax > 0 && (size_t)destMax <= SECUREC_STRING_MAX_LEN && (size_t)count <= SECUREC_STRING_MAX_LEN) {\
+        char* pCatTmpDest = (dest);\
+        size_t ncatRestSz = (destMax);\
+        do{\
+            while(ncatRestSz > 0 && *pCatTmpDest) {\
+                ++pCatTmpDest;\
+                --ncatRestSz;\
+            }\
+            if (ncatRestSz == 0) {\
+                ncatRet = EINVAL;\
+                break;\
+            }\
+            if ((TWO_MIN((count), strlen(src)) + 1) <= ncatRestSz) {\
+                if ((count) < strlen(src)) {\
+                    memcpy(pCatTmpDest, (src), (count));\
+                    *(pCatTmpDest + (count)) = '\0';\
+                }else {\
+                    memcpy(pCatTmpDest, (src), strlen(src) + 1);\
+                }\
+            }else{\
+                ncatRet = ERANGE;\
+            }\
+        }while(0);\
+        if (EOK != ncatRet) ncatRet = strncat_s((dest), (destMax), (src), (count));\
+    }else{\
+        ncatRet = strncat_s((dest), (destMax), (src), (count));\
+    }\
+    ncatRet;})
+#else
+#define STRNCAT_SM(dest, destMax, src, count) strncat_s(dest, destMax, src, count)
+#endif
+
+    /*
+     * MEMCPY_SM does NOT check buffer overlap by default; you can add this check to improve security:
+     *     condCheck = condCheck || (dest == src) || (dest > src && dest < (void*)((UINT8T*)src + count));\
+     *     condCheck = condCheck || (src > dest && src < (void*)((UINT8T*)dest + count)); \
+     */
+
+#define MEMCPY_SM(dest, destMax, src, count)\
+    (!(((size_t)destMax == 0) || ((((UINT64T)(destMax) << 1) >> 1) > SECUREC_MEM_MAX_LEN) || ((size_t)count > (size_t)destMax) || (NULL == (void*)dest) || (NULL == (void*)src)) ? (memcpy(dest, src, count), EOK) : (memcpy_s(dest, destMax, src, count)))
+
+#define MEMSET_SM(dest, destMax, c, count)\
+    (!(((size_t)destMax == 0) || ((((UINT64T)(destMax) << 1) >> 1) > SECUREC_MEM_MAX_LEN) || (NULL == (void*)dest) || ((size_t)count > (size_t)destMax)) ? (memset(dest, c, count), EOK) : (memset_s(dest, destMax, c, count)))
+
+#endif /* WITH_PERFORMANCE_ADDONS */
+
+#if defined(_MSC_VER) || defined(__ARMCC_VERSION)
+typedef __int64 INT64T;
+typedef unsigned __int64 UINT64T;
+#if defined(__ARMCC_VERSION)
+typedef int INT32T;
+typedef unsigned int UINT32T;
+#else
+typedef __int32 INT32T;
+typedef unsigned __int32 UINT32T;
+#endif
+#else
+typedef int INT32T;
+typedef unsigned int UINT32T;
+typedef long long INT64T;
+typedef unsigned long long UINT64T;
+#endif
+
+#ifdef _WIN64
+#define SECUREC_ON_64BITS
+#endif
+
+#if defined(__LP64__) || defined(_LP64)
+#define SECUREC_ON_64BITS
+#endif
+
+#if (defined(__GNUC__) && defined(__SIZEOF_POINTER__))
+#if (__SIZEOF_POINTER__ != 4) && (__SIZEOF_POINTER__ != 8)
+#error "unsupported system, contact Security Design Technology Department of 2012 Labs"
+#endif
+#endif
+
+#if (!defined(SECUREC_ON_64BITS) && defined(__GNUC__) && defined(__SIZEOF_POINTER__))
+#if __SIZEOF_POINTER__ == 8
+#define SECUREC_ON_64BITS
+#endif
+#endif
+
+#if (defined(__VXWORKS__) || defined(__vxworks) || defined(__VXWORKS))
+#ifndef _VXWORKS_PLATFORM_
+#define _VXWORKS_PLATFORM_
+#endif
+#endif
+
+#if defined(__SVR4) || defined(__svr4__)
+#define __SOLARIS
+#endif
+
+#if (defined(__hpux) || defined(_AIX) || defined(__SOLARIS))
+#define __UNIX
+#endif
+
+/* if COMPATIBLE_LINUX_FORMAT is enabled, the output format will be compatible with Linux. */
+#if !(defined(_WIN32) || defined(_WIN64) || defined(_VXWORKS_PLATFORM_))
+#define COMPATIBLE_LINUX_FORMAT
+#endif
+
+#ifdef COMPATIBLE_LINUX_FORMAT
+#include
+#include
+#endif
+
+#ifdef _VXWORKS_PLATFORM_
+#include
+#endif
+
+#endif /*__SECURECTYPE_H__A7BBB686_AADA_451B_B9F9_44DACDAE18A7*/
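The comment above MEMCPY_SM notes that the fast-path macro skips the overlap test by default. The snippet below spells that test out as a standalone helper; the helper name is invented for illustration, but the predicate is the same one memcpy_s itself evaluates before copying.

    #include <stddef.h>

    /* Nonzero if [dest, dest+count) and [src, src+count) overlap -
     * the same check memcpy_s performs before it copies. */
    static int regions_overlap(const void* dest, const void* src, size_t count)
    {
        const unsigned char* d = (const unsigned char*)dest;
        const unsigned char* s = (const unsigned char*)src;
        return (d == s) ||
               (d > s && d < s + count) ||
               (s > d && s < d + count);
    }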
diff --git a/secure/src/memcpy_s.c b/secure/src/memcpy_s.c
new file mode 100644
index 0000000..ae795bc
--- /dev/null
+++ b/secure/src/memcpy_s.c
@@ -0,0 +1,212 @@
+/*******************************************************************************
+* Copyright @ Huawei Technologies Co., Ltd. 1998-2014. All rights reserved.
+* File name: memcpy_s.c
+* History:
+* 1. Date:
+*    Author:
+*    Modification:
+********************************************************************************
+*/
+
+#include
+#include "securecutil.h"
+
+/*******************************************************************************
+* <NAME>
+*    memcpy_s
+*
+* <SYNOPSIS>
+*    errno_t memcpy_s(void *dest, size_t destMax, const void *src, size_t count);
+*
+* <FUNCTION DESCRIPTION>
+*    memcpy_s copies count bytes from src to dest.
+*
+* <INPUT PARAMETERS>
+*    dest                     Destination buffer.
+*    destMax                  Size of the destination buffer.
+*    src                      Buffer to copy from.
+*    count                    Number of characters to copy.
+*
+* <OUTPUT PARAMETERS>
+*    dest buffer              is updated.
+*
+* <RETURN VALUE>
+*    EOK                      Success
+*    EINVAL                   dest == NULL or src == NULL
+*    ERANGE                   count > destMax or destMax >
+*                             SECUREC_MEM_MAX_LEN or destMax == 0
+*    EOVERLAP_AND_RESET       dest buffer and source buffer overlap
+*
+*    If an error occurred, dest will be filled with 0.
+*    If the source and destination overlap, the behavior of memcpy_s is undefined.
+*    Use memmove_s to handle overlapping regions.
+*******************************************************************************
+*/
+
+/* assembly language memcpy */
+extern void* memcpy_opt(void* dest, const void* src, size_t n);
+
+errno_t memcpy_s(void* dest, size_t destMax, const void* src, size_t count)
+{
+    if (destMax == 0 || destMax > SECUREC_MEM_MAX_LEN) {
+        SECUREC_ERROR_INVALID_RANGE("memcpy_s");
+        return ERANGE;
+    }
+    if (dest == NULL || src == NULL) {
+        SECUREC_ERROR_INVALID_PARAMTER("memcpy_s");
+        if (dest != NULL) {
+            (void)memset(dest, 0, destMax);
+            return EINVAL_AND_RESET;
+        }
+        return EINVAL;
+    }
+    if (count > destMax) {
+        (void)memset(dest, 0, destMax);
+        SECUREC_ERROR_INVALID_RANGE("memcpy_s");
+        return ERANGE_AND_RESET;
+    }
+    if (dest == src) {
+        return EOK;
+    }
+    if ((dest > src && dest < (void *)((UINT8T*)src + count)) ||
+        (src > dest && src < (void *)((UINT8T*)dest + count)))
+    {
+        (void)memset(dest, 0, destMax);
+        SECUREC_ERROR_BUFFER_OVERLAP("memcpy_s");
+        return EOVERLAP_AND_RESET;
+    }
+    (void)memcpy(dest, src, count);
+    return EOK;
+}
+
+
+#if defined(WITH_PERFORMANCE_ADDONS)
+
+errno_t memcpy_sOptAsm(void* dest, size_t destMax, const void* src, size_t count)
+{
+    if (LIKELY( count <= destMax && dest && src /*&& dest != src*/
+        && destMax <= SECUREC_MEM_MAX_LEN
+        && count > 0
+        && ( (dest > src && (void*)((UINT8T*)src + count) <= dest) ||
+             (src > dest && (void*)((UINT8T*)dest + count) <= src) )
+        ) )
+    {
+        if (count > 32)
+        {
+            /* large enough, let the system API do it */
+#ifdef USE_ASM
+            memcpy_opt(dest, src, count);
+#else
+            (void)memcpy(dest, src, count);
+#endif
+            return EOK;
+        }
+        else
+        {
+            switch (count)
+            {
+                case 1:*(MY_STR1 *)dest=*(MY_STR1 *)src;break;
+                case 2:*(MY_STR2 *)dest=*(MY_STR2 *)src;break;
+                case 3:*(MY_STR3 *)dest=*(MY_STR3 *)src;break;
+                case 4:*(MY_STR4 *)dest=*(MY_STR4 *)src;break;
+                case 5:*(MY_STR5 *)dest=*(MY_STR5 *)src;break;
+                case 6:*(MY_STR6 *)dest=*(MY_STR6 *)src;break;
+                case 7:*(MY_STR7 *)dest=*(MY_STR7 *)src;break;
+                case 8:*(MY_STR8 *)dest=*(MY_STR8 *)src;break;
+                case 9:*(MY_STR9 *)dest=*(MY_STR9 *)src;break;
+                case 10:*(MY_STR10 *)dest=*(MY_STR10 *)src;break;
+                case 11:*(MY_STR11 *)dest=*(MY_STR11 *)src;break;
+                case 12:*(MY_STR12 *)dest=*(MY_STR12 *)src;break;
+                case 13:*(MY_STR13 *)dest=*(MY_STR13 *)src;break;
+                case 14:*(MY_STR14 *)dest=*(MY_STR14 *)src;break;
+                case 15:*(MY_STR15 *)dest=*(MY_STR15 *)src;break;
+                case 16:*(MY_STR16 *)dest=*(MY_STR16 *)src;break;
+                case 17:*(MY_STR17 *)dest=*(MY_STR17 *)src;break;
+                case 18:*(MY_STR18 *)dest=*(MY_STR18 *)src;break;
+                case 19:*(MY_STR19 *)dest=*(MY_STR19 *)src;break;
+                case 20:*(MY_STR20 *)dest=*(MY_STR20 *)src;break;
+                case 21:*(MY_STR21 *)dest=*(MY_STR21 *)src;break;
+                case 22:*(MY_STR22 *)dest=*(MY_STR22 *)src;break;
+                case 23:*(MY_STR23 *)dest=*(MY_STR23 *)src;break;
+                case 24:*(MY_STR24 *)dest=*(MY_STR24 *)src;break;
+                case 25:*(MY_STR25 *)dest=*(MY_STR25 *)src;break;
+                case 26:*(MY_STR26 *)dest=*(MY_STR26 *)src;break;
+                case 27:*(MY_STR27 *)dest=*(MY_STR27 *)src;break;
+                case 28:*(MY_STR28 *)dest=*(MY_STR28 *)src;break;
+                case 29:*(MY_STR29 *)dest=*(MY_STR29 *)src;break;
+                case 30:*(MY_STR30 *)dest=*(MY_STR30 *)src;break;
+                case 31:*(MY_STR31 *)dest=*(MY_STR31 *)src;break;
+                case 32:*(MY_STR32 *)dest=*(MY_STR32 *)src;break;
+            }
+            return EOK;
+        }
+    } else {
+        /* call it only to return the error code */
+        return memcpy_s(dest, destMax, src, count);
+    }
+}
+
+/* same as memcpy_sOptAsm, but with the "destMax <= SECUREC_MEM_MAX_LEN" judgement trimmed */
+errno_t memcpy_sOptTc(void* dest, size_t destMax, const void* src, size_t count)
+{
+    if (LIKELY( count <= destMax && dest && src /*&& dest != src*/
+        && count > 0
+        && ( (dest > src && (void*)((UINT8T*)src + count) <= dest) ||
+             (src > dest && (void*)((UINT8T*)dest + count) <= src) )
+        ) )
+    {
+        if (count > 32) {
+            /* large enough, let the system API do it */
+#ifdef USE_ASM
+            memcpy_opt(dest, src, count);
+#else
+            (void)memcpy(dest, src, count);
+#endif
+            return EOK;
+        } else {
+            /* use struct assignment */
+            switch (count)
+            {
+                case 1:*(MY_STR1 *)dest=*(MY_STR1 *)src;break;
+                case 2:*(MY_STR2 *)dest=*(MY_STR2 *)src;break;
+                case 3:*(MY_STR3 *)dest=*(MY_STR3 *)src;break;
+                case 4:*(MY_STR4 *)dest=*(MY_STR4 *)src;break;
+                case 5:*(MY_STR5 *)dest=*(MY_STR5 *)src;break;
+                case 6:*(MY_STR6 *)dest=*(MY_STR6 *)src;break;
+                case 7:*(MY_STR7 *)dest=*(MY_STR7 *)src;break;
+                case 8:*(MY_STR8 *)dest=*(MY_STR8 *)src;break;
+                case 9:*(MY_STR9 *)dest=*(MY_STR9 *)src;break;
+                case 10:*(MY_STR10 *)dest=*(MY_STR10 *)src;break;
+                case 11:*(MY_STR11 *)dest=*(MY_STR11 *)src;break;
+                case 12:*(MY_STR12 *)dest=*(MY_STR12 *)src;break;
+                case 13:*(MY_STR13 *)dest=*(MY_STR13 *)src;break;
+                case 14:*(MY_STR14 *)dest=*(MY_STR14 *)src;break;
+                case 15:*(MY_STR15 *)dest=*(MY_STR15 *)src;break;
+                case 16:*(MY_STR16 *)dest=*(MY_STR16 *)src;break;
+                case 17:*(MY_STR17 *)dest=*(MY_STR17 *)src;break;
+                case 18:*(MY_STR18 *)dest=*(MY_STR18 *)src;break;
+                case 19:*(MY_STR19 *)dest=*(MY_STR19 *)src;break;
+                case 20:*(MY_STR20 *)dest=*(MY_STR20 *)src;break;
+                case 21:*(MY_STR21 *)dest=*(MY_STR21 *)src;break;
+                case 22:*(MY_STR22 *)dest=*(MY_STR22 *)src;break;
+                case 23:*(MY_STR23 *)dest=*(MY_STR23 *)src;break;
+                case 24:*(MY_STR24 *)dest=*(MY_STR24 *)src;break;
+                case 25:*(MY_STR25 *)dest=*(MY_STR25 *)src;break;
+                case 26:*(MY_STR26 *)dest=*(MY_STR26 *)src;break;
+                case 27:*(MY_STR27 *)dest=*(MY_STR27 *)src;break;
+                case 28:*(MY_STR28 *)dest=*(MY_STR28 *)src;break;
+                case 29:*(MY_STR29 *)dest=*(MY_STR29 *)src;break;
+                case 30:*(MY_STR30 *)dest=*(MY_STR30 *)src;break;
+                case 31:*(MY_STR31 *)dest=*(MY_STR31 *)src;break;
+                case 32:*(MY_STR32 *)dest=*(MY_STR32 *)src;break;
+            }
+            return EOK;
+        }
+    }
+    else
+    {
+        /* call it only to return the error code */
+        return memcpy_s(dest, destMax, src, count);
+    }
+}
+#endif /* WITH_PERFORMANCE_ADDONS */
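To see the error contract of memcpy_s in action, here is a small, hypothetical driver (buffer names are illustrative). Note that on a range error the destination is zero-filled before ERANGE_AND_RESET is returned:

    #include <stdio.h>
    #include "securec.h"

    int main(void)
    {
        char src[8] = "abcdefg";
        char dst[8];
        char small[4];

        if (memcpy_s(dst, sizeof(dst), src, sizeof(src)) == EOK) {
            printf("copied: %s\n", dst);
        }
        /* count exceeds destMax: small is zeroed, ERANGE_AND_RESET returned */
        if (memcpy_s(small, sizeof(small), src, sizeof(src)) != EOK) {
            printf("oversized copy rejected\n");
        }
        return 0;
    }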
diff --git a/secure/src/memmove_s.c b/secure/src/memmove_s.c
new file mode 100644
index 0000000..fd50dd5
--- /dev/null
+++ b/secure/src/memmove_s.c
@@ -0,0 +1,85 @@
+/*******************************************************************************
+* Copyright @ Huawei Technologies Co., Ltd. 1998-2014. All rights reserved.
+* File name: memmove_s.c
+* History:
+* 1. Date:
+*    Author:
+*    Modification:
+********************************************************************************
+*/
+
+#include
+#include "securecutil.h"
+
+/*******************************************************************************
+ * <NAME>
+ *    memmove_s
+ *
+ * <SYNOPSIS>
+ *    errno_t memmove_s(void *dest, size_t destMax, const void *src, size_t count);
+ *
+ * <FUNCTION DESCRIPTION>
+ *    Copies count bytes of characters from src to dest.
+ *
+ * <INPUT PARAMETERS>
+ *    dest                    Destination object.
+ *    destMax                 Size of the destination buffer.
+ *    src                     Source object.
+ *    count                   Number of characters to copy.
+ *
+ * <OUTPUT PARAMETERS>
+ *    dest buffer             is updated.
+ *
+ * <RETURN VALUE>
+ *    EOK                     Success
+ *    EINVAL                  dest == NULL or src == NULL
+ *    ERANGE                  count > destMax or destMax > SECUREC_MEM_MAX_LEN
+ *                            or destMax == 0
+ *
+ *    If an error occurred, dest will be filled with 0.
+ *    If some regions of the source area and the destination overlap, memmove_s
+ *    ensures that the original source bytes in the overlapping region are copied
+ *    before being overwritten.
+ *******************************************************************************
+*/
+
+errno_t memmove_s(void* dest, size_t destMax, const void* src, size_t count)
+{
+    if (destMax == 0 || destMax > SECUREC_MEM_MAX_LEN)
+    {
+        SECUREC_ERROR_INVALID_RANGE("memmove_s");
+        return ERANGE;
+    }
+    if (dest == NULL || src == NULL)
+    {
+        SECUREC_ERROR_INVALID_PARAMTER("memmove_s");
+        if (dest != NULL)
+        {
+            (void)memset(dest, 0, destMax);
+            return EINVAL_AND_RESET;
+        }
+        return EINVAL;
+    }
+    if (count > destMax)
+    {
+        (void)memset(dest, 0, destMax);
+        SECUREC_ERROR_INVALID_RANGE("memmove_s");
+        return ERANGE_AND_RESET;
+    }
+    if (dest == src)
+    {
+        return EOK;
+    }
+
+    if (count > 0)
+    {
+#ifdef CALL_LIBC_COR_API
+        /* use the underlying memmove for performance reasons */
+        (void)memmove(dest, src, count);
+#else
+
+        util_memmove(dest, src, count);
+#endif
+    }
+    return EOK;
+}
\ No newline at end of file
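Unlike memcpy_s, memmove_s accepts overlapping buffers, which is why the memcpy_s documentation redirects overlap cases here. A brief, illustrative sketch:

    #include <stdio.h>
    #include "securec.h"

    int main(void)
    {
        char buf[16] = "0123456789";

        /* overlapping shift by two: memcpy_s would return
         * EOVERLAP_AND_RESET here, but memmove_s handles it */
        if (memmove_s(buf + 2, sizeof(buf) - 2, buf, 8) == EOK) {
            printf("%s\n", buf);   /* prints "0101234567" */
        }
        return 0;
    }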
diff --git a/secure/src/memset_s.c b/secure/src/memset_s.c
new file mode 100644
index 0000000..1e05093
--- /dev/null
+++ b/secure/src/memset_s.c
@@ -0,0 +1,276 @@
+/*******************************************************************************
+* Copyright @ Huawei Technologies Co., Ltd. 1998-2014. All rights reserved.
+* File name: memset_s.c
+* History:
+* 1. Date:
+*    Author:
+*    Modification:
+********************************************************************************
+*/
+
+#include
+#include "securecutil.h"
+
+/*******************************************************************************
+* <NAME>
+*    memset_s
+*
+* <SYNOPSIS>
+*    errno_t memset_s(void* dest, size_t destMax, int c, size_t count)
+*
+* <FUNCTION DESCRIPTION>
+*    Sets buffers to a specified character.
+*
+* <INPUT PARAMETERS>
+*    dest                     Pointer to destination.
+*    destMax                  The size of the buffer.
+*    c                        Character to set.
+*    count                    Number of characters.
+*
+* <OUTPUT PARAMETERS>
+*    dest buffer              is updated.
+*
+* <RETURN VALUE>
+*    EOK                      Success
+*    EINVAL                   dest == NULL
+*    ERANGE                   count > destMax or destMax > SECUREC_MEM_MAX_LEN
+*                             or destMax == 0
+*******************************************************************************
+*/
+
+
+errno_t memset_s(void* dest, size_t destMax, int c, size_t count)
+{
+    if (destMax == 0 || destMax > SECUREC_MEM_MAX_LEN) {
+        SECUREC_ERROR_INVALID_RANGE("memset_s");
+        return ERANGE;
+    }
+    if (dest == NULL) {
+        SECUREC_ERROR_INVALID_PARAMTER("memset_s");
+        return EINVAL;
+    }
+
+    if (count > destMax) {
+        memset(dest, c, destMax);
+        SECUREC_ERROR_INVALID_RANGE("memset_s");
+        return ERANGE_AND_RESET;
+    }
+    memset(dest, c, count);
+    return EOK;
+}
+
+#if defined(WITH_PERFORMANCE_ADDONS)
+/* assembly language memset */
+extern void *memset_opt(void *d, int c, size_t cnt);
+static const MY_STR32 myStr = {"\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"};
+static const MY_STR32 myStrAllFF = {"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"};
+
+errno_t memset_sOptAsm(void* dest, size_t destMax, int c, size_t count)
+{
+    if (LIKELY(count <= destMax && dest && destMax <= SECUREC_MEM_MAX_LEN))
+    {
+        if (count > 32)
+        {
+#ifdef USE_ASM
+            (void)memset_opt(dest, c, count);
+#else
+            (void)memset(dest, c, count);
+#endif
+            return EOK;
+        }
+        else
+        {
+            /* use struct assignment */
+            switch (c)
+            {
+            case 0:
+                switch (count)
+                {
+                case 1:*(MY_STR1 *)dest=*(MY_STR1 *)(&myStr);break;
+                case 2:*(MY_STR2 *)dest=*(MY_STR2 *)(&myStr);break;
+                case 3:*(MY_STR3 *)dest=*(MY_STR3 *)(&myStr);break;
+                case 4:*(MY_STR4 *)dest=*(MY_STR4 *)(&myStr);break;
+                case 5:*(MY_STR5 *)dest=*(MY_STR5 *)(&myStr);break;
+                case 6:*(MY_STR6 *)dest=*(MY_STR6 *)(&myStr);break;
+                case 7:*(MY_STR7 *)dest=*(MY_STR7 *)(&myStr);break;
+                case 8:*(MY_STR8 *)dest=*(MY_STR8 *)(&myStr);break;
+                case 9:*(MY_STR9 *)dest=*(MY_STR9 *)(&myStr);break;
+                case 10:*(MY_STR10 *)dest=*(MY_STR10 *)(&myStr);break;
+                case 11:*(MY_STR11 *)dest=*(MY_STR11 *)(&myStr);break;
+                case 12:*(MY_STR12 *)dest=*(MY_STR12 *)(&myStr);break;
+                case 13:*(MY_STR13 *)dest=*(MY_STR13 *)(&myStr);break;
+                case 14:*(MY_STR14 *)dest=*(MY_STR14 *)(&myStr);break;
+                case 15:*(MY_STR15 *)dest=*(MY_STR15 *)(&myStr);break;
+                case 16:*(MY_STR16 *)dest=*(MY_STR16 *)(&myStr);break;
+                case 17:*(MY_STR17 *)dest=*(MY_STR17 *)(&myStr);break;
+                case 18:*(MY_STR18 *)dest=*(MY_STR18 *)(&myStr);break;
+                case 19:*(MY_STR19 *)dest=*(MY_STR19 *)(&myStr);break;
+                case 20:*(MY_STR20 *)dest=*(MY_STR20 *)(&myStr);break;
+                case 21:*(MY_STR21 *)dest=*(MY_STR21 *)(&myStr);break;
+                case 22:*(MY_STR22 *)dest=*(MY_STR22 *)(&myStr);break;
+                case 23:*(MY_STR23 *)dest=*(MY_STR23 *)(&myStr);break;
+                case 24:*(MY_STR24 *)dest=*(MY_STR24 *)(&myStr);break;
+                case 25:*(MY_STR25 *)dest=*(MY_STR25 *)(&myStr);break;
+                case 26:*(MY_STR26 *)dest=*(MY_STR26 *)(&myStr);break;
+                case 27:*(MY_STR27 *)dest=*(MY_STR27 *)(&myStr);break;
+                case 28:*(MY_STR28 *)dest=*(MY_STR28 *)(&myStr);break;
+                case 29:*(MY_STR29 *)dest=*(MY_STR29 *)(&myStr);break;
+                case 30:*(MY_STR30 *)dest=*(MY_STR30 *)(&myStr);break;
+                case 31:*(MY_STR31 *)dest=*(MY_STR31 *)(&myStr);break;
+                case 32:*(MY_STR32 *)dest=*(MY_STR32 *)(&myStr);break;
+                }
+
+                return EOK;
+
+            case 0xFF:
+                switch (count)
+                {
+                case 1:*(MY_STR1 *)dest=*(MY_STR1 *)(&myStrAllFF);break;
+                case 2:*(MY_STR2 *)dest=*(MY_STR2 *)(&myStrAllFF);break;
+                case 3:*(MY_STR3 *)dest=*(MY_STR3 *)(&myStrAllFF);break;
+                case 4:*(MY_STR4 *)dest=*(MY_STR4 *)(&myStrAllFF);break;
+                case 5:*(MY_STR5 *)dest=*(MY_STR5 *)(&myStrAllFF);break;
+                case 6:*(MY_STR6 *)dest=*(MY_STR6 *)(&myStrAllFF);break;
+                case 7:*(MY_STR7 *)dest=*(MY_STR7 *)(&myStrAllFF);break;
+                case 8:*(MY_STR8 *)dest=*(MY_STR8 *)(&myStrAllFF);break;
+                case 9:*(MY_STR9 *)dest=*(MY_STR9 *)(&myStrAllFF);break;
+                case 10:*(MY_STR10 *)dest=*(MY_STR10 *)(&myStrAllFF);break;
+                case 11:*(MY_STR11 *)dest=*(MY_STR11 *)(&myStrAllFF);break;
+                case 12:*(MY_STR12 *)dest=*(MY_STR12 *)(&myStrAllFF);break;
+                case 13:*(MY_STR13 *)dest=*(MY_STR13 *)(&myStrAllFF);break;
+                case 14:*(MY_STR14 *)dest=*(MY_STR14 *)(&myStrAllFF);break;
+                case 15:*(MY_STR15 *)dest=*(MY_STR15 *)(&myStrAllFF);break;
+                case 16:*(MY_STR16 *)dest=*(MY_STR16 *)(&myStrAllFF);break;
+                case 17:*(MY_STR17 *)dest=*(MY_STR17 *)(&myStrAllFF);break;
+                case 18:*(MY_STR18 *)dest=*(MY_STR18 *)(&myStrAllFF);break;
+                case 19:*(MY_STR19 *)dest=*(MY_STR19 *)(&myStrAllFF);break;
+                case 20:*(MY_STR20 *)dest=*(MY_STR20 *)(&myStrAllFF);break;
+                case 21:*(MY_STR21 *)dest=*(MY_STR21 *)(&myStrAllFF);break;
+                case 22:*(MY_STR22 *)dest=*(MY_STR22 *)(&myStrAllFF);break;
+                case 23:*(MY_STR23 *)dest=*(MY_STR23 *)(&myStrAllFF);break;
+                case 24:*(MY_STR24 *)dest=*(MY_STR24 *)(&myStrAllFF);break;
+                case 25:*(MY_STR25 *)dest=*(MY_STR25 *)(&myStrAllFF);break;
+                case 26:*(MY_STR26 *)dest=*(MY_STR26 *)(&myStrAllFF);break;
+                case 27:*(MY_STR27 *)dest=*(MY_STR27 *)(&myStrAllFF);break;
+                case 28:*(MY_STR28 *)dest=*(MY_STR28 *)(&myStrAllFF);break;
+                case 29:*(MY_STR29 *)dest=*(MY_STR29 *)(&myStrAllFF);break;
+                case 30:*(MY_STR30 *)dest=*(MY_STR30 *)(&myStrAllFF);break;
+                case 31:*(MY_STR31 *)dest=*(MY_STR31 *)(&myStrAllFF);break;
+                case 32:*(MY_STR32 *)dest=*(MY_STR32 *)(&myStrAllFF);break;
+                }
+                return EOK;
+            }
+            memset(dest, c, count);
+            return EOK;
+        }
+    }
+    else
+    {
+        return memset_s(dest, destMax, c, count);
+    }
+}
+
+errno_t memset_sOptTc(void* dest, size_t destMax, int c, size_t count)
+{
+    if (LIKELY(count <= destMax && dest))
+    {
+        if (count > 32)
+        {
+#ifdef USE_ASM
+            (void)memset_opt(dest, c, count);
+#else
+            (void)memset(dest, c, count);
+#endif
+            return EOK;
+        }
+        else
+        {
+            /* use struct assignment */
+            switch (c)
+            {
+            case 0:
+                switch (count)
+                {
+                case 1:*(MY_STR1 *)dest=*(MY_STR1 *)(&myStr);break;
+                case 2:*(MY_STR2 *)dest=*(MY_STR2 *)(&myStr);break;
+                case 3:*(MY_STR3 *)dest=*(MY_STR3 *)(&myStr);break;
+                case 4:*(MY_STR4 *)dest=*(MY_STR4 *)(&myStr);break;
+                case 5:*(MY_STR5 *)dest=*(MY_STR5 *)(&myStr);break;
+                case 6:*(MY_STR6 *)dest=*(MY_STR6 *)(&myStr);break;
+                case 7:*(MY_STR7 *)dest=*(MY_STR7 *)(&myStr);break;
+                case 8:*(MY_STR8 *)dest=*(MY_STR8 *)(&myStr);break;
+                case 9:*(MY_STR9 *)dest=*(MY_STR9 *)(&myStr);break;
+                case 10:*(MY_STR10 *)dest=*(MY_STR10 *)(&myStr);break;
+                case 11:*(MY_STR11 *)dest=*(MY_STR11 *)(&myStr);break;
+                case 12:*(MY_STR12 *)dest=*(MY_STR12 *)(&myStr);break;
+                case 13:*(MY_STR13 *)dest=*(MY_STR13 *)(&myStr);break;
+                case 14:*(MY_STR14 *)dest=*(MY_STR14 *)(&myStr);break;
+                case 15:*(MY_STR15 *)dest=*(MY_STR15 *)(&myStr);break;
+                case 16:*(MY_STR16 *)dest=*(MY_STR16 *)(&myStr);break;
+                case 17:*(MY_STR17 *)dest=*(MY_STR17 *)(&myStr);break;
+                case 18:*(MY_STR18 *)dest=*(MY_STR18 *)(&myStr);break;
+                case 19:*(MY_STR19 *)dest=*(MY_STR19 *)(&myStr);break;
+                case 20:*(MY_STR20 *)dest=*(MY_STR20 *)(&myStr);break;
+                case 21:*(MY_STR21 *)dest=*(MY_STR21 *)(&myStr);break;
+                case 22:*(MY_STR22 *)dest=*(MY_STR22 *)(&myStr);break;
+                case 23:*(MY_STR23 *)dest=*(MY_STR23 *)(&myStr);break;
+                case 24:*(MY_STR24 *)dest=*(MY_STR24 *)(&myStr);break;
+                case 25:*(MY_STR25 *)dest=*(MY_STR25 *)(&myStr);break;
+                case 26:*(MY_STR26 *)dest=*(MY_STR26 *)(&myStr);break;
+                case 27:*(MY_STR27 *)dest=*(MY_STR27 *)(&myStr);break;
+                case 28:*(MY_STR28 *)dest=*(MY_STR28 *)(&myStr);break;
+                case 29:*(MY_STR29 *)dest=*(MY_STR29 *)(&myStr);break;
+                case 30:*(MY_STR30 *)dest=*(MY_STR30 *)(&myStr);break;
+                case 31:*(MY_STR31 *)dest=*(MY_STR31 *)(&myStr);break;
+                case 32:*(MY_STR32 *)dest=*(MY_STR32 *)(&myStr);break;
+                }
+                return EOK;
+
+            case 0xFF:
+                switch (count)
+                {
+                case 1:*(MY_STR1 *)dest=*(MY_STR1 *)(&myStrAllFF);break;
+                case 2:*(MY_STR2 *)dest=*(MY_STR2 *)(&myStrAllFF);break;
+                case 3:*(MY_STR3 *)dest=*(MY_STR3 *)(&myStrAllFF);break;
+                case 4:*(MY_STR4 *)dest=*(MY_STR4 *)(&myStrAllFF);break;
+                case 5:*(MY_STR5 *)dest=*(MY_STR5 *)(&myStrAllFF);break;
+                case 6:*(MY_STR6 *)dest=*(MY_STR6 *)(&myStrAllFF);break;
+                case 7:*(MY_STR7 *)dest=*(MY_STR7 *)(&myStrAllFF);break;
+                case 8:*(MY_STR8 *)dest=*(MY_STR8 *)(&myStrAllFF);break;
+                case 9:*(MY_STR9 *)dest=*(MY_STR9 *)(&myStrAllFF);break;
+                case 10:*(MY_STR10 *)dest=*(MY_STR10 *)(&myStrAllFF);break;
+                case 11:*(MY_STR11 *)dest=*(MY_STR11 *)(&myStrAllFF);break;
+                case 12:*(MY_STR12 *)dest=*(MY_STR12 *)(&myStrAllFF);break;
+                case 13:*(MY_STR13 *)dest=*(MY_STR13 *)(&myStrAllFF);break;
+                case 14:*(MY_STR14 *)dest=*(MY_STR14 *)(&myStrAllFF);break;
+                case 15:*(MY_STR15 *)dest=*(MY_STR15 *)(&myStrAllFF);break;
+                case 16:*(MY_STR16 *)dest=*(MY_STR16 *)(&myStrAllFF);break;
+                case 17:*(MY_STR17 *)dest=*(MY_STR17 *)(&myStrAllFF);break;
+                case 18:*(MY_STR18 *)dest=*(MY_STR18 *)(&myStrAllFF);break;
+                case 19:*(MY_STR19 *)dest=*(MY_STR19 *)(&myStrAllFF);break;
+                case 20:*(MY_STR20 *)dest=*(MY_STR20 *)(&myStrAllFF);break;
+                case 21:*(MY_STR21 *)dest=*(MY_STR21 *)(&myStrAllFF);break;
+                case 22:*(MY_STR22 *)dest=*(MY_STR22 *)(&myStrAllFF);break;
+                case 23:*(MY_STR23 *)dest=*(MY_STR23 *)(&myStrAllFF);break;
+                case 24:*(MY_STR24 *)dest=*(MY_STR24 *)(&myStrAllFF);break;
+                case 25:*(MY_STR25 *)dest=*(MY_STR25 *)(&myStrAllFF);break;
+                case 26:*(MY_STR26 *)dest=*(MY_STR26 *)(&myStrAllFF);break;
+                case 27:*(MY_STR27 *)dest=*(MY_STR27 *)(&myStrAllFF);break;
+                case 28:*(MY_STR28 *)dest=*(MY_STR28 *)(&myStrAllFF);break;
+                case 29:*(MY_STR29 *)dest=*(MY_STR29 *)(&myStrAllFF);break;
+                case 30:*(MY_STR30 *)dest=*(MY_STR30 *)(&myStrAllFF);break;
+                case 31:*(MY_STR31 *)dest=*(MY_STR31 *)(&myStrAllFF);break;
+                case 32:*(MY_STR32 *)dest=*(MY_STR32 *)(&myStrAllFF);break;
+                }
+                return EOK;
+            }
+            memset(dest, c, count);
+            return EOK;
+        }
+    }
+    else
+    {
+        return memset_s(dest, destMax, c, count);
+    }
+}
+#endif /* WITH_PERFORMANCE_ADDONS */
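A typical memset_s call site, e.g. scrubbing a sensitive buffer: unlike a bare memset, invalid sizes are reported through the return code rather than silently corrupting memory. This is an illustrative fragment assuming only memset_s and EOK from securec.h:

    #include "securec.h"

    int wipe_secret(void)
    {
        char secret[32];
        /* ... secret is filled and used here ... */

        if (memset_s(secret, sizeof(secret), 0, sizeof(secret)) != EOK) {
            return -1;   /* EINVAL or ERANGE: nothing was scrubbed */
        }
        return 0;
    }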
diff --git a/secure/src/securecutil.h b/secure/src/securecutil.h
new file mode 100644
index 0000000..66d824c
--- /dev/null
+++ b/secure/src/securecutil.h
@@ -0,0 +1,135 @@
+/*******************************************************************************
+* Copyright @ Huawei Technologies Co., Ltd. 1998-2014. All rights reserved.
+* File name: securecutil.h
+* History:
+* 1. Date: 2014/5/20
+*    Author: LiShunda
+*    Modification: remove the extern "C" modifier, which caused a g++ link error.
+********************************************************************************
+*/
+
+#ifndef __SECURECUTIL_H__46C86578_F8FF_4E49_8E64_9B175241761F
+#define __SECURECUTIL_H__46C86578_F8FF_4E49_8E64_9B175241761F
+
+#include
+
+#ifdef CALL_LIBC_COR_API
+/* if memory.h doesn't exist, use "string.h" instead */
+#include <memory.h>
+#endif
+
+#define DIRECT_ASSIGNMENT_THRESHOLD (12)
+
+/* structs for performance */
+typedef struct {char buf[1];}MY_STR1;
+typedef struct {char buf[2];}MY_STR2;
+typedef struct {char buf[3];}MY_STR3;
+typedef struct {char buf[4];}MY_STR4;
+typedef struct {char buf[5];}MY_STR5;
+typedef struct {char buf[6];}MY_STR6;
+typedef struct {char buf[7];}MY_STR7;
+typedef struct {char buf[8];}MY_STR8;
+typedef struct {char buf[9];}MY_STR9;
+typedef struct {char buf[10];}MY_STR10;
+typedef struct {char buf[11];}MY_STR11;
+typedef struct {char buf[12];}MY_STR12;
+typedef struct {char buf[13];}MY_STR13;
+typedef struct {char buf[14];}MY_STR14;
+typedef struct {char buf[15];}MY_STR15;
+typedef struct {char buf[16];}MY_STR16;
+typedef struct {char buf[17];}MY_STR17;
+typedef struct {char buf[18];}MY_STR18;
+typedef struct {char buf[19];}MY_STR19;
+typedef struct {char buf[20];}MY_STR20;
+typedef struct {char buf[21];}MY_STR21;
+typedef struct {char buf[22];}MY_STR22;
+typedef struct {char buf[23];}MY_STR23;
+typedef struct {char buf[24];}MY_STR24;
+typedef struct {char buf[25];}MY_STR25;
+typedef struct {char buf[26];}MY_STR26;
+typedef struct {char buf[27];}MY_STR27;
+typedef struct {char buf[28];}MY_STR28;
+typedef struct {char buf[29];}MY_STR29;
+typedef struct {char buf[30];}MY_STR30;
+typedef struct {char buf[31];}MY_STR31;
+typedef struct {char buf[32];}MY_STR32;
+typedef struct {char buf[33];}MY_STR33;
+typedef struct {char buf[34];}MY_STR34;
+typedef struct {char buf[35];}MY_STR35;
+typedef struct {char buf[36];}MY_STR36;
+typedef struct {char buf[37];}MY_STR37;
+typedef struct {char buf[38];}MY_STR38;
+typedef struct {char buf[39];}MY_STR39;
+typedef struct {char buf[40];}MY_STR40;
+typedef struct {char buf[41];}MY_STR41;
+typedef struct {char buf[42];}MY_STR42;
+typedef struct {char buf[43];}MY_STR43;
+typedef struct {char buf[44];}MY_STR44;
+typedef struct {char buf[45];}MY_STR45;
+typedef struct {char buf[46];}MY_STR46;
+typedef struct {char buf[47];}MY_STR47;
+typedef struct {char buf[48];}MY_STR48;
+
+/*#define USE_ASM*/
+
+#define _CHECK_BUFFER_OVERLAP /*lint !e946*/
+#define ERROR_HANDLER_BY_PRINTF
+
+/*
+#define ERROR_HANDLER_BY_ASSERT
+#define ERROR_HANDLER_BY_FILE_LOG
+*/
+    /* Users can change the error handler by modifying the following definitions,
+     * such as logging the detailed error to a file.
+     */
+#if defined(_DEBUG) || defined(DEBUG)
+#if defined(ERROR_HANDLER_BY_ASSERT)
+#define SECUREC_ERROR_INVALID_PARAMTER(msg) assert( msg "invalid argument" == NULL)
+#define SECUREC_ERROR_INVALID_RANGE(msg) assert( msg "invalid dest buffer size" == NULL)
+#define SECUREC_ERROR_BUFFER_OVERLAP(msg) assert( msg "buffer overlap" == NULL)
+
+#elif defined(ERROR_HANDLER_BY_PRINTF)
+#define SECUREC_ERROR_INVALID_PARAMTER(msg) printf("%s invalid argument\n", msg)
+#define SECUREC_ERROR_INVALID_RANGE(msg) printf("%s invalid dest buffer size\n", msg)
+#define SECUREC_ERROR_BUFFER_OVERLAP(msg) printf("%s buffer overlap\n", msg)
+
+#else
+#define SECUREC_ERROR_INVALID_PARAMTER(msg) ((void)0)
+#define SECUREC_ERROR_INVALID_RANGE(msg) ((void)0)
+#define SECUREC_ERROR_BUFFER_OVERLAP(msg) ((void)0)
+#endif
+
+#if defined(ERROR_HANDLER_BY_FILE_LOG)
+#define SECUREC_ERROR_INVALID_PARAMTER(msg) logSecureCRuntimeError(msg " EINVAL\n")
+#define SECUREC_ERROR_INVALID_RANGE(msg) logSecureCRuntimeError(msg " ERANGE\n")
+#define SECUREC_ERROR_BUFFER_OVERLAP(msg) logSecureCRuntimeError(msg " EOVERLAP\n")
+#endif
+#else
+#define SECUREC_ERROR_INVALID_PARAMTER(msg) ((void)0)
+#define SECUREC_ERROR_INVALID_RANGE(msg) ((void)0)
+#define SECUREC_ERROR_BUFFER_OVERLAP(msg) ((void)0)
+#endif
+
+
+    void memcpy_8b(void *dest, const void *src, size_t count);
+#ifndef CALL_LIBC_COR_API
+    void memcpy_32b(void *dest, const void *src, size_t count);
+    void memcpy_64b(void *dest, const void *src, size_t count);
+#endif
+
+    void util_memmove(void* dst, const void* src, size_t count);
+//lint -esym(526, vsnprintf_helper*)
+    int vsnprintf_helper(char* string, size_t count, const char* format, va_list ap);
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+    void logSecureCRuntimeError(const char* errDetail);
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+
+
+#endif
\ No newline at end of file
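securecutil.h declares logSecureCRuntimeError for the ERROR_HANDLER_BY_FILE_LOG option, but the patch contains no definition for it. One possible implementation is sketched below; the function body and the log file path are illustrative assumptions, not part of the patch:

    #include <stdio.h>

    /* hypothetical implementation: append each securec runtime error
     * to a log file instead of printing to stdout */
    void logSecureCRuntimeError(const char* errDetail)
    {
        FILE* fp = fopen("securec_error.log", "a");  /* illustrative path */
        if (fp != NULL) {
            (void)fputs(errDetail, fp);
            (void)fclose(fp);
        }
    }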