diff --git a/src/ggml-cpu/CMakeLists.txt b/src/ggml-cpu/CMakeLists.txt index 971313d20..c8cc32fa5 100644 --- a/src/ggml-cpu/CMakeLists.txt +++ b/src/ggml-cpu/CMakeLists.txt @@ -23,6 +23,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name) ggml-cpu/amx/mmq.cpp ggml-cpu/amx/mmq.h ggml-cpu/ggml-cpu-impl.h + ggml-cpu/common.h + ggml-cpu/binary-ops.h + ggml-cpu/binary-ops.cpp + ggml-cpu/unary-ops.h + ggml-cpu/unary-ops.cpp ) target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17) diff --git a/src/ggml-cpu/binary-ops.cpp b/src/ggml-cpu/binary-ops.cpp new file mode 100644 index 000000000..14f5b43ae --- /dev/null +++ b/src/ggml-cpu/binary-ops.cpp @@ -0,0 +1,158 @@ +#include "binary-ops.h" + +#if defined(GGML_USE_ACCELERATE) +#include + +using vDSP_fn_t = void (*)(const float *, vDSP_Stride, const float *, vDSP_Stride, float *, vDSP_Stride, vDSP_Length); +#endif + +static inline float op_add(float a, float b) { + return a + b; +} + +static inline float op_sub(float a, float b) { + return a - b; +} + +static inline float op_mul(float a, float b) { + return a * b; +} + +static inline float op_div(float a, float b) { + return a / b; +} + +template +static inline void vec_binary_op_contiguous(const int64_t n, dst_t * z, const src0_t * x, const src1_t * y) { + constexpr auto src0_to_f32 = type_conversion_table::to_f32; + constexpr auto src1_to_f32 = type_conversion_table::to_f32; + constexpr auto f32_to_dst = type_conversion_table::from_f32; + + for (int i = 0; i < n; i++) { + z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(y[i]))); + } +} + +template +static inline void vec_binary_op_non_contiguous(const int64_t n, const int64_t ne10, const int64_t nb10, dst_t * z, const src0_t * x, const src1_t * y) { + constexpr auto src0_to_f32 = type_conversion_table::to_f32; + constexpr auto src1_to_f32 = type_conversion_table::to_f32; + constexpr auto f32_to_dst = type_conversion_table::from_f32; + + for (int i = 0; i < n; i++) { + int i10 = i % ne10; + const src1_t * y_ptr = (const src1_t *)((const char *)y + i10*nb10); + z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(*y_ptr))); + } +} + +template +static void apply_binary_op(const ggml_compute_params * params, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT( nb0 == sizeof(dst_t)); + GGML_ASSERT(nb00 == sizeof(src0_t)); + + const auto [ir0, ir1] = get_thread_range(params, src0); + const bool is_src1_contiguous = (nb10 == sizeof(src1_t)); + + if (!is_src1_contiguous) { // broadcast not implemented yet for non-contiguous + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + } + +#ifdef GGML_USE_ACCELERATE + vDSP_fn_t vDSP_op = nullptr; + // TODO - avoid the f32-only check using type 'trait' lookup tables and row-based src-to-float conversion functions + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + if (op == op_add) { + vDSP_op = vDSP_vadd; + } else if (op == op_sub) { + vDSP_op = vDSP_vsub; + } else if (op == op_mul) { + vDSP_op = vDSP_vmul; + } else if (op == op_div) { + vDSP_op = vDSP_vdiv; + } + } +#endif + + for (int64_t ir = ir0; ir < ir1; ++ir) { + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + dst_t * dst_ptr = (dst_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + const src1_t * src1_ptr = (const src1_t *) ((const char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + + if (is_src1_contiguous) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t nr0 = ne00 / ne10; + + for (int64_t r = 0; r < nr0; ++r) { +#ifdef GGML_USE_ACCELERATE + if constexpr (std::is_same_v && std::is_same_v && std::is_same_v) { + if (vDSP_op != nullptr) { + vDSP_op(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10); + continue; + } + } +#endif + vec_binary_op_contiguous(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); + } + } else { + vec_binary_op_non_contiguous(ne0, ne10, nb10, dst_ptr, src0_ptr, src1_ptr); + } + } +} + +// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates +template +static void binary_op(const ggml_compute_params * params, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + /* */ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32 + apply_binary_op(params, dst); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16 + apply_binary_op(params, dst); + } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16 + apply_binary_op(params, dst); + } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_BF16) { + apply_binary_op(params, dst); + } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + apply_binary_op(params, dst); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { + apply_binary_op(params, dst); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + apply_binary_op(params, dst); + } else { + GGML_ABORT("%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, + ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); + } +} + +void ggml_compute_forward_add_non_quantized(const ggml_compute_params * params, ggml_tensor * dst) { + binary_op(params, dst); +} + +void ggml_compute_forward_sub(const ggml_compute_params * params, ggml_tensor * dst) { + binary_op(params, dst); +} + +void ggml_compute_forward_mul(const ggml_compute_params * params, ggml_tensor * dst) { + binary_op(params, dst); +} + +void ggml_compute_forward_div(const ggml_compute_params * params, ggml_tensor * dst) { + binary_op(params, dst); +} diff --git a/src/ggml-cpu/binary-ops.h b/src/ggml-cpu/binary-ops.h new file mode 100644 index 000000000..aca1d89be --- /dev/null +++ b/src/ggml-cpu/binary-ops.h @@ -0,0 +1,16 @@ +#pragma once + +#include "common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void ggml_compute_forward_add_non_quantized(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_sub(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_mul(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_div(const struct ggml_compute_params * params, struct ggml_tensor * dst); + +#ifdef __cplusplus +} +#endif diff --git a/src/ggml-cpu/common.h b/src/ggml-cpu/common.h new file mode 100644 index 000000000..3df01c1ed --- /dev/null +++ b/src/ggml-cpu/common.h @@ -0,0 +1,72 @@ +#pragma once + +#include "ggml.h" +#include "ggml-cpu-traits.h" +#include "ggml-cpu-impl.h" +#include "ggml-impl.h" + +#ifdef __cplusplus + +#include + +// convenience functions/macros for use in template calls +// note: these won't be required after the 'traits' lookup table is used. +static inline ggml_fp16_t f32_to_f16(float x) { + return GGML_FP32_TO_FP16(x); +} + +static inline float f16_to_f32(ggml_fp16_t x) { + return GGML_FP16_TO_FP32(x); +} + +static inline ggml_bf16_t f32_to_bf16(float x) { + return GGML_FP32_TO_BF16(x); +} + +static inline float bf16_to_f32(ggml_bf16_t x) { + return GGML_BF16_TO_FP32(x); +} + +static inline float f32_to_f32(float x) { + return x; +} + +// TODO - merge this into the traits table, after using row-based conversions +template +struct type_conversion_table; + +template <> +struct type_conversion_table { + static constexpr float (*to_f32)(ggml_fp16_t) = f16_to_f32; + static constexpr ggml_fp16_t (*from_f32)(float) = f32_to_f16; +}; + +template <> +struct type_conversion_table { + static constexpr float (*to_f32)(float) = f32_to_f32; + static constexpr float (*from_f32)(float) = f32_to_f32; +}; + +template <> +struct type_conversion_table { + static constexpr float (*to_f32)(ggml_bf16_t) = bf16_to_f32; + static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16; +}; + +static std::pair get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) { + const int64_t ith = params->ith; + const int64_t nth = params->nth; + + const int64_t nr = ggml_nrows(src0); + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); + + return {ir0, ir1}; +} + +#endif diff --git a/src/ggml-cpu/ggml-cpu.c b/src/ggml-cpu/ggml-cpu.c index d2c5feec4..dd3ebfb59 100644 --- a/src/ggml-cpu/ggml-cpu.c +++ b/src/ggml-cpu/ggml-cpu.c @@ -9,6 +9,8 @@ #include "ggml-impl.h" #include "ggml-cpu-quants.h" #include "ggml-threading.h" +#include "ggml-cpu/unary-ops.h" +#include "ggml-cpu/binary-ops.h" #include "ggml.h" #if defined(_MSC_VER) || defined(__MINGW32__) @@ -4289,340 +4291,6 @@ static void ggml_compute_forward_dup( // ggml_compute_forward_add -static void ggml_compute_forward_add_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (nb10 == sizeof(float)) { - for (int ir = ir0; ir < ir1; ++ir) { - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - const int64_t nr0 = ne00 / ne10; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); - - for (int64_t r = 0; r < nr0; ++r) { -#ifdef GGML_USE_ACCELERATE - vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); -#else - ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); -#endif - } - } - } else { - // src1 is not contiguous - for (int ir = ir0; ir < ir1; ++ir) { - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - - for (int64_t i0 = 0; i0 < ne0; ++i0) { - const int64_t i10 = i0 % ne10; - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); - - dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; - } - } - } -} - -static void ggml_compute_forward_add_f16_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - if (dst->type == GGML_TYPE_F32) { - GGML_ASSERT( nb0 == sizeof(float)); - } - else { - GGML_ASSERT(dst->type == GGML_TYPE_F16); - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); - } - - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (nb10 == sizeof(float)) { - if (dst->type == GGML_TYPE_F16) { - for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); - - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); - } - } - } else { - for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); - - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]; - } - } - } - } - else { - // src1 is not contiguous - GGML_ABORT("fatal error"); - } -} - -static void ggml_compute_forward_add_bf16_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(src0->type == GGML_TYPE_BF16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - if (dst->type == GGML_TYPE_F32) { - GGML_ASSERT( nb0 == sizeof(float)); - } - else { - GGML_ASSERT(dst->type == GGML_TYPE_BF16); - GGML_ASSERT( nb0 == sizeof(ggml_bf16_t)); - } - - GGML_ASSERT(nb00 == sizeof(ggml_bf16_t)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (nb10 == sizeof(float)) { - if (dst->type == GGML_TYPE_BF16) { - for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); - - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); - } - } - } else { - for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); - - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i]; - } - } - } - } - else { - // src1 is not contiguous - GGML_ABORT("fatal error"); - } -} - -static void ggml_compute_forward_add_f16_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F16); - - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (nb10 == sizeof(ggml_fp16_t)) { - for (int ir = ir0; ir < ir1; ++ir) { - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - const int64_t nr0 = ne00 / ne10; - - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); - - for (int64_t r = 0; r < nr0; ++r) { - ggml_vec_add_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); - } - } - } - else { - // src1 is not contiguous - GGML_ABORT("fatal error"); - } -} - -static void ggml_compute_forward_add_bf16_bf16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(src0->type == GGML_TYPE_BF16); - GGML_ASSERT(src1->type == GGML_TYPE_BF16); - GGML_ASSERT(dst->type == GGML_TYPE_BF16); - - GGML_ASSERT( nb0 == sizeof(ggml_bf16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_bf16_t)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (nb10 == sizeof(ggml_bf16_t)) { - for (int ir = ir0; ir < ir1; ++ir) { - // src0, src1 and dst are same shape => same indices - const int i3 = ir/(ne2*ne1); - const int i2 = (ir - i3*ne2*ne1)/ne1; - const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - - ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); - ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); - ggml_bf16_t * src1_ptr = (ggml_bf16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); - - for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + GGML_BF16_TO_FP32(src1_ptr[i])); - } - } - } - else { - // src1 is not contiguous - GGML_ABORT("fatal error"); - } -} - static void ggml_compute_forward_add_q_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -4708,37 +4376,10 @@ static void ggml_compute_forward_add( switch (src0->type) { case GGML_TYPE_F32: - { - if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add_f32(params, dst); - } - else { - GGML_ABORT("fatal error"); - } - } break; case GGML_TYPE_F16: - { - if (src1->type == GGML_TYPE_F16) { - ggml_compute_forward_add_f16_f16(params, dst); - } - else if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add_f16_f32(params, dst); - } - else { - GGML_ABORT("fatal error"); - } - } break; case GGML_TYPE_BF16: { - if (src1->type == GGML_TYPE_BF16) { - ggml_compute_forward_add_bf16_bf16(params, dst); - } - else if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add_bf16_f32(params, dst); - } - else { - GGML_ABORT("fatal error"); - } + ggml_compute_forward_add_non_quantized(params, dst); } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -5272,140 +4913,107 @@ static void ggml_compute_forward_acc( } } -// ggml_compute_forward_sub +// ggml_compute_forward_sum -static void ggml_compute_forward_sub_f32( +static void ggml_compute_forward_sum_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - assert(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); + if (params->ith != 0) { + return; + } - // rows per thread - const int dr = (nr + nth - 1)/nth; + assert(ggml_is_scalar(dst)); + assert(src0->nb[0] == sizeof(float)); - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) - if (nb10 == sizeof(float)) { - for (int ir = ir0; ir < ir1; ++ir) { - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + ggml_float sum = 0; + ggml_float row_sum = 0; - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - const int64_t nr0 = ne00 / ne10; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); - - for (int64_t r = 0; r < nr0; ++r) { -#ifdef GGML_USE_ACCELERATE - vDSP_vsub(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10); -#else - ggml_vec_sub_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); -#endif - } - } - } else { - // src1 is not contiguous - for (int ir = ir0; ir < ir1; ++ir) { - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - - for (int64_t i0 = 0; i0 < ne0; ++i0) { - const int64_t i10 = i0 % ne10; - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); - - dst_ptr[i0] = src0_ptr[i0] - *src1_ptr; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_vec_sum_f32_ggf(ne00, + &row_sum, + (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); + sum += row_sum; } } } + ((float *) dst->data)[0] = sum; } -static void ggml_compute_forward_sub_f16( +static void ggml_compute_forward_sum_f16( const struct ggml_compute_params * params, - struct ggml_tensor * dst) { + struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - assert(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + if (params->ith != 0) { + return; + } - const int ith = params->ith; - const int nth = params->nth; + assert(ggml_is_scalar(dst)); - const int nr = ggml_nrows(src0); + assert(src0->nb[0] == sizeof(ggml_fp16_t)); - GGML_TENSOR_BINARY_OP_LOCALS + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F16); + float sum = 0; + float row_sum = 0; - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_vec_sum_f16_ggf(ne00, + &row_sum, + (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); + sum += row_sum; + } + } + } + ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum); +} - // rows per thread - const int dr = (nr + nth - 1)/nth; +static void ggml_compute_forward_sum_bf16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); + const struct ggml_tensor * src0 = dst->src[0]; - if (nb10 == sizeof(ggml_fp16_t)) { - for (int ir = ir0; ir < ir1; ++ir) { - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + if (params->ith != 0) { + return; + } + + assert(ggml_is_scalar(dst)); + + assert(src0->nb[0] == sizeof(ggml_bf16_t)); - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - const int64_t nr0 = ne00 / ne10; + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + float sum = 0; + float row_sum = 0; - for (int64_t r = 0; r < nr0; ++r) { - ggml_vec_sub_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_vec_sum_bf16_ggf(ne00, + &row_sum, + (ggml_bf16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); + sum += row_sum; } } - } else { - // src1 is not contiguous - GGML_ABORT("unimplemented error"); } + ((ggml_bf16_t *) dst->data)[0] = GGML_FP32_TO_BF16(sum); } -static void ggml_compute_forward_sub( +static void ggml_compute_forward_sum( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -5414,11 +5022,15 @@ static void ggml_compute_forward_sub( switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sub_f32(params, dst); + ggml_compute_forward_sum_f32(params, dst); } break; case GGML_TYPE_F16: { - ggml_compute_forward_sub_f16(params, dst); + ggml_compute_forward_sum_f16(params, dst); + } break; + case GGML_TYPE_BF16: + { + ggml_compute_forward_sum_bf16(params, dst); } break; default: { @@ -5427,145 +5039,51 @@ static void ggml_compute_forward_sub( } } -// ggml_compute_forward_mul +// ggml_compute_forward_sum_rows -static void ggml_compute_forward_mul_f32( +static void ggml_compute_forward_sum_rows_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - if (nb10 == sizeof(float)) { - for (int64_t ir = ith; ir < nr; ir += nth) { - // src0 and dst are same shape => same indices - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - const int64_t nr0 = ne00 / ne10; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); - - for (int64_t r = 0 ; r < nr0; ++r) { -#ifdef GGML_USE_ACCELERATE - UNUSED(ggml_vec_mul_f32); - - vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); -#else - ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); -#endif - } - } - } else { - // src1 is not contiguous - for (int64_t ir = ith; ir < nr; ir += nth) { - // src0 and dst are same shape => same indices - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - - for (int64_t i0 = 0; i0 < ne00; ++i0) { - const int64_t i10 = i0 % ne10; - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); - - dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr); - } - } + if (params->ith != 0) { + return; } -} - -static void ggml_compute_forward_mul_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F16); - - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - if (nb10 == sizeof(ggml_fp16_t)) { - for (int64_t ir = ith; ir < nr; ir += nth) { - // src0 and dst are same shape => same indices - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(dst->nb[0] == sizeof(float)); - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - const int64_t nr0 = ne00 / ne10; + GGML_TENSOR_UNARY_OP_LOCALS - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + GGML_ASSERT(ne0 == 1); + GGML_ASSERT(ne1 == ne01); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); - for (int64_t r = 0 ; r < nr0; ++r) { - ggml_vec_mul_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); + for (int64_t i3 = 0; i3 < ne03; i3++) { + for (int64_t i2 = 0; i2 < ne02; i2++) { + for (int64_t i1 = 0; i1 < ne01; i1++) { + float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); + float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); + float row_sum = 0; + ggml_vec_sum_f32(ne00, &row_sum, src_row); + dst_row[0] = row_sum; } } - } else { - // src1 is not contiguous - GGML_ABORT("unimplemented error"); } } -static void ggml_compute_forward_mul( +static void ggml_compute_forward_sum_rows( const struct ggml_compute_params * params, struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT((src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && "only f32/f16 src1 supported for now"); switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_mul_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_mul_f16(params, dst); + ggml_compute_forward_sum_rows_f32(params, dst); } break; default: { @@ -5574,129 +5092,46 @@ static void ggml_compute_forward_mul( } } -// ggml_compute_forward_div +// ggml_compute_forward_mean -static void ggml_compute_forward_div_f32( +static void ggml_compute_forward_mean_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - if (nb10 == sizeof(float)) { - for (int64_t ir = ith; ir < nr; ir += nth) { - // src0 and dst are same shape => same indices - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - const int64_t nr0 = ne00 / ne10; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); - - for (int64_t r = 0; r < nr0; ++r) { -#ifdef GGML_USE_ACCELERATE - UNUSED(ggml_vec_div_f32); - - vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10); -#else - ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); -#endif - } - } - } else { - // src1 is not contiguous - for (int64_t ir = ith; ir < nr; ir += nth) { - // src0 and dst are same shape => same indices - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - - for (int64_t i0 = 0; i0 < ne00; ++i0) { - const int64_t i10 = i0 % ne10; - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); - dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr); - } - } + if (params->ith != 0) { + return; } -} - -static void ggml_compute_forward_div_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - const int ith = params->ith; - const int nth = params->nth; - - const int64_t nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F16); + assert(src0->nb[0] == sizeof(float)); - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_TENSOR_UNARY_OP_LOCALS - if (nb10 == sizeof(ggml_fp16_t)) { - for (int64_t ir = ith; ir < nr; ir += nth) { - // src0 and dst are same shape => same indices - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + assert(ne0 == 1); + assert(ne1 == ne01); + assert(ne2 == ne02); + assert(ne3 == ne03); - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - const int64_t nr0 = ne00 / ne10; + UNUSED(ne0); + UNUSED(ne1); + UNUSED(ne2); + UNUSED(ne3); - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_vec_sum_f32(ne00, + (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); - for (int64_t r = 0; r < nr0; ++r) { - ggml_vec_div_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); + *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00; } } - } else { - // src1 is not contiguous - GGML_ABORT("unimplemented error"); } } -static void ggml_compute_forward_div( +static void ggml_compute_forward_mean( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -5705,11 +5140,7 @@ static void ggml_compute_forward_div( switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_div_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_div_f16(params, dst); + ggml_compute_forward_mean_f32(params, dst); } break; default: { @@ -5718,9 +5149,9 @@ static void ggml_compute_forward_div( } } -// ggml_compute_forward_sqr +// ggml_compute_forward_argmax -static void ggml_compute_forward_sqr_f32( +static void ggml_compute_forward_argmax_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -5730,47 +5161,25 @@ static void ggml_compute_forward_sqr_f32( return; } - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - assert( dst->nb[0] == sizeof(float)); assert(src0->nb[0] == sizeof(float)); + assert(dst->nb[0] == sizeof(float)); - for (int i = 0; i < n; i++) { - ggml_vec_sqr_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; -static void ggml_compute_forward_sqr_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - assert( dst->nb[0] == sizeof(ggml_fp16_t)); - assert(src0->nb[0] == sizeof(ggml_fp16_t)); + const size_t nb01 = src0->nb[1]; + const size_t nb0 = dst->nb[0]; - for (int i = 0; i < n; i++) { - ggml_vec_sqr_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1]))); + for (int64_t i1 = 0; i1 < ne01; i1++) { + float * src = (float *) ((char *) src0->data + i1*nb01); + int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0); + int v = 0; + ggml_vec_argmax_f32(ne00, &v, src); + dst_[0] = v; } } -static void ggml_compute_forward_sqr( +static void ggml_compute_forward_argmax( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -5779,11 +5188,7 @@ static void ggml_compute_forward_sqr( switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sqr_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_sqr_f16(params, dst); + ggml_compute_forward_argmax_f32(params, dst); } break; default: { @@ -5792,72 +5197,78 @@ static void ggml_compute_forward_sqr( } } -// ggml_compute_forward_sqrt +// ggml_compute_forward_count_equal -static void ggml_compute_forward_sqrt_f32( +static void ggml_compute_forward_count_equal_i32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; - if (params->ith != 0) { - return; - } + GGML_TENSOR_BINARY_OP_LOCALS; - assert(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(src0->type == GGML_TYPE_I32); + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + GGML_ASSERT(ggml_is_scalar(dst)); + GGML_ASSERT(dst->type == GGML_TYPE_I64); - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; + const int64_t nr = ggml_nrows(src0); - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); + const int ith = params->ith; + const int nth = params->nth; - for (int i = 0; i < n; i++) { - ggml_vec_sqrt_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} + int64_t * sums = (int64_t *) params->wdata; + int64_t sum_thread = 0; -static void ggml_compute_forward_sqrt_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; - const struct ggml_tensor * src0 = dst->src[0]; + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = MIN(ir0 + dr, nr); - if (params->ith != 0) { - return; - } + for (int64_t ir = ir0; ir < ir1; ++ir) { + const int64_t i03 = ir / (ne02*ne01); + const int64_t i02 = (ir - i03*ne03) / ne01; + const int64_t i01 = ir - i03*ne03 - i02*ne02; - assert(ggml_are_same_shape(src0, dst)); + const char * data0 = (const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01; + const char * data1 = (const char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11; - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; + for (int64_t i00 = 0; i00 < ne00; ++i00) { + const int32_t val0 = *((const int32_t *) (data0 + i00*nb00)); + const int32_t val1 = *((const int32_t *) (data1 + i00*nb10)); - assert( dst->nb[0] == sizeof(ggml_fp16_t)); - assert(src0->nb[0] == sizeof(ggml_fp16_t)); + sum_thread += val0 == val1; + } + } + if (ith != 0) { + sums[ith] = sum_thread; + } + ggml_barrier(params->threadpool); - for (int i = 0; i < n; i++) { - ggml_vec_sqrt_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1]))); + if (ith != 0) { + return; + } + + for (int ith_other = 1; ith_other < nth; ++ith_other) { + sum_thread += sums[ith_other]; } + *((int64_t *) dst->data) = sum_thread; } -static void ggml_compute_forward_sqrt( +static void ggml_compute_forward_count_equal( const struct ggml_compute_params * params, struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_sqrt_f32(params, dst); - } break; - case GGML_TYPE_F16: + case GGML_TYPE_I32: { - ggml_compute_forward_sqrt_f16(params, dst); + ggml_compute_forward_count_equal_i32(params, dst); } break; default: { @@ -5866,9 +5277,9 @@ static void ggml_compute_forward_sqrt( } } -// ggml_compute_forward_log +// ggml_compute_forward_repeat -static void ggml_compute_forward_log_f32( +static void ggml_compute_forward_repeat_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -5878,24 +5289,43 @@ static void ggml_compute_forward_log_f32( return; } - GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_can_repeat(src0, dst)); - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; + GGML_TENSOR_UNARY_OP_LOCALS - GGML_ASSERT( dst->nb[0] == sizeof(float)); - GGML_ASSERT(src0->nb[0] == sizeof(float)); + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne0/ne00); + const int nr1 = (int)(ne1/ne01); + const int nr2 = (int)(ne2/ne02); + const int nr3 = (int)(ne3/ne03); - for (int i = 0; i < n; i++) { - ggml_vec_log_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + // TODO: maybe this is not optimal? + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne03; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne02; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne01; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_vec_cpy_f32(ne00, + (float *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0), + (float *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01)); + } + } + } + } + } + } } } -static void ggml_compute_forward_log_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { +static void ggml_compute_forward_repeat_f16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; @@ -5903,35 +5333,60 @@ static void ggml_compute_forward_log_f16( return; } - GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_can_repeat(src0, dst)); - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; + GGML_TENSOR_UNARY_OP_LOCALS - GGML_ASSERT( dst->nb[0] == sizeof(ggml_fp16_t)); - GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t)); + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne0/ne00); + const int nr1 = (int)(ne1/ne01); + const int nr2 = (int)(ne2/ne02); + const int nr3 = (int)(ne3/ne03); - for (int i = 0; i < n; i++) { - ggml_vec_log_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1]))); + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + + // TODO: maybe this is not optimal? + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne03; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne02; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne01; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_fp16_t * y = (ggml_fp16_t *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0); + ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01); + // ggml_vec_cpy_f16(ne00, y, x) + for (int i = 0; i < ne00; ++i) { + y[i] = x[i]; + } + } + } + } + } + } + } } } -static void ggml_compute_forward_log( +static void ggml_compute_forward_repeat( const struct ggml_compute_params * params, struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; switch (src0->type) { - case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_I16: { - ggml_compute_forward_log_f32(params, dst); + ggml_compute_forward_repeat_f16(params, dst); } break; - case GGML_TYPE_F16: + case GGML_TYPE_F32: + case GGML_TYPE_I32: { - ggml_compute_forward_log_f16(params, dst); + ggml_compute_forward_repeat_f32(params, dst); } break; default: { @@ -5940,9 +5395,9 @@ static void ggml_compute_forward_log( } } -// ggml_compute_forward_sin +// ggml_compute_forward_repeat_back -static void ggml_compute_forward_sin_f32( +static void ggml_compute_forward_repeat_back_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -5952,60 +5407,64 @@ static void ggml_compute_forward_sin_f32( return; } - GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_can_repeat(dst, src0)); - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; + GGML_TENSOR_UNARY_OP_LOCALS - GGML_ASSERT( dst->nb[0] == sizeof(float)); - GGML_ASSERT(src0->nb[0] == sizeof(float)); + // guaranteed to be an integer due to the check in ggml_can_repeat + const int nr0 = (int)(ne00/ne0); + const int nr1 = (int)(ne01/ne1); + const int nr2 = (int)(ne02/ne2); + const int nr3 = (int)(ne03/ne3); - for (int i = 0; i < n; i++) { - ggml_vec_sin_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (ggml_is_contiguous(dst)) { + ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + } else { + for (int k3 = 0; k3 < ne3; k3++) { + for (int k2 = 0; k2 < ne2; k2++) { + for (int k1 = 0; k1 < ne1; k1++) { + ggml_vec_set_f32(ne0, + (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3), + 0); + } + } + } + } + + // TODO: maybe this is not optimal? + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne3; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne2; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne1; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + ggml_vec_acc_f32(ne0, + (float *) ((char *) dst->data + ( k3)*nb3 + ( k2)*nb2 + ( k1)*nb1), + (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00)); + } + } + } + } + } + } } } -static void ggml_compute_forward_sin_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - GGML_ASSERT( dst->nb[0] == sizeof(ggml_fp16_t)); - GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t)); - - for (int i = 0; i < n; i++) { - ggml_vec_sin_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_sin( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { +static void ggml_compute_forward_repeat_back( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_sin_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_sin_f16(params, dst); + ggml_compute_forward_repeat_back_f32(params, dst); } break; default: { @@ -6014,1415 +5473,205 @@ static void ggml_compute_forward_sin( } } -// ggml_compute_forward_cos - -static void ggml_compute_forward_cos_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - GGML_ASSERT( dst->nb[0] == sizeof(float)); - GGML_ASSERT(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - ggml_vec_cos_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} +// ggml_compute_forward_concat -static void ggml_compute_forward_cos_f16( +static void ggml_compute_forward_concat_any( const struct ggml_compute_params * params, struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; - if (params->ith != 0) { - return; - } - - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - GGML_ASSERT( dst->nb[0] == sizeof(ggml_fp16_t)); - GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t)); - - for (int i = 0; i < n; i++) { - ggml_vec_cos_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_cos( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; + const size_t len = ggml_type_size(src0->type); - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_cos_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_cos_f16(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} + const int ith = params->ith; + const int nth = params->nth; -// ggml_compute_forward_sum + GGML_TENSOR_BINARY_OP_LOCALS -static void ggml_compute_forward_sum_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { + const int32_t dim = ggml_get_op_params_i32(dst, 0); - const struct ggml_tensor * src0 = dst->src[0]; + GGML_ASSERT(dim >= 0 && dim < 4); - if (params->ith != 0) { - return; - } + int64_t o[4] = {0, 0, 0, 0}; + o[dim] = src0->ne[dim]; - assert(ggml_is_scalar(dst)); - assert(src0->nb[0] == sizeof(float)); + const char * x; - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) + // TODO: smarter multi-theading + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = ith; i2 < ne2; i2 += nth) { + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { + x = (const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03; + } else { + x = (const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13; + } - ggml_float sum = 0; - ggml_float row_sum = 0; + char * y = (char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3; - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - ggml_vec_sum_f32_ggf(ne00, - &row_sum, - (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); - sum += row_sum; + memcpy(y, x, len); + } } } } - ((float *) dst->data)[0] = sum; } -static void ggml_compute_forward_sum_f16( +static void ggml_compute_forward_concat_i8( const struct ggml_compute_params * params, - struct ggml_tensor * dst) { + struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; - if (params->ith != 0) { - return; - } - - assert(ggml_is_scalar(dst)); - - assert(src0->nb[0] == sizeof(ggml_fp16_t)); - - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) - - float sum = 0; - float row_sum = 0; + GGML_ASSERT(ggml_type_size(src0->type) == sizeof(int8_t)); - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - ggml_vec_sum_f16_ggf(ne00, - &row_sum, - (ggml_fp16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); - sum += row_sum; - } - } - } - ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum); -} + const int ith = params->ith; + const int nth = params->nth; -static void ggml_compute_forward_sum_bf16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { + GGML_TENSOR_BINARY_OP_LOCALS - const struct ggml_tensor * src0 = dst->src[0]; + const int32_t dim = ggml_get_op_params_i32(dst, 0); - if (params->ith != 0) { - return; - } + GGML_ASSERT(dim >= 0 && dim < 4); - assert(ggml_is_scalar(dst)); + int64_t o[4] = {0, 0, 0, 0}; + o[dim] = src0->ne[dim]; - assert(src0->nb[0] == sizeof(ggml_bf16_t)); + const int8_t * x; - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) + // TODO: smarter multi-theading + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = ith; i2 < ne2; i2 += nth) { + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { + x = (const int8_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03); + } else { + x = (const int8_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13); + } - float sum = 0; - float row_sum = 0; + int8_t * y = (int8_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - ggml_vec_sum_bf16_ggf(ne00, - &row_sum, - (ggml_bf16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); - sum += row_sum; + *y = *x; + } } } } - ((ggml_bf16_t *) dst->data)[0] = GGML_FP32_TO_BF16(sum); } -static void ggml_compute_forward_sum( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { +static void ggml_compute_forward_concat_f16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_sum_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_sum_f16(params, dst); - } break; - case GGML_TYPE_BF16: - { - ggml_compute_forward_sum_bf16(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} + GGML_ASSERT(ggml_type_size(src0->type) == sizeof(ggml_fp16_t)); -// ggml_compute_forward_sum_rows + const int ith = params->ith; + const int nth = params->nth; -static void ggml_compute_forward_sum_rows_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { + GGML_TENSOR_BINARY_OP_LOCALS - const struct ggml_tensor * src0 = dst->src[0]; + const int32_t dim = ggml_get_op_params_i32(dst, 0); - if (params->ith != 0) { - return; - } + GGML_ASSERT(dim >= 0 && dim < 4); - GGML_ASSERT(src0->nb[0] == sizeof(float)); - GGML_ASSERT(dst->nb[0] == sizeof(float)); + int64_t o[4] = {0, 0, 0, 0}; + o[dim] = src0->ne[dim]; - GGML_TENSOR_UNARY_OP_LOCALS + const ggml_fp16_t * x; - GGML_ASSERT(ne0 == 1); - GGML_ASSERT(ne1 == ne01); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne3 == ne03); + // TODO: smarter multi-theading + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = ith; i2 < ne2; i2 += nth) { + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { + x = (const ggml_fp16_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03); + } else { + x = (const ggml_fp16_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13); + } - for (int64_t i3 = 0; i3 < ne03; i3++) { - for (int64_t i2 = 0; i2 < ne02; i2++) { - for (int64_t i1 = 0; i1 < ne01; i1++) { - float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); - float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); - float row_sum = 0; - ggml_vec_sum_f32(ne00, &row_sum, src_row); - dst_row[0] = row_sum; - } - } - } -} - -static void ggml_compute_forward_sum_rows( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_sum_rows_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_mean - -static void ggml_compute_forward_mean_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(src0->nb[0] == sizeof(float)); - - GGML_TENSOR_UNARY_OP_LOCALS - - assert(ne0 == 1); - assert(ne1 == ne01); - assert(ne2 == ne02); - assert(ne3 == ne03); - - UNUSED(ne0); - UNUSED(ne1); - UNUSED(ne2); - UNUSED(ne3); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - ggml_vec_sum_f32(ne00, - (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), - (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); - - *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00; - } - } - } -} - -static void ggml_compute_forward_mean( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_mean_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_argmax - -static void ggml_compute_forward_argmax_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(src0->nb[0] == sizeof(float)); - assert(dst->nb[0] == sizeof(float)); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - - const size_t nb01 = src0->nb[1]; - const size_t nb0 = dst->nb[0]; - - for (int64_t i1 = 0; i1 < ne01; i1++) { - float * src = (float *) ((char *) src0->data + i1*nb01); - int32_t * dst_ = (int32_t *) ((char *) dst->data + i1*nb0); - int v = 0; - ggml_vec_argmax_f32(ne00, &v, src); - dst_[0] = v; - } -} - -static void ggml_compute_forward_argmax( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_argmax_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_count_equal - -static void ggml_compute_forward_count_equal_i32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_TENSOR_BINARY_OP_LOCALS; - - GGML_ASSERT(src0->type == GGML_TYPE_I32); - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_are_same_shape(src0, src1)); - GGML_ASSERT(ggml_is_scalar(dst)); - GGML_ASSERT(dst->type == GGML_TYPE_I64); - - const int64_t nr = ggml_nrows(src0); - - const int ith = params->ith; - const int nth = params->nth; - - int64_t * sums = (int64_t *) params->wdata; - int64_t sum_thread = 0; - - // rows per thread - const int64_t dr = (nr + nth - 1)/nth; - - // row range for this thread - const int64_t ir0 = dr*ith; - const int64_t ir1 = MIN(ir0 + dr, nr); - - for (int64_t ir = ir0; ir < ir1; ++ir) { - const int64_t i03 = ir / (ne02*ne01); - const int64_t i02 = (ir - i03*ne03) / ne01; - const int64_t i01 = ir - i03*ne03 - i02*ne02; - - const char * data0 = (const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01; - const char * data1 = (const char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11; - - for (int64_t i00 = 0; i00 < ne00; ++i00) { - const int32_t val0 = *((const int32_t *) (data0 + i00*nb00)); - const int32_t val1 = *((const int32_t *) (data1 + i00*nb10)); - - sum_thread += val0 == val1; - } - } - if (ith != 0) { - sums[ith] = sum_thread; - } - ggml_barrier(params->threadpool); - - if (ith != 0) { - return; - } - - for (int ith_other = 1; ith_other < nth; ++ith_other) { - sum_thread += sums[ith_other]; - } - *((int64_t *) dst->data) = sum_thread; -} - -static void ggml_compute_forward_count_equal( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_I32: - { - ggml_compute_forward_count_equal_i32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_repeat - -static void ggml_compute_forward_repeat_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - GGML_ASSERT(ggml_can_repeat(src0, dst)); - - GGML_TENSOR_UNARY_OP_LOCALS - - // guaranteed to be an integer due to the check in ggml_can_repeat - const int nr0 = (int)(ne0/ne00); - const int nr1 = (int)(ne1/ne01); - const int nr2 = (int)(ne2/ne02); - const int nr3 = (int)(ne3/ne03); - - // TODO: support for transposed / permuted tensors - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - // TODO: maybe this is not optimal? - for (int i3 = 0; i3 < nr3; i3++) { - for (int k3 = 0; k3 < ne03; k3++) { - for (int i2 = 0; i2 < nr2; i2++) { - for (int k2 = 0; k2 < ne02; k2++) { - for (int i1 = 0; i1 < nr1; i1++) { - for (int k1 = 0; k1 < ne01; k1++) { - for (int i0 = 0; i0 < nr0; i0++) { - ggml_vec_cpy_f32(ne00, - (float *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0), - (float *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01)); - } - } - } - } - } - } - } -} - -static void ggml_compute_forward_repeat_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - GGML_ASSERT(ggml_can_repeat(src0, dst)); - - GGML_TENSOR_UNARY_OP_LOCALS - - // guaranteed to be an integer due to the check in ggml_can_repeat - const int nr0 = (int)(ne0/ne00); - const int nr1 = (int)(ne1/ne01); - const int nr2 = (int)(ne2/ne02); - const int nr3 = (int)(ne3/ne03); - - // TODO: support for transposed / permuted tensors - GGML_ASSERT(nb0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - - // TODO: maybe this is not optimal? - for (int i3 = 0; i3 < nr3; i3++) { - for (int k3 = 0; k3 < ne03; k3++) { - for (int i2 = 0; i2 < nr2; i2++) { - for (int k2 = 0; k2 < ne02; k2++) { - for (int i1 = 0; i1 < nr1; i1++) { - for (int k1 = 0; k1 < ne01; k1++) { - for (int i0 = 0; i0 < nr0; i0++) { - ggml_fp16_t * y = (ggml_fp16_t *) ((char *) dst->data + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0); - ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01); - // ggml_vec_cpy_f16(ne00, y, x) - for (int i = 0; i < ne00; ++i) { - y[i] = x[i]; - } - } - } - } - } - } - } - } -} - -static void ggml_compute_forward_repeat( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F16: - case GGML_TYPE_BF16: - case GGML_TYPE_I16: - { - ggml_compute_forward_repeat_f16(params, dst); - } break; - case GGML_TYPE_F32: - case GGML_TYPE_I32: - { - ggml_compute_forward_repeat_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_repeat_back - -static void ggml_compute_forward_repeat_back_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - GGML_ASSERT(ggml_can_repeat(dst, src0)); - - GGML_TENSOR_UNARY_OP_LOCALS - - // guaranteed to be an integer due to the check in ggml_can_repeat - const int nr0 = (int)(ne00/ne0); - const int nr1 = (int)(ne01/ne1); - const int nr2 = (int)(ne02/ne2); - const int nr3 = (int)(ne03/ne3); - - // TODO: support for transposed / permuted tensors - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - if (ggml_is_contiguous(dst)) { - ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); - } else { - for (int k3 = 0; k3 < ne3; k3++) { - for (int k2 = 0; k2 < ne2; k2++) { - for (int k1 = 0; k1 < ne1; k1++) { - ggml_vec_set_f32(ne0, - (float *) ((char *) dst->data + k1*nb1 + k2*nb2 + k3*nb3), - 0); - } - } - } - } - - // TODO: maybe this is not optimal? - for (int i3 = 0; i3 < nr3; i3++) { - for (int k3 = 0; k3 < ne3; k3++) { - for (int i2 = 0; i2 < nr2; i2++) { - for (int k2 = 0; k2 < ne2; k2++) { - for (int i1 = 0; i1 < nr1; i1++) { - for (int k1 = 0; k1 < ne1; k1++) { - for (int i0 = 0; i0 < nr0; i0++) { - ggml_vec_acc_f32(ne0, - (float *) ((char *) dst->data + ( k3)*nb3 + ( k2)*nb2 + ( k1)*nb1), - (float *) ((char *) src0->data + (i3*ne3 + k3)*nb03 + (i2*ne2 + k2)*nb02 + (i1*ne1 + k1)*nb01 + (i0*ne0)*nb00)); - } - } - } - } - } - } - } -} - -static void ggml_compute_forward_repeat_back( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_repeat_back_f32(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_concat - -static void ggml_compute_forward_concat_any( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - const size_t len = ggml_type_size(src0->type); - - const int ith = params->ith; - const int nth = params->nth; - - GGML_TENSOR_BINARY_OP_LOCALS - - const int32_t dim = ggml_get_op_params_i32(dst, 0); - - GGML_ASSERT(dim >= 0 && dim < 4); - - int64_t o[4] = {0, 0, 0, 0}; - o[dim] = src0->ne[dim]; - - const char * x; - - // TODO: smarter multi-theading - for (int i3 = 0; i3 < ne3; i3++) { - for (int i2 = ith; i2 < ne2; i2 += nth) { - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - x = (const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03; - } else { - x = (const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13; - } - - char * y = (char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3; - - memcpy(y, x, len); - } - } - } - } -} - -static void ggml_compute_forward_concat_i8( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_type_size(src0->type) == sizeof(int8_t)); - - const int ith = params->ith; - const int nth = params->nth; - - GGML_TENSOR_BINARY_OP_LOCALS - - const int32_t dim = ggml_get_op_params_i32(dst, 0); - - GGML_ASSERT(dim >= 0 && dim < 4); - - int64_t o[4] = {0, 0, 0, 0}; - o[dim] = src0->ne[dim]; - - const int8_t * x; - - // TODO: smarter multi-theading - for (int i3 = 0; i3 < ne3; i3++) { - for (int i2 = ith; i2 < ne2; i2 += nth) { - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - x = (const int8_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03); - } else { - x = (const int8_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13); - } - - int8_t * y = (int8_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); - - *y = *x; - } - } - } - } -} - -static void ggml_compute_forward_concat_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_type_size(src0->type) == sizeof(ggml_fp16_t)); - - const int ith = params->ith; - const int nth = params->nth; - - GGML_TENSOR_BINARY_OP_LOCALS - - const int32_t dim = ggml_get_op_params_i32(dst, 0); - - GGML_ASSERT(dim >= 0 && dim < 4); - - int64_t o[4] = {0, 0, 0, 0}; - o[dim] = src0->ne[dim]; - - const ggml_fp16_t * x; - - // TODO: smarter multi-theading - for (int i3 = 0; i3 < ne3; i3++) { - for (int i2 = ith; i2 < ne2; i2 += nth) { - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - x = (const ggml_fp16_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03); - } else { - x = (const ggml_fp16_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13); - } - - ggml_fp16_t * y = (ggml_fp16_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); - - *y = *x; - } - } - } - } -} - -static void ggml_compute_forward_concat_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(ggml_type_size(src0->type) == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - GGML_TENSOR_BINARY_OP_LOCALS - - const int32_t dim = ggml_get_op_params_i32(dst, 0); - - GGML_ASSERT(dim >= 0 && dim < 4); - - int64_t o[4] = {0, 0, 0, 0}; - o[dim] = src0->ne[dim]; - - const float * x; - - // TODO: smarter multi-theading - for (int i3 = 0; i3 < ne3; i3++) { - for (int i2 = ith; i2 < ne2; i2 += nth) { - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03); - } else { - x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13); - } - - float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); - - *y = *x; - } - } - } - } -} - -static void ggml_compute_forward_concat( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F16: - case GGML_TYPE_BF16: - case GGML_TYPE_I16: - { - ggml_compute_forward_concat_f16(params, dst); - } break; - case GGML_TYPE_I8: - { - ggml_compute_forward_concat_i8(params, dst); - } break; - case GGML_TYPE_F32: - case GGML_TYPE_I32: - { - ggml_compute_forward_concat_f32(params, dst); - } break; - default: - { - ggml_compute_forward_concat_any(params, dst); - } - } -} - -// ggml_compute_forward_abs - -static void ggml_compute_forward_abs_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_abs_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_abs_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_abs_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_abs( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_abs_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_abs_f16(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_sgn - -static void ggml_compute_forward_sgn_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_sgn_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_sgn_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_sgn_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_sgn( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_sgn_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_sgn_f16(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_neg - -static void ggml_compute_forward_neg_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_neg_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_neg_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_neg_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_neg( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_neg_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_neg_f16(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_step - -static void ggml_compute_forward_step_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_step_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_step_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_step_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_step( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_step_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_step_f16(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_tanh - -static void ggml_compute_forward_tanh_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_tanh_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_tanh_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_tanh_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_tanh( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_tanh_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_tanh_f16(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_elu - -static void ggml_compute_forward_elu_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_elu_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_elu_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_elu_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_elu( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_elu_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_elu_f16(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_relu - -static void ggml_compute_forward_relu_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_relu_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_relu_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_relu_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_relu( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_relu_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_relu_f16(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -// ggml_compute_forward_sigmoid - -static void ggml_compute_forward_sigmoid_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; + ggml_fp16_t * y = (ggml_fp16_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); - for (int i = 0; i < n; i++) { - ggml_vec_sigmoid_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); + *y = *x; + } + } + } } } -static void ggml_compute_forward_sigmoid_f16( +static void ggml_compute_forward_concat_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; - if (params->ith != 0) { - return; - } + GGML_ASSERT(ggml_type_size(src0->type) == sizeof(float)); - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); + const int ith = params->ith; + const int nth = params->nth; - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; + GGML_TENSOR_BINARY_OP_LOCALS - for (int i = 0; i < n; i++) { - ggml_vec_sigmoid_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1]))); + const int32_t dim = ggml_get_op_params_i32(dst, 0); + + GGML_ASSERT(dim >= 0 && dim < 4); + + int64_t o[4] = {0, 0, 0, 0}; + o[dim] = src0->ne[dim]; + + const float * x; + + // TODO: smarter multi-theading + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = ith; i2 < ne2; i2 += nth) { + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { + x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03); + } else { + x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13); + } + + float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + + *y = *x; + } + } + } } } -static void ggml_compute_forward_sigmoid( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { +static void ggml_compute_forward_concat( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; switch (src0->type) { - case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_I16: { - ggml_compute_forward_sigmoid_f32(params, dst); + ggml_compute_forward_concat_f16(params, dst); } break; - case GGML_TYPE_F16: + case GGML_TYPE_I8: + { + ggml_compute_forward_concat_i8(params, dst); + } break; + case GGML_TYPE_F32: + case GGML_TYPE_I32: { - ggml_compute_forward_sigmoid_f16(params, dst); + ggml_compute_forward_concat_f32(params, dst); } break; default: { - GGML_ABORT("fatal error"); + ggml_compute_forward_concat_any(params, dst); } } } @@ -7930,217 +6179,6 @@ static void ggml_compute_forward_silu_back( } } -static void ggml_compute_forward_hardswish_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_hardswish_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_hardswish_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_hardswish_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_hardswish( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_hardswish_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_hardswish_f16(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -static void ggml_compute_forward_hardsigmoid_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_hardsigmoid_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_hardsigmoid_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_hardsigmoid_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_hardsigmoid( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_hardsigmoid_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_hardsigmoid_f16(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - -static void ggml_compute_forward_exp_f32( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_exp_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_exp_f16( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - if (params->ith != 0) { - return; - } - - assert(ggml_is_contiguous_1(src0)); - assert(ggml_is_contiguous_1(dst)); - assert(ggml_are_same_shape(src0, dst)); - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - for (int i = 0; i < n; i++) { - ggml_vec_exp_f16(nc, - (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])), - (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_exp( - const struct ggml_compute_params * params, - struct ggml_tensor * dst) { - - const struct ggml_tensor * src0 = dst->src[0]; - - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_exp_f32(params, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_exp_f16(params, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } -} - - // ggml_compute_forward_norm static void ggml_compute_forward_norm_f32( diff --git a/src/ggml-cpu/unary-ops.cpp b/src/ggml-cpu/unary-ops.cpp new file mode 100644 index 000000000..4fce569b3 --- /dev/null +++ b/src/ggml-cpu/unary-ops.cpp @@ -0,0 +1,186 @@ +#include "unary-ops.h" + +static inline float op_abs(float x) { + return fabsf(x); +} + +static inline float op_sgn(float x) { + return (x > 0.f) ? 1.f : ((x < 0.f) ? -1.f : 0.f); +} + +static inline float op_neg(float x) { + return -x; +} + +static inline float op_step(float x) { + return (x > 0.f) ? 1.f : 0.f; +} + +static inline float op_tanh(float x) { + return tanhf(x); +} + +static inline float op_elu(float x) { + return (x > 0.f) ? x : expm1f(x); +} + +static inline float op_relu(float x) { + return (x > 0.f) ? x : 0.f; +} + +static inline float op_sigmoid(float x) { + return 1.f / (1.f + expf(-x)); +} + +static inline float op_hardsigmoid(float x) { + return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f)); +} + +static inline float op_exp(float x) { + return expf(x); +} + +static inline float op_hardswish(float x) { + return x * fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f)); +} + +static inline float op_sqr(float x) { + return x * x; +} + +static inline float op_sqrt(float x) { + return sqrtf(x); +} + +static inline float op_sin(float x) { + return sinf(x); +} + +static inline float op_cos(float x) { + return cosf(x); +} + +static inline float op_log(float x) { + return logf(x); +} + +template +static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) { + constexpr auto src0_to_f32 = type_conversion_table::to_f32; + constexpr auto f32_to_dst = type_conversion_table::from_f32; + + for (int i = 0; i < n; i++) { + y[i] = f32_to_dst(op(src0_to_f32(x[i]))); + } +} + +template +static void apply_unary_op(const ggml_compute_params * params, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst)); + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT( nb0 == sizeof(dst_t)); + GGML_ASSERT(nb00 == sizeof(src0_t)); + + const auto [ir0, ir1] = get_thread_range(params, src0); + + for (int64_t ir = ir0; ir < ir1; ++ir) { + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + dst_t * dst_ptr = (dst_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + vec_unary_op(ne0, dst_ptr, src0_ptr); + } +} + +// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates +template +static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + + /* */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32 + apply_unary_op(params, dst); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16 + apply_unary_op(params, dst); + } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16 + apply_unary_op(params, dst); + } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) { + apply_unary_op(params, dst); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { + apply_unary_op(params, dst); + } else { + fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__, + ggml_type_name(dst->type), ggml_type_name(src0->type)); + GGML_ABORT("fatal error"); + } +} + +void ggml_compute_forward_abs(const ggml_compute_params * params, ggml_tensor * dst) { + unary_op(params, dst); +} + +void ggml_compute_forward_sgn(const ggml_compute_params * params, ggml_tensor * dst) { + unary_op(params, dst); +} + +void ggml_compute_forward_neg(const ggml_compute_params * params, ggml_tensor * dst) { + unary_op(params, dst); +} + +void ggml_compute_forward_step(const ggml_compute_params * params, ggml_tensor * dst) { + unary_op(params, dst); +} + +void ggml_compute_forward_tanh(const ggml_compute_params * params, ggml_tensor * dst) { + unary_op(params, dst); +} + +void ggml_compute_forward_elu(const ggml_compute_params * params, ggml_tensor * dst) { + unary_op(params, dst); +} + +void ggml_compute_forward_relu(const ggml_compute_params * params, ggml_tensor * dst) { + unary_op(params, dst); +} + +void ggml_compute_forward_sigmoid(const ggml_compute_params * params, ggml_tensor * dst) { + unary_op(params, dst); +} + +void ggml_compute_forward_hardsigmoid(const ggml_compute_params * params, ggml_tensor * dst) { + unary_op(params, dst); +} + +void ggml_compute_forward_exp(const ggml_compute_params * params, ggml_tensor * dst) { + unary_op(params, dst); +} + +void ggml_compute_forward_hardswish(const ggml_compute_params * params, ggml_tensor * dst) { + unary_op(params, dst); +} + +void ggml_compute_forward_sqr(const ggml_compute_params * params, ggml_tensor * dst) { + unary_op(params, dst); +} + +void ggml_compute_forward_sqrt(const ggml_compute_params * params, ggml_tensor * dst) { + unary_op(params, dst); +} + +void ggml_compute_forward_sin(const ggml_compute_params * params, ggml_tensor * dst) { + unary_op(params, dst); +} + +void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor * dst) { + unary_op(params, dst); +} + +void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor * dst) { + unary_op(params, dst); +} diff --git a/src/ggml-cpu/unary-ops.h b/src/ggml-cpu/unary-ops.h new file mode 100644 index 000000000..b1ade2c8e --- /dev/null +++ b/src/ggml-cpu/unary-ops.h @@ -0,0 +1,28 @@ +#pragma once + +#include "common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void ggml_compute_forward_abs(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_sgn(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_neg(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_step(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_tanh(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_elu(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_sigmoid(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_hardsigmoid(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_exp(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_hardswish(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_sqr(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst); + +#ifdef __cplusplus +} +#endif