
Commit

optimize performance
zhaoguochun1995 committed Aug 17, 2023
1 parent 6618445 commit 336a2ef
Showing 14 changed files with 191 additions and 115 deletions.
101 changes: 35 additions & 66 deletions scripts/autogen_diopi_wrapper/diopi_functions.yaml
@@ -408,14 +408,16 @@

- schema: "cross_entropy_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, float label_smoothing=0.0) -> Tensor"
register_op: False
#register_op: True
custom_code_at_the_beginning: |
const auto reductionDiopi = static_cast<::diopiReduction_t>(reduction);
at::Tensor out = at::empty_like(self);
interface: diopiCrossEntropyLossBackward(ctx, out, grad_output, self, target, weight, reductionDiopi, ignore_index.expect_int(), label_smoothing)


- schema: "cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, float label_smoothing=0.0) -> Tensor"
autograd: True
register_op: False
#autograd: True
custom_code_at_the_beginning: |
const int64_t ignore_index_int = ignore_index.expect_int();
const auto reductionDiopi = static_cast<::diopiReduction_t>(reduction);
@@ -443,6 +445,8 @@
return outputs;
- schema: "convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor"
custom_fallback: True
autocompare: False
size_attr: [stride, padding, dilation]
custom_code_at_the_beginning: |
int64_t batch_size = input.size(0);
@@ -457,6 +461,8 @@
interface: diopiConvolution2d(&context, out, input, weight, bias, stride, padding, dilation, groups)

- schema: "convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)"
custom_fallback: True
autocompare: False
size_attr: [stride, padding, dilation, bias_sizes]
device: [camb]
custom_code_at_the_beginning: |
@@ -485,6 +491,8 @@
interface: diopiConvolution2dBackward(ctx, grad_input, grad_weight, grad_bias, grad_output, input, weight, bias_sizes_ptr, stride, padding, dilation, transposed, output_paddingDiopiSize, groups);

- schema: "convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)"
custom_fallback: True
autocompare: False
size_attr: [stride, padding, dilation, bias_sizes]
device: [-camb, all]
custom_code_at_the_beginning: |
@@ -566,61 +574,30 @@
at::Tensor(), at::Tensor(), at::Tensor(), at::Tensor(), at::Tensor()};
return outputs;
- schema: "dropout_impl(Tensor input, float p, bool train, *, Tensor(a!) mask) -> Tensor"
custom_code_at_the_beginning: |
at::Tensor out = at::empty_like(input);
register_op: False
interface: diopiDropout(ctx, out, mask, input, p, train)

- schema: "dropout(Tensor input, float p, bool train) -> Tensor"
autocompare: disable
register_op: False
outs: [mask]
custom_code_at_the_beginning: |
auto mask = at::empty(input.sizes(), input.options().dtype(at::kByte));
at::Tensor out = at::empty_like(input);
at::Tensor mask;
if (input.requires_grad()) {
mask = at::empty(input.sizes(), input.options().dtype(at::kByte));
}
interface: diopiDropout(ctx, out, mask, input, p, train)
outs: [mask]
autograd: True
saved_data: [p, mask]
forward_schema: "dropout_impl(Tensor input, float p, bool train, *, Tensor(a!) mask) -> Tensor"
forward_process_code: |
auto mask = at::empty(input.sizes(), input.options().dtype(at::kByte));
at::Tensor out = at::empty_like(input);
cal_grad_code: |
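// Inverted dropout backward: grad_input = grad_output * mask / (1 - p), guarding against p == 1.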
auto p = p_.toDouble();
double p1m = 1. - p;
double scale = p1m == 0 ? 0. : 1. / p1m;
auto mask = mask_.toTensor();
at::Tensor out = grad_outputs[0] * mask * scale;
backward_return_code: |
std::vector<at::Tensor> outputs(6);
outputs[0] = out;
return outputs;
- schema: "dropout__impl(Tensor(a!) self, Tensor(b!) mask, float p, bool train) -> Tensor(a!)"
register_op: False
interface: diopiDropoutInp(ctx, self, mask, p, train)

- schema: "dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)"
custom_code_at_the_beginning: |
auto mask = at::empty(self.sizes(), self.options().dtype(at::kByte));
register_op: False
outs: [mask]
custom_code_at_the_beginning: |
at::Tensor mask;
if (self.requires_grad()) {
mask = at::empty(self.sizes(), self.options().dtype(at::kByte));
}
interface: diopiDropoutInp(ctx, self, mask, p, train)
autograd: True
forward_process_code: |
auto mask = at::empty(self.sizes(), self.options().dtype(at::kByte));
saved_data: [p, mask]
forward_schema: "dropout__impl(Tensor(a!) self, Tensor(b!) mask, float p, bool train) -> Tensor(a!)"
cal_grad_code: |
auto p = p_.toDouble();
double p1m = 1. - p;
double scale = p1m == 0 ? 0. : 1. / p1m;
auto mask = mask_.toTensor();
at::Tensor out = grad_outputs[0] * mask * scale;
backward_return_code: |
std::vector<at::Tensor> outputs(6);
outputs[0] = out;
return outputs;
wrappter_custom_return: return self;

- schema: "bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!)"
autocompare: disable
interface: diopiBernoulliScalar(ctx, self, p, 0);

- schema: "log.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"
interface: diopiLog(ctx, out, self)
@@ -702,35 +679,27 @@


- schema: "linear_backward(Tensor input, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)"
autocompare: False
custom_code_at_the_beginning: |
auto grad_input = at::empty(input.sizes(), input.options());
auto grad_weight = at::empty(weight.sizes(), weight.options().dtype(at::kFloat));
at::Tensor grad_bias;
bool bias_has_value = output_mask[2];
if (bias_has_value) {
at::Tensor grad_input, grad_weight, grad_bias;
if (output_mask[0]) {
grad_input = at::empty(input.sizes(), grad_output.options());
}
if (output_mask[1]) {
grad_weight = at::empty(weight.sizes(), grad_output.options());
}
if (output_mask[2]) {
grad_bias = at::empty({grad_output.size(-1)}, grad_output.options());
}
interface: diopiLinearBackward(ctx, grad_input, grad_weight, grad_bias, grad_output, input, weight)

- schema: "linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor"
autograd: True
custom_code_at_the_beginning: |
std::vector<int64_t> output_size(input.sizes().begin(), input.sizes().end());
output_size.back() = weight.sizes()[0];
auto out = at::empty(output_size, input.options());
interface: diopiLinear(ctx, out, input, weight, bias)
forward_process_code: |
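// output_mask records which of (input, weight, bias) require grad; it is saved via saved_data and forwarded to linear_backward.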
bool bias_has_value = (bias.has_value() == true) ? bias.value().requires_grad() : false;
std::array<bool, 3> output_mask{input.requires_grad(), weight.requires_grad(), bias_has_value};
saved_data: [output_mask, input, weight]
cal_grad_code: |
auto output_mask = output_mask_.to<std::array<bool, 3>>();
auto input = input_.toTensor();
auto weight = weight_.toTensor();
auto grad_output = grad_outputs[0];
backward_schema: "linear_backward(Tensor input, Tensor grad_output, Tensor weight, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)"
backward_return_code: |
return {std::get<0>(result), std::get<1>(result), std::get<2>(result)};

- schema: "_log_softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) out) -> Tensor(a!)"
interface: diopiLogSoftmaxBackward(ctx, out, grad_output, output, dim)

1 change: 1 addition & 0 deletions scripts/ci/nv/ci_nv_env.sh
@@ -40,6 +40,7 @@ export VENDOR_INCLUDE_DIRS=${CUDA_PATH}/include
#export DIPU_FORCE_FALLBACK_OPS_LIST=_index_put_impl_,index.Tensor_out
#export DIPU_DUMP_OP_ARGS=2
#export DIPU_DEBUG_ALLOCATOR=15
export DIPU_CUDA_EVENT_TIMING=1
export DIPU_DEVICE_MEMCACHING_ALGORITHM=BF
export DIPU_HOST_MEMCACHING_ALGORITHM=BF
export DIPU_PATCH_CUDA_CACHED_ALLOCATOR=1
67 changes: 67 additions & 0 deletions tests/test_ops/archived/op_benchmark.py
@@ -0,0 +1,67 @@
import torch
import torch.utils.benchmark as benchmark
import torch_dipu
from itertools import product

x = torch.randn(10000, 64).cuda()
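# With torch_dipu imported, .cuda() above is expected to route the tensor to the DIPU-managed
# device rather than a literal CUDA device (an assumption about the default device mapping).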

def batched_dot_mul_sum(a, b):
    '''Computes batched dot by multiplying and summing'''
    return a.mul(b).sum(-1)

t0 = benchmark.Timer(
    stmt='batched_dot_mul_sum(x, x)',
    setup='from __main__ import batched_dot_mul_sum',
    globals={'x': x})
r0 = t0.timeit(100)
print(r0)
assert r0.mean < 8.8e-5
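# Timer.timeit reports the mean per-invocation time in seconds; the thresholds here
# act as rough, hardware-dependent regression guards.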


def batched_dot_bmm(a, b):
    '''Computes batched dot by reducing to ``bmm``'''
    a = a.reshape(-1, 1, a.shape[-1])
    b = b.reshape(-1, b.shape[-1], 1)
    return torch.bmm(a, b).flatten(-3)

t1 = benchmark.Timer(
    stmt='batched_dot_bmm(x, x)',
    setup='from __main__ import batched_dot_bmm',
    globals={'x': x})

r1 = t1.timeit(100)
print(r1)
assert r1.mean < 8.5e-5


# Compare takes a list of measurements which we'll save in results.
results = []
sizes = [1, 64, 32, 120]
for b, n in product(sizes, sizes):
    # label and sub_label are the rows
    # description is the column
    label = 'Batched dot'
    sub_label = f'[{b}, {n}]'
    x = torch.ones((b, n))
    for num_threads in [1, 4, 16, 32]:
        results.append(benchmark.Timer(
            stmt='batched_dot_mul_sum(x, x)',
            setup='from __main__ import batched_dot_mul_sum',
            globals={'x': x},
            num_threads=num_threads,
            label=label,
            sub_label=sub_label,
            description='mul/sum',
        ).blocked_autorange(min_run_time=1))
        results.append(benchmark.Timer(
            stmt='batched_dot_bmm(x, x)',
            setup='from __main__ import batched_dot_bmm',
            globals={'x': x},
            num_threads=num_threads,
            label=label,
            sub_label=sub_label,
            description='bmm',
        ).blocked_autorange(min_run_time=1))
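# benchmark.Compare groups the collected Measurements into a table:
# rows keyed by (label, sub_label, num_threads), columns by description.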

compare = benchmark.Compare(results)
compare.print()
2 changes: 1 addition & 1 deletion third_party/DIOPI
Submodule DIOPI updated 1 file
+142 −54 impl/torch/functions.cpp
2 changes: 1 addition & 1 deletion torch_dipu/csrc_dipu/aten/RegisterDIPU.cpp
@@ -255,7 +255,7 @@ namespace {

bool wrapper_DIPU_is_pinned(const at::Tensor& self, c10::optional<at::Device> device) {
dipu::profile::RecordBlockCreator dipu_recorder(__FUNCTION__);
const OptionalDeviceGuard device_guard(device_of(self));
// const OptionalDeviceGuard device_guard(device_of(self));
return dnative::is_pinned(self, device);
}

8 changes: 8 additions & 0 deletions torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctions.hpp
@@ -99,6 +99,14 @@ static ::std::tuple<at::Tensor&, at::Tensor&, at::Tensor&> custom_fallback_dipu_
return std::tie(out, save_mean, save_invstd);
}

static at::Tensor custom_fallback_dipu_convolution_overrideable(const at::Tensor& input, const at::Tensor& weight, const c10::optional<at::Tensor>& bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups) {
return at::convolution(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups);
}

std::tuple<at::Tensor, at::Tensor, at::Tensor> custom_fallback_dipu_convolution_backward_overrideable(const at::Tensor& grad_output, const at::Tensor& input, const at::Tensor& weight, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool, 3> output_mask) {
return at::convolution_backward(grad_output, input, weight, c10::nullopt, stride, padding, dilation, transposed, output_padding, groups, output_mask);
}

static std::tuple<at::Tensor, at::Tensor, at::Tensor> custom_fallback_dipu_native_batch_norm(const at::Tensor& input, const c10::optional<at::Tensor>& weight_opt,
const c10::optional<at::Tensor>& bias_opt, const c10::optional<at::Tensor>& running_mean_opt,
const c10::optional<at::Tensor>& running_var_opt, bool training, double momentum, double eps) {
2 changes: 1 addition & 1 deletion torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp
@@ -40,11 +40,11 @@ class EventPool final {
}

void get(T& event) {
std::lock_guard<mutex_t> _(event_mutex_);
if (event_pool_.empty()) {
allocator_(event);
allocate_num_++;
} else {
std::lock_guard<mutex_t> _(event_mutex_);
event = event_pool_.back();
event_pool_.pop_back();
}
@@ -5,6 +5,7 @@
#include <list>
#include <tuple>
#include <deque>
#include "DIPUSpinMutex.h"
#include "../DIPUEvent.h"

namespace dipu {
@@ -27,6 +28,7 @@ template<class T, int algorithm>
class AsyncResourcePoolImpl<T, at::DeviceType::CPU, algorithm>: public AsyncResourcePool<T>{
std::list<T> list_;
using mutex_t = std::recursive_mutex;
//using mutex_t = dipu::SpinMutex;
mutex_t mutex_;
public:
void add(const T& t, std::deque<DIPUEvent>& events) override {
@@ -57,6 +59,7 @@ class AsyncResourcePoolImpl<T, dipu::DIPU_DEVICE_TYPE, algorithm> : public Async
using Res = std::tuple<T, std::deque<DIPUEvent>>;
std::list<Res> list_;
using mutex_t = std::recursive_mutex;
//using mutex_t = dipu::SpinMutex;
mutex_t mutex_;
public:
void add(const T& t, std::deque<DIPUEvent>& events) override {
@@ -65,14 +68,16 @@
}

T get() override {
std::lock_guard<mutex_t> lk(mutex_);
T t = std::get<0>(list_.front());
list_.pop_front();
{
std::lock_guard<mutex_t> lk(mutex_);
list_.pop_front();
}
return t;
}

bool ready() override {
std::lock_guard<mutex_t> lk(mutex_);
//std::lock_guard<mutex_t> lk(mutex_);
if (list_.empty()) {
return false;
}
@@ -1,47 +1,16 @@
// Copyright (c) 2023, DeepLink.

#include "DIPUCachingAllocator.h"
#include "DIPUSpinMutex.h"
#include <queue>
#include <vector>
#include <stack>
#include <atomic>
#include <thread>
#include <map>
#include <functional>

namespace dipu {

/// Simple spin-lock to help build thread-safe functions.
class SpinMutex {
private:
    std::atomic<bool> excl_ { false };

public:
    constexpr SpinMutex() noexcept = default;

    SpinMutex(const SpinMutex&) = delete;

    void delay() const noexcept {
        std::this_thread::yield();
    }

    void lock() {
        for (bool exp = false;
             !excl_.compare_exchange_weak(exp, true, std::memory_order_acq_rel);
             exp = false) delay();
    }

    bool try_lock() {
        bool exp = false;
        return excl_.compare_exchange_weak(exp, true, std::memory_order_acq_rel);
    }

    void unlock() {
        excl_.store(false, std::memory_order_release);
    }
};

class BFCachingAllocatorImpl{
public:
using allocate_fn_t = std::function<void*(size_t)>;
@@ -166,6 +135,7 @@ class BFCachingAllocatorImpl{
std::vector<StreamSetHandle> streamSets_;

using mutex_t = SpinMutex;
//using mutex_t = std::recursive_mutex;
mutable mutex_t mut_;

static size_t roundBytes(size_t nbytes) {
@@ -148,7 +148,7 @@ class BSCachingAllocator: public CacheAllocator {
}

allocator_->async_mem_pool()->add(std::make_tuple(ptr(), size()), events);
allocator_->flush_mem_pool();
//allocator_->flush_mem_pool();
}
}
};