Commit 66abeaf, squashed history (all commits authored by Carl Pearson <[email protected]>):

66abeaf  Mon Apr 6 13:03:49 2020 -0500   .
7cc7288  Mon Apr 6 11:29:04 2020 -0500   .
547d794  Mon Apr 6 11:04:08 2020 -0500   .
15db5cf  Mon Apr 6 07:46:41 2020 -0500   .
83e4e4f  Mon Apr 6 07:39:01 2020 -0500   cuda 10.1
9bf48c6  Fri Apr 3 08:12:21 2020 -0500   .
48d957e  Fri Apr 3 07:54:28 2020 -0500   remove install files
efc8283  Thu Apr 2 13:40:04 2020 -0500   Merge branch 'sgemm' of github.com:cwpearson/nvidia-performance-tools into sgemm (Merge: cc817f8 dfbe32f)
cc817f8  Thu Apr 2 13:39:58 2020 -0500   sgemm working
d2e5b26  Thu Apr 2 13:29:11 2020 -0500   cpu test
dfbe32f  Thu Apr 2 07:58:12 2020 -0500   add rai_build
468757f  Thu Apr 2 07:40:29 2020 -0500   .
cc2ace8  Wed Apr 1 17:44:48 2020 -0500   .
b205b6a  Wed Apr 1 17:44:27 2020 -0500   .
141a794  Wed Apr 1 17:31:04 2020 -0500   .
7c6caff  Wed Apr 1 17:25:30 2020 -0500   .
0b4dd5d  Wed Apr 1 17:24:02 2020 -0500   .
8bdf7ce  Wed Apr 1 17:19:41 2020 -0500   add sgemm to travis
108561d  Wed Apr 1 17:12:42 2020 -0500   travis
ac7fe77  Wed Apr 1 17:08:18 2020 -0500   switch row/col major
e5789b3  Wed Apr 1 17:00:45 2020 -0500   all sgemms compile
5c782a6  Wed Apr 1 16:39:33 2020 -0500   steal tiled code from webgpu
34775a4  Wed Apr 1 15:20:18 2020 -0500   add basic sgemm impl, update argparse
05a321c  Wed Apr 1 10:16:56 2020 -0500   basic sgemm kernel and copy regtiled from 508
Showing 18 changed files with 1,000 additions and 34 deletions.
New ignore rules (likely .gitignore):

@@ -0,0 +1,3 @@
sgemm/build
*.deb
*.run
Updated Travis CI build script (Docker images are now built from the CUDA 10.1 dockerfiles, and an sgemm CMake build step is added):

@@ -1,29 +1,38 @@
 set -x
 set -e
 
-cd docker
-ls -halt
+source ci/env.sh
 
-echo $DOCKER_PASS | docker login -u $DOCKER_USER --password-stdin
+if [[ $BUILD_DOCKER == "1" ]]; then
+  cd $TRAVIS_BUILD_DIR
 
-TRAVIS_COMMIT=${TRAVIS_COMMIT:0:7}
-DOCKER_REPO=nvidia-performance-tools
-DOCKER_SLUG=$DOCKER_USER/$DOCKER_REPO
-DOCKER_TAG=$TRAVIS_CPU_ARCH-10.2-$TRAVIS_BRANCH-$TRAVIS_COMMIT
+  echo $DOCKER_PASS | docker login -u $DOCKER_USER --password-stdin
 
-docker build -f $TRAVIS_CPU_ARCH.dockerfile -t $DOCKER_SLUG:$DOCKER_TAG .
-docker push $DOCKER_SLUG:$DOCKER_TAG
+  TRAVIS_COMMIT=${TRAVIS_COMMIT:0:7}
+  DOCKER_REPO=nvidia-performance-tools
+  DOCKER_SLUG=$DOCKER_USER/$DOCKER_REPO
+  DOCKER_TAG=${TRAVIS_CPU_ARCH}-10.1-$TRAVIS_BRANCH-$TRAVIS_COMMIT
 
-if [[ $TRAVIS_BRANCH == master ]]; then
-  docker tag $DOCKER_SLUG:$DOCKER_TAG $DOCKER_SLUG:latest-$TRAVIS_CPU_ARCH
-  docker push $DOCKER_SLUG:latest-$TRAVIS_CPU_ARCH
-else
-  docker tag $DOCKER_SLUG:$DOCKER_TAG $DOCKER_SLUG:$TRAVIS_BRANCH-$TRAVIS_CPU_ARCH
-  docker push $DOCKER_SLUG:$TRAVIS_BRANCH-$TRAVIS_CPU_ARCH
+  docker build -f ${TRAVIS_CPU_ARCH}_10-1.dockerfile -t $DOCKER_SLUG:$DOCKER_TAG .
+  docker push $DOCKER_SLUG:$DOCKER_TAG
+
+  if [[ $TRAVIS_BRANCH == master ]]; then
+    docker tag $DOCKER_SLUG:$DOCKER_TAG $DOCKER_SLUG:latest-${TRAVIS_CPU_ARCH}
+    docker push $DOCKER_SLUG:latest-${TRAVIS_CPU_ARCH}
+  else
+    docker tag $DOCKER_SLUG:$DOCKER_TAG $DOCKER_SLUG:$TRAVIS_BRANCH-$TRAVIS_CPU_ARCH
+    docker push $DOCKER_SLUG:$TRAVIS_BRANCH-${TRAVIS_CPU_ARCH}
+  fi
 fi
 
 # remove the login key from the image
 rm -fv $HOME/.docker/config.json
+
+if [[ $BUILD_TYPE != '' ]]; then
+  cd $TRAVIS_BUILD_DIR
+  cd sgemm
+  mkdir -p build
+  cd build
+  cmake .. -DCMAKE_BUILD_TYPE=$BUILD_TYPE
+  make VERBOSE=1
+fi
New CI environment script (ci/env.sh, sourced by the build and install scripts):

@@ -0,0 +1,6 @@
CMAKE_PREFIX=$HOME/cmake

export PATH=/usr/local/cuda/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

export PATH=$CMAKE_PREFIX/bin:$PATH
Updated Travis CI install script (Nsight installers are fetched only for Docker builds; CMake 3.8.2 and the CUDA 10.2 toolkit are installed when building the code directly):

@@ -1,13 +1,38 @@
 set -x
 set -e
 
 sudo apt-get update
 sudo apt-get install -q -y wget
 
-if [[ $TRAVIS_CPU_ARCH == ppc64le ]]; then
-  wget -qSL https://uofi.box.com/shared/static/vfxflckdjixxkc524qltme4sx8kt3w9d.deb -O docker/NVIDIA_Nsight_Systems_Power_CLI_Only_2020.2.1.71.deb;
-  wget -qSL https://uofi.box.com/shared/static/swjp2bjr7xj153vzw8mvutv2tqomypxu.run -O docker/nsight-compute-PPC64LE-2019.5.0.14-27346997.run;
-elif [[ $TRAVIS_CPU_ARCH == amd64 ]]; then
-  wget -qSL https://uofi.box.com/shared/static/zjsv2rayiotyrdix6a6yd3w8cre56lo0.deb -O docker/NVIDIA_Nsight_Systems_Linux_2020.2.1.71.deb;
-  wget -qSL https://uofi.box.com/shared/static/4fuf3wws1uplhf29ndcq4s91kl3jyl7z.run -O docker/nsight-compute-linux-2019.5.0.14-27346997.run;
+source ci/env.sh
+
+# deps for building docker images
+if [[ $BUILD_DOCKER == "1" ]]; then
+  cd $TRAVIS_BUILD_DIR
+
+  if [[ $TRAVIS_CPU_ARCH == ppc64le ]]; then
+    wget -qSL https://uofi.box.com/shared/static/vfxflckdjixxkc524qltme4sx8kt3w9d.deb -O NVIDIA_Nsight_Systems_Power_CLI_Only_2020.2.1.71.deb;
+    wget -qSL https://uofi.box.com/shared/static/swjp2bjr7xj153vzw8mvutv2tqomypxu.run -O nsight-compute-PPC64LE-2019.5.0.14-27346997.run;
+  elif [[ $TRAVIS_CPU_ARCH == amd64 ]]; then
+    wget -qSL https://uofi.box.com/shared/static/zjsv2rayiotyrdix6a6yd3w8cre56lo0.deb -O NVIDIA_Nsight_Systems_Linux_2020.2.1.71.deb;
+    wget -qSL https://uofi.box.com/shared/static/4fuf3wws1uplhf29ndcq4s91kl3jyl7z.run -O nsight-compute-linux-2019.5.0.14-27346997.run;
+  fi
 fi
+
+# deps for building code
+if [[ $BUILD_TYPE != '' ]]; then
+  cd $HOME
+
+  ## install CMake
+  wget -qSL https://github.com/Kitware/CMake/releases/download/v3.8.2/cmake-3.8.2-Linux-x86_64.tar.gz -O cmake.tar.gz
+  mkdir -p $CMAKE_PREFIX
+  tar -xf cmake.tar.gz --strip-components=1 -C $CMAKE_PREFIX
+  rm cmake.tar.gz
+
+  ## install CUDA
+  sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub
+  CUDA102="http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-repo-ubuntu1804_10.2.89-1_amd64.deb"
+  wget -SL $CUDA102 -O cuda.deb
+  sudo dpkg -i cuda.deb
+  sudo apt-get update
+  sudo apt-get install -y --no-install-recommends \
+    cuda-toolkit-10-2
+  rm cuda.deb
+fi
New CMake build configuration for the sgemm examples:

@@ -0,0 +1,22 @@
project(sgemm LANGUAGES CXX CUDA)

# 3.8+ for CUDA
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)

if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release")
  message(STATUS "Setting CMAKE_BUILD_TYPE=Release")
endif()

set(CMAKE_CUDA_STANDARD 11)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)

include_directories(PUBLIC SYSTEM include)

# Add line info to binaries to help with profiling
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")

add_executable(sgemm-cpu cpu.cpp)
add_executable(sgemm-basic basic.cu)
add_executable(sgemm-tiled tiled.cu)
add_executable(sgemm-regtiled-coarsened regtiled_coarsened.cu)
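The include directory referenced above presumably vendors the argparse header and common.hpp that the sgemm sources use; neither appears in this extraction. A hypothetical, minimal stand-in for argparse::Parser, inferred only from the calls made in basic.cu below (the real vendored header may differ in names and behavior):

// Hypothetical sketch of the argparse::Parser interface basic.cu relies on.
// Only the members called there are shown; everything here is an assumption.
#include <cstdlib>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

namespace argparse {
class Parser {
  std::vector<int *> positionals_;                     // bound in call order
  std::vector<std::pair<std::string, int *>> options_; // e.g. "--iters"
public:
  void add_positional(int &v) { positionals_.push_back(&v); }
  void add_option(int &v, const std::string &flag) { options_.push_back({flag, &v}); }

  bool parse(int argc, char **argv) {
    size_t pos = 0;
    for (int i = 1; i < argc; ++i) {
      std::string arg = argv[i];
      bool matched = false;
      for (auto &opt : options_) {
        if (arg == opt.first && i + 1 < argc) { // flag followed by its value
          *opt.second = std::atoi(argv[++i]);
          matched = true;
          break;
        }
      }
      if (matched) continue;
      if (pos < positionals_.size()) {
        *positionals_[pos++] = std::atoi(arg.c_str());
      } else {
        return false; // unexpected argument
      }
    }
    return true;
  }

  void help() const { std::cerr << "usage: prog [m n k] [--iters N] [--warmup N]\n"; }
};
} // namespace argparse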
New CUDA source for the basic SGEMM implementation (basic.cu, built as sgemm-basic in the CMakeLists above):

@@ -0,0 +1,143 @@
#include <algorithm>

#include <argparse/argparse.hpp>

#include "common.hpp"

/* NOTE: A and C are column major, B is row major
 */
__global__ void mygemm(float *c,       //<! [out] an MxN matrix
                       const float *a, //<! [in] an MxK matrix
                       const float *b, //<! [in] a KxN matrix
                       const int M, const int N, const int K) {

#define A(_i, _j) a[(_i) + (_j)*M]
#define B(_i, _j) b[(_i)*N + (_j)]
#define C(_i, _j) c[(_i) + (_j)*M]

  int gidx = blockDim.x * blockIdx.x + threadIdx.x;
  int gidy = blockDim.y * blockIdx.y + threadIdx.y;

  for (int i = gidy; i < M; i += gridDim.y * blockDim.y) {
    for (int j = gidx; j < N; j += gridDim.x * blockDim.x) {
      float acc = 0;
      for (int k = 0; k < K; ++k) {
        acc += A(i, k) * B(k, j);
      }
      C(i, j) = acc;
    }
  }

#undef A
#undef B
#undef C
}

int main(int argc, char **argv) {

  argparse::Parser parser;

  // default matrix sizes:
  // A: 307 x 313
  // B: 313 x 311
  // C: 307 x 311
  int m = 307;
  int n = 311;
  int k = 313;

  int nIters = 5;
  int nWarmup = 5;
  parser.add_positional(m);
  parser.add_positional(n);
  parser.add_positional(k);
  parser.add_option(nIters, "--iters");
  parser.add_option(nWarmup, "--warmup");

  if (!parser.parse(argc, argv)) {
    parser.help();
    exit(EXIT_FAILURE);
  }

  const int64_t flop = m * n * k * 2;

  // initialize host data
  std::vector<float> aHost(m * k), bHost(k * n), cHost(m * n), cExpected(m * n);
  std::generate(aHost.begin(), aHost.end(), random_int);
  std::generate(bHost.begin(), bHost.end(), random_int);

  // allocate device data
  float *aDev, *bDev, *cDev;
  CUDA_RUNTIME(cudaMalloc(&aDev, aHost.size() * sizeof(float)));
  CUDA_RUNTIME(cudaMalloc(&bDev, bHost.size() * sizeof(float)));
  CUDA_RUNTIME(cudaMalloc(&cDev, cHost.size() * sizeof(float)));

  // copy data to device
  CUDA_RUNTIME(cudaMemcpy(aDev, aHost.data(), aHost.size() * sizeof(float),
                          cudaMemcpyDefault));
  CUDA_RUNTIME(cudaMemcpy(bDev, bHost.data(), bHost.size() * sizeof(float),
                          cudaMemcpyDefault));

  // create events to time GPU kernel
  cudaEvent_t start, stop;
  CUDA_RUNTIME(cudaEventCreate(&start));
  CUDA_RUNTIME(cudaEventCreate(&stop));

  // GPU kernel launch parameters
  dim3 dimBlock(32, 8);
  dim3 dimGrid;
  dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
  dimGrid.y = (m + dimBlock.y - 1) / dimBlock.y;

  // total elapsed time
  float elapsed = 0;

  /* Launch the kernel nIters + nWarmup times
     Check for correctness on the first time.
     Record the time after nWarmup runs complete.
  */
  for (int i = 0; i < nIters + nWarmup; ++i) {
    CUDA_RUNTIME(cudaEventRecord(start));
    mygemm<<<dimGrid, dimBlock>>>(cDev, aDev, bDev, m, n, k);
    CUDA_RUNTIME(cudaEventRecord(stop));
    CUDA_RUNTIME(cudaEventSynchronize(stop));

    // check result once
    if (i == 0) {
      // copy result to host
      CUDA_RUNTIME(cudaMemcpy(cHost.data(), cDev, cHost.size() * sizeof(float),
                              cudaMemcpyDefault));

      // check result on host
      cpu_gemm(cExpected.data(), aHost.data(), bHost.data(), m, n, k);

      for (size_t i = 0; i < cExpected.size(); ++i) {
        if (!equal(cExpected[i], cHost[i], 1e-6)) {
          std::cerr << "Error!\n";
          exit(EXIT_FAILURE);
        }
      }
    }

    float millis;
    CUDA_RUNTIME(cudaEventElapsedTime(&millis, start, stop));
    std::cerr << i << ": " << millis << (i >= nWarmup ? " *" : " ") << "\n";

    // record time after warmup runs
    if (i >= nWarmup) {
      elapsed += millis;
    }
  }

  // print results
  double gflops = flop / ((elapsed / nIters) / 1000) / 1e9;
  std::cerr << gflops << " GFLOPS (" << flop << " flop, "
            << (elapsed / nIters) / 1000 << "s)\n";

  // release resources
  CUDA_RUNTIME(cudaEventDestroy(start));
  CUDA_RUNTIME(cudaEventDestroy(stop));
  CUDA_RUNTIME(cudaFree(aDev));
  CUDA_RUNTIME(cudaFree(bDev));
  CUDA_RUNTIME(cudaFree(cDev));
  return 0;
}
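basic.cu calls CUDA_RUNTIME, random_int, equal, and cpu_gemm from "common.hpp", which is not captured in this extraction (it presumably also provides the <vector>, <iostream>, and <cstdint> includes the file relies on). A minimal sketch of what those helpers might look like, inferred from the call sites and from the kernel's column-major A/C, row-major B convention; none of it is taken from the repository:

// Hypothetical reconstruction of the helpers basic.cu expects from common.hpp.
// Only the call sites above are known; everything here is an assumption.
#pragma once

#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>

// Abort with a message if a CUDA runtime call fails.
#define CUDA_RUNTIME(expr)                                                     \
  do {                                                                         \
    cudaError_t err = (expr);                                                  \
    if (err != cudaSuccess) {                                                  \
      std::cerr << __FILE__ << ":" << __LINE__ << " "                          \
                << cudaGetErrorString(err) << "\n";                            \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  } while (0)

// Small integer-valued floats keep the accumulation exact, so the 1e-6
// absolute tolerance used in basic.cu is a meaningful check.
inline float random_int() { return static_cast<float>(rand() % 10); }

// Approximate equality with an absolute tolerance.
inline bool equal(float a, float b, float eps) { return std::fabs(a - b) < eps; }

// CPU reference GEMM: C (MxN) = A (MxK) * B (KxN),
// with A and C column-major and B row-major, matching the GPU kernel.
inline void cpu_gemm(float *c, const float *a, const float *b, int M, int N,
                     int K) {
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float acc = 0;
      for (int k = 0; k < K; ++k) {
        acc += a[i + k * M] * b[k * N + j];
      }
      c[i + j * M] = acc;
    }
  }
}

Given the argparse calls, the benchmark would presumably be invoked as something like ./sgemm-basic 307 311 313 --iters 5 --warmup 5, with the defaults used when arguments are omitted.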