Skip to content

Commit

Permalink
Squashed commit of the following:
Browse files Browse the repository at this point in the history
commit 66abeaf
Author: Carl Pearson <[email protected]>
Date:   Mon Apr 6 13:03:49 2020 -0500

    .

commit 7cc7288
Author: Carl Pearson <[email protected]>
Date:   Mon Apr 6 11:29:04 2020 -0500

    .

commit 547d794
Author: Carl Pearson <[email protected]>
Date:   Mon Apr 6 11:04:08 2020 -0500

    .

commit 15db5cf
Author: Carl Pearson <[email protected]>
Date:   Mon Apr 6 07:46:41 2020 -0500

    .

commit 83e4e4f
Author: Carl Pearson <[email protected]>
Date:   Mon Apr 6 07:39:01 2020 -0500

    cuda 10.1

commit 9bf48c6
Author: Carl Pearson <[email protected]>
Date:   Fri Apr 3 08:12:21 2020 -0500

    .

commit 48d957e
Author: Carl Pearson <[email protected]>
Date:   Fri Apr 3 07:54:28 2020 -0500

    remove install files

commit efc8283
Merge: cc817f8 dfbe32f
Author: Carl Pearson <[email protected]>
Date:   Thu Apr 2 13:40:04 2020 -0500

    Merge branch 'sgemm' of github.com:cwpearson/nvidia-performance-tools into sgemm

commit cc817f8
Author: Carl Pearson <[email protected]>
Date:   Thu Apr 2 13:39:58 2020 -0500

    sgemm working

commit d2e5b26
Author: Carl Pearson <[email protected]>
Date:   Thu Apr 2 13:29:11 2020 -0500

    cpu test

commit dfbe32f
Author: Carl Pearson <[email protected]>
Date:   Thu Apr 2 07:58:12 2020 -0500

    add rai_build

commit 468757f
Author: Carl Pearson <[email protected]>
Date:   Thu Apr 2 07:40:29 2020 -0500

    .

commit cc2ace8
Author: Carl Pearson <[email protected]>
Date:   Wed Apr 1 17:44:48 2020 -0500

    .

commit b205b6a
Author: Carl Pearson <[email protected]>
Date:   Wed Apr 1 17:44:27 2020 -0500

    .

commit 141a794
Author: Carl Pearson <[email protected]>
Date:   Wed Apr 1 17:31:04 2020 -0500

    .

commit 7c6caff
Author: Carl Pearson <[email protected]>
Date:   Wed Apr 1 17:25:30 2020 -0500

    .

commit 0b4dd5d
Author: Carl Pearson <[email protected]>
Date:   Wed Apr 1 17:24:02 2020 -0500

    .

commit 8bdf7ce
Author: Carl Pearson <[email protected]>
Date:   Wed Apr 1 17:19:41 2020 -0500

    add sgemm to travis

commit 108561d
Author: Carl Pearson <[email protected]>
Date:   Wed Apr 1 17:12:42 2020 -0500

    travis

commit ac7fe77
Author: Carl Pearson <[email protected]>
Date:   Wed Apr 1 17:08:18 2020 -0500

    switch row/col major

commit e5789b3
Author: Carl Pearson <[email protected]>
Date:   Wed Apr 1 17:00:45 2020 -0500

    all sgemms compile

commit 5c782a6
Author: Carl Pearson <[email protected]>
Date:   Wed Apr 1 16:39:33 2020 -0500

    steal tiled code from webgpu

commit 34775a4
Author: Carl Pearson <[email protected]>
Date:   Wed Apr 1 15:20:18 2020 -0500

    add basic sgemm impl, update argparse

commit 05a321c
Author: Carl Pearson <[email protected]>
Date:   Wed Apr 1 10:16:56 2020 -0500

    basic sgemm kernel and copy regtiled from 508
  • Loading branch information
cwpearson committed Apr 6, 2020
1 parent f80c8e7 commit d546091
Show file tree
Hide file tree
Showing 18 changed files with 1,000 additions and 34 deletions.
Empty file added .dockerignore
Empty file.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sgemm/build
*.deb
*.run
14 changes: 10 additions & 4 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
dist: bionic
language: minimal
arch:
- ppc64le
- amd64

docker: true
jobs:
include:
- arch: ppc64le
env: BUILD_DOCKER=1
docker: true
- arch: amd64
env: BUILD_DOCKER=1
docker: true
- env: BUILD_TYPE=Release
- env: BUILD_TYPE=Debug

before_script:
- ci/install_deps.sh || travis_terminate 1;
Expand Down
30 changes: 29 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,23 @@ docker pull cwpearson/nvidia-performance-tools/latest-ppc64le

[ECE 408 Spring 2020 - Introduction to Nvidia Performance Tools](https://docs.google.com/presentation/d/1A5i3Zdh7ltOLdW7qHZ2tviXYcyl1sKvM7kRpnzOD7tQ/edit?usp=sharing)

## nvcc

```
--profile (-pg)
Instrument generated code/executable for use by gprof (Linux only).
--debug (-g)
Generate debug information for host code.
--device-debug (-G)
Generate debug information for device code. Turns off all optimizations.
Don't use for profiling; use -lineinfo instead.
--generate-line-info (-lineinfo)
Generate line-number information for device code.
```

## Nsight Compute

```bash
Expand All @@ -29,12 +46,23 @@ nv-nsight-cu-cli --csv a.out
nsys profile a.out
```


## Managing docker images

* `docker ps -a`
* `docker rm `docker ps -a -q``

Run a profiling container:
```bash
docker run cwpearson/nvidia-performance-tools:latest-amd64
```

Resume a previously exited container:
```bash
* docker ps -a # find the ID
* docker start <ID> # resume the exited container
* docker attach <ID> # attach a terminal to the container
```

## Resources

* [Using Nvidia Nsight Systems in Containers and the Cloud](https://devblogs.nvidia.com/nvidia-nsight-systems-containers-cloud/)
4 changes: 3 additions & 1 deletion docker/amd64.dockerfile → amd64_10-1.dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM nvidia/cuda:10.2-devel-ubuntu18.04
FROM nvidia/cuda:10.1-devel-ubuntu18.04

# Set one or more individual labels
LABEL maintainer="Carl Pearson"
Expand All @@ -20,6 +20,8 @@ COPY NVIDIA_Nsight_Systems_Linux_2020.2.1.71.deb nsight_systems.deb
RUN chmod +x nsight_compute.run
RUN TERM=xterm ./nsight_compute.run --quiet -- -noprompt -targetpath=/usr/local/NVIDIA-Nsight-Compute
ENV PATH=$PATH:/usr/local/NVIDIA-Nsight-Compute
RUN rm nsight_compute.run

# install Nsight Systems
RUN dpkg -i nsight_systems.deb
RUN rm nsight_systems.deb
45 changes: 27 additions & 18 deletions ci/build.sh
Original file line number Diff line number Diff line change
@@ -1,29 +1,38 @@
set -x

set -e

cd docker
ls -halt
source ci/env.sh

if [[ $BUILD_DOCKER == "1" ]]; then
cd $TRAVIS_BUILD_DIR

echo $DOCKER_PASS | docker login -u $DOCKER_USER --password-stdin
echo $DOCKER_PASS | docker login -u $DOCKER_USER --password-stdin

TRAVIS_COMMIT=${TRAVIS_COMMIT:0:7}
DOCKER_REPO=nvidia-performance-tools
DOCKER_SLUG=$DOCKER_USER/$DOCKER_REPO
DOCKER_TAG=$TRAVIS_CPU_ARCH-10.2-$TRAVIS_BRANCH-$TRAVIS_COMMIT
TRAVIS_COMMIT=${TRAVIS_COMMIT:0:7}
DOCKER_REPO=nvidia-performance-tools
DOCKER_SLUG=$DOCKER_USER/$DOCKER_REPO
DOCKER_TAG=${TRAVIS_CPU_ARCH}-10.1-$TRAVIS_BRANCH-$TRAVIS_COMMIT


docker build -f $TRAVIS_CPU_ARCH.dockerfile -t $DOCKER_SLUG:$DOCKER_TAG .
docker push $DOCKER_SLUG:$DOCKER_TAG
docker build -f ${TRAVIS_CPU_ARCH}_10-1.dockerfile -t $DOCKER_SLUG:$DOCKER_TAG .
docker push $DOCKER_SLUG:$DOCKER_TAG


if [[ $TRAVIS_BRANCH == master ]]; then
docker tag $DOCKER_SLUG:$DOCKER_TAG $DOCKER_SLUG:latest-$TRAVIS_CPU_ARCH
docker push $DOCKER_SLUG:latest-$TRAVIS_CPU_ARCH
else
docker tag $DOCKER_SLUG:$DOCKER_TAG $DOCKER_SLUG:$TRAVIS_BRANCH-$TRAVIS_CPU_ARCH
docker push $DOCKER_SLUG:$TRAVIS_BRANCH-$TRAVIS_CPU_ARCH
if [[ $TRAVIS_BRANCH == master ]]; then
docker tag $DOCKER_SLUG:$DOCKER_TAG $DOCKER_SLUG:latest-${TRAVIS_CPU_ARCH}
docker push $DOCKER_SLUG:latest-${TRAVIS_CPU_ARCH}
else
docker tag $DOCKER_SLUG:$DOCKER_TAG $DOCKER_SLUG:$TRAVIS_BRANCH-$TRAVIS_CPU_ARCH
docker push $DOCKER_SLUG:$TRAVIS_BRANCH-${TRAVIS_CPU_ARCH}
fi
fi

# remove the login key from the image
rm -fv $HOME/.docker/config.json

if [[ $BUILD_TYPE != '' ]]; then
cd $TRAVIS_BUILD_DIR
cd sgemm
mkdir -p build
cd build
cmake .. -DCMAKE_BUILD_TYPE=$BUILD_TYPE
make VERBOSE=1
fi
6 changes: 6 additions & 0 deletions ci/env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
CMAKE_PREFIX=$HOME/cmake

export PATH=/usr/local/cuda/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

export PATH=$CMAKE_PREFIX/bin:$PATH
43 changes: 34 additions & 9 deletions ci/install_deps.sh
Original file line number Diff line number Diff line change
@@ -1,13 +1,38 @@
set -x
set -e

sudo apt-get update
sudo apt-get install -q -y wget

if [[ $TRAVIS_CPU_ARCH == ppc64le ]]; then
wget -qSL https://uofi.box.com/shared/static/vfxflckdjixxkc524qltme4sx8kt3w9d.deb -O docker/NVIDIA_Nsight_Systems_Power_CLI_Only_2020.2.1.71.deb;
wget -qSL https://uofi.box.com/shared/static/swjp2bjr7xj153vzw8mvutv2tqomypxu.run -O docker/nsight-compute-PPC64LE-2019.5.0.14-27346997.run;
elif [[ $TRAVIS_CPU_ARCH == amd64 ]]; then
wget -qSL https://uofi.box.com/shared/static/zjsv2rayiotyrdix6a6yd3w8cre56lo0.deb -O docker/NVIDIA_Nsight_Systems_Linux_2020.2.1.71.deb;
wget -qSL https://uofi.box.com/shared/static/4fuf3wws1uplhf29ndcq4s91kl3jyl7z.run -O docker/nsight-compute-linux-2019.5.0.14-27346997.run;
source ci/env.sh

# deps for building docker images
if [[ $BUILD_DOCKER == "1" ]]; then
cd $TRAVIS_BUILD_DIR

if [[ $TRAVIS_CPU_ARCH == ppc64le ]]; then
wget -qSL https://uofi.box.com/shared/static/vfxflckdjixxkc524qltme4sx8kt3w9d.deb -O NVIDIA_Nsight_Systems_Power_CLI_Only_2020.2.1.71.deb;
wget -qSL https://uofi.box.com/shared/static/swjp2bjr7xj153vzw8mvutv2tqomypxu.run -O nsight-compute-PPC64LE-2019.5.0.14-27346997.run;
elif [[ $TRAVIS_CPU_ARCH == amd64 ]]; then
wget -qSL https://uofi.box.com/shared/static/zjsv2rayiotyrdix6a6yd3w8cre56lo0.deb -O NVIDIA_Nsight_Systems_Linux_2020.2.1.71.deb;
wget -qSL https://uofi.box.com/shared/static/4fuf3wws1uplhf29ndcq4s91kl3jyl7z.run -O nsight-compute-linux-2019.5.0.14-27346997.run;
fi
fi

# deps for building code
if [[ $BUILD_TYPE != '' ]]; then
cs $HOME

## install CMake
wget -qSL https://github.com/Kitware/CMake/releases/download/v3.8.2/cmake-3.8.2-Linux-x86_64.tar.gz -O cmake.tar.gz
mkdir -p $CMAKE_PREFIX
tar -xf cmake.tar.gz --strip-components=1 -C $CMAKE_PREFIX
rm cmake.tar.gz

## install CUDA
sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub
CUDA102="http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-repo-ubuntu1804_10.2.89-1_amd64.deb"
wget -SL $CUDA102 -O cuda.deb
sudo dpkg -i cuda.deb
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
cuda-toolkit-10-2
rm cuda.deb
fi
4 changes: 3 additions & 1 deletion docker/ppc64le.dockerfile → ppc64le_10-1.dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM nvidia/cuda-ppc64le:10.2-devel-ubuntu18.04
FROM nvidia/cuda-ppc64le:10.1-devel-ubuntu18.04

# Set one or more individual labels
LABEL maintainer="Carl Pearson"
Expand All @@ -20,6 +20,8 @@ COPY NVIDIA_Nsight_Systems_Power_CLI_Only_2020.2.1.71.deb nsight_systems.deb
RUN chmod +x nsight_compute.run
RUN TERM=xterm ./nsight_compute.run --quiet -- -noprompt -targetpath=/usr/local/NVIDIA-Nsight-Compute
ENV PATH=$PATH:/usr/local/NVIDIA-Nsight-Compute
RUN rm nsight_compute.run

# install Nsight Systems
RUN dpkg -i nsight_systems.deb
RUN rm nsight_systems.deb
22 changes: 22 additions & 0 deletions sgemm/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
project(sgemm LANGUAGES CXX CUDA)

# 3.8+ for CUDA
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)

if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release")
message(STATUS "Setting CMAKE_BUILD_TYPE=Release")
endif()

set(CMAKE_CUDA_STANDARD 11)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)

include_directories(PUBLIC SYSTEM include)

# Add line info to binaries to help with profiling
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")

add_executable(sgemm-cpu cpu.cpp)
add_executable(sgemm-basic basic.cu)
add_executable(sgemm-tiled tiled.cu)
add_executable(sgemm-regtiled-coarsened regtiled_coarsened.cu)
143 changes: 143 additions & 0 deletions sgemm/basic.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#include <algorithm>

#include <argparse/argparse.hpp>

#include "common.hpp"

/* NOTE: A and C are column major, B is row major
*/
__global__ void mygemm(float *c, //<! [out] and MxN matrix
const float *a, //<! [in] an MxK matrix
const float *b, //<! [in] an KxN matrix
const int M, const int N, const int K) {

#define A(_i, _j) a[(_i) + (_j)*M]
#define B(_i, _j) b[(_i)*N + (_j)]
#define C(_i, _j) c[(_i) + (_j)*M]

int gidx = blockDim.x * blockIdx.x + threadIdx.x;
int gidy = blockDim.y * blockIdx.y + threadIdx.y;

for (int i = gidy; i < M; i += gridDim.y * blockDim.y) {
for (int j = gidx; j < N; j += gridDim.x * blockDim.x) {
float acc = 0;
for (int k = 0; k < K; ++k) {
acc += A(i, k) * B(k, j);
}
C(i, j) = acc;
}
}

#undef A
#undef B
#undef C
}

int main(int argc, char **argv) {

argparse::Parser parser;

// default matrix sizes:
// A: 307 x 313
// B: 313 x 311
// C: 307 x 311
int m = 307;
int n = 311;
int k = 313;

int nIters = 5;
int nWarmup = 5;
parser.add_positional(m);
parser.add_positional(n);
parser.add_positional(k);
parser.add_option(nIters, "--iters");
parser.add_option(nWarmup, "--warmup");

if (!parser.parse(argc, argv)) {
parser.help();
exit(EXIT_FAILURE);
}

const int64_t flop = m * n * k * 2;

// initialize host data
std::vector<float> aHost(m * k), bHost(k * n), cHost(m * n), cExpected(m * n);
std::generate(aHost.begin(), aHost.end(), random_int);
std::generate(bHost.begin(), bHost.end(), random_int);

// allocate device data
float *aDev, *bDev, *cDev;
CUDA_RUNTIME(cudaMalloc(&aDev, aHost.size() * sizeof(float)));
CUDA_RUNTIME(cudaMalloc(&bDev, bHost.size() * sizeof(float)));
CUDA_RUNTIME(cudaMalloc(&cDev, cHost.size() * sizeof(float)));

// copy data to device
CUDA_RUNTIME(cudaMemcpy(aDev, aHost.data(), aHost.size() * sizeof(float),
cudaMemcpyDefault));
CUDA_RUNTIME(cudaMemcpy(bDev, bHost.data(), bHost.size() * sizeof(float),
cudaMemcpyDefault));

// create events to time GPU kernel
cudaEvent_t start, stop;
CUDA_RUNTIME(cudaEventCreate(&start));
CUDA_RUNTIME(cudaEventCreate(&stop));

// GPU kernel launch parameters
dim3 dimBlock(32, 8);
dim3 dimGrid;
dimGrid.x = (n + dimBlock.x - 1) / dimBlock.x;
dimGrid.y = (m + dimBlock.y - 1) / dimBlock.y;

// total elapsed time
float elapsed = 0;

/* Launch the kernel nIters + nWarmup times
Check for correctness on the first time.
Record the time after nWarmup runs complete.
*/
for (int i = 0; i < nIters + nWarmup; ++i) {
CUDA_RUNTIME(cudaEventRecord(start));
mygemm<<<dimGrid, dimBlock>>>(cDev, aDev, bDev, m, n, k);
CUDA_RUNTIME(cudaEventRecord(stop));
CUDA_RUNTIME(cudaEventSynchronize(stop));

// check result once
if (i == 0) {
// copy result to host
CUDA_RUNTIME(cudaMemcpy(cHost.data(), cDev, cHost.size() * sizeof(float),
cudaMemcpyDefault));

// check result on host
cpu_gemm(cExpected.data(), aHost.data(), bHost.data(), m, n, k);

for (size_t i = 0; i < cExpected.size(); ++i) {
if (!equal(cExpected[i], cHost[i], 1e-6)) {
std::cerr << "Error!\n";
exit(EXIT_FAILURE);
}
}
}

float millis;
CUDA_RUNTIME(cudaEventElapsedTime(&millis, start, stop));
std::cerr << i << ": " << millis << (i >= nWarmup ? " *" : " ") << "\n";

// record time after warmup runs
if (i >= nWarmup) {
elapsed += millis;
}
}

// print results
double gflops = flop / ((elapsed / nIters) / 1000) / 1e9;
std::cerr << gflops << "GFLOPS (" << flop << " flop, "
<< (elapsed / nIters) / 1000 << "s)\n";

// release resources
CUDA_RUNTIME(cudaEventDestroy(start));
CUDA_RUNTIME(cudaEventDestroy(stop));
CUDA_RUNTIME(cudaFree(aDev));
CUDA_RUNTIME(cudaFree(bDev));
CUDA_RUNTIME(cudaFree(cDev));
return 0;
}
Loading

0 comments on commit d546091

Please sign in to comment.