Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JNI: throw CUDA errors more specifically #10551

Merged
merged 29 commits into from
Apr 24, 2022
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 174 additions & 1 deletion java/src/main/java/ai/rapids/cudf/CudaException.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,6 +15,9 @@
*/
package ai.rapids.cudf;

import java.util.Collections;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Set;

/**
* Exception from the cuda language/library. Be aware that because of how cuda does asynchronous
* processing exceptions from cuda can be thrown by method calls that did not cause the exception
Expand All @@ -30,9 +33,179 @@
public class CudaException extends RuntimeException {
/**
 * Creates a CudaException carrying the given message.
 *
 * The message is also handed to extractCudaError, and the result is stored in the
 * cudaError field.
 *
 * @param message the error message, which is scanned for a token starting with "cudaError"
 */
CudaException(String message) {
super(message);
// Derive the typed error from the text of the message.
this.cudaError = extractCudaError(message);
}

/**
 * Creates a CudaException carrying the given message and cause.
 *
 * The message is also handed to extractCudaError, and the result is stored in the
 * cudaError field.
 *
 * @param message the error message, which is scanned for a token starting with "cudaError"
 * @param cause the underlying throwable, forwarded to RuntimeException
 */
CudaException(String message, Throwable cause) {
super(message, cause);
// Derive the typed error from the text of the message.
this.cudaError = extractCudaError(message);
}

public final CudaError cudaError;

/**
* The Java mirror of cudaError, which facilitates the tracking of CUDA errors in JVM.
*/
public enum CudaError {
cudaErrorInvalidValue(1),
cudaErrorMemoryAllocation(2),
cudaErrorInitializationError(3),
cudaErrorCudartUnloading(4),
cudaErrorProfilerDisabled(5),
cudaErrorProfilerNotInitialized(6),
cudaErrorProfilerAlreadyStarted(7),
cudaErrorProfilerAlreadyStopped(8),
cudaErrorInvalidConfiguration(9),
cudaErrorInvalidPitchValue(12),
cudaErrorInvalidSymbol(13),
cudaErrorInvalidHostPointer(16),
cudaErrorInvalidDevicePointer(17),
cudaErrorInvalidTexture(18),
cudaErrorInvalidTextureBinding(19),
cudaErrorInvalidChannelDescriptor(20),
cudaErrorInvalidMemcpyDirection(21),
cudaErrorAddressOfConstant(22),
cudaErrorTextureFetchFailed(23),
cudaErrorTextureNotBound(24),
cudaErrorSynchronizationError(25),
cudaErrorInvalidFilterSetting(26),
cudaErrorInvalidNormSetting(27),
cudaErrorMixedDeviceExecution(28),
cudaErrorNotYetImplemented(31),
cudaErrorMemoryValueTooLarge(32),
cudaErrorStubLibrary(34),
cudaErrorInsufficientDriver(35),
cudaErrorCallRequiresNewerDriver(36),
cudaErrorInvalidSurface(37),
cudaErrorDuplicateVariableName(43),
cudaErrorDuplicateTextureName(44),
cudaErrorDuplicateSurfaceName(45),
cudaErrorDevicesUnavailable(46),
cudaErrorIncompatibleDriverContext(49),
cudaErrorMissingConfiguration(52),
cudaErrorPriorLaunchFailure(53),
cudaErrorLaunchMaxDepthExceeded(65),
cudaErrorLaunchFileScopedTex(66),
cudaErrorLaunchFileScopedSurf(67),
cudaErrorSyncDepthExceeded(68),
cudaErrorLaunchPendingCountExceeded(69),
cudaErrorInvalidDeviceFunction(98),
cudaErrorNoDevice(100),
cudaErrorInvalidDevice(101),
cudaErrorDeviceNotLicensed(102),
cudaErrorSoftwareValidityNotEstablished(103),
cudaErrorStartupFailure(127),
cudaErrorInvalidKernelImage(200),
cudaErrorDeviceUninitialized(201),
cudaErrorMapBufferObjectFailed(205),
cudaErrorUnmapBufferObjectFailed(206),
cudaErrorArrayIsMapped(207),
cudaErrorAlreadyMapped(208),
cudaErrorNoKernelImageForDevice(209),
cudaErrorAlreadyAcquired(210),
cudaErrorNotMapped(211),
cudaErrorNotMappedAsArray(212),
cudaErrorNotMappedAsPointer(213),
cudaErrorECCUncorrectable(214),
cudaErrorUnsupportedLimit(215),
cudaErrorDeviceAlreadyInUse(216),
cudaErrorPeerAccessUnsupported(217),
cudaErrorInvalidPtx(218),
cudaErrorInvalidGraphicsContext(219),
cudaErrorNvlinkUncorrectable(220),
cudaErrorJitCompilerNotFound(221),
cudaErrorUnsupportedPtxVersion(222),
cudaErrorJitCompilationDisabled(223),
cudaErrorUnsupportedExecAffinity(224),
cudaErrorInvalidSource(300),
cudaErrorFileNotFound(301),
cudaErrorSharedObjectSymbolNotFound(302),
cudaErrorSharedObjectInitFailed(303),
cudaErrorOperatingSystem(304),
cudaErrorInvalidResourceHandle(400),
cudaErrorIllegalState(401),
cudaErrorSymbolNotFound(500),
cudaErrorNotReady(600),
cudaErrorIllegalAddress(700),
cudaErrorLaunchOutOfResources(701),
cudaErrorLaunchTimeout(702),
cudaErrorLaunchIncompatibleTexturing(703),
cudaErrorPeerAccessAlreadyEnabled(704),
cudaErrorPeerAccessNotEnabled(705),
cudaErrorSetOnActiveProcess(708),
cudaErrorContextIsDestroyed(709),
cudaErrorAssert(710),
cudaErrorTooManyPeers(711),
cudaErrorHostMemoryAlreadyRegistered(712),
cudaErrorHostMemoryNotRegistered(713),
cudaErrorHardwareStackError(714),
cudaErrorIllegalInstruction(715),
cudaErrorMisalignedAddress(716),
cudaErrorInvalidAddressSpace(717),
cudaErrorInvalidPc(718),
cudaErrorLaunchFailure(719),
cudaErrorCooperativeLaunchTooLarge(720),
cudaErrorNotPermitted(800),
cudaErrorNotSupported(801),
cudaErrorSystemNotReady(802),
cudaErrorSystemDriverMismatch(803),
cudaErrorCompatNotSupportedOnDevice(804),
cudaErrorMpsConnectionFailed(805),
cudaErrorMpsRpcFailure(806),
cudaErrorMpsServerNotReady(807),
cudaErrorMpsMaxClientsReached(808),
cudaErrorMpsMaxConnectionsReached(809),
cudaErrorStreamCaptureUnsupported(900),
cudaErrorStreamCaptureInvalidated(901),
cudaErrorStreamCaptureMerge(902),
cudaErrorStreamCaptureUnmatched(903),
cudaErrorStreamCaptureUnjoined(904),
cudaErrorStreamCaptureIsolation(905),
cudaErrorStreamCaptureImplicit(906),
cudaErrorCapturedEvent(907),
cudaErrorStreamCaptureWrongThread(908),
cudaErrorTimeout(909),
cudaErrorGraphExecUpdateFailure(910),
cudaErrorExternalDevice(911),
cudaErrorUnknown(999),
cudaErrorApiFailureBase(10000);

final int code;

private static final Set<CudaError> stickyErrors = new HashSet<CudaError>(){{
jlowe marked this conversation as resolved.
Show resolved Hide resolved
add(CudaError.cudaErrorIllegalAddress);
add(CudaError.cudaErrorLaunchTimeout);
add(CudaError.cudaErrorHardwareStackError);
add(CudaError.cudaErrorIllegalInstruction);
add(CudaError.cudaErrorMisalignedAddress);
add(CudaError.cudaErrorInvalidAddressSpace);
add(CudaError.cudaErrorInvalidPc);
add(CudaError.cudaErrorLaunchFailure);
add(CudaError.cudaErrorExternalDevice);
add(CudaError.cudaErrorUnknown);
}};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't very robust. I described a more robust way to detect sticky errors here: #10200 (comment)

Soon I hope to have libcudf throw a separate exception type for sticky errors.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1, it would be good to align this with what's going on in that other issue.

Copy link
Contributor Author

@sperlingxx sperlingxx Apr 1, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I totally agree with @jrhemstad as well. So, shall we pend the JNI-side work for the time being until the libcudf is enhanced in terms of CUDA error handling?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The RAPIDS Accelerator is already addressing this for the short-term at NVIDIA/spark-rapids#5118. Therefore I'd rather we take the time here to leverage a proper interface in libcudf rather than rush this in and then need to change it when libcudf refines its exception handling soon afterwards.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @jlowe, I reworked the PR. For now, it pushes down the sticky error detection to libcudf.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, I am stuck on how to trigger a fatal CUDA error through the unit test.


/** Stores the given integer as this constant's {@code code}. */
CudaError(int code) {
  this.code = code;
}

/**
 * Returns whether this CudaError is sticky or not.
 *
 * Sticky errors leave the process in an inconsistent state and any further CUDA work will return
 * the same error. To continue using CUDA, the process must be terminated and relaunched.
 *
 * @return true if this constant is a member of the stickyErrors set, false otherwise
 */
public boolean isSticky() {
return stickyErrors.contains(this);
}
}

/**
 * Finds the CUDA error named in an exception message.
 *
 * The message is split on whitespace, and the first token that starts with "cudaError" and
 * matches the name of a CudaError constant is returned.
 *
 * @param message the exception message to scan; may be null
 * @return the CudaError whose name appears in the message
 * @throws CudfException if the message is null or names no known CudaError
 */
private static CudaError extractCudaError(String message) {
  if (message == null) {
    // Without this guard, split() would fail with a NullPointerException.
    throw new CudfException("invalid CUDA error message: " + message);
  }
  for (String segment : message.split("\\s+")) {
    if (!segment.startsWith("cudaError")) {
      continue;
    }
    try {
      return CudaError.valueOf(segment);
    } catch (IllegalArgumentException ignored) {
      // The token is not a constant of this enum (for example an error name this mirror
      // does not list). Keep scanning so the failure is reported uniformly below.
    }
  }
  throw new CudfException("invalid CUDA error message: " + message);
}
}
48 changes: 31 additions & 17 deletions java/src/main/native/include/jni_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -736,7 +736,8 @@ class native_jstringArray {
/**
* @brief create a cuda exception from a given cudaError_t
*/
inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowable cause = NULL) {
inline jthrowable cuda_exception(JNIEnv *const env, const char *file, unsigned int line,
cudaError_t status, jthrowable cause = NULL) {
jclass ex_class = env->FindClass(cudf::jni::CUDA_ERROR_CLASS);
if (ex_class == NULL) {
return NULL;
Expand All @@ -747,25 +748,21 @@ inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowab
return NULL;
}

jstring msg = env->NewStringUTF(cudaGetErrorString(status));
if (msg == NULL) {
const char *err_name = cudaGetErrorName(status);
if (err_name == nullptr) {
return NULL;
}
const char *err_string = cudaGetErrorString(status);

jobject ret = env->NewObject(ex_class, ctor_id, msg, cause);
return (jthrowable)ret;
}
// Build the error message in the format of cudf::cuda_error, so that cudf::jni::CUDA_ERROR_CLASS
// can parse both of them.
std::string n_msg = "CUDA error encountered at: " + std::string{file} + ":" +
Copy link
Contributor

@jrhemstad jrhemstad Mar 31, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wouldn't count on the contents of that exception message being stable.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd rather see cudf::cuda_error updated to allow the extraction of the CUDA error ID rather than relying on parsing. In general we should be moving away from string-scraping for error identification, not adding more instances of it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Filed #10553

std::to_string(line) + ": " + std::to_string(status) + " " + err_name + " " +
err_string;
jstring j_msg = env->NewStringUTF(n_msg.c_str());

inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) {
if (cudaSuccess != cuda_status) {
// Clear the last error so it does not propagate.
cudaGetLastError();
jthrowable jt = cuda_exception(env, cuda_status);
if (jt != NULL) {
env->Throw(jt);
throw jni_exception("CUDA ERROR");
}
}
jobject ret = env->NewObject(ex_class, ctor_id, j_msg, cause);
return (jthrowable)ret;
}

} // namespace jni
Expand Down Expand Up @@ -796,14 +793,27 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) {
if (cudaSuccess != internal_cuda_status) { \
/* Clear the last error so it does not propagate.*/ \
cudaGetLastError(); \
jthrowable jt = cudf::jni::cuda_exception(env, internal_cuda_status); \
jthrowable jt = cudf::jni::cuda_exception(env, __FILE__, __LINE__, internal_cuda_status); \
if (jt != NULL) { \
env->Throw(jt); \
} \
return ret_val; \
} \
}

#define JNI_CUDA_CHECK(env, cuda_status) \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Best practice would be to put this in a do{...} while(0)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed. Thanks for advice!

jlowe marked this conversation as resolved.
Show resolved Hide resolved
{ \
if (cudaSuccess != cuda_status) { \
/* Clear the last error so it does not propagate.*/ \
cudaGetLastError(); \
jthrowable jt = cudf::jni::cuda_exception(env, __FILE__, __LINE__, cuda_status); \
if (jt != NULL) { \
env->Throw(jt); \
throw cudf::jni::jni_exception("CUDA ERROR"); \
} \
} \
}

#define JNI_NULL_CHECK(env, obj, error_msg, ret_val) \
{ \
if ((obj) == 0) { \
Expand Down Expand Up @@ -831,6 +841,10 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) {
std::string("Could not allocate native memory: ") + (e.what() == nullptr ? "" : e.what()); \
JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \
} \
catch (const cudf::cuda_error &e) { \
/* For CUDA errors, the specific error code will be extracted from error message. */ \
JNI_CHECK_THROW_NEW(env, cudf::jni::CUDA_ERROR_CLASS, e.what(), ret_val); \
} \
catch (const std::exception &e) { \
/* If jni_exception caught then a Java exception is pending and this will not overwrite it. */ \
JNI_CHECK_THROW_NEW(env, class_name, e.what(), ret_val); \
Expand Down
6 changes: 3 additions & 3 deletions java/src/main/native/src/CudaJni.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -44,7 +44,7 @@ void auto_set_device(JNIEnv *env) {
if (Cudf_device != cudaInvalidDeviceId) {
if (Thread_device != Cudf_device) {
cudaError_t cuda_status = cudaSetDevice(Cudf_device);
jni_cuda_check(env, cuda_status);
JNI_CUDA_CHECK(env, cuda_status);
Thread_device = Cudf_device;
}
}
Expand All @@ -53,7 +53,7 @@ void auto_set_device(JNIEnv *env) {
/** Fills all the bytes in the buffer 'buf' with 'value'. */
void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value) {
cudaError_t cuda_status = cudaMemsetAsync((void *)buf.data(), value, buf.size());
jni_cuda_check(env, cuda_status);
JNI_CUDA_CHECK(env, cuda_status);
}

} // namespace jni
Expand Down
4 changes: 2 additions & 2 deletions java/src/main/native/src/RmmJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -328,10 +328,10 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j
try {
// make sure the CUDA device is setup in the context
cudaError_t cuda_status = cudaFree(0);
cudf::jni::jni_cuda_check(env, cuda_status);
JNI_CUDA_CHECK(env, cuda_status);
int device_id;
cuda_status = cudaGetDevice(&device_id);
cudf::jni::jni_cuda_check(env, cuda_status);
JNI_CUDA_CHECK(env, cuda_status);

bool use_pool_alloc = allocation_mode & 1;
bool use_managed_mem = allocation_mode & 2;
Expand Down
17 changes: 15 additions & 2 deletions java/src/test/java/ai/rapids/cudf/CudaTest.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,7 +18,7 @@

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.*;

public class CudaTest {

Expand All @@ -32,4 +32,17 @@ public void testGetCudaRuntimeInfo() {
assertEquals(Cuda.getNativeComputeMode(), Cuda.getComputeMode().nativeId);
}

@Test
public void testCudaException() {
  // A memset at address Long.MAX_VALUE is expected to fail with a CudaException whose parsed
  // error is cudaErrorInvalidValue, which is not in the sticky set.
  CudaException ex = assertThrows(CudaException.class,
      () -> Cuda.memset(Long.MAX_VALUE, (byte) 0, 1024));
  assertEquals(CudaException.CudaError.cudaErrorInvalidValue, ex.cudaError);
  assertFalse(ex.cudaError.isSticky());
}
}