diff --git a/serving/src/main/java/ai/djl/serving/http/InferenceRequestHandler.java b/serving/src/main/java/ai/djl/serving/http/InferenceRequestHandler.java
index e9fbd698c..308145649 100644
--- a/serving/src/main/java/ai/djl/serving/http/InferenceRequestHandler.java
+++ b/serving/src/main/java/ai/djl/serving/http/InferenceRequestHandler.java
@@ -418,6 +418,20 @@ void sendOutput(Output output, ChannelHandlerContext ctx) {
             byte[] buf = supplier.nextChunk(chunkReadTime, TimeUnit.SECONDS);
             // Defer sending HTTP header until first chunk received.
             // This allows inference update HTTP code.
+            // If this is the first and last chunk, we're in a non-streaming case and can
+            // use default response without chunked transfer encoding
+            if (first && !supplier.hasNext()) {
+                FullHttpResponse resp =
+                        new DefaultFullHttpResponse(HttpVersion.HTTP_1_1, status);
+                for (Map.Entry<String, String> entry : output.getProperties().entrySet()) {
+                    resp.headers().set(entry.getKey(), entry.getValue());
+                }
+                if (buf != null) {
+                    resp.content().writeBytes(buf);
+                }
+                NettyUtils.sendHttpResponse(ctx, resp, true);
+                return;
+            }
             if (first) {
                 code = output.getCode();
                 status = new HttpResponseStatus(code, output.getMessage());
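
For context, the patch switches between two Netty response styles: a single non-chunked FullHttpResponse for the first-and-only-chunk case, and chunked transfer encoding for the streaming case. The following is a minimal standalone sketch of that distinction, not DJL Serving code; the class and method names are hypothetical, and it uses plain Netty calls instead of DJL's NettyUtils helper.

import io.netty.buffer.Unpooled;
import io.netty.channel.ChannelFutureListener;
import io.netty.channel.ChannelHandlerContext;
import io.netty.handler.codec.http.DefaultFullHttpResponse;
import io.netty.handler.codec.http.DefaultHttpContent;
import io.netty.handler.codec.http.DefaultHttpResponse;
import io.netty.handler.codec.http.FullHttpResponse;
import io.netty.handler.codec.http.HttpResponse;
import io.netty.handler.codec.http.HttpResponseStatus;
import io.netty.handler.codec.http.HttpUtil;
import io.netty.handler.codec.http.HttpVersion;
import io.netty.handler.codec.http.LastHttpContent;

import java.util.List;

/** Hypothetical helper contrasting Netty's non-chunked and chunked response paths. */
final class ResponseSketch {

    /** Non-streaming case: one FullHttpResponse with an explicit Content-Length. */
    static void sendFull(ChannelHandlerContext ctx, HttpResponseStatus status, byte[] body) {
        FullHttpResponse resp = new DefaultFullHttpResponse(HttpVersion.HTTP_1_1, status);
        if (body != null) {
            resp.content().writeBytes(body);
        }
        HttpUtil.setContentLength(resp, resp.content().readableBytes());
        ctx.writeAndFlush(resp).addListener(ChannelFutureListener.CLOSE);
    }

    /** Streaming case: headers first, then each chunk, then a terminating LastHttpContent. */
    static void sendChunked(ChannelHandlerContext ctx, HttpResponseStatus status, List<byte[]> chunks) {
        HttpResponse headers = new DefaultHttpResponse(HttpVersion.HTTP_1_1, status);
        HttpUtil.setTransferEncodingChunked(headers, true);
        ctx.write(headers);
        for (byte[] chunk : chunks) {
            ctx.write(new DefaultHttpContent(Unpooled.wrappedBuffer(chunk)));
        }
        ctx.writeAndFlush(LastHttpContent.EMPTY_LAST_CONTENT).addListener(ChannelFutureListener.CLOSE);
    }

    private ResponseSketch() {}
}

The non-chunked path matters because it lets the server emit a Content-Length header and skip Transfer-Encoding: chunked entirely when the full payload is already available, which is what the early return in the patch achieves for non-streaming inference responses.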