Skip to content

Commit

Permalink
GPT 4o audio
Browse files Browse the repository at this point in the history
  • Loading branch information
abrenneke committed Feb 2, 2025
1 parent 7ff31f2 commit e6fd847
Show file tree
Hide file tree
Showing 2 changed files with 148 additions and 1 deletion.
125 changes: 124 additions & 1 deletion packages/core/src/model/nodes/ChatNode.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import { nodeDefinition } from '../NodeDefinition.js';
import type { TokenizerCallInfo } from '../../integrations/Tokenizer.js';
import { DEFAULT_CHAT_ENDPOINT } from '../../utils/defaults.js';
import { chatMessageToOpenAIChatCompletionMessage } from '../../utils/chatMessageToOpenAIChatCompletionMessage.js';
import { base64ToUint8Array } from '../../utils/base64.js';

export type ChatNode = ChartNode<'chat', ChatNodeData>;

Expand Down Expand Up @@ -64,6 +65,9 @@ export type ChatNodeConfigData = {

modalitiesIncludeText?: boolean;
modalitiesIncludeAudio?: boolean;

audioVoice?: string;
audioFormat?: 'wav' | 'mp3' | 'flac' | 'opus' | 'pcm16';
};

export type ChatNodeData = ChatNodeConfigData & {
Expand All @@ -87,6 +91,8 @@ export type ChatNodeData = ChatNodeConfigData & {
useResponseFormatInput?: boolean;
useAdditionalParametersInput?: boolean;
useResponseSchemaNameInput?: boolean;
useAudioVoiceInput?: boolean;
useAudioFormatInput?: boolean;

/** Given the same set of inputs, return the same output without hitting GPT */
cache: boolean;
Expand Down Expand Up @@ -150,6 +156,9 @@ export class ChatNodeImpl extends NodeImpl<ChatNode> {
useServerTokenCalculation: true,
outputUsage: false,
usePredictedOutput: false,

modalitiesIncludeAudio: false,
modalitiesIncludeText: true,
},
};

Expand Down Expand Up @@ -380,6 +389,24 @@ export class ChatNodeImpl extends NodeImpl<ChatNode> {
});
}

if (this.data.useAudioVoiceInput) {
inputs.push({
dataType: 'string',
id: 'audioVoice' as PortId,
title: 'Audio Voice',
description: 'The voice to use for audio responses. See your model for supported voices.',
});
}

if (this.data.useAudioFormatInput) {
inputs.push({
dataType: 'string',
id: 'audioFormat' as PortId,
title: 'Audio Format',
description: 'The format to use for audio responses.',
});
}

return inputs;
}

Expand Down Expand Up @@ -452,6 +479,22 @@ export class ChatNodeImpl extends NodeImpl<ChatNode> {
});
}

if (this.data.modalitiesIncludeAudio) {
outputs.push({
dataType: 'audio',
id: 'audio' as PortId,
title: 'Audio',
description: 'The audio response from the model.',
});

outputs.push({
dataType: 'string',
id: 'audioTranscript' as PortId,
title: 'Transcript',
description: 'The transcript of the audio response.',
});
}

return outputs;
}

Expand Down Expand Up @@ -662,6 +705,29 @@ export class ChatNodeImpl extends NodeImpl<ChatNode> {
dataKey: 'modalitiesIncludeAudio',
helperMessage: 'If on, the model will include audio in its responses. Only relevant for multimodal models.',
},
{
type: 'string',
label: 'Audio Voice',
dataKey: 'audioVoice',
useInputToggleDataKey: 'useAudioVoiceInput',
helperMessage: 'The voice to use for audio responses. See your model for supported voices.',
hideIf: (data) => !data.modalitiesIncludeAudio,
},
{
type: 'dropdown',
label: 'Audio Format',
dataKey: 'audioFormat',
useInputToggleDataKey: 'useAudioFormatInput',
options: [
{ value: 'wav', label: 'WAV' },
{ value: 'mp3', label: 'MP3' },
{ value: 'flac', label: 'FLAC' },
{ value: 'opus', label: 'OPUS' },
{ value: 'pcm16', label: 'PCM16' },
],
defaultValue: 'wav',
hideIf: (data) => !data.modalitiesIncludeAudio,
},
],
},
{
Expand Down Expand Up @@ -970,6 +1036,28 @@ export class ChatNodeImpl extends NodeImpl<ChatNode> {
: { type: 'content' as const, content: predictedOutput.map((part) => ({ type: 'text', text: part })) }
: undefined;

const modalities: ('text' | 'audio')[] = [];
if (this.data.modalitiesIncludeText) {
modalities.push('text');
}
if (this.data.modalitiesIncludeAudio) {
modalities.push('audio');
}

const audio = modalities.includes('audio')
? {
voice: getInputOrData(this.data, inputs, 'audioVoice'),
format:
(getInputOrData(this.data, inputs, 'audioFormat') as
| 'wav'
| 'mp3'
| 'flac'
| 'opus'
| 'pcm16'
| undefined) ?? 'wav',
}
: undefined;

try {
return await retry(
async () => {
Expand All @@ -987,6 +1075,8 @@ export class ChatNodeImpl extends NodeImpl<ChatNode> {
response_format: openaiResponseFormat,
tool_choice: toolChoice,
prediction: predictionObject,
modalities,
audio,
...additionalParameters,
};

Expand All @@ -1010,7 +1100,8 @@ export class ChatNodeImpl extends NodeImpl<ChatNode> {

const startTime = Date.now();

if (isO1Beta) {
// Non-streaming APIs
if (isO1Beta || audio) {
const response = await chatCompletions({
auth: {
apiKey: context.settings.openAiKey ?? '',
Expand Down Expand Up @@ -1050,6 +1141,23 @@ export class ChatNodeImpl extends NodeImpl<ChatNode> {
};
}

if (modalities.includes('audio')) {
const audioData = response.choices[0]!.message.audio;

output['audio' as PortId] = {
type: 'audio',
value: {
data: base64ToUint8Array(audioData!.data),
mediaType: audioFormatToMediaType(audio!.format),
},
};

output['audioTranscript' as PortId] = {
type: 'string',
value: response.choices[0]!.message.audio!.transcript,
};
}

output['duration' as PortId] = { type: 'number', value: Date.now() - startTime };

Object.freeze(output);
Expand Down Expand Up @@ -1418,3 +1526,18 @@ export function getChatNodeMessages(inputs: Inputs) {
/**
 * Computes the cost of a number of tokens given a price quoted per 1000 tokens.
 *
 * @param tokenCount - How many tokens were used.
 * @param type - Whether the tokens are prompt or completion tokens. The value is
 *   not read by the calculation itself; the caller is expected to supply the
 *   matching `costPerThousand`.
 * @param costPerThousand - Price for 1000 tokens.
 * @returns `tokenCount / 1000` multiplied by `costPerThousand`.
 */
export function getCostForTokens(tokenCount: number, type: 'prompt' | 'completion', costPerThousand: number) {
  // Convert the raw count into "thousands of tokens" first, then apply the rate.
  const thousandsOfTokens = tokenCount / 1000;
  return thousandsOfTokens * costPerThousand;
}

/**
 * Maps a requested audio output format to the media type stored on the
 * resulting audio value.
 *
 * @param format - The audio format that was requested from the model.
 * @returns The media type string for that format.
 * @throws Error if `format` is not one of the known formats. The static type
 *   forbids this, but the value can originate from an unchecked runtime string,
 *   so it is verified here instead of silently returning `undefined`.
 */
function audioFormatToMediaType(format: 'wav' | 'mp3' | 'flac' | 'opus' | 'pcm16') {
  switch (format) {
    case 'wav':
      return 'audio/wav';
    case 'mp3':
      return 'audio/mpeg';
    case 'flac':
      return 'audio/flac';
    case 'opus':
      return 'audio/opus';
    case 'pcm16':
      // NOTE(review): pcm16 is reported as 'audio/wav', same as 'wav'. If the
      // payload is headerless PCM this label may be inaccurate — confirm.
      return 'audio/wav';
    default: {
      // Exhaustiveness check: compile-time error if a new format is added to the
      // union without a case, and a clear runtime error for unvalidated input.
      const unsupported: never = format;
      throw new Error(`Unsupported audio format: ${String(unsupported)}`);
    }
  }
}
24 changes: 24 additions & 0 deletions packages/core/src/utils/openai.ts
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,14 @@ export const openaiModels = {
},
displayName: 'o3-mini (2025-01-31)',
},
'gpt-4o-audio-preview': {
maxTokens: 128000,
cost: {
prompt: 0.0025,
completion: 0.01,
},
displayName: 'GPT-4o Audio (Preview)',
},
'local-model': {
maxTokens: Number.MAX_SAFE_INTEGER,
cost: {
Expand Down Expand Up @@ -319,6 +327,13 @@ export type ChatCompletionOptions = {
type: 'content';
content: string | { type: string; text: string }[];
};

modalities: ('text' | 'audio')[];

audio?: {
voice: string;
format: 'wav' | 'mp3' | 'flac' | 'opus' | 'pcm16';
};
};

export type ChatCompletionResponse = {
Expand Down Expand Up @@ -380,6 +395,15 @@ export type ChatCompletionResponseMessage = {

/** The tool calls generated by the model, such as function calls. */
tool_calls: OpenAIFunctionToolCall[];

refusal: string | null;

audio?: {
id: string;
data: string;
expires_at: number;
transcript: string;
};
};

export type ChatCompletionChunk = {
Expand Down

0 comments on commit e6fd847

Please sign in to comment.