TanStack AI supports multimodal content in messages, allowing you to send images, audio, video, and documents alongside text to AI models that support these modalities.
When sending messages to AI models, you can include different types of content: text, images, audio, video, and documents.
Multimodal messages use the ContentPart type to represent different content types:
import type { ContentPart, ImagePart, TextPart } from '@tanstack/ai'
// Text content
const textPart: TextPart = {
type: 'text',
content: 'What do you see in this image?'
}
// Image from base64 data (mimeType is required for data sources)
const imagePart: ImagePart = {
type: 'image',
source: {
type: 'data',
value: 'base64EncodedImageData...',
mimeType: 'image/jpeg' // Required for data sources
},
metadata: {
// Provider-specific metadata
detail: 'high' // OpenAI detail level
}
}
// Image from URL (mimeType is optional for URL sources)
const imageUrlPart: ImagePart = {
type: 'image',
source: {
type: 'url',
value: 'https://example.com/image.jpg',
mimeType: 'image/jpeg' // Optional hint for URL sources
}
}import type { ContentPart, ImagePart, TextPart } from '@tanstack/ai'
// Text content
const textPart: TextPart = {
type: 'text',
content: 'What do you see in this image?'
}
// Image from base64 data (mimeType is required for data sources)
const imagePart: ImagePart = {
type: 'image',
source: {
type: 'data',
value: 'base64EncodedImageData...',
mimeType: 'image/jpeg' // Required for data sources
},
metadata: {
// Provider-specific metadata
detail: 'high' // OpenAI detail level
}
}
// Image from URL (mimeType is optional for URL sources)
const imageUrlPart: ImagePart = {
type: 'image',
source: {
type: 'url',
value: 'https://example.com/image.jpg',
mimeType: 'image/jpeg' // Optional hint for URL sources
}
}
Messages can have content as either a string or an array of ContentPart:
import { chat } from '@tanstack/ai'
import { openaiText } from '@tanstack/ai-openai'
const response = await chat({
adapter: openaiText('gpt-5.5'),
messages: [
{
role: 'user',
content: [
{ type: 'text', content: 'What is in this image?' },
{
type: 'image',
source: {
type: 'url',
value: 'https://example.com/photo.jpg'
}
}
]
}
]
})import { chat } from '@tanstack/ai'
import { openaiText } from '@tanstack/ai-openai'
const response = await chat({
adapter: openaiText('gpt-5.5'),
messages: [
{
role: 'user',
content: [
{ type: 'text', content: 'What is in this image?' },
{
type: 'image',
source: {
type: 'url',
value: 'https://example.com/photo.jpg'
}
}
]
}
]
})
OpenAI supports images and audio in their vision and audio models:
import { openaiText } from '@tanstack/ai-openai'
const adapter = openaiText('gpt-5.5')
// Image with detail level metadata
const message = {
role: 'user' ,
content: [
{ type: 'text' , content: 'Describe this image' },
{
type: 'image' ,
source: { type: 'data' , value: imageBase64, mimeType: 'image/jpeg' },
metadata: { detail: 'high' } // 'auto' | 'low' | 'high'
}
]
}import { openaiText } from '@tanstack/ai-openai'
const adapter = openaiText('gpt-5.5')
// Image with detail level metadata
const message = {
role: 'user' ,
content: [
{ type: 'text' , content: 'Describe this image' },
{
type: 'image' ,
source: { type: 'data' , value: imageBase64, mimeType: 'image/jpeg' },
metadata: { detail: 'high' } // 'auto' | 'low' | 'high'
}
]
}
Supported modalities by model:
Anthropic's Claude models support images and PDF documents:
import { anthropicText } from '@tanstack/ai-anthropic'
const adapter = anthropicText('claude-sonnet-4-6')
// Image with mimeType in source
const imageMessage = {
role: 'user' ,
content: [
{ type: 'text' , content: 'What do you see?' },
{
type: 'image' ,
source: { type: 'data' , value: imageBase64, mimeType: 'image/jpeg' }
}
]
}
// PDF document
const docMessage = {
role: 'user',
content: [
{ type: 'text', content: 'Summarize this document' },
{
type: 'document',
source: { type: 'data', value: pdfBase64, mimeType: 'application/pdf' }
}
]
}import { anthropicText } from '@tanstack/ai-anthropic'
const adapter = anthropicText('claude-sonnet-4-6')
// Image with mimeType in source
const imageMessage = {
role: 'user' ,
content: [
{ type: 'text' , content: 'What do you see?' },
{
type: 'image' ,
source: { type: 'data' , value: imageBase64, mimeType: 'image/jpeg' }
}
]
}
// PDF document
const docMessage = {
role: 'user',
content: [
{ type: 'text', content: 'Summarize this document' },
{
type: 'document',
source: { type: 'data', value: pdfBase64, mimeType: 'application/pdf' }
}
]
}
Supported modalities:
Check each model's supports.input in @tanstack/ai-anthropic's model-meta.ts for the authoritative per-model list.
Google's Gemini models support a wide range of modalities:
import { geminiText } from '@tanstack/ai-gemini'
const adapter = geminiText('gemini-3-flash-preview')
// Image with mimeType in source
const message = {
role: 'user',
content: [
{ type: 'text', content: 'Analyze this image' },
{
type: 'image',
source: { type: 'data', value: imageBase64, mimeType: 'image/png' }
}
]
}import { geminiText } from '@tanstack/ai-gemini'
const adapter = geminiText('gemini-3-flash-preview')
// Image with mimeType in source
const message = {
role: 'user',
content: [
{ type: 'text', content: 'Analyze this image' },
{
type: 'image',
source: { type: 'data', value: imageBase64, mimeType: 'image/png' }
}
]
}
Supported modalities:
Ollama supports images in compatible models:
import { ollamaText } from '@tanstack/ai-ollama'
// `ollamaText(model)` takes a model name. The host is read from the
// `OLLAMA_HOST` environment variable (defaults to http://localhost:11434).
const adapter = ollamaText('llama3.2-vision')
// Image as base64
const message = {
role: 'user',
content: [
{ type: 'text', content: 'What is in this image?' },
{
type: 'image',
source: { type: 'data', value: imageBase64, mimeType: 'image/jpeg' }
}
]
}import { ollamaText } from '@tanstack/ai-ollama'
// `ollamaText(model)` takes a model name. The host is read from the
// `OLLAMA_HOST` environment variable (defaults to http://localhost:11434).
const adapter = ollamaText('llama3.2-vision')
// Image as base64
const message = {
role: 'user',
content: [
{ type: 'text', content: 'What is in this image?' },
{
type: 'image',
source: { type: 'data', value: imageBase64, mimeType: 'image/jpeg' }
}
]
}
Note: Ollama support varies by model. Check the specific model documentation for multimodal capabilities.
Content can be provided as either inline data or a URL:
Use type: 'data' for inline base64-encoded content. The mimeType field is required to ensure providers receive proper content type information:
const imagePart = {
type: 'image',
source: {
type: 'data',
value: 'iVBORw0KGgoAAAANSUhEUgAAAAUA...', // Base64 string
mimeType: 'image/png' // Required for data sources
}
}
const audioPart = {
type: 'audio',
source: {
type: 'data',
value: 'base64AudioData...',
mimeType: 'audio/mp3' // Required for data sources
}
}const imagePart = {
type: 'image',
source: {
type: 'data',
value: 'iVBORw0KGgoAAAANSUhEUgAAAAUA...', // Base64 string
mimeType: 'image/png' // Required for data sources
}
}
const audioPart = {
type: 'audio',
source: {
type: 'data',
value: 'base64AudioData...',
mimeType: 'audio/mp3' // Required for data sources
}
}
Use type: 'url' for content hosted at a URL. The mimeType field is optional as providers can often infer it from the URL or response headers:
const imagePart = {
type: 'image' ,
source: {
type: 'url' ,
value: 'https://example.com/image.jpg',
mimeType: 'image/jpeg' // Optional hint
}
}const imagePart = {
type: 'image' ,
source: {
type: 'url' ,
value: 'https://example.com/image.jpg',
mimeType: 'image/jpeg' // Optional hint
}
}
Note: Not all providers support URL-based content for all modalities. Check provider documentation for specifics.
String content continues to work as before:
// This still works
const message = {
role: 'user',
content: 'Hello, world!'
}
// And this works for multimodal
const multimodalMessage = {
role: 'user',
content: [
{ type: 'text', content: 'Hello, world!' },
{ type: 'image', source: { type: 'url', value: '...' } }
]
}// This still works
const message = {
role: 'user',
content: 'Hello, world!'
}
// And this works for multimodal
const multimodalMessage = {
role: 'user',
content: [
{ type: 'text', content: 'Hello, world!' },
{ type: 'image', source: { type: 'url', value: '...' } }
]
}
The multimodal types are fully typed. Provider-specific metadata types are available:
import type {
ContentPart,
ImagePart,
DocumentPart,
AudioPart,
VideoPart,
TextPart
} from '@tanstack/ai'
// Provider-specific metadata types
import type { OpenAIImageMetadata } from '@tanstack/ai-openai'
import type { AnthropicImageMetadata } from '@tanstack/ai-anthropic'
import type { GeminiImageMetadata } from '@tanstack/ai-gemini'import type {
ContentPart,
ImagePart,
DocumentPart,
AudioPart,
VideoPart,
TextPart
} from '@tanstack/ai'
// Provider-specific metadata types
import type { OpenAIImageMetadata } from '@tanstack/ai-openai'
import type { AnthropicImageMetadata } from '@tanstack/ai-anthropic'
import type { GeminiImageMetadata } from '@tanstack/ai-gemini'
When receiving messages from external sources (like request.json()), the data is typed as any. TanStack AI does not ship a runtime message validator — define a schema with your preferred Standard-Schema library (Zod, Valibot, ArkType, …) and parse the body before handing it to chat().
import { chat } from '@tanstack/ai'
import { openaiText } from '@tanstack/ai-openai'
import { z } from 'zod'
// Where the bytes of a non-text part come from. The two variants are validated
// separately so that `mimeType` is mandatory for inline base64 (`data`) sources
// and only an optional hint for `url` sources.
const ContentSourceSchema = z.discriminatedUnion('type', [
  z.object({
    type: z.literal('data'),
    value: z.string(),
    mimeType: z.string(),
  }),
  z.object({
    type: z.literal('url'),
    value: z.string(),
    mimeType: z.string().optional(),
  }),
])
// Zod objects drop keys the schema does not declare, so every field that must
// reach the provider (such as `mimeType`) has to be listed explicitly.
// Only text and image parts are accepted here; add further variants for any
// other part types this endpoint should allow.
const ContentPartSchema = z.discriminatedUnion('type', [
  z.object({ type: z.literal('text'), content: z.string() }),
  z.object({ type: z.literal('image'), source: ContentSourceSchema }),
])
const MessageSchema = z.object({
// `ModelMessage.role` is 'user' | 'assistant' | 'tool' — there is no
// 'system' role. System instructions are passed separately via the
// `systemPrompts` option on `chat()`, not as messages.
role: z.enum(['user', 'assistant', 'tool']),
content: z.union([z.string(), z.array(ContentPartSchema)]),
})
const BodySchema = z.object({ messages: z.array(MessageSchema) })
// In an API route handler
const { messages } = BodySchema.parse(await request.json())
const stream = chat({
adapter: openaiText('gpt-5.5'),
messages,
})import { chat } from '@tanstack/ai'
import { openaiText } from '@tanstack/ai-openai'
import { z } from 'zod'
// Where the bytes of a non-text part come from. The two variants are validated
// separately so that `mimeType` is mandatory for inline base64 (`data`) sources
// and only an optional hint for `url` sources.
const ContentSourceSchema = z.discriminatedUnion('type', [
  z.object({
    type: z.literal('data'),
    value: z.string(),
    mimeType: z.string(),
  }),
  z.object({
    type: z.literal('url'),
    value: z.string(),
    mimeType: z.string().optional(),
  }),
])
// Zod objects drop keys the schema does not declare, so every field that must
// reach the provider (such as `mimeType`) has to be listed explicitly.
// Only text and image parts are accepted here; add further variants for any
// other part types this endpoint should allow.
const ContentPartSchema = z.discriminatedUnion('type', [
  z.object({ type: z.literal('text'), content: z.string() }),
  z.object({ type: z.literal('image'), source: ContentSourceSchema }),
])
const MessageSchema = z.object({
// `ModelMessage.role` is 'user' | 'assistant' | 'tool' — there is no
// 'system' role. System instructions are passed separately via the
// `systemPrompts` option on `chat()`, not as messages.
role: z.enum(['user', 'assistant', 'tool']),
content: z.union([z.string(), z.array(ContentPartSchema)]),
})
const BodySchema = z.object({ messages: z.array(MessageSchema) })
// In an API route handler
const { messages } = BodySchema.parse(await request.json())
const stream = chat({
adapter: openaiText('gpt-5.5'),
messages,
})
The TypeScript types on chat() still constrain anything you append at the call site to the modalities supported by the selected model.
Use appropriate source type: Use data for small content or when you need to include content inline. Use url for large files or when the content is already hosted.
Include metadata: Provide relevant metadata (like mimeType or detail) to help the model process the content correctly.
Check model support: Not all models support all modalities. Verify the model you're using supports the content types you want to send.
Handle errors gracefully: When a model doesn't support a particular modality, it may throw an error. Handle these cases in your application.
When using the ChatClient from @tanstack/ai-client, you can send multimodal messages directly from your UI using the sendMessage method.
The sendMessage method accepts either a simple string or a MultimodalContent object:
import { ChatClient, fetchServerSentEvents } from '@tanstack/ai-client'
const client = new ChatClient({
connection: fetchServerSentEvents('/api/chat'),
})
// Simple text message
await client.sendMessage('Hello!')
// Multimodal message with image
await client.sendMessage({
content: [
{ type: 'text', content: 'What is in this image?' },
{
type: 'image',
source: { type: 'url', value: 'https://example.com/photo.jpg' }
}
]
})import { ChatClient, fetchServerSentEvents } from '@tanstack/ai-client'
const client = new ChatClient({
connection: fetchServerSentEvents('/api/chat'),
})
// Simple text message
await client.sendMessage('Hello!')
// Multimodal message with image
await client.sendMessage({
content: [
{ type: 'text', content: 'What is in this image?' },
{
type: 'image',
source: { type: 'url', value: 'https://example.com/photo.jpg' }
}
]
})
You can provide a custom ID for the message:
await client.sendMessage({
content: 'Hello!',
id: 'custom-message-id-123'
})await client.sendMessage({
content: 'Hello!',
id: 'custom-message-id-123'
})
The second parameter allows you to pass additional forwardedProps for that specific request. These are shallow-merged with the client's base forwardedProps configuration, with per-message values taking priority:
const client = new ChatClient({
connection: fetchServerSentEvents('/api/chat'),
forwardedProps: { model: 'gpt-5' }, // Base forwarded props
})
// Per-message forwardedProps, merged over the base for this request only
// (model matches the base value here, so only temperature is new)
await client.sendMessage('Analyze this complex problem', {
model: 'gpt-5',
temperature: 0.2,
})const client = new ChatClient({
connection: fetchServerSentEvents('/api/chat'),
forwardedProps: { model: 'gpt-5' }, // Base forwarded props
})
// Per-message forwardedProps, merged over the base for this request only
// (model matches the base value here, so only temperature is new)
await client.sendMessage('Analyze this complex problem', {
model: 'gpt-5',
temperature: 0.2,
})
Note: The legacy body constructor option is still supported but deprecated. New code should use forwardedProps. Both populate the same wire field.
Here's how to use multimodal messages in a React component:
import { useChat } from '@tanstack/ai-react'
import { fetchServerSentEvents } from '@tanstack/ai-client'
import { useState } from 'react'
function ChatWithImages() {
const [imageUrl, setImageUrl] = useState('')
const { sendMessage, messages } = useChat({
connection: fetchServerSentEvents('/api/chat'),
})
// Sends a fixed text prompt together with the entered image URL.
// Does nothing while the URL field is still empty.
const handleSendWithImage = () => {
  if (!imageUrl) return

  // The parts stay inline so their `type` literals are checked against the
  // parameter type of sendMessage instead of widening to `string`.
  sendMessage({
    content: [
      { type: 'text', content: 'What do you see in this image?' },
      { type: 'image', source: { type: 'url', value: imageUrl } },
    ],
  })
}
return (
<div>
<input
type="url"
placeholder="Image URL"
value={imageUrl}
onChange={(e) => setImageUrl(e.target.value)}
/>
<button onClick={handleSendWithImage}>Send with Image</button>
</div>
)
}import { useChat } from '@tanstack/ai-react'
import { fetchServerSentEvents } from '@tanstack/ai-client'
import { useState } from 'react'
function ChatWithImages() {
const [imageUrl, setImageUrl] = useState('')
const { sendMessage, messages } = useChat({
connection: fetchServerSentEvents('/api/chat'),
})
// Sends a fixed text prompt together with the entered image URL.
// Does nothing while the URL field is still empty.
const handleSendWithImage = () => {
  if (!imageUrl) return

  // The parts stay inline so their `type` literals are checked against the
  // parameter type of sendMessage instead of widening to `string`.
  sendMessage({
    content: [
      { type: 'text', content: 'What do you see in this image?' },
      { type: 'image', source: { type: 'url', value: imageUrl } },
    ],
  })
}
return (
<div>
<input
type="url"
placeholder="Image URL"
value={imageUrl}
onChange={(e) => setImageUrl(e.target.value)}
/>
<button onClick={handleSendWithImage}>Send with Image</button>
</div>
)
}
Here's how to handle file uploads and send them as multimodal content:
import { useChat } from '@tanstack/ai-react'
import { fetchServerSentEvents } from '@tanstack/ai-client'
function ChatWithFileUpload() {
const { sendMessage } = useChat({
connection: fetchServerSentEvents('/api/chat'),
})
// Reads the chosen file as base64 and sends it as a multimodal message.
// The returned promise rejects if the browser fails or aborts the read, or if
// the result carries no base64 payload.
const handleFileUpload = async (file: File) => {
  // Convert file to base64
  const base64 = await new Promise<string>((resolve, reject) => {
    const reader = new FileReader()
    reader.onload = () => {
      const dataUrl = reader.result
      // Remove data URL prefix (e.g., "data:image/png;base64,")
      const payload =
        typeof dataUrl === 'string' ? dataUrl.split(',')[1] : undefined
      if (!payload) {
        reject(new Error(`No base64 data could be read from "${file.name}"`))
        return
      }
      resolve(payload)
    }
    // Without these handlers a failed or aborted read would leave the
    // promise pending forever.
    reader.onerror = () => {
      reject(reader.error ?? new Error(`Failed to read "${file.name}"`))
    }
    reader.onabort = () => {
      reject(new Error(`Reading "${file.name}" was aborted`))
    }
    reader.readAsDataURL(file)
  })
  // Determine content type based on file type
  const type = file.type.startsWith('image/')
    ? 'image'
    : file.type.startsWith('audio/')
      ? 'audio'
      : file.type.startsWith('video/')
        ? 'video'
        : 'document'
  await sendMessage({
    content: [
      { type: 'text', content: `Please analyze this ${type}` },
      {
        type,
        source: { type: 'data', value: base64, mimeType: file.type }
      }
    ]
  })
}
return (
<input
type="file"
accept="image/*,audio/*,video/*,.pdf"
onChange={(e) => {
const file = e.target.files?.[0]
if (file) handleFileUpload(file)
}}
/>
)
}import { useChat } from '@tanstack/ai-react'
import { fetchServerSentEvents } from '@tanstack/ai-client'
function ChatWithFileUpload() {
const { sendMessage } = useChat({
connection: fetchServerSentEvents('/api/chat'),
})
// Reads the chosen file as base64 and sends it as a multimodal message.
// The returned promise rejects if the browser fails or aborts the read, or if
// the result carries no base64 payload.
const handleFileUpload = async (file: File) => {
  // Convert file to base64
  const base64 = await new Promise<string>((resolve, reject) => {
    const reader = new FileReader()
    reader.onload = () => {
      const dataUrl = reader.result
      // Remove data URL prefix (e.g., "data:image/png;base64,")
      const payload =
        typeof dataUrl === 'string' ? dataUrl.split(',')[1] : undefined
      if (!payload) {
        reject(new Error(`No base64 data could be read from "${file.name}"`))
        return
      }
      resolve(payload)
    }
    // Without these handlers a failed or aborted read would leave the
    // promise pending forever.
    reader.onerror = () => {
      reject(reader.error ?? new Error(`Failed to read "${file.name}"`))
    }
    reader.onabort = () => {
      reject(new Error(`Reading "${file.name}" was aborted`))
    }
    reader.readAsDataURL(file)
  })
  // Determine content type based on file type
  const type = file.type.startsWith('image/')
    ? 'image'
    : file.type.startsWith('audio/')
      ? 'audio'
      : file.type.startsWith('video/')
        ? 'video'
        : 'document'
  await sendMessage({
    content: [
      { type: 'text', content: `Please analyze this ${type}` },
      {
        type,
        source: { type: 'data', value: base64, mimeType: file.type }
      }
    ]
  })
}
return (
<input
type="file"
accept="image/*,audio/*,video/*,.pdf"
onChange={(e) => {
const file = e.target.files?.[0]
if (file) handleFileUpload(file)
}}
/>
)
}