The Real-time Digital Human API supports real-time camera recognition, allowing the AI to analyze visual input from the user’s camera and respond accordingly. This enables use cases such as object recognition, scene analysis, gesture detection, and visual Q&A.
Provider Compatibility: Camera recognition is only supported with the OpenAIRealtime provider. This feature is not available when using other AI providers (e.g., ElevenLabs).
Camera Transmission Methods
NavTalk provides two methods for transmitting camera data to the AI, each optimized for different use cases:
| Method | Latency | Bandwidth | Use Case |
|---|---|---|---|
| WebRTC Video Stream | Ultra-low (~50ms) | High | Continuous video analysis, real-time gesture recognition, live monitoring |
| Periodic Snapshots | Medium (~2s) | Low | Periodic scene recognition, object detection, visual Q&A |
Both methods can be used simultaneously. The WebRTC stream provides real-time video for immediate AI processing, while periodic snapshots offer AI-friendly image data for vision model analysis.
Method 1: WebRTC Video Stream (Real-time)
This method transmits the camera video stream directly through WebRTC, providing ultra-low latency for real-time visual analysis.
Request Camera Access
First, request camera permissions and create the video stream:

// Data definitions
let cameraStream = null;
let cameraEnabled = false;
let peerConnection = null;

/**
 * Request access to the user's camera and start the local preview.
 * On success, sets `cameraStream`/`cameraEnabled`; on failure (e.g. the
 * user denies permission) logs the error and leaves the camera disabled.
 */
async function startCamera() {
  // 640x360 @ 15fps keeps bandwidth modest while remaining AI-friendly.
  const constraints = {
    video: {
      width: { ideal: 640 },
      height: { ideal: 360 },
      frameRate: { ideal: 15 },
    },
    audio: false, // Audio is handled separately
  };
  try {
    cameraStream = await navigator.mediaDevices.getUserMedia(constraints);
    cameraEnabled = true;
    // Display camera preview
    if (cameraPreviewVideo) {
      cameraPreviewVideo.srcObject = cameraStream;
      await cameraPreviewVideo.play();
    }
    console.log("Camera started successfully");
  } catch (err) {
    console.error("Unable to access camera:", err);
    cameraEnabled = false;
  }
}
Camera permissions are required from the user. Make sure to request permissions appropriately and handle cases where the user denies access.
Add Video Track to WebRTC Connection
After establishing the WebRTC connection (after handling the offer), add the camera video track to the peer connection:

let videoSenders = [];
/**
 * Attach or detach the camera's video tracks on the active peer connection.
 *
 * @param {boolean} enable - true to add the camera tracks, false to remove them.
 * No-op when the camera stream or the peer connection does not exist yet.
 */
async function pushVideoStream(enable) {
  if (!cameraStream || !peerConnection) {
    return;
  }
  if (enable) {
    // Idempotence guard: calling enable twice must not add duplicate senders.
    if (videoSenders.length > 0) {
      return;
    }
    // getVideoTracks() (not getTracks()) so a stream that ever carries audio
    // cannot leak audio tracks through this video-only path.
    cameraStream.getVideoTracks().forEach((track) => {
      const sender = peerConnection.addTrack(track, cameraStream);
      videoSenders.push(sender);
    });
    console.log("Video stream added to WebRTC connection");
  } else {
    // Remove every sender we previously registered, then forget them.
    videoSenders.forEach((sender) => {
      peerConnection.removeTrack(sender);
    });
    videoSenders = [];
    console.log("Video stream removed from WebRTC connection");
  }
}
// Called when the server's WebRTC offer arrives.
// IMPORTANT ordering: local tracks must be added BEFORE createAnswer() —
// tracks added after setLocalDescription() are not part of the answer SDP
// and would silently never reach the server without a renegotiation.
async function handleOffer(message) {
  peerConnection = new RTCPeerConnection(configuration);

  // Register event handlers before setRemoteDescription so no early
  // track events fired during SDP processing are missed.
  peerConnection.ontrack = (event) => {
    // Handle incoming video/audio from server
  };

  const offer = new RTCSessionDescription(message.sdp);
  await peerConnection.setRemoteDescription(offer);

  // Add audio track (for microphone)
  // ... audio track setup ...

  // Add video track if camera is enabled
  if (cameraEnabled && cameraStream) {
    await pushVideoStream(true);
  }

  const answer = await peerConnection.createAnswer();
  await peerConnection.setLocalDescription(answer);
  sendAnswerMessage(peerConnection.localDescription);
}
Real-time Transmission: The WebRTC video stream provides ultra-low latency (typically 50-100ms) and is ideal for scenarios requiring continuous visual analysis, such as gesture control or real-time object tracking.
Method 2: Periodic Snapshots (Image Frames)
This method captures video frames at regular intervals (every 2 seconds by default), converts them to JPEG images, and sends them via WebSocket. This approach is bandwidth-efficient and suitable for periodic scene analysis.
Start Periodic Frame Capture
After the WebSocket connection is established and the session is ready, start capturing frames periodically:

// Data definitions for periodic snapshots
let cameraCaptureInterval = null;
const cameraCaptureIntervalMs = 2000; // Capture every 2 seconds

/**
 * Begin the periodic snapshot loop: one frame immediately, then one every
 * `cameraCaptureIntervalMs`. Skips entirely when not in a call or the
 * WebSocket is not open; restarts cleanly if a loop is already running.
 */
function startCameraCapture() {
  const socketOpen = socket && socket.readyState === WebSocket.OPEN;
  if (!calling || !socketOpen) {
    console.log('WebSocket not connected or not calling, skipping camera capture');
    return;
  }

  // Replace any previous loop instead of stacking intervals.
  if (cameraCaptureInterval) {
    clearInterval(cameraCaptureInterval);
  }

  // First frame goes out right away.
  captureFrameAndSend();

  cameraCaptureInterval = setInterval(() => {
    if (cameraEnabled && socket?.readyState === WebSocket.OPEN) {
      captureFrameAndSend();
    }
  }, cameraCaptureIntervalMs);
}
Capture Frame and Send to AI
Capture video frames, convert them to base64-encoded images, and send them to the AI via WebSocket:

// Event type constant
const NavTalkMessageType = {
// `type` value for outbound camera-frame messages (consumed by captureFrameAndSend)
REALTIME_INPUT_IMAGE: "realtime.input_image",
// ... other event types
};
/**
 * Capture the current preview frame, encode it as a JPEG data URL, and
 * send it to the AI over the WebSocket as a `realtime.input_image` message.
 * Silently warns and returns when the video element, camera stream, or
 * socket is not ready.
 */
function captureFrameAndSend() {
  const video = cameraPreviewVideo;
  if (!video || !cameraStream || !socket ||
      socket.readyState !== WebSocket.OPEN ||
      video.readyState < 2) { // HAVE_CURRENT_DATA
    console.warn("Video not ready for capture");
    return;
  }

  // Create canvas and capture frame
  const canvas = document.createElement('canvas');
  canvas.width = video.videoWidth || 640;
  canvas.height = video.videoHeight || 360;
  const ctx = canvas.getContext('2d');

  try {
    ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
    canvas.toBlob(async (blob) => {
      if (!blob) {
        console.error("Failed to create blob from canvas");
        return;
      }
      try {
        const arrayBuffer = await blob.arrayBuffer();
        const bytes = new Uint8Array(arrayBuffer);

        // Encode in 32 KB chunks: String.fromCharCode.apply(null, bytes) on
        // the whole array can exceed the engine's maximum argument count and
        // throw a RangeError for larger JPEG frames.
        let binary = "";
        const CHUNK_SIZE = 0x8000;
        for (let i = 0; i < bytes.length; i += CHUNK_SIZE) {
          binary += String.fromCharCode(...bytes.subarray(i, i + CHUNK_SIZE));
        }
        const base64 = btoa(binary);
        const imageUrl = `data:${blob.type};base64,${base64}`;

        // Send to AI via WebSocket using correct message format
        const message = {
          type: NavTalkMessageType.REALTIME_INPUT_IMAGE,
          data: {
            content: imageUrl, // base64-encoded Data URL
            reply: 0 // 0 = no immediate reply, 1 = request reply
          }
        };
        socket.send(JSON.stringify(message));
      } catch (error) {
        console.error("Error processing camera frame:", error);
      }
    }, "image/jpeg", 0.7); // JPEG quality: 0.7
  } catch (error) {
    console.error("Error capturing frame:", error);
  }
}
Best Practices:
- Use JPEG quality of 0.7 (70%) to reduce payload size while maintaining reasonable image quality
- Resolution of 640x360 is sufficient for most use cases and reduces bandwidth requirements
- Capture interval of 2-3 seconds balances real-time responsiveness and bandwidth usage
- Set `reply: 0` to avoid triggering an immediate AI response for each frame
Stop Camera
Release camera resources when done:function stopCamera() {
cameraEnabled = false;
// Stop periodic snapshot capture
if (cameraCaptureInterval) {
clearInterval(cameraCaptureInterval);
cameraCaptureInterval = null;
}
// Remove video track from WebRTC connection
pushVideoStream(false);
// Stop camera stream
if (cameraStream) {
cameraStream.getTracks().forEach(track => track.stop());
cameraStream = null;
}
// Clear preview video
if (cameraPreviewVideo) {
cameraPreviewVideo.srcObject = null;
}
console.log("Camera stopped");
}
// Toggle camera on/off
async function toggleCamera() {
  if (cameraEnabled) {
    stopCamera();
    return;
  }

  await startCamera();

  // startCamera() may fail (e.g. the user denied permission); in that case
  // cameraEnabled stays false — don't wire WebRTC or start the snapshot loop.
  if (!cameraEnabled) {
    return;
  }

  // If already in a call, add video track to WebRTC
  if (calling && peerConnection) {
    pushVideoStream(true);
    // Also start periodic snapshots if needed
    if (socket?.readyState === WebSocket.OPEN) {
      startCameraCapture();
    }
  }
}
Always call stopCamera() to release camera resources when the session ends or the component is unmounted. Failing to do so may prevent other applications from accessing the camera.
Choosing the Right Method
Use WebRTC Video Stream When:
- You need continuous real-time video analysis (e.g., gesture recognition, live monitoring)
- Low latency is critical (50-100ms)
- The AI model supports direct video stream processing
- Network bandwidth is sufficient
Use Periodic Snapshots When:
- You need periodic scene recognition (e.g., “What do you see?”)
- Bandwidth is limited
- You want to reduce server processing load
- The use case doesn’t require frame-by-frame analysis
Use Both Methods When:
- You want the best of both worlds: real-time responsiveness with AI vision capabilities
- The AI provider supports both video stream and image input
- Your application needs both continuous monitoring and detailed image analysis
Recommendation: Start with periodic snapshots (Method 2) for most use cases. Add WebRTC video stream (Method 1) only when you specifically need ultra-low latency or continuous video analysis. The NavTalk console implementation uses both methods simultaneously for optimal performance.
Complete Integration Example
Here’s how to integrate both camera transmission methods in a complete implementation:
// Define event type constants
// WebSocket message `type` values exchanged with the NavTalk server.
const NavTalkMessageType = {
REALTIME_INPUT_IMAGE: "realtime.input_image", // client -> server camera snapshot
WEB_RTC_OFFER: "webrtc.signaling.offer", // server -> client SDP offer
WEB_RTC_ANSWER: "webrtc.signaling.answer", // client -> server SDP answer
WEB_RTC_ICE_CANDIDATE: "webrtc.signaling.iceCandidate", // ICE candidate exchange
REALTIME_SESSION_UPDATED: "realtime.session.updated", // session is ready
// ... other types
};
// Global variables
// Shared mutable state for camera capture and the WebRTC connection.
let cameraStream = null; // MediaStream from getUserMedia; null while camera is off
let cameraEnabled = false; // user-facing camera toggle state
let peerConnection = null; // active RTCPeerConnection, created in handleOffer
let videoSenders = []; // RTCRtpSenders for camera tracks, kept for later removal
let cameraCaptureInterval = null; // setInterval handle for the snapshot loop
const cameraCaptureIntervalMs = 2000; // snapshot cadence in milliseconds
// Start camera and add to WebRTC when offer is received.
// IMPORTANT ordering: local tracks must be added BEFORE createAnswer() —
// tracks added after setLocalDescription() are not part of the answer SDP
// and would silently never reach the server without a renegotiation.
async function handleOffer(message) {
  peerConnection = new RTCPeerConnection(configuration);

  // Register handlers before setRemoteDescription so no early track or
  // ICE events fired during SDP processing are missed.
  peerConnection.ontrack = (event) => {
    // Handle incoming audio/video from server
    if (remoteVideo && event.streams[0]) {
      remoteVideo.srcObject = event.streams[0];
    }
  };
  peerConnection.onicecandidate = (event) => {
    if (event.candidate) {
      sendIceMessage(event.candidate);
    }
  };

  const offer = new RTCSessionDescription(message.sdp);
  await peerConnection.setRemoteDescription(offer);

  // Add audio track (microphone)
  if (audioStream) {
    audioStream.getTracks().forEach((track) => {
      peerConnection.addTrack(track, audioStream);
    });
  }

  // Add video track if camera is enabled (WebRTC video stream)
  if (cameraEnabled && cameraStream) {
    cameraStream.getVideoTracks().forEach((track) => {
      const sender = peerConnection.addTrack(track, cameraStream);
      videoSenders.push(sender);
    });
  }

  const answer = await peerConnection.createAnswer();
  await peerConnection.setLocalDescription(answer);
  sendAnswerMessage(peerConnection.localDescription);
}
// Start periodic snapshots when session is ready
function handleSessionUpdated() {
  console.log("Session updated, ready to receive audio");
  startRecording();
  // Only start the snapshot loop when the user has the camera turned on.
  if (!cameraEnabled) {
    return;
  }
  startCameraCapture();
}
// Message handler: dispatch inbound WebSocket messages by their `type` field.
async function handleReceivedMessage(data) {
  const { type } = data;
  if (type === NavTalkMessageType.WEB_RTC_OFFER) {
    await handleOffer(data.data);
  } else if (type === NavTalkMessageType.REALTIME_SESSION_UPDATED) {
    handleSessionUpdated();
  }
  // ... other cases
}
Integration Timing:
- WebRTC video stream is added when handling the WebRTC offer (during connection setup)
- Periodic snapshots start when the realtime.session.updated event is received (i.e., when the session is ready)
- Both methods can run simultaneously without conflicts