From antigravity-awesome-skills
Build real-time voice AI applications using Azure AI Voice Live SDK (azure-ai-voicelive). Use this skill when creating Python applications that need real-time bidirectional audio communication with...
npx claudepluginhub absjaded/antigravity-awesome-skills
This skill uses the workspace's default tool permissions.
Build real-time voice AI applications with bidirectional WebSocket communication.
Verifies tests pass on completed feature branch, presents options to merge locally, create GitHub PR, keep as-is or discard; executes choice and cleans up worktree.
Guides root cause investigation for bugs, test failures, unexpected behavior, performance issues, and build failures before proposing fixes.
Writes implementation plans from specs for multi-step tasks, mapping files and breaking into TDD bite-sized steps before coding.
Build real-time voice AI applications with bidirectional WebSocket communication.
pip install azure-ai-voicelive aiohttp azure-identity
AZURE_COGNITIVE_SERVICES_ENDPOINT=https://<region>.api.cognitive.microsoft.com
# For API key auth (not recommended for production)
AZURE_COGNITIVE_SERVICES_KEY=<api-key>
DefaultAzureCredential (preferred):
from azure.ai.voicelive.aio import connect
from azure.identity.aio import DefaultAzureCredential
async with connect(
endpoint=os.environ["AZURE_COGNITIVE_SERVICES_ENDPOINT"],
credential=DefaultAzureCredential(),
model="gpt-4o-realtime-preview",
credential_scopes=["https://cognitiveservices.azure.com/.default"]
) as conn:
...
API Key:
from azure.ai.voicelive.aio import connect
from azure.core.credentials import AzureKeyCredential
async with connect(
endpoint=os.environ["AZURE_COGNITIVE_SERVICES_ENDPOINT"],
credential=AzureKeyCredential(os.environ["AZURE_COGNITIVE_SERVICES_KEY"]),
model="gpt-4o-realtime-preview"
) as conn:
...
import asyncio
import os

from azure.ai.voicelive.aio import connect
from azure.identity.aio import DefaultAzureCredential


async def main():
    """Connect to Voice Live, configure the session, and print events.

    The event loop below prints every event type, prints the transcript when
    a ``response.audio_transcript.done`` event arrives, and stops at the
    first ``response.done`` event.

    Raises:
        KeyError: If ``AZURE_COGNITIVE_SERVICES_ENDPOINT`` is not set.
    """
    # Enter the async credential as a context manager so it is closed on
    # exit rather than leaked.
    async with DefaultAzureCredential() as credential:
        async with connect(
            endpoint=os.environ["AZURE_COGNITIVE_SERVICES_ENDPOINT"],
            credential=credential,
            model="gpt-4o-realtime-preview",
            credential_scopes=["https://cognitiveservices.azure.com/.default"],
        ) as conn:
            # Update session with instructions
            await conn.session.update(session={
                "instructions": "You are a helpful assistant.",
                "modalities": ["text", "audio"],
                "voice": "alloy",
            })

            # Listen for events
            async for event in conn:
                print(f"Event: {event.type}")
                if event.type == "response.audio_transcript.done":
                    print(f"Transcript: {event.transcript}")
                elif event.type == "response.done":
                    break


asyncio.run(main())
The VoiceLiveConnection exposes these resources:
| Resource | Purpose | Key Methods |
|---|---|---|
| `conn.session` | Session configuration | `update(session=...)` |
| `conn.response` | Model responses | `create()`, `cancel()` |
| `conn.input_audio_buffer` | Audio input | `append()`, `commit()`, `clear()` |
| `conn.output_audio_buffer` | Audio output | `clear()` |
| `conn.conversation` | Conversation state | `item.create()`, `item.delete()`, `item.truncate()` |
| `conn.transcription_session` | Transcription config | `update(session=...)` |
from azure.ai.voicelive.models import RequestSession, FunctionTool
await conn.session.update(session=RequestSession(
instructions="You are a helpful voice assistant.",
modalities=["text", "audio"],
voice="alloy", # or "echo", "shimmer", "sage", etc.
input_audio_format="pcm16",
output_audio_format="pcm16",
turn_detection={
"type": "server_vad",
"threshold": 0.5,
"prefix_padding_ms": 300,
"silence_duration_ms": 500
},
tools=[
FunctionTool(
type="function",
name="get_weather",
description="Get current weather",
parameters={
"type": "object",
"properties": {
"location": {"type": "string"}
},
"required": ["location"]
}
)
]
))
import base64
# Read audio chunk (16-bit PCM, 24kHz mono)
audio_chunk = await read_audio_from_microphone()
b64_audio = base64.b64encode(audio_chunk).decode()
await conn.input_audio_buffer.append(audio=b64_audio)
async for event in conn:
if event.type == "response.audio.delta":
audio_bytes = base64.b64decode(event.delta)
await play_audio(audio_bytes)
elif event.type == "response.audio.done":
print("Audio complete")
async for event in conn:
match event.type:
# Session events
case "session.created":
print(f"Session: {event.session}")
case "session.updated":
print("Session updated")
# Audio input events
case "input_audio_buffer.speech_started":
print(f"Speech started at {event.audio_start_ms}ms")
case "input_audio_buffer.speech_stopped":
print(f"Speech stopped at {event.audio_end_ms}ms")
# Transcription events
case "conversation.item.input_audio_transcription.completed":
print(f"User said: {event.transcript}")
case "conversation.item.input_audio_transcription.delta":
print(f"Partial: {event.delta}")
# Response events
case "response.created":
print(f"Response started: {event.response.id}")
case "response.audio_transcript.delta":
print(event.delta, end="", flush=True)
case "response.audio.delta":
audio = base64.b64decode(event.delta)
case "response.done":
print(f"Response complete: {event.response.status}")
# Function calls
case "response.function_call_arguments.done":
result = handle_function(event.name, event.arguments)
await conn.conversation.item.create(item={
"type": "function_call_output",
"call_id": event.call_id,
"output": json.dumps(result)
})
await conn.response.create()
# Errors
case "error":
print(f"Error: {event.error.message}")
# Set turn_detection to None so that turns are driven by the explicit
# commit()/create() calls below rather than detected automatically.
await conn.session.update(session={"turn_detection": None})
# Manually control turns: append audio, then mark the turn as finished
# and request a response. The order of these three calls matters.
await conn.input_audio_buffer.append(audio=b64_audio)
await conn.input_audio_buffer.commit() # End of user turn
await conn.response.create() # Trigger response
async for event in conn:
if event.type == "input_audio_buffer.speech_started":
# User interrupted - cancel current response
await conn.response.cancel()
await conn.output_audio_buffer.clear()
# Add system message
await conn.conversation.item.create(item={
"type": "message",
"role": "system",
"content": [{"type": "input_text", "text": "Be concise."}]
})
# Add user message
await conn.conversation.item.create(item={
"type": "message",
"role": "user",
"content": [{"type": "input_text", "text": "Hello!"}]
})
await conn.response.create()
| Voice | Description |
|---|---|
| `alloy` | Neutral, balanced |
| `echo` | Warm, conversational |
| `shimmer` | Clear, professional |
| `sage` | Calm, authoritative |
| `coral` | Friendly, upbeat |
| `ash` | Deep, measured |
| `ballad` | Expressive |
| `verse` | Storytelling |
Azure voices: Use AzureStandardVoice, AzureCustomVoice, or AzurePersonalVoice models.
| Format | Sample Rate | Use Case |
|---|---|---|
| `pcm16` | 24kHz | Default, high quality |
| `pcm16-8000hz` | 8kHz | Telephony |
| `pcm16-16000hz` | 16kHz | Voice assistants |
| `g711_ulaw` | 8kHz | Telephony (US) |
| `g711_alaw` | 8kHz | Telephony (EU) |
# Alternative values for the session's `turn_detection` setting; each dict
# below is one option, selected by its "type" key.

# Server VAD (default)
{"type": "server_vad", "threshold": 0.5, "silence_duration_ms": 500}
# Azure Semantic VAD (smarter detection)
{"type": "azure_semantic_vad"}
{"type": "azure_semantic_vad_en"} # English optimized
{"type": "azure_semantic_vad_multilingual"}
from azure.ai.voicelive.aio import ConnectionError, ConnectionClosed
try:
async with connect(...) as conn:
async for event in conn:
if event.type == "error":
print(f"API Error: {event.error.code} - {event.error.message}")
except ConnectionClosed as e:
print(f"Connection closed: {e.code} - {e.reason}")
except ConnectionError as e:
print(f"Connection error: {e}")
Use this skill when you need to carry out the workflow or actions described in the overview.