{
  "openapi": "3.1.0",
  "info": {
    "title": "WhisperX GPU Transcription Service",
    "description": "High-performance audio transcription with speaker diarization and job queuing",
    "version": "1.1.0"
  },
  "paths": {
    "/health": {
      "get": {
        "tags": ["Health"],
        "summary": "Service health & GPU status",
        "description": "Check whether the service is running, which GPU is attached, how much VRAM is in use, and the current state of the job queue. No authentication required.",
        "operationId": "health_check_health_get",
        "responses": {
          "200": {
            "description": "Successful Response",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HealthResponse"}}}
          }
        }
      }
    },
    "/queue": {
      "get": {
        "tags": ["Queue"],
        "summary": "Queue depth & wait-time estimate",
        "description": "See how many jobs are pending and processing, the estimated wait time for new submissions, and current VRAM utilisation. Useful for deciding whether to submit now or back off.",
        "operationId": "get_queue_status_queue_get",
        "responses": {
          "200": {
            "description": "Successful Response",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/QueueStatusResponse"}}}
          }
        },
        "security": [{"APIKeyHeader": []}]
      }
    },
    "/models": {
      "get": {
        "tags": ["Models"],
        "summary": "Available Whisper models",
        "description": "List every model the service can use, whether it is already loaded in GPU memory, its parameter count, and estimated VRAM footprint. Helpful for choosing a model before submitting a transcription.",
        "operationId": "list_models_models_get",
        "responses": {
          "200": {
            "description": "Successful Response",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/ModelsResponse"}}}
          }
        },
        "security": [{"APIKeyHeader": []}]
      }
    },
    "/jobs/{job_id}": {
      "get": {
        "tags": ["Jobs"],
        "summary": "Poll a transcription job",
        "description": "Retrieve the current status of a previously submitted job. Once the status is `completed`, the full transcription result (text, segments, speaker labels) is included in the response. Completed jobs are retained for one hour.",
        "operationId": "get_job_status_jobs__job_id__get",
        "security": [{"APIKeyHeader": []}],
        "parameters": [
          {
            "name": "job_id",
            "in": "path",
            "required": true,
            "schema": {"type": "string", "title": "Job Id"}
          }
        ],
        "responses": {
          "200": {
            "description": "Successful Response",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/JobStatusResponse"}}}
          },
          "422": {
            "description": "Validation Error",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}
          }
        }
      },
      "delete": {
        "tags": ["Jobs"],
        "summary": "Cancel a queued job",
        "description": "Remove a job that is still waiting in the queue. Jobs that are already being processed on the GPU cannot be cancelled. Returns an error if the job has already started, completed, or been cancelled.",
        "operationId": "cancel_job_jobs__job_id__delete",
        "security": [{"APIKeyHeader": []}],
        "parameters": [
          {
            "name": "job_id",
            "in": "path",
            "required": true,
            "schema": {"type": "string", "title": "Job Id"}
          }
        ],
        "responses": {
          "200": {
            "description": "Successful Response",
            "content": {"application/json": {"schema": {}}}
          },
          "422": {
            "description": "Validation Error",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}
          }
        }
      }
    },
    "/transcribe": {
      "post": {
        "tags": ["Transcription"],
        "summary": "Transcribe an audio file (async)",
        "description": "Upload an audio file to transcribe. In async mode (default) the request returns immediately with a job ID you can poll. Choose a Whisper model, set a language hint, and toggle speaker diarization. Supports mp3, wav, m4a, ogg, webm, and more.",
        "operationId": "transcribe_audio_transcribe_post",
        "requestBody": {
          "content": {
            "multipart/form-data": {"schema": {"$ref": "#/components/schemas/Body_transcribe_audio_transcribe_post"}}
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Successful Response",
            "content": {
              "application/json": {
                "schema": {
                  "anyOf": [
                    {"$ref": "#/components/schemas/JobSubmitResponse"},
                    {"$ref": "#/components/schemas/TranscriptionResult"}
                  ],
                  "title": "Response Transcribe Audio Transcribe Post"
                }
              }
            }
          },
          "400": {
            "description": "Bad Request",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/ErrorResponse"}}}
          },
          "401": {
            "description": "Unauthorized",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/ErrorResponse"}}}
          },
          "413": {
            "description": "Request Entity Too Large",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/ErrorResponse"}}}
          },
          "415": {
            "description": "Unsupported Media Type",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/ErrorResponse"}}}
          },
          "429": {
            "description": "Too Many Requests",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/ErrorResponse"}}}
          },
          "500": {
            "description": "Internal Server Error",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/ErrorResponse"}}}
          },
          "503": {
            "description": "Service Unavailable",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/ErrorResponse"}}}
          },
          "422": {
            "description": "Validation Error",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}
          }
        },
        "security": [{"APIKeyHeader": []}]
      }
    },
    "/transcribe/sync": {
      "post": {
        "tags": ["Transcription"],
        "summary": "Transcribe an audio file (synchronous)",
        "description": "Same transcription pipeline as `/transcribe`, but always blocks until the result is ready — no job queue, no polling. Best for short recordings or when you need the result in a single round-trip. May time out for files longer than ~20 minutes.",
        "operationId": "transcribe_audio_sync_transcribe_sync_post",
        "requestBody": {
          "content": {
            "multipart/form-data": {"schema": {"$ref": "#/components/schemas/Body_transcribe_audio_sync_transcribe_sync_post"}}
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Successful Response",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/TranscriptionResult"}}}
          },
          "422": {
            "description": "Validation Error",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}
          }
        },
        "security": [{"APIKeyHeader": []}]
      }
    },
    "/api/v1/transcribe": {
      "post": {
        "tags": ["Legacy"],
        "summary": "Transcribe audio (v1 legacy format)",
        "description": "Drop-in replacement for the original whisper-x-api service. Accepts a single audio file, always uses the `large-v3` model with speaker diarization enabled, and returns the simplified segment format. Use this if you have existing integrations that rely on the v1 response shape.",
        "operationId": "legacy_transcribe_api_v1_transcribe_post",
        "requestBody": {
          "content": {
            "multipart/form-data": {"schema": {"$ref": "#/components/schemas/Body_legacy_transcribe_api_v1_transcribe_post"}}
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Successful Response",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/LegacyTranscriptionResponse"}}}
          },
          "400": {
            "description": "Bad Request",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/ErrorResponse"}}}
          },
          "401": {
            "description": "Unauthorized",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/ErrorResponse"}}}
          },
          "500": {
            "description": "Internal Server Error",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/ErrorResponse"}}}
          },
          "422": {
            "description": "Validation Error",
            "content": {"application/json": {"schema": {"$ref": "#/components/schemas/HTTPValidationError"}}}
          }
        },
        "security": [{"APIKeyHeader": []}]
      }
    }
  },
  "components": {
    "schemas": {
      "Body_legacy_transcribe_api_v1_transcribe_post": {
        "properties": {
          "file": {
            "type": "string",
            "contentMediaType": "application/octet-stream",
            "title": "File",
            "description": "Audio file — mp3, wav, aac, ogg, m4a, wma, webm, amr, awb"
          }
        },
        "type": "object",
        "required": ["file"],
        "title": "Body_legacy_transcribe_api_v1_transcribe_post"
      },
      "Body_transcribe_audio_sync_transcribe_sync_post": {
        "properties": {
          "file": {
            "type": "string",
            "contentMediaType": "application/octet-stream",
            "title": "File",
            "description": "Audio file — mp3, wav, ogg, m4a, webm, mp4, etc."
          },
          "model": {
            "$ref": "#/components/schemas/ModelName",
            "description": "Whisper model size. Larger models are slower but more accurate.",
            "default": "large-v3"
          },
          "language": {
            "anyOf": [{"type": "string"}, {"type": "null"}],
            "title": "Language",
            "description": "ISO 639-1 language code (e.g. `en`, `de`). Leave empty for auto-detection."
          },
          "speaker_count": {
            "anyOf": [{"type": "integer", "maximum": 20, "minimum": 1}, {"type": "null"}],
            "title": "Speaker Count",
            "description": "Expected number of speakers. Helps diarization when known."
          },
          "enable_diarization": {
            "type": "boolean",
            "title": "Enable Diarization",
            "description": "Identify and label individual speakers in the transcript.",
            "default": true
          },
          "response_format": {
            "type": "string",
            "pattern": "^(json|verbose_json)$",
            "title": "Response Format",
            "description": "`verbose_json` returns segments with timestamps; `json` returns plain text only.",
            "default": "verbose_json"
          }
        },
        "type": "object",
        "required": ["file"],
        "title": "Body_transcribe_audio_sync_transcribe_sync_post"
      },
      "Body_transcribe_audio_transcribe_post": {
        "properties": {
          "file": {
            "type": "string",
            "contentMediaType": "application/octet-stream",
            "title": "File",
            "description": "Audio file — mp3, wav, ogg, m4a, webm, mp4, etc."
          },
          "model": {
            "$ref": "#/components/schemas/ModelName",
            "description": "Whisper model size. Larger models are slower but more accurate.",
            "default": "large-v3"
          },
          "language": {
            "anyOf": [{"type": "string"}, {"type": "null"}],
            "title": "Language",
            "description": "ISO 639-1 language code (e.g. `en`, `de`). Leave empty for auto-detection."
          },
          "speaker_count": {
            "anyOf": [{"type": "integer", "maximum": 20, "minimum": 1}, {"type": "null"}],
            "title": "Speaker Count",
            "description": "Expected number of speakers. Helps diarization when known."
          },
          "enable_diarization": {
            "type": "boolean",
            "title": "Enable Diarization",
            "description": "Identify and label individual speakers in the transcript.",
            "default": true
          },
          "response_format": {
            "type": "string",
            "pattern": "^(json|verbose_json)$",
            "title": "Response Format",
            "description": "`verbose_json` returns segments with timestamps; `json` returns plain text only.",
            "default": "verbose_json"
          }
        },
        "type": "object",
        "required": ["file"],
        "title": "Body_transcribe_audio_transcribe_post"
      },
      "ErrorResponse": {
        "properties": {
          "error": {"type": "string", "title": "Error"},
          "detail": {
            "anyOf": [{"type": "string"}, {"type": "null"}],
            "title": "Detail"
          },
          "retry_after_seconds": {
            "anyOf": [{"type": "integer"}, {"type": "null"}],
            "title": "Retry After Seconds",
            "description": "Suggested retry delay (for rate limiting)"
          }
        },
        "type": "object",
        "required": ["error"],
        "title": "ErrorResponse",
        "description": "Error response."
      },
      "HTTPValidationError": {
        "properties": {
          "detail": {
            "items": {"$ref": "#/components/schemas/ValidationError"},
            "type": "array",
            "title": "Detail"
          }
        },
        "type": "object",
        "title": "HTTPValidationError"
      },
      "HealthResponse": {
        "properties": {
          "status": {"type": "string", "title": "Status", "default": "healthy"},
          "gpu_available": {"type": "boolean", "title": "Gpu Available"},
          "gpu_name": {
            "anyOf": [{"type": "string"}, {"type": "null"}],
            "title": "Gpu Name"
          },
          "gpu_memory_used_gb": {
            "anyOf": [{"type": "number"}, {"type": "null"}],
            "title": "Gpu Memory Used Gb"
          },
          "gpu_memory_total_gb": {
            "anyOf": [{"type": "number"}, {"type": "null"}],
            "title": "Gpu Memory Total Gb"
          },
          "models_loaded": {
            "items": {"type": "string"},
            "type": "array",
            "title": "Models Loaded"
          },
          "queue": {
            "anyOf": [{"$ref": "#/components/schemas/QueueStatusResponse"}, {"type": "null"}],
            "description": "Queue status (if async mode enabled)"
          }
        },
        "type": "object",
        "required": ["gpu_available"],
        "title": "HealthResponse",
        "description": "Health check response."
      },
      "JobStatus": {
        "type": "string",
        "enum": ["queued", "processing", "completed", "failed", "cancelled"],
        "title": "JobStatus",
        "description": "Transcription job status."
      },
      "JobStatusResponse": {
        "properties": {
          "job_id": {"type": "string", "title": "Job Id"},
          "status": {"$ref": "#/components/schemas/JobStatus"},
          "position": {
            "type": "integer",
            "title": "Position",
            "description": "Position in queue (0 = processing)"
          },
          "created_at": {"type": "number", "title": "Created At"},
          "started_at": {
            "anyOf": [{"type": "number"}, {"type": "null"}],
            "title": "Started At"
          },
          "completed_at": {
            "anyOf": [{"type": "number"}, {"type": "null"}],
            "title": "Completed At"
          },
          "wait_time_seconds": {
            "type": "number",
            "title": "Wait Time Seconds",
            "description": "Time spent waiting"
          },
          "processing_time_seconds": {
            "anyOf": [{"type": "number"}, {"type": "null"}],
            "title": "Processing Time Seconds",
            "description": "Time spent processing (if started)"
          },
          "result": {
            "anyOf": [{"$ref": "#/components/schemas/TranscriptionResult"}, {"type": "null"}],
            "description": "Transcription result (if completed)"
          },
          "error": {
            "anyOf": [{"type": "string"}, {"type": "null"}],
            "title": "Error",
            "description": "Error message (if failed)"
          }
        },
        "type": "object",
        "required": ["job_id", "status", "position", "created_at", "wait_time_seconds", "processing_time_seconds"],
        "title": "JobStatusResponse",
        "description": "Job status response."
      },
      "JobSubmitResponse": {
        "properties": {
          "job_id": {
            "type": "string",
            "title": "Job Id",
            "description": "Unique job identifier"
          },
          "status": {
            "$ref": "#/components/schemas/JobStatus",
            "description": "Current job status"
          },
          "position": {
            "type": "integer",
            "title": "Position",
            "description": "Position in queue (0 = processing)"
          },
          "estimated_wait_seconds": {
            "type": "number",
            "title": "Estimated Wait Seconds",
            "description": "Estimated wait time in seconds"
          },
          "poll_url": {
            "type": "string",
            "title": "Poll Url",
            "description": "URL to poll for job status"
          },
          "poll_interval_seconds": {
            "type": "integer",
            "title": "Poll Interval Seconds",
            "description": "Recommended poll interval"
          }
        },
        "type": "object",
        "required": ["job_id", "status", "position", "estimated_wait_seconds", "poll_url", "poll_interval_seconds"],
        "title": "JobSubmitResponse",
        "description": "Response when submitting a job to the queue."
      },
      "LegacyTranscriptionResponse": {
        "properties": {
          "segments": {
            "items": {"$ref": "#/components/schemas/LegacyTranscriptionSegment"},
            "type": "array",
            "title": "Segments"
          }
        },
        "type": "object",
        "required": ["segments"],
        "title": "LegacyTranscriptionResponse",
        "description": "Legacy response format matching whisper-x-api /api/v1/transcribe."
      },
      "LegacyTranscriptionSegment": {
        "properties": {
          "start": {"type": "number", "title": "Start"},
          "end": {"type": "number", "title": "End"},
          "text": {"type": "string", "title": "Text"},
          "speaker": {
            "anyOf": [{"type": "string"}, {"type": "null"}],
            "title": "Speaker"
          }
        },
        "type": "object",
        "required": ["start", "end", "text"],
        "title": "LegacyTranscriptionSegment",
        "description": "Legacy segment format (start, end, text, speaker only)."
      },
      "ModelInfo": {
        "properties": {
          "name": {"type": "string", "title": "Name"},
          "loaded": {"type": "boolean", "title": "Loaded"},
          "parameters": {"type": "string", "title": "Parameters"},
          "vram_required_gb": {
            "type": "number",
            "title": "Vram Required Gb",
            "description": "Estimated VRAM requirement"
          }
        },
        "type": "object",
        "required": ["name", "loaded", "parameters", "vram_required_gb"],
        "title": "ModelInfo",
        "description": "Information about an available model."
      },
      "ModelName": {
        "type": "string",
        "enum": ["tiny", "base", "small", "medium", "large-v2", "large-v3"],
        "title": "ModelName",
        "description": "Available Whisper models."
      },
      "ModelsResponse": {
        "properties": {
          "models": {
            "items": {"$ref": "#/components/schemas/ModelInfo"},
            "type": "array",
            "title": "Models"
          },
          "default_model": {"type": "string", "title": "Default Model"}
        },
        "type": "object",
        "required": ["models", "default_model"],
        "title": "ModelsResponse",
        "description": "Available models response."
      },
      "QueueStatusResponse": {
        "properties": {
          "pending": {
            "type": "integer",
            "title": "Pending",
            "description": "Jobs waiting in queue"
          },
          "processing": {
            "type": "integer",
            "title": "Processing",
            "description": "Jobs currently processing"
          },
          "max_concurrent": {
            "type": "integer",
            "title": "Max Concurrent",
            "description": "Maximum concurrent jobs"
          },
          "max_queue_depth": {
            "type": "integer",
            "title": "Max Queue Depth",
            "description": "Maximum queue size"
          },
          "queue_available": {
            "type": "integer",
            "title": "Queue Available",
            "description": "Available queue slots"
          },
          "estimated_wait_seconds": {
            "type": "number",
            "title": "Estimated Wait Seconds",
            "description": "Estimated wait for new jobs"
          },
          "total_processed": {
            "type": "integer",
            "title": "Total Processed",
            "description": "Total jobs processed"
          },
          "average_processing_seconds": {
            "type": "number",
            "title": "Average Processing Seconds",
            "description": "Average processing time"
          },
          "vram_used_gb": {
            "type": "number",
            "title": "Vram Used Gb",
            "description": "VRAM currently in use"
          },
          "vram_total_gb": {
            "type": "number",
            "title": "Vram Total Gb",
            "description": "Total GPU VRAM"
          },
          "vram_available_gb": {
            "type": "number",
            "title": "Vram Available Gb",
            "description": "Available VRAM"
          }
        },
        "type": "object",
        "required": [
          "pending",
          "processing",
          "max_concurrent",
          "max_queue_depth",
          "queue_available",
          "estimated_wait_seconds",
          "total_processed",
          "average_processing_seconds",
          "vram_used_gb",
          "vram_total_gb",
          "vram_available_gb"
        ],
        "title": "QueueStatusResponse",
        "description": "Queue status response."
      },
      "TranscriptionResult": {
        "properties": {
          "text": {
            "type": "string",
            "title": "Text",
            "description": "Full transcription text"
          },
          "language": {
            "type": "string",
            "title": "Language",
            "description": "Detected or specified language"
          },
          "duration": {
            "type": "number",
            "title": "Duration",
            "description": "Audio duration in seconds"
          },
          "segments": {
            "items": {"$ref": "#/components/schemas/TranscriptionSegment"},
            "type": "array",
            "title": "Segments"
          },
          "model": {
            "type": "string",
            "title": "Model",
            "description": "Model used for transcription"
          },
          "diarization_enabled": {
            "type": "boolean",
            "title": "Diarization Enabled",
            "default": false
          }
        },
        "type": "object",
        "required": ["text", "language", "duration", "model"],
        "title": "TranscriptionResult",
        "description": "OpenAI-compatible verbose_json response with speaker labels."
      },
      "TranscriptionSegment": {
        "properties": {
          "id": {"type": "integer", "title": "Id"},
          "start": {"type": "number", "title": "Start"},
          "end": {"type": "number", "title": "End"},
          "text": {"type": "string", "title": "Text"},
          "speaker": {
            "anyOf": [{"type": "string"}, {"type": "null"}],
            "title": "Speaker"
          },
          "words": {
            "anyOf": [
              {"items": {"$ref": "#/components/schemas/WordSegment"}, "type": "array"},
              {"type": "null"}
            ],
            "title": "Words"
          }
        },
        "type": "object",
        "required": ["id", "start", "end", "text"],
        "title": "TranscriptionSegment",
        "description": "A segment of transcribed audio."
      },
      "ValidationError": {
        "properties": {
          "loc": {
            "items": {"anyOf": [{"type": "string"}, {"type": "integer"}]},
            "type": "array",
            "title": "Location"
          },
          "msg": {"type": "string", "title": "Message"},
          "type": {"type": "string", "title": "Error Type"},
          "input": {"title": "Input"},
          "ctx": {"type": "object", "title": "Context"}
        },
        "type": "object",
        "required": ["loc", "msg", "type"],
        "title": "ValidationError"
      },
      "WordSegment": {
        "properties": {
          "word": {"type": "string", "title": "Word"},
          "start": {"type": "number", "title": "Start"},
          "end": {"type": "number", "title": "End"},
          "speaker": {
            "anyOf": [{"type": "string"}, {"type": "null"}],
            "title": "Speaker"
          }
        },
        "type": "object",
        "required": ["word", "start", "end"],
        "title": "WordSegment",
        "description": "Word-level timestamp."
      }
    },
    "securitySchemes": {
      "APIKeyHeader": {
        "type": "apiKey",
        "in": "header",
        "name": "X-Api-Key"
      }
    }
  }
}