Serverless GPU transcription with Modal
I want to quickly share how easy it is to set up a serverless GPU function with Modal.
Background: my girlfriend had recorded 3 hours of audio during an interview, and I decided to help her out with some automatic audio transcription.
Things that I like about this setup:
- simple Python script
- I can send my girlfriend just a link to the service where she can upload her audio files and get back text
- since it's serverless, I don't have to worry about paying for idle time
- I can get around the 30-second limits of free online services
- Modal gives you $30 of free credits per month to play around with
imports
import os
import modal
from fastapi import File, HTTPException, UploadFile
from pydantic import BaseModel
stub and cache creation
stub = modal.Stub("whisper")
stub.image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        "torch",
        "transformers",
    )
    .apt_install("ffmpeg")
    .pip_install("ffmpeg-python")
)
CACHE_PATH = "/root/cache"
cache_volume = modal.SharedVolume().persist("whisper-cache")
pipeline and inference logic
@stub.cls(
    gpu="T4",  # a 16 GB GPU will suffice
    shared_volumes={CACHE_PATH: cache_volume},  # here we mount our cache volume
    container_idle_timeout=120,  # how long to keep the container alive between requests
    concurrency_limit=1,  # how many GPUs may be provisioned at the same time
    keep_warm=0,  # how many containers to keep alive at all times
)
class WhisperPipeline:
    def __enter__(self):  # for container lifecycle management
        os.environ["TRANSFORMERS_CACHE"] = CACHE_PATH
        # these imports will use the stub's image
        import torch
        from transformers import pipeline

        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        # by specifying chunk_length_s, we can pass in audio files of
        # arbitrary length; they get transcribed in 30-second chunks
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-tiny",
            chunk_length_s=30,
            device=device,
        )

    @modal.method()
    def transcribe(self, file: bytes) -> str:
        return self.pipe(file)["text"]
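optional: testing from the terminal
Before adding the web endpoint, you can sanity-check the class from your terminal. This is just a sketch on top of the script above: the local entrypoint and its --path argument are my additions, not part of the original app.
@stub.local_entrypoint()
def main(path: str):
    # read a local audio file and send the raw bytes to the remote GPU container
    with open(path, "rb") as f:
        audio_bytes = f.read()
    print(WhisperPipeline().transcribe.call(file=audio_bytes))
Run it with $ modal run app.py --path interview.mp3 (the file name is just an example).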
web endpoint
class TranscribeResponse(BaseModel):
    prediction: str

@stub.function()
@modal.web_endpoint(method="POST")
async def entrypoint(file: UploadFile = File(...)):
    try:
        whisper_pipe = WhisperPipeline()
        prediction = whisper_pipe.transcribe.call(file=file.file.read())
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    return TranscribeResponse(prediction=prediction)
You can either serve this temporarily with $ modal serve app.py or deploy it to the cloud with $ modal deploy app.py.
Finally, go to the deployment URL and add /docs to see the Swagger API and try it out!
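If you'd rather call the endpoint from code than from the Swagger UI, a tiny client script does the trick. This is just a sketch using the requests library; the URL below is a placeholder for whatever modal deploy prints for your app, and interview.mp3 is an example file name.
import requests

# placeholder URL: use the one printed by `modal deploy` / `modal serve`
url = "https://your-workspace--whisper-entrypoint.modal.run"

with open("interview.mp3", "rb") as f:
    # the multipart field name must match the endpoint's `file` parameter
    response = requests.post(url, files={"file": f})

print(response.json()["prediction"])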