Serverless GPU transcription with Modal

I want to quickly share how easy it is to set up a serverless GPU function with Modal.

Background: my girlfriend had recorded 3 hours of audio during an interview, and I decided to help her out with some automatic audio transcription.

Things that I like about this setup:

  • simple python script
  • I can send my gf just a link to the service where she can upload her audio files and get back text
  • since it's serverless, I don't have to worry about paying for idle time
  • I can get around the 30-second limits of free online services
  • Modal gives you $30 in free credits per month to play around with

imports

import os
import modal
from fastapi import File, HTTPException, UploadFile
from pydantic import BaseModel

stub and cache creation

stub = modal.Stub("whisper")
stub.image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        "torch",
        "transformers",
    )
    .apt_install("ffmpeg")
    .pip_install("ffmpeg-python")
)
 
CACHE_PATH = "/root/cache"
# persist the volume so the Whisper model weights are only downloaded once
cache_volume = modal.SharedVolume().persist("whisper-cache")

pipeline and inference logic

@stub.cls(
    gpu="T4", # a 16 GB GPU will suffice
    shared_volumes={CACHE_PATH: cache_volume}, # here we reference our cache volume
    container_idle_timeout=120, # how long to keep the container alive between requests
    concurrency_limit=1, # how many GPU containers may run at the same time
    keep_warm=0, # how many containers to keep alive at all times
)
class WhisperPipeline:
    def __enter__(self): # for container lifecycle mgmt
        os.environ['TRANSFORMERS_CACHE'] = CACHE_PATH
 
        # these imports will use the stub's image
        import torch
        from transformers import pipeline
 
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
        # chunk_length_s makes the pipeline split long audio into 30s
        # windows, so we can throw in audio files of arbitrary length
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-tiny",
            chunk_length_s=30,
            device=device,
        )
 
    @modal.method()
    def transcribe(self, file: bytes) -> str:
        return self.pipe(file)["text"]
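
To sanity-check the pipeline before wiring up the web endpoint, you can add a local entrypoint that calls the class directly. This is a minimal sketch, assuming your Modal version provides @stub.local_entrypoint(); the function name and the path argument are just placeholders:

@stub.local_entrypoint()
def main(path: str):
    # read the audio from disk and send the raw bytes to the GPU container
    with open(path, "rb") as f:
        audio = f.read()
    print(WhisperPipeline().transcribe.call(file=audio))

Run it with $ modal run app.py --path interview.mp3 (any audio format ffmpeg can decode should work).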

web endpoint

class TranscribeResponse(BaseModel):
    prediction: str
 
@stub.function()
@modal.web_endpoint(method="POST")
async def entrypoint(file: UploadFile = File(...)):
    try:
        whisper_pipe = WhisperPipeline()
        prediction = whisper_pipe.transcribe.call(file=file.file.read())
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    return TranscribeResponse(prediction=prediction)

You can either serve this temporarily with $ modal serve app.py or deploy it to the cloud with $ modal deploy app.py. Finally, go to the deployment URL and append /docs to see the Swagger UI and try the endpoint out!
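
If you prefer calling it from code instead of the Swagger UI, a few lines of requests will do. This is just a sketch: the URL below is a placeholder for whatever modal deploy prints for your entrypoint, and interview.mp3 is an example file name.

import requests

# the deployment URL printed by modal deploy (placeholder shown here)
URL = "https://your-workspace--whisper-entrypoint.modal.run"

with open("interview.mp3", "rb") as f:
    # the endpoint expects a multipart upload under the "file" field
    response = requests.post(URL, files={"file": f})

response.raise_for_status()
print(response.json()["prediction"])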
