Containerized text-to-speech engine based on Facebook's Massively Multilingual Speech model with a web API
Jacek Kowalski
2024-08-01 71e49cf0d406c0c7fc6abd8d5b682dc10114e9b0
import io

import fastapi
import pydantic
import scipy
import scipy.io.wavfile  # `import scipy` alone does not guarantee the io.wavfile submodule is loaded
import torch
import transformers

from config import *
 
tokenizers = {}
models = {}
 
for lang in LanguageModel:
    tokenizers[lang.value] = transformers.AutoTokenizer.from_pretrained('./data/tokenizer/facebook/mms-tts-' + lang.name)
    models[lang.value] = transformers.VitsModel.from_pretrained('./data/model/facebook/mms-tts-' + lang.name)
 
 
class SynthesizeRequest(pydantic.BaseModel):
    language: LanguageModel
    text: str
 
class SynthesizeResponse(fastapi.Response):
    media_type = 'audio/wav'
 
 
app = fastapi.FastAPI()
 
 
@app.post('/synthesize', response_class=SynthesizeResponse)
async def synthesize(request: SynthesizeRequest) -> SynthesizeResponse:
    inputs = tokenizers[request.language.value](request.text, return_tensors='pt')
    model = models[request.language.value]
    with torch.no_grad():
        output = model(**inputs).waveform
 
    with io.BytesIO() as fp:
        scipy.io.wavfile.write(fp, rate=model.config.sampling_rate, data=output.float().numpy().T)
        return SynthesizeResponse(content = fp.getvalue())