Containerized text-to-speech engine based on Coqui's TTS with a web API
Jacek Kowalski
2024-08-01 564abd4b587604941dc7d0c7723abd25c9822e1b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import enum
import io
 
import fastapi
import pydantic
import TTS.api
 
import config
 
# Registry of ready-to-use TTS engines, keyed by the identifiers found in
# config.models. Every configured entry is instantiated once, at import time,
# and moved to the CPU via .to("cpu").
#
# Built with a comprehension so that no loop variables are left behind as
# module globals (the previous loop bound a module-level `id`, shadowing the
# builtin of the same name for the rest of the file).
models = {
    model_id: TTS.api.TTS(model_spec).to("cpu")
    for model_id, model_spec in config.models.items()
}
 
 
# Enum of selectable models, generated from the loaded registry: each member's
# name and value are both the registry key, so `member.value` can be used
# directly to index `models`.
LanguageModel = enum.Enum(
    'LanguageModel',
    {language_id: language_id for language_id in models},
)
 
 
class SynthesizeRequest(pydantic.BaseModel):
    # Request body accepted by the /synthesize endpoint.

    # Which loaded engine to use; LanguageModel members mirror the keys of
    # the module-level `models` registry.
    language: LanguageModel
    # The text that should be turned into speech.
    text: str
 
 
class SynthesizeResponse(fastapi.Response):
    # Response type used by the /synthesize endpoint. It only overrides the
    # media type, so the returned bytes are labelled as WAV audio.
    media_type = 'audio/wav'
 
 
# The ASGI application object that the route below is registered on.
app = fastapi.FastAPI()


# POST /synthesize
#
# Looks up the engine selected by `request.language`, asks it to render
# `request.text` into an in-memory buffer, and returns the buffer's bytes as
# an audio/wav response.
#
# Documented with comments rather than a docstring on purpose: FastAPI
# publishes a route function's docstring as the operation description in the
# generated OpenAPI schema, and this keeps that schema unchanged.
#
# NOTE(review): tts_to_file() is called synchronously (no await) inside an
# async handler, so other requests on the same event loop wait until it
# returns. Confirm that serialising synthesis like this is intended.
@app.post('/synthesize', response_class=SynthesizeResponse)
async def synthesize(request: SynthesizeRequest) -> SynthesizeResponse:
    engine = models[request.language.value]
    with io.BytesIO() as wav_buffer:
        # The engine writes to the file-like object instead of a path on disk.
        engine.tts_to_file(request.text, file_path=wav_buffer)
        # Copy the bytes out before the buffer is closed by the `with` block.
        wav_bytes = wav_buffer.getvalue()
    return SynthesizeResponse(content=wav_bytes)