Unverified commit ac4bb5c3 authored by Charlie.Wei, committed by GitHub

Add tongyi tts&tts function optimization (#2177)

Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
parent a96cae4f
import uuid
import hashlib
import subprocess
from abc import abstractmethod
from typing import Optional
from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.entities.model_entities import ModelType, ModelPropertyKey
from core.model_runtime.model_providers.__base.ai_model import AIModel


class TTSModel(AIModel):
@@ -40,3 +45,96 @@ class TTSModel(AIModel):
:return: translated audio file
"""
raise NotImplementedError
def _get_model_voice(self, model: str, credentials: dict) -> any:
"""
Get voice for given tts model
:param model: model name
:param credentials: model credentials
:return: voice
"""
model_schema = self.get_model_schema(model, credentials)
if model_schema and ModelPropertyKey.DEFAULT_VOICE in model_schema.model_properties:
return model_schema.model_properties[ModelPropertyKey.DEFAULT_VOICE]
def _get_model_audio_type(self, model: str, credentials: dict) -> str:
"""
Get audio type for given tts model
:param model: model name
:param credentials: model credentials
:return: audio type
"""
model_schema = self.get_model_schema(model, credentials)
if model_schema and ModelPropertyKey.AUDOI_TYPE in model_schema.model_properties:
return model_schema.model_properties[ModelPropertyKey.AUDOI_TYPE]
def _get_model_word_limit(self, model: str, credentials: dict) -> int:
"""
Get word limit for given tts model
:return: word limit
"""
model_schema = self.get_model_schema(model, credentials)
if model_schema and ModelPropertyKey.WORD_LIMIT in model_schema.model_properties:
return model_schema.model_properties[ModelPropertyKey.WORD_LIMIT]
def _get_model_workers_limit(self, model: str, credentials: dict) -> int:
"""
Get max workers for given tts model
:return: max workers
"""
model_schema = self.get_model_schema(model, credentials)
if model_schema and ModelPropertyKey.MAX_WORKERS in model_schema.model_properties:
return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
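# Illustrative shape of the schema these four getters read (hypothetical values;
# real ones come from the per-model YAML shown further below):
#
#   model_schema.model_properties == {
#       ModelPropertyKey.DEFAULT_VOICE: 'sambert-zhiru-v1',
#       ModelPropertyKey.AUDOI_TYPE: 'mp3',
#       ModelPropertyKey.WORD_LIMIT: 120,
#       ModelPropertyKey.MAX_WORKERS: 5,
#   }
#
# Each getter returns None when its key is absent, so callers should treat the
# values as optional.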
@staticmethod
def _split_text_into_sentences(text: str, limit: int, delimiters=None):
if delimiters is None:
delimiters = set('。!?;\n')
buf = []
word_count = 0
for char in text:
buf.append(char)
if char in delimiters:
if word_count >= limit:
yield ''.join(buf)
buf = []
word_count = 0
else:
word_count += 1
else:
word_count += 1
if buf:
yield ''.join(buf)
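# Illustrative behavior of the splitter above (not part of the diff): a chunk is
# flushed at the first delimiter reached after at least `limit` characters, so
# short clauses get merged together.
#
#   list(TTSModel._split_text_into_sentences('你好。今天天气不错。我们去散步吧!', limit=5))
#   # -> ['你好。今天天气不错。', '我们去散步吧!']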
@staticmethod
def _is_ffmpeg_installed():
try:
output = subprocess.check_output("ffmpeg -version", shell=True)
if "ffmpeg version" in output.decode("utf-8"):
return True
else:
raise InvokeBadRequestError("ffmpeg is not installed, "
"details: https://docs.dify.ai/getting-started/install-self-hosted"
"/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
except Exception:
raise InvokeBadRequestError("ffmpeg is not installed, "
"details: https://docs.dify.ai/getting-started/install-self-hosted"
"/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
# Todo: To improve the streaming function
@staticmethod
def _get_file_name(file_content: str) -> str:
hash_object = hashlib.sha256(file_content.encode())
hex_digest = hash_object.hexdigest()
namespace_uuid = uuid.UUID('a5da6ef9-b303-596f-8e88-bf8fa40f4b31')
unique_uuid = uuid.uuid5(namespace_uuid, hex_digest)
return str(unique_uuid)
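# Standalone sketch of the deterministic naming scheme used by _get_file_name
# (illustrative; `demo_file_name` is a hypothetical helper, not part of the diff).
# Identical text always maps to the same UUID, which makes generated audio cacheable.
import hashlib
import uuid

def demo_file_name(file_content: str) -> str:
    hex_digest = hashlib.sha256(file_content.encode()).hexdigest()
    namespace_uuid = uuid.UUID('a5da6ef9-b303-596f-8e88-bf8fa40f4b31')
    return str(uuid.uuid5(namespace_uuid, hex_digest))

assert demo_file_name('Hello world!') == demo_file_name('Hello world!')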
import uuid
import hashlib
import subprocess
from io import BytesIO
from typing import Optional
from functools import reduce
from pydub import AudioSegment
from core.model_runtime.entities.model_entities import ModelPropertyKey
from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.model_providers.__base.tts_model import TTSModel
from core.model_runtime.model_providers.openai._common import _CommonOpenAI
from typing_extensions import Literal
from flask import Response, stream_with_context
from openai import OpenAI
import concurrent.futures
@@ -22,9 +17,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
""" """
Model class for OpenAI Speech to text model. Model class for OpenAI Speech to text model.
""" """
def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
""" """
_invoke text2speech model _invoke text2speech model
@@ -65,7 +58,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
except Exception as ex:
    raise CredentialsValidateFailedError(str(ex))
def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
""" """
_tts_invoke text2speech model _tts_invoke text2speech model
@@ -104,8 +97,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
raise InvokeBadRequestError(str(ex))
# Todo: To improve the streaming function
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
""" """
_tts_invoke_streaming text2speech model _tts_invoke_streaming text2speech model
@@ -131,84 +123,6 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
except Exception as ex:
    raise InvokeBadRequestError(str(ex))
def _get_model_voice(self, model: str, credentials: dict) -> Literal[
"alloy", "echo", "fable", "onyx", "nova", "shimmer"]:
"""
Get voice for given tts model
:param model: model name
:param credentials: model credentials
:return: voice
"""
model_schema = self.get_model_schema(model, credentials)
if model_schema and ModelPropertyKey.DEFAULT_VOICE in model_schema.model_properties:
return model_schema.model_properties[ModelPropertyKey.DEFAULT_VOICE]
def _get_model_audio_type(self, model: str, credentials: dict) -> str:
"""
Get audio type for given tts model
:param model: model name
:param credentials: model credentials
:return: audio type
"""
model_schema = self.get_model_schema(model, credentials)
if model_schema and ModelPropertyKey.AUDOI_TYPE in model_schema.model_properties:
return model_schema.model_properties[ModelPropertyKey.AUDOI_TYPE]
def _get_model_word_limit(self, model: str, credentials: dict) -> int:
"""
Get word limit for given tts model
:return: word limit
"""
model_schema = self.get_model_schema(model, credentials)
if model_schema and ModelPropertyKey.WORD_LIMIT in model_schema.model_properties:
return model_schema.model_properties[ModelPropertyKey.WORD_LIMIT]
def _get_model_workers_limit(self, model: str, credentials: dict) -> int:
"""
Get max workers for given tts model
:return: max workers
"""
model_schema = self.get_model_schema(model, credentials)
if model_schema and ModelPropertyKey.MAX_WORKERS in model_schema.model_properties:
return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
@staticmethod
def _split_text_into_sentences(text: str, limit: int, delimiters=None):
if delimiters is None:
delimiters = set('。!?;\n')
buf = []
word_count = 0
for char in text:
buf.append(char)
if char in delimiters:
if word_count >= limit:
yield ''.join(buf)
buf = []
word_count = 0
else:
word_count += 1
else:
word_count += 1
if buf:
yield ''.join(buf)
@staticmethod
def _get_file_name(file_content: str) -> str:
hash_object = hashlib.sha256(file_content.encode())
hex_digest = hash_object.hexdigest()
namespace_uuid = uuid.UUID('a5da6ef9-b303-596f-8e88-bf8fa40f4b31')
unique_uuid = uuid.uuid5(namespace_uuid, hex_digest)
return str(unique_uuid)
def _process_sentence(self, sentence: str, model: str, credentials: dict):
""" """
_tts_invoke openai text2speech model api _tts_invoke openai text2speech model api
@@ -226,18 +140,3 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
if isinstance(response.read(), bytes):
    return response.read()
@staticmethod
def _is_ffmpeg_installed():
try:
output = subprocess.check_output("ffmpeg -version", shell=True)
if "ffmpeg version" in output.decode("utf-8"):
return True
else:
raise InvokeBadRequestError("ffmpeg is not installed, "
"details: https://docs.dify.ai/getting-started/install-self-hosted"
"/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
except Exception:
raise InvokeBadRequestError("ffmpeg is not installed, "
"details: https://docs.dify.ai/getting-started/install-self-hosted"
"/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
from core.model_runtime.errors.invoke import InvokeError
class _CommonTongyi:
@staticmethod
def _to_credential_kwargs(credentials: dict) -> dict:
credentials_kwargs = {
"dashscope_api_key": credentials['dashscope_api_key'],
}
return credentials_kwargs
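# Illustrative usage (the credentials dict shape is defined by the provider's
# credential schema; the key below is a placeholder, not a real secret):
#
#   kwargs = _CommonTongyi._to_credential_kwargs({'dashscope_api_key': 'sk-...'})
#   # kwargs == {'dashscope_api_key': 'sk-...'}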
@property
def _invoke_error_mapping(self) -> dict[type[InvokeError], list[type[Exception]]]:
"""
Map model invoke error to unified error
The key is the error type thrown to the caller
The value is the error type thrown by the model,
which needs to be converted into a unified error type for the caller.
:return: Invoke error mapping
"""
pass
@@ -16,6 +16,7 @@ help:
en_US: https://dashscope.console.aliyun.com/api-key_management
supported_model_types:
- llm
- tts
configurate_methods:
- predefined-model
provider_credential_schema:
......
model: tts-1
model_type: tts
model_properties:
default_voice: 'sambert-zhiru-v1'  # for available voices, see https://help.aliyun.com/zh/dashscope/model-list
word_limit: 120
audio_type: 'mp3'
max_workers: 5
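# How these properties surface at runtime through the TTSModel getters
# (illustrative mapping, assuming the schema loader exposes them via ModelPropertyKey):
#
#   _get_model_voice(model, credentials)         -> 'sambert-zhiru-v1'
#   _get_model_word_limit(model, credentials)    -> 120
#   _get_model_audio_type(model, credentials)    -> 'mp3'
#   _get_model_workers_limit(model, credentials) -> 5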
from io import BytesIO
from typing import Optional
from functools import reduce
from pydub import AudioSegment
from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.model_providers.__base.tts_model import TTSModel
from core.model_runtime.model_providers.tongyi._common import _CommonTongyi
import dashscope
from flask import Response, stream_with_context
import concurrent.futures
class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
"""
Model class for Tongyi text-to-speech model.
"""
def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
"""
_invoke text2speech model
:param model: model name
:param credentials: model credentials
:param content_text: text content to be translated
:param streaming: output is streaming
:param user: unique user id
:return: text translated to audio file
"""
self._is_ffmpeg_installed()
audio_type = self._get_model_audio_type(model, credentials)
if streaming:
return Response(stream_with_context(self._tts_invoke_streaming(model=model,
credentials=credentials,
content_text=content_text,
user=user)),
status=200, mimetype=f'audio/{audio_type}')
else:
return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user)
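# When streaming=True, the generator from _tts_invoke_streaming is wrapped in a
# Flask streaming response; a caller would consume it as chunked audio, e.g.
# (illustrative, with hypothetical credentials):
#
#   resp = tts_model._invoke(model='tts-1', credentials={'dashscope_api_key': 'sk-...'},
#                            content_text='Hello world!', streaming=True)
#   # resp.mimetype == 'audio/mp3'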
def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
"""
validate credentials text2speech model
:param model: model name
:param credentials: model credentials
:param user: unique user id
:return: None
"""
try:
self._tts_invoke(
model=model,
credentials=credentials,
content_text='Hello world!',
user=user
)
except Exception as ex:
raise CredentialsValidateFailedError(str(ex))
def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
"""
_tts_invoke text2speech model
:param model: model name
:param credentials: model credentials
:param content_text: text content to be translated
:param user: unique user id
:return: text translated to audio file
"""
audio_type = self._get_model_audio_type(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
max_workers = self._get_model_workers_limit(model, credentials)
try:
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
audio_bytes_list = list()
# Create a thread pool and map the function to the list of sentences
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(self._process_sentence, model=model, sentence=sentence,
credentials=credentials, audio_type=audio_type) for sentence in sentences]
for future in futures:
try:
audio_bytes_list.append(future.result())
except Exception as ex:
raise InvokeBadRequestError(str(ex))
audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
audio_bytes_list if audio_bytes]
combined_segment = reduce(lambda x, y: x + y, audio_segments)
buffer: BytesIO = BytesIO()
combined_segment.export(buffer, format=audio_type)
buffer.seek(0)
return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
except Exception as ex:
raise InvokeBadRequestError(str(ex))
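# pydub overloads "+" as concatenation, so the reduce(...) above is equivalent to:
#
#   combined_segment = audio_segments[0]
#   for segment in audio_segments[1:]:
#       combined_segment += segment
#
# Decoding and re-encoding the audio chunks is what requires ffmpeg, hence the
# _is_ffmpeg_installed() check in _invoke.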
# Todo: To improve the streaming function
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
"""
_tts_invoke_streaming text2speech model
:param model: model name
:param credentials: model credentials
:param content_text: text content to be translated
:param user: unique user id
:return: text translated to audio file
"""
# transform credentials to kwargs for model instance
dashscope.api_key = credentials.get('dashscope_api_key')
voice_name = self._get_model_voice(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
audio_type = self._get_model_audio_type(model, credentials)
try:
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
for sentence in sentences:
response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000,
                                                      text=sentence.strip(), format=audio_type,
                                                      word_timestamp_enabled=True,
                                                      phoneme_timestamp_enabled=True)
# yield (rather than return) each chunk so stream_with_context can emit every
# sentence's audio instead of stopping after the first one
if isinstance(response.get_audio_data(), bytes):
    yield response.get_audio_data()
except Exception as ex:
raise InvokeBadRequestError(str(ex))
def _process_sentence(self, sentence: str, model: str, credentials: dict, audio_type: str):
"""
_tts_invoke Tongyi text2speech model api
:param model: model name
:param credentials: model credentials
:param sentence: text content to be translated
:param audio_type: audio file type
:return: text translated to audio file
"""
# transform credentials to kwargs for model instance
dashscope.api_key = credentials.get('dashscope_api_key')
voice_name = self._get_model_voice(model, credentials)
response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(), format=audio_type)
if isinstance(response.get_audio_data(), bytes):
return response.get_audio_data()
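# Standalone sketch of the underlying DashScope call used above (illustrative;
# requires the `dashscope` package and a valid API key -- the key and output path
# here are placeholders):
import dashscope

dashscope.api_key = 'sk-...'  # placeholder
result = dashscope.audio.tts.SpeechSynthesizer.call(model='sambert-zhiru-v1', sample_rate=48000,
                                                    text='Hello world!', format='mp3')
audio = result.get_audio_data()  # bytes on success, None on failure
if isinstance(audio, bytes):
    with open('hello.mp3', 'wb') as f:
        f.write(audio)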
@@ -495,7 +495,7 @@ The text generation application offers non-session support and is ideal for tran
/>
<Row>
<Col>
Text to speech.
### Request Body
......
@@ -458,7 +458,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty } from '../md.tsx'
/>
<Row>
<Col>
Text to speech.
### Request Body
......
@@ -845,7 +845,7 @@ Chat applications support session persistence, allowing previous chat history to
/>
<Row>
<Col>
Text to speech.
### Request Body
......
@@ -917,7 +917,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty } from '../md.tsx'
/>
<Row>
<Col>
Text to speech.
### Request Body
......