Unverified commit a03a92e9 authored by zxhlyh, committed by GitHub

Feat/chat support voice input (#532)

parent feebb5dd
...@@ -147,3 +147,5 @@ docker/volumes/weaviate/*
sdks/python-client/build
sdks/python-client/dist
sdks/python-client/dify_client.egg-info
.vscode/
...@@ -9,7 +9,7 @@ api = ExternalApi(bp)
from . import setup, version, apikey, admin
# Import app controllers
from .app import app, site, completion, model_config, statistic, conversation, message, generator, audio
# Import auth controllers
from .auth import login, oauth, data_source_oauth
...@@ -21,4 +21,4 @@ from .datasets import datasets, datasets_document, datasets_segments, file, hit_
from .workspace import workspace, members, providers, account
# Import explore controllers
from .explore import installed_app, recommended_app, completion, conversation, message, parameter, saved_message, audio
...@@ -22,6 +22,7 @@ model_config_fields = {
'opening_statement': fields.String,
'suggested_questions': fields.Raw(attribute='suggested_questions_list'),
'suggested_questions_after_answer': fields.Raw(attribute='suggested_questions_after_answer_dict'),
'speech_to_text': fields.Raw(attribute='speech_to_text_dict'),
'more_like_this': fields.Raw(attribute='more_like_this_dict'),
'model': fields.Raw(attribute='model_dict'),
'user_input_form': fields.Raw(attribute='user_input_form_list'),
...@@ -144,6 +145,7 @@ class AppListApi(Resource):
opening_statement=model_configuration['opening_statement'],
suggested_questions=json.dumps(model_configuration['suggested_questions']),
suggested_questions_after_answer=json.dumps(model_configuration['suggested_questions_after_answer']),
speech_to_text=json.dumps(model_configuration['speech_to_text']),
more_like_this=json.dumps(model_configuration['more_like_this']),
model=json.dumps(model_configuration['model']),
user_input_form=json.dumps(model_configuration['user_input_form']),
...@@ -434,6 +436,7 @@ class AppCopy(Resource):
opening_statement=app_config.opening_statement,
suggested_questions=app_config.suggested_questions,
suggested_questions_after_answer=app_config.suggested_questions_after_answer,
speech_to_text=app_config.speech_to_text,
more_like_this=app_config.more_like_this,
model=app_config.model,
user_input_form=app_config.user_input_form,
...
# -*- coding:utf-8 -*-
import logging
from flask import request
from flask_login import login_required
from werkzeug.exceptions import InternalServerError, NotFound
import services
from controllers.console import api
from controllers.console.app import _get_app
from controllers.console.app.error import AppUnavailableError, \
ProviderNotInitializeError, CompletionRequestError, ProviderQuotaExceededError, \
ProviderModelCurrentlyNotSupportError, NoAudioUploadedError, AudioTooLargeError, \
UnsupportedAudioTypeError, ProviderNotSupportSpeechToTextError
from controllers.console.setup import setup_required
from controllers.console.wraps import account_initialization_required
from core.llm.error import LLMBadRequestError, LLMAPIUnavailableError, LLMAuthorizationError, LLMAPIConnectionError, \
LLMRateLimitError, ProviderTokenNotInitError, QuotaExceededError, ModelCurrentlyNotSupportError
from flask_restful import Resource
from services.audio_service import AudioService
from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServiceError, \
UnsupportedAudioTypeServiceError, ProviderNotSupportSpeechToTextServiceError
class ChatMessageAudioApi(Resource):
@setup_required
@login_required
@account_initialization_required
def post(self, app_id):
app_id = str(app_id)
app_model = _get_app(app_id, 'chat')
file = request.files['file']
try:
response = AudioService.transcript(
tenant_id=app_model.tenant_id,
file=file,
)
return response
except services.errors.app_model_config.AppModelConfigBrokenError:
logging.exception("App model config broken.")
raise AppUnavailableError()
except NoAudioUploadedServiceError:
raise NoAudioUploadedError()
except AudioTooLargeServiceError as e:
raise AudioTooLargeError(str(e))
except UnsupportedAudioTypeServiceError:
raise UnsupportedAudioTypeError()
except ProviderNotSupportSpeechToTextServiceError:
raise ProviderNotSupportSpeechToTextError()
except ProviderTokenNotInitError:
raise ProviderNotInitializeError()
except QuotaExceededError:
raise ProviderQuotaExceededError()
except ModelCurrentlyNotSupportError:
raise ProviderModelCurrentlyNotSupportError()
except (LLMBadRequestError, LLMAPIConnectionError, LLMAPIUnavailableError,
LLMRateLimitError, LLMAuthorizationError) as e:
raise CompletionRequestError(str(e))
except ValueError as e:
raise e
except Exception as e:
logging.exception("internal server error.")
raise InternalServerError()
api.add_resource(ChatMessageAudioApi, '/apps/<uuid:app_id>/audio-to-text')
...@@ -49,3 +49,27 @@ class AppMoreLikeThisDisabledError(BaseHTTPException):
error_code = 'app_more_like_this_disabled'
description = "The 'More like this' feature is disabled. Please refresh your page."
code = 403
class NoAudioUploadedError(BaseHTTPException):
error_code = 'no_audio_uploaded'
description = "Please upload your audio."
code = 400
class AudioTooLargeError(BaseHTTPException):
error_code = 'audio_too_large'
description = "Audio size exceeded. {message}"
code = 413
class UnsupportedAudioTypeError(BaseHTTPException):
error_code = 'unsupported_audio_type'
description = "Audio type not allowed."
code = 415
class ProviderNotSupportSpeechToTextError(BaseHTTPException):
error_code = 'provider_not_support_speech_to_text'
description = "Provider does not support speech to text."
code = 400
...@@ -41,6 +41,7 @@ class ModelConfigResource(Resource):
opening_statement=model_configuration['opening_statement'],
suggested_questions=json.dumps(model_configuration['suggested_questions']),
suggested_questions_after_answer=json.dumps(model_configuration['suggested_questions_after_answer']),
speech_to_text=json.dumps(model_configuration['speech_to_text']),
more_like_this=json.dumps(model_configuration['more_like_this']),
model=json.dumps(model_configuration['model']),
user_input_form=json.dumps(model_configuration['user_input_form']),
...
# -*- coding:utf-8 -*-
import logging
from flask import request
from werkzeug.exceptions import InternalServerError
import services
from controllers.console import api
from controllers.console.app.error import AppUnavailableError, ProviderNotInitializeError, \
ProviderQuotaExceededError, ProviderModelCurrentlyNotSupportError, CompletionRequestError, \
NoAudioUploadedError, AudioTooLargeError, \
UnsupportedAudioTypeError, ProviderNotSupportSpeechToTextError
from controllers.console.explore.wraps import InstalledAppResource
from core.llm.error import LLMBadRequestError, LLMAPIUnavailableError, LLMAuthorizationError, LLMAPIConnectionError, \
LLMRateLimitError, ProviderTokenNotInitError, QuotaExceededError, ModelCurrentlyNotSupportError
from services.audio_service import AudioService
from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServiceError, \
UnsupportedAudioTypeServiceError, ProviderNotSupportSpeechToTextServiceError
from models.model import AppModelConfig
class ChatAudioApi(InstalledAppResource):
def post(self, installed_app):
app_model = installed_app.app
app_model_config: AppModelConfig = app_model.app_model_config
if not app_model_config.speech_to_text_dict['enabled']:
raise AppUnavailableError()
file = request.files['file']
try:
response = AudioService.transcript(
tenant_id=app_model.tenant_id,
file=file,
)
return response
except services.errors.app_model_config.AppModelConfigBrokenError:
logging.exception("App model config broken.")
raise AppUnavailableError()
except NoAudioUploadedServiceError:
raise NoAudioUploadedError()
except AudioTooLargeServiceError as e:
raise AudioTooLargeError(str(e))
except UnsupportedAudioTypeServiceError:
raise UnsupportedAudioTypeError()
except ProviderNotSupportSpeechToTextServiceError:
raise ProviderNotSupportSpeechToTextError()
except ProviderTokenNotInitError:
raise ProviderNotInitializeError()
except QuotaExceededError:
raise ProviderQuotaExceededError()
except ModelCurrentlyNotSupportError:
raise ProviderModelCurrentlyNotSupportError()
except (LLMBadRequestError, LLMAPIConnectionError, LLMAPIUnavailableError,
LLMRateLimitError, LLMAuthorizationError) as e:
raise CompletionRequestError(str(e))
except ValueError as e:
raise e
except Exception as e:
logging.exception("internal server error.")
raise InternalServerError()
api.add_resource(ChatAudioApi, '/installed-apps/<uuid:installed_app_id>/audio-to-text', endpoint='installed_app_audio')
...@@ -21,6 +21,7 @@ class AppParameterApi(InstalledAppResource):
'opening_statement': fields.String,
'suggested_questions': fields.Raw,
'suggested_questions_after_answer': fields.Raw,
'speech_to_text': fields.Raw,
'more_like_this': fields.Raw,
'user_input_form': fields.Raw,
}
...@@ -35,6 +36,7 @@ class AppParameterApi(InstalledAppResource):
'opening_statement': app_model_config.opening_statement,
'suggested_questions': app_model_config.suggested_questions_list,
'suggested_questions_after_answer': app_model_config.suggested_questions_after_answer_dict,
'speech_to_text': app_model_config.speech_to_text_dict,
'more_like_this': app_model_config.more_like_this_dict,
'user_input_form': app_model_config.user_input_form_list
}
...
...@@ -7,6 +7,6 @@ bp = Blueprint('service_api', __name__, url_prefix='/v1')
api = ExternalApi(bp)
from .app import completion, app, conversation, message, audio
from .dataset import document
...@@ -22,6 +22,7 @@ class AppParameterApi(AppApiResource):
'opening_statement': fields.String,
'suggested_questions': fields.Raw,
'suggested_questions_after_answer': fields.Raw,
'speech_to_text': fields.Raw,
'more_like_this': fields.Raw,
'user_input_form': fields.Raw,
}
...@@ -35,6 +36,7 @@ class AppParameterApi(AppApiResource):
'opening_statement': app_model_config.opening_statement,
'suggested_questions': app_model_config.suggested_questions_list,
'suggested_questions_after_answer': app_model_config.suggested_questions_after_answer_dict,
'speech_to_text': app_model_config.speech_to_text_dict,
'more_like_this': app_model_config.more_like_this_dict,
'user_input_form': app_model_config.user_input_form_list
}
...
import logging
from flask import request
from werkzeug.exceptions import InternalServerError
import services
from controllers.service_api import api
from controllers.service_api.app.error import AppUnavailableError, ProviderNotInitializeError, CompletionRequestError, ProviderQuotaExceededError, \
ProviderModelCurrentlyNotSupportError, NoAudioUploadedError, AudioTooLargeError, UnsupportedAudioTypeError, \
ProviderNotSupportSpeechToTextError
from controllers.service_api.wraps import AppApiResource
from core.llm.error import LLMBadRequestError, LLMAuthorizationError, LLMAPIUnavailableError, LLMAPIConnectionError, \
LLMRateLimitError, ProviderTokenNotInitError, QuotaExceededError, ModelCurrentlyNotSupportError
from models.model import App, AppModelConfig
from services.audio_service import AudioService
from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServiceError, \
UnsupportedAudioTypeServiceError, ProviderNotSupportSpeechToTextServiceError
class AudioApi(AppApiResource):
def post(self, app_model: App, end_user):
app_model_config: AppModelConfig = app_model.app_model_config
if not app_model_config.speech_to_text_dict['enabled']:
raise AppUnavailableError()
file = request.files['file']
try:
response = AudioService.transcript(
tenant_id=app_model.tenant_id,
file=file,
)
return response
except services.errors.app_model_config.AppModelConfigBrokenError:
logging.exception("App model config broken.")
raise AppUnavailableError()
except NoAudioUploadedServiceError:
raise NoAudioUploadedError()
except AudioTooLargeServiceError as e:
raise AudioTooLargeError(str(e))
except UnsupportedAudioTypeServiceError:
raise UnsupportedAudioTypeError()
except ProviderNotSupportSpeechToTextServiceError:
raise ProviderNotSupportSpeechToTextError()
except ProviderTokenNotInitError:
raise ProviderNotInitializeError()
except QuotaExceededError:
raise ProviderQuotaExceededError()
except ModelCurrentlyNotSupportError:
raise ProviderModelCurrentlyNotSupportError()
except (LLMBadRequestError, LLMAPIConnectionError, LLMAPIUnavailableError,
LLMRateLimitError, LLMAuthorizationError) as e:
raise CompletionRequestError(str(e))
except ValueError as e:
raise e
except Exception as e:
logging.exception("internal server error.")
raise InternalServerError()
api.add_resource(AudioApi, '/audio-to-text')
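For reference, a minimal client sketch for the endpoint registered above. The service API blueprint mounts under /v1, so the full path is /v1/audio-to-text; the base URL, app API key, and bearer-token auth scheme are assumptions about the deployment, not part of this diff.

import requests

API_BASE = 'https://api.example.com/v1'  # hypothetical deployment URL
APP_API_KEY = 'app-xxxxxxxx'             # hypothetical app API key

def audio_to_text(path: str) -> dict:
    # The controller reads the upload from request.files['file'],
    # so the multipart field name must be 'file'.
    with open(path, 'rb') as f:
        resp = requests.post(
            f'{API_BASE}/audio-to-text',
            headers={'Authorization': f'Bearer {APP_API_KEY}'},
            # mimetype must be 'audio/<ext>' for one of the allowed extensions
            files={'file': ('voice.wav', f, 'audio/wav')},
        )
    resp.raise_for_status()
    return resp.json()  # OpenAI's Whisper response carries the transcript under 'text'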
...@@ -51,3 +51,27 @@ class CompletionRequestError(BaseHTTPException):
description = "Completion request failed."
code = 400
class NoAudioUploadedError(BaseHTTPException):
error_code = 'no_audio_uploaded'
description = "Please upload your audio."
code = 400
class AudioTooLargeError(BaseHTTPException):
error_code = 'audio_too_large'
description = "Audio size exceeded. {message}"
code = 413
class UnsupportedAudioTypeError(BaseHTTPException):
error_code = 'unsupported_audio_type'
description = "Audio type not allowed."
code = 415
class ProviderNotSupportSpeechToTextError(BaseHTTPException):
error_code = 'provider_not_support_speech_to_text'
description = "Provider does not support speech to text."
code = 400
...@@ -7,4 +7,4 @@ bp = Blueprint('web', __name__, url_prefix='/api')
api = ExternalApi(bp)
from . import completion, app, conversation, message, site, saved_message, audio
...@@ -21,6 +21,7 @@ class AppParameterApi(WebApiResource):
'opening_statement': fields.String,
'suggested_questions': fields.Raw,
'suggested_questions_after_answer': fields.Raw,
'speech_to_text': fields.Raw,
'more_like_this': fields.Raw,
'user_input_form': fields.Raw,
}
...@@ -34,6 +35,7 @@ class AppParameterApi(WebApiResource):
'opening_statement': app_model_config.opening_statement,
'suggested_questions': app_model_config.suggested_questions_list,
'suggested_questions_after_answer': app_model_config.suggested_questions_after_answer_dict,
'speech_to_text': app_model_config.speech_to_text_dict,
'more_like_this': app_model_config.more_like_this_dict,
'user_input_form': app_model_config.user_input_form_list
}
...
# -*- coding:utf-8 -*-
import logging
from flask import request
from werkzeug.exceptions import InternalServerError
import services
from controllers.web import api
from controllers.web.error import AppUnavailableError, ProviderNotInitializeError, CompletionRequestError, \
ProviderQuotaExceededError, ProviderModelCurrentlyNotSupportError, NoAudioUploadedError, AudioTooLargeError, \
UnsupportedAudioTypeError, ProviderNotSupportSpeechToTextError
from controllers.web.wraps import WebApiResource
from core.llm.error import LLMBadRequestError, LLMAPIUnavailableError, LLMAuthorizationError, LLMAPIConnectionError, \
LLMRateLimitError, ProviderTokenNotInitError, QuotaExceededError, ModelCurrentlyNotSupportError
from services.audio_service import AudioService
from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServiceError, \
UnsupportedAudioTypeServiceError, ProviderNotSupportSpeechToTextServiceError
from models.model import App, AppModelConfig
class AudioApi(WebApiResource):
def post(self, app_model: App, end_user):
app_model_config: AppModelConfig = app_model.app_model_config
if not app_model_config.speech_to_text_dict['enabled']:
raise AppUnavailableError()
file = request.files['file']
try:
response = AudioService.transcript(
tenant_id=app_model.tenant_id,
file=file,
)
return response
except services.errors.app_model_config.AppModelConfigBrokenError:
logging.exception("App model config broken.")
raise AppUnavailableError()
except NoAudioUploadedServiceError:
raise NoAudioUploadedError()
except AudioTooLargeServiceError as e:
raise AudioTooLargeError(str(e))
except UnsupportedAudioTypeServiceError:
raise UnsupportedAudioTypeError()
except ProviderNotSupportSpeechToTextServiceError:
raise ProviderNotSupportSpeechToTextError()
except ProviderTokenNotInitError:
raise ProviderNotInitializeError()
except QuotaExceededError:
raise ProviderQuotaExceededError()
except ModelCurrentlyNotSupportError:
raise ProviderModelCurrentlyNotSupportError()
except (LLMBadRequestError, LLMAPIConnectionError, LLMAPIUnavailableError,
LLMRateLimitError, LLMAuthorizationError) as e:
raise CompletionRequestError(str(e))
except ValueError as e:
raise e
except Exception as e:
logging.exception("internal server error.")
raise InternalServerError()
api.add_resource(AudioApi, '/audio-to-text')
...@@ -62,3 +62,27 @@ class AppSuggestedQuestionsAfterAnswerDisabledError(BaseHTTPException):
error_code = 'app_suggested_questions_after_answer_disabled'
description = "The 'Suggested Questions After Answer' feature is disabled. Please refresh your page."
code = 403
class NoAudioUploadedError(BaseHTTPException):
error_code = 'no_audio_uploaded'
description = "Please upload your audio."
code = 400
class AudioTooLargeError(BaseHTTPException):
error_code = 'audio_too_large'
description = "Audio size exceeded. {message}"
code = 413
class UnsupportedAudioTypeError(BaseHTTPException):
error_code = 'unsupported_audio_type'
description = "Audio type not allowed."
code = 415
class ProviderNotSupportSpeechToTextError(BaseHTTPException):
error_code = 'provider_not_support_speech_to_text'
description = "Provider does not support speech to text."
code = 400
import openai
from models.provider import ProviderName
from core.llm.error_handle_wraps import handle_llm_exceptions
from core.llm.provider.base import BaseProvider
class Whisper:
def __init__(self, provider: BaseProvider):
self.provider = provider
if self.provider.get_provider_name() == ProviderName.OPENAI:
self.client = openai.Audio
self.credentials = provider.get_credentials()
@handle_llm_exceptions
def transcribe(self, file):
return self.client.transcribe(
model='whisper-1',
file=file,
api_key=self.credentials.get('openai_api_key'),
api_base=self.credentials.get('openai_api_base'),
api_type=self.credentials.get('openai_api_type'),
api_version=self.credentials.get('openai_api_version'),
)
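A minimal usage sketch of the wrapper above, assuming the pre-1.0 openai SDK (whose module-level openai.Audio.transcribe call the class delegates to) and an OpenAI provider already configured for the tenant; the tenant id and file name are placeholders:

import io

from core.llm.provider.llm_provider_service import LLMProviderService
from core.llm.whisper import Whisper

# 'tenant-id' is a placeholder; in this PR, AudioService resolves it from the app model.
provider_service = LLMProviderService('tenant-id', 'openai')

with open('sample.wav', 'rb') as f:
    buffer = io.BytesIO(f.read())
buffer.name = 'sample.wav'  # the SDK infers the audio format from the buffer's name

result = Whisper(provider_service.provider).transcribe(buffer)
print(result['text'])  # Whisper responses expose the transcript under 'text'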
"""app config add speech_to_text
Revision ID: a5b56fb053ef
Revises: d3d503a3471c
Create Date: 2023-07-06 17:55:20.894149
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = 'a5b56fb053ef'
down_revision = 'd3d503a3471c'
branch_labels = None
depends_on = None
def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
with op.batch_alter_table('app_model_configs', schema=None) as batch_op:
batch_op.add_column(sa.Column('speech_to_text', sa.Text(), nullable=True))
# ### end Alembic commands ###
def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
with op.batch_alter_table('app_model_configs', schema=None) as batch_op:
batch_op.drop_column('speech_to_text')
# ### end Alembic commands ###
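The new column is a nullable Text holding serialized JSON, so existing rows need no backfill; a sketch of the round-trip the model property below relies on (the stored shape is the default used throughout this PR):

import json

row_value = json.dumps({"enabled": True})
assert json.loads(row_value) == {"enabled": True}
# A NULL column falls back to {"enabled": False} via AppModelConfig.speech_to_text_dict.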
...@@ -81,6 +81,7 @@ class AppModelConfig(db.Model):
opening_statement = db.Column(db.Text)
suggested_questions = db.Column(db.Text)
suggested_questions_after_answer = db.Column(db.Text)
speech_to_text = db.Column(db.Text)
more_like_this = db.Column(db.Text)
model = db.Column(db.Text)
user_input_form = db.Column(db.Text)
...@@ -104,6 +105,11 @@ class AppModelConfig(db.Model):
def suggested_questions_after_answer_dict(self) -> dict:
return json.loads(self.suggested_questions_after_answer) if self.suggested_questions_after_answer \
else {"enabled": False}
@property
def speech_to_text_dict(self) -> dict:
return json.loads(self.speech_to_text) if self.speech_to_text \
else {"enabled": False}
@property
def more_like_this_dict(self) -> dict:
...@@ -223,6 +229,9 @@ class Conversation(db.Model):
model_config['suggested_questions_after_answer'] = override_model_configs[
'suggested_questions_after_answer'] \
if 'suggested_questions_after_answer' in override_model_configs else {"enabled": False}
model_config['speech_to_text'] = override_model_configs[
'speech_to_text'] \
if 'speech_to_text' in override_model_configs else {"enabled": False}
model_config['more_like_this'] = override_model_configs['more_like_this'] \
if 'more_like_this' in override_model_configs else {"enabled": False}
model_config['user_input_form'] = override_model_configs['user_input_form']
...@@ -239,6 +248,7 @@ class Conversation(db.Model):
model_config['opening_statement'] = app_model_config.opening_statement
model_config['suggested_questions'] = app_model_config.suggested_questions_list
model_config['suggested_questions_after_answer'] = app_model_config.suggested_questions_after_answer_dict
model_config['speech_to_text'] = app_model_config.speech_to_text_dict
model_config['more_like_this'] = app_model_config.more_like_this_dict
model_config['user_input_form'] = app_model_config.user_input_form_list
...
...@@ -4,6 +4,7 @@ import uuid
from core.constant import llm_constant
from models.account import Account
from services.dataset_service import DatasetService
from core.llm.llm_builder import LLMBuilder
class AppModelConfigService:
...@@ -109,6 +110,26 @@ class AppModelConfigService:
if not isinstance(config["suggested_questions_after_answer"]["enabled"], bool):
raise ValueError("enabled in suggested_questions_after_answer must be of boolean type")
# speech_to_text
if 'speech_to_text' not in config or not config["speech_to_text"]:
config["speech_to_text"] = {
"enabled": False
}
if not isinstance(config["speech_to_text"], dict):
raise ValueError("speech_to_text must be of dict type")
if "enabled" not in config["speech_to_text"] or not config["speech_to_text"]["enabled"]:
config["speech_to_text"]["enabled"] = False
if not isinstance(config["speech_to_text"]["enabled"], bool):
raise ValueError("enabled in speech_to_text must be of boolean type")
provider_name = LLMBuilder.get_default_provider(account.current_tenant_id)
if config["speech_to_text"]["enabled"] and provider_name != 'openai':
raise ValueError("provider does not support speech to text")
# more_like_this
if 'more_like_this' not in config or not config["more_like_this"]:
config["more_like_this"] = {
...@@ -277,6 +298,7 @@ class AppModelConfigService:
"opening_statement": config["opening_statement"],
"suggested_questions": config["suggested_questions"],
"suggested_questions_after_answer": config["suggested_questions_after_answer"],
"speech_to_text": config["speech_to_text"],
"more_like_this": config["more_like_this"],
"model": {
"provider": config["model"]["provider"],
...
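A standalone restatement of the speech_to_text validation added in this file, as a sketch only; the function name is hypothetical, and the real code lives inside AppModelConfigService.model_config_validate:

def normalize_speech_to_text(config: dict, default_provider: str) -> dict:
    # Default the section when missing or falsy.
    if not config.get('speech_to_text'):
        config['speech_to_text'] = {'enabled': False}
    if not isinstance(config['speech_to_text'], dict):
        raise ValueError('speech_to_text must be of dict type')
    # Coerce a missing or falsy flag to an explicit False.
    if not config['speech_to_text'].get('enabled'):
        config['speech_to_text']['enabled'] = False
    if not isinstance(config['speech_to_text']['enabled'], bool):
        raise ValueError('enabled in speech_to_text must be of boolean type')
    # Only the OpenAI provider ships Whisper support in this PR.
    if config['speech_to_text']['enabled'] and default_provider != 'openai':
        raise ValueError('provider does not support speech to text')
    return config

assert normalize_speech_to_text({}, 'anthropic') == {'speech_to_text': {'enabled': False}}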
import io
from werkzeug.datastructures import FileStorage
from core.llm.llm_builder import LLMBuilder
from core.llm.provider.llm_provider_service import LLMProviderService
from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServiceError, UnsupportedAudioTypeServiceError, ProviderNotSupportSpeechToTextServiceError
from core.llm.whisper import Whisper
from models.provider import ProviderName
FILE_SIZE_LIMIT = 1 * 1024 * 1024
ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm']
class AudioService:
@classmethod
def transcript(cls, tenant_id: str, file: FileStorage):
if file is None:
raise NoAudioUploadedServiceError()
extension = file.mimetype
if extension not in [f'audio/{ext}' for ext in ALLOWED_EXTENSIONS]:
raise UnsupportedAudioTypeServiceError()
file_content = file.read()
file_size = len(file_content)
if file_size > FILE_SIZE_LIMIT:
message = f"({file_size} > {FILE_SIZE_LIMIT})"
raise AudioTooLargeServiceError(message)
provider_name = LLMBuilder.get_default_provider(tenant_id)
if provider_name != ProviderName.OPENAI.value:
raise ProviderNotSupportSpeechToTextServiceError()
provider_service = LLMProviderService(tenant_id, provider_name)
buffer = io.BytesIO(file_content)
buffer.name = 'temp.wav'
return Whisper(provider_service.provider).transcribe(buffer)
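A direct, service-level sketch of the method above (e.g. from a Flask shell), wrapping a local file in the FileStorage the method expects; the tenant id and file are placeholders, and the content type must match one of the allowed audio/* types:

import io

from werkzeug.datastructures import FileStorage

from services.audio_service import AudioService

with open('question.wav', 'rb') as f:
    upload = FileStorage(
        stream=io.BytesIO(f.read()),
        filename='question.wav',
        content_type='audio/wav',  # checked against ALLOWED_EXTENSIONS as 'audio/<ext>'
    )

# 'tenant-id' is hypothetical; the controllers pass app_model.tenant_id.
print(AudioService.transcript(tenant_id='tenant-id', file=upload))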
# -*- coding:utf-8 -*-
__all__ = [
'base', 'conversation', 'message', 'index', 'app_model_config', 'account', 'document', 'dataset',
'app', 'completion', 'audio'
]
from . import *
from services.errors.base import BaseServiceError
class NoAudioUploadedServiceError(BaseServiceError):
error_code = 'no_audio_uploaded'
description = "Please upload your audio."
code = 400
class AudioTooLargeServiceError(BaseServiceError):
error_code = 'audio_too_large'
description = "Audio size exceeded. {message}"
code = 413
class UnsupportedAudioTypeServiceError(BaseServiceError):
error_code = 'unsupported_audio_type'
description = "Audio type not allowed."
code = 415
class ProviderNotSupportSpeechToTextServiceError(BaseServiceError):
error_code = 'provider_not_support_speech_to_text'
description = "Provider does not support speech to text. {message}"
code = 400
...@@ -3,6 +3,7 @@ import type { FC } from 'react'
import React, { useEffect, useLayoutEffect, useRef, useState } from 'react'
import { useContext } from 'use-context-selector'
import cn from 'classnames'
import Recorder from 'js-audio-recorder'
import { HandThumbDownIcon, HandThumbUpIcon } from '@heroicons/react/24/outline'
import { UserCircleIcon } from '@heroicons/react/24/solid'
import { useTranslation } from 'react-i18next'
...@@ -19,6 +20,10 @@ import AppContext from '@/context/app-context'
import { Markdown } from '@/app/components/base/markdown'
import { formatNumber } from '@/utils/format'
import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
import VoiceInput from '@/app/components/base/voice-input'
import { Microphone01 } from '@/app/components/base/icons/src/vender/line/mediaAndDevices'
import { Microphone01 as Microphone01Solid } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices'
import { XCircle } from '@/app/components/base/icons/src/vender/solid/general'
const stopIcon = (
<svg width="14" height="14" viewBox="0 0 14 14" fill="none" xmlns="http://www.w3.org/2000/svg">
...@@ -59,6 +64,7 @@ export type IChatProps = {
controlFocus?: number
isShowSuggestion?: boolean
suggestionList?: string[]
isShowSpeechToText?: boolean
}
export type MessageMore = {
...@@ -421,6 +427,7 @@ const Chat: FC<IChatProps> = ({
controlFocus,
isShowSuggestion,
suggestionList,
isShowSpeechToText,
}) => {
const { t } = useTranslation()
const { notify } = useContext(ToastContext)
...@@ -488,6 +495,15 @@ const Chat: FC<IChatProps> = ({
}
}, [suggestionList])
const [voiceInputShow, setVoiceInputShow] = useState(false)
const handleVoiceInputShow = () => {
(Recorder as any).getPermission().then(() => {
setVoiceInputShow(true)
}, () => {
logError(t('common.voiceInput.notAllow'))
})
}
return (
<div className={cn('px-3.5', 'h-full')}>
{/* Chat List */}
...@@ -565,6 +581,26 @@ const Chat: FC<IChatProps> = ({
/>
<div className="absolute top-0 right-2 flex items-center h-[48px]">
<div className={`${s.count} mr-4 h-5 leading-5 text-sm bg-gray-50 text-gray-500`}>{query.trim().length}</div>
{
query
? (
<div className='flex justify-center items-center w-8 h-8 cursor-pointer hover:bg-gray-100 rounded-lg' onClick={() => setQuery('')}>
<XCircle className='w-4 h-4 text-[#98A2B3]' />
</div>
)
: isShowSpeechToText
? (
<div
className='group flex justify-center items-center w-8 h-8 hover:bg-primary-50 rounded-lg cursor-pointer'
onClick={handleVoiceInputShow}
>
<Microphone01 className='block w-4 h-4 text-gray-500 group-hover:hidden' />
<Microphone01Solid className='hidden w-4 h-4 text-primary-600 group-hover:block' />
</div>
)
: null
}
<div className='mx-2 w-[1px] h-4 bg-black opacity-5' />
{isMobile
? sendBtn
: (
...@@ -581,6 +617,14 @@ const Chat: FC<IChatProps> = ({
</Tooltip>
)}
</div>
{
voiceInputShow && (
<VoiceInput
onCancel={() => setVoiceInputShow(false)}
onConverted={text => setQuery(text)}
/>
)
}
</div>
</div>
)
...
...@@ -79,7 +79,7 @@
.textArea {
padding-top: 13px;
padding-bottom: 13px;
padding-right: 130px;
border-radius: 12px;
line-height: 20px;
background-color: #fff;
...
...@@ -22,4 +22,8 @@
.moreLikeThisPreview {
background-image: url(./preview-imgs/more-like-this.svg);
}

.speechToTextPreview {
background-image: url(./preview-imgs/speech-to-text.svg);
}
...@@ -7,10 +7,12 @@ import MoreLikeThisIcon from '../../../base/icons/more-like-this-icon'
import FeatureItem from './feature-item'
import Modal from '@/app/components/base/modal'
import SuggestedQuestionsAfterAnswerIcon from '@/app/components/app/configuration/base/icons/suggested-questions-after-answer-icon'
import { Microphone01 } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices'
type IConfig = {
openingStatement: boolean
moreLikeThis: boolean
suggestedQuestionsAfterAnswer: boolean
speechToText: boolean
}
export type IChooseFeatureProps = {
...@@ -19,6 +21,7 @@ export type IChooseFeatureProps = {
config: IConfig
isChatApp: boolean
onChange: (key: string, value: boolean) => void
showSpeechToTextItem?: boolean
}
const OpeningStatementIcon = (
...@@ -33,6 +36,7 @@ const ChooseFeature: FC<IChooseFeatureProps> = ({
isChatApp,
config,
onChange,
showSpeechToTextItem,
}) => {
const { t } = useTranslation()
...@@ -69,6 +73,18 @@ const ChooseFeature: FC<IChooseFeatureProps> = ({
value={config.suggestedQuestionsAfterAnswer}
onChange={value => onChange('suggestedQuestionsAfterAnswer', value)}
/>
{
showSpeechToTextItem && (
<FeatureItem
icon={<Microphone01 className='w-4 h-4 text-[#7839EE]' />}
previewImgClassName='speechToTextPreview'
title={t('appDebug.feature.speechToText.title')}
description={t('appDebug.feature.speechToText.description')}
value={config.speechToText}
onChange={value => onChange('speechToText', value)}
/>
)
}
</>
</FeatureGroup>
)}
...
...@@ -7,6 +7,8 @@ function useFeature({
setMoreLikeThis,
suggestedQuestionsAfterAnswer,
setSuggestedQuestionsAfterAnswer,
speechToText,
setSpeechToText,
}: {
introduction: string
setIntroduction: (introduction: string) => void
...@@ -14,13 +16,14 @@ function useFeature({
setMoreLikeThis: (moreLikeThis: boolean) => void
suggestedQuestionsAfterAnswer: boolean
setSuggestedQuestionsAfterAnswer: (suggestedQuestionsAfterAnswer: boolean) => void
speechToText: boolean
setSpeechToText: (speechToText: boolean) => void
}) {
const [tempshowOpeningStatement, setTempShowOpeningStatement] = React.useState(!!introduction)
useEffect(() => {
// wait to api data back
if (introduction)
setTempShowOpeningStatement(true)
}, [introduction])
// const [tempMoreLikeThis, setTempMoreLikeThis] = React.useState(moreLikeThis)
...@@ -30,15 +33,16 @@ function useFeature({
const featureConfig = {
openingStatement: tempshowOpeningStatement,
moreLikeThis,
suggestedQuestionsAfterAnswer,
speechToText,
}
const handleFeatureChange = (key: string, value: boolean) => {
switch (key) {
case 'openingStatement':
if (!value)
setIntroduction('')
setTempShowOpeningStatement(value)
break
case 'moreLikeThis':
...@@ -47,12 +51,14 @@ function useFeature({
case 'suggestedQuestionsAfterAnswer':
setSuggestedQuestionsAfterAnswer(value)
break
case 'speechToText':
setSpeechToText(value)
}
}
return {
featureConfig,
handleFeatureChange,
}
}
export default useFeature
...@@ -4,6 +4,7 @@ import React from 'react'
import { useContext } from 'use-context-selector'
import produce from 'immer'
import { useBoolean } from 'ahooks'
import useSWR from 'swr'
import DatasetConfig from '../dataset-config'
import ChatGroup from '../features/chat-group'
import ExperienceEnchanceGroup from '../features/experience-enchance-group'
...@@ -19,6 +20,7 @@ import ConfigPrompt from '@/app/components/app/configuration/config-prompt'
import ConfigVar from '@/app/components/app/configuration/config-var'
import type { PromptVariable } from '@/models/debug'
import { AppType } from '@/types/app'
import { fetchTenantInfo } from '@/service/common'
const Config: FC = () => {
const {
...@@ -33,8 +35,12 @@ const Config: FC = () => {
setMoreLikeThisConfig,
suggestedQuestionsAfterAnswerConfig,
setSuggestedQuestionsAfterAnswerConfig,
speechToTextConfig,
setSpeechToTextConfig,
} = useContext(ConfigContext)
const isChatApp = mode === AppType.chat
const { data: userInfo } = useSWR({ url: '/info' }, fetchTenantInfo)
const targetProvider = userInfo?.providers?.find(({ token_is_set, is_valid }) => token_is_set && is_valid)
const promptTemplate = modelConfig.configs.prompt_template
const promptVariables = modelConfig.configs.prompt_variables
...@@ -78,9 +84,15 @@ const Config: FC = () => {
draft.enabled = value
}))
},
speechToText: speechToTextConfig.enabled,
setSpeechToText: (value) => {
setSpeechToTextConfig(produce(speechToTextConfig, (draft) => {
draft.enabled = value
}))
},
})
const hasChatConfig = isChatApp && (featureConfig.openingStatement || featureConfig.suggestedQuestionsAfterAnswer || (featureConfig.speechToText && targetProvider?.provider_name === 'openai'))
const hasToolbox = false
const [showAutomatic, { setTrue: showAutomaticTrue, setFalse: showAutomaticFalse }] = useBoolean(false)
...@@ -110,6 +122,7 @@ const Config: FC = () => {
isChatApp={isChatApp}
config={featureConfig}
onChange={handleFeatureChange}
showSpeechToTextItem={targetProvider?.provider_name === 'openai'}
/>
)}
{showAutomatic && (
...@@ -149,6 +162,7 @@ const Config: FC = () => {
}
}
isShowSuggestedQuestionsAfterAnswer={featureConfig.suggestedQuestionsAfterAnswer}
isShowSpeechText={featureConfig.speechToText}
/>
)
}
...
...@@ -38,6 +38,7 @@ const Debug: FC<IDebug> = ({
mode,
introduction,
suggestedQuestionsAfterAnswerConfig,
speechToTextConfig,
moreLikeThisConfig,
inputs,
// setInputs,
...@@ -159,6 +160,7 @@ const Debug: FC<IDebug> = ({
enabled: false,
},
suggested_questions_after_answer: suggestedQuestionsAfterAnswerConfig,
speech_to_text: speechToTextConfig,
agent_mode: {
enabled: true,
tools: [...postDatasets],
...@@ -308,6 +310,7 @@ const Debug: FC<IDebug> = ({
user_input_form: promptVariablesToUserInputsForm(modelConfig.configs.prompt_variables),
opening_statement: introduction,
suggested_questions_after_answer: suggestedQuestionsAfterAnswerConfig,
speech_to_text: speechToTextConfig,
more_like_this: moreLikeThisConfig,
agent_mode: {
enabled: true,
...@@ -386,6 +389,7 @@ const Debug: FC<IDebug> = ({
}}
isShowSuggestion={doShowSuggestion}
suggestionList={suggestQuestions}
isShowSpeechToText={speechToTextConfig.enabled}
/>
</div>
</div>
...
'use client'
import type { FC } from 'react'
import React from 'react'
import { useTranslation } from 'react-i18next'
import GroupName from '../../base/group-name'
import type { IOpeningStatementProps } from './opening-statement'
import OpeningStatement from './opening-statement'
import SuggestedQuestionsAfterAnswer from './suggested-questions-after-answer'
import SpeechToText from './speech-to-text'
/*
* Include
* 1. Conversation Opener
* 2. Opening Suggestion
* 3. Next question suggestion
*/
type ChatGroupProps = {
isShowOpeningStatement: boolean
openingStatementConfig: IOpeningStatementProps
isShowSuggestedQuestionsAfterAnswer: boolean
isShowSpeechText: boolean
}
const ChatGroup: FC<ChatGroupProps> = ({
isShowOpeningStatement,
openingStatementConfig,
isShowSuggestedQuestionsAfterAnswer,
isShowSpeechText,
}) => {
const { t } = useTranslation()
...@@ -33,6 +38,11 @@ const ChatGroup: FC<ChatGroupProps> = ({
{isShowSuggestedQuestionsAfterAnswer && (
<SuggestedQuestionsAfterAnswer />
)}
{
isShowSpeechText && (
<SpeechToText />
)
}
</div>
</div>
)
...
'use client'
import React, { type FC } from 'react'
import { useTranslation } from 'react-i18next'
import Panel from '@/app/components/app/configuration/base/feature-panel'
import { Microphone01 } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices'
const SpeechToText: FC = () => {
const { t } = useTranslation()
return (
<Panel
title={
<div className='flex items-center gap-2'>
<div>{t('appDebug.feature.speechToText.title')}</div>
</div>
}
headerIcon={<Microphone01 className='w-4 h-4 text-[#7839EE]' />}
headerRight={
<div className='text-xs text-gray-500'>{t('appDebug.feature.speechToText.resDes')}</div>
}
noBodySpacing
/>
)
}
export default React.memo(SpeechToText)
...@@ -53,6 +53,9 @@ const Configuration: FC = () => {
const [suggestedQuestionsAfterAnswerConfig, setSuggestedQuestionsAfterAnswerConfig] = useState<MoreLikeThisConfig>({
enabled: false,
})
const [speechToTextConfig, setSpeechToTextConfig] = useState<MoreLikeThisConfig>({
enabled: false,
})
const [formattingChanged, setFormattingChanged] = useState(false)
const [inputs, setInputs] = useState<Inputs>({})
const [query, setQuery] = useState('')
...@@ -73,6 +76,7 @@ const Configuration: FC = () => {
opening_statement: '',
more_like_this: null,
suggested_questions_after_answer: null,
speech_to_text: null,
dataSets: [],
})
...@@ -102,6 +106,9 @@ const Configuration: FC = () => {
setSuggestedQuestionsAfterAnswerConfig(modelConfig.suggested_questions_after_answer || {
enabled: false,
})
setSpeechToTextConfig(modelConfig.speech_to_text || {
enabled: false,
})
}
const [hasSetCustomAPIKEY, setHasSetCustomerAPIKEY] = useState(true)
...@@ -146,6 +153,9 @@ const Configuration: FC = () => {
if (modelConfig.suggested_questions_after_answer)
setSuggestedQuestionsAfterAnswerConfig(modelConfig.suggested_questions_after_answer)
if (modelConfig.speech_to_text)
setSpeechToTextConfig(modelConfig.speech_to_text)
const config = {
modelConfig: {
provider: model.provider,
...@@ -157,6 +167,7 @@ const Configuration: FC = () => {
opening_statement: modelConfig.opening_statement,
more_like_this: modelConfig.more_like_this,
suggested_questions_after_answer: modelConfig.suggested_questions_after_answer,
speech_to_text: modelConfig.speech_to_text,
dataSets: datasets || [],
},
completionParams: model.completion_params,
...@@ -187,6 +198,7 @@ const Configuration: FC = () => {
opening_statement: introduction || '',
more_like_this: moreLikeThisConfig,
suggested_questions_after_answer: suggestedQuestionsAfterAnswerConfig,
speech_to_text: speechToTextConfig,
agent_mode: {
enabled: true,
tools: [...postDatasets],
...@@ -203,6 +215,7 @@ const Configuration: FC = () => {
draft.opening_statement = introduction
draft.more_like_this = moreLikeThisConfig
draft.suggested_questions_after_answer = suggestedQuestionsAfterAnswerConfig
draft.speech_to_text = speechToTextConfig
draft.dataSets = dataSets
})
setPublishedConfig({
...@@ -245,6 +258,8 @@ const Configuration: FC = () => {
setMoreLikeThisConfig,
suggestedQuestionsAfterAnswerConfig,
setSuggestedQuestionsAfterAnswerConfig,
speechToTextConfig,
setSpeechToTextConfig,
formattingChanged,
setFormattingChanged,
inputs,
...
<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
<g clip-path="url(#clip0_6037_51601)">
<path d="M7.99992 1.33398V4.00065M7.99992 12.0007V14.6673M3.99992 8.00065H1.33325M14.6666 8.00065H11.9999M12.7189 12.7196L10.8333 10.834M12.7189 3.33395L10.8333 5.21956M3.28097 12.7196L5.16659 10.834M3.28097 3.33395L5.16659 5.21956" stroke="#667085" stroke-width="1.25" stroke-linecap="round" stroke-linejoin="round"/>
</g>
<defs>
<clipPath id="clip0_6037_51601">
<rect width="16" height="16" fill="white"/>
</clipPath>
</defs>
</svg>
<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
<g id="x-close">
<path id="Icon" d="M12 4L4 12M4 4L12 12" stroke="#667085" stroke-width="1.25" stroke-linecap="round" stroke-linejoin="round"/>
</g>
</svg>
<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
<g id="microphone-01">
<path id="Icon" d="M12.6666 6.66732V8.00065C12.6666 10.578 10.5772 12.6673 7.99992 12.6673M3.33325 6.66732V8.00065C3.33325 10.578 5.42259 12.6673 7.99992 12.6673M7.99992 12.6673V14.6673M5.33325 14.6673H10.6666M7.99992 10.0007C6.89535 10.0007 5.99992 9.10522 5.99992 8.00065V3.33398C5.99992 2.22941 6.89535 1.33398 7.99992 1.33398C9.10449 1.33398 9.99992 2.22941 9.99992 3.33398V8.00065C9.99992 9.10522 9.10449 10.0007 7.99992 10.0007Z" stroke="#667085" stroke-width="1.25" stroke-linecap="round" stroke-linejoin="round"/>
</g>
</svg>
<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
<path id="Solid" fill-rule="evenodd" clip-rule="evenodd" d="M8.00008 0.666016C3.94999 0.666016 0.666748 3.94926 0.666748 7.99935C0.666748 12.0494 3.94999 15.3327 8.00008 15.3327C12.0502 15.3327 15.3334 12.0494 15.3334 7.99935C15.3334 3.94926 12.0502 0.666016 8.00008 0.666016ZM10.4715 5.52794C10.7318 5.78829 10.7318 6.2104 10.4715 6.47075L8.94289 7.99935L10.4715 9.52794C10.7318 9.78829 10.7318 10.2104 10.4715 10.4708C10.2111 10.7311 9.78903 10.7311 9.52868 10.4708L8.00008 8.94216L6.47149 10.4708C6.21114 10.7311 5.78903 10.7311 5.52868 10.4708C5.26833 10.2104 5.26833 9.78829 5.52868 9.52794L7.05727 7.99935L5.52868 6.47075C5.26833 6.2104 5.26833 5.78829 5.52868 5.52794C5.78903 5.26759 6.21114 5.26759 6.47149 5.52794L8.00008 7.05654L9.52868 5.52794C9.78903 5.26759 10.2111 5.26759 10.4715 5.52794Z" fill="#98A2B3"/>
</svg>
<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
<g id="microphone-01">
<g id="Solid">
<path fill-rule="evenodd" clip-rule="evenodd" d="M8.00008 0.666016C6.52732 0.666016 5.33341 1.85992 5.33341 3.33268V7.99935C5.33341 9.47211 6.52732 10.666 8.00008 10.666C9.47284 10.666 10.6667 9.47211 10.6667 7.99935V3.33268C10.6667 1.85992 9.47284 0.666016 8.00008 0.666016Z" fill="#155EEF"/>
<path d="M4.00008 6.66602C4.00008 6.29783 3.7016 5.99935 3.33341 5.99935C2.96522 5.99935 2.66675 6.29783 2.66675 6.66602V7.99935C2.66675 10.7195 4.70319 12.9641 7.33466 13.2916C7.33384 13.3052 7.33341 13.3189 7.33341 13.3327V13.9993H5.33341C4.96522 13.9993 4.66675 14.2978 4.66675 14.666C4.66675 15.0342 4.96522 15.3327 5.33341 15.3327H10.6667C11.0349 15.3327 11.3334 15.0342 11.3334 14.666C11.3334 14.2978 11.0349 13.9993 10.6667 13.9993H8.66675V13.3327C8.66675 13.3189 8.66633 13.3052 8.6655 13.2916C11.297 12.9641 13.3334 10.7195 13.3334 7.99935V6.66602C13.3334 6.29783 13.0349 5.99935 12.6667 5.99935C12.2986 5.99935 12.0001 6.29783 12.0001 6.66602V7.99935C12.0001 10.2085 10.2092 11.9993 8.00008 11.9993C5.79094 11.9993 4.00008 10.2085 4.00008 7.99935V6.66602Z" fill="#155EEF"/>
</g>
</g>
</svg>
<svg width="20" height="20" viewBox="0 0 20 20" fill="none" xmlns="http://www.w3.org/2000/svg">
<g id="stop-circle">
<path id="Solid" fill-rule="evenodd" clip-rule="evenodd" d="M9.99992 0.833984C4.93731 0.833984 0.833252 4.93804 0.833252 10.0007C0.833252 15.0633 4.93731 19.1673 9.99992 19.1673C15.0625 19.1673 19.1666 15.0633 19.1666 10.0007C19.1666 4.93804 15.0625 0.833984 9.99992 0.833984ZM6.75741 7.12232C6.66658 7.30058 6.66658 7.53394 6.66658 8.00065V12.0006C6.66658 12.4674 6.66658 12.7007 6.75741 12.879C6.83731 13.0358 6.96479 13.1633 7.12159 13.2432C7.29985 13.334 7.53321 13.334 7.99992 13.334H11.9999C12.4666 13.334 12.7 13.334 12.8782 13.2432C13.035 13.1633 13.1625 13.0358 13.2424 12.879C13.3333 12.7007 13.3333 12.4674 13.3333 12.0006V8.00065C13.3333 7.53394 13.3333 7.30058 13.2424 7.12232C13.1625 6.96552 13.035 6.83804 12.8782 6.75814C12.7 6.66732 12.4666 6.66732 11.9999 6.66732H7.99992C7.53321 6.66732 7.29985 6.66732 7.12159 6.75814C6.96479 6.83804 6.83731 6.96552 6.75741 7.12232Z" fill="#155EEF"/>
</g>
</svg>
{
"icon": {
"type": "element",
"isRootNode": true,
"name": "svg",
"attributes": {
"width": "16",
"height": "16",
"viewBox": "0 0 16 16",
"fill": "none",
"xmlns": "http://www.w3.org/2000/svg"
},
"children": [
{
"type": "element",
"name": "g",
"attributes": {
"clip-path": "url(#clip0_6037_51601)"
},
"children": [
{
"type": "element",
"name": "path",
"attributes": {
"d": "M7.99992 1.33398V4.00065M7.99992 12.0007V14.6673M3.99992 8.00065H1.33325M14.6666 8.00065H11.9999M12.7189 12.7196L10.8333 10.834M12.7189 3.33395L10.8333 5.21956M3.28097 12.7196L5.16659 10.834M3.28097 3.33395L5.16659 5.21956",
"stroke": "currentColor",
"stroke-width": "1.25",
"stroke-linecap": "round",
"stroke-linejoin": "round"
},
"children": []
}
]
},
{
"type": "element",
"name": "defs",
"attributes": {},
"children": [
{
"type": "element",
"name": "clipPath",
"attributes": {
"id": "clip0_6037_51601"
},
"children": [
{
"type": "element",
"name": "rect",
"attributes": {
"width": "16",
"height": "16",
"fill": "white"
},
"children": []
}
]
}
]
}
]
},
"name": "Loading02"
}
\ No newline at end of file
// GENERATED BY script
// DO NOT EDIT IT MANUALLY
import * as React from 'react'
import data from './Loading02.json'
import IconBase from '@/app/components/base/icons/IconBase'
import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase'
const Icon = React.forwardRef<React.MutableRefObject<SVGElement>, Omit<IconBaseProps, 'data'>>((
props,
ref,
) => <IconBase {...props} ref={ref} data={data as IconData} />)
export default Icon
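Each icon ships as a generated pair: a JSON description of the SVG tree (with the hard-coded stroke/fill colors swapped for currentColor) and a thin React component that hands that data to IconBase. Consumers render the component and drive its color through CSS text color; a minimal usage sketch (class names illustrative):

import { Loading02 } from '@/app/components/base/icons/src/vender/line/general'

// stroke="currentColor" in the JSON means the icon inherits the surrounding text color.
const Spinner = () => <Loading02 className='animate-spin w-4 h-4 text-primary-700' />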
{
"icon": {
"type": "element",
"isRootNode": true,
"name": "svg",
"attributes": {
"width": "16",
"height": "16",
"viewBox": "0 0 16 16",
"fill": "none",
"xmlns": "http://www.w3.org/2000/svg"
},
"children": [
{
"type": "element",
"name": "g",
"attributes": {
"id": "x-close"
},
"children": [
{
"type": "element",
"name": "path",
"attributes": {
"id": "Icon",
"d": "M12 4L4 12M4 4L12 12",
"stroke": "currentColor",
"stroke-width": "1.25",
"stroke-linecap": "round",
"stroke-linejoin": "round"
},
"children": []
}
]
}
]
},
"name": "XClose"
}
\ No newline at end of file
// GENERATED BY script
// DO NOT EDIT IT MANUALLY
import * as React from 'react'
import data from './XClose.json'
import IconBase from '@/app/components/base/icons/IconBase'
import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase'
const Icon = React.forwardRef<React.MutableRefObject<SVGElement>, Omit<IconBaseProps, 'data'>>((
props,
ref,
) => <IconBase {...props} ref={ref} data={data as IconData} />)
export default Icon
export { default as Loading02 } from './Loading02'
export { default as Trash03 } from './Trash03'
export { default as XClose } from './XClose'
export { default as X } from './X'
{
"icon": {
"type": "element",
"isRootNode": true,
"name": "svg",
"attributes": {
"width": "16",
"height": "16",
"viewBox": "0 0 16 16",
"fill": "none",
"xmlns": "http://www.w3.org/2000/svg"
},
"children": [
{
"type": "element",
"name": "g",
"attributes": {
"id": "microphone-01"
},
"children": [
{
"type": "element",
"name": "path",
"attributes": {
"id": "Icon",
"d": "M12.6666 6.66732V8.00065C12.6666 10.578 10.5772 12.6673 7.99992 12.6673M3.33325 6.66732V8.00065C3.33325 10.578 5.42259 12.6673 7.99992 12.6673M7.99992 12.6673V14.6673M5.33325 14.6673H10.6666M7.99992 10.0007C6.89535 10.0007 5.99992 9.10522 5.99992 8.00065V3.33398C5.99992 2.22941 6.89535 1.33398 7.99992 1.33398C9.10449 1.33398 9.99992 2.22941 9.99992 3.33398V8.00065C9.99992 9.10522 9.10449 10.0007 7.99992 10.0007Z",
"stroke": "currentColor",
"stroke-width": "1.25",
"stroke-linecap": "round",
"stroke-linejoin": "round"
},
"children": []
}
]
}
]
},
"name": "Microphone01"
}
\ No newline at end of file
// GENERATED BY script
// DO NOT EDIT IT MANUALLY
import * as React from 'react'
import data from './Microphone01.json'
import IconBase from '@/app/components/base/icons/IconBase'
import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase'
const Icon = React.forwardRef<React.MutableRefObject<SVGElement>, Omit<IconBaseProps, 'data'>>((
props,
ref,
) => <IconBase {...props} ref={ref} data={data as IconData} />)
export default Icon
export { default as Microphone01 } from './Microphone01'
{
"icon": {
"type": "element",
"isRootNode": true,
"name": "svg",
"attributes": {
"width": "16",
"height": "16",
"viewBox": "0 0 16 16",
"fill": "none",
"xmlns": "http://www.w3.org/2000/svg"
},
"children": [
{
"type": "element",
"name": "path",
"attributes": {
"id": "Solid",
"fill-rule": "evenodd",
"clip-rule": "evenodd",
"d": "M8.00008 0.666016C3.94999 0.666016 0.666748 3.94926 0.666748 7.99935C0.666748 12.0494 3.94999 15.3327 8.00008 15.3327C12.0502 15.3327 15.3334 12.0494 15.3334 7.99935C15.3334 3.94926 12.0502 0.666016 8.00008 0.666016ZM10.4715 5.52794C10.7318 5.78829 10.7318 6.2104 10.4715 6.47075L8.94289 7.99935L10.4715 9.52794C10.7318 9.78829 10.7318 10.2104 10.4715 10.4708C10.2111 10.7311 9.78903 10.7311 9.52868 10.4708L8.00008 8.94216L6.47149 10.4708C6.21114 10.7311 5.78903 10.7311 5.52868 10.4708C5.26833 10.2104 5.26833 9.78829 5.52868 9.52794L7.05727 7.99935L5.52868 6.47075C5.26833 6.2104 5.26833 5.78829 5.52868 5.52794C5.78903 5.26759 6.21114 5.26759 6.47149 5.52794L8.00008 7.05654L9.52868 5.52794C9.78903 5.26759 10.2111 5.26759 10.4715 5.52794Z",
"fill": "currentColor"
},
"children": []
}
]
},
"name": "XCircle"
}
\ No newline at end of file
// GENERATED BY script
// DO NOT EDIT IT MANUALLY
import * as React from 'react'
import data from './XCircle.json'
import IconBase from '@/app/components/base/icons/IconBase'
import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase'
const Icon = React.forwardRef<React.MutableRefObject<SVGElement>, Omit<IconBaseProps, 'data'>>((
props,
ref,
) => <IconBase {...props} ref={ref} data={data as IconData} />)
export default Icon
export { default as Download02 } from './Download02'
export { default as XCircle } from './XCircle'
{
"icon": {
"type": "element",
"isRootNode": true,
"name": "svg",
"attributes": {
"width": "16",
"height": "16",
"viewBox": "0 0 16 16",
"fill": "none",
"xmlns": "http://www.w3.org/2000/svg"
},
"children": [
{
"type": "element",
"name": "g",
"attributes": {
"id": "microphone-01"
},
"children": [
{
"type": "element",
"name": "g",
"attributes": {
"id": "Solid"
},
"children": [
{
"type": "element",
"name": "path",
"attributes": {
"fill-rule": "evenodd",
"clip-rule": "evenodd",
"d": "M8.00008 0.666016C6.52732 0.666016 5.33341 1.85992 5.33341 3.33268V7.99935C5.33341 9.47211 6.52732 10.666 8.00008 10.666C9.47284 10.666 10.6667 9.47211 10.6667 7.99935V3.33268C10.6667 1.85992 9.47284 0.666016 8.00008 0.666016Z",
"fill": "currentColor"
},
"children": []
},
{
"type": "element",
"name": "path",
"attributes": {
"d": "M4.00008 6.66602C4.00008 6.29783 3.7016 5.99935 3.33341 5.99935C2.96522 5.99935 2.66675 6.29783 2.66675 6.66602V7.99935C2.66675 10.7195 4.70319 12.9641 7.33466 13.2916C7.33384 13.3052 7.33341 13.3189 7.33341 13.3327V13.9993H5.33341C4.96522 13.9993 4.66675 14.2978 4.66675 14.666C4.66675 15.0342 4.96522 15.3327 5.33341 15.3327H10.6667C11.0349 15.3327 11.3334 15.0342 11.3334 14.666C11.3334 14.2978 11.0349 13.9993 10.6667 13.9993H8.66675V13.3327C8.66675 13.3189 8.66633 13.3052 8.6655 13.2916C11.297 12.9641 13.3334 10.7195 13.3334 7.99935V6.66602C13.3334 6.29783 13.0349 5.99935 12.6667 5.99935C12.2986 5.99935 12.0001 6.29783 12.0001 6.66602V7.99935C12.0001 10.2085 10.2092 11.9993 8.00008 11.9993C5.79094 11.9993 4.00008 10.2085 4.00008 7.99935V6.66602Z",
"fill": "currentColor"
},
"children": []
}
]
}
]
}
]
},
"name": "Microphone01"
}
\ No newline at end of file
// GENERATED BY script
// DO NOT EDIT IT MANUALLY
import * as React from 'react'
import data from './Microphone01.json'
import IconBase from '@/app/components/base/icons/IconBase'
import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase'
const Icon = React.forwardRef<React.MutableRefObject<SVGElement>, Omit<IconBaseProps, 'data'>>((
props,
ref,
) => <IconBase {...props} ref={ref} data={data as IconData} />)
export default Icon
{
"icon": {
"type": "element",
"isRootNode": true,
"name": "svg",
"attributes": {
"width": "20",
"height": "20",
"viewBox": "0 0 20 20",
"fill": "none",
"xmlns": "http://www.w3.org/2000/svg"
},
"children": [
{
"type": "element",
"name": "g",
"attributes": {
"id": "stop-circle"
},
"children": [
{
"type": "element",
"name": "path",
"attributes": {
"id": "Solid",
"fill-rule": "evenodd",
"clip-rule": "evenodd",
"d": "M9.99992 0.833984C4.93731 0.833984 0.833252 4.93804 0.833252 10.0007C0.833252 15.0633 4.93731 19.1673 9.99992 19.1673C15.0625 19.1673 19.1666 15.0633 19.1666 10.0007C19.1666 4.93804 15.0625 0.833984 9.99992 0.833984ZM6.75741 7.12232C6.66658 7.30058 6.66658 7.53394 6.66658 8.00065V12.0006C6.66658 12.4674 6.66658 12.7007 6.75741 12.879C6.83731 13.0358 6.96479 13.1633 7.12159 13.2432C7.29985 13.334 7.53321 13.334 7.99992 13.334H11.9999C12.4666 13.334 12.7 13.334 12.8782 13.2432C13.035 13.1633 13.1625 13.0358 13.2424 12.879C13.3333 12.7007 13.3333 12.4674 13.3333 12.0006V8.00065C13.3333 7.53394 13.3333 7.30058 13.2424 7.12232C13.1625 6.96552 13.035 6.83804 12.8782 6.75814C12.7 6.66732 12.4666 6.66732 11.9999 6.66732H7.99992C7.53321 6.66732 7.29985 6.66732 7.12159 6.75814C6.96479 6.83804 6.83731 6.96552 6.75741 7.12232Z",
"fill": "currentColor"
},
"children": []
}
]
}
]
},
"name": "StopCircle"
}
\ No newline at end of file
// GENERATED BY script
// DO NOT EDIT IT MANUALLY
import * as React from 'react'
import data from './StopCircle.json'
import IconBase from '@/app/components/base/icons/IconBase'
import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase'
const Icon = React.forwardRef<React.MutableRefObject<SVGElement>, Omit<IconBaseProps, 'data'>>((
props,
ref,
) => <IconBase {...props} ref={ref} data={data as IconData} />)
export default Icon
export { default as Microphone01 } from './Microphone01'
export { default as StopCircle } from './StopCircle'
.wrapper {
background: linear-gradient(131deg, #2250F2 0%, #0EBCF3 100%);
box-shadow: 0px 4px 6px -2px rgba(16, 24, 40, 0.03), 0px 12px 16px -4px rgba(16, 24, 40, 0.08);
}
.convert {
background: linear-gradient(91.92deg, #104AE1 -1.74%, #0098EE 75.74%);
background-clip: text;
color: transparent;
}
\ No newline at end of file
import { useCallback, useEffect, useRef, useState } from 'react'
import { useTranslation } from 'react-i18next'
import { useParams, usePathname } from 'next/navigation'
import cn from 'classnames'
import Recorder from 'js-audio-recorder'
import { useRafInterval } from 'ahooks'
import s from './index.module.css'
import { StopCircle } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices'
import { Loading02, XClose } from '@/app/components/base/icons/src/vender/line/general'
import { audioToText } from '@/service/share'
type VoiceInputTypes = {
onConverted: (text: string) => void
onCancel: () => void
}
const VoiceInput = ({
onCancel,
onConverted,
}: VoiceInputTypes) => {
const { t } = useTranslation()
const recorder = useRef(new Recorder())
const canvasRef = useRef<HTMLCanvasElement | null>(null)
const ctxRef = useRef<CanvasRenderingContext2D | null>(null)
const drawRecordId = useRef<number | null>(null)
const [originDuration, setOriginDuration] = useState(0)
const [startRecord, setStartRecord] = useState(false)
const [startConvert, setStartConvert] = useState(false)
const pathname = usePathname()
const params = useParams()
const clearInterval = useRafInterval(() => {
setOriginDuration(originDuration + 1)
}, 1000)
// Draw a live waveform from the recorder's analyser data, rescheduling itself each animation frame.
const drawRecord = useCallback(() => {
drawRecordId.current = requestAnimationFrame(drawRecord)
const canvas = canvasRef.current!
const ctx = ctxRef.current!
const dataUint8Array = recorder.current.getRecordAnalyseData()
const dataArray = [].slice.call(dataUint8Array)
const lineLength = Math.floor(canvas.width / 3)
const gap = Math.floor(1024 / lineLength)
ctx.clearRect(0, 0, canvas.width, canvas.height)
ctx.beginPath()
let x = 0
for (let i = 0; i < lineLength; i++) {
let v = dataArray.slice(i * gap, i * gap + gap).reduce((prev: number, next: number) => {
return prev + next
}, 0) / gap
if (v < 128)
v = 128
if (v > 178)
v = 178
const y = (v - 128) / 50 * canvas.height
ctx.moveTo(x, 16)
ctx.roundRect(x, 16 - y, 2, y, [1, 1, 0, 0])
ctx.fill()
x += 3
}
ctx.closePath()
}, [])
// Stop the recorder and the waveform loop, then ship the WAV clip off for transcription.
const handleStopRecorder = useCallback(async () => {
clearInterval()
setStartRecord(false)
setStartConvert(true)
recorder.current.stop()
drawRecordId.current && cancelAnimationFrame(drawRecordId.current)
drawRecordId.current = null
const canvas = canvasRef.current!
const ctx = ctxRef.current!
ctx.clearRect(0, 0, canvas.width, canvas.height)
const wavBlob = recorder.current.getWAVBlob()
const wavFile = new File([wavBlob], 'a.wav', { type: 'audio/wav' })
const formData = new FormData()
formData.append('file', wavFile)
let url = ''
let isPublic = false
if (params.token) {
url = '/audio-to-text'
isPublic = true
}
else if (params.appId) {
if (pathname.search('explore/installed') > -1)
url = `/installed-apps/${params.appId}/audio-to-text`
else
url = `/apps/${params.appId}/audio-to-text`
}
try {
const audioResponse = await audioToText(url, isPublic, formData)
onConverted(audioResponse.text)
onCancel()
}
catch (e) {
onConverted('')
onCancel()
}
}, [])
const handleStartRecord = async () => {
try {
await recorder.current.start()
setStartRecord(true)
setStartConvert(false)
if (canvasRef.current && ctxRef.current)
drawRecord()
}
catch (e) {
onCancel()
}
}
const initCanvas = () => {
const dpr = window.devicePixelRatio || 1
const canvas = document.getElementById('voice-input-record') as HTMLCanvasElement
if (canvas) {
const { width: cssWidth, height: cssHeight } = canvas.getBoundingClientRect()
canvas.width = dpr * cssWidth
canvas.height = dpr * cssHeight
canvasRef.current = canvas
const ctx = canvas.getContext('2d')
if (ctx) {
ctx.scale(dpr, dpr)
ctx.fillStyle = 'rgba(209, 224, 255, 1)'
ctxRef.current = ctx
}
}
}
// Auto-stop once recording reaches the 120-second cap; run as an effect rather than during render.
useEffect(() => {
if (originDuration >= 120 && startRecord)
handleStopRecorder()
}, [originDuration, startRecord, handleStopRecorder])
useEffect(() => {
initCanvas()
handleStartRecord()
}, [])
const minutes = Math.floor(originDuration / 60)
const seconds = Math.floor(originDuration) % 60
return (
<div className={cn(s.wrapper, 'absolute inset-0 rounded-xl')}>
<div className='absolute inset-[1.5px] flex items-center pl-[14.5px] pr-[6.5px] py-[14px] bg-primary-25 rounded-[10.5px] overflow-hidden'>
<canvas id='voice-input-record' className='absolute left-0 bottom-0 w-full h-4' />
{
startConvert && <Loading02 className='animate-spin mr-2 w-4 h-4 text-primary-700' />
}
<div className='grow'>
{
startRecord && (
<div className='text-sm text-gray-500'>
{t('common.voiceInput.speaking')}
</div>
)
}
{
startConvert && (
<div className={cn(s.convert, 'text-sm')}>
{t('common.voiceInput.converting')}
</div>
)
}
</div>
{
startRecord && (
<div
className='flex justify-center items-center mr-1 w-8 h-8 hover:bg-primary-100 rounded-lg cursor-pointer'
onClick={handleStopRecorder}
>
<StopCircle className='w-5 h-5 text-primary-600' />
</div>
)
}
{
startConvert && (
<div
className='flex justify-center items-center mr-1 w-8 h-8 hover:bg-gray-200 rounded-lg cursor-pointer'
onClick={onCancel}
>
<XClose className='w-4 h-4 text-gray-500' />
</div>
)
}
<div className={`w-[45px] pl-1 text-xs font-medium ${originDuration > 110 ? 'text-[#F04438]' : 'text-gray-700'}`}>{`0${minutes.toFixed(0)}:${seconds >= 10 ? seconds : `0${seconds}`}`}</div>
</div>
</div>
)
}
export default VoiceInput
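For orientation, a minimal sketch of how a chat input could host this component. Only VoiceInput and its two callbacks come from the code above; the surrounding names (ChatInput, the textarea, the mic trigger, the import path) are illustrative assumptions:

import { useState } from 'react'
import VoiceInput from '@/app/components/base/voice-input' // assumed path for the component above

const ChatInput = () => {
  const [query, setQuery] = useState('')
  const [voiceInputShow, setVoiceInputShow] = useState(false)
  return (
    <div className='relative'>
      <textarea value={query} onChange={e => setQuery(e.target.value)} />
      <button onClick={() => setVoiceInputShow(true)}>mic</button>
      {
        voiceInputShow && (
          // Overlays the input (absolute inset-0) while recording/converting.
          <VoiceInput
            onCancel={() => setVoiceInputShow(false)}
            onConverted={text => setQuery(text)}
          />
        )
      }
    </div>
  )
}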
...@@ -29,7 +29,7 @@ const InstalledApp: FC<IInstalledAppProps> = ({
<div className='h-full p-2'>
{installedApp?.app.mode === 'chat'
? (
<ChatApp isInstalledApp installedAppInfo={installedApp}/>
<ChatApp isInstalledApp installedAppInfo={installedApp} />
)
: (
<TextGenerationApp isInstalledApp installedAppInfo={installedApp}/>
...
...@@ -149,6 +149,7 @@ const Main: FC<IMainProps> = ({
}
const [suggestedQuestionsAfterAnswerConfig, setSuggestedQuestionsAfterAnswerConfig] = useState<SuggestedQuestionsAfterAnswerConfig | null>(null)
const [speechToTextConfig, setSpeechToTextConfig] = useState<SuggestedQuestionsAfterAnswerConfig | null>(null)
const [conversationIdChangeBecauseOfNew, setConversationIdChangeBecauseOfNew, getConversationIdChangeBecauseOfNew] = useGetState(false)
const [isChatStarted, { setTrue: setChatStarted, setFalse: setChatNotStarted }] = useBoolean(false)
...@@ -326,7 +327,7 @@ const Main: FC<IMainProps> = ({
const isNotNewConversation = allConversations.some(item => item.id === _conversationId)
setAllConversationList(allConversations)
// fetch new conversation info
const { user_input_form, opening_statement: introduction, suggested_questions_after_answer }: any = appParams
const { user_input_form, opening_statement: introduction, suggested_questions_after_answer, speech_to_text }: any = appParams
const prompt_variables = userInputsFormToPromptVariables(user_input_form)
if (siteInfo.default_language)
changeLanguage(siteInfo.default_language)
...@@ -341,6 +342,7 @@ const Main: FC<IMainProps> = ({
prompt_variables,
} as PromptConfig)
setSuggestedQuestionsAfterAnswerConfig(suggested_questions_after_answer)
setSpeechToTextConfig(speech_to_text)
// setConversationList(conversations as ConversationItem[])
...@@ -620,6 +622,7 @@ const Main: FC<IMainProps> = ({
controlFocus={controlFocus}
isShowSuggestion={doShowSuggestion}
suggestionList={suggestQuestions}
isShowSpeechToText={speechToTextConfig?.enabled}
/>
</div>
</div>)
...
import { createContext } from 'use-context-selector'
import type { CompletionParams, Inputs, ModelConfig, MoreLikeThisConfig, PromptConfig, SuggestedQuestionsAfterAnswerConfig } from '@/models/debug'
import type { CompletionParams, Inputs, ModelConfig, MoreLikeThisConfig, PromptConfig, SpeechToTextConfig, SuggestedQuestionsAfterAnswerConfig } from '@/models/debug'
import type { DataSet } from '@/models/datasets'
type IDebugConfiguration = {
...@@ -19,6 +19,8 @@ type IDebugConfiguration = {
setMoreLikeThisConfig: (moreLikeThisConfig: MoreLikeThisConfig) => void
suggestedQuestionsAfterAnswerConfig: SuggestedQuestionsAfterAnswerConfig
setSuggestedQuestionsAfterAnswerConfig: (suggestedQuestionsAfterAnswerConfig: SuggestedQuestionsAfterAnswerConfig) => void
speechToTextConfig: SpeechToTextConfig
setSpeechToTextConfig: (speechToTextConfig: SpeechToTextConfig) => void
formattingChanged: boolean
setFormattingChanged: (formattingChanged: boolean) => void
inputs: Inputs
...@@ -59,6 +61,10 @@ const DebugConfigurationContext = createContext<IDebugConfiguration>({
enabled: false,
},
setSuggestedQuestionsAfterAnswerConfig: () => { },
speechToTextConfig: {
enabled: false,
},
setSpeechToTextConfig: () => { },
formattingChanged: false,
setFormattingChanged: () => { },
inputs: {},
...
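A hypothetical consumer of the two new context fields, using the same use-context-selector import the file above relies on (the module path is an assumption):

import { useContext } from 'use-context-selector'
import DebugConfigurationContext from '@/context/debug-configuration' // assumed path

const MicButtonGate = () => {
  const { speechToTextConfig } = useContext(DebugConfigurationContext)
  // Render the mic trigger only when speech-to-text is switched on for the app.
  if (!speechToTextConfig.enabled)
    return null
  return <span>mic button here</span>
}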
...@@ -46,6 +46,11 @@ const translation = {
generateNumTip: 'Number of each generated times',
tip: 'Using this feature will incur additional tokens overhead',
},
speechToText: {
title: 'Speech to Text',
description: 'Once enabled, you can use voice input.',
resDes: 'Voice input is enabled',
},
dataSet: {
title: 'Context',
noData: 'You can import datasets as context',
...
...@@ -46,6 +46,11 @@ const translation = {
generateNumTip: '每次生成数',
tip: '使用此功能将会额外消耗 tokens',
},
speechToText: {
title: '语音转文字',
description: '启用后,您可以使用语音输入。',
resDes: '语音输入已启用',
},
dataSet: {
title: '上下文',
noData: '您可以导入数据集作为上下文',
...
...@@ -225,6 +225,11 @@ const translation = {
viewDoc: 'View documentation',
relatedApp: 'linked apps',
},
voiceInput: {
speaking: 'Speak now...',
converting: 'Converting to text...',
notAllow: 'Microphone not authorized',
},
}
export default translation
...@@ -226,6 +226,11 @@ const translation = {
viewDoc: '查看文档',
relatedApp: '个关联应用',
},
voiceInput: {
speaking: '现在讲...',
converting: '正在转换为文本...',
notAllow: '麦克风未授权',
},
}
export default translation
...@@ -31,6 +31,8 @@ export type MoreLikeThisConfig = {
export type SuggestedQuestionsAfterAnswerConfig = MoreLikeThisConfig
export type SpeechToTextConfig = MoreLikeThisConfig
// frontend use. Not the same as backend
export type ModelConfig = {
provider: string // LLM Provider: for example "OPENAI"
...@@ -43,6 +45,9 @@ export type ModelConfig = {
suggested_questions_after_answer: {
enabled: boolean
} | null
speech_to_text: {
enabled: boolean
} | null
dataSets: any[]
}
...
...@@ -48,6 +48,7 @@
"i18next": "^22.4.13",
"i18next-resources-to-backend": "^1.1.3",
"immer": "^9.0.19",
"js-audio-recorder": "^1.0.7",
"js-cookie": "^3.0.1",
"katex": "^0.16.7",
"lodash-es": "^4.17.21",
...@@ -68,6 +69,7 @@
"react-tooltip": "5.8.3",
"react-window": "^1.8.9",
"react-window-infinite-loader": "^1.0.9",
"recordrtc": "^5.6.2",
"rehype-katex": "^6.0.2",
"remark-breaks": "^3.0.2",
"remark-gfm": "^3.0.1",
...@@ -88,6 +90,7 @@
"@types/js-cookie": "^3.0.3",
"@types/negotiator": "^0.6.1",
"@types/qs": "^6.9.7",
"@types/recordrtc": "^5.6.11",
"@types/sortablejs": "^1.15.1",
"eslint-config-next": "^13.4.7",
"eslint-plugin-react-hooks": "^4.6.0",
...
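js-audio-recorder is the dependency VoiceInput leans on; the calls exercised in this diff are start(), stop(), getRecordAnalyseData() and getWAVBlob(). A standalone sketch of that surface (recordrtc and its types are also added here, though the component above does not use them):

import Recorder from 'js-audio-recorder'

const recorder = new Recorder()
await recorder.start() // prompts for microphone permission and begins capture
const levels = recorder.getRecordAnalyseData() // analyser data, used above to draw the waveform
recorder.stop()
const wav: Blob = recorder.getWAVBlob() // the finished clip as a WAV blob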
...@@ -35,7 +35,9 @@ export type IOnError = (msg: string) => void
type IOtherOptions = {
isPublicAPI?: boolean
bodyStringify?: boolean
needAllResponseContent?: boolean
deleteContentType?: boolean
onData?: IOnData // for stream
onError?: IOnError
onCompleted?: IOnCompleted // for stream
...@@ -132,7 +134,9 @@ const baseFetch = (
fetchOptions: any,
{
isPublicAPI = false,
bodyStringify = true,
needAllResponseContent,
deleteContentType,
}: IOtherOptions,
) => {
const options = Object.assign({}, baseOptions, fetchOptions)
...@@ -141,6 +145,15 @@ const baseFetch = (
options.headers.set('Authorization', `bearer ${sharedToken}`)
}
if (deleteContentType) {
options.headers.delete('Content-Type')
}
else {
const contentType = options.headers.get('Content-Type')
if (!contentType)
options.headers.set('Content-Type', ContentType.json)
}
const urlPrefix = isPublicAPI ? PUBLIC_API_PREFIX : API_PREFIX
let urlWithPrefix = `${urlPrefix}${url.startsWith('/') ? url : `/${url}`}`
...@@ -160,7 +173,7 @@ const baseFetch = (
delete options.params
}
if (body)
if (body && bodyStringify)
options.body = JSON.stringify(body)
// Handle timeout
...@@ -285,6 +298,10 @@ export const ssePost = (url: string, fetchOptions: any, { isPublicAPI = false, o
signal: abortController.signal,
}, fetchOptions)
const contentType = options.headers.get('Content-Type')
if (!contentType)
options.headers.set('Content-Type', ContentType.json)
getAbortController?.(abortController)
const urlPrefix = isPublicAPI ? PUBLIC_API_PREFIX : API_PREFIX
...
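Both new options exist for multipart uploads: a FormData body must not be JSON.stringify'd, and Content-Type has to be left unset so the browser can supply multipart/form-data with its boundary parameter. A sketch of calling the fetch layer this way (the post helper and the endpoint are assumptions, not part of this diff):

const body = new FormData()
body.append('file', wavFile)
await post('/hypothetical-upload', { body }, {
  bodyStringify: false, // pass the FormData object through as the raw fetch body
  deleteContentType: true, // let the browser set the multipart boundary header
})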
...@@ -114,3 +114,7 @@ export const removeMessage = (messageId: string, isInstalledApp: boolean, instal
export const fetchSuggestedQuestions = (messageId: string, isInstalledApp: boolean, installedAppId = '') => {
return (getAction('get', isInstalledApp))(getUrl(`/messages/${messageId}/suggested-questions`, isInstalledApp, installedAppId))
}
export const audioToText = (url: string, isPublicAPI: boolean, body: FormData) => {
return (getAction('post', !isPublicAPI))(url, { body }, { bodyStringify: false, deleteContentType: true }) as Promise<{ text: string }>
}
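Usage, as exercised by VoiceInput above: wrap the recorded WAV in FormData and pick the route per surface (the appId value is illustrative; shared sites pass isPublicAPI = true and hit /audio-to-text):

const formData = new FormData()
formData.append('file', new File([wavBlob], 'a.wav', { type: 'audio/wav' }))
const { text } = await audioToText(`/apps/${appId}/audio-to-text`, false, formData)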
...@@ -85,6 +85,9 @@ export type ModelConfig = {
suggested_questions_after_answer: {
enabled: boolean
}
speech_to_text: {
enabled: boolean
}
agent_mode: {
enabled: boolean
tools: ToolItem[]
...