Commit 56c57963 authored by StyleZhang's avatar StyleZhang

Merge branch 'feat/chat-support-voice-input' into deploy/dev

parents 9a1bd85b 3f22fdd0
......@@ -147,3 +147,5 @@ docker/volumes/weaviate/*
sdks/python-client/build
sdks/python-client/dist
sdks/python-client/dify_client.egg-info
.vscode/
\ No newline at end of file
......@@ -3,7 +3,7 @@ import json
import logging
from typing import Generator, Union
from flask import Response, stream_with_context
from flask import Response, stream_with_context, request
from flask_login import current_user
from flask_restful import reqparse
from werkzeug.exceptions import InternalServerError, NotFound
......@@ -19,6 +19,8 @@ from core.llm.error import LLMBadRequestError, LLMAPIUnavailableError, LLMAuthor
LLMRateLimitError, ProviderTokenNotInitError, QuotaExceededError, ModelCurrentlyNotSupportError
from libs.helper import uuid_value
from services.completion_service import CompletionService
from services.completion_service import CompletionService
from controllers.console.datasets.error import NoFileUploadedError, TooManyFilesError
# define completion api for user
......@@ -138,6 +140,52 @@ class ChatStopApi(InstalledAppResource):
PubHandler.stop(current_user, task_id)
return {'result': 'success'}, 200
class AudioApi(InstalledAppResource):
    """Speech-to-text endpoint for an installed (explore) chat app.

    Accepts a single multipart file upload named ``file`` and returns the
    transcription produced by AudioService.
    """

    def post(self, installed_app):
        app_model = installed_app.app
        # Voice input is only wired up for chat-mode apps.
        if app_model.mode != 'chat':
            raise NotChatAppError()

        # Validate the upload BEFORE indexing request.files: the original
        # read request.files['file'] first, so a missing file raised a
        # generic 400 (BadRequestKeyError) and the NoFileUploadedError /
        # TooManyFilesError guards below were unreachable.
        if 'file' not in request.files:
            raise NoFileUploadedError()
        if len(request.files) > 1:
            raise TooManyFilesError()
        file = request.files['file']

        # Local import preserved from the original (presumably to avoid an
        # import cycle at module load time — TODO confirm).
        from services.audio_service import AudioService
        try:
            response = AudioService.transcript(
                app_model=app_model,
                file=file,
            )
            return response
        except services.errors.conversation.ConversationNotExistsError:
            raise NotFound("Conversation Not Exists.")
        except services.errors.conversation.ConversationCompletedError:
            raise ConversationCompletedError()
        except services.errors.app_model_config.AppModelConfigBrokenError:
            logging.exception("App model config broken.")
            raise AppUnavailableError()
        except ProviderTokenNotInitError:
            raise ProviderNotInitializeError()
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()
        except (LLMBadRequestError, LLMAPIConnectionError, LLMAPIUnavailableError,
                LLMRateLimitError, LLMAuthorizationError) as e:
            # Surface provider/LLM failures as a single completion error.
            raise CompletionRequestError(str(e))
        except ValueError as e:
            # Validation errors (e.g. file too large / wrong type) propagate
            # unchanged so their specific HTTP codes are kept.
            raise e
        except Exception:
            logging.exception("internal server error.")
            raise InternalServerError()
def compact_response(response: Union[dict | Generator]) -> Response:
......@@ -178,3 +226,4 @@ api.add_resource(CompletionApi, '/installed-apps/<uuid:installed_app_id>/complet
api.add_resource(CompletionStopApi, '/installed-apps/<uuid:installed_app_id>/completion-messages/<string:task_id>/stop', endpoint='installed_app_stop_completion')
api.add_resource(ChatApi, '/installed-apps/<uuid:installed_app_id>/chat-messages', endpoint='installed_app_chat_completion')
api.add_resource(ChatStopApi, '/installed-apps/<uuid:installed_app_id>/chat-messages/<string:task_id>/stop', endpoint='installed_app_stop_chat_completion')
api.add_resource(AudioApi, '/installed-apps/<uuid:installed_app_id>/audio-to-text')
......@@ -3,7 +3,7 @@ import json
import logging
from typing import Generator, Union
from flask import Response, stream_with_context
from flask import Response, stream_with_context, request
from flask_restful import reqparse
from werkzeug.exceptions import InternalServerError, NotFound
......@@ -18,7 +18,8 @@ from core.llm.error import LLMBadRequestError, LLMAPIUnavailableError, LLMAuthor
LLMRateLimitError, ProviderTokenNotInitError, QuotaExceededError, ModelCurrentlyNotSupportError
from libs.helper import uuid_value
from services.completion_service import CompletionService
from services.audio_service import AudioService
from controllers.console.datasets.error import NoFileUploadedError, TooManyFilesError
# define completion api for user
class CompletionApi(WebApiResource):
......@@ -133,6 +134,49 @@ class ChatStopApi(WebApiResource):
PubHandler.stop(end_user, task_id)
return {'result': 'success'}, 200
class AudioApi(WebApiResource):
    """Speech-to-text endpoint for the shared web app.

    Accepts a single multipart file upload named ``file`` and returns the
    transcription produced by AudioService.
    """

    def post(self, app_model, end_user):
        # Voice input is only wired up for chat-mode apps.
        if app_model.mode != 'chat':
            raise NotChatAppError()

        # Validate the upload BEFORE indexing request.files: the original
        # read request.files['file'] first, so a missing file raised a
        # generic 400 (BadRequestKeyError) and the NoFileUploadedError /
        # TooManyFilesError guards below were unreachable.
        if 'file' not in request.files:
            raise NoFileUploadedError()
        if len(request.files) > 1:
            raise TooManyFilesError()
        file = request.files['file']

        try:
            response = AudioService.transcript(
                app_model=app_model,
                file=file,
            )
            return response
        except services.errors.conversation.ConversationNotExistsError:
            raise NotFound("Conversation Not Exists.")
        except services.errors.conversation.ConversationCompletedError:
            raise ConversationCompletedError()
        except services.errors.app_model_config.AppModelConfigBrokenError:
            logging.exception("App model config broken.")
            raise AppUnavailableError()
        except ProviderTokenNotInitError:
            raise ProviderNotInitializeError()
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()
        except (LLMBadRequestError, LLMAPIConnectionError, LLMAPIUnavailableError,
                LLMRateLimitError, LLMAuthorizationError) as e:
            # Surface provider/LLM failures as a single completion error.
            raise CompletionRequestError(str(e))
        except ValueError as e:
            # Validation errors (e.g. file too large / wrong type) propagate
            # unchanged so their specific HTTP codes are kept.
            raise e
        except Exception:
            logging.exception("internal server error.")
            raise InternalServerError()
def compact_response(response: Union[dict | Generator]) -> Response:
......@@ -173,3 +217,4 @@ api.add_resource(CompletionApi, '/completion-messages')
api.add_resource(CompletionStopApi, '/completion-messages/<string:task_id>/stop')
api.add_resource(ChatApi, '/chat-messages')
api.add_resource(ChatStopApi, '/chat-messages/<string:task_id>/stop')
api.add_resource(AudioApi, '/audio-to-text')
import openai
import io
from werkzeug.datastructures import FileStorage
from core.llm.llm_builder import LLMBuilder
from core.llm.provider.llm_provider_service import LLMProviderService
from models.model import App
from controllers.console.datasets.error import FileTooLargeError, UnsupportedFileTypeError
FILE_SIZE_LIMIT = 25 * 1024 * 1024 # 25MB
ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm']
class AudioService:
    """Transcribes uploaded audio to text via OpenAI Whisper."""

    @classmethod
    def transcript(cls, app_model: App, file: FileStorage, **params):
        """Transcribe *file* using the tenant's default provider credentials.

        :param app_model: app whose tenant supplies the OpenAI credentials
        :param file: uploaded audio file (multipart FileStorage)
        :param params: extra keyword arguments forwarded to the Whisper API
                       (e.g. language, prompt)
        :raises FileTooLargeError: if the upload exceeds FILE_SIZE_LIMIT
        :raises UnsupportedFileTypeError: if the mimetype is not an allowed
                                          ``audio/*`` type
        :return: the transcription response from the API
        """
        file_content = file.read()
        file_size = len(file_content)

        if file_size > FILE_SIZE_LIMIT:
            message = f"({file_size} > {FILE_SIZE_LIMIT})"
            raise FileTooLargeError(message)

        # Type check relies on the client-supplied mimetype, e.g. 'audio/wav'.
        extension = file.mimetype
        if extension not in [f'audio/{ext}' for ext in ALLOWED_EXTENSIONS]:
            raise UnsupportedFileTypeError()

        provider_name = LLMBuilder.get_default_provider(app_model.tenant_id)
        provider = LLMProviderService(app_model.tenant_id, provider_name)
        credentials = provider.get_credentials(provider_name)

        # Whisper requires a named file-like object to infer the format.
        buffer = io.BytesIO(file_content)
        buffer.name = 'temp.wav'

        transcript = openai.Audio.transcribe(
            model='whisper-1',
            file=buffer,
            api_key=credentials.get('openai_api_key'),
            api_base=credentials.get('openai_api_base'),
            api_type=credentials.get('openai_api_type'),
            api_version=credentials.get('openai_api_version'),
            # Fix: the original passed `params=params`, which sent the whole
            # dict as a single bogus `params` request field instead of
            # spreading the individual transcription options.
            **params,
        )
        return transcript
\ No newline at end of file
......@@ -63,6 +63,8 @@ export type IChatProps = {
controlFocus?: number
isShowSuggestion?: boolean
suggestionList?: string[]
isInstalledApp: boolean
installedAppId: string
}
export type MessageMore = {
......@@ -425,6 +427,8 @@ const Chat: FC<IChatProps> = ({
controlFocus,
isShowSuggestion,
suggestionList,
isInstalledApp,
installedAppId,
}) => {
const { t } = useTranslation()
const { notify } = useContext(ToastContext)
......@@ -574,7 +578,7 @@ const Chat: FC<IChatProps> = ({
{
query
? (
<div className='flex justify-center items-center w-8 h-8 cursor-pointer' onClick={() => setQuery('')}>
<div className='flex justify-center items-center w-8 h-8 cursor-pointer hover:bg-gray-100 rounded-lg' onClick={() => setQuery('')}>
<XCircle className='w-4 h-4 text-[#98A2B3]' />
</div>
)
......@@ -606,7 +610,14 @@ const Chat: FC<IChatProps> = ({
)}
</div>
{
voiceInputShow && <VoiceInput onCancel={() => setVoiceInputShow(false)} onConverted={() => setVoiceInputShow(false)} />
voiceInputShow && (
<VoiceInput
isInstalledApp={isInstalledApp}
installedAppId={installedAppId}
onCancel={() => setVoiceInputShow(false)}
onConverted={text => setQuery(text)}
/>
)
}
</div>
</div>
......
......@@ -5,13 +5,18 @@ import Recorder from 'js-audio-recorder'
import s from './index.module.css'
import { StopCircle } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices'
import { Loading02, XClose } from '@/app/components/base/icons/src/vender/line/general'
import { audioToText } from '@/service/share'
type VoiceInputTypes = {
isInstalledApp: boolean
installedAppId: string
onConverted: (text: string) => void
onCancel: () => void
}
const VoiceInput = ({
isInstalledApp,
installedAppId,
onCancel,
onConverted,
}: VoiceInputTypes) => {
......@@ -21,6 +26,7 @@ const VoiceInput = ({
const ctxRef = useRef<CanvasRenderingContext2D | null>(null)
const drawRecordId = useRef<number | null>(null)
const [duration, setDuration] = useState('00:00')
const [originDuration, setOriginDuration] = useState(0)
const [startRecord, setStartRecord] = useState(false)
const [startConvert, setStartConvert] = useState(false)
const drawRecord = useCallback(() => {
......@@ -49,7 +55,7 @@ const VoiceInput = ({
}
ctx.closePath()
}, [])
const handleStopRecorder = useCallback(() => {
const handleStopRecorder = useCallback(async () => {
setStartRecord(false)
setStartConvert(true)
recorder.current.stop()
......@@ -58,9 +64,20 @@ const VoiceInput = ({
const canvas = canvasRef.current!
const ctx = ctxRef.current!
ctx.clearRect(0, 0, canvas.width, canvas.height)
// const wavBlob = recorder.current.getWAVBlob()
// const wavFile = new File([wavBlob], 'audio.wav', { type: 'audio/wav' })
// onConverted('')
const wavBlob = recorder.current.getWAVBlob()
const wavFile = new File([wavBlob], 'a.wav', { type: 'audio/wav' })
const formData = new FormData()
formData.append('file', wavFile)
try {
const audioResponse = await audioToText(isInstalledApp, installedAppId, formData)
onConverted(audioResponse.text)
onCancel()
}
catch (e) {
onConverted('')
onCancel()
}
}, [])
const handleStartRecord = () => {
setStartRecord(true)
......@@ -68,10 +85,9 @@ const VoiceInput = ({
recorder.current.start()
recorder.current.onprogress = (params) => {
const originDuration = params.duration
if (originDuration > 65) {
console.log('stop')
setOriginDuration(originDuration)
if (originDuration >= 120)
handleStopRecorder()
}
const minutes = parseInt(`${parseInt(`${originDuration}`) / 60}`)
const seconds = parseInt(`${originDuration}`) % 60
setDuration(`0${minutes.toFixed(0)}:${seconds >= 10 ? seconds : `0${seconds}`}`)
......@@ -140,14 +156,14 @@ const VoiceInput = ({
{
startConvert && (
<div
className='flex justify-center items-center mr-1 w-8 h-8 hover:bg-primary-100 rounded-lg cursor-pointer'
className='flex justify-center items-center mr-1 w-8 h-8 hover:bg-gray-200 rounded-lg cursor-pointer'
onClick={onCancel}
>
<XClose className='w-4 h-4 text-gray-500' />
</div>
)
}
<div className='w-[45px] pl-1 text-xs font-medium text-gray-700'>{duration}</div>
<div className={`w-[45px] pl-1 text-xs font-medium ${originDuration > 110 ? 'text-[#F04438]' : 'text-gray-700'}`}>{duration}</div>
</div>
</div>
)
......
......@@ -620,6 +620,8 @@ const Main: FC<IMainProps> = ({
controlFocus={controlFocus}
isShowSuggestion={doShowSuggestion}
suggestionList={suggestQuestions}
isInstalledApp={isInstalledApp}
installedAppId={installedAppInfo?.id || ''}
/>
</div>
</div>)
......
......@@ -35,7 +35,9 @@ export type IOnError = (msg: string) => void
type IOtherOptions = {
isPublicAPI?: boolean
bodyStringify?: boolean
needAllResponseContent?: boolean
deleteContentType?: boolean
onData?: IOnData // for stream
onError?: IOnError
onCompleted?: IOnCompleted // for stream
......@@ -140,7 +142,9 @@ const baseFetch = (
fetchOptions: any,
{
isPublicAPI = false,
bodyStringify = true,
needAllResponseContent,
deleteContentType,
}: IOtherOptions,
) => {
const options = Object.assign({}, baseOptions, fetchOptions)
......@@ -149,6 +153,9 @@ const baseFetch = (
options.headers.set('Authorization', `bearer ${sharedToken}`)
}
if (deleteContentType)
options.headers.delete('Content-Type')
const urlPrefix = isPublicAPI ? PUBLIC_API_PREFIX : API_PREFIX
let urlWithPrefix = `${urlPrefix}${url.startsWith('/') ? url : `/${url}`}`
......@@ -168,7 +175,7 @@ const baseFetch = (
delete options.params
}
if (body)
if (body && bodyStringify)
options.body = JSON.stringify(body)
// Handle timeout
......
......@@ -114,3 +114,7 @@ export const removeMessage = (messageId: string, isInstalledApp: boolean, instal
export const fetchSuggestedQuestions = (messageId: string, isInstalledApp: boolean, installedAppId = '') => {
  // Resolve the app-scoped URL and the matching GET action, then fire the request.
  const url = getUrl(`/messages/${messageId}/suggested-questions`, isInstalledApp, installedAppId)
  const get = getAction('get', isInstalledApp)
  return get(url)
}
export const audioToText = (isInstalledApp: boolean, installedAppId: string, body: FormData) => {
  // POST raw FormData: disable JSON stringify and drop the Content-Type
  // header so the browser can set the multipart boundary itself.
  const url = getUrl('/audio-to-text', isInstalledApp, installedAppId)
  const post = getAction('post', isInstalledApp)
  return post(url, { body }, { bodyStringify: false, deleteContentType: true }) as Promise<{ text: string }>
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment