Update Default Models in HF Prompt Schemas (#1222)

rholinshead · web-flow · commit 9b49d855e35f · 2024-02-13T11:45:29.000-05:00
# Update Default Models in HF Prompt Schemas

Update the prompt schemas for ASR and TTS remote inference to match the defaults set in #1221

<img width="1479" alt="Screenshot 2024-02-13 at 11 25 38 AM" src="https://github.com/lastmile-ai/aiconfig/assets/5060851/28e48030-0fc3-49d2-8c6b-ad0ea142d4d8">

<img width="1465" alt="Screenshot 2024-02-13 at 11 26 42 AM" src="https://github.com/lastmile-ai/aiconfig/assets/5060851/b5d132da-8777-47e9-95f1-73cf4f25a906">

---

[//]: # (BEGIN SAPLING FOOTER)
Stack created with [Sapling](https://sapling-scm.com). Best reviewed with [ReviewStack](https://reviewstack.dev/lastmile-ai/aiconfig/pull/1222).
* __->__ #1222
* #1221
diff --git a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/remote_inference_client/automatic_speech_recognition.py b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/remote_inference_client/automatic_speech_recognition.py
@@ -43,6 +43,13 @@ def refine_completion_params(model_settings: dict[Any, Any]) -> dict[str, Any]:
         if key.lower() in supported_keys:
             completion_data[key.lower()] = model_settings[key]
 
+    # The default model is openai/whisper-large-v3, which does not work as of
+    # 02/13/2024. Instead, default to a free model (which supports remote
+    # inference) with the next most "likes" in HF
+    # https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=likes
+    if completion_data.get("model") is None:
+        completion_data["model"] = "openai/whisper-large-v2"
+
     return completion_data
 
 
@@ -299,7 +306,7 @@ def get_output_text(
             output_data = output.data
             if isinstance(output_data, str):
                 return output_data
-            
+
             else:
                 raise ValueError(
                     f"Invalid output data type {type(output_data)} for prompt '{prompt.name}'. Expected string."
@@ -347,7 +354,7 @@ def validate_and_retrieve_audio_from_attachments(prompt: Prompt) -> str:
         raise ValueError(
             "Multiple audio inputs are not supported for the HF Automatic Speech Recognition Inference api. Please specify a single audio input attachment for Prompt: {prompt.name}."
         )
-    
+
     attachment = prompt.input.attachments[0]
 
     validate_attachment_type_is_audio(attachment)
diff --git a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/remote_inference_client/text_2_speech.py b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/remote_inference_client/text_2_speech.py
@@ -47,6 +47,13 @@ def refine_completion_params(model_settings: dict[Any, Any]) -> dict[str, Any]:
         if key.lower() in supported_keys:
             completion_data[key.lower()] = model_settings[key]
 
+    # The default model is suno/bark, which requires HF Pro subscription
+    # Instead, default to a free model (which supports remote inference) with
+    # the next most "likes" in HF
+    # https://huggingface.co/models?pipeline_tag=text-to-speech&sort=likes
+    if completion_data.get("model") is None:
+        completion_data["model"] = "facebook/fastspeech2-en-ljspeech"
+
     return completion_data
 
 
diff --git a/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceAutomaticSpeechRecognitionRemoteInferencePromptSchema.ts b/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceAutomaticSpeechRecognitionRemoteInferencePromptSchema.ts
@@ -1,43 +1,44 @@
 import { PromptSchema } from "../../utils/promptUtils";
 
-export const HuggingFaceAutomaticSpeechRecognitionRemoteInferencePromptSchema: PromptSchema = {
-  // See https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/inference/_client.py#L302for supported params.
-  // The settings below are supported settings specified in the HuggingFaceAutomaticSpeechRecognitionRemoteInference refine_completion_params implementation.
-  input: {
-    type: "object",
-    required: ["attachments"],
-    properties: {
-      attachments: {
-        type: "array",
-        items: {
-          type: "attachment",
-          required: ["data"],
-          mime_types: [
-            "audio/mpeg",
-            "audio/wav",
-            "audio/webm",
-            "audio/flac",
-            "audio/ogg",
-            "audio/ogg",
-          ],
-          properties: {
-            data: {
-              type: "string",
+export const HuggingFaceAutomaticSpeechRecognitionRemoteInferencePromptSchema: PromptSchema =
+  {
+    // See https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/inference/_client.py#L302 for supported params.
+    // The settings below are supported settings specified in the HuggingFaceAutomaticSpeechRecognitionRemoteInference refine_completion_params implementation.
+    input: {
+      type: "object",
+      required: ["attachments"],
+      properties: {
+        attachments: {
+          type: "array",
+          items: {
+            type: "attachment",
+            required: ["data"],
+            mime_types: [
+              "audio/mpeg",
+              "audio/wav",
+              "audio/webm",
+              "audio/flac",
+              "audio/ogg",
+              "audio/ogg",
+            ],
+            properties: {
+              data: {
+                type: "string",
+              },
             },
           },
+          max_items: 1,
         },
-        max_items: 1,
       },
     },
-  },
-  model_settings: {
-    type: "object",
-    properties: {
-      model: {
-        type: "string",
-        description: `Hugging Face model to use. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed Inference Endpoint`,
-        default: "openai/whisper-large-v3"
+    model_settings: {
+      type: "object",
+      properties: {
+        model: {
+          type: "string",
+          description: `Hugging Face model to use. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed Inference Endpoint`,
+          default: "openai/whisper-large-v2",
+        },
       },
     },
-  },
-};
+  };
diff --git a/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceText2SpeechRemoteInferencePromptSchema.ts b/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceText2SpeechRemoteInferencePromptSchema.ts
@@ -13,7 +13,7 @@ export const HuggingFaceText2SpeechRemoteInferencePromptSchema: PromptSchema = {
         type: "string",
         description: `Hugging Face model to use. Can be a model ID hosted on the Hugging Face Hub or a URL 
         to a deployed Inference Endpoint`,
-        default: "suno/bark",
+        default: "facebook/fastspeech2-en-ljspeech",
       },
     },
   },