ls1intum · sebastianloose · Jan 14, 2025 · Jan 14, 2025 · Jan 20, 2025 · Jan 28, 2025
@@ -18,4 +18,5 @@ class PipelineEnum(str, Enum):
     IRIS_FAQ_RETRIEVAL_PIPELINE = "IRIS_FAQ_RETRIEVAL_PIPELINE"
     IRIS_INCONSISTENCY_CHECK = "IRIS_INCONSISTENCY_CHECK"
     IRIS_REWRITING_PIPELINE = "IRIS_REWRITING_PIPELINE"
+    IRIS_VIDEO_TRANSCRIPTION_INGESTION = "IRIS_VIDEO_TRANSCRIPTION_INGESTION"
     NOT_SET = "NOT_SET"
@@ -0,0 +1,25 @@
+from typing import List
+
+from pydantic import BaseModel, Field
+
+
+class TranscriptionSegmentDTO(BaseModel):  # One timed text segment of a transcription.
+    start_time: float = Field(..., alias="startTime")  # required; populated from "startTime" (unit not shown here — presumably seconds, TODO confirm)
+    end_time: float = Field(..., alias="endTime")  # required; populated from "endTime"
+    text: str = Field(..., alias="text")  # required; alias is identical to the field name
+    slide_number: int = Field(default=0, alias="slideNumber")  # optional; 0 when "slideNumber" is absent (presumably "no slide known" — TODO confirm)
+
+
+class TranscriptionDTO(BaseModel):  # A transcription: a language tag plus its list of segments.
+    language: str = Field(default="en", alias="language")  # optional; defaults to "en" when absent
+    segments: List[TranscriptionSegmentDTO] = Field(..., alias="segments")  # required; each item is validated as a TranscriptionSegmentDTO
+
+
+class TranscriptionWebhookDTO(BaseModel):  # A transcription bundled with the lecture/course fields it belongs to.
+    transcription: TranscriptionDTO = Field(..., alias="transcription")  # required; the transcription payload itself
+    lecture_id: int = Field(..., alias="lectureId")  # required; populated from "lectureId"
+    lecture_name: str = Field(..., alias="lectureName")  # required; populated from "lectureName"
+    course_id: int = Field(..., alias="courseId")  # required; populated from "courseId"
+    course_name: str = Field(..., alias="courseName")  # required; populated from "courseName"
+    lecture_unit_id: int = Field(..., alias="lectureUnitId")  # required; populated from "lectureUnitId"
+    course_description: str = Field("", alias="courseDescription")  # optional; empty string when "courseDescription" is absent
@@ -0,0 +1,16 @@
+from typing import List, Optional
+
+from pydantic import Field
+
+from app.domain import PipelineExecutionDTO, PipelineExecutionSettingsDTO
+from app.domain.data.metrics.transcription_dto import TranscriptionWebhookDTO
+from app.domain.status.stage_dto import StageDTO
+
+
+class TranscriptionIngestionPipelineExecutionDto(PipelineExecutionDTO):  # Execution payload carrying a transcription for ingestion.
+    transcription: TranscriptionWebhookDTO  # required; transcription together with its lecture/course fields
+    lectureUnitId: int  # NOTE(review): camelCase name with no alias, unlike lecture_unit_id on TranscriptionWebhookDTO — confirm this is intended
+    settings: Optional[PipelineExecutionSettingsDTO]  # NOTE(review): Optional but no default; if this is pydantic v2 the key is still required — confirm
+    initial_stages: Optional[List[StageDTO]] = Field(
+        default=None, alias="initialStages"
+    )  # optional; populated from "initialStages", None when absent
@@ -1,13 +1,14 @@
 import logging
 from typing import Literal, Any
+
+from langchain_experimental.text_splitter import SemanticChunker
 from openai import (
-    OpenAI,
     APIError,
     APITimeoutError,
     RateLimitError,
     InternalServerError,
 )
-from openai.lib.azure import AzureOpenAI
+from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
 
 from ...llm.external.model import EmbeddingModel
 import time
@@ -16,7 +17,7 @@
 class OpenAIEmbeddingModel(EmbeddingModel):
     model: str
     api_key: str
-    _client: OpenAI
+    _client: OpenAIEmbeddings
 
     def embed(self, text: str) -> list[float]:
         retries = 5
@@ -26,12 +27,7 @@ def embed(self, text: str) -> list[float]:
 
         for attempt in range(retries):
             try:
-                response = self._client.embeddings.create(
-                    model=self.model,
-                    input=text,
-                    encoding_format="float",
-                )
-                return response.data[0].embedding
+                return self._client.embed_query(text)
             except (
                 APIError,
                 APITimeoutError,
@@ -44,12 +40,30 @@ def embed(self, text: str) -> list[float]:
                 time.sleep(wait_time)
         raise Exception(f"Failed to get embedding from OpenAI after {retries} retries.")
 
+    def split_text_semantically(
+        self,
+        text: str,
+        breakpoint_threshold_type: Literal[
+            "percentile", "standard_deviation", "interquartile", "gradient"
+        ] = "gradient",
+        breakpoint_threshold_amount: float = 95.0,
+        min_chunk_size: int = 512,
+    ):
+        chunker = SemanticChunker(
+            self._client,
+            breakpoint_threshold_type=breakpoint_threshold_type,
+            breakpoint_threshold_amount=breakpoint_threshold_amount,
+            min_chunk_size=min_chunk_size,
+        )
+
+        return chunker.split_text(text)
+
 
 class DirectOpenAIEmbeddingModel(OpenAIEmbeddingModel):
     type: Literal["openai_embedding"]
 
     def model_post_init(self, __context: Any) -> None:
-        self._client = OpenAI(api_key=self.api_key)
+        self._client = OpenAIEmbeddings(api_key=self.api_key)
 
     def __str__(self):
         return f"OpenAIEmbedding('{self.model}')"
@@ -62,7 +76,7 @@ class AzureOpenAIEmbeddingModel(OpenAIEmbeddingModel):
     api_version: str
 
     def model_post_init(self, __context: Any) -> None:
-        self._client = AzureOpenAI(
+        self._client = AzureOpenAIEmbeddings(
             azure_endpoint=self.endpoint,
             azure_deployment=self.azure_deployment,
             api_version=self.api_version,

@@ -1,4 +1,4 @@
-from typing import Optional, Sequence, Union, Dict, Any, Type, Callable
+from typing import Optional, Sequence, Union, Dict, Any, Type, Callable, Literal
 
 from langchain_core.tools import BaseTool
 from pydantic import ConfigDict
@@ -46,6 +46,21 @@ def embed(self, text: str) -> list[float]:
         llm = self.llm_manager.get_llm_by_id(self.model_id)
         return llm.embed(text)
 
+    def split_text_semantically(
+        self,
+        text: str,
+        breakpoint_threshold_type: Literal[
+            "percentile", "standard_deviation", "interquartile", "gradient"
+        ] = "gradient",
+        breakpoint_threshold_amount: float = 95.0,
+        min_chunk_size: int = 512,
+    ):
+        llm = self.llm_manager.get_llm_by_id(self.model_id)
+
+        return llm.split_text_semantically(
+            text, breakpoint_threshold_type, breakpoint_threshold_amount, min_chunk_size
+        )
+
     def bind_tools(
         self,
         tools: Sequence[Union[Dict[str, Any], Type[BaseModel], Callable, BaseTool]],

@@ -0,0 +1,12 @@
+def transcription_summary_prompt(lecture_name: str, chunk_content: str):
+    return f"""
+        You are an excellent tutor with deep expertise in computer science and practical applications,
+        teaching at the university level.
+        A snippet of the spoken content of one lecture of the lecture {lecture_name} will be given to you.
+        Please accurately follow the instructions below.
+        1. Summarize the information in a clear and accurate manner.
+        2. Do not add additional information.
+        3. Only answer in complete sentences.
+        This is the text you should summarize:
+        {chunk_content}
+    """