Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add lecture transcriptions webhook endpoint #204

Open
wants to merge 28 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
f080268
initalize pyris webhook endpoint
isabellagessl Jan 14, 2025
aa5d967
change data type
isabellagessl Jan 14, 2025
96a7f66
Minor changes
sebastianloose Jan 20, 2025
6eafaf4
Add transcription pipeline
sebastianloose Jan 28, 2025
b57d930
Merge branch 'main' into feature/transcriptions-webhook-endpoint
isabellagessl Feb 3, 2025
72350f1
add summarizing and chunking for lecture transcriptions
isabellagessl Feb 3, 2025
679dd15
add lecture id to transcriptioningestiondto
isabellagessl Feb 4, 2025
1e75c6e
fix token usage
isabellagessl Feb 4, 2025
92d6c3b
Fix chunking
sebastianloose Feb 4, 2025
60b0810
Merge branch 'feature/transcriptions-webhook-endpoint' of github.com:…
sebastianloose Feb 4, 2025
036e31a
Format code
sebastianloose Feb 4, 2025
b1db0bc
Improve code
sebastianloose Feb 4, 2025
d2c1647
add semaphore
isabellagessl Feb 4, 2025
d8b11c9
reformat
isabellagessl Feb 4, 2025
b991690
minor improvements
isabellagessl Feb 4, 2025
ffd9c43
process transcription by lecture unit
isabellagessl Feb 4, 2025
7fa57e4
fix linter error
isabellagessl Feb 10, 2025
8983277
integrate feedback
isabellagessl Feb 11, 2025
216cdf7
handle transcriptions on lecture unit base
isabellagessl Feb 11, 2025
5bd5e0d
fix linters
isabellagessl Feb 11, 2025
5036396
Add semantic chunking
sebastianloose Feb 11, 2025
c81c09e
Merge branch 'feature/transcriptions-webhook-endpoint' of github.com:…
sebastianloose Feb 11, 2025
7458423
Fix linter errors
sebastianloose Feb 11, 2025
ef77b25
Merge branch 'main' into feature/transcriptions-webhook-endpoint
sebastianloose Feb 18, 2025
710fdca
Minor changes
sebastianloose Feb 24, 2025
8624512
fix endpoint dto
isabellagessl Feb 28, 2025
9ac4624
Fix text embedding
sebastianloose Feb 28, 2025
5f34c5a
Minor improvement
sebastianloose Feb 28, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions app/common/PipelineEnum.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,5 @@ class PipelineEnum(str, Enum):
IRIS_FAQ_RETRIEVAL_PIPELINE = "IRIS_FAQ_RETRIEVAL_PIPELINE"
IRIS_INCONSISTENCY_CHECK = "IRIS_INCONSISTENCY_CHECK"
IRIS_REWRITING_PIPELINE = "IRIS_REWRITING_PIPELINE"
IRIS_VIDEO_TRANSCRIPTION_INGESTION = "IRIS_VIDEO_TRANSCRIPTION_INGESTION"
NOT_SET = "NOT_SET"
25 changes: 25 additions & 0 deletions app/domain/data/metrics/transcription_dto.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from typing import List

from pydantic import BaseModel, Field


class TranscriptionSegmentDTO(BaseModel):
    # One segment of a transcription: a piece of text plus the time span it
    # covers. Each field declares the camelCase alias it is populated from.

    # Start of the segment (required). The unit is not stated here --
    # presumably seconds; confirm with the sender of the webhook.
    start_time: float = Field(..., alias="startTime")
    # End of the segment (required); same unit as ``start_time``.
    end_time: float = Field(..., alias="endTime")
    # Text of the segment (required).
    text: str = Field(..., alias="text")
    # Slide number attached to the segment; defaults to 0 when not supplied.
    # NOTE(review): 0 looks like a "no slide" sentinel -- confirm.
    slide_number: int = Field(default=0, alias="slideNumber")


class TranscriptionDTO(BaseModel):
    # A complete transcription: a language tag plus its list of segments.

    # Language of the transcription; defaults to "en" when not supplied.
    language: str = Field(default="en", alias="language")
    # The transcription segments (required).
    segments: List[TranscriptionSegmentDTO] = Field(..., alias="segments")


class TranscriptionWebhookDTO(BaseModel):
    # Webhook payload: one transcription together with the identifiers and
    # names of the lecture, lecture unit and course it belongs to.
    # All fields are required except ``course_description``.

    # The transcription itself.
    transcription: TranscriptionDTO = Field(..., alias="transcription")
    # Identifier and display name of the lecture.
    lecture_id: int = Field(..., alias="lectureId")
    lecture_name: str = Field(..., alias="lectureName")
    # Identifier and display name of the course.
    course_id: int = Field(..., alias="courseId")
    course_name: str = Field(..., alias="courseName")
    # Identifier of the lecture unit the transcription was produced for.
    lecture_unit_id: int = Field(..., alias="lectureUnitId")
    # Optional course description; defaults to the empty string.
    course_description: str = Field("", alias="courseDescription")
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from typing import List, Optional

from pydantic import Field

from app.domain import PipelineExecutionDTO, PipelineExecutionSettingsDTO
from app.domain.data.metrics.transcription_dto import TranscriptionWebhookDTO
from app.domain.status.stage_dto import StageDTO


class TranscriptionIngestionPipelineExecutionDto(PipelineExecutionDTO):
    # Execution DTO that extends ``PipelineExecutionDTO`` with the
    # transcription webhook payload and a lecture unit id.

    # The transcription payload to process.
    transcription: TranscriptionWebhookDTO
    # NOTE(review): camelCase attribute with no alias, unlike ``initial_stages``
    # below; ``TranscriptionWebhookDTO`` also carries ``lecture_unit_id``, so
    # this may be redundant -- confirm before renaming, since callers may read
    # ``lectureUnitId`` directly.
    lectureUnitId: int
    # Declared Optional but without a default value.
    # NOTE(review): whether it may be omitted depends on the pydantic version
    # and on the base class -- confirm.
    settings: Optional[PipelineExecutionSettingsDTO]
    # Stages supplied by the caller under the "initialStages" key; None when
    # not supplied.
    initial_stages: Optional[List[StageDTO]] = Field(
        default=None, alias="initialStages"
    )
36 changes: 25 additions & 11 deletions app/llm/external/openai_embeddings.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import logging
from typing import Literal, Any

from langchain_experimental.text_splitter import SemanticChunker
from openai import (
OpenAI,
APIError,
APITimeoutError,
RateLimitError,
InternalServerError,
)
from openai.lib.azure import AzureOpenAI
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings

from ...llm.external.model import EmbeddingModel
import time
Expand All @@ -16,7 +17,7 @@
class OpenAIEmbeddingModel(EmbeddingModel):
model: str
api_key: str
_client: OpenAI
_client: OpenAIEmbeddings

def embed(self, text: str) -> list[float]:
retries = 5
Expand All @@ -26,12 +27,7 @@ def embed(self, text: str) -> list[float]:

for attempt in range(retries):
try:
response = self._client.embeddings.create(
model=self.model,
input=text,
encoding_format="float",
)
return response.data[0].embedding
return self._client.embed_query(text)
except (
APIError,
APITimeoutError,
Expand All @@ -44,12 +40,30 @@ def embed(self, text: str) -> list[float]:
time.sleep(wait_time)
raise Exception(f"Failed to get embedding from OpenAI after {retries} retries.")

def split_text_semantically(
    self,
    text: str,
    breakpoint_threshold_type: Literal[
        "percentile", "standard_deviation", "interquartile", "gradient"
    ] = "gradient",
    breakpoint_threshold_amount: float = 95.0,
    min_chunk_size: int = 512,
):
    """Split ``text`` into chunks using a ``SemanticChunker``.

    The chunker is built around this model's embeddings client
    (``self._client``); the three tuning parameters are forwarded to it
    unchanged.

    Args:
        text: The text to split.
        breakpoint_threshold_type: Forwarded to ``SemanticChunker``.
        breakpoint_threshold_amount: Forwarded to ``SemanticChunker``.
        min_chunk_size: Forwarded to ``SemanticChunker``.

    Returns:
        The result of ``SemanticChunker.split_text(text)``.
    """
    chunker_options = {
        "breakpoint_threshold_type": breakpoint_threshold_type,
        "breakpoint_threshold_amount": breakpoint_threshold_amount,
        "min_chunk_size": min_chunk_size,
    }
    return SemanticChunker(self._client, **chunker_options).split_text(text)


class DirectOpenAIEmbeddingModel(OpenAIEmbeddingModel):
    """Embedding model that uses an ``OpenAIEmbeddings`` client directly.

    The client is created in ``model_post_init`` from this model's
    ``api_key`` and ``model`` fields.
    """

    type: Literal["openai_embedding"]

    def model_post_init(self, __context: Any) -> None:
        """Create the embeddings client once the model's fields are set."""
        # Pass the configured model explicitly. Requests no longer carry a
        # per-call ``model`` argument, so without this the client would use
        # its own default model and ``self.model`` would be ignored.
        self._client = OpenAIEmbeddings(model=self.model, api_key=self.api_key)

    def __str__(self):
        """Return a short label containing the configured model name."""
        return f"OpenAIEmbedding('{self.model}')"
Expand All @@ -62,7 +76,7 @@ class AzureOpenAIEmbeddingModel(OpenAIEmbeddingModel):
api_version: str

def model_post_init(self, __context: Any) -> None:
self._client = AzureOpenAI(
self._client = AzureOpenAIEmbeddings(
azure_endpoint=self.endpoint,
azure_deployment=self.azure_deployment,
api_version=self.api_version,
Expand Down
17 changes: 16 additions & 1 deletion app/llm/request_handler/basic_request_handler.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional, Sequence, Union, Dict, Any, Type, Callable
from typing import Optional, Sequence, Union, Dict, Any, Type, Callable, Literal

from langchain_core.tools import BaseTool
from pydantic import ConfigDict
Expand Down Expand Up @@ -46,6 +46,21 @@ def embed(self, text: str) -> list[float]:
llm = self.llm_manager.get_llm_by_id(self.model_id)
return llm.embed(text)

def split_text_semantically(
    self,
    text: str,
    breakpoint_threshold_type: Literal[
        "percentile", "standard_deviation", "interquartile", "gradient"
    ] = "gradient",
    breakpoint_threshold_amount: float = 95.0,
    min_chunk_size: int = 512,
):
    """Delegate semantic text splitting to the model selected by ``model_id``.

    Looks the model up through ``self.llm_manager`` and forwards all four
    arguments positionally, in the order they are declared here.

    Args:
        text: The text to split.
        breakpoint_threshold_type: Passed through to the model.
        breakpoint_threshold_amount: Passed through to the model.
        min_chunk_size: Passed through to the model.

    Returns:
        Whatever the model's ``split_text_semantically`` returns.
    """
    embedding_model = self.llm_manager.get_llm_by_id(self.model_id)
    # Kept positional on purpose: the callee's parameter names are not
    # guaranteed to match these.
    forwarded = (
        text,
        breakpoint_threshold_type,
        breakpoint_threshold_amount,
        min_chunk_size,
    )
    return embedding_model.split_text_semantically(*forwarded)

def bind_tools(
self,
tools: Sequence[Union[Dict[str, Any], Type[BaseModel], Callable, BaseTool]],
Expand Down
12 changes: 12 additions & 0 deletions app/pipeline/prompts/transcription_ingestion_prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
def transcription_summary_prompt(lecture_name: str, chunk_content: str):
    """Build the prompt that asks for a summary of a transcription snippet.

    Args:
        lecture_name: Name inserted into the prompt to identify the lecture.
        chunk_content: The text to summarize; placed verbatim on the last
            content line of the prompt.

    Returns:
        The prompt text. It starts and ends with a newline.
    """
    # The empty first and last entries reproduce the leading and trailing
    # newline of the prompt.
    prompt_lines = [
        "",
        "You are an excellent tutor with deep expertise in computer science and practical applications,",
        "teaching at the university level.",
        f"A snippet of the spoken content of one lecture of the lecture {lecture_name} will be given to you.",
        "Please accurately follow the instructions below.",
        "1. Summarize the information in a clear and accurate manner.",
        "2. Do not add additional information.",
        "3. Only answer in complete sentences.",
        "This is the text you should summarize:",
        f"{chunk_content}",
        "",
    ]
    return "\n".join(prompt_lines)
Loading