diff --git a/examples/document-search/from_config.py b/examples/document-search/from_config.py index b68ab64e0..6907a1dd3 100644 --- a/examples/document-search/from_config.py +++ b/examples/document-search/from_config.py @@ -37,6 +37,9 @@ "k": 3, "max_distance": 1.2, }, + "metadata_store": { + "type": "InMemoryMetadataStore", + }, }, }, "reranker": {"type": "ragbits.document_search.retrieval.rerankers.noop:NoopReranker"}, diff --git a/packages/ragbits-core/src/ragbits/core/metadata_stores/__init__.py b/packages/ragbits-core/src/ragbits/core/metadata_stores/__init__.py new file mode 100644 index 000000000..722a45702 --- /dev/null +++ b/packages/ragbits-core/src/ragbits/core/metadata_stores/__init__.py @@ -0,0 +1,30 @@ +import sys + +from ragbits.core.utils.config_handling import get_cls_from_config + +from .base import MetadataStore +from .in_memory import InMemoryMetadataStore + +__all__ = ["InMemoryMetadataStore", "MetadataStore"] + +module = sys.modules[__name__] + + +def get_metadata_store(metadata_store_config: dict | None) -> MetadataStore | None: + """ + Initializes and returns a MetadataStore object based on the provided configuration. + + Args: + metadata_store_config: A dictionary containing configuration details for the MetadataStore. + + Returns: + An instance of the specified MetadataStore class, initialized with the provided config + (if any) or default arguments. + """ + if metadata_store_config is None: + return None + + metadata_store_class = get_cls_from_config(metadata_store_config["type"], module) + config = metadata_store_config.get("config", {}) + + return metadata_store_class(**config) diff --git a/packages/ragbits-core/src/ragbits/core/metadata_stores/base.py b/packages/ragbits-core/src/ragbits/core/metadata_stores/base.py new file mode 100644 index 000000000..5a6ad81d4 --- /dev/null +++ b/packages/ragbits-core/src/ragbits/core/metadata_stores/base.py @@ -0,0 +1,32 @@ +from abc import ABC, abstractmethod + + +class MetadataStore(ABC): + """ + An abstract class for metadata storage. Allows to store, query and retrieve metadata in form of key value pairs. + """ + + @abstractmethod + async def store(self, ids: list[str], metadatas: list[dict]) -> None: + """ + Store metadatas under ids in metadata store. + + Args: + ids: list of unique ids of the entries + metadatas: list of dicts with metadata. + """ + + @abstractmethod + async def get(self, ids: list[str]) -> list[dict]: + """ + Returns metadatas associated with a given ids. + + Args: + ids: list of ids to use. + + Returns: + List of metadata dicts associated with a given ids. + + Raises: + MetadataNotFoundError: If the metadata is not found. + """ diff --git a/packages/ragbits-core/src/ragbits/core/metadata_stores/exceptions.py b/packages/ragbits-core/src/ragbits/core/metadata_stores/exceptions.py new file mode 100644 index 000000000..222206855 --- /dev/null +++ b/packages/ragbits-core/src/ragbits/core/metadata_stores/exceptions.py @@ -0,0 +1,8 @@ +class MetadataNotFoundError(Exception): + """ + Raised when metadata is not found in the metadata store + """ + + def __init__(self, id: str) -> None: + super().__init__(f"Metadata not found for {id} id.") + self.id = id diff --git a/packages/ragbits-core/src/ragbits/core/metadata_stores/in_memory.py b/packages/ragbits-core/src/ragbits/core/metadata_stores/in_memory.py new file mode 100644 index 000000000..6f2a98902 --- /dev/null +++ b/packages/ragbits-core/src/ragbits/core/metadata_stores/in_memory.py @@ -0,0 +1,43 @@ +from ragbits.core.metadata_stores.base import MetadataStore +from ragbits.core.metadata_stores.exceptions import MetadataNotFoundError + + +class InMemoryMetadataStore(MetadataStore): + """ + Metadata Store implemented in memory + """ + + def __init__(self) -> None: + """ + Constructs a new InMemoryMetadataStore instance. + """ + self._storage: dict[str, dict] = {} + + async def store(self, ids: list[str], metadatas: list[dict]) -> None: + """ + Store metadatas under ids in metadata store. + + Args: + ids: list of unique ids of the entries + metadatas: list of dicts with metadata. + """ + for _id, metadata in zip(ids, metadatas, strict=False): + self._storage[_id] = metadata + + async def get(self, ids: list[str]) -> list[dict]: + """ + Returns metadatas associated with a given ids. + + Args: + ids: list of ids to use. + + Returns: + List of metadata dicts associated with a given ids. + + Raises: + MetadataNotFoundError: If the metadata is not found. + """ + try: + return [self._storage[_id] for _id in ids] + except KeyError as exc: + raise MetadataNotFoundError(*exc.args) from exc diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/__init__.py b/packages/ragbits-core/src/ragbits/core/vector_stores/__init__.py index 06a9338a0..b9c16c34a 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/__init__.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/__init__.py @@ -1,5 +1,6 @@ import sys +from ..metadata_stores import get_metadata_store from ..utils.config_handling import get_cls_from_config from .base import VectorStore, VectorStoreEntry, VectorStoreOptions, WhereQuery from .in_memory import InMemoryVectorStore @@ -26,4 +27,8 @@ def get_vector_store(vector_store_config: dict) -> VectorStore: if vector_store_config["type"].endswith("ChromaVectorStore"): return vector_store_cls.from_config(config) - return vector_store_cls(default_options=VectorStoreOptions(**config.get("default_options", {}))) + metadata_store_config = vector_store_config.get("metadata_store_config") + return vector_store_cls( + default_options=VectorStoreOptions(**config.get("default_options", {})), + metadata_store=get_metadata_store(metadata_store_config), + ) diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/base.py b/packages/ragbits-core/src/ragbits/core/vector_stores/base.py index 7f340c5c1..f1c102b1c 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/base.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/base.py @@ -2,6 +2,8 @@ from pydantic import BaseModel +from ragbits.core.metadata_stores.base import MetadataStore + WhereQuery = dict[str, str | int | float | bool] @@ -29,9 +31,21 @@ class VectorStore(ABC): A class with an implementation of Vector Store, allowing to store and retrieve vectors by similarity function. """ - def __init__(self, default_options: VectorStoreOptions | None = None) -> None: + def __init__( + self, + default_options: VectorStoreOptions | None = None, + metadata_store: MetadataStore | None = None, + ) -> None: + """ + Constructs a new VectorStore instance. + + Args: + default_options: The default options for querying the vector store. + metadata_store: The metadata store to use. + """ super().__init__() self._default_options = default_options or VectorStoreOptions() + self._metadata_store = metadata_store @abstractmethod async def store(self, entries: list[VectorStoreEntry]) -> None: diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py index 22dd4a5b1..c8500e7ae 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py @@ -8,6 +8,8 @@ from chromadb import Collection from chromadb.api import ClientAPI +from ragbits.core.metadata_stores import get_metadata_store +from ragbits.core.metadata_stores.base import MetadataStore from ragbits.core.utils.config_handling import get_cls_from_config from ragbits.core.vector_stores.base import VectorStore, VectorStoreEntry, VectorStoreOptions, WhereQuery @@ -23,17 +25,19 @@ def __init__( index_name: str, distance_method: Literal["l2", "ip", "cosine"] = "l2", default_options: VectorStoreOptions | None = None, - ): + metadata_store: MetadataStore | None = None, + ) -> None: """ - Initializes the ChromaVectorStore with the given parameters. + Constructs a new ChromaVectorStore instance. Args: client: The ChromaDB client. index_name: The name of the index. distance_method: The distance method to use. default_options: The default options for querying the vector store. + metadata_store: The metadata store to use. If None, the metadata will be stored in ChromaDB. """ - super().__init__(default_options) + super().__init__(default_options=default_options, metadata_store=metadata_store) self._client = client self._index_name = index_name self._distance_method = distance_method @@ -62,12 +66,13 @@ def from_config(cls, config: dict) -> ChromaVectorStore: Returns: An initialized instance of the ChromaVectorStore class. """ - client = get_cls_from_config(config["client"]["type"], chromadb) # type: ignore + client_cls = get_cls_from_config(config["client"]["type"], chromadb) return cls( - client=client(**config["client"].get("config", {})), + client=client_cls(**config["client"].get("config", {})), index_name=config["index_name"], distance_method=config.get("distance_method", "l2"), default_options=VectorStoreOptions(**config.get("default_options", {})), + metadata_store=get_metadata_store(config.get("metadata_store")), ) async def store(self, entries: list[VectorStoreEntry]) -> None: @@ -77,17 +82,17 @@ async def store(self, entries: list[VectorStoreEntry]) -> None: Args: entries: The entries to store. """ - # TODO: Think about better id components for hashing + # TODO: Think about better id components for hashing and move hash computing to VectorStoreEntry ids = [sha256(entry.key.encode("utf-8")).hexdigest() for entry in entries] + documents = [entry.key for entry in entries] embeddings = [entry.vector for entry in entries] - metadatas = [ - { - "__key": entry.key, - "__metadata": json.dumps(entry.metadata, default=str), - } - for entry in entries - ] - self._collection.add(ids=ids, embeddings=embeddings, metadatas=metadatas) # type: ignore + metadatas = [entry.metadata for entry in entries] + metadatas = ( + [{"__metadata": json.dumps(metadata, default=str)} for metadata in metadatas] + if self._metadata_store is None + else await self._metadata_store.store(ids, metadatas) # type: ignore + ) + self._collection.add(ids=ids, embeddings=embeddings, metadatas=metadatas, documents=documents) # type: ignore async def retrieve(self, vector: list[float], options: VectorStoreOptions | None = None) -> list[VectorStoreEntry]: """ @@ -99,25 +104,37 @@ async def retrieve(self, vector: list[float], options: VectorStoreOptions | None Returns: The retrieved entries. + + Raises: + MetadataNotFoundError: If the metadata is not found. """ options = self._default_options if options is None else options + results = self._collection.query( query_embeddings=vector, n_results=options.k, - include=["metadatas", "embeddings", "distances"], + include=["metadatas", "embeddings", "distances", "documents"], ) + ids = results.get("ids") or [] metadatas = results.get("metadatas") or [] embeddings = results.get("embeddings") or [] distances = results.get("distances") or [] + documents = results.get("documents") or [] + + metadatas = [ + [json.loads(metadata["__metadata"]) for batch in metadatas for metadata in batch] # type: ignore + if self._metadata_store is None + else await self._metadata_store.get(*ids) + ] return [ VectorStoreEntry( - key=str(metadata["__key"]), + key=document, vector=list(embeddings), - metadata=json.loads(str(metadata["__metadata"])), + metadata=metadata, # type: ignore ) - for batch in zip(metadatas, embeddings, distances, strict=False) - for metadata, embeddings, distance in zip(*batch, strict=False) + for batch in zip(metadatas, embeddings, distances, documents, strict=False) + for metadata, embeddings, distance, document in zip(*batch, strict=False) if options.max_distance is None or distance <= options.max_distance ] @@ -135,6 +152,9 @@ async def list( Returns: The entries. + + Raises: + MetadataNotFoundError: If the metadata is not found. """ # Cast `where` to chromadb's Where type where_chroma: chromadb.Where | None = dict(where) if where else None @@ -143,16 +163,24 @@ async def list( where=where_chroma, limit=limit, offset=offset, - include=["metadatas", "embeddings"], + include=["metadatas", "embeddings", "documents"], ) + ids = get_results.get("ids") or [] metadatas = get_results.get("metadatas") or [] embeddings = get_results.get("embeddings") or [] + documents = get_results.get("documents") or [] + + metadatas = ( + [json.loads(metadata["__metadata"]) for metadata in metadatas] # type: ignore + if self._metadata_store is None + else await self._metadata_store.get(ids) + ) return [ VectorStoreEntry( - key=str(metadata["__key"]), + key=document, vector=list(embedding), - metadata=json.loads(str(metadata["__metadata"])), + metadata=metadata, # type: ignore ) - for metadata, embedding in zip(metadatas, embeddings, strict=False) + for metadata, embedding, document in zip(metadatas, embeddings, documents, strict=False) ] diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py b/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py index 7e50558f8..48dd6a886 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py @@ -2,6 +2,7 @@ import numpy as np +from ragbits.core.metadata_stores.base import MetadataStore from ragbits.core.vector_stores.base import VectorStore, VectorStoreEntry, VectorStoreOptions, WhereQuery @@ -10,8 +11,19 @@ class InMemoryVectorStore(VectorStore): A simple in-memory implementation of Vector Store, storing vectors in memory. """ - def __init__(self, default_options: VectorStoreOptions | None = None) -> None: - super().__init__(default_options) + def __init__( + self, + default_options: VectorStoreOptions | None = None, + metadata_store: MetadataStore | None = None, + ) -> None: + """ + Constructs a new InMemoryVectorStore instance. + + Args: + default_options: The default options for querying the vector store. + metadata_store: The metadata store to use. + """ + super().__init__(default_options=default_options, metadata_store=metadata_store) self._storage: dict[str, VectorStoreEntry] = {} async def store(self, entries: list[VectorStoreEntry]) -> None: diff --git a/packages/ragbits-core/tests/unit/metadata_stores/__init__.py b/packages/ragbits-core/tests/unit/metadata_stores/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/packages/ragbits-core/tests/unit/metadata_stores/test_in_memory.py b/packages/ragbits-core/tests/unit/metadata_stores/test_in_memory.py new file mode 100644 index 000000000..321e78902 --- /dev/null +++ b/packages/ragbits-core/tests/unit/metadata_stores/test_in_memory.py @@ -0,0 +1,31 @@ +import pytest + +from ragbits.core.metadata_stores.exceptions import MetadataNotFoundError +from ragbits.core.metadata_stores.in_memory import InMemoryMetadataStore + + +@pytest.fixture +def metadata_store() -> InMemoryMetadataStore: + return InMemoryMetadataStore() + + +async def test_store(metadata_store: InMemoryMetadataStore) -> None: + ids = ["id1", "id2"] + metadatas = [{"key1": "value1"}, {"key2": "value2"}] + await metadata_store.store(ids, metadatas) + assert metadata_store._storage["id1"] == {"key1": "value1"} + assert metadata_store._storage["id2"] == {"key2": "value2"} + + +async def test_get(metadata_store: InMemoryMetadataStore) -> None: + ids = ["id1", "id2"] + metadatas = [{"key1": "value1"}, {"key2": "value2"}] + await metadata_store.store(ids, metadatas) + result = await metadata_store.get(ids) + assert result == [{"key1": "value1"}, {"key2": "value2"}] + + +async def test_get_metadata_not_found(metadata_store: InMemoryMetadataStore) -> None: + ids = ["id1"] + with pytest.raises(MetadataNotFoundError): + await metadata_store.get(ids) diff --git a/packages/ragbits-core/tests/unit/vector_stores/__init__.py b/packages/ragbits-core/tests/unit/vector_stores/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py b/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py index b6a5fbcfe..2f7968480 100644 --- a/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py +++ b/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py @@ -14,7 +14,7 @@ def mock_chromadb_store() -> ChromaVectorStore: ) -def test_get_chroma_collection(mock_chromadb_store: ChromaVectorStore) -> None: +async def test_get_chroma_collection(mock_chromadb_store: ChromaVectorStore) -> None: _ = mock_chromadb_store._get_chroma_collection() assert mock_chromadb_store._client.get_or_create_collection.call_count == 2 # type: ignore @@ -43,11 +43,11 @@ async def test_store(mock_chromadb_store: ChromaVectorStore) -> None: embeddings=[[0.1, 0.2, 0.3]], metadatas=[ { - "__key": "test_key", "__metadata": '{"content": "test content", "document": {"title": "test title", "source":' ' {"path": "/test/path"}, "document_type": "test_type"}}', } ], + documents=["test_key"], ) @@ -74,12 +74,10 @@ async def test_retrieve( "metadatas": [ [ { - "__key": "test_key_1", "__metadata": '{"content": "test content 1", "document": {"title": "test title 1", "source":' ' {"path": "/test/path-1"}, "document_type": "txt"}}', }, { - "__key": "test_key_2", "__metadata": '{"content": "test content 2", "document": {"title": "test title 2", "source":' ' {"path": "/test/path-2"}, "document_type": "txt"}}', }, @@ -87,6 +85,8 @@ async def test_retrieve( ], "embeddings": [[[0.12, 0.25, 0.29], [0.13, 0.26, 0.30]]], "distances": [[0.1, 0.2]], + "documents": [["test_key_1", "test_key_2"]], + "ids": [["test_id_1", "test_id_2"]], } entries = await mock_chromadb_store.retrieve(vector, options=VectorStoreOptions(max_distance=max_distance)) @@ -103,17 +103,17 @@ async def test_list(mock_chromadb_store: ChromaVectorStore) -> None: mock_collection.get.return_value = { # type: ignore "metadatas": [ { - "__key": "test_key", "__metadata": '{"content": "test content", "document": {"title": "test title", "source":' ' {"path": "/test/path"}, "document_type": "test_type"}}', }, { - "__key": "test_key_2", "__metadata": '{"content": "test content 2", "document": {"title": "test title 2", "source":' ' {"path": "/test/path"}, "document_type": "test_type"}}', }, ], "embeddings": [[0.12, 0.25, 0.29], [0.13, 0.26, 0.30]], + "documents": ["test_key", "test_key_2"], + "ids": ["test_id_1", "test_id_2"], } entries = await mock_chromadb_store.list() diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/images.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/images.py index 47539a9a7..4c2280a5b 100644 --- a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/images.py +++ b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/images.py @@ -1,3 +1,4 @@ +import warnings from pathlib import Path from PIL import Image @@ -6,7 +7,9 @@ from unstructured.documents.elements import Element as UnstructuredElement from unstructured.documents.elements import ElementType -from ragbits.core.llms.base import LLM +from ragbits.core.llms.base import LLM, LLMType +from ragbits.core.llms.factory import get_default_llm, has_default_llm +from ragbits.core.llms.litellm import LiteLLM from ragbits.core.prompt import Prompt from ragbits.document_search.documents.document import DocumentMeta, DocumentType from ragbits.document_search.documents.element import Element, ImageElement @@ -95,12 +98,18 @@ async def _to_image_element( img_bytes = crop_and_convert_to_bytes(image, top_x, top_y, bottom_x, bottom_y) prompt = _ImagePrompt(_ImagePromptInput(images=[img_bytes])) - if self._llm: - image_description = await self._llm.generate(prompt=prompt) - elif self.image_describer: - image_description = await self.image_describer.get_image_description(prompt=prompt) - else: - image_description = "" + if self.image_describer is None: + if self._llm is not None: + llm_to_use = self._llm + elif has_default_llm(LLMType.VISION): + llm_to_use = get_default_llm(LLMType.VISION) + else: + warnings.warn( + f"Vision LLM was not provided, setting default option to {DEFAULT_LLM_IMAGE_DESCRIPTION_MODEL}" + ) + llm_to_use = LiteLLM(DEFAULT_LLM_IMAGE_DESCRIPTION_MODEL) + self.image_describer = ImageDescriber(llm_to_use) + image_description = await self.image_describer.get_image_description(prompt=prompt) return ImageElement( description=image_description, ocr_extracted_text=element.text,