From 4d655c35618e5a79b471806b697d63a85b5400d0 Mon Sep 17 00:00:00 2001 From: Konrad Czarnota Date: Thu, 24 Oct 2024 10:13:50 +0200 Subject: [PATCH 01/11] feat: support for different MetadataStores in VectorStore --- .../ragbits/core/metadata_store/__init__.py | 0 .../src/ragbits/core/metadata_store/base.py | 71 +++++++++++++++ .../ragbits/core/metadata_store/in_memory.py | 78 ++++++++++++++++ .../src/ragbits/core/vector_store/base.py | 9 +- .../core/vector_store/chromadb_store.py | 88 ++++++++++++------- .../ragbits/core/vector_store/in_memory.py | 6 +- .../test_in_memory_metadata_store.py | 33 +++++++ .../unit/vector_stores/test_chromadb_store.py | 25 ++---- 8 files changed, 262 insertions(+), 48 deletions(-) create mode 100644 packages/ragbits-core/src/ragbits/core/metadata_store/__init__.py create mode 100644 packages/ragbits-core/src/ragbits/core/metadata_store/base.py create mode 100644 packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py create mode 100644 packages/ragbits-core/tests/unit/metadata_store/test_in_memory_metadata_store.py diff --git a/packages/ragbits-core/src/ragbits/core/metadata_store/__init__.py b/packages/ragbits-core/src/ragbits/core/metadata_store/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/packages/ragbits-core/src/ragbits/core/metadata_store/base.py b/packages/ragbits-core/src/ragbits/core/metadata_store/base.py new file mode 100644 index 000000000..e17c5728d --- /dev/null +++ b/packages/ragbits-core/src/ragbits/core/metadata_store/base.py @@ -0,0 +1,71 @@ +import abc +from typing import Any +from uuid import UUID + + +class MetadataStore(abc.ABC): + """ + An abstract class for metadata storage. Allows to store, query and retrieve metadata in form of key value pairs + """ + + @abc.abstractmethod + async def store(self, key: str | UUID, metadata: dict) -> None: + """ + Store metadata under key in metadata store + + Args: + key: unique key of the entry + metadata: dict with metadata + """ + + @abc.abstractmethod + async def query(self, metadata_field_name: str, value: Any) -> dict: + """ + Queries metastore and returns dicts with key: metadata format that match + + Args: + metadata_field_name: name of metadata field + value: value to match against + + Returns: + dict with key: metadata entries that match query + """ + + @abc.abstractmethod + async def get(self, key: str | UUID) -> dict: + """ + Returns metadata associated with a given key + + Args: + key: key to use + + Returns: + metadata dict associated with a given key + """ + + @abc.abstractmethod + async def get_all(self) -> dict: + """ + Returns all keys with associated metadata + + Returns: + metadata dict for all entries + """ + + @abc.abstractmethod + async def store_global(self, metadata: dict) -> None: + """ + Store key value pairs of metadata that is shared across entries + + Args: + metadata: common key value pairs for the whole collection + """ + + @abc.abstractmethod + async def get_global(self) -> dict: + """ + Get key value pairs of metadata that is shared across entries + + Returns + metadata for the whole collection + """ diff --git a/packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py b/packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py new file mode 100644 index 000000000..4ff03095e --- /dev/null +++ b/packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py @@ -0,0 +1,78 @@ +from typing import Any +from uuid import UUID + +from ragbits.core.metadata_store.base import MetadataStore + + +class InMemoryMetadataStore(MetadataStore): + """ + Metadata Store implemented in memory + """ + + def __init__(self) -> None: + self._storage: dict[str | UUID, Any] = {} + self._global_storage: dict[str | UUID, Any] = {} + + async def store(self, key: str | UUID, metadata: dict) -> None: + """ + Store metadata under key in metadata store + + Args: + key: unique key of the entry + metadata: dict with metadata + """ + self._storage[key] = metadata + + async def query(self, metadata_field_name: str, value: Any) -> dict: + """ + Queries metastore and returns dicts with key: metadata format that match + + Args: + metadata_field_name: name of metadata field + value: value to match against + + Returns: + dict with key: metadata entries that match query + """ + return { + key: metadata for key, metadata in self._storage.items() if metadata.get(metadata_field_name, None) == value + } + + async def get(self, key: str | UUID) -> dict: + """ + Returns metadata associated with a given key + + Args: + key: key to use + + Returns: + metadata dict associated with a given key + """ + return self._storage.get(key, {}) + + async def get_all(self) -> dict: + """ + Returns all keys with associated metadata + + Returns: + metadata dict for all entries + """ + return self._storage + + async def store_global(self, metadata: dict) -> None: + """ + Store key value pairs of metadata that is shared across entries + + Args: + metadata: common key value pairs for the whole collection + """ + self._global_storage.update(metadata) + + async def get_global(self) -> dict: + """ + Get key value pairs of metadata that is shared across entries + + Returns: + metadata for the whole collection + """ + return self._global_storage diff --git a/packages/ragbits-core/src/ragbits/core/vector_store/base.py b/packages/ragbits-core/src/ragbits/core/vector_store/base.py index 4d494c561..13f657a8e 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_store/base.py +++ b/packages/ragbits-core/src/ragbits/core/vector_store/base.py @@ -1,8 +1,10 @@ import abc -from typing import List +from typing import List, Optional from pydantic import BaseModel +from ragbits.core.metadata_store.base import MetadataStore + class VectorDBEntry(BaseModel): """ @@ -19,6 +21,11 @@ class VectorStore(abc.ABC): A class with an implementation of Vector Store, allowing to store and retrieve vectors by similarity function. """ + metadata_store: Optional[MetadataStore] + + def __init__(self, metadata_store: Optional[MetadataStore] = None): + self.metadata_store = metadata_store + @abc.abstractmethod async def store(self, entries: List[VectorDBEntry]) -> None: """ diff --git a/packages/ragbits-core/src/ragbits/core/vector_store/chromadb_store.py b/packages/ragbits-core/src/ragbits/core/vector_store/chromadb_store.py index 6bee093e3..1a0369989 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_store/chromadb_store.py +++ b/packages/ragbits-core/src/ragbits/core/vector_store/chromadb_store.py @@ -10,6 +10,7 @@ HAS_CHROMADB = False from ragbits.core.embeddings import Embeddings +from ragbits.core.metadata_store.base import MetadataStore from ragbits.core.utils.config_handling import get_cls_from_config from ragbits.core.vector_store import VectorDBEntry, VectorStore @@ -17,6 +18,13 @@ class ChromaDBStore(VectorStore): """Class that stores text embeddings using [Chroma](https://docs.trychroma.com/)""" + CHROMA_IDS_KEY = "ids" + CHROMA_DOCUMENTS_KEY = "documents" + CHROMA_DISTANCES_KEY = "distances" + CHROMA_metadata_KEY = "metadatas" + DEFAULT_DISTANCE_METHOD = "l2" + METADATA_INNER_KEY = "__key" + def __init__( self, index_name: str, @@ -24,6 +32,7 @@ def __init__( embedding_function: Union[Embeddings, "chromadb.EmbeddingFunction"], max_distance: Optional[float] = None, distance_method: Literal["l2", "ip", "cosine"] = "l2", + metadata_store: Optional[MetadataStore] = None, ): """ Initializes the ChromaDBStore with the given parameters. @@ -38,13 +47,13 @@ def __init__( if not HAS_CHROMADB: raise ImportError("Install the 'ragbits-document-search[chromadb]' extra to use LiteLLM embeddings models") - super().__init__() + super().__init__(metadata_store) self._index_name = index_name self._chroma_client = chroma_client self._embedding_function = embedding_function self._max_distance = max_distance self._metadata = {"hnsw:space": distance_method} - self._collection = self._get_chroma_collection() + self._collection = None @classmethod def from_config(cls, config: dict) -> "ChromaDBStore": @@ -69,10 +78,10 @@ def from_config(cls, config: dict) -> "ChromaDBStore": chroma_client, embedding_function, max_distance=config.get("max_distance"), - distance_method=config.get("distance_method", "l2"), + distance_method=config.get("distance_method", cls.DEFAULT_DISTANCE_METHOD), ) - def _get_chroma_collection(self) -> "chromadb.Collection": + async def _get_chroma_collection(self) -> "chromadb.Collection": """ Based on the selected embedding_function, chooses how to retrieve the ChromaDB collection. If the collection doesn't exist, it creates one. @@ -80,14 +89,23 @@ def _get_chroma_collection(self) -> "chromadb.Collection": Returns: Retrieved collection """ + if self._collection is not None: + return self._collection + + if self.metadata_store is not None: + await self.metadata_store.store_global(self._metadata) + metadata_to_store = None + else: + metadata_to_store = self._metadata if isinstance(self._embedding_function, Embeddings): - return self._chroma_client.get_or_create_collection(name=self._index_name, metadata=self._metadata) + return self._chroma_client.get_or_create_collection(name=self._index_name, metadata=metadata_to_store) - return self._chroma_client.get_or_create_collection( + self._collection = self._chroma_client.get_or_create_collection( name=self._index_name, - metadata=self._metadata, + metadata=metadata_to_store, embedding_function=self._embedding_function, ) + return self._collection def _return_best_match(self, retrieved: dict) -> Optional[str]: """ @@ -99,21 +117,17 @@ def _return_best_match(self, retrieved: dict) -> Optional[str]: Returns: The best match or None if no match is found """ - if self._max_distance is None or retrieved["distances"][0][0] <= self._max_distance: - return retrieved["documents"][0][0] + if self._max_distance is None or retrieved[self.CHROMA_DISTANCES_KEY][0][0] <= self._max_distance: + return retrieved[self.CHROMA_DOCUMENTS_KEY][0][0] return None - def _process_db_entry(self, entry: VectorDBEntry) -> tuple[str, list[float], dict]: + @staticmethod + def _process_db_entry(entry: VectorDBEntry) -> tuple[str, list[float], str, dict]: doc_id = sha256(entry.key.encode("utf-8")).hexdigest() embedding = entry.vector - metadata = { - "__key": entry.key, - "__metadata": json.dumps(entry.metadata, default=str), - } - - return doc_id, embedding, metadata + return doc_id, embedding, entry.key, entry.metadata @property def embedding_function(self) -> Union[Embeddings, "chromadb.EmbeddingFunction"]: @@ -133,9 +147,17 @@ async def store(self, entries: List[VectorDBEntry]) -> None: entries: The entries to store. """ entries_processed = list(map(self._process_db_entry, entries)) - ids, embeddings, metadatas = map(list, zip(*entries_processed)) + ids, embeddings, contents, metadatas = map(list, zip(*entries_processed)) - self._collection.add(ids=ids, embeddings=embeddings, metadatas=metadatas) + if self.metadata_store is not None: + for key, meta in zip(ids, metadatas): + await self.metadata_store.store(key, meta) + metadata_to_store = None + else: + metadata_to_store = [{self.METADATA_INNER_KEY: json.dumps(m, default=str)} for m in metadatas] + + collection = await self._get_chroma_collection() + collection.add(ids=ids, embeddings=embeddings, metadatas=metadata_to_store, documents=contents) async def retrieve(self, vector: List[float], k: int = 5) -> List[VectorDBEntry]: """ @@ -148,18 +170,24 @@ async def retrieve(self, vector: List[float], k: int = 5) -> List[VectorDBEntry] Returns: The retrieved entries. """ - query_result = self._collection.query(query_embeddings=[vector], n_results=k) - - db_entries = [] - for meta in query_result.get("metadatas"): - for result in meta: - db_entry = VectorDBEntry( - key=result["__key"], - vector=vector, - metadata=json.loads(result["__metadata"]), - ) - - db_entries.append(db_entry) + collection = await self._get_chroma_collection() + query_result = collection.query(query_embeddings=[vector], n_results=k) + db_entries: list[VectorDBEntry] = [] + + if len(query_result[self.CHROMA_DOCUMENTS_KEY]) < 1: + return db_entries + for i in range(len(query_result[self.CHROMA_DOCUMENTS_KEY][0])): + key = query_result[self.CHROMA_DOCUMENTS_KEY][0][i] + if self.metadata_store is not None: + metadata = await self.metadata_store.get(query_result[self.CHROMA_IDS_KEY][0][i]) + else: + metadata = json.loads(query_result[self.CHROMA_metadata_KEY][0][i][self.METADATA_INNER_KEY]) + db_entry = VectorDBEntry( + key=key, + vector=vector, + metadata=metadata, + ) + db_entries.append(db_entry) return db_entries diff --git a/packages/ragbits-core/src/ragbits/core/vector_store/in_memory.py b/packages/ragbits-core/src/ragbits/core/vector_store/in_memory.py index ce0576fa6..b3cb307fa 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_store/in_memory.py +++ b/packages/ragbits-core/src/ragbits/core/vector_store/in_memory.py @@ -1,5 +1,8 @@ +from typing import Optional + import numpy as np +from ragbits.core.metadata_store.base import MetadataStore from ragbits.core.vector_store.base import VectorDBEntry, VectorStore @@ -8,7 +11,8 @@ class InMemoryVectorStore(VectorStore): A simple in-memory implementation of Vector Store, storing vectors in memory. """ - def __init__(self) -> None: + def __init__(self, metadata_store: Optional[MetadataStore] = None) -> None: + super().__init__(metadata_store) self._storage: dict[str, VectorDBEntry] = {} async def store(self, entries: list[VectorDBEntry]) -> None: diff --git a/packages/ragbits-core/tests/unit/metadata_store/test_in_memory_metadata_store.py b/packages/ragbits-core/tests/unit/metadata_store/test_in_memory_metadata_store.py new file mode 100644 index 000000000..e7ad80d66 --- /dev/null +++ b/packages/ragbits-core/tests/unit/metadata_store/test_in_memory_metadata_store.py @@ -0,0 +1,33 @@ +from ragbits.core.metadata_store.in_memory import InMemoryMetadataStore + + +async def test_in_memory_vector_store(): + store = InMemoryMetadataStore() + + metadata_key_1_values = {"test1": "test1", "test2": 2} + metadata_key_2_values = {"test1": "test1", "test2": 4} + await store.store("key1", metadata_key_1_values) + await store.store("key2", metadata_key_2_values) + + assert await store.get("key1") == metadata_key_1_values + assert await store.get("key2") == metadata_key_2_values + + assert await store.query("test2", 2) == {"key1": metadata_key_1_values} + assert await store.query("test1", "test1") == {"key1": metadata_key_1_values, "key2": metadata_key_2_values} + + +async def test_in_memory_vector_store_global(): + store = InMemoryMetadataStore() + + metadata = {"test1": "test1", "test2": 2} + await store.store_global(metadata) + assert await store.get_global() == metadata + + +async def test_in_memory_vector_store_all(): + store = InMemoryMetadataStore() + + metadata = {"key1": {"test1": "test1", "test2": 2}, "key2": {"test1": "test1", "test2": 2}} + for k, v in metadata.items(): + await store.store(k, v) + assert await store.get_all() == metadata diff --git a/packages/ragbits-core/tests/unit/vector_stores/test_chromadb_store.py b/packages/ragbits-core/tests/unit/vector_stores/test_chromadb_store.py index 020e756d4..36d79d3e3 100644 --- a/packages/ragbits-core/tests/unit/vector_stores/test_chromadb_store.py +++ b/packages/ragbits-core/tests/unit/vector_stores/test_chromadb_store.py @@ -66,12 +66,6 @@ def test_chromadbstore_init_import_error(): ChromaDBStore(index_name="test_index", chroma_client=MagicMock(), embedding_function=MagicMock()) -def test_get_chroma_collection(mock_chromadb_store): - _ = mock_chromadb_store._get_chroma_collection() - - assert mock_chromadb_store._chroma_client.get_or_create_collection.called - - async def test_stores_entries_correctly(mock_chromadb_store): data = [ VectorDBEntry( @@ -90,15 +84,15 @@ async def test_stores_entries_correctly(mock_chromadb_store): def test_process_db_entry(mock_chromadb_store, mock_vector_db_entry): - id, embedding, metadata = mock_chromadb_store._process_db_entry(mock_vector_db_entry) + id, embedding, key, metadata = mock_chromadb_store._process_db_entry(mock_vector_db_entry) assert id == sha256(b"test_key").hexdigest() assert embedding == [0.1, 0.2, 0.3] - assert ( - metadata["__metadata"] - == '{"content": "test content", "document": {"title": "test title", "source": {"path": "/test/path"}, "document_type": "test_type"}}' - ) - assert metadata["__key"] == "test_key" + assert metadata == { + "content": "test content", + "document": {"title": "test title", "source": {"path": "/test/path"}, "document_type": "test_type"}, + } + assert key == "test_key" async def test_store(mock_chromadb_store, mock_vector_db_entry): @@ -109,14 +103,13 @@ async def test_store(mock_chromadb_store, mock_vector_db_entry): async def test_retrieves_entries_correctly(mock_chromadb_store): vector = [0.1, 0.2, 0.3] - mock_collection = mock_chromadb_store._get_chroma_collection() + mock_collection = await mock_chromadb_store._get_chroma_collection() mock_collection.query.return_value = { "documents": [["test content"]], "metadatas": [ [ { - "__key": "test_key", - "__metadata": '{"content": "test content", "document": {"title": "test title", "source": {"path": "/test/path"}, "document_type": "test_type"}}', + "__key": '{"content": "test content", "document": {"title": "test title", "source": {"path": "/test/path"}, "document_type": "test_type"}}' } ] ], @@ -131,7 +124,7 @@ async def test_retrieves_entries_correctly(mock_chromadb_store): async def test_handles_empty_retrieve(mock_chromadb_store): vector = [0.1, 0.2, 0.3] - mock_collection = mock_chromadb_store._get_chroma_collection() + mock_collection = await mock_chromadb_store._get_chroma_collection() mock_collection.query.return_value = {"documents": [], "metadatas": []} entries = await mock_chromadb_store.retrieve(vector) From af9dc067a53f1bb5bc160ca5a2cbe8bd17ec234b Mon Sep 17 00:00:00 2001 From: Konrad Czarnota Date: Fri, 25 Oct 2024 09:37:30 +0200 Subject: [PATCH 02/11] Linter fixes --- .../src/ragbits/core/metadata_store/base.py | 4 +- .../ragbits/core/metadata_store/in_memory.py | 2 +- .../src/ragbits/core/vector_store/base.py | 5 +- .../core/vector_store/chromadb_store.py | 78 ++++++++++--------- .../ragbits/core/vector_store/in_memory.py | 3 +- .../unit/vector_stores/test_chromadb_store.py | 16 ++-- 6 files changed, 57 insertions(+), 51 deletions(-) diff --git a/packages/ragbits-core/src/ragbits/core/metadata_store/base.py b/packages/ragbits-core/src/ragbits/core/metadata_store/base.py index e17c5728d..b317682ab 100644 --- a/packages/ragbits-core/src/ragbits/core/metadata_store/base.py +++ b/packages/ragbits-core/src/ragbits/core/metadata_store/base.py @@ -19,7 +19,7 @@ async def store(self, key: str | UUID, metadata: dict) -> None: """ @abc.abstractmethod - async def query(self, metadata_field_name: str, value: Any) -> dict: + async def query(self, metadata_field_name: str, value: Any) -> dict: # noqa """ Queries metastore and returns dicts with key: metadata format that match @@ -66,6 +66,6 @@ async def get_global(self) -> dict: """ Get key value pairs of metadata that is shared across entries - Returns + Returns: metadata for the whole collection """ diff --git a/packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py b/packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py index 4ff03095e..05e6d3c13 100644 --- a/packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py +++ b/packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py @@ -23,7 +23,7 @@ async def store(self, key: str | UUID, metadata: dict) -> None: """ self._storage[key] = metadata - async def query(self, metadata_field_name: str, value: Any) -> dict: + async def query(self, metadata_field_name: str, value: Any) -> dict: # noqa """ Queries metastore and returns dicts with key: metadata format that match diff --git a/packages/ragbits-core/src/ragbits/core/vector_store/base.py b/packages/ragbits-core/src/ragbits/core/vector_store/base.py index 647e7c8f6..4b9a1d31b 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_store/base.py +++ b/packages/ragbits-core/src/ragbits/core/vector_store/base.py @@ -1,5 +1,4 @@ import abc -from typing import Optional from pydantic import BaseModel @@ -24,9 +23,9 @@ class VectorStore(abc.ABC): A class with an implementation of Vector Store, allowing to store and retrieve vectors by similarity function. """ - metadata_store: Optional[MetadataStore] + metadata_store: MetadataStore | None - def __init__(self, metadata_store: Optional[MetadataStore] = None): + def __init__(self, metadata_store: MetadataStore | None = None): self.metadata_store = metadata_store @abc.abstractmethod diff --git a/packages/ragbits-core/src/ragbits/core/vector_store/chromadb_store.py b/packages/ragbits-core/src/ragbits/core/vector_store/chromadb_store.py index 4d31c0389..a2087bcb9 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_store/chromadb_store.py +++ b/packages/ragbits-core/src/ragbits/core/vector_store/chromadb_store.py @@ -2,7 +2,7 @@ import json from hashlib import sha256 -from typing import List, Literal, Optional +from typing import Literal try: import chromadb @@ -16,18 +16,19 @@ from ragbits.core.utils.config_handling import get_cls_from_config from ragbits.core.vector_store import VectorDBEntry, VectorStore, WhereQuery +CHROMA_IDS_KEY = "ids" +CHROMA_DOCUMENTS_KEY = "documents" +CHROMA_DISTANCES_KEY = "distances" +CHROMA_METADATA_KEY = "metadatas" +CHROMA_EMBEDDINGS_KEY = "embeddings" +CHROMA_INCLUDE_KEYS = [CHROMA_DOCUMENTS_KEY, CHROMA_DISTANCES_KEY, CHROMA_METADATA_KEY, CHROMA_EMBEDDINGS_KEY] + class ChromaDBStore(VectorStore): """ Class that stores text embeddings using [Chroma](https://docs.trychroma.com/). """ - CHROMA_IDS_KEY = "ids" - CHROMA_DOCUMENTS_KEY = "documents" - CHROMA_DISTANCES_KEY = "distances" - CHROMA_METADATA_KEY = "metadatas" - CHROMA_EMBEDDINGS_KEY = "embeddings" - CHROMA_INCLUDE_KEYS = [CHROMA_DOCUMENTS_KEY, CHROMA_DISTANCES_KEY, CHROMA_METADATA_KEY, CHROMA_EMBEDDINGS_KEY] DEFAULT_DISTANCE_METHOD = "l2" METADATA_INNER_KEY = "__metadata" @@ -38,7 +39,7 @@ def __init__( embedding_function: Embeddings | chromadb.EmbeddingFunction, max_distance: float | None = None, distance_method: Literal["l2", "ip", "cosine"] = "l2", - metadata_store: Optional[MetadataStore] = None, + metadata_store: MetadataStore | None = None, ): """ Initializes the ChromaDBStore with the given parameters. @@ -49,6 +50,7 @@ def __init__( embedding_function: The embedding function. max_distance: The maximum distance for similarity. distance_method: The distance method to use. + metadata_store: The metadata store to use. """ if not HAS_CHROMADB: raise ImportError("Install the 'ragbits-document-search[chromadb]' extra to use LiteLLM embeddings models") @@ -59,7 +61,7 @@ def __init__( self._embedding_function = embedding_function self._max_distance = max_distance self._metadata = {"hnsw:space": distance_method} - self._collection = None + self._collection: chromadb.Collection | None = None @classmethod def from_config(cls, config: dict) -> ChromaDBStore: @@ -123,8 +125,8 @@ def _return_best_match(self, retrieved: dict) -> str | None: Returns: The best match or None if no match is found. """ - if self._max_distance is None or retrieved[self.CHROMA_DISTANCES_KEY][0][0] <= self._max_distance: - return retrieved[self.CHROMA_DOCUMENTS_KEY][0][0] + if self._max_distance is None or retrieved[CHROMA_DISTANCES_KEY][0][0] <= self._max_distance: + return retrieved[CHROMA_DOCUMENTS_KEY][0][0] return None @@ -156,16 +158,39 @@ async def store(self, entries: list[VectorDBEntry]) -> None: ids, embeddings, contents, metadatas = map(list, zip(*entries_processed, strict=False)) if self.metadata_store is not None: - for key, meta in zip(ids, metadatas): + for key, meta in zip(ids, metadatas, strict=False): await self.metadata_store.store(key, meta) metadata_to_store = None else: metadata_to_store = [{self.METADATA_INNER_KEY: json.dumps(m, default=str)} for m in metadatas] collection = await self._get_chroma_collection() - collection.add(ids=ids, embeddings=embeddings, metadatas=metadata_to_store, documents=contents) + collection.add(ids=ids, embeddings=embeddings, metadatas=metadata_to_store, documents=contents) # type: ignore + + async def _extract_entries_from_query( + self, query_results: chromadb.api.types.QueryResult | chromadb.api.types.GetResult + ) -> list[VectorDBEntry]: + db_entries: list[VectorDBEntry] = [] + + if len(query_results[CHROMA_DOCUMENTS_KEY]) < 1: # type: ignore + return db_entries + for i in range(len(query_results[CHROMA_DOCUMENTS_KEY][0])): # type: ignore + key = query_results[CHROMA_DOCUMENTS_KEY][0][i] # type: ignore + if self.metadata_store is not None: + metadata = await self.metadata_store.get(query_results[CHROMA_IDS_KEY][0][i]) # type: ignore + else: + metadata = json.loads(query_results[CHROMA_METADATA_KEY][0][i][self.METADATA_INNER_KEY]) # type: ignore + + db_entry = VectorDBEntry( + key=key, + vector=query_results[CHROMA_EMBEDDINGS_KEY][0][i], # type: ignore + metadata=metadata, + ) + db_entries.append(db_entry) + + return db_entries - async def retrieve(self, vector: List[float], k: int = 5) -> List[VectorDBEntry]: + async def retrieve(self, vector: list[float], k: int = 5) -> list[VectorDBEntry]: """ Retrieves entries from the ChromaDB collection. @@ -177,7 +202,7 @@ async def retrieve(self, vector: List[float], k: int = 5) -> List[VectorDBEntry] The retrieved entries. """ collection = await self._get_chroma_collection() - query_result = collection.query(query_embeddings=[vector], n_results=k, include=self.CHROMA_INCLUDE_KEYS) + query_result = collection.query(query_embeddings=[vector], n_results=k, include=CHROMA_INCLUDE_KEYS) # type: ignore return await self._extract_entries_from_query(query_result) async def list( @@ -199,30 +224,9 @@ async def list( where_chroma: chromadb.Where | None = dict(where) if where else None collection = await self._get_chroma_collection() - get_results = collection.get(where=where_chroma, limit=limit, offset=offset, include=self.CHROMA_INCLUDE_KEYS) + get_results = collection.get(where=where_chroma, limit=limit, offset=offset, include=CHROMA_INCLUDE_KEYS) # type: ignore return await self._extract_entries_from_query(get_results) - async def _extract_entries_from_query(self, query_results: chromadb.api.types.QueryResult | chromadb.api.types.GetResult) -> List[VectorDBEntry]: - db_entries: list[VectorDBEntry] = [] - - if len(query_results[self.CHROMA_DOCUMENTS_KEY]) < 1: - return db_entries - for i in range(len(query_results[self.CHROMA_DOCUMENTS_KEY][0])): - key = query_results[self.CHROMA_DOCUMENTS_KEY][0][i] - if self.metadata_store is not None: - metadata = await self.metadata_store.get(query_results[self.CHROMA_IDS_KEY][0][i]) - else: - metadata = json.loads(query_results[self.CHROMA_METADATA_KEY][0][i][self.METADATA_INNER_KEY]) - - db_entry = VectorDBEntry( - key=key, - vector=query_results[self.CHROMA_EMBEDDINGS_KEY][0][i], - metadata=metadata, - ) - db_entries.append(db_entry) - - return db_entries - def __repr__(self) -> str: """ Returns the string representation of the object. diff --git a/packages/ragbits-core/src/ragbits/core/vector_store/in_memory.py b/packages/ragbits-core/src/ragbits/core/vector_store/in_memory.py index 802aaa1e4..1f2a3e96e 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_store/in_memory.py +++ b/packages/ragbits-core/src/ragbits/core/vector_store/in_memory.py @@ -1,5 +1,4 @@ from itertools import islice -from typing import Optional import numpy as np @@ -12,7 +11,7 @@ class InMemoryVectorStore(VectorStore): A simple in-memory implementation of Vector Store, storing vectors in memory. """ - def __init__(self, metadata_store: Optional[MetadataStore] = None) -> None: + def __init__(self, metadata_store: MetadataStore | None = None) -> None: super().__init__(metadata_store) self._storage: dict[str, VectorDBEntry] = {} diff --git a/packages/ragbits-core/tests/unit/vector_stores/test_chromadb_store.py b/packages/ragbits-core/tests/unit/vector_stores/test_chromadb_store.py index 29aee5d6c..97adaf483 100644 --- a/packages/ragbits-core/tests/unit/vector_stores/test_chromadb_store.py +++ b/packages/ragbits-core/tests/unit/vector_stores/test_chromadb_store.py @@ -75,6 +75,7 @@ def test_chromadbstore_init_import_error(): embedding_function=MagicMock(), ) + async def test_stores_entries_correctly(mock_chromadb_store: ChromaDBStore): data = [ VectorDBEntry( @@ -96,7 +97,7 @@ async def test_stores_entries_correctly(mock_chromadb_store: ChromaDBStore): mock_chromadb_store._chroma_client.get_or_create_collection().add.assert_called_once() # type: ignore -def test_process_db_entry(mock_chromadb_store, mock_vector_db_entry): +def test_process_db_entry(mock_chromadb_store: ChromaDBStore, mock_vector_db_entry: VectorDBEntry): id, embedding, key, metadata = mock_chromadb_store._process_db_entry(mock_vector_db_entry) assert id == sha256(b"test_key").hexdigest() @@ -122,7 +123,8 @@ async def test_retrieves_entries_correctly(mock_chromadb_store: ChromaDBStore): "metadatas": [ [ { - "__metadata": '{"content": "test content", "document": {"title": "test title", "source": {"path": "/test/path"}, "document_type": "test_type"}}' + "__metadata": '{"content": "test content", "document": {"title": "test title", ' + '"source": {"path": "/test/path"}, "document_type": "test_type"}}' } ] ], @@ -137,17 +139,19 @@ async def test_retrieves_entries_correctly(mock_chromadb_store: ChromaDBStore): assert entries[0].vector == [0.12, 0.25, 0.29] -async def test_lists_entries_correctly(mock_chromadb_store): +async def test_lists_entries_correctly(mock_chromadb_store: ChromaDBStore): mock_collection = await mock_chromadb_store._get_chroma_collection() - mock_collection.get.return_value = { + mock_collection.get.return_value = { # type: ignore "documents": [["test content", "test content 2"]], "metadatas": [ [ { - "__metadata": '{"content": "test content", "document": {"title": "test title", "source": {"path": "/test/path"}, "document_type": "test_type"}}', + "__metadata": '{"content": "test content", "document": {"title": "test title", ' + '"source": {"path": "/test/path"}, "document_type": "test_type"}}', }, { - "__metadata": '{"content": "test content 2", "document": {"title": "test title 2", "source": {"path": "/test/path"}, "document_type": "test_type"}}', + "__metadata": '{"content": "test content 2", "document": {"title": "test title 2", ' + '"source": {"path": "/test/path"}, "document_type": "test_type"}}', }, ] ], From c3769083885451e23e10592866284560c8b2704b Mon Sep 17 00:00:00 2001 From: Konrad Czarnota Date: Fri, 25 Oct 2024 10:37:04 +0200 Subject: [PATCH 03/11] Fix ruff --- packages/ragbits-core/src/ragbits/core/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/ragbits-core/src/ragbits/core/cli.py b/packages/ragbits-core/src/ragbits/core/cli.py index f1ac8deed..c359312e5 100644 --- a/packages/ragbits-core/src/ragbits/core/cli.py +++ b/packages/ragbits-core/src/ragbits/core/cli.py @@ -81,7 +81,6 @@ def execute( Raises: ValueError: If `llm_factory` is not provided. """ - from ragbits.core.llms.factory import get_llm_from_factory prompt = _render(prompt_path=prompt_path, payload=payload) From b8e116b36a6759e714a725888ba98032fc3fd5b4 Mon Sep 17 00:00:00 2001 From: Konrad Czarnota Date: Tue, 29 Oct 2024 09:47:54 +0100 Subject: [PATCH 04/11] Fix linte --- packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py index a83100868..25852231b 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py @@ -190,5 +190,5 @@ async def list( vector=list(embedding), metadata=await self._load_sample_metadata(metadata, sample_id), ) - for metadata, embedding, sample_id, document in zip(metadatas, embeddings, ids, documents, strict=False) # type: ignore + for metadata, embedding, sample_id, document in zip(metadatas, embeddings, ids, documents, strict=False) # type: ignore ] From 837b5d857fb449f98b7e9e86ce4e37cf8dc4643e Mon Sep 17 00:00:00 2001 From: Konrad Czarnota Date: Tue, 29 Oct 2024 12:57:52 +0100 Subject: [PATCH 05/11] Review fixes --- .../src/ragbits/core/metadata_store/base.py | 27 ------------------ .../ragbits/core/metadata_store/in_memory.py | 28 ------------------- .../src/ragbits/core/vector_stores/chroma.py | 9 +----- .../test_in_memory_metadata_store.py | 17 ----------- .../providers/unstructured/images.py | 23 ++++++++++----- 5 files changed, 17 insertions(+), 87 deletions(-) diff --git a/packages/ragbits-core/src/ragbits/core/metadata_store/base.py b/packages/ragbits-core/src/ragbits/core/metadata_store/base.py index b317682ab..e513d1fe5 100644 --- a/packages/ragbits-core/src/ragbits/core/metadata_store/base.py +++ b/packages/ragbits-core/src/ragbits/core/metadata_store/base.py @@ -42,30 +42,3 @@ async def get(self, key: str | UUID) -> dict: Returns: metadata dict associated with a given key """ - - @abc.abstractmethod - async def get_all(self) -> dict: - """ - Returns all keys with associated metadata - - Returns: - metadata dict for all entries - """ - - @abc.abstractmethod - async def store_global(self, metadata: dict) -> None: - """ - Store key value pairs of metadata that is shared across entries - - Args: - metadata: common key value pairs for the whole collection - """ - - @abc.abstractmethod - async def get_global(self) -> dict: - """ - Get key value pairs of metadata that is shared across entries - - Returns: - metadata for the whole collection - """ diff --git a/packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py b/packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py index 05e6d3c13..252b60595 100644 --- a/packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py +++ b/packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py @@ -11,7 +11,6 @@ class InMemoryMetadataStore(MetadataStore): def __init__(self) -> None: self._storage: dict[str | UUID, Any] = {} - self._global_storage: dict[str | UUID, Any] = {} async def store(self, key: str | UUID, metadata: dict) -> None: """ @@ -49,30 +48,3 @@ async def get(self, key: str | UUID) -> dict: metadata dict associated with a given key """ return self._storage.get(key, {}) - - async def get_all(self) -> dict: - """ - Returns all keys with associated metadata - - Returns: - metadata dict for all entries - """ - return self._storage - - async def store_global(self, metadata: dict) -> None: - """ - Store key value pairs of metadata that is shared across entries - - Args: - metadata: common key value pairs for the whole collection - """ - self._global_storage.update(metadata) - - async def get_global(self) -> dict: - """ - Get key value pairs of metadata that is shared across entries - - Returns: - metadata for the whole collection - """ - return self._global_storage diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py index 25852231b..013c84ee9 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py @@ -62,16 +62,9 @@ async def _get_chroma_collection(self) -> Collection: if self._collection is not None: return self._collection - global_metadata = {"hnsw:space": self._distance_method} - if self.metadata_store is not None: - await self.metadata_store.store_global(global_metadata) - metadata_to_store = None - else: - metadata_to_store = global_metadata - self._collection = self._client.get_or_create_collection( name=self._index_name, - metadata=metadata_to_store, + metadata={"hnsw:space": self._distance_method}, ) return self._collection diff --git a/packages/ragbits-core/tests/unit/metadata_store/test_in_memory_metadata_store.py b/packages/ragbits-core/tests/unit/metadata_store/test_in_memory_metadata_store.py index e7ad80d66..5996463c6 100644 --- a/packages/ragbits-core/tests/unit/metadata_store/test_in_memory_metadata_store.py +++ b/packages/ragbits-core/tests/unit/metadata_store/test_in_memory_metadata_store.py @@ -14,20 +14,3 @@ async def test_in_memory_vector_store(): assert await store.query("test2", 2) == {"key1": metadata_key_1_values} assert await store.query("test1", "test1") == {"key1": metadata_key_1_values, "key2": metadata_key_2_values} - - -async def test_in_memory_vector_store_global(): - store = InMemoryMetadataStore() - - metadata = {"test1": "test1", "test2": 2} - await store.store_global(metadata) - assert await store.get_global() == metadata - - -async def test_in_memory_vector_store_all(): - store = InMemoryMetadataStore() - - metadata = {"key1": {"test1": "test1", "test2": 2}, "key2": {"test1": "test1", "test2": 2}} - for k, v in metadata.items(): - await store.store(k, v) - assert await store.get_all() == metadata diff --git a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/images.py b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/images.py index 47539a9a7..4c2280a5b 100644 --- a/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/images.py +++ b/packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/images.py @@ -1,3 +1,4 @@ +import warnings from pathlib import Path from PIL import Image @@ -6,7 +7,9 @@ from unstructured.documents.elements import Element as UnstructuredElement from unstructured.documents.elements import ElementType -from ragbits.core.llms.base import LLM +from ragbits.core.llms.base import LLM, LLMType +from ragbits.core.llms.factory import get_default_llm, has_default_llm +from ragbits.core.llms.litellm import LiteLLM from ragbits.core.prompt import Prompt from ragbits.document_search.documents.document import DocumentMeta, DocumentType from ragbits.document_search.documents.element import Element, ImageElement @@ -95,12 +98,18 @@ async def _to_image_element( img_bytes = crop_and_convert_to_bytes(image, top_x, top_y, bottom_x, bottom_y) prompt = _ImagePrompt(_ImagePromptInput(images=[img_bytes])) - if self._llm: - image_description = await self._llm.generate(prompt=prompt) - elif self.image_describer: - image_description = await self.image_describer.get_image_description(prompt=prompt) - else: - image_description = "" + if self.image_describer is None: + if self._llm is not None: + llm_to_use = self._llm + elif has_default_llm(LLMType.VISION): + llm_to_use = get_default_llm(LLMType.VISION) + else: + warnings.warn( + f"Vision LLM was not provided, setting default option to {DEFAULT_LLM_IMAGE_DESCRIPTION_MODEL}" + ) + llm_to_use = LiteLLM(DEFAULT_LLM_IMAGE_DESCRIPTION_MODEL) + self.image_describer = ImageDescriber(llm_to_use) + image_description = await self.image_describer.get_image_description(prompt=prompt) return ImageElement( description=image_description, ocr_extracted_text=element.text, From 2dcc00fe0c5ad7802032780cc5245a84467ff9c4 Mon Sep 17 00:00:00 2001 From: Konrad Czarnota Date: Tue, 29 Oct 2024 13:41:59 +0100 Subject: [PATCH 06/11] Review fixes 2 --- .../src/ragbits/core/vector_stores/base.py | 4 +- .../src/ragbits/core/vector_stores/chroma.py | 38 +++++++++---------- .../tests/unit/vector_stores/test_chroma.py | 8 ++-- 3 files changed, 24 insertions(+), 26 deletions(-) diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/base.py b/packages/ragbits-core/src/ragbits/core/vector_stores/base.py index 1f96d7c4a..9f7cce40a 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/base.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/base.py @@ -31,12 +31,10 @@ class VectorStore(ABC): A class with an implementation of Vector Store, allowing to store and retrieve vectors by similarity function. """ - metadata_store: MetadataStore | None - def __init__(self, default_options: VectorStoreOptions | None = None, metadata_store: MetadataStore | None = None): super().__init__() self._default_options = default_options or VectorStoreOptions() - self.metadata_store = metadata_store + self._metadata_store = metadata_store @abstractmethod async def store(self, entries: list[VectorStoreEntry]) -> None: diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py index 013c84ee9..ee43b9395 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py @@ -50,23 +50,19 @@ def __init__( self._client = client self._index_name = index_name self._distance_method = distance_method - self._collection: Collection | None = None + self._collection = self._get_chroma_collection() - async def _get_chroma_collection(self) -> Collection: + def _get_chroma_collection(self) -> Collection: """ Gets or creates a collection with the given name and metadata. Returns: The collection. """ - if self._collection is not None: - return self._collection - - self._collection = self._client.get_or_create_collection( + return self._client.get_or_create_collection( name=self._index_name, metadata={"hnsw:space": self._distance_method}, ) - return self._collection @classmethod def from_config(cls, config: dict) -> ChromaVectorStore: @@ -98,9 +94,9 @@ async def store(self, entries: list[VectorStoreEntry]) -> None: ids = [sha256(entry.key.encode("utf-8")).hexdigest() for entry in entries] embeddings = [entry.vector for entry in entries] - if self.metadata_store is not None: + if self._metadata_store is not None: for key, meta in zip(ids, [entry.metadata for entry in entries], strict=False): - await self.metadata_store.store(key, meta) + await self._metadata_store.store(key, meta) metadata_to_store = None else: metadata_to_store = [ @@ -108,9 +104,7 @@ async def store(self, entries: list[VectorStoreEntry]) -> None: ] contents = [entry.key for entry in entries] - - collection = await self._get_chroma_collection() - collection.add(ids=ids, embeddings=embeddings, metadatas=metadata_to_store, documents=contents) # type: ignore + self._collection.add(ids=ids, embeddings=embeddings, metadatas=metadata_to_store, documents=contents) # type: ignore async def retrieve(self, vector: list[float], options: VectorStoreOptions | None = None) -> list[VectorStoreEntry]: """ @@ -124,8 +118,11 @@ async def retrieve(self, vector: list[float], options: VectorStoreOptions | None The retrieved entries. """ options = self._default_options if options is None else options - collection = await self._get_chroma_collection() - results = collection.query(query_embeddings=vector, n_results=options.k, include=CHROMA_QUERY_INCLUDE_KEYS) # type: ignore + results = self._collection.query( + query_embeddings=vector, + n_results=options.k, + include=CHROMA_QUERY_INCLUDE_KEYS, # type: ignore + ) metadatas = results.get(CHROMA_METADATA_KEY) or [] embeddings = results.get(CHROMA_EMBEDDINGS_KEY) or [] distances = results.get(CHROMA_DISTANCES_KEY) or [] @@ -144,8 +141,8 @@ async def retrieve(self, vector: list[float], options: VectorStoreOptions | None ] async def _load_sample_metadata(self, metadata: dict, sample_id: str) -> dict: - if self.metadata_store is not None: - metadata = await self.metadata_store.get(sample_id) + if self._metadata_store is not None: + metadata = await self._metadata_store.get(sample_id) else: metadata = json.loads(metadata[self.METADATA_INNER_KEY]) @@ -169,9 +166,12 @@ async def list( # Cast `where` to chromadb's Where type where_chroma: chromadb.Where | None = dict(where) if where else None - collection = await self._get_chroma_collection() - get_results = collection.get(where=where_chroma, limit=limit, offset=offset, include=CHROMA_LIST_INCLUDE_KEYS) # type: ignore - + get_results = self._collection.get( + where=where_chroma, + limit=limit, + offset=offset, + include=CHROMA_LIST_INCLUDE_KEYS, # type: ignore + ) metadatas = get_results.get(CHROMA_METADATA_KEY) or [] embeddings = get_results.get(CHROMA_EMBEDDINGS_KEY) or [] documents = get_results.get(CHROMA_DOCUMENTS_KEY) or [] diff --git a/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py b/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py index 6386a8f6e..2f7968480 100644 --- a/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py +++ b/packages/ragbits-core/tests/unit/vector_stores/test_chroma.py @@ -15,8 +15,8 @@ def mock_chromadb_store() -> ChromaVectorStore: async def test_get_chroma_collection(mock_chromadb_store: ChromaVectorStore) -> None: - _ = await mock_chromadb_store._get_chroma_collection() - assert mock_chromadb_store._client.get_or_create_collection.call_count == 1 # type: ignore + _ = mock_chromadb_store._get_chroma_collection() + assert mock_chromadb_store._client.get_or_create_collection.call_count == 2 # type: ignore async def test_store(mock_chromadb_store: ChromaVectorStore) -> None: @@ -69,7 +69,7 @@ async def test_retrieve( mock_chromadb_store: ChromaVectorStore, max_distance: float | None, results: list[dict] ) -> None: vector = [0.1, 0.2, 0.3] - mock_collection = await mock_chromadb_store._get_chroma_collection() + mock_collection = mock_chromadb_store._get_chroma_collection() mock_collection.query.return_value = { # type: ignore "metadatas": [ [ @@ -99,7 +99,7 @@ async def test_retrieve( async def test_list(mock_chromadb_store: ChromaVectorStore) -> None: - mock_collection = await mock_chromadb_store._get_chroma_collection() + mock_collection = mock_chromadb_store._get_chroma_collection() mock_collection.get.return_value = { # type: ignore "metadatas": [ { From 51b15e19aa3b7c40eb84021eb017280386bfabd3 Mon Sep 17 00:00:00 2001 From: Konrad Czarnota Date: Tue, 29 Oct 2024 14:40:31 +0100 Subject: [PATCH 07/11] Allow to create metadata store from config --- .../ragbits/core/metadata_store/__init__.py | 30 +++++++++++++++++++ .../ragbits/core/vector_stores/__init__.py | 7 ++++- .../src/ragbits/core/vector_stores/chroma.py | 2 ++ 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/packages/ragbits-core/src/ragbits/core/metadata_store/__init__.py b/packages/ragbits-core/src/ragbits/core/metadata_store/__init__.py index e69de29bb..722a45702 100644 --- a/packages/ragbits-core/src/ragbits/core/metadata_store/__init__.py +++ b/packages/ragbits-core/src/ragbits/core/metadata_store/__init__.py @@ -0,0 +1,30 @@ +import sys + +from ragbits.core.utils.config_handling import get_cls_from_config + +from .base import MetadataStore +from .in_memory import InMemoryMetadataStore + +__all__ = ["InMemoryMetadataStore", "MetadataStore"] + +module = sys.modules[__name__] + + +def get_metadata_store(metadata_store_config: dict | None) -> MetadataStore | None: + """ + Initializes and returns a MetadataStore object based on the provided configuration. + + Args: + metadata_store_config: A dictionary containing configuration details for the MetadataStore. + + Returns: + An instance of the specified MetadataStore class, initialized with the provided config + (if any) or default arguments. + """ + if metadata_store_config is None: + return None + + metadata_store_class = get_cls_from_config(metadata_store_config["type"], module) + config = metadata_store_config.get("config", {}) + + return metadata_store_class(**config) diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/__init__.py b/packages/ragbits-core/src/ragbits/core/vector_stores/__init__.py index 06a9338a0..fd0bae429 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/__init__.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/__init__.py @@ -1,5 +1,6 @@ import sys +from ..metadata_store import get_metadata_store from ..utils.config_handling import get_cls_from_config from .base import VectorStore, VectorStoreEntry, VectorStoreOptions, WhereQuery from .in_memory import InMemoryVectorStore @@ -26,4 +27,8 @@ def get_vector_store(vector_store_config: dict) -> VectorStore: if vector_store_config["type"].endswith("ChromaVectorStore"): return vector_store_cls.from_config(config) - return vector_store_cls(default_options=VectorStoreOptions(**config.get("default_options", {}))) + metadata_store_config = vector_store_config.get("metadata_store_config", {}) + return vector_store_cls( + default_options=VectorStoreOptions(**config.get("default_options", {})), + metadata_store=get_metadata_store(metadata_store_config), + ) diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py index ee43b9395..8fa592ccf 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py @@ -8,6 +8,7 @@ from chromadb import Collection from chromadb.api import ClientAPI +from ragbits.core.metadata_store import get_metadata_store from ragbits.core.metadata_store.base import MetadataStore from ragbits.core.utils.config_handling import get_cls_from_config from ragbits.core.vector_stores.base import VectorStore, VectorStoreEntry, VectorStoreOptions, WhereQuery @@ -81,6 +82,7 @@ def from_config(cls, config: dict) -> ChromaVectorStore: index_name=config["index_name"], distance_method=config.get("distance_method", "l2"), default_options=VectorStoreOptions(**config.get("default_options", {})), + metadata_store=get_metadata_store(config.get("metadata_store_config", {})), ) async def store(self, entries: list[VectorStoreEntry]) -> None: From d880d4cd989430679fbb3fbd3ce650436c2d9f6b Mon Sep 17 00:00:00 2001 From: Konrad Czarnota Date: Tue, 29 Oct 2024 14:54:38 +0100 Subject: [PATCH 08/11] Fix tests --- .../ragbits-core/src/ragbits/core/vector_stores/__init__.py | 2 +- packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/__init__.py b/packages/ragbits-core/src/ragbits/core/vector_stores/__init__.py index fd0bae429..b6a572963 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/__init__.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/__init__.py @@ -27,7 +27,7 @@ def get_vector_store(vector_store_config: dict) -> VectorStore: if vector_store_config["type"].endswith("ChromaVectorStore"): return vector_store_cls.from_config(config) - metadata_store_config = vector_store_config.get("metadata_store_config", {}) + metadata_store_config = vector_store_config.get("metadata_store_config") return vector_store_cls( default_options=VectorStoreOptions(**config.get("default_options", {})), metadata_store=get_metadata_store(metadata_store_config), diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py index 8fa592ccf..06ab93a11 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py @@ -82,7 +82,7 @@ def from_config(cls, config: dict) -> ChromaVectorStore: index_name=config["index_name"], distance_method=config.get("distance_method", "l2"), default_options=VectorStoreOptions(**config.get("default_options", {})), - metadata_store=get_metadata_store(config.get("metadata_store_config", {})), + metadata_store=get_metadata_store(config.get("metadata_store")), ) async def store(self, entries: list[VectorStoreEntry]) -> None: From 9567ca3ba08f508a2fc7f5db787ac2aac65fce7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pstr=C4=85g?= Date: Wed, 30 Oct 2024 02:28:21 +0100 Subject: [PATCH 09/11] fixes --- examples/document-search/from_config.py | 3 + .../src/ragbits/core/metadata_store/base.py | 42 +++---- .../ragbits/core/metadata_store/exceptions.py | 8 ++ .../ragbits/core/metadata_store/in_memory.py | 47 ++++---- .../src/ragbits/core/vector_stores/base.py | 13 ++- .../src/ragbits/core/vector_stores/chroma.py | 103 +++++++++--------- .../ragbits/core/vector_stores/in_memory.py | 13 ++- .../test_in_memory_metadata_store.py | 35 ++++-- 8 files changed, 144 insertions(+), 120 deletions(-) create mode 100644 packages/ragbits-core/src/ragbits/core/metadata_store/exceptions.py diff --git a/examples/document-search/from_config.py b/examples/document-search/from_config.py index b68ab64e0..6907a1dd3 100644 --- a/examples/document-search/from_config.py +++ b/examples/document-search/from_config.py @@ -37,6 +37,9 @@ "k": 3, "max_distance": 1.2, }, + "metadata_store": { + "type": "InMemoryMetadataStore", + }, }, }, "reranker": {"type": "ragbits.document_search.retrieval.rerankers.noop:NoopReranker"}, diff --git a/packages/ragbits-core/src/ragbits/core/metadata_store/base.py b/packages/ragbits-core/src/ragbits/core/metadata_store/base.py index e513d1fe5..5a6ad81d4 100644 --- a/packages/ragbits-core/src/ragbits/core/metadata_store/base.py +++ b/packages/ragbits-core/src/ragbits/core/metadata_store/base.py @@ -1,44 +1,32 @@ -import abc -from typing import Any -from uuid import UUID +from abc import ABC, abstractmethod -class MetadataStore(abc.ABC): +class MetadataStore(ABC): """ - An abstract class for metadata storage. Allows to store, query and retrieve metadata in form of key value pairs + An abstract class for metadata storage. Allows to store, query and retrieve metadata in form of key value pairs. """ - @abc.abstractmethod - async def store(self, key: str | UUID, metadata: dict) -> None: + @abstractmethod + async def store(self, ids: list[str], metadatas: list[dict]) -> None: """ - Store metadata under key in metadata store + Store metadatas under ids in metadata store. Args: - key: unique key of the entry - metadata: dict with metadata + ids: list of unique ids of the entries + metadatas: list of dicts with metadata. """ - @abc.abstractmethod - async def query(self, metadata_field_name: str, value: Any) -> dict: # noqa + @abstractmethod + async def get(self, ids: list[str]) -> list[dict]: """ - Queries metastore and returns dicts with key: metadata format that match + Returns metadatas associated with a given ids. Args: - metadata_field_name: name of metadata field - value: value to match against + ids: list of ids to use. Returns: - dict with key: metadata entries that match query - """ - - @abc.abstractmethod - async def get(self, key: str | UUID) -> dict: - """ - Returns metadata associated with a given key + List of metadata dicts associated with a given ids. - Args: - key: key to use - - Returns: - metadata dict associated with a given key + Raises: + MetadataNotFoundError: If the metadata is not found. """ diff --git a/packages/ragbits-core/src/ragbits/core/metadata_store/exceptions.py b/packages/ragbits-core/src/ragbits/core/metadata_store/exceptions.py new file mode 100644 index 000000000..222206855 --- /dev/null +++ b/packages/ragbits-core/src/ragbits/core/metadata_store/exceptions.py @@ -0,0 +1,8 @@ +class MetadataNotFoundError(Exception): + """ + Raised when metadata is not found in the metadata store + """ + + def __init__(self, id: str) -> None: + super().__init__(f"Metadata not found for {id} id.") + self.id = id diff --git a/packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py b/packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py index 252b60595..5c173bfd2 100644 --- a/packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py +++ b/packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py @@ -1,7 +1,5 @@ -from typing import Any -from uuid import UUID - from ragbits.core.metadata_store.base import MetadataStore +from ragbits.core.metadata_store.exceptions import MetadataNotFoundError class InMemoryMetadataStore(MetadataStore): @@ -10,41 +8,36 @@ class InMemoryMetadataStore(MetadataStore): """ def __init__(self) -> None: - self._storage: dict[str | UUID, Any] = {} - - async def store(self, key: str | UUID, metadata: dict) -> None: """ - Store metadata under key in metadata store - - Args: - key: unique key of the entry - metadata: dict with metadata + Constructs a new InMemoryMetadataStore instance. """ - self._storage[key] = metadata + self._storage: dict[str, dict] = {} - async def query(self, metadata_field_name: str, value: Any) -> dict: # noqa + async def store(self, ids: list[str], metadatas: list[dict]) -> None: """ - Queries metastore and returns dicts with key: metadata format that match + Store metadatas under ids in metadata store. Args: - metadata_field_name: name of metadata field - value: value to match against - - Returns: - dict with key: metadata entries that match query + ids: list of unique ids of the entries + metadatas: list of dicts with metadata. """ - return { - key: metadata for key, metadata in self._storage.items() if metadata.get(metadata_field_name, None) == value - } + for _id, metadata in zip(ids, metadatas, strict=False): + self._storage[_id] = metadata - async def get(self, key: str | UUID) -> dict: + async def get(self, ids: list[str]) -> list[dict]: """ - Returns metadata associated with a given key + Returns metadatas associated with a given ids. Args: - key: key to use + ids: list of ids to use. Returns: - metadata dict associated with a given key + List of metadata dicts associated with a given ids. + + Raises: + MetadataNotFoundError: If the metadata is not found. """ - return self._storage.get(key, {}) + try: + return [self._storage[_id] for _id in ids] + except KeyError as exc: + raise MetadataNotFoundError(*exc.args) from exc diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/base.py b/packages/ragbits-core/src/ragbits/core/vector_stores/base.py index 9f7cce40a..9b32d360c 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/base.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/base.py @@ -31,7 +31,18 @@ class VectorStore(ABC): A class with an implementation of Vector Store, allowing to store and retrieve vectors by similarity function. """ - def __init__(self, default_options: VectorStoreOptions | None = None, metadata_store: MetadataStore | None = None): + def __init__( + self, + default_options: VectorStoreOptions | None = None, + metadata_store: MetadataStore | None = None, + ) -> None: + """ + Constructs a new VectorStore instance. + + Args: + default_options: The default options for querying the vector store. + metadata_store: The metadata store to use. + """ super().__init__() self._default_options = default_options or VectorStoreOptions() self._metadata_store = metadata_store diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py index 06ab93a11..47f9246b9 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py @@ -13,22 +13,12 @@ from ragbits.core.utils.config_handling import get_cls_from_config from ragbits.core.vector_stores.base import VectorStore, VectorStoreEntry, VectorStoreOptions, WhereQuery -CHROMA_IDS_KEY = "ids" -CHROMA_DOCUMENTS_KEY = "documents" -CHROMA_DISTANCES_KEY = "distances" -CHROMA_METADATA_KEY = "metadatas" -CHROMA_EMBEDDINGS_KEY = "embeddings" -CHROMA_LIST_INCLUDE_KEYS = [CHROMA_DOCUMENTS_KEY, CHROMA_METADATA_KEY, CHROMA_EMBEDDINGS_KEY] -CHROMA_QUERY_INCLUDE_KEYS = CHROMA_LIST_INCLUDE_KEYS + [CHROMA_DISTANCES_KEY] - class ChromaVectorStore(VectorStore): """ Class that stores text embeddings using [Chroma](https://docs.trychroma.com/). """ - METADATA_INNER_KEY = "__metadata" - def __init__( self, client: ClientAPI, @@ -36,18 +26,18 @@ def __init__( distance_method: Literal["l2", "ip", "cosine"] = "l2", default_options: VectorStoreOptions | None = None, metadata_store: MetadataStore | None = None, - ): + ) -> None: """ - Initializes the ChromaVectorStore with the given parameters. + Constructs a new ChromaVectorStore instance. Args: client: The ChromaDB client. index_name: The name of the index. distance_method: The distance method to use. default_options: The default options for querying the vector store. - metadata_store: The metadata store to use. + metadata_store: The metadata store to use. If None, the metadata will be stored in ChromaDB. """ - super().__init__(default_options, metadata_store) + super().__init__(default_options=default_options, metadata_store=metadata_store) self._client = client self._index_name = index_name self._distance_method = distance_method @@ -76,9 +66,9 @@ def from_config(cls, config: dict) -> ChromaVectorStore: Returns: An initialized instance of the ChromaVectorStore class. """ - client = get_cls_from_config(config["client"]["type"], chromadb) # type: ignore + client_cls = get_cls_from_config(config["client"]["type"], chromadb) return cls( - client=client(**config["client"].get("config", {})), + client=client_cls(**config["client"].get("config", {})), index_name=config["index_name"], distance_method=config.get("distance_method", "l2"), default_options=VectorStoreOptions(**config.get("default_options", {})), @@ -92,21 +82,17 @@ async def store(self, entries: list[VectorStoreEntry]) -> None: Args: entries: The entries to store. """ - # TODO: Think about better id components for hashing + # TODO: Think about better id components for hashing and move hash computing to VectorStoreEntry ids = [sha256(entry.key.encode("utf-8")).hexdigest() for entry in entries] + documents = [entry.key for entry in entries] embeddings = [entry.vector for entry in entries] - - if self._metadata_store is not None: - for key, meta in zip(ids, [entry.metadata for entry in entries], strict=False): - await self._metadata_store.store(key, meta) - metadata_to_store = None - else: - metadata_to_store = [ - {self.METADATA_INNER_KEY: json.dumps(entry.metadata, default=str)} for entry in entries - ] - - contents = [entry.key for entry in entries] - self._collection.add(ids=ids, embeddings=embeddings, metadatas=metadata_to_store, documents=contents) # type: ignore + metadatas = [entry.metadata for entry in entries] + metadatas = ( + [{"__metadata": json.dumps(metadata, default=str)} for metadata in metadatas] + if self._metadata_store is None + else await self._metadata_store.store(ids, metadatas) # type: ignore + ) + self._collection.add(ids=ids, embeddings=embeddings, metadatas=metadatas, documents=documents) # type: ignore async def retrieve(self, vector: list[float], options: VectorStoreOptions | None = None) -> list[VectorStoreEntry]: """ @@ -118,38 +104,40 @@ async def retrieve(self, vector: list[float], options: VectorStoreOptions | None Returns: The retrieved entries. + + Raises: + MetadataNotFoundError: If the metadata is not found. """ options = self._default_options if options is None else options + results = self._collection.query( query_embeddings=vector, n_results=options.k, - include=CHROMA_QUERY_INCLUDE_KEYS, # type: ignore + include=["metadatas", "embeddings", "distances", "documents"], ) - metadatas = results.get(CHROMA_METADATA_KEY) or [] - embeddings = results.get(CHROMA_EMBEDDINGS_KEY) or [] - distances = results.get(CHROMA_DISTANCES_KEY) or [] - ids = results.get(CHROMA_IDS_KEY) or [] - documents = results.get(CHROMA_DOCUMENTS_KEY) or [] + ids = results.get("ids") or [] + metadatas = results.get("metadatas") or [] + embeddings = results.get("embeddings") or [] + distances = results.get("distances") or [] + documents = results.get("documents") or [] + + metadatas = [ + [json.loads(metadata["__metadata"]) for batch in metadatas for metadata in batch] # type: ignore + if self._metadata_store is None + else await self._metadata_store.get(*ids) + ] return [ VectorStoreEntry( key=document, vector=list(embeddings), - metadata=await self._load_sample_metadata(metadata, sample_id), + metadata=metadata, # type: ignore ) - for batch in zip(metadatas, embeddings, distances, ids, documents, strict=False) # type: ignore - for metadata, embeddings, distance, sample_id, document in zip(*batch, strict=False) + for batch in zip(metadatas, embeddings, distances, documents, strict=False) + for metadata, embeddings, distance, document in zip(*batch, strict=False) if options.max_distance is None or distance <= options.max_distance ] - async def _load_sample_metadata(self, metadata: dict, sample_id: str) -> dict: - if self._metadata_store is not None: - metadata = await self._metadata_store.get(sample_id) - else: - metadata = json.loads(metadata[self.METADATA_INNER_KEY]) - - return metadata - async def list( self, where: WhereQuery | None = None, limit: int | None = None, offset: int = 0 ) -> list[VectorStoreEntry]: @@ -164,6 +152,9 @@ async def list( Returns: The entries. + + Raises: + MetadataNotFoundError: If the metadata is not found. """ # Cast `where` to chromadb's Where type where_chroma: chromadb.Where | None = dict(where) if where else None @@ -172,18 +163,24 @@ async def list( where=where_chroma, limit=limit, offset=offset, - include=CHROMA_LIST_INCLUDE_KEYS, # type: ignore + include=["metadatas", "embeddings", "documents"], + ) + ids = get_results.get("ids") or [] + metadatas = get_results.get("metadatas") or [] + embeddings = get_results.get("embeddings") or [] + documents = get_results.get("documents") or [] + + metadatas = ( + [json.loads(metadata["__metadata"]) for metadata in metadatas] # type: ignore + if self._metadata_store is None + else await self._metadata_store.get(ids) ) - metadatas = get_results.get(CHROMA_METADATA_KEY) or [] - embeddings = get_results.get(CHROMA_EMBEDDINGS_KEY) or [] - documents = get_results.get(CHROMA_DOCUMENTS_KEY) or [] - ids = get_results.get(CHROMA_IDS_KEY) or [] return [ VectorStoreEntry( key=document, vector=list(embedding), - metadata=await self._load_sample_metadata(metadata, sample_id), + metadata=metadata, # type: ignore ) - for metadata, embedding, sample_id, document in zip(metadatas, embeddings, ids, documents, strict=False) # type: ignore + for metadata, embedding, document in zip(metadatas, embeddings, documents, strict=False) ] diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py b/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py index e6207b946..155ced673 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py @@ -12,9 +12,18 @@ class InMemoryVectorStore(VectorStore): """ def __init__( - self, default_options: VectorStoreOptions | None = None, metadata_store: MetadataStore | None = None + self, + default_options: VectorStoreOptions | None = None, + metadata_store: MetadataStore | None = None, ) -> None: - super().__init__(default_options, metadata_store) + """ + Constructs a new InMemoryVectorStore instance. + + Args: + default_options: The default options for querying the vector store. + metadata_store: The metadata store to use. + """ + super().__init__(default_options=default_options, metadata_store=metadata_store) self._storage: dict[str, VectorStoreEntry] = {} async def store(self, entries: list[VectorStoreEntry]) -> None: diff --git a/packages/ragbits-core/tests/unit/metadata_store/test_in_memory_metadata_store.py b/packages/ragbits-core/tests/unit/metadata_store/test_in_memory_metadata_store.py index 5996463c6..571439951 100644 --- a/packages/ragbits-core/tests/unit/metadata_store/test_in_memory_metadata_store.py +++ b/packages/ragbits-core/tests/unit/metadata_store/test_in_memory_metadata_store.py @@ -1,16 +1,31 @@ +import pytest + +from ragbits.core.metadata_store.exceptions import MetadataNotFoundError from ragbits.core.metadata_store.in_memory import InMemoryMetadataStore -async def test_in_memory_vector_store(): - store = InMemoryMetadataStore() +@pytest.fixture +def metadata_store() -> InMemoryMetadataStore: + return InMemoryMetadataStore() + + +async def test_store(metadata_store: InMemoryMetadataStore) -> None: + ids = ["id1", "id2"] + metadatas = [{"key1": "value1"}, {"key2": "value2"}] + await metadata_store.store(ids, metadatas) + assert metadata_store._storage["id1"] == {"key1": "value1"} + assert metadata_store._storage["id2"] == {"key2": "value2"} + - metadata_key_1_values = {"test1": "test1", "test2": 2} - metadata_key_2_values = {"test1": "test1", "test2": 4} - await store.store("key1", metadata_key_1_values) - await store.store("key2", metadata_key_2_values) +async def test_get(metadata_store: InMemoryMetadataStore) -> None: + ids = ["id1", "id2"] + metadatas = [{"key1": "value1"}, {"key2": "value2"}] + await metadata_store.store(ids, metadatas) + result = await metadata_store.get(ids) + assert result == [{"key1": "value1"}, {"key2": "value2"}] - assert await store.get("key1") == metadata_key_1_values - assert await store.get("key2") == metadata_key_2_values - assert await store.query("test2", 2) == {"key1": metadata_key_1_values} - assert await store.query("test1", "test1") == {"key1": metadata_key_1_values, "key2": metadata_key_2_values} +async def test_get_metadata_not_found(metadata_store: InMemoryMetadataStore) -> None: + ids = ["id1"] + with pytest.raises(MetadataNotFoundError): + await metadata_store.get(ids) From fdd4527fcbb88cbc1267b0f2c339764f74cc8582 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pstr=C4=85g?= Date: Wed, 30 Oct 2024 02:30:02 +0100 Subject: [PATCH 10/11] renames --- .../core/{metadata_store => metadata_stores}/__init__.py | 0 .../ragbits/core/{metadata_store => metadata_stores}/base.py | 0 .../core/{metadata_store => metadata_stores}/exceptions.py | 0 .../core/{metadata_store => metadata_stores}/in_memory.py | 4 ++-- .../ragbits-core/src/ragbits/core/vector_stores/__init__.py | 2 +- packages/ragbits-core/src/ragbits/core/vector_stores/base.py | 2 +- .../ragbits-core/src/ragbits/core/vector_stores/chroma.py | 4 ++-- .../ragbits-core/src/ragbits/core/vector_stores/in_memory.py | 2 +- .../test_in_memory.py} | 4 ++-- 9 files changed, 9 insertions(+), 9 deletions(-) rename packages/ragbits-core/src/ragbits/core/{metadata_store => metadata_stores}/__init__.py (100%) rename packages/ragbits-core/src/ragbits/core/{metadata_store => metadata_stores}/base.py (100%) rename packages/ragbits-core/src/ragbits/core/{metadata_store => metadata_stores}/exceptions.py (100%) rename packages/ragbits-core/src/ragbits/core/{metadata_store => metadata_stores}/in_memory.py (89%) rename packages/ragbits-core/tests/unit/{metadata_store/test_in_memory_metadata_store.py => metadata_stores/test_in_memory.py} (86%) diff --git a/packages/ragbits-core/src/ragbits/core/metadata_store/__init__.py b/packages/ragbits-core/src/ragbits/core/metadata_stores/__init__.py similarity index 100% rename from packages/ragbits-core/src/ragbits/core/metadata_store/__init__.py rename to packages/ragbits-core/src/ragbits/core/metadata_stores/__init__.py diff --git a/packages/ragbits-core/src/ragbits/core/metadata_store/base.py b/packages/ragbits-core/src/ragbits/core/metadata_stores/base.py similarity index 100% rename from packages/ragbits-core/src/ragbits/core/metadata_store/base.py rename to packages/ragbits-core/src/ragbits/core/metadata_stores/base.py diff --git a/packages/ragbits-core/src/ragbits/core/metadata_store/exceptions.py b/packages/ragbits-core/src/ragbits/core/metadata_stores/exceptions.py similarity index 100% rename from packages/ragbits-core/src/ragbits/core/metadata_store/exceptions.py rename to packages/ragbits-core/src/ragbits/core/metadata_stores/exceptions.py diff --git a/packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py b/packages/ragbits-core/src/ragbits/core/metadata_stores/in_memory.py similarity index 89% rename from packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py rename to packages/ragbits-core/src/ragbits/core/metadata_stores/in_memory.py index 5c173bfd2..6f2a98902 100644 --- a/packages/ragbits-core/src/ragbits/core/metadata_store/in_memory.py +++ b/packages/ragbits-core/src/ragbits/core/metadata_stores/in_memory.py @@ -1,5 +1,5 @@ -from ragbits.core.metadata_store.base import MetadataStore -from ragbits.core.metadata_store.exceptions import MetadataNotFoundError +from ragbits.core.metadata_stores.base import MetadataStore +from ragbits.core.metadata_stores.exceptions import MetadataNotFoundError class InMemoryMetadataStore(MetadataStore): diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/__init__.py b/packages/ragbits-core/src/ragbits/core/vector_stores/__init__.py index b6a572963..b9c16c34a 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/__init__.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/__init__.py @@ -1,6 +1,6 @@ import sys -from ..metadata_store import get_metadata_store +from ..metadata_stores import get_metadata_store from ..utils.config_handling import get_cls_from_config from .base import VectorStore, VectorStoreEntry, VectorStoreOptions, WhereQuery from .in_memory import InMemoryVectorStore diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/base.py b/packages/ragbits-core/src/ragbits/core/vector_stores/base.py index 9b32d360c..f1c102b1c 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/base.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/base.py @@ -2,7 +2,7 @@ from pydantic import BaseModel -from ragbits.core.metadata_store.base import MetadataStore +from ragbits.core.metadata_stores.base import MetadataStore WhereQuery = dict[str, str | int | float | bool] diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py index 47f9246b9..c8500e7ae 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/chroma.py @@ -8,8 +8,8 @@ from chromadb import Collection from chromadb.api import ClientAPI -from ragbits.core.metadata_store import get_metadata_store -from ragbits.core.metadata_store.base import MetadataStore +from ragbits.core.metadata_stores import get_metadata_store +from ragbits.core.metadata_stores.base import MetadataStore from ragbits.core.utils.config_handling import get_cls_from_config from ragbits.core.vector_stores.base import VectorStore, VectorStoreEntry, VectorStoreOptions, WhereQuery diff --git a/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py b/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py index 155ced673..48dd6a886 100644 --- a/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py +++ b/packages/ragbits-core/src/ragbits/core/vector_stores/in_memory.py @@ -2,7 +2,7 @@ import numpy as np -from ragbits.core.metadata_store.base import MetadataStore +from ragbits.core.metadata_stores.base import MetadataStore from ragbits.core.vector_stores.base import VectorStore, VectorStoreEntry, VectorStoreOptions, WhereQuery diff --git a/packages/ragbits-core/tests/unit/metadata_store/test_in_memory_metadata_store.py b/packages/ragbits-core/tests/unit/metadata_stores/test_in_memory.py similarity index 86% rename from packages/ragbits-core/tests/unit/metadata_store/test_in_memory_metadata_store.py rename to packages/ragbits-core/tests/unit/metadata_stores/test_in_memory.py index 571439951..321e78902 100644 --- a/packages/ragbits-core/tests/unit/metadata_store/test_in_memory_metadata_store.py +++ b/packages/ragbits-core/tests/unit/metadata_stores/test_in_memory.py @@ -1,7 +1,7 @@ import pytest -from ragbits.core.metadata_store.exceptions import MetadataNotFoundError -from ragbits.core.metadata_store.in_memory import InMemoryMetadataStore +from ragbits.core.metadata_stores.exceptions import MetadataNotFoundError +from ragbits.core.metadata_stores.in_memory import InMemoryMetadataStore @pytest.fixture From 558df2aa31e994a7214ef04d02f3c62b61643eb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pstr=C4=85g?= Date: Wed, 30 Oct 2024 02:38:19 +0100 Subject: [PATCH 11/11] fix tests --- packages/ragbits-core/tests/unit/metadata_stores/__init__.py | 0 packages/ragbits-core/tests/unit/vector_stores/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 packages/ragbits-core/tests/unit/metadata_stores/__init__.py create mode 100644 packages/ragbits-core/tests/unit/vector_stores/__init__.py diff --git a/packages/ragbits-core/tests/unit/metadata_stores/__init__.py b/packages/ragbits-core/tests/unit/metadata_stores/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/packages/ragbits-core/tests/unit/vector_stores/__init__.py b/packages/ragbits-core/tests/unit/vector_stores/__init__.py new file mode 100644 index 000000000..e69de29bb