Inference docstring #1186

Merged: 5 commits, Nov 4, 2024
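The substance of this PR is mechanical: literal default values in the Google-style docstrings (`None`, `'test'`, `[1, 3, 5, 10, 100, 1000]`, ...) are wrapped in Sphinx's `:data:` role so they render as code literals and, where a matching target exists (e.g. built-in `None` via intersphinx), as cross-references. A minimal sketch of the convention, assuming `sphinx.ext.autodoc` with `sphinx.ext.napoleon`; the function itself is hypothetical:

```python
def load_queries(dataset_name=None, split='test'):
    """Load queries for evaluation.

    Args:
        dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.
        split (str, optional): Split to load. Defaults to :data:`'test'`.
    """
```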
40 changes: 20 additions & 20 deletions FlagEmbedding/abc/evaluation/data_loader.py
@@ -17,10 +17,10 @@ class AbsEvalDataLoader(ABC):

Args:
eval_name (str): The experiment name of the current evaluation.
dataset_dir (str, optional): path to the datasets. Defaults to None.
cache_dir (str, optional): Path to HuggingFace cache directory. Defaults to None.
token (str, optional): HF_TOKEN to access the private datasets/models in HF. Defaults to None.
force_redownload: If True, will force redownload the dataset to cover the local dataset. Defaults to False.
dataset_dir (str, optional): Path to the datasets. Defaults to :data:`None`.
cache_dir (str, optional): Path to HuggingFace cache directory. Defaults to :data:`None`.
token (str, optional): HF_TOKEN to access private datasets/models on HF. Defaults to :data:`None`.
force_redownload (bool, optional): If True, force re-downloading the dataset to overwrite the local copy. Defaults to :data:`False`.
"""
def __init__(
self,
@@ -98,7 +98,7 @@ def load_corpus(self, dataset_name: Optional[str] = None) -> datasets.DatasetDict
"""Load the corpus from the dataset.

Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to None.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.

Returns:
datasets.DatasetDict: A dict of corpus with id as key, title and text as value.
@@ -116,8 +116,8 @@ def load_qrels(self, dataset_name: Optional[str] = None, split: str = 'test') ->
"""Load the corpus from the dataset.

Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to None.
split (str, optional): The split to load relevance from. Defaults to 'test'.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.
split (str, optional): The split to load relevance from. Defaults to :data:`'test'`.

Raises:
ValueError
@@ -144,8 +144,8 @@ def load_queries(self, dataset_name: Optional[str] = None, split: str = 'test')
"""Load the queries from the dataset.

Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to None.
split (str, optional): The split to load queries from. Defaults to 'test'.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.
split (str, optional): The split to load queries from. Defaults to :data:`'test'`.

Raises:
ValueError
@@ -176,8 +176,8 @@ def _load_remote_corpus(
"""Abstract method to load corpus from remote dataset, to be overrode in child class.

Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to None.
save_dir (Optional[str], optional): Path to save the new downloaded corpus. Defaults to None.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.
save_dir (Optional[str], optional): Path to save the new downloaded corpus. Defaults to :data:`None`.

Raises:
NotImplementedError: Loading remote corpus is not implemented.
@@ -196,9 +196,9 @@ def _load_remote_qrels(
"""Abstract method to load relevance from remote dataset, to be overrode in child class.

Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to None.
split (str, optional): Split to load from the remote dataset. Defaults to 'test'.
save_dir (Optional[str], optional): Path to save the new downloaded relevance. Defaults to None.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.
split (str, optional): Split to load from the remote dataset. Defaults to :data:`'test'`.
save_dir (Optional[str], optional): Path to save the new downloaded relevance. Defaults to :data:`None`.

Raises:
NotImplementedError: Loading remote qrels is not implemented.
@@ -217,9 +217,9 @@ def _load_remote_queries(
"""Abstract method to load queries from remote dataset, to be overrode in child class.

Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to None.
split (str, optional): Split to load from the remote dataset. Defaults to 'test'.
save_dir (Optional[str], optional): Path to save the new downloaded queries. Defaults to None.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.
split (str, optional): Split to load from the remote dataset. Defaults to :data:`'test'`.
save_dir (Optional[str], optional): Path to save the new downloaded queries. Defaults to :data:`None`.

Raises:
NotImplementedError
@@ -234,7 +234,7 @@ def _load_local_corpus(self, save_dir: str, dataset_name: Optional[str] = None)

Args:
save_dir (str): Path to save the loaded corpus.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to None.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.

Returns:
datasets.DatasetDict: A dict of corpus with id as key, title and text as value.
@@ -257,8 +257,8 @@ def _load_local_qrels(self, save_dir: str, dataset_name: Optional[str] = None, s

Args:
save_dir (str): Path to save the loaded relevance.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to None.
split (str, optional): Split to load from the local dataset. Defaults to 'test'.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.
split (str, optional): Split to load from the local dataset. Defaults to :data:`'test'`.

Raises:
ValueError
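Taken together, the loader API documented above is driven roughly like this. A hedged sketch, where `MyEvalDataLoader` and the dataset name are hypothetical stand-ins for a concrete subclass:

```python
# Hypothetical concrete subclass of AbsEvalDataLoader; names are illustrative.
loader = MyEvalDataLoader(
    eval_name="my_eval",     # experiment name for this evaluation
    dataset_dir="./data",    # local datasets; falls back to remote when None
    cache_dir=None,
    token=None,              # HF_TOKEN for private datasets, if needed
    force_redownload=False,
)

corpus = loader.load_corpus(dataset_name="my_dataset")   # {id: {title, text}}
queries = loader.load_queries(dataset_name="my_dataset", split="test")
qrels = loader.load_qrels(dataset_name="my_dataset", split="test")
```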
16 changes: 8 additions & 8 deletions FlagEmbedding/abc/evaluation/evaluator.py
@@ -116,12 +116,12 @@ def __call__(
Args:
splits (Union[str, List[str]]): Splits of datasets.
search_results_save_dir (str): Directory to save the search results.
retriever (EvalRetriever): object of :class:EvalRetriever
reranker (Optional[EvalReranker], optional): Object of :class:EvalReranker. Defaults to None.
corpus_embd_save_dir (Optional[str], optional): Directory to save the embedded corpus. Defaults to None.
ignore_identical_ids (bool, optional): If True, will ignore identical ids in search results. Defaults to False.
k_values (List[int], optional): Cutoffs. Defaults to [1, 3, 5, 10, 100, 1000].
dataset_name (Optional[str], optional): Name of the datasets. Defaults to None.
retriever (EvalRetriever): Object of :class:`EvalRetriever`.
reranker (Optional[EvalReranker], optional): Object of :class:`EvalReranker`. Defaults to :data:`None`.
corpus_embd_save_dir (Optional[str], optional): Directory to save the embedded corpus. Defaults to :data:`None`.
ignore_identical_ids (bool, optional): If True, will ignore identical ids in search results. Defaults to :data:`False`.
k_values (List[int], optional): Cutoffs. Defaults to :data:`[1, 3, 5, 10, 100, 1000]`.
dataset_name (Optional[str], optional): Name of the datasets. Defaults to :data:`None`.
"""
# Check Splits
checked_splits = self.data_loader.check_splits(splits, dataset_name=dataset_name)
@@ -278,7 +278,7 @@ def save_search_results(
search_results (Dict[str, Dict[str, float]]): Dictionary of search results.
output_path (str): Output path to write the results.
split (str): Split used in searching.
dataset_name (Optional[str], optional): Name of dataset used. Defaults to None.
dataset_name (Optional[str], optional): Name of dataset used. Defaults to :data:`None`.
"""
data = {
"eval_name": eval_name,
@@ -354,7 +354,7 @@ def evaluate_results(

Args:
search_results_save_dir (str): Path to the search results.
k_values (List[int], optional): Cutoffs. Defaults to [1, 3, 5, 10, 100, 1000].
k_values (List[int], optional): Cutoffs. Defaults to :data:`[1, 3, 5, 10, 100, 1000]`.

Returns:
dict: The computed evaluation metrics.
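For orientation, the documented `__call__` parameters compose as follows. A sketch assuming `evaluator`, `retriever`, and (optionally) `reranker` are already-constructed instances:

```python
# All objects here are assumed to exist; paths and names are illustrative.
evaluator(
    splits=["dev", "test"],
    search_results_save_dir="./search_results",
    retriever=retriever,                   # EvalRetriever instance
    reranker=reranker,                     # EvalReranker instance or None
    corpus_embd_save_dir="./corpus_embd",  # cache the embedded corpus
    ignore_identical_ids=False,
    k_values=[1, 3, 5, 10, 100, 1000],     # the documented default cutoffs
    dataset_name="my_dataset",
)
```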
6 changes: 3 additions & 3 deletions FlagEmbedding/abc/evaluation/runner.py
@@ -145,9 +145,9 @@ def evaluate_metrics(

Args:
search_results_save_dir (str): Path to save the search results.
output_method (str, optional): Output results to `json` or `markdown`. Defaults to "markdown".
output_path (str, optional): Path to write the output. Defaults to "./eval_dev_results.md".
metrics (Union[str, List[str]], optional): metrics to use. Defaults to ["ndcg_at_10", "recall_at_10"].
output_method (str, optional): Output results to `json` or `markdown`. Defaults to :data:`"markdown"`.
output_path (str, optional): Path to write the output. Defaults to :data:`"./eval_dev_results.md"`.
metrics (Union[str, List[str]], optional): Metrics to use. Defaults to :data:`["ndcg_at_10", "recall_at_10"]`.

Raises:
FileNotFoundError: Eval results not found
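A hedged call sketch for the signature above; the `runner` instance and paths are illustrative:

```python
runner.evaluate_metrics(
    search_results_save_dir="./search_results",
    output_method="markdown",                # or "json"
    output_path="./eval_dev_results.md",
    metrics=["ndcg_at_10", "recall_at_10"],
)
```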
4 changes: 4 additions & 0 deletions FlagEmbedding/abc/evaluation/searcher.py
@@ -57,6 +57,8 @@ def __call__(
queries: Dict[str, str]: Queries to search for.
Structure: {<qid>: <query>}.
Example: {"q-0": "This is a query."}
corpus_embd_save_dir (Optional[str]): Defaults to :data:`None`.
ignore_identical_ids (bool): Defaults to :data:`False`.
**kwargs: Any: Additional arguments.

Returns: Dict[str, Dict[str, float]]: Top-k search results for each query. k is specified by search_top_k.
@@ -87,6 +89,8 @@ def __call__(
queries: Dict[str, str]: Queries to search for.
Structure: {<qid>: <query>}.
Example: {"q-0": "This is a query."}
corpus_embd_save_dir (Optional[str]): Defaults to :data:`None`.
ignore_identical_ids (bool): Defaults to :data:`False`.
**kwargs: Any: Additional arguments.

Returns: Dict[str, Dict[str, float]]: Top-k search results for each query. k is specified by search_top_k.
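Both `__call__` docstrings above share the same input/output shape. A sketch with hypothetical data, assuming `retriever` implements the retriever-side `__call__` (the `corpus` argument is assumed from the surrounding class, not shown in this hunk):

```python
corpus = {"doc-0": "This is a document."}   # {<docid>: <text>}, illustrative
queries = {"q-0": "This is a query."}       # structure from the docstring

results = retriever(
    corpus=corpus,
    queries=queries,
    corpus_embd_save_dir=None,    # newly documented, defaults to None
    ignore_identical_ids=False,   # newly documented, defaults to False
)
# results: {"q-0": {"doc-0": 0.87, ...}} -- top-k scores per query
```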
6 changes: 3 additions & 3 deletions FlagEmbedding/abc/evaluation/utils.py
@@ -162,9 +162,9 @@ def search(

Args:
faiss_index (faiss.Index): The Faiss index that contains all the corpus embeddings.
k (int, optional): Top k numbers of closest neighbours. Defaults to 100.
query_embeddings (Optional[np.ndarray], optional): The embedding vectors of queries. Defaults to None.
load_path (Optional[str], optional): Path to load embeddings from. Defaults to None.
k (int, optional): Number of nearest neighbours to retrieve per query (top-k). Defaults to :data:`100`.
query_embeddings (Optional[np.ndarray], optional): The embedding vectors of queries. Defaults to :data:`None`.
load_path (Optional[str], optional): Path to load embeddings from. Defaults to :data:`None`.

Returns:
Tuple[np.ndarray, np.ndarray]: The scores of search results and their corresponding indices.
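The `search` helper above wraps a standard Faiss k-NN query. A minimal self-contained sketch of that underlying pattern (dimensions and index type are illustrative, not taken from the code):

```python
import faiss
import numpy as np

dim = 768
corpus_embeddings = np.random.rand(10_000, dim).astype("float32")
query_embeddings = np.random.rand(4, dim).astype("float32")

index = faiss.IndexFlatIP(dim)   # exact inner-product index
index.add(corpus_embeddings)

k = 100                          # the documented default cutoff
scores, indices = index.search(query_embeddings, k)
# scores, indices: np.ndarray, each of shape (n_queries, k)
```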
46 changes: 22 additions & 24 deletions FlagEmbedding/abc/inference/AbsEmbedder.py
@@ -18,25 +18,23 @@
class AbsEmbedder(ABC):
"""
Base class for embedder.
Extend this class and implement :meth:`encode_queries`, :meth:`encode_passages`, :meth:`encode` for custom embedders.
Extend this class and implement :meth:`encode_queries`, :meth:`encode_corpus`, :meth:`encode` for custom embedders.

Args:
model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and
load a model from HuggingFace Hub with the name.
normalize_embeddings (bool, optional): If True, normalize the embedding vector. Default: `True`.
normalize_embeddings (bool, optional): If True, normalize the embedding vector. Defaults to :data:`True`.
use_fp16 (bool, optional): If True, use half-precision floating-point to speed up computation with a slight performance
degradation. Default: `True`.
degradation. Defaults to :data:`True`.
query_instruction_for_retrieval (Optional[str], optional): Query instruction for retrieval tasks, which will be used
with :attr:`query_instruction_format`. Default: `None`.
query_instruction_format: (str, optional): The template for :attr:`query_instruction_for_retrieval`. Default: `"{}{}"`.
devices (Optional[Union[str, int, List[str], List[int]]], optional): Devices to use for model inference. Default: `None`.
batch_size (int, optional): Batch size for inference. Default: `256`.
query_max_length (int, optional): Maximum length for query. Default: `512`.
passage_max_length (int, optional): Maximum length for passage. Default: `512`.
instruction (Optional[str], optional): Instruction for embedding with :attr:`instruction_format`. Default: `None`.
instruction_format (str, optional): Instruction format when using :attr:`instruction`. Default: `"{}{}"`.
with :attr:`query_instruction_format`. Defaults to :data:`None`.
query_instruction_format (str, optional): The template for :attr:`query_instruction_for_retrieval`. Defaults to :data:`"{}{}"`.
devices (Optional[Union[str, int, List[str], List[int]]], optional): Devices to use for model inference. Defaults to :data:`None`.
batch_size (int, optional): Batch size for inference. Defaults to :data:`256`.
query_max_length (int, optional): Maximum length for query. Defaults to :data:`512`.
passage_max_length (int, optional): Maximum length for passage. Defaults to :data:`512`.
convert_to_numpy (bool, optional): If True, the output embedding will be a Numpy array. Otherwise, it will be a Torch Tensor.
Default: `True`.
Defaults to :data:`True`.
kwargs (Dict[Any], optional): Additional parameters for HuggingFace Transformers config or child classes.
"""

@@ -139,10 +137,10 @@ def encode_queries(

Args:
queries (Union[List[str], str]): Input queries to encode.
batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to None.
max_length (Optional[int], optional): Maximum length of tokens. Defaults to None.
batch_size (Optional[int], optional): Number of sentences for each iteration. Defaults to :data:`None`.
max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`.
convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will
be a Torch Tensor. Defaults to None.
be a Torch Tensor. Defaults to :data:`None`.

Returns:
Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor.
@@ -173,10 +171,10 @@ def encode_corpus(

Args:
corpus (Union[List[str], str]): Input corpus to encode.
batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to None.
max_length (Optional[int], optional): Maximum length of tokens. Defaults to None.
batch_size (Optional[int], optional): Number of sentences for each iteration. Defaults to :data:`None`.
max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`.
convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will
be a Torch Tensor. Defaults to None.
be a Torch Tensor. Defaults to :data:`None`.

Returns:
Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor.
@@ -212,12 +210,12 @@ def encode(

Args:
sentences (Union[List[str], str]): Input sentences to encode.
batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to None.
max_length (Optional[int], optional): Maximum length of tokens. Defaults to None.
batch_size (Optional[int], optional): Number of sentences for each iteration. Defaults to :data:`None`.
max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`.
convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will
be a Torch Tensor. Defaults to None.
instruction (Optional[str], optional): The text of instruction. Defaults to None.
instruction_format (Optional[str], optional): Format for instruction. Defaults to None.
be a Torch Tensor. Defaults to :data:`None`.
instruction (Optional[str], optional): The instruction text. Defaults to :data:`None`.
instruction_format (Optional[str], optional): Format for instruction. Defaults to :data:`None`.

Returns:
Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor.
@@ -396,7 +394,7 @@ def _concatenate_results_from_multi_process(self, results_list: List[Union[torch
"""concatenate and return the results from all the processes

Args:
results_list (List[Union[torch.Tensor, np.ndarray, Any]]): a list of results from all the processes
results_list (List[Union[torch.Tensor, np.ndarray, Any]]): A list of results from all the processes.

Raises:
NotImplementedError: Unsupported type for results_list
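End to end, the encode API documented in this file is exercised through a concrete subclass. A hedged sketch using `FlagModel` as the assumed concrete embedder; the checkpoint name and inputs are illustrative, and the parameter names follow the docstring above:

```python
from FlagEmbedding import FlagModel  # assumed concrete AbsEmbedder subclass

model = FlagModel(
    "BAAI/bge-base-en-v1.5",   # illustrative checkpoint
    normalize_embeddings=True,
    use_fp16=True,
)

q_emb = model.encode_queries(["what is a corpus?"])
p_emb = model.encode_corpus(["A corpus is a collection of documents."])

# With normalized embeddings, the inner product equals cosine similarity.
scores = q_emb @ p_emb.T
```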