Inference docstring #1186

Merged: 5 commits, Nov 4, 2024
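The substance of this PR is mechanical: literal default values in the Google-style docstrings (`None`, `'test'`, `[1, 3, 5, 10, 100, 1000]`, ...) are wrapped in Sphinx's `:data:` role so they render as code literals and, where a matching target exists (e.g. built-in `None` via intersphinx), as cross-references. A minimal sketch of the convention, assuming `sphinx.ext.autodoc` with `sphinx.ext.napoleon`; the function itself is hypothetical:

```python
def load_queries(dataset_name=None, split='test'):
    """Load queries for evaluation.

    Args:
        dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.
        split (str, optional): Split to load. Defaults to :data:`'test'`.
    """
```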
40 changes: 20 additions & 20 deletions FlagEmbedding/abc/evaluation/data_loader.py
@@ -17,10 +17,10 @@ class AbsEvalDataLoader(ABC):

Args:
eval_name (str): The experiment name of the current evaluation.
dataset_dir (str, optional): path to the datasets. Defaults to None.
cache_dir (str, optional): Path to HuggingFace cache directory. Defaults to None.
token (str, optional): HF_TOKEN to access the private datasets/models in HF. Defaults to None.
force_redownload: If True, will force redownload the dataset to cover the local dataset. Defaults to False.
dataset_dir (str, optional): Path to the datasets. Defaults to :data:`None`.
cache_dir (str, optional): Path to HuggingFace cache directory. Defaults to :data:`None`.
token (str, optional): HF_TOKEN to access private datasets/models on HF. Defaults to :data:`None`.
force_redownload (bool, optional): If True, force re-downloading the dataset to overwrite the local copy. Defaults to :data:`False`.
"""
def __init__(
self,
@@ -98,7 +98,7 @@ def load_corpus(self, dataset_name: Optional[str] = None) -> datasets.DatasetDict
"""Load the corpus from the dataset.

Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to None.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.

Returns:
datasets.DatasetDict: A dict of corpus with id as key, title and text as value.
@@ -116,8 +116,8 @@ def load_qrels(self, dataset_name: Optional[str] = None, split: str = 'test') ->
"""Load the corpus from the dataset.

Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to None.
split (str, optional): The split to load relevance from. Defaults to 'test'.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.
split (str, optional): The split to load relevance from. Defaults to :data:`'test'`.

Raises:
ValueError
@@ -144,8 +144,8 @@ def load_queries(self, dataset_name: Optional[str] = None, split: str = 'test')
"""Load the queries from the dataset.

Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to None.
split (str, optional): The split to load queries from. Defaults to 'test'.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.
split (str, optional): The split to load queries from. Defaults to :data:`'test'`.

Raises:
ValueError
@@ -176,8 +176,8 @@ def _load_remote_corpus(
"""Abstract method to load corpus from remote dataset, to be overrode in child class.

Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to None.
save_dir (Optional[str], optional): Path to save the new downloaded corpus. Defaults to None.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.
save_dir (Optional[str], optional): Path to save the new downloaded corpus. Defaults to :data:`None`.

Raises:
NotImplementedError: Loading remote corpus is not implemented.
@@ -196,9 +196,9 @@ def _load_remote_qrels(
"""Abstract method to load relevance from remote dataset, to be overrode in child class.

Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to None.
split (str, optional): Split to load from the remote dataset. Defaults to 'test'.
save_dir (Optional[str], optional): Path to save the new downloaded relevance. Defaults to None.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.
split (str, optional): Split to load from the remote dataset. Defaults to :data:`'test'`.
save_dir (Optional[str], optional): Path to save the new downloaded relevance. Defaults to :data:`None`.

Raises:
NotImplementedError: Loading remote qrels is not implemented.
@@ -217,9 +217,9 @@ def _load_remote_queries(
"""Abstract method to load queries from remote dataset, to be overrode in child class.

Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to None.
split (str, optional): Split to load from the remote dataset. Defaults to 'test'.
save_dir (Optional[str], optional): Path to save the new downloaded queries. Defaults to None.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.
split (str, optional): Split to load from the remote dataset. Defaults to :data:`'test'`.
save_dir (Optional[str], optional): Path to save the new downloaded queries. Defaults to :data:`None`.

Raises:
NotImplementedError
@@ -234,7 +234,7 @@ def _load_local_corpus(self, save_dir: str, dataset_name: Optional[str] = None)

Args:
save_dir (str): Path to save the loaded corpus.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to None.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.

Returns:
datasets.DatasetDict: A dict of corpus with id as key, title and text as value.
@@ -257,8 +257,8 @@ def _load_local_qrels(self, save_dir: str, dataset_name: Optional[str] = None, s

Args:
save_dir (str): Path to save the loaded relevance.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to None.
split (str, optional): Split to load from the local dataset. Defaults to 'test'.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to :data:`None`.
split (str, optional): Split to load from the local dataset. Defaults to :data:`'test'`.

Raises:
ValueError
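Taken together, the loader API documented above is driven roughly like this. A hedged sketch, where `MyEvalDataLoader` and the dataset name are hypothetical stand-ins for a concrete subclass:

```python
# Hypothetical concrete subclass of AbsEvalDataLoader; names are illustrative.
loader = MyEvalDataLoader(
    eval_name="my_eval",     # experiment name for this evaluation
    dataset_dir="./data",    # local datasets; falls back to remote when None
    cache_dir=None,
    token=None,              # HF_TOKEN for private datasets, if needed
    force_redownload=False,
)

corpus = loader.load_corpus(dataset_name="my_dataset")   # {id: {title, text}}
queries = loader.load_queries(dataset_name="my_dataset", split="test")
qrels = loader.load_qrels(dataset_name="my_dataset", split="test")
```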
16 changes: 8 additions & 8 deletions FlagEmbedding/abc/evaluation/evaluator.py
@@ -116,12 +116,12 @@ def __call__(
Args:
splits (Union[str, List[str]]): Splits of datasets.
search_results_save_dir (str): Directory to save the search results.
retriever (EvalRetriever): object of :class:EvalRetriever
reranker (Optional[EvalReranker], optional): Object of :class:EvalReranker. Defaults to None.
corpus_embd_save_dir (Optional[str], optional): Directory to save the embedded corpus. Defaults to None.
ignore_identical_ids (bool, optional): If True, will ignore identical ids in search results. Defaults to False.
k_values (List[int], optional): Cutoffs. Defaults to [1, 3, 5, 10, 100, 1000].
dataset_name (Optional[str], optional): Name of the datasets. Defaults to None.
retriever (EvalRetriever): Object of :class:`EvalRetriever`.
reranker (Optional[EvalReranker], optional): Object of :class:`EvalReranker`. Defaults to :data:`None`.
corpus_embd_save_dir (Optional[str], optional): Directory to save the embedded corpus. Defaults to :data:`None`.
ignore_identical_ids (bool, optional): If True, will ignore identical ids in search results. Defaults to :data:`False`.
k_values (List[int], optional): Cutoffs. Defaults to :data:`[1, 3, 5, 10, 100, 1000]`.
dataset_name (Optional[str], optional): Name of the datasets. Defaults to :data:`None`.
"""
# Check Splits
checked_splits = self.data_loader.check_splits(splits, dataset_name=dataset_name)
@@ -278,7 +278,7 @@ def save_search_results(
search_results (Dict[str, Dict[str, float]]): Dictionary of search results.
output_path (str): Output path to write the results.
split (str): Split used in searching.
dataset_name (Optional[str], optional): Name of dataset used. Defaults to None.
dataset_name (Optional[str], optional): Name of dataset used. Defaults to :data:`None`.
"""
data = {
"eval_name": eval_name,
@@ -354,7 +354,7 @@ def evaluate_results(

Args:
search_results_save_dir (str): Path to the search results.
k_values (List[int], optional): Cutoffs. Defaults to [1, 3, 5, 10, 100, 1000].
k_values (List[int], optional): Cutoffs. Defaults to :data:`[1, 3, 5, 10, 100, 1000]`.

Returns:
dict: The computed evaluation metrics.
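For orientation, the documented `__call__` parameters compose as follows. A sketch assuming `evaluator`, `retriever`, and (optionally) `reranker` are already-constructed instances:

```python
# All objects here are assumed to exist; paths and names are illustrative.
evaluator(
    splits=["dev", "test"],
    search_results_save_dir="./search_results",
    retriever=retriever,                   # EvalRetriever instance
    reranker=reranker,                     # EvalReranker instance or None
    corpus_embd_save_dir="./corpus_embd",  # cache the embedded corpus
    ignore_identical_ids=False,
    k_values=[1, 3, 5, 10, 100, 1000],     # the documented default cutoffs
    dataset_name="my_dataset",
)
```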
6 changes: 3 additions & 3 deletions FlagEmbedding/abc/evaluation/runner.py
@@ -145,9 +145,9 @@ def evaluate_metrics(

Args:
search_results_save_dir (str): Path to save the search results.
output_method (str, optional): Output results to `json` or `markdown`. Defaults to "markdown".
output_path (str, optional): Path to write the output. Defaults to "./eval_dev_results.md".
metrics (Union[str, List[str]], optional): metrics to use. Defaults to ["ndcg_at_10", "recall_at_10"].
output_method (str, optional): Output results to `json` or `markdown`. Defaults to :data:`"markdown"`.
output_path (str, optional): Path to write the output. Defaults to :data:`"./eval_dev_results.md"`.
metrics (Union[str, List[str]], optional): Metrics to use. Defaults to :data:`["ndcg_at_10", "recall_at_10"]`.

Raises:
FileNotFoundError: Eval results not found
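A hedged call sketch for the signature above; the `runner` instance and paths are illustrative:

```python
runner.evaluate_metrics(
    search_results_save_dir="./search_results",
    output_method="markdown",                # or "json"
    output_path="./eval_dev_results.md",
    metrics=["ndcg_at_10", "recall_at_10"],
)
```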
4 changes: 4 additions & 0 deletions FlagEmbedding/abc/evaluation/searcher.py
@@ -57,6 +57,8 @@ def __call__(
queries: Dict[str, str]: Queries to search for.
Structure: {<qid>: <query>}.
Example: {"q-0": "This is a query."}
corpus_embd_save_dir (Optional[str]): Defaults to :data:`None`.
ignore_identical_ids (bool): Defaults to :data:`False`.
**kwargs: Any: Additional arguments.

Returns: Dict[str, Dict[str, float]]: Top-k search results for each query. k is specified by search_top_k.
@@ -87,6 +89,8 @@ def __call__(
queries: Dict[str, str]: Queries to search for.
Structure: {<qid>: <query>}.
Example: {"q-0": "This is a query."}
corpus_embd_save_dir (Optional[str]): Defaults to :data:`None`.
ignore_identical_ids (bool): Defaults to :data:`False`.
**kwargs: Any: Additional arguments.

Returns: Dict[str, Dict[str, float]]: Top-k search results for each query. k is specified by search_top_k.
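Both `__call__` docstrings above share the same input/output shape. A sketch with hypothetical data, assuming `retriever` implements the retriever-side `__call__` (the `corpus` argument is assumed from the surrounding class, not shown in this hunk):

```python
corpus = {"doc-0": "This is a document."}   # {<docid>: <text>}, illustrative
queries = {"q-0": "This is a query."}       # structure from the docstring

results = retriever(
    corpus=corpus,
    queries=queries,
    corpus_embd_save_dir=None,    # newly documented, defaults to None
    ignore_identical_ids=False,   # newly documented, defaults to False
)
# results: {"q-0": {"doc-0": 0.87, ...}} -- top-k scores per query
```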
6 changes: 3 additions & 3 deletions FlagEmbedding/abc/evaluation/utils.py
@@ -162,9 +162,9 @@ def search(

Args:
faiss_index (faiss.Index): The Faiss index that contains all the corpus embeddings.
k (int, optional): Top k numbers of closest neighbours. Defaults to 100.
query_embeddings (Optional[np.ndarray], optional): The embedding vectors of queries. Defaults to None.
load_path (Optional[str], optional): Path to load embeddings from. Defaults to None.
k (int, optional): Number of nearest neighbours to retrieve per query (top-k). Defaults to :data:`100`.
query_embeddings (Optional[np.ndarray], optional): The embedding vectors of queries. Defaults to :data:`None`.
load_path (Optional[str], optional): Path to load embeddings from. Defaults to :data:`None`.

Returns:
Tuple[np.ndarray, np.ndarray]: The scores of search results and their corresponding indices.
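The `search` helper above wraps a standard Faiss k-NN query. A minimal self-contained sketch of that underlying pattern (dimensions and index type are illustrative, not taken from the code):

```python
import faiss
import numpy as np

dim = 768
corpus_embeddings = np.random.rand(10_000, dim).astype("float32")
query_embeddings = np.random.rand(4, dim).astype("float32")

index = faiss.IndexFlatIP(dim)   # exact inner-product index
index.add(corpus_embeddings)

k = 100                          # the documented default cutoff
scores, indices = index.search(query_embeddings, k)
# scores, indices: np.ndarray, each of shape (n_queries, k)
```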
46 changes: 22 additions & 24 deletions FlagEmbedding/abc/inference/AbsEmbedder.py
@@ -18,25 +18,23 @@
class AbsEmbedder(ABC):
"""
Base class for embedder.
Extend this class and implement :meth:`encode_queries`, :meth:`encode_passages`, :meth:`encode` for custom embedders.
Extend this class and implement :meth:`encode_queries`, :meth:`encode_corpus`, :meth:`encode` for custom embedders.

Args:
model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and
load a model from HuggingFace Hub with the name.
normalize_embeddings (bool, optional): If True, normalize the embedding vector. Default: `True`.
normalize_embeddings (bool, optional): If True, normalize the embedding vector. Defaults to :data:`True`.
use_fp16 (bool, optional): If True, use half-precision floating-point to speed up computation with a slight performance
degradation. Default: `True`.
degradation. Defaults to :data:`True`.
query_instruction_for_retrieval (Optional[str], optional): Query instruction for retrieval tasks, which will be used
with :attr:`query_instruction_format`. Default: `None`.
query_instruction_format: (str, optional): The template for :attr:`query_instruction_for_retrieval`. Default: `"{}{}"`.
devices (Optional[Union[str, int, List[str], List[int]]], optional): Devices to use for model inference. Default: `None`.
batch_size (int, optional): Batch size for inference. Default: `256`.
query_max_length (int, optional): Maximum length for query. Default: `512`.
passage_max_length (int, optional): Maximum length for passage. Default: `512`.
instruction (Optional[str], optional): Instruction for embedding with :attr:`instruction_format`. Default: `None`.
instruction_format (str, optional): Instruction format when using :attr:`instruction`. Default: `"{}{}"`.
with :attr:`query_instruction_format`. Defaults to :data:`None`.
query_instruction_format (str, optional): The template for :attr:`query_instruction_for_retrieval`. Defaults to :data:`"{}{}"`.
devices (Optional[Union[str, int, List[str], List[int]]], optional): Devices to use for model inference. Defaults to :data:`None`.
batch_size (int, optional): Batch size for inference. Defaults to :data:`256`.
query_max_length (int, optional): Maximum length for query. Defaults to :data:`512`.
passage_max_length (int, optional): Maximum length for passage. Defaults to :data:`512`.
convert_to_numpy (bool, optional): If True, the output embedding will be a Numpy array. Otherwise, it will be a Torch Tensor.
Default: `True`.
Defaults to :data:`True`.
kwargs (Dict[Any], optional): Additional parameters for HuggingFace Transformers config or child classes.
"""

@@ -139,10 +137,10 @@ def encode_queries(

Args:
queries (Union[List[str], str]): Input queries to encode.
batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to None.
max_length (Optional[int], optional): Maximum length of tokens. Defaults to None.
batch_size (Optional[int], optional): Number of sentences for each iteration. Defaults to :data:`None`.
max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`.
convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will
be a Torch Tensor. Defaults to None.
be a Torch Tensor. Defaults to :data:`None`.

Returns:
Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor.
@@ -173,10 +171,10 @@ def encode_corpus(

Args:
corpus (Union[List[str], str]): Input corpus to encode.
batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to None.
max_length (Optional[int], optional): Maximum length of tokens. Defaults to None.
batch_size (Optional[int], optional): Number of sentences for each iteration. Defaults to :data:`None`.
max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`.
convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will
be a Torch Tensor. Defaults to None.
be a Torch Tensor. Defaults to :data:`None`.

Returns:
Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor.
@@ -212,12 +210,12 @@ def encode(

Args:
sentences (Union[List[str], str]): Input sentences to encode.
batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to None.
max_length (Optional[int], optional): Maximum length of tokens. Defaults to None.
batch_size (Optional[int], optional): Number of sentences for each iteration. Defaults to :data:`None`.
max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`.
convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will
be a Torch Tensor. Defaults to None.
instruction (Optional[str], optional): The text of instruction. Defaults to None.
instruction_format (Optional[str], optional): Format for instruction. Defaults to None.
be a Torch Tensor. Defaults to :data:`None`.
instruction (Optional[str], optional): The instruction text. Defaults to :data:`None`.
instruction_format (Optional[str], optional): Format for instruction. Defaults to :data:`None`.

Returns:
Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor.
@@ -396,7 +394,7 @@ def _concatenate_results_from_multi_process(self, results_list: List[Union[torch
"""concatenate and return the results from all the processes

Args:
results_list (List[Union[torch.Tensor, np.ndarray, Any]]): a list of results from all the processes
results_list (List[Union[torch.Tensor, np.ndarray, Any]]): A list of results from all the processes.

Raises:
NotImplementedError: Unsupported type for results_list
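End to end, the encode API documented in this file is exercised through a concrete subclass. A hedged sketch using `FlagModel` as the assumed concrete embedder; the checkpoint name and inputs are illustrative, and the parameter names follow the docstring above:

```python
from FlagEmbedding import FlagModel  # assumed concrete AbsEmbedder subclass

model = FlagModel(
    "BAAI/bge-base-en-v1.5",   # illustrative checkpoint
    normalize_embeddings=True,
    use_fp16=True,
)

q_emb = model.encode_queries(["what is a corpus?"])
p_emb = model.encode_corpus(["A corpus is a collection of documents."])

# With normalized embeddings, the inner product equals cosine similarity.
scores = q_emb @ p_emb.T
```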