From d314bbb0f444b9f3af837d0ed6ee85b1802526d7 Mon Sep 17 00:00:00 2001 From: Alex Strick van Linschoten Date: Sun, 28 Apr 2024 12:23:40 +0200 Subject: [PATCH 1/2] fix typos --- spacy/cli/find_threshold.py | 4 ++-- spacy/tests/test_language.py | 2 +- website/docs/api/attributes.mdx | 2 +- website/docs/api/cli.mdx | 4 ++-- website/docs/api/entitylinker.mdx | 4 ++-- website/docs/api/entityruler.mdx | 2 +- website/docs/api/span.mdx | 2 +- website/docs/api/transformer.mdx | 2 +- website/docs/api/vectors.mdx | 2 +- website/docs/usage/layers-architectures.mdx | 2 +- website/docs/usage/linguistic-features.mdx | 2 +- website/docs/usage/projects.mdx | 4 ++-- website/docs/usage/saving-loading.mdx | 2 +- website/docs/usage/v2-2.mdx | 2 +- website/docs/usage/v3-2.mdx | 2 +- 15 files changed, 19 insertions(+), 19 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 6d591053df9..875978eeeb0 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -39,7 +39,7 @@ def find_threshold_cli( # fmt: on ): """ - Runs prediction trials for a trained model with varying tresholds to maximize + Runs prediction trials for a trained model with varying thresholds to maximize the specified metric. The search space for the threshold is traversed linearly from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout` (the corresponding API call to `spacy.cli.find_threshold.find_threshold()` @@ -81,7 +81,7 @@ def find_threshold( silent: bool = True, ) -> Tuple[float, float, Dict[float, float]]: """ - Runs prediction trials for models with varying tresholds to maximize the specified metric. + Runs prediction trials for models with varying thresholds to maximize the specified metric. model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory. data_path (Path): Path to file with DocBin with docs to use for threshold search. pipe_name (str): Name of pipe to examine thresholds for. diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 236856dad2c..ce3fbb57670 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -327,7 +327,7 @@ def test_language_pipe_error_handler(n_process): nlp.set_error_handler(raise_error) with pytest.raises(ValueError): list(nlp.pipe(texts, n_process=n_process)) - # set explicitely to ignoring + # set explicitly to ignoring nlp.set_error_handler(ignore_error) docs = list(nlp.pipe(texts, n_process=n_process)) assert len(docs) == 0 diff --git a/website/docs/api/attributes.mdx b/website/docs/api/attributes.mdx index 3142b741d9a..c2030fa33f9 100644 --- a/website/docs/api/attributes.mdx +++ b/website/docs/api/attributes.mdx @@ -49,7 +49,7 @@ appending `_` as in `token.dep_`. | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `DEP` | The token's dependency label. ~~str~~ | | `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | -| `ENT_IOB` | The IOB part of the token's entity tag. Uses custom integer vaues rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ | +| `ENT_IOB` | The IOB part of the token's entity tag. Uses custom integer values rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ | | `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ | | `ENT_TYPE` | The token's entity label. ~~str~~ | | `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ | diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 5b4bca1ce52..c3aac6ce22b 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -521,7 +521,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL' 'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC' (1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338) ✔ Good amount of examples for all labels -✔ Examples without occurences available for all labels +✔ Examples without occurrences available for all labels ✔ No entities consisting of or starting/ending with whitespace =========================== Part-of-speech Tagging =========================== @@ -1233,7 +1233,7 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] ## find-threshold {id="find-threshold",version="3.5",tag="command"} -Runs prediction trials for a trained model with varying tresholds to maximize +Runs prediction trials for a trained model with varying thresholds to maximize the specified metric. The search space for the threshold is traversed linearly from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout` (the corresponding API call to `spacy.cli.find_threshold.find_threshold()` diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index 21d2e9015ce..b57ecd85dbb 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -67,7 +67,7 @@ architectures and their arguments and hyperparameters. | `generate_empty_kb` 3.5.1 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ | | `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | | `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | -| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | +| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entity_linker.py @@ -114,7 +114,7 @@ custom knowledge base, you should either call | `incl_context` | Whether or not to include the local context in the model. ~~bool~~ | | `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | | `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | -| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | +| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | ## EntityLinker.\_\_call\_\_ {id="call",tag="method"} diff --git a/website/docs/api/entityruler.mdx b/website/docs/api/entityruler.mdx index 27624398ec6..58f66906d6b 100644 --- a/website/docs/api/entityruler.mdx +++ b/website/docs/api/entityruler.mdx @@ -173,7 +173,7 @@ happens automatically after the component has been added to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized with `overwrite_ents=True`, existing entities will be replaced if they overlap with the matches. When matches overlap in a Doc, the entity ruler prioritizes -longer patterns over shorter, and if equal the match occuring first in the Doc +longer patterns over shorter, and if equal the match occurring first in the Doc is chosen. > #### Example diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx index 41422a5b4e1..225ff6e6acd 100644 --- a/website/docs/api/span.mdx +++ b/website/docs/api/span.mdx @@ -288,7 +288,7 @@ it – so no NP-level coordination, no prepositional phrases, and no relative clauses. If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data) -has not been implemeted for the given language, a `NotImplementedError` is +has not been implemented for the given language, a `NotImplementedError` is raised. > #### Example diff --git a/website/docs/api/transformer.mdx b/website/docs/api/transformer.mdx index ad8ecce5454..d3a4457e1d4 100644 --- a/website/docs/api/transformer.mdx +++ b/website/docs/api/transformer.mdx @@ -405,7 +405,7 @@ by this class. Instances of this class are typically assigned to the | `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ | | `width` | The width of the last hidden layer. ~~int~~ | -### TransformerData.empty {id="transformerdata-emoty",tag="classmethod"} +### TransformerData.empty {id="transformerdata-empty",tag="classmethod"} Create an empty `TransformerData` container. diff --git a/website/docs/api/vectors.mdx b/website/docs/api/vectors.mdx index d6033c0966d..fcb50550193 100644 --- a/website/docs/api/vectors.mdx +++ b/website/docs/api/vectors.mdx @@ -440,7 +440,7 @@ Load state from a binary string. > #### Example > > ```python -> fron spacy.vectors import Vectors +> from spacy.vectors import Vectors > vectors_bytes = vectors.to_bytes() > new_vectors = Vectors(StringStore()) > new_vectors.from_bytes(vectors_bytes) diff --git a/website/docs/usage/layers-architectures.mdx b/website/docs/usage/layers-architectures.mdx index 8f6bf3a205f..e1c5d2bf6db 100644 --- a/website/docs/usage/layers-architectures.mdx +++ b/website/docs/usage/layers-architectures.mdx @@ -830,7 +830,7 @@ retrieve and add to them. After creation, the component needs to be [initialized](/usage/training#initialization). This method can define the -relevant labels in two ways: explicitely by setting the `labels` argument in the +relevant labels in two ways: explicitly by setting the `labels` argument in the [`initialize` block](/api/data-formats#config-initialize) of the config, or implicately by deducing them from the `get_examples` callback that generates the full **training data set**, or a representative sample. diff --git a/website/docs/usage/linguistic-features.mdx b/website/docs/usage/linguistic-features.mdx index 55d5680fe9c..f930174aba3 100644 --- a/website/docs/usage/linguistic-features.mdx +++ b/website/docs/usage/linguistic-features.mdx @@ -1900,7 +1900,7 @@ the two words. "Shore": ("coast", 0.732257), "Precautionary": ("caution", 0.490973), "hopelessness": ("sadness", 0.742366), - "Continous": ("continuous", 0.732549), + "Continuous": ("continuous", 0.732549), "Disemboweled": ("corpse", 0.499432), "biostatistician": ("scientist", 0.339724), "somewheres": ("somewheres", 0.402736), diff --git a/website/docs/usage/projects.mdx b/website/docs/usage/projects.mdx index f3cca8013f1..c25a54ff58f 100644 --- a/website/docs/usage/projects.mdx +++ b/website/docs/usage/projects.mdx @@ -173,7 +173,7 @@ detected, a corresponding warning is displayed. If you'd like to disable the dependency check, set `check_requirements: false` in your project's `project.yml`. -### 4. Run a workflow {id="run-workfow"} +### 4. Run a workflow {id="run-workflow"} > #### project.yml > @@ -286,7 +286,7 @@ pipelines. | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). | | `description` | An optional project description used in [auto-generated docs](#custom-docs). | -| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overriden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | +| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overridden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | | `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. | | `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. | | `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. | diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx index aad8ea35394..4dfc73ecac6 100644 --- a/website/docs/usage/saving-loading.mdx +++ b/website/docs/usage/saving-loading.mdx @@ -346,7 +346,7 @@ them**! To stick with the theme of [this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/), consider the following custom spaCy -[pipeline component](/usage/processing-pipelines#custom-coponents) that prints a +[pipeline component](/usage/processing-pipelines#custom-components) that prints a snake when it's called: > #### Package directory structure diff --git a/website/docs/usage/v2-2.mdx b/website/docs/usage/v2-2.mdx index 84129657dda..cf4f7c5bf57 100644 --- a/website/docs/usage/v2-2.mdx +++ b/website/docs/usage/v2-2.mdx @@ -185,7 +185,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL' 'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC' (1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338) ✔ Good amount of examples for all labels -✔ Examples without occurences available for all labels +✔ Examples without occurrences available for all labels ✔ No entities consisting of or starting/ending with whitespace =========================== Part-of-speech Tagging =========================== diff --git a/website/docs/usage/v3-2.mdx b/website/docs/usage/v3-2.mdx index b4a4ef67242..b3ffd5d6820 100644 --- a/website/docs/usage/v3-2.mdx +++ b/website/docs/usage/v3-2.mdx @@ -138,7 +138,7 @@ backwards compatibility, the tuple format remains available under `TransformerData.tensors` and `FullTransformerBatch.tensors`. See more details in the [transformer API docs](/api/architectures#TransformerModel). -`spacy-transfomers` v1.1 also adds support for `transformer_config` settings +`spacy-transformers` v1.1 also adds support for `transformer_config` settings such as `output_attentions`. Additional output is stored under `TransformerData.model_output`. More details are in the [TransformerModel docs](/api/architectures#TransformerModel). The training speed From 1f22e197f504e705ed0461d12e8416ee42c42da5 Mon Sep 17 00:00:00 2001 From: Alex Strick van Linschoten Date: Mon, 29 Apr 2024 10:14:28 +0200 Subject: [PATCH 2/2] prettier formatting --- website/docs/api/attributes.mdx | 58 +++++++++++++-------------- website/docs/api/cli.mdx | 14 +++---- website/docs/api/entitylinker.mdx | 30 +++++++------- website/docs/api/entityruler.mdx | 4 +- website/docs/usage/projects.mdx | 2 +- website/docs/usage/saving-loading.mdx | 11 +++-- 6 files changed, 61 insertions(+), 58 deletions(-) diff --git a/website/docs/api/attributes.mdx b/website/docs/api/attributes.mdx index c2030fa33f9..9cb76ac5842 100644 --- a/website/docs/api/attributes.mdx +++ b/website/docs/api/attributes.mdx @@ -45,33 +45,33 @@ For attributes that represent string values, the internal integer ID is accessed as `Token.attr`, e.g. `token.dep`, while the string value can be retrieved by appending `_` as in `token.dep_`. -| Attribute | Description | -| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `DEP` | The token's dependency label. ~~str~~ | -| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | +| Attribute | Description | +| ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `DEP` | The token's dependency label. ~~str~~ | +| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | | `ENT_IOB` | The IOB part of the token's entity tag. Uses custom integer values rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ | -| `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ | -| `ENT_TYPE` | The token's entity label. ~~str~~ | -| `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ | -| `IS_ASCII` | Token text consists of ASCII characters. ~~bool~~ | -| `IS_DIGIT` | Token text consists of digits. ~~bool~~ | -| `IS_LOWER` | Token text is in lowercase. ~~bool~~ | -| `IS_PUNCT` | Token is punctuation. ~~bool~~ | -| `IS_SPACE` | Token is whitespace. ~~bool~~ | -| `IS_STOP` | Token is a stop word. ~~bool~~ | -| `IS_TITLE` | Token text is in titlecase. ~~bool~~ | -| `IS_UPPER` | Token text is in uppercase. ~~bool~~ | -| `LEMMA` | The token's lemma. ~~str~~ | -| `LENGTH` | The length of the token text. ~~int~~ | -| `LIKE_EMAIL` | Token text resembles an email address. ~~bool~~ | -| `LIKE_NUM` | Token text resembles a number. ~~bool~~ | -| `LIKE_URL` | Token text resembles a URL. ~~bool~~ | -| `LOWER` | The lowercase form of the token text. ~~str~~ | -| `MORPH` | The token's morphological analysis. ~~MorphAnalysis~~ | -| `NORM` | The normalized form of the token text. ~~str~~ | -| `ORTH` | The exact verbatim text of a token. ~~str~~ | -| `POS` | The token's universal part of speech (UPOS). ~~str~~ | -| `SENT_START` | Token is start of sentence. ~~bool~~ | -| `SHAPE` | The token's shape. ~~str~~ | -| `SPACY` | Token has a trailing space. ~~bool~~ | -| `TAG` | The token's fine-grained part of speech. ~~str~~ | +| `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ | +| `ENT_TYPE` | The token's entity label. ~~str~~ | +| `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ | +| `IS_ASCII` | Token text consists of ASCII characters. ~~bool~~ | +| `IS_DIGIT` | Token text consists of digits. ~~bool~~ | +| `IS_LOWER` | Token text is in lowercase. ~~bool~~ | +| `IS_PUNCT` | Token is punctuation. ~~bool~~ | +| `IS_SPACE` | Token is whitespace. ~~bool~~ | +| `IS_STOP` | Token is a stop word. ~~bool~~ | +| `IS_TITLE` | Token text is in titlecase. ~~bool~~ | +| `IS_UPPER` | Token text is in uppercase. ~~bool~~ | +| `LEMMA` | The token's lemma. ~~str~~ | +| `LENGTH` | The length of the token text. ~~int~~ | +| `LIKE_EMAIL` | Token text resembles an email address. ~~bool~~ | +| `LIKE_NUM` | Token text resembles a number. ~~bool~~ | +| `LIKE_URL` | Token text resembles a URL. ~~bool~~ | +| `LOWER` | The lowercase form of the token text. ~~str~~ | +| `MORPH` | The token's morphological analysis. ~~MorphAnalysis~~ | +| `NORM` | The normalized form of the token text. ~~str~~ | +| `ORTH` | The exact verbatim text of a token. ~~str~~ | +| `POS` | The token's universal part of speech (UPOS). ~~str~~ | +| `SENT_START` | Token is start of sentence. ~~bool~~ | +| `SHAPE` | The token's shape. ~~str~~ | +| `SPACY` | Token has a trailing space. ~~bool~~ | +| `TAG` | The token's fine-grained part of speech. ~~str~~ | diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index c3aac6ce22b..95bffd7f377 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1650,10 +1650,10 @@ $ python -m spacy huggingface-hub push [whl_path] [--org] [--msg] [--verbose] > $ python -m spacy huggingface-hub push en_ner_fashion-0.0.0-py3-none-any.whl > ``` -| Name | Description | -| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~ | -| `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ | -| `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ | -| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~ | -| **UPLOADS** | The pipeline to the hub. | +| Name | Description | +| ----------------- | ------------------------------------------------------------------------------------------------------------------- | +| `whl_path` | The path to the `.whl` file packaged with [`spacy package`](https://spacy.io/api/cli#package). ~~Path(positional)~~ | +| `--org`, `-o` | Optional name of organization to which the pipeline should be uploaded. ~~str (option)~~ | +| `--msg`, `-m` | Commit message to use for update. Defaults to `"Update spaCy pipeline"`. ~~str (option)~~ | +| `--verbose`, `-V` | Output additional info for debugging, e.g. the full generated hub metadata. ~~bool (flag)~~ | +| **UPLOADS** | The pipeline to the hub. | diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index b57ecd85dbb..7bf4ccf75de 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -67,7 +67,7 @@ architectures and their arguments and hyperparameters. | `generate_empty_kb` 3.5.1 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ | | `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | | `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | -| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | +| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entity_linker.py @@ -100,20 +100,20 @@ custom knowledge base, you should either call [`set_kb`](/api/entitylinker#set_kb) or provide a `kb_loader` in the [`initialize`](/api/entitylinker#initialize) call. -| Name | Description | -| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | -| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ | -| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | +| Name | Description | +| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | | `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | ## EntityLinker.\_\_call\_\_ {id="call",tag="method"} diff --git a/website/docs/api/entityruler.mdx b/website/docs/api/entityruler.mdx index 58f66906d6b..335e87676c7 100644 --- a/website/docs/api/entityruler.mdx +++ b/website/docs/api/entityruler.mdx @@ -58,7 +58,7 @@ how the component should be configured. You can override its settings via the | Setting | Description | | ---------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ | -| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ | +| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ | | `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | @@ -92,7 +92,7 @@ be a token pattern (list) or a phrase pattern (string). For example: | `name` 3 | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ | | _keyword-only_ | | | `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ | -| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ | +| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ | | `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | diff --git a/website/docs/usage/projects.mdx b/website/docs/usage/projects.mdx index c25a54ff58f..84d5b062222 100644 --- a/website/docs/usage/projects.mdx +++ b/website/docs/usage/projects.mdx @@ -286,7 +286,7 @@ pipelines. | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). | | `description` | An optional project description used in [auto-generated docs](#custom-docs). | -| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overridden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | +| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overridden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | | `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. | | `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. | | `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. | diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx index 4dfc73ecac6..c891e5ea8fb 100644 --- a/website/docs/usage/saving-loading.mdx +++ b/website/docs/usage/saving-loading.mdx @@ -306,7 +306,9 @@ installed in the same environment – that's it. ### Loading probability tables into existing models -You can load a probability table from [spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an existing spaCy model like `en_core_web_sm`. +You can load a probability table from +[spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an +existing spaCy model like `en_core_web_sm`. ```python # Requirements: pip install spacy-lookups-data @@ -317,7 +319,8 @@ lookups = load_lookups("en", ["lexeme_prob"]) nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob")) ``` -When training a model from scratch you can also specify probability tables in the `config.cfg`. +When training a model from scratch you can also specify probability tables in +the `config.cfg`. ```ini {title="config.cfg (excerpt)"} [initialize.lookups] @@ -346,8 +349,8 @@ them**! To stick with the theme of [this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/), consider the following custom spaCy -[pipeline component](/usage/processing-pipelines#custom-components) that prints a -snake when it's called: +[pipeline component](/usage/processing-pipelines#custom-components) that prints +a snake when it's called: > #### Package directory structure >