From 35ff3c94067ff20ef15c4cef23a4ece3fc73df90 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 18 Oct 2023 10:35:56 +0200 Subject: [PATCH 1/5] Fix displacy span stacking. --- spacy/displacy/render.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 2ab41ccc2a2..6500ca6589b 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -164,11 +164,11 @@ def render_spans( token_markup: Dict[str, Any] = {} token_markup["text"] = token concurrent_spans = 0 + intersecting_spans: List[Dict[str, Any]] = [] entities = [] for span in spans: ent = {} if span["start_token"] <= idx < span["end_token"]: - concurrent_spans += 1 span_start = idx == span["start_token"] ent["label"] = span["label"] ent["is_start"] = span_start @@ -176,7 +176,9 @@ def render_spans( # When the span starts, we need to know how many other # spans are on the 'span stack' and will be rendered. # This value becomes the vertical render slot for this entire span - span["render_slot"] = concurrent_spans + span["render_slot"] = \ + (intersecting_spans[-1]["render_slot"] if len(intersecting_spans) else 0) + 1 + intersecting_spans.append(span) ent["render_slot"] = span["render_slot"] kb_id = span.get("kb_id", "") kb_url = span.get("kb_url", "#") From e7ca6a2a56ccf4ba75ab47aadec7d3399a3f0bfa Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 18 Oct 2023 10:39:17 +0200 Subject: [PATCH 2/5] Format. Remove counter. --- blub.py | 20 +++++++++++ render.html | 73 ++++++++++++++++++++++++++++++++++++++++ spacy/displacy/render.py | 8 +++-- 3 files changed, 98 insertions(+), 3 deletions(-) create mode 100644 blub.py create mode 100644 render.html diff --git a/blub.py b/blub.py new file mode 100644 index 00000000000..4b9524815eb --- /dev/null +++ b/blub.py @@ -0,0 +1,20 @@ +doc_rendering = { + "text": "Welcome to the Bank of China.", + "spans": [ + {"start_token": 2, "end_token": 5, "label": "SkillNC"}, + {"start_token": 0, "end_token": 2, "label": "Skill"}, + {"start_token": 1, "end_token": 3, "label": "Skill"}, + ], + "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."], +} + +from spacy import displacy + +html = displacy.render( + doc_rendering, + style="span", + manual=True, + options={"colors": {"Skill": "#56B4E9", "SkillNC": "#FF5733"}}, +) +with open("render.html", "w") as file: + file.write(html) diff --git a/render.html b/render.html new file mode 100644 index 00000000000..b28d3e9358a --- /dev/null +++ b/render.html @@ -0,0 +1,73 @@ +
+ + Welcome + + + + + + + + Skill + + + + + + + + to + + + + + + + + + + + Skill + + + + + + + + the + + + + + + + + + + + SkillNC + + + + + + + + Bank + + + + + + + + + of + + + + + + +China .
\ No newline at end of file diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 6500ca6589b..a6033efc949 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -163,7 +163,6 @@ def render_spans( # start token of said Span. We'll use this for the final HTML render token_markup: Dict[str, Any] = {} token_markup["text"] = token - concurrent_spans = 0 intersecting_spans: List[Dict[str, Any]] = [] entities = [] for span in spans: @@ -176,8 +175,11 @@ def render_spans( # When the span starts, we need to know how many other # spans are on the 'span stack' and will be rendered. # This value becomes the vertical render slot for this entire span - span["render_slot"] = \ - (intersecting_spans[-1]["render_slot"] if len(intersecting_spans) else 0) + 1 + span["render_slot"] = ( + intersecting_spans[-1]["render_slot"] + if len(intersecting_spans) + else 0 + ) + 1 intersecting_spans.append(span) ent["render_slot"] = span["render_slot"] kb_id = span.get("kb_id", "") From eb46a9c62d83a3472329795e6d671f365027f1ab Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 18 Oct 2023 10:42:04 +0200 Subject: [PATCH 3/5] Remove test files. --- blub.py | 20 --------------- render.html | 73 ----------------------------------------------------- 2 files changed, 93 deletions(-) delete mode 100644 blub.py delete mode 100644 render.html diff --git a/blub.py b/blub.py deleted file mode 100644 index 4b9524815eb..00000000000 --- a/blub.py +++ /dev/null @@ -1,20 +0,0 @@ -doc_rendering = { - "text": "Welcome to the Bank of China.", - "spans": [ - {"start_token": 2, "end_token": 5, "label": "SkillNC"}, - {"start_token": 0, "end_token": 2, "label": "Skill"}, - {"start_token": 1, "end_token": 3, "label": "Skill"}, - ], - "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."], -} - -from spacy import displacy - -html = displacy.render( - doc_rendering, - style="span", - manual=True, - options={"colors": {"Skill": "#56B4E9", "SkillNC": "#FF5733"}}, -) -with open("render.html", "w") as file: - file.write(html) diff --git a/render.html b/render.html deleted file mode 100644 index b28d3e9358a..00000000000 --- a/render.html +++ /dev/null @@ -1,73 +0,0 @@ -
- - Welcome - - - - - - - - Skill - - - - - - - - to - - - - - - - - - - - Skill - - - - - - - - the - - - - - - - - - - - SkillNC - - - - - - - - Bank - - - - - - - - - of - - - - - - -China .
\ No newline at end of file From ce814e9e7dd75e6384006b9c8a4c9d0c37f5b8e5 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 18 Oct 2023 14:35:27 +0200 Subject: [PATCH 4/5] Add unit test. Refactor to allow for unit test. --- spacy/displacy/render.py | 29 +++++++++++++++++++++++------ spacy/tests/test_displacy.py | 22 +++++++++++++++++++++- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index a6033efc949..40b9986e85b 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -142,7 +142,25 @@ def render_spans( spans (list): Individual entity spans and their start, end, label, kb_id and kb_url. title (str / None): Document title set in Doc.user_data['title']. """ - per_token_info = [] + per_token_info = self._assemble_per_token_info(tokens, spans) + markup = self._render_markup(per_token_info) + markup = TPL_SPANS.format(content=markup, dir=self.direction) + if title: + markup = TPL_TITLE.format(title=title) + markup + return markup + + @staticmethod + def _assemble_per_token_info( + tokens: List[str], spans: List[Dict[str, Any]] + ) -> List[Dict[str, List[Dict[str, Any]]]]: + """Assembles token info used to generate markup in render_spans(). + tokens (List[str]): Tokens in text. + spans (List[Dict[str, Any]]): Spans in text. + RETURNS (List[Dict[str, List[Dict, str, Any]]]): Per token info needed to render HTML markup for given tokens + and spans. + """ + per_token_info: List[Dict[str, List[Dict[str, Any]]]] = [] + # we must sort so that we can correctly describe when spans need to "stack" # which is determined by their start token, then span length (longer spans on top), # then break any remaining ties with the span label @@ -154,10 +172,12 @@ def render_spans( s["label"], ), ) + for s in spans: # this is the vertical 'slot' that the span will be rendered in # vertical_position = span_label_offset + (offset_step * (slot - 1)) s["render_slot"] = 0 + for idx, token in enumerate(tokens): # Identify if a token belongs to a Span (and which) and if it's a # start token of said Span. We'll use this for the final HTML render @@ -197,11 +217,8 @@ def render_spans( span["render_slot"] = 0 token_markup["entities"] = entities per_token_info.append(token_markup) - markup = self._render_markup(per_token_info) - markup = TPL_SPANS.format(content=markup, dir=self.direction) - if title: - markup = TPL_TITLE.format(title=title) + markup - return markup + + return per_token_info def _render_markup(self, per_token_info: List[Dict[str, Any]]) -> str: """Render the markup from per-token information""" diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 12d903dca45..d2afda54c0b 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -2,7 +2,7 @@ import pytest from spacy import displacy -from spacy.displacy.render import DependencyRenderer, EntityRenderer +from spacy.displacy.render import DependencyRenderer, EntityRenderer, SpanRenderer from spacy.lang.en import English from spacy.lang.fa import Persian from spacy.tokens import Doc, Span @@ -468,3 +468,23 @@ def test_issue12816(en_vocab) -> None: # Verify that the HTML tag is still escaped html = displacy.render(doc, style="span") assert "<TEST>" in html + + +@pytest.mark.issue(13056) +def test_displacy_span_stacking(): + """Test whether span stacking works properly for multiple overlapping spans.""" + spans = [ + {"start_token": 2, "end_token": 5, "label": "SkillNC"}, + {"start_token": 0, "end_token": 2, "label": "Skill"}, + {"start_token": 1, "end_token": 3, "label": "Skill"}, + ] + tokens = ["Welcome", "to", "the", "Bank", "of", "China", "."] + per_token_info = SpanRenderer._assemble_per_token_info(spans=spans, tokens=tokens) + + assert len(per_token_info) == len(tokens) + assert all([len(per_token_info[i]["entities"]) == 1 for i in (0, 3, 4)]) + assert all([len(per_token_info[i]["entities"]) == 2 for i in (1, 2)]) + assert per_token_info[1]["entities"][1]["render_slot"] == 1 + assert per_token_info[1]["entities"][2]["render_slot"] == 2 + assert per_token_info[2]["entities"][1]["render_slot"] == 2 + assert per_token_info[2]["entities"][2]["render_slot"] == 3 From 6178399fabbdddf85d46a0445f5d0c29dd4c5af2 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 18 Oct 2023 15:00:45 +0200 Subject: [PATCH 5/5] Fix off-by-one error in tests. --- spacy/tests/test_displacy.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index d2afda54c0b..b83c7db07f4 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -484,7 +484,7 @@ def test_displacy_span_stacking(): assert len(per_token_info) == len(tokens) assert all([len(per_token_info[i]["entities"]) == 1 for i in (0, 3, 4)]) assert all([len(per_token_info[i]["entities"]) == 2 for i in (1, 2)]) - assert per_token_info[1]["entities"][1]["render_slot"] == 1 - assert per_token_info[1]["entities"][2]["render_slot"] == 2 - assert per_token_info[2]["entities"][1]["render_slot"] == 2 - assert per_token_info[2]["entities"][2]["render_slot"] == 3 + assert per_token_info[1]["entities"][0]["render_slot"] == 1 + assert per_token_info[1]["entities"][1]["render_slot"] == 2 + assert per_token_info[2]["entities"][0]["render_slot"] == 2 + assert per_token_info[2]["entities"][1]["render_slot"] == 3