From 88be7e45bbebaecccaa553df05a33ff3f11caaf8 Mon Sep 17 00:00:00 2001 From: Arnaud Rachez Date: Sun, 2 Mar 2025 22:51:47 +0900 Subject: [PATCH] feat: draft annotation ui --- README.md | 4 ++ wsd/annotate/__init__.py | 0 wsd/annotate/app.py | 64 +++++++++++++++++++++++++---- wsd/annotate/components/__init__.py | 1 + wsd/annotate/components/pop.js | 36 ++++++++++++++++ wsd/annotate/components/pop.py | 26 ++++++++++++ wsd/models/baseline.py | 21 ++++++++-- wsd/parsers/jmdict.py | 12 +++--- 8 files changed, 146 insertions(+), 18 deletions(-) delete mode 100644 wsd/annotate/__init__.py create mode 100644 wsd/annotate/components/__init__.py create mode 100644 wsd/annotate/components/pop.js create mode 100644 wsd/annotate/components/pop.py diff --git a/README.md b/README.md index 6c84462..0cb5fb9 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,10 @@ Let's build a state-of-the-art multi-lingual Word Sense Disambiguation model. - Download the data with `git lfs fetch --all` - See `examples/data.ipynb`. +## Annotate new data + +Run `mesop wsd/annotate/app.py` + ## Attribution and LICENSE - [The JMDict Project](https://www.edrdg.org/jmdict/j_jmdict.html) - [XL-WSD](https://sapienzanlp.github.io/xl-wsd/docs/data/) diff --git a/wsd/annotate/__init__.py b/wsd/annotate/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/wsd/annotate/app.py b/wsd/annotate/app.py index 8f734bf..2831edd 100644 --- a/wsd/annotate/app.py +++ b/wsd/annotate/app.py @@ -1,10 +1,18 @@ +from dataclasses import field +from typing import List import mesop as me +import mesop.labs as mel from mesop.server.wsgi_app import create_app from wsd.parsers import XLWSDParser +from wsd.parsers.jmdict import Entry +from wsd.models import JMDict +from wsd.annotate.components import linpop_component -parser = XLWSDParser() -X, y = parser.parse("ja") +xlwsd = XLWSDParser() +X, y = xlwsd.parse("ja") + +jmdict = JMDict() style_grid = me.Style( display="grid", @@ -19,23 +27,61 @@ padding=me.Padding.all(24), overflow_y="auto" ) -style_footer = me.Style( - background="#f0f0f0", - padding=me.Padding.all(24) +style_card = me.Style( + padding=me.Padding.all(24), + overflow_y="auto", + z_index=100, + box_shadow="0 0 10px rgba(0, 0, 0, 0.1)" +) +style_group = me.Style( + display="flex", + gap=8 ) +@me.stateclass +class State: + candidates: List[Entry] = field(default_factory=list) -@me.page(path="/") +@me.page( + path="/", + security_policy=me.SecurityPolicy( + allowed_script_srcs=[ + "https://cdn.jsdelivr.net", + ] + ), +) def app(): with me.box(style=style_grid): with me.box(style=style_header): me.text("SEMCOR WSD") with me.box(style=style_body): - me.text(''.join(X[0])) + with me.box(style=style_card): + for tok in X[0]: + linpop_component( + text=tok, + on_pop=on_pop + ) + state = me.state(State) + for candidate in state.candidates: + with me.box(style=style_card): + me.text(candidate.ent_seq) + with me.box(style=style_group): + for kanji in candidate.k_ele: + me.text(kanji.keb) + with me.box(style=style_group): + for reading in candidate.r_ele: + me.text(reading.reb) + with me.box(): + for sense in candidate.sense: + for gloss in sense.gloss: + me.text(gloss.text) + - with me.box(style=style_footer): - me.text("Footer") +def on_pop(event: mel.WebEvent): + state = me.state(State) + query = event.value['text'] + state.candidates = jmdict.search(query) if __name__ == "__main__": diff --git a/wsd/annotate/components/__init__.py b/wsd/annotate/components/__init__.py new file mode 100644 index 0000000..6ec6c9b --- /dev/null +++ b/wsd/annotate/components/__init__.py @@ -0,0 +1 @@ +from .pop import linpop_component \ No newline at end of file diff --git a/wsd/annotate/components/pop.js b/wsd/annotate/components/pop.js new file mode 100644 index 0000000..a6df0d3 --- /dev/null +++ b/wsd/annotate/components/pop.js @@ -0,0 +1,36 @@ +import { + LitElement, + html, + css +} from 'https://cdn.jsdelivr.net/gh/lit/dist@3/all/lit-all.min.js'; + +class LinPopComponent extends LitElement { + static properties = { + text: { type: String }, + popEvent: { type: String } + }; + static styles = css` + .pop:hover { + background-color:rgb(181, 181, 181); + cursor: pointer; + } + `; + + render() { + return html` + + ${this.text} + + `; + } + + _pop() { + this.dispatchEvent( + new MesopEvent(this.popEvent, { + text: this.text, + }), + ); + } +} + +customElements.define('linpop-component', LinPopComponent); \ No newline at end of file diff --git a/wsd/annotate/components/pop.py b/wsd/annotate/components/pop.py new file mode 100644 index 0000000..511a3e2 --- /dev/null +++ b/wsd/annotate/components/pop.py @@ -0,0 +1,26 @@ +from typing import Any, Callable + +import mesop.labs as mel + + +@mel.web_component(path="./pop.js") +def linpop_component( + *, + text: str, + on_pop: Callable[[mel.WebEvent], Any], + key: str | None = None, +): + return mel.insert_web_component( + name="linpop-component", + key=key, + events={ + "popEvent": on_pop, + }, + properties={ + "text": text, + }, + ) + +__all__ = [ + "linpop_component", +] \ No newline at end of file diff --git a/wsd/models/baseline.py b/wsd/models/baseline.py index 712f1d8..5036a66 100644 --- a/wsd/models/baseline.py +++ b/wsd/models/baseline.py @@ -1,14 +1,19 @@ """A simple dictionary interface for JMDict.""" +import os +from typing import List from wsd.parsers import JMDictParser +from wsd.parsers.jmdict import Entry +data_dir = os.path.join(os.path.dirname(__file__), '../../data') class JMDict: """A simple dictionary interface for JMDict""" def __init__(self): - self.entries = JMDictParser().parse('../data/JMdict_en.gz') + jmdict_file = os.path.join(data_dir, 'JMdict_en.gz') + self.entries = JMDictParser().parse(jmdict_file) - def search(self, text): + def search(self, text: str) -> List[Entry]: """Search for an entry by text. Currently returns all entries that contain the text in either the kanji @@ -18,6 +23,11 @@ def search(self, text): ---------- text : str The text to search for + + Returns + ------- + List[Entry] + A list of entries that contain the query. """ res = [] for entry in self.entries: @@ -29,7 +39,7 @@ def search(self, text): res.append(entry) return res - def feeling_lucky(self, text): + def feeling_lucky(self, text: str) -> Entry: """Return the first entry found. Currently returns the first entry that contains the text in either the @@ -39,6 +49,11 @@ def feeling_lucky(self, text): ---------- text : str The text to search for + + Returns + ------- + Entry + The first entry that contains the query. """ entries = self.search(text) return entries[0] if entries else None diff --git a/wsd/parsers/jmdict.py b/wsd/parsers/jmdict.py index e2df5a6..f7efbc6 100644 --- a/wsd/parsers/jmdict.py +++ b/wsd/parsers/jmdict.py @@ -11,7 +11,7 @@ @dataclass class Kanji: """Kanji element""" - keb: str + keb: str = field(default_factory=str) ke_inf: List[str] = field(default_factory=list) ke_pri: List[str] = field(default_factory=list) @@ -27,7 +27,7 @@ def from_node(cls, node): @dataclass class Reading: """Reading element""" - reb: str + reb: str = field(default_factory=str) re_nokanji: bool = False re_restr: List[str] = field(default_factory=list) re_inf: List[str] = field(default_factory=list) @@ -47,8 +47,8 @@ def from_node(cls, node): @dataclass class Gloss: """A gloss element""" - text: str - lang: str = None + text: str = field(default_factory=str) + lang: str = field(default_factory=str) @classmethod def from_node(cls, node): @@ -69,7 +69,7 @@ class Sense: ant: List[str] = field(default_factory=list) field_: List[str] = field(default_factory=list) misc: List[str] = field(default_factory=list) - s_inf: str = None + s_inf: str = field(default_factory=str) lsource: List[str] = field(default_factory=list) dial: List[str] = field(default_factory=list) gloss: List[Gloss] = field(default_factory=list) @@ -95,7 +95,7 @@ def from_node(cls, node): @dataclass class Entry: """A dictionary entry""" - ent_seq: str + ent_seq: str = field(default_factory=str) k_ele: List[Kanji] = field(default_factory=list) r_ele: List[Reading] = field(default_factory=list) sense: List[Sense] = field(default_factory=list)