Skip to content

Commit 638a867

Browse files
authored
Replace html5lib with html5rdf, make it an optional dependency (#2951)
* Revert previous commit that made html support non-optional. html support is now optional again, and it uses html5rdf rather than html5lib/html5lib-modern. * Revert "Auxiliary commit to revert individual files from 18d1d70" This reverts commit 6f80ad9cd2cc4b76ff4e1bc0998951768aff7573. * Add html5lib-modern back into dockerfile dependencies so the docker image can be built with the released rdflib v7.1.0 * Reformat test file again. * lxml is not actually required for operation of html5rdf * Fix differences in comparison of XML and HTML nodes when html5rdf is used vs when it is not used. * Add correct type hint to _XML_COMPARABLE variable. * Fix logic in using non-ill-typed XML literals for comparison
1 parent 4afa455 commit 638a867

10 files changed

+90
-49
lines changed

devtools/constraints.min

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,6 @@ pyparsing==2.1.0
66
importlib-metadata==4.0.0
77
berkeleydb==18.1.2
88
networkx==2.0
9-
html5lib-modern==1.2.0
9+
html5rdf==1.2.0
1010
lxml==4.3.0
1111
orjson==3.9.14

docker/latest/requirements.in

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# This file is used for building a docker image of the latest rdflib release. It
22
# will be updated by dependabot when new releases are made.
33
rdflib==7.1.0
4+
html5rdf==1.2.0
5+
# html5lib-modern is required to allow the Dockerfile to build with pre-RDFLib-7.1.1 releases.
46
html5lib-modern==1.2.0
5-
# isodate is required to allow the Dockerfile to build on with pre-RDFLib-7.1 releases.
6-
isodate==0.7.2

docker/latest/requirements.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
#
55
# pip-compile docker/latest/requirements.in
66
#
7-
html5lib-modern==1.2
7+
html5rdf==1.2
88
# via
99
# -r docker/latest/requirements.in
1010
# rdflib
11-
isodate==0.7.2
11+
html5lib-modern==1.2
1212
# via -r docker/latest/requirements.in
1313
pyparsing==3.0.9
1414
# via rdflib

poetry.lock

+6-6
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

+4-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ isodate = {version=">=0.7.2,<1.0.0", python = "<3.11"}
4343
pyparsing = ">=2.1.0,<4"
4444
berkeleydb = {version = "^18.1.0", optional = true}
4545
networkx = {version = ">=2,<4", optional = true}
46-
html5lib-modern = "^1.2"
46+
html5rdf = {version = ">=1.2,<2", optional = true}
4747
lxml = {version = ">=4.3,<6.0", optional = true}
4848
orjson = {version = ">=3.9.14,<4", optional = true}
4949

@@ -74,6 +74,9 @@ ruff = ">=0.0.286,<0.8.0"
7474
[tool.poetry.extras]
7575
berkeleydb = ["berkeleydb"]
7676
networkx = ["networkx"]
77+
# html support is optional, it is used only in tokenizing `rdf:HTML` type Literals
78+
html = ["html5rdf"]
79+
# lxml support is optional, it is used only for parsing XML-formatted SPARQL results
7780
lxml = ["lxml"]
7881
orjson = ["orjson"]
7982

rdflib/term.py

+53-20
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,6 @@
6565
from urllib.parse import urldefrag, urljoin, urlparse
6666
from uuid import uuid4
6767

68-
import html5lib
69-
7068
import rdflib
7169
import rdflib.util
7270
from rdflib.compat import long_type
@@ -86,6 +84,14 @@
8684
from .namespace import NamespaceManager
8785
from .paths import AlternativePath, InvPath, NegatedPath, Path, SequencePath
8886

87+
_HAS_HTML5RDF = False
88+
89+
try:
90+
import html5rdf
91+
92+
_HAS_HTML5RDF = True
93+
except ImportError:
94+
html5rdf = None
8995

9096
_SKOLEM_DEFAULT_AUTHORITY = "https://rdflib.github.io"
9197

@@ -1107,7 +1113,7 @@ def __gt__(self, other: Any) -> bool:
11071113
if other is None:
11081114
return True # Everything is greater than None
11091115
if isinstance(other, Literal):
1110-
# Fast path for comapring numeric literals
1116+
# Fast path for comparing numeric literals
11111117
# that are not ill-typed and don't have a None value
11121118
if (
11131119
(
@@ -1350,9 +1356,15 @@ def eq(self, other: Any) -> bool:
13501356
13511357
"""
13521358
if isinstance(other, Literal):
1359+
# Fast path for comparing numeric literals
1360+
# that are not ill-typed and don't have a None value
13531361
if (
1354-
self.datatype in _NUMERIC_LITERAL_TYPES
1355-
and other.datatype in _NUMERIC_LITERAL_TYPES
1362+
(
1363+
self.datatype in _NUMERIC_LITERAL_TYPES
1364+
and other.datatype in _NUMERIC_LITERAL_TYPES
1365+
)
1366+
and ((not self.ill_typed) and (not other.ill_typed))
1367+
and (self.value is not None and other.value is not None)
13561368
):
13571369
if self.value is not None and other.value is not None:
13581370
return self.value == other.value
@@ -1374,6 +1386,16 @@ def eq(self, other: Any) -> bool:
13741386
# string/plain literals, compare on lexical form
13751387
return str.__eq__(self, other)
13761388

1389+
# XML can be compared to HTML, only if html5rdf is enabled
1390+
if (
1391+
(dtself in _XML_COMPARABLE and dtother in _XML_COMPARABLE)
1392+
and
1393+
# Ill-typed can be None if unknown, but we don't want it to be True.
1394+
((self.ill_typed is not True) and (other.ill_typed is not True))
1395+
and (self.value is not None and other.value is not None)
1396+
):
1397+
return _isEqualXMLNode(self.value, other.value)
1398+
13771399
if dtself != dtother:
13781400
if rdflib.DAWG_LITERAL_COLLATION:
13791401
raise TypeError(
@@ -1387,9 +1409,6 @@ def eq(self, other: Any) -> bool:
13871409
# maybe there are counter examples
13881410

13891411
if self.value is not None and other.value is not None:
1390-
if self.datatype in (_RDF_XMLLITERAL, _RDF_HTMLLITERAL):
1391-
return _isEqualXMLNode(self.value, other.value)
1392-
13931412
return self.value == other.value
13941413
else:
13951414
if str.__eq__(self, other):
@@ -1668,19 +1687,19 @@ def _parseXML(xmlstring: str) -> xml.dom.minidom.Document: # noqa: N802
16681687
def _parse_html(lexical_form: str) -> xml.dom.minidom.DocumentFragment:
16691688
"""
16701689
Parse the lexical form of an HTML literal into a document fragment
1671-
using the ``dom`` from html5lib tree builder.
1690+
using the ``dom`` from html5rdf tree builder.
16721691
16731692
:param lexical_form: The lexical form of the HTML literal.
16741693
:return: A document fragment representing the HTML literal.
1675-
:raises: `html5lib.html5parser.ParseError` if the lexical form is
1694+
:raises: `html5rdf.html5parser.ParseError` if the lexical form is
16761695
not valid HTML.
16771696
"""
1678-
parser = html5lib.HTMLParser(
1679-
tree=html5lib.treebuilders.getTreeBuilder("dom"), strict=True
1697+
parser = html5rdf.HTMLParser(
1698+
tree=html5rdf.treebuilders.getTreeBuilder("dom"), strict=True
16801699
)
16811700
try:
16821701
result: xml.dom.minidom.DocumentFragment = parser.parseFragment(lexical_form)
1683-
except html5lib.html5parser.ParseError as e:
1702+
except html5rdf.html5parser.ParseError as e:
16841703
logger.info(f"Failed to parse HTML: {e}")
16851704
raise e
16861705
result.normalize()
@@ -1695,7 +1714,7 @@ def _write_html(value: xml.dom.minidom.DocumentFragment) -> bytes:
16951714
:param value: A document fragment representing an HTML literal.
16961715
:return: The lexical form of the HTML literal.
16971716
"""
1698-
result = html5lib.serialize(value, tree="dom")
1717+
result = html5rdf.serialize(value, tree="dom")
16991718
return result
17001719

17011720

@@ -2012,14 +2031,21 @@ def _castPythonToLiteral( # noqa: N802
20122031
(Duration, (lambda i: duration_isoformat(i), _XSD_DURATION)),
20132032
(timedelta, (lambda i: duration_isoformat(i), _XSD_DAYTIMEDURATION)),
20142033
(xml.dom.minidom.Document, (_writeXML, _RDF_XMLLITERAL)),
2015-
# This is a bit dirty, by accident the html5lib parser produces
2016-
# DocumentFragments, and the xml parser Documents, letting this
2017-
# decide what datatype to use makes roundtripping easier, but it a
2018-
# bit random.
2019-
(xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL)),
20202034
(Fraction, (None, _OWL_RATIONAL)),
20212035
]
20222036

2037+
if html5rdf is not None:
2038+
# This is a bit dirty, by accident the html5rdf parser produces
2039+
# DocumentFragments, and the xml parser Documents, letting this
2040+
# decide what datatype to use makes roundtripping easier, but it's a
2041+
# bit random.
2042+
2043+
# This must happen before _GenericPythonToXSDRules is assigned to
2044+
# _OriginalGenericPythonToXSDRules.
2045+
_GenericPythonToXSDRules.append(
2046+
(xml.dom.minidom.DocumentFragment, (_write_html, _RDF_HTMLLITERAL))
2047+
)
2048+
20232049
_OriginalGenericPythonToXSDRules = list(_GenericPythonToXSDRules)
20242050

20252051
_SpecificPythonToXSDRules: List[
@@ -2069,10 +2095,17 @@ def _castPythonToLiteral( # noqa: N802
20692095
URIRef(_XSD_PFX + "double"): float,
20702096
URIRef(_XSD_PFX + "base64Binary"): b64decode,
20712097
URIRef(_XSD_PFX + "anyURI"): None,
2072-
_RDF_HTMLLITERAL: _parse_html,
20732098
_RDF_XMLLITERAL: _parseXML,
20742099
}
20752100

2101+
if html5rdf is not None:
2102+
# It is probably best to keep this close to the definition of
2103+
# _GenericPythonToXSDRules so nobody misses it.
2104+
XSDToPython[_RDF_HTMLLITERAL] = _parse_html
2105+
_XML_COMPARABLE: Tuple[URIRef, ...] = (_RDF_XMLLITERAL, _RDF_HTMLLITERAL)
2106+
else:
2107+
_XML_COMPARABLE = (_RDF_XMLLITERAL,)
2108+
20762109
_check_well_formed_types: Dict[URIRef, Callable[[Union[str, bytes], Any], bool]] = {
20772110
URIRef(_XSD_PFX + "boolean"): _well_formed_boolean,
20782111
URIRef(_XSD_PFX + "nonPositiveInteger"): _well_formed_non_positive_integer,

test/test_literal/test_literal.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,11 @@
2323

2424

2525
try:
26-
import html5lib as _ # noqa: F401
26+
import html5rdf as _ # noqa: F401
2727

28-
_HAVE_HTML5LIB = True
28+
_HAVE_HTML5RDF = True
2929
except ImportError:
30-
_HAVE_HTML5LIB = False
30+
_HAVE_HTML5RDF = False
3131

3232
import pytest
3333

@@ -981,7 +981,7 @@ def __eq__(self, __value: object) -> bool:
981981
(
982982
lambda: Literal("<body>", datatype=RDF.HTML),
983983
LiteralChecker(
984-
..., None, RDF.HTML, True if _HAVE_HTML5LIB else None, "<body>"
984+
..., None, RDF.HTML, True if _HAVE_HTML5RDF else None, "<body>"
985985
),
986986
),
987987
(
@@ -990,7 +990,7 @@ def __eq__(self, __value: object) -> bool:
990990
...,
991991
None,
992992
RDF.HTML,
993-
False if _HAVE_HTML5LIB else None,
993+
False if _HAVE_HTML5RDF else None,
994994
"<table></table>",
995995
),
996996
),

test/test_literal/test_literal_html5lib.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import xml.dom.minidom
22
from typing import Callable
33

4-
import html5lib # noqa: F401
54
import pytest
65

76
import rdflib.term
@@ -10,8 +9,14 @@
109
from test.utils.literal import LiteralChecker
1110
from test.utils.outcome import OutcomeChecker, OutcomePrimitives
1211

12+
try:
13+
import html5rdf as _ # noqa: F401
14+
except ImportError:
15+
pytest.skip("html5rdf not installed", allow_module_level=True)
1316

14-
def test_has_html5lib() -> None:
17+
18+
def test_has_html5rdf() -> None:
19+
assert rdflib.term._HAS_HTML5RDF is True
1520
assert RDF.HTML in rdflib.term.XSDToPython
1621
rule = next(
1722
(
@@ -29,7 +34,7 @@ def test_has_html5lib() -> None:
2934
["factory", "outcome"],
3035
[
3136
# Ill-typed literals, these have lexical forms that result in
32-
# errors when parsed as HTML by html5lib.
37+
# errors when parsed as HTML by html5rdf.
3338
(
3439
lambda: Literal("<body><h1>Hello, World!</h1></body>", datatype=RDF.HTML),
3540
LiteralChecker(
@@ -47,7 +52,7 @@ def test_has_html5lib() -> None:
4752
),
4853
),
4954
# Well-typed literals, these have lexical forms that parse
50-
# without errors with html5lib.
55+
# without errors with html5rdf.
5156
(
5257
lambda: Literal("<table></table>", datatype=RDF.HTML),
5358
LiteralChecker(..., None, RDF.HTML, False, "<table></table>"),

test/test_literal/test_xmlliterals.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@
99
from rdflib import RDF, Literal
1010

1111
try:
12-
import html5lib # noqa: F401
12+
import html5rdf # noqa: F401
1313

14-
have_html5lib = True
14+
have_html5rdf = True
1515
except ImportError:
16-
have_html5lib = False
16+
have_html5rdf = False
1717

1818

1919
def testPythonRoundtrip(): # noqa: N802
@@ -90,7 +90,7 @@ def testRoundtrip(): # noqa: N802
9090
roundtrip("nt")
9191

9292

93-
@pytest.mark.skipif(not have_html5lib, reason="requires html5lib")
93+
@pytest.mark.skipif(not have_html5rdf, reason="requires html5rdf")
9494
def testHTML(): # noqa: N802
9595
l1 = Literal("<msg>hello</msg>", datatype=RDF.XMLLiteral)
9696
assert l1.value is not None, "xml must have been parsed"
@@ -126,7 +126,7 @@ def testHTML(): # noqa: N802
126126
textwrap.dedent(
127127
"""\
128128
<!DOCTYPE example>
129-
<something/>
129+
<something2/>
130130
"""
131131
)
132132
),
@@ -137,7 +137,7 @@ def testHTML(): # noqa: N802
137137
textwrap.dedent(
138138
"""\
139139
<!DOCTYPE example>
140-
<something />
140+
<something2 />
141141
"""
142142
)
143143
),

tox.ini

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ setenv =
1515
COVERAGE_FILE = {env:COVERAGE_FILE:{toxinidir}/.coverage.{envname}}
1616
MYPY_CACHE_DIR = {envdir}/.mypy_cache
1717
docs: POETRY_ARGS_docs = --only=docs
18-
extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=orjson
18+
extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=html --extras=orjson
1919
lxml: POETRY_ARGS_lxml = --extras=lxml
2020
commands_pre =
2121
py3{8,9,10,11}: python -c 'import os; print("\n".join(f"{key}={value}" for key, value in os.environ.items()))'
@@ -59,7 +59,7 @@ setenv =
5959
PYTHONHASHSEED = 0
6060
commands_pre =
6161
poetry lock --check
62-
poetry install --only=main --only=docs
62+
poetry install --only=main --only=docs --extras=html
6363
poetry env info
6464
commands =
6565
poetry run sphinx-build -T -W -b html -d {envdir}/doctree docs docs/_build/html

0 commit comments

Comments
 (0)