65
65
from urllib .parse import urldefrag , urljoin , urlparse
66
66
from uuid import uuid4
67
67
68
- import html5lib
69
-
70
68
import rdflib
71
69
import rdflib .util
72
70
from rdflib .compat import long_type
86
84
from .namespace import NamespaceManager
87
85
from .paths import AlternativePath , InvPath , NegatedPath , Path , SequencePath
88
86
87
+ _HAS_HTML5RDF = False
88
+
89
+ try :
90
+ import html5rdf
91
+
92
+ _HAS_HTML5RDF = True
93
+ except ImportError :
94
+ html5rdf = None
89
95
90
96
_SKOLEM_DEFAULT_AUTHORITY = "https://rdflib.github.io"
91
97
@@ -1107,7 +1113,7 @@ def __gt__(self, other: Any) -> bool:
1107
1113
if other is None :
1108
1114
return True # Everything is greater than None
1109
1115
if isinstance (other , Literal ):
1110
- # Fast path for comapring numeric literals
1116
+ # Fast path for comparing numeric literals
1111
1117
# that are not ill-typed and don't have a None value
1112
1118
if (
1113
1119
(
@@ -1350,9 +1356,15 @@ def eq(self, other: Any) -> bool:
1350
1356
1351
1357
"""
1352
1358
if isinstance (other , Literal ):
1359
+ # Fast path for comparing numeric literals
1360
+ # that are not ill-typed and don't have a None value
1353
1361
if (
1354
- self .datatype in _NUMERIC_LITERAL_TYPES
1355
- and other .datatype in _NUMERIC_LITERAL_TYPES
1362
+ (
1363
+ self .datatype in _NUMERIC_LITERAL_TYPES
1364
+ and other .datatype in _NUMERIC_LITERAL_TYPES
1365
+ )
1366
+ and ((not self .ill_typed ) and (not other .ill_typed ))
1367
+ and (self .value is not None and other .value is not None )
1356
1368
):
1357
1369
if self .value is not None and other .value is not None :
1358
1370
return self .value == other .value
@@ -1374,6 +1386,16 @@ def eq(self, other: Any) -> bool:
1374
1386
# string/plain literals, compare on lexical form
1375
1387
return str .__eq__ (self , other )
1376
1388
1389
+ # XML can be compared to HTML, only if html5rdf is enabled
1390
+ if (
1391
+ (dtself in _XML_COMPARABLE and dtother in _XML_COMPARABLE )
1392
+ and
1393
+ # Ill-typed can be None if unknown, but we don't want it to be True.
1394
+ ((self .ill_typed is not True ) and (other .ill_typed is not True ))
1395
+ and (self .value is not None and other .value is not None )
1396
+ ):
1397
+ return _isEqualXMLNode (self .value , other .value )
1398
+
1377
1399
if dtself != dtother :
1378
1400
if rdflib .DAWG_LITERAL_COLLATION :
1379
1401
raise TypeError (
@@ -1387,9 +1409,6 @@ def eq(self, other: Any) -> bool:
1387
1409
# maybe there are counter examples
1388
1410
1389
1411
if self .value is not None and other .value is not None :
1390
- if self .datatype in (_RDF_XMLLITERAL , _RDF_HTMLLITERAL ):
1391
- return _isEqualXMLNode (self .value , other .value )
1392
-
1393
1412
return self .value == other .value
1394
1413
else :
1395
1414
if str .__eq__ (self , other ):
@@ -1668,19 +1687,19 @@ def _parseXML(xmlstring: str) -> xml.dom.minidom.Document: # noqa: N802
1668
1687
def _parse_html (lexical_form : str ) -> xml .dom .minidom .DocumentFragment :
1669
1688
"""
1670
1689
Parse the lexical form of an HTML literal into a document fragment
1671
- using the ``dom`` from html5lib tree builder.
1690
+ using the ``dom`` from html5rdf tree builder.
1672
1691
1673
1692
:param lexical_form: The lexical form of the HTML literal.
1674
1693
:return: A document fragment representing the HTML literal.
1675
- :raises: `html5lib .html5parser.ParseError` if the lexical form is
1694
+ :raises: `html5rdf.html5parser.ParseError` if the lexical form is
1676
1695
not valid HTML.
1677
1696
"""
1678
- parser = html5lib .HTMLParser (
1679
- tree = html5lib .treebuilders .getTreeBuilder ("dom" ), strict = True
1697
+ parser = html5rdf .HTMLParser (
1698
+ tree = html5rdf .treebuilders .getTreeBuilder ("dom" ), strict = True
1680
1699
)
1681
1700
try :
1682
1701
result : xml .dom .minidom .DocumentFragment = parser .parseFragment (lexical_form )
1683
- except html5lib .html5parser .ParseError as e :
1702
+ except html5rdf .html5parser .ParseError as e :
1684
1703
logger .info (f"Failed to parse HTML: { e } " )
1685
1704
raise e
1686
1705
result .normalize ()
@@ -1695,7 +1714,7 @@ def _write_html(value: xml.dom.minidom.DocumentFragment) -> bytes:
1695
1714
:param value: A document fragment representing an HTML literal.
1696
1715
:return: The lexical form of the HTML literal.
1697
1716
"""
1698
- result = html5lib .serialize (value , tree = "dom" )
1717
+ result = html5rdf .serialize (value , tree = "dom" )
1699
1718
return result
1700
1719
1701
1720
@@ -2012,14 +2031,21 @@ def _castPythonToLiteral( # noqa: N802
2012
2031
(Duration , (lambda i : duration_isoformat (i ), _XSD_DURATION )),
2013
2032
(timedelta , (lambda i : duration_isoformat (i ), _XSD_DAYTIMEDURATION )),
2014
2033
(xml .dom .minidom .Document , (_writeXML , _RDF_XMLLITERAL )),
2015
- # This is a bit dirty, by accident the html5lib parser produces
2016
- # DocumentFragments, and the xml parser Documents, letting this
2017
- # decide what datatype to use makes roundtripping easier, but it a
2018
- # bit random.
2019
- (xml .dom .minidom .DocumentFragment , (_write_html , _RDF_HTMLLITERAL )),
2020
2034
(Fraction , (None , _OWL_RATIONAL )),
2021
2035
]
2022
2036
2037
+ if html5rdf is not None :
2038
+ # This is a bit dirty, by accident the html5rdf parser produces
2039
+ # DocumentFragments, and the xml parser Documents, letting this
2040
+ # decide what datatype to use makes roundtripping easier, but it's a
2041
+ # bit random.
2042
+
2043
+ # This must happen before _GenericPythonToXSDRules is assigned to
2044
+ # _OriginalGenericPythonToXSDRules.
2045
+ _GenericPythonToXSDRules .append (
2046
+ (xml .dom .minidom .DocumentFragment , (_write_html , _RDF_HTMLLITERAL ))
2047
+ )
2048
+
2023
2049
_OriginalGenericPythonToXSDRules = list (_GenericPythonToXSDRules )
2024
2050
2025
2051
_SpecificPythonToXSDRules : List [
@@ -2069,10 +2095,17 @@ def _castPythonToLiteral( # noqa: N802
2069
2095
URIRef (_XSD_PFX + "double" ): float ,
2070
2096
URIRef (_XSD_PFX + "base64Binary" ): b64decode ,
2071
2097
URIRef (_XSD_PFX + "anyURI" ): None ,
2072
- _RDF_HTMLLITERAL : _parse_html ,
2073
2098
_RDF_XMLLITERAL : _parseXML ,
2074
2099
}
2075
2100
2101
+ if html5rdf is not None :
2102
+ # It is probably best to keep this close to the definition of
2103
+ # _GenericPythonToXSDRules so nobody misses it.
2104
+ XSDToPython [_RDF_HTMLLITERAL ] = _parse_html
2105
+ _XML_COMPARABLE : Tuple [URIRef , ...] = (_RDF_XMLLITERAL , _RDF_HTMLLITERAL )
2106
+ else :
2107
+ _XML_COMPARABLE = (_RDF_XMLLITERAL ,)
2108
+
2076
2109
_check_well_formed_types : Dict [URIRef , Callable [[Union [str , bytes ], Any ], bool ]] = {
2077
2110
URIRef (_XSD_PFX + "boolean" ): _well_formed_boolean ,
2078
2111
URIRef (_XSD_PFX + "nonPositiveInteger" ): _well_formed_non_positive_integer ,
0 commit comments