From 5663b7f0824611ac534bdf66fc5e86b687494442 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 13 Feb 2025 21:27:04 +0100 Subject: [PATCH 01/12] Support for Croissant RecordSet for resources in Datastore For resources that are on the DataStore, we can use `datastore_info` to get a list of the fields present in the data, and expose those as a `RecordSet`. Data types are also included (although these would be all `text` for standard xloader imports) as well as primary keys. --- ckanext/dcat/profiles/croissant.py | 74 +++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py index 24c757d2..94048ca6 100644 --- a/ckanext/dcat/profiles/croissant.py +++ b/ckanext/dcat/profiles/croissant.py @@ -9,7 +9,7 @@ from dateutil.parser import parse as parse_date from rdflib import URIRef, BNode, Literal from rdflib.namespace import Namespace -from ckantoolkit import url_for, config +from ckantoolkit import url_for, config, asbool, get_action from ckanext.dcat.utils import resource_uri from .base import RDFProfile, CleanedURIRef @@ -328,6 +328,7 @@ def _resources_graph(self, dataset_ref, dataset_dict): self._resource_graph(dataset_ref, resource_ref, resource_dict) + def _resource_graph(self, dataset_ref, resource_ref, resource_dict): # Basic fields self._resource_basic_fields_graph(resource_ref, resource_dict) @@ -347,6 +348,9 @@ def _resource_graph(self, dataset_ref, resource_ref, resource_dict): # Subresources self._resource_subresources_graph(dataset_ref, resource_ref, resource_dict) + # RecordSet + self._recordset_graph(dataset_ref, resource_ref, resource_dict) + def _resource_basic_fields_graph(self, resource_ref, resource_dict): items = [ ("name", SCHEMA.name, None, Literal), @@ -387,7 +391,7 @@ def _resource_url_graph(self, resource_ref, resource_dict): def _resource_numbers_graph(self, resource_ref, resource_dict): if resource_dict.get("size"): - 
self.g.add((resource_ref, SCHEMA.contentSize, Literal(resource_dict["size"]))) + self.g.add((resource_ref, SCHEMA.contentSize, Literal(str(resource_dict["size"])))) def _resource_subresources_graph(self, dataset_ref, resource_ref, resource_dict): subresource_dicts = self._get_resource_value(resource_dict, "subresources") @@ -433,3 +437,69 @@ def _resource_subresources_graph(self, dataset_ref, resource_ref, resource_dict) self._add_list_triples_from_dict(subresource_dict, subresource_ref, items) self.g.add((subresource_ref, CR.containedIn, resource_ref)) + + def _recordset_graph(self, dataset_ref, resource_ref, resource_dict): + + # Skip if data not in the DataStore + if not resource_dict.get("id") or not asbool(resource_dict.get("datastore_active")): + return + + # Get fields info + try: + datastore_info = get_action("datastore_info")( + {"ignore_auth": True}, + {"id": resource_dict["id"]} + ) + except KeyError: + # DataStore not enabled + return + + if not datastore_info or not datastore_info.get("fields"): + return + + recordset_ref = URIRef(f"{resource_dict['id']}/records") + + self.g.add((recordset_ref, RDF.type, CR.RecordSet)) + +# self.g.add((recordset_ref, RDF.type, SCHEMA.Text)) + + self.g.add((recordset_ref, SCHEMA.name, Literal(recordset_ref))) + + FIELD_TYPES = { + "text": SCHEMA.Text, + "int": SCHEMA.Integer, + "float": SCHEMA.Float, + "numeric": SCHEMA.Float, + "timestamp": SCHEMA.DateTime, + } + + unique_fields = [] + for field in datastore_info["fields"]: + + + field_ref = URIRef(f"{resource_dict['id']}/records/{field['id']}") + + self.g.add((recordset_ref, SCHEMA.field, field_ref)) + self.g.add((field_ref, RDF.type, CR.Field)) + + self.g.add((field_ref, CR.dataType, FIELD_TYPES.get(field["type"]))) + + source_ref = BNode() + + self.g.add((field_ref, SCHEMA.source, source_ref)) + self.g.add((source_ref, SCHEMA.fileObject, resource_ref)) + + extract_ref = BNode() + + self.g.add((source_ref, SCHEMA.extract, extract_ref)) + self.g.add((extract_ref, 
SCHEMA.column, Literal(field['id']))) + + if field["schema"]["is_index"]: + unique_fields.append(field_ref) + + if unique_fields: + for unique_field_ref in unique_fields: + self.g.add((recordset_ref, SCHEMA.key, unique_field_ref)) + + self.g.add((dataset_ref, CR.recordSet, recordset_ref)) + From d0cffbd7012678c534581ce16074b4c8fee7c44c Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 18 Feb 2025 10:21:57 +0100 Subject: [PATCH 02/12] Better is_live_dataset check --- ckanext/dcat/profiles/croissant.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py index 7cc65c24..1ab98472 100644 --- a/ckanext/dcat/profiles/croissant.py +++ b/ckanext/dcat/profiles/croissant.py @@ -156,8 +156,11 @@ def _basic_fields_graph(self, dataset_ref, dataset_dict): dataset_url = url_for("dataset.read", id=dataset_dict["name"], _external=True) # required self.g.add((dataset_ref, SCHEMA.url, Literal(dataset_url))) - if 'is_live_dataset' in dataset_dict: - is_live_dataset = asbool(dataset_dict["is_live_dataset"]) + if "is_live_dataset" in dataset_dict: + try: + is_live_dataset = asbool(dataset_dict["is_live_dataset"]) + except ValueError: + is_live_dataset = None else: is_live_dataset = None From 8bac82f0b4749c31a243abb271fc285efae6e2d3 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 19 Feb 2025 14:30:04 +0100 Subject: [PATCH 03/12] Add serialize test for recordset logic --- ckanext/dcat/profiles/croissant.py | 22 +++--- .../profiles/croissant/test_serialize.py | 76 ++++++++++++++++++- 2 files changed, 86 insertions(+), 12 deletions(-) diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py index 1ab98472..7c053984 100644 --- a/ckanext/dcat/profiles/croissant.py +++ b/ckanext/dcat/profiles/croissant.py @@ -68,6 +68,15 @@ "transform": "cr:transform" } +CROISSANT_FIELD_TYPES = { + "text": SCHEMA.Text, + "int": SCHEMA.Integer, + "float": SCHEMA.Float, + "numeric": 
SCHEMA.Float, + "timestamp": SCHEMA.DateTime, +} + + class CroissantProfile(RDFProfile): """ An RDF profile based on the schema.org Dataset, modified by Croissant. @@ -442,28 +451,19 @@ def _recordset_graph(self, dataset_ref, resource_ref, resource_dict): self.g.add((recordset_ref, RDF.type, CR.RecordSet)) -# self.g.add((recordset_ref, RDF.type, SCHEMA.Text)) + # self.g.add((recordset_ref, RDF.type, SCHEMA.Text)) self.g.add((recordset_ref, SCHEMA.name, Literal(recordset_ref))) - FIELD_TYPES = { - "text": SCHEMA.Text, - "int": SCHEMA.Integer, - "float": SCHEMA.Float, - "numeric": SCHEMA.Float, - "timestamp": SCHEMA.DateTime, - } - unique_fields = [] for field in datastore_info["fields"]: - field_ref = URIRef(f"{resource_dict['id']}/records/{field['id']}") self.g.add((recordset_ref, SCHEMA.field, field_ref)) self.g.add((field_ref, RDF.type, CR.Field)) - self.g.add((field_ref, CR.dataType, FIELD_TYPES.get(field["type"]))) + self.g.add((field_ref, CR.dataType, CROISSANT_FIELD_TYPES.get(field["type"]))) source_ref = BNode() diff --git a/ckanext/dcat/tests/profiles/croissant/test_serialize.py b/ckanext/dcat/tests/profiles/croissant/test_serialize.py index f678aa03..5c5b5780 100644 --- a/ckanext/dcat/tests/profiles/croissant/test_serialize.py +++ b/ckanext/dcat/tests/profiles/croissant/test_serialize.py @@ -1,5 +1,7 @@ from builtins import str import json +from unittest import mock +import uuid import pytest @@ -14,7 +16,7 @@ from ckanext.dcat import utils from ckanext.dcat.profiles import XSD, DCT, FOAF from ckanext.dcat.processors import RDFSerializer -from ckanext.dcat.profiles.croissant import SCHEMA, CR +from ckanext.dcat.profiles.croissant import SCHEMA, CR, CROISSANT_FIELD_TYPES from ckanext.dcat.tests.profiles.dcat_ap.test_euro_dcatap_profile_serialize import ( BaseSerializeTest, @@ -259,3 +261,75 @@ def test_graph_from_dataset(self): CR.excludes, sub_resource_file_set_dict["excludes"], ) + + def test_graph_from_dataset_with_recordset(self): + + dataset_id = 
str(uuid.uuid4()) + resource_id = str(uuid.uuid4()) + + dataset_dict = { + "id": dataset_id, + "name": "test-dataset", + "title": "Test Dataset", + "notes": "Test description", + "resources": [ + { + "id": resource_id, + "url": "http://example.com/data.csv", + "format": "CSV", + "datastore_active": True, + } + ], + } + fields_datastore = [ + {"id": "name", "type": "text", "schema": {"is_index": True}}, + {"id": "age", "type": "int", "schema": {"is_index": False}}, + {"id": "timestamp", "type": "timestamp", "schema": {"is_index": False}}, + ] + + def mock_datastore_info(context, data_dict): + return { + "meta": {"id": resource_id, "count": 10, "table_type": "BASE TABLE"}, + "fields": fields_datastore, + } + + with mock.patch( + "ckanext.dcat.profiles.croissant.get_action" + ) as mock_get_action: + mock_get_action.return_value = mock_datastore_info + + s = RDFSerializer(profiles=["croissant"]) + g = s.g + + dataset_ref = s.graph_from_dataset(dataset_dict) + resource_ref = list(g.objects(dataset_ref, SCHEMA.distribution))[0] + + recordset_ref = URIRef(f"{resource_id}/records") + assert self._triple(g, dataset_ref, CR.recordSet, recordset_ref) + assert self._triple(g, recordset_ref, RDF.type, CR.RecordSet) + assert self._triple(g, recordset_ref, SCHEMA.name, str(recordset_ref)) + + # Test fields + fields = list(g.objects(recordset_ref, SCHEMA.field)) + assert len(fields) == 3 + + for field_datastore in fields_datastore: + field_ref = URIRef(f"{resource_id}/records/{field_datastore['id']}") + + assert self._triple(g, recordset_ref, SCHEMA.field, field_ref) + + assert self._triple( + g, field_ref, CR.dataType, CROISSANT_FIELD_TYPES.get(field_datastore["type"]) + ) + + source_ref = list(g.objects(field_ref, SCHEMA.source))[0] + + assert self._triple(g, source_ref, SCHEMA.fileObject, resource_ref) + + extract_ref = list(g.objects(source_ref, SCHEMA.extract))[0] + + assert self._triple(g, extract_ref, SCHEMA.column, field_datastore["id"]) + + assert self._triple( + g, 
recordset_ref, SCHEMA.key, URIRef(f"{resource_id}/records/name") + ) From d0c9bfed0208214ac77d411374557d3413c13cb1 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 19 Feb 2025 15:36:06 +0100 Subject: [PATCH 04/12] Add validation test for recordset Currently failing because SCHEMA.DateTime is not an expected type?? --- .../tests/profiles/croissant/test_validate.py | 43 ++++++++++++++++++- .../ckan/ckan_full_dataset_croissant.json | 1 + 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/ckanext/dcat/tests/profiles/croissant/test_validate.py b/ckanext/dcat/tests/profiles/croissant/test_validate.py index e1df0a2b..e75709f3 100644 --- a/ckanext/dcat/tests/profiles/croissant/test_validate.py +++ b/ckanext/dcat/tests/profiles/croissant/test_validate.py @@ -1,5 +1,6 @@ import json import sys +from unittest import mock try: import mlcroissant as mlc @@ -15,7 +16,7 @@ @pytest.mark.skipif( - sys.version_info < (3, 10), reason="croissant is not available in py<3.10" + sys.version_info < (3, 10), reason="mlcroissant is not available in py<3.10" ) def test_valid_output(): @@ -37,3 +38,43 @@ def test_valid_output(): mlc.Dataset(croissant_dict) except mlc.ValidationError as exception: raise + + +@pytest.mark.skipif( + sys.version_info < (3, 10), reason="mlcroissant is not available in py<3.10" +) +def test_valid_output_with_recordset(): + + dataset_dict = json.loads( + get_file_contents("ckan/ckan_full_dataset_croissant.json") + ) + + resource_id = dataset_dict["resources"][0]["id"] + + def mock_datastore_info(context, data_dict): + return { + "meta": {"id": resource_id, "count": 10, "table_type": "BASE TABLE"}, + "fields": [ + {"id": "name", "type": "text", "schema": {"is_index": True}}, + {"id": "age", "type": "int", "schema": {"is_index": False}}, + {"id": "timestamp", "type": "timestamp", "schema": {"is_index": False}}, + ], + } + + with mock.patch("ckanext.dcat.profiles.croissant.get_action") as mock_get_action: + mock_get_action.return_value = 
mock_datastore_info + + s = RDFSerializer(profiles=["croissant"]) + + s.graph_from_dataset(dataset_dict) + + croissant_dict = json.loads( + s.g.serialize(format="json-ld", auto_compact=True, context=JSONLD_CONTEXT) + ) + with open("graph.jsonld", "w") as f: + f.write(json.dumps(croissant_dict)) + + try: + mlc.Dataset(croissant_dict) + except mlc.ValidationError as exception: + raise diff --git a/examples/ckan/ckan_full_dataset_croissant.json b/examples/ckan/ckan_full_dataset_croissant.json index 5a650560..8d867136 100644 --- a/examples/ckan/ckan_full_dataset_croissant.json +++ b/examples/ckan/ckan_full_dataset_croissant.json @@ -60,6 +60,7 @@ "id_given": "my-custom-resource-id", "size": "12323", "hash": "b221d9dbb083a7f33428d7c2a3c3198ae925614d70210e28716ccaa7cd4ddb79", + "datastore_active": true, "subresources": [ { "type": "fileObject", From 5fef19cda9ad0e90876f3c4cd7a91ef757b1457b Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 19 Feb 2025 15:37:58 +0100 Subject: [PATCH 05/12] Test float fields --- ckanext/dcat/tests/profiles/croissant/test_serialize.py | 3 ++- ckanext/dcat/tests/profiles/croissant/test_validate.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ckanext/dcat/tests/profiles/croissant/test_serialize.py b/ckanext/dcat/tests/profiles/croissant/test_serialize.py index 5c5b5780..6958f7ce 100644 --- a/ckanext/dcat/tests/profiles/croissant/test_serialize.py +++ b/ckanext/dcat/tests/profiles/croissant/test_serialize.py @@ -284,6 +284,7 @@ def test_graph_from_dataset_with_recordset(self): fields_datastore = [ {"id": "name", "type": "text", "schema": {"is_index": True}}, {"id": "age", "type": "int", "schema": {"is_index": False}}, + {"id": "temperature", "type": "float", "schema": {"is_index": False}}, {"id": "timestamp", "type": "timestamp", "schema": {"is_index": False}}, ] @@ -311,7 +312,7 @@ def mock_datastore_info(context, data_dict): # Test fields fields = list(g.objects(recordset_ref, SCHEMA.field)) - assert len(fields) == 3 + 
assert len(fields) == 4 for field_datastore in fields_datastore: field_ref = URIRef(f"{resource_id}/records/{field_datastore['id']}") diff --git a/ckanext/dcat/tests/profiles/croissant/test_validate.py b/ckanext/dcat/tests/profiles/croissant/test_validate.py index e75709f3..02efd2ff 100644 --- a/ckanext/dcat/tests/profiles/croissant/test_validate.py +++ b/ckanext/dcat/tests/profiles/croissant/test_validate.py @@ -57,6 +57,7 @@ def mock_datastore_info(context, data_dict): "fields": [ {"id": "name", "type": "text", "schema": {"is_index": True}}, {"id": "age", "type": "int", "schema": {"is_index": False}}, + {"id": "temperature", "type": "float", "schema": {"is_index": False}}, {"id": "timestamp", "type": "timestamp", "schema": {"is_index": False}}, ], } From 392f98f1cd946d99c39f1c19e4d8127ceea89a10 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 19 Feb 2025 16:01:38 +0100 Subject: [PATCH 06/12] Use SCHEMA.Date instead of SCHEMA.DateTime for timestamps It sounds really counterintuitive but the validator doesn't like fields with a dataType of DateTime: ``` [Metadata(Test Croissant dataset) > RecordSet(568b8ac9-8c69-4475-b35e-d7f812a63c32/records) > Field()] The field does not specify a valid http://mlcommons.org/croissant/dataType, neither does any of its predecessor. 
Got: [rdflib.term.URIRef('https://schema.org/DateTime')] ``` Looks like the Date / DateTime situation in schema.org is a bit confusing: https://github.com/schemaorg/schemaorg/issues/1748 --- ckanext/dcat/profiles/croissant.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py index 9c80b147..22fbd5cc 100644 --- a/ckanext/dcat/profiles/croissant.py +++ b/ckanext/dcat/profiles/croissant.py @@ -65,7 +65,7 @@ "int": SCHEMA.Integer, "float": SCHEMA.Float, "numeric": SCHEMA.Float, - "timestamp": SCHEMA.DateTime, + "timestamp": SCHEMA.Date, } From 0291008d80cee1d2ffddb5d863f3b4621c020afd Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 20 Feb 2025 12:14:33 +0100 Subject: [PATCH 07/12] Don't fail if field type unknown, handle extra field types --- ckanext/dcat/profiles/croissant.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py index 22fbd5cc..3a18d64d 100644 --- a/ckanext/dcat/profiles/croissant.py +++ b/ckanext/dcat/profiles/croissant.py @@ -63,9 +63,14 @@ CROISSANT_FIELD_TYPES = { "text": SCHEMA.Text, "int": SCHEMA.Integer, + "int4": SCHEMA.Integer, + "int8": SCHEMA.Integer, "float": SCHEMA.Float, + "float4": SCHEMA.Float, + "float8": SCHEMA.Float, "numeric": SCHEMA.Float, - "timestamp": SCHEMA.Date, + "double precision": SCHEMA.Float, + "timestamp": SCHEMA.DateTime, } @@ -527,14 +532,15 @@ def _recordset_graph(self, dataset_ref, resource_ref, resource_dict): self.g.add((recordset_ref, SCHEMA.name, Literal(recordset_ref))) unique_fields = [] + for field in datastore_info["fields"]: field_ref = URIRef(f"{resource_dict['id']}/records/{field['id']}") self.g.add((recordset_ref, SCHEMA.field, field_ref)) self.g.add((field_ref, RDF.type, CR.Field)) - - self.g.add((field_ref, CR.dataType, CROISSANT_FIELD_TYPES.get(field["type"]))) + if field_type := 
CROISSANT_FIELD_TYPES.get(field["type"]): + self.g.add((field_ref, CR.dataType, field_type)) source_ref = BNode() From 21e6b44de794857a7078ffa08edaa1def174dcab Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 20 Feb 2025 12:37:29 +0100 Subject: [PATCH 08/12] Revert SCHEMA.DateTime to SCHEMA.Date --- ckanext/dcat/profiles/croissant.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py index 3a18d64d..bc9b23f9 100644 --- a/ckanext/dcat/profiles/croissant.py +++ b/ckanext/dcat/profiles/croissant.py @@ -70,7 +70,7 @@ "float8": SCHEMA.Float, "numeric": SCHEMA.Float, "double precision": SCHEMA.Float, - "timestamp": SCHEMA.DateTime, + "timestamp": SCHEMA.Date, } From 84312ef170812a1295a7ce20274bcaf57a582522 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 20 Feb 2025 12:38:43 +0100 Subject: [PATCH 09/12] Document RecordSet support --- docs/croissant.md | 84 ++++++++++++++++++- .../croissant/full_croissant_dataset.jsonld | 62 ++++++++++++++ 2 files changed, 145 insertions(+), 1 deletion(-) diff --git a/docs/croissant.md b/docs/croissant.md index 81c3314a..8c6b8720 100644 --- a/docs/croissant.md +++ b/docs/croissant.md @@ -21,8 +21,15 @@ Once the plugin is enabled, the Croissant output will be embedded in the source https://{ckan-instance-host}/dataset/{dataset-id}/croissant.jsonld +## Schema mapping + The extension includes a [schema](getting-started.md#schemas) ([`ckanext/dcat/schemas/croissant.yaml`](https://github.com/ckan/ckanext-dcat/tree/master/ckanext/dcat/schemas/croissant.yml)) for sites that want to take advantage of all the entities and properties of the Croissant spec. +This maps CKAN's datasets to [schema.org Datasets](https://docs.mlcommons.org/croissant/docs/croissant-spec.html#dataset-level-information) and resources to [Croissant resources](https://docs.mlcommons.org/croissant/docs/croissant-spec.html#resources), which can have type `FileObject` or `FileSet`. 
For `FileSet` resources, use the "Sub-resources" repeating subfield to describe the contents of the file set. + +Additionally, for resources that have been imported to the CKAN [DataStore](https://docs.ckan.org/en/latest/maintaining/datastore.html), the resource will also expose Croissant's [RecordSet](https://docs.mlcommons.org/croissant/docs/croissant-spec.html#recordset) objects with information about the data fields (e.g. column names and types). + + ## Customizing If you want to modify the Croissant output you can [write your own profile](writing-profiles.md) extending the builtin `ckanext.dcat.profiles.croissant.CroissantProfile` class and register it. @@ -36,7 +43,20 @@ ckanext.dcat.croissant.profiles = my_custom_croissant_profile ## Examples * The [`examples/ckan/ckan_full_dataset_croissant.json`](https://github.com/ckan/ckanext-dcat/tree/master/examples/ckan/ckan_full_dataset_croissant.json) file contains a full CKAN dataset dict that implements the custom Croissant schema. -* Below is the Croissant serialization resulting from the dataset above: +* Below is the Croissant serialization resulting from the dataset above, and assuming the resource has a DataStore table associated with the following structure: + +```json +{ + "fields": [ + {"id": "name", "type": "text", "schema": {"is_index": true}}, + {"id": "age", "type": "int", "schema": {"is_index": false}}, + {"id": "temperature", "type": "float", "schema": {"is_index": false}}, + {"id": "timestamp", "type": "timestamp", "schema": {"is_index": false}} + ] +} + +``` + ```json { @@ -172,6 +192,68 @@ ckanext.dcat.croissant.profiles = my_custom_croissant_profile "name": "Test Publisher", "url": "https://example.org" }, + "recordSet": { + "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records", + "@type": "cr:RecordSet", + "field": [ + { + "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/temperature", + "@type": "cr:Field", + "dataType": "Float", + "source": { + "extract": { + "column": "temperature" + }, + 
"fileObject": { + "@id": "my-custom-resource-id" + } + } + }, + { + "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/timestamp", + "@type": "cr:Field", + "dataType": "Date", + "source": { + "extract": { + "column": "timestamp" + }, + "fileObject": { + "@id": "my-custom-resource-id" + } + } + }, + { + "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/name", + "@type": "cr:Field", + "dataType": "Text", + "source": { + "extract": { + "column": "name" + }, + "fileObject": { + "@id": "my-custom-resource-id" + } + } + }, + { + "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/age", + "@type": "cr:Field", + "dataType": "Integer", + "source": { + "extract": { + "column": "age" + }, + "fileObject": { + "@id": "my-custom-resource-id" + } + } + } + ], + "key": { + "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/name" + }, + "name": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records" + }, "sameAs": [ "https://some.other.catalog/dataset/123", "https://yet.another.catalog/dataset/xyz" diff --git a/examples/croissant/full_croissant_dataset.jsonld b/examples/croissant/full_croissant_dataset.jsonld index 24eab94f..9811cca7 100644 --- a/examples/croissant/full_croissant_dataset.jsonld +++ b/examples/croissant/full_croissant_dataset.jsonld @@ -131,6 +131,68 @@ "name": "Test Publisher", "url": "https://example.org" }, + "recordSet": { + "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records", + "@type": "cr:RecordSet", + "field": [ + { + "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/temperature", + "@type": "cr:Field", + "dataType": "Float", + "source": { + "extract": { + "column": "temperature" + }, + "fileObject": { + "@id": "my-custom-resource-id" + } + } + }, + { + "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/timestamp", + "@type": "cr:Field", + "dataType": "Date", + "source": { + "extract": { + "column": "timestamp" + }, + "fileObject": { + "@id": "my-custom-resource-id" + } + } + }, + { + "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/name", + 
"@type": "cr:Field", + "dataType": "Text", + "source": { + "extract": { + "column": "name" + }, + "fileObject": { + "@id": "my-custom-resource-id" + } + } + }, + { + "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/age", + "@type": "cr:Field", + "dataType": "Integer", + "source": { + "extract": { + "column": "age" + }, + "fileObject": { + "@id": "my-custom-resource-id" + } + } + } + ], + "key": { + "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/name" + }, + "name": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records" + }, "sameAs": [ "https://some.other.catalog/dataset/123", "https://yet.another.catalog/dataset/xyz" From 9199d3e7804923b1868659469aa0f9ec33e5b5a6 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 24 Feb 2025 12:31:53 +0100 Subject: [PATCH 10/12] Update croissant profile per @Reikyo suggestions --- ckanext/dcat/profiles/croissant.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py index bc9b23f9..912f36a5 100644 --- a/ckanext/dcat/profiles/croissant.py +++ b/ckanext/dcat/profiles/croissant.py @@ -527,36 +527,32 @@ def _recordset_graph(self, dataset_ref, resource_ref, resource_dict): self.g.add((recordset_ref, RDF.type, CR.RecordSet)) - # self.g.add((recordset_ref, RDF.type, SCHEMA.Text)) - - self.g.add((recordset_ref, SCHEMA.name, Literal(recordset_ref))) - unique_fields = [] for field in datastore_info["fields"]: field_ref = URIRef(f"{resource_dict['id']}/records/{field['id']}") - self.g.add((recordset_ref, SCHEMA.field, field_ref)) + self.g.add((recordset_ref, CR.field, field_ref)) self.g.add((field_ref, RDF.type, CR.Field)) if field_type := CROISSANT_FIELD_TYPES.get(field["type"]): self.g.add((field_ref, CR.dataType, field_type)) source_ref = BNode() - self.g.add((field_ref, SCHEMA.source, source_ref)) - self.g.add((source_ref, SCHEMA.fileObject, resource_ref)) + self.g.add((field_ref, CR.source, source_ref)) + self.g.add((source_ref, 
CR.fileObject, resource_ref)) extract_ref = BNode() - self.g.add((source_ref, SCHEMA.extract, extract_ref)) - self.g.add((extract_ref, SCHEMA.column, Literal(field['id']))) + self.g.add((source_ref, CR.extract, extract_ref)) + self.g.add((extract_ref, CR.column, Literal(field['id']))) if field["schema"]["is_index"]: unique_fields.append(field_ref) if unique_fields: for unique_field_ref in unique_fields: - self.g.add((recordset_ref, SCHEMA.key, unique_field_ref)) + self.g.add((recordset_ref, CR.key, unique_field_ref)) self.g.add((dataset_ref, CR.recordSet, recordset_ref)) From 86eff8c55212caa31ec0348b61f181a3c01dec96 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 24 Feb 2025 13:10:55 +0100 Subject: [PATCH 11/12] Update serialization test after profile changes --- .../tests/profiles/croissant/test_serialize.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/ckanext/dcat/tests/profiles/croissant/test_serialize.py b/ckanext/dcat/tests/profiles/croissant/test_serialize.py index 1f4ecc6c..e7c5265c 100644 --- a/ckanext/dcat/tests/profiles/croissant/test_serialize.py +++ b/ckanext/dcat/tests/profiles/croissant/test_serialize.py @@ -310,31 +310,30 @@ def mock_datastore_info(context, data_dict): recordset_ref = URIRef(f"{resource_id}/records") assert self._triple(g, dataset_ref, CR.recordSet, recordset_ref) assert self._triple(g, recordset_ref, RDF.type, CR.RecordSet) - assert self._triple(g, recordset_ref, SCHEMA.name, str(recordset_ref)) # Test fields - fields = list(g.objects(recordset_ref, SCHEMA.field)) + fields = list(g.objects(recordset_ref, CR.field)) assert len(fields) == 4 for field_datastore in fields_datastore: field_ref = URIRef(f"{resource_id}/records/{field_datastore['id']}") - assert self._triple(g, recordset_ref, SCHEMA.field, field_ref) + assert self._triple(g, recordset_ref, CR.field, field_ref) assert self._triple( g, field_ref, CR.dataType, CROISSANT_FIELD_TYPES.get(field_datastore["type"]) ) - source_ref = 
list(g.objects(field_ref, SCHEMA.source))[0] + source_ref = list(g.objects(field_ref, CR.source))[0] - assert self._triple(g, source_ref, SCHEMA.fileObject, resource_ref) + assert self._triple(g, source_ref, CR.fileObject, resource_ref) - extract_ref = list(g.objects(source_ref, SCHEMA.extract))[0] + extract_ref = list(g.objects(source_ref, CR.extract))[0] - assert self._triple(g, extract_ref, SCHEMA.column, field_datastore["id"]) + assert self._triple(g, extract_ref, CR.column, field_datastore["id"]) assert self._triple( - g, recordset_ref, SCHEMA.key, URIRef(f"{resource_id}/records/name") + g, recordset_ref, CR.key, URIRef(f"{resource_id}/records/name") ) @pytest.mark.usefixtures("with_plugins", "clean_db") From 7c1d81084340d07d924b002f312a6cbf26eaac74 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 24 Feb 2025 14:13:18 +0100 Subject: [PATCH 12/12] Use mimetype for encodingFormat if present --- ckanext/dcat/profiles/croissant.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py index 912f36a5..ad325701 100644 --- a/ckanext/dcat/profiles/croissant.py +++ b/ckanext/dcat/profiles/croissant.py @@ -439,11 +439,7 @@ def _resource_list_fields_graph(self, resource_ref, resource_dict): self._add_list_triples_from_dict(resource_dict, resource_ref, items) def _resource_format_graph(self, resource_ref, resource_dict): - if resource_dict.get("format"): - self.g.add( - (resource_ref, SCHEMA.encodingFormat, Literal(resource_dict["format"])) - ) - elif resource_dict.get("mimetype"): + if resource_dict.get("mimetype"): self.g.add( ( resource_ref, @@ -451,6 +447,11 @@ def _resource_format_graph(self, resource_ref, resource_dict): Literal(resource_dict["mimetype"]), ) ) + elif resource_dict.get("format"): + self.g.add( + (resource_ref, SCHEMA.encodingFormat, Literal(resource_dict["format"])) + ) + def _resource_url_graph(self, resource_ref, resource_dict): if 
(resource_dict.get("type") == "fileObject") and resource_dict.get("url"):