From 5663b7f0824611ac534bdf66fc5e86b687494442 Mon Sep 17 00:00:00 2001
From: amercader <amercadero@gmail.com>
Date: Thu, 13 Feb 2025 21:27:04 +0100
Subject: [PATCH 01/12] Support for Croissant RecordSet for resources in
 Datastore

For resources that are on the DataStore, we can use `datastore_info`
to get a list of the fields present in the data, and expose those as a
`RecordSet`.
Data types are also included (although these would be all `text` for
standard xloader imports) as well as primary keys.
---
 ckanext/dcat/profiles/croissant.py | 74 +++++++++++++++++++++++++++++-
 1 file changed, 72 insertions(+), 2 deletions(-)

diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py
index 24c757d2..94048ca6 100644
--- a/ckanext/dcat/profiles/croissant.py
+++ b/ckanext/dcat/profiles/croissant.py
@@ -9,7 +9,7 @@
 from dateutil.parser import parse as parse_date
 from rdflib import URIRef, BNode, Literal
 from rdflib.namespace import Namespace
-from ckantoolkit import url_for, config
+from ckantoolkit import url_for, config, asbool, get_action
 
 from ckanext.dcat.utils import resource_uri
 from .base import RDFProfile, CleanedURIRef
@@ -328,6 +328,7 @@ def _resources_graph(self, dataset_ref, dataset_dict):
 
             self._resource_graph(dataset_ref, resource_ref, resource_dict)
 
+
     def _resource_graph(self, dataset_ref, resource_ref, resource_dict):
         # Basic fields
         self._resource_basic_fields_graph(resource_ref, resource_dict)
@@ -347,6 +348,9 @@ def _resource_graph(self, dataset_ref, resource_ref, resource_dict):
         # Subresources
         self._resource_subresources_graph(dataset_ref, resource_ref, resource_dict)
 
+        # RecordSet
+        self._recordset_graph(dataset_ref, resource_ref, resource_dict)
+
     def _resource_basic_fields_graph(self, resource_ref, resource_dict):
         items = [
             ("name", SCHEMA.name, None, Literal),
@@ -387,7 +391,7 @@ def _resource_url_graph(self, resource_ref, resource_dict):
 
     def _resource_numbers_graph(self, resource_ref, resource_dict):
         if resource_dict.get("size"):
-            self.g.add((resource_ref, SCHEMA.contentSize, Literal(resource_dict["size"])))
+            self.g.add((resource_ref, SCHEMA.contentSize, Literal(str(resource_dict["size"]))))
 
     def _resource_subresources_graph(self, dataset_ref, resource_ref, resource_dict):
         subresource_dicts = self._get_resource_value(resource_dict, "subresources")
@@ -433,3 +437,69 @@ def _resource_subresources_graph(self, dataset_ref, resource_ref, resource_dict)
                     self._add_list_triples_from_dict(subresource_dict, subresource_ref, items)
 
                 self.g.add((subresource_ref, CR.containedIn, resource_ref))
+
+    def _recordset_graph(self, dataset_ref, resource_ref, resource_dict):
+
+        # Skip if data not in the DataStore
+        if not resource_dict.get("id") or not asbool(resource_dict.get("datastore_active")):
+            return
+
+        # Get fields info
+        try:
+            datastore_info = get_action("datastore_info")(
+                {"ignore_auth": True},
+                {"id": resource_dict["id"]}
+            )
+        except KeyError:
+            # DataStore not enabled
+            return
+
+        if not datastore_info or not datastore_info.get("fields"):
+            return
+
+        recordset_ref = URIRef(f"{resource_dict['id']}/records")
+
+        self.g.add((recordset_ref, RDF.type, CR.RecordSet))
+
+#        self.g.add((recordset_ref, RDF.type, SCHEMA.Text))
+
+        self.g.add((recordset_ref, SCHEMA.name, Literal(recordset_ref)))
+
+        FIELD_TYPES = {
+            "text": SCHEMA.Text,
+            "int": SCHEMA.Integer,
+            "float": SCHEMA.Float,
+            "numeric": SCHEMA.Float,
+            "timestamp": SCHEMA.DateTime,
+        }
+
+        unique_fields = []
+        for field in datastore_info["fields"]:
+
+
+            field_ref = URIRef(f"{resource_dict['id']}/records/{field['id']}")
+
+            self.g.add((recordset_ref, SCHEMA.field, field_ref))
+            self.g.add((field_ref, RDF.type, CR.Field))
+
+            self.g.add((field_ref, CR.dataType, FIELD_TYPES.get(field["type"])))
+
+            source_ref = BNode()
+
+            self.g.add((field_ref, SCHEMA.source, source_ref))
+            self.g.add((source_ref, SCHEMA.fileObject, resource_ref))
+
+            extract_ref = BNode()
+
+            self.g.add((source_ref, SCHEMA.extract, extract_ref))
+            self.g.add((extract_ref, SCHEMA.column, Literal(field['id'])))
+
+            if field["schema"]["is_index"]:
+                unique_fields.append(field_ref)
+
+        if unique_fields:
+            for unique_field_ref in unique_fields:
+                self.g.add((recordset_ref, SCHEMA.key, unique_field_ref))
+
+        self.g.add((dataset_ref, CR.recordSet, recordset_ref))
+

From d0cffbd7012678c534581ce16074b4c8fee7c44c Mon Sep 17 00:00:00 2001
From: amercader <amercadero@gmail.com>
Date: Tue, 18 Feb 2025 10:21:57 +0100
Subject: [PATCH 02/12] Better is_live_dataset check

---
 ckanext/dcat/profiles/croissant.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py
index 7cc65c24..1ab98472 100644
--- a/ckanext/dcat/profiles/croissant.py
+++ b/ckanext/dcat/profiles/croissant.py
@@ -156,8 +156,11 @@ def _basic_fields_graph(self, dataset_ref, dataset_dict):
         dataset_url = url_for("dataset.read", id=dataset_dict["name"], _external=True) # required
         self.g.add((dataset_ref, SCHEMA.url, Literal(dataset_url)))
 
-        if 'is_live_dataset' in dataset_dict:
-            is_live_dataset = asbool(dataset_dict["is_live_dataset"])
+        if "is_live_dataset" in dataset_dict:
+            try:
+                is_live_dataset = asbool(dataset_dict["is_live_dataset"])
+            except ValueError:
+                is_live_dataset = None
         else:
             is_live_dataset = None
 

From 8bac82f0b4749c31a243abb271fc285efae6e2d3 Mon Sep 17 00:00:00 2001
From: amercader <amercadero@gmail.com>
Date: Wed, 19 Feb 2025 14:30:04 +0100
Subject: [PATCH 03/12] Add serialize test for recordset logic

---
 ckanext/dcat/profiles/croissant.py            | 22 +++---
 .../profiles/croissant/test_serialize.py      | 76 ++++++++++++++++++-
 2 files changed, 86 insertions(+), 12 deletions(-)

diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py
index 1ab98472..7c053984 100644
--- a/ckanext/dcat/profiles/croissant.py
+++ b/ckanext/dcat/profiles/croissant.py
@@ -68,6 +68,15 @@
     "transform": "cr:transform"
 }
 
+CROISSANT_FIELD_TYPES = {
+    "text": SCHEMA.Text,
+    "int": SCHEMA.Integer,
+    "float": SCHEMA.Float,
+    "numeric": SCHEMA.Float,
+    "timestamp": SCHEMA.DateTime,
+}
+
+
 class CroissantProfile(RDFProfile):
     """
     An RDF profile based on the schema.org Dataset, modified by Croissant.
@@ -442,28 +451,19 @@ def _recordset_graph(self, dataset_ref, resource_ref, resource_dict):
 
         self.g.add((recordset_ref, RDF.type, CR.RecordSet))
 
-#        self.g.add((recordset_ref, RDF.type, SCHEMA.Text))
+        #        self.g.add((recordset_ref, RDF.type, SCHEMA.Text))
 
         self.g.add((recordset_ref, SCHEMA.name, Literal(recordset_ref)))
 
-        FIELD_TYPES = {
-            "text": SCHEMA.Text,
-            "int": SCHEMA.Integer,
-            "float": SCHEMA.Float,
-            "numeric": SCHEMA.Float,
-            "timestamp": SCHEMA.DateTime,
-        }
-
         unique_fields = []
         for field in datastore_info["fields"]:
 
-
             field_ref = URIRef(f"{resource_dict['id']}/records/{field['id']}")
 
             self.g.add((recordset_ref, SCHEMA.field, field_ref))
             self.g.add((field_ref, RDF.type, CR.Field))
 
-            self.g.add((field_ref, CR.dataType, FIELD_TYPES.get(field["type"])))
+            self.g.add((field_ref, CR.dataType, CROISSANT_FIELD_TYPES.get(field["type"])))
 
             source_ref = BNode()
 
diff --git a/ckanext/dcat/tests/profiles/croissant/test_serialize.py b/ckanext/dcat/tests/profiles/croissant/test_serialize.py
index f678aa03..5c5b5780 100644
--- a/ckanext/dcat/tests/profiles/croissant/test_serialize.py
+++ b/ckanext/dcat/tests/profiles/croissant/test_serialize.py
@@ -1,5 +1,7 @@
 from builtins import str
 import json
+from unittest import mock
+import uuid
 
 import pytest
 
@@ -14,7 +16,7 @@
 from ckanext.dcat import utils
 from ckanext.dcat.profiles import XSD, DCT, FOAF
 from ckanext.dcat.processors import RDFSerializer
-from ckanext.dcat.profiles.croissant import SCHEMA, CR
+from ckanext.dcat.profiles.croissant import SCHEMA, CR, CROISSANT_FIELD_TYPES
 
 from ckanext.dcat.tests.profiles.dcat_ap.test_euro_dcatap_profile_serialize import (
     BaseSerializeTest,
@@ -259,3 +261,75 @@ def test_graph_from_dataset(self):
             CR.excludes,
             sub_resource_file_set_dict["excludes"],
         )
+
+    def test_graph_from_dataset_with_recordset(self):
+
+        dataset_id = str(uuid.uuid4())
+        resource_id = str(uuid.uuid4())
+
+        dataset_dict = {
+            "id": dataset_id,
+            "name": "test-dataset",
+            "title": "Test Dataset",
+            "notes": "Test description",
+            "resources": [
+                {
+                    "id": resource_id,
+                    "url": "http://example.com/data.csv",
+                    "format": "CSV",
+                    "datastore_active": True,
+                }
+            ],
+        }
+        fields_datastore = [
+            {"id": "name", "type": "text", "schema": {"is_index": True}},
+            {"id": "age", "type": "int", "schema": {"is_index": False}},
+            {"id": "timestamp", "type": "timestamp", "schema": {"is_index": False}},
+        ]
+
+        def mock_datastore_info(context, data_dict):
+            return {
+                "meta": {"id": resource_id, "count": 10, "table_type": "BASE TABLE"},
+                "fields": fields_datastore,
+            }
+
+        with mock.patch(
+            "ckanext.dcat.profiles.croissant.get_action"
+        ) as mock_get_action:
+            mock_get_action.return_value = mock_datastore_info
+
+            s = RDFSerializer(profiles=["croissant"])
+            g = s.g
+
+            dataset_ref = s.graph_from_dataset(dataset_dict)
+            resource_ref = list(g.objects(dataset_ref, SCHEMA.distribution))[0]
+
+            recordset_ref = URIRef(f"{resource_id}/records")
+            assert self._triple(g, dataset_ref, CR.recordSet, recordset_ref)
+            assert self._triple(g, recordset_ref, RDF.type, CR.RecordSet)
+            assert self._triple(g, recordset_ref, SCHEMA.name, str(recordset_ref))
+
+            # Test fields
+            fields = list(g.objects(recordset_ref, SCHEMA.field))
+            assert len(fields) == 3
+
+            for field_datastore in fields_datastore:
+                field_ref = URIRef(f"{resource_id}/records/{field_datastore['id']}")
+
+                assert self._triple(g, recordset_ref, SCHEMA.field, field_ref)
+
+                assert self._triple(
+                    g, field_ref, CR.dataType, CROISSANT_FIELD_TYPES.get(field_datastore["type"])
+                )
+
+                source_ref = list(g.objects(field_ref, SCHEMA.source))[0]
+
+                assert self._triple(g, source_ref, SCHEMA.fileObject, resource_ref)
+
+                extract_ref = list(g.objects(source_ref, SCHEMA.extract))[0]
+
+                assert self._triple(g, extract_ref, SCHEMA.column, field_datastore["id"])
+
+            assert self._triple(
+                g, recordset_ref, SCHEMA.key, URIRef(f"{resource_id}/records/name")
+            )

From d0c9bfed0208214ac77d411374557d3413c13cb1 Mon Sep 17 00:00:00 2001
From: amercader <amercadero@gmail.com>
Date: Wed, 19 Feb 2025 15:36:06 +0100
Subject: [PATCH 04/12] Add validation test for recordset

Currently failing because SCHEMA.DateTime is not an expected type??
---
 .../tests/profiles/croissant/test_validate.py | 43 ++++++++++++++++++-
 .../ckan/ckan_full_dataset_croissant.json     |  1 +
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/ckanext/dcat/tests/profiles/croissant/test_validate.py b/ckanext/dcat/tests/profiles/croissant/test_validate.py
index e1df0a2b..e75709f3 100644
--- a/ckanext/dcat/tests/profiles/croissant/test_validate.py
+++ b/ckanext/dcat/tests/profiles/croissant/test_validate.py
@@ -1,5 +1,6 @@
 import json
 import sys
+from unittest import mock
 
 try:
     import mlcroissant as mlc
@@ -15,7 +16,7 @@
 
 
 @pytest.mark.skipif(
-    sys.version_info < (3, 10), reason="croissant is not available in py<3.10"
+    sys.version_info < (3, 10), reason="mlcroissant is not available in py<3.10"
 )
 def test_valid_output():
 
@@ -37,3 +38,43 @@ def test_valid_output():
         mlc.Dataset(croissant_dict)
     except mlc.ValidationError as exception:
         raise
+
+
+@pytest.mark.skipif(
+    sys.version_info < (3, 10), reason="mlcroissant is not available in py<3.10"
+)
+def test_valid_output_with_recordset():
+
+    dataset_dict = json.loads(
+        get_file_contents("ckan/ckan_full_dataset_croissant.json")
+    )
+
+    resource_id = dataset_dict["resources"][0]["id"]
+
+    def mock_datastore_info(context, data_dict):
+        return {
+            "meta": {"id": resource_id, "count": 10, "table_type": "BASE TABLE"},
+            "fields": [
+                {"id": "name", "type": "text", "schema": {"is_index": True}},
+                {"id": "age", "type": "int", "schema": {"is_index": False}},
+                {"id": "timestamp", "type": "timestamp", "schema": {"is_index": False}},
+            ],
+        }
+
+    with mock.patch("ckanext.dcat.profiles.croissant.get_action") as mock_get_action:
+        mock_get_action.return_value = mock_datastore_info
+
+        s = RDFSerializer(profiles=["croissant"])
+
+        s.graph_from_dataset(dataset_dict)
+
+        croissant_dict = json.loads(
+            s.g.serialize(format="json-ld", auto_compact=True, context=JSONLD_CONTEXT)
+        )
+        with open("graph.jsonld", "w") as f:
+            f.write(json.dumps(croissant_dict))
+
+        try:
+            mlc.Dataset(croissant_dict)
+        except mlc.ValidationError as exception:
+            raise
diff --git a/examples/ckan/ckan_full_dataset_croissant.json b/examples/ckan/ckan_full_dataset_croissant.json
index 5a650560..8d867136 100644
--- a/examples/ckan/ckan_full_dataset_croissant.json
+++ b/examples/ckan/ckan_full_dataset_croissant.json
@@ -60,6 +60,7 @@
             "id_given": "my-custom-resource-id",
             "size": "12323",
             "hash": "b221d9dbb083a7f33428d7c2a3c3198ae925614d70210e28716ccaa7cd4ddb79",
+            "datastore_active": true,
             "subresources": [
                 {
                     "type": "fileObject",

From 5fef19cda9ad0e90876f3c4cd7a91ef757b1457b Mon Sep 17 00:00:00 2001
From: amercader <amercadero@gmail.com>
Date: Wed, 19 Feb 2025 15:37:58 +0100
Subject: [PATCH 05/12] Test float fields

---
 ckanext/dcat/tests/profiles/croissant/test_serialize.py | 3 ++-
 ckanext/dcat/tests/profiles/croissant/test_validate.py  | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/ckanext/dcat/tests/profiles/croissant/test_serialize.py b/ckanext/dcat/tests/profiles/croissant/test_serialize.py
index 5c5b5780..6958f7ce 100644
--- a/ckanext/dcat/tests/profiles/croissant/test_serialize.py
+++ b/ckanext/dcat/tests/profiles/croissant/test_serialize.py
@@ -284,6 +284,7 @@ def test_graph_from_dataset_with_recordset(self):
         fields_datastore = [
             {"id": "name", "type": "text", "schema": {"is_index": True}},
             {"id": "age", "type": "int", "schema": {"is_index": False}},
+            {"id": "temperature", "type": "float", "schema": {"is_index": False}},
             {"id": "timestamp", "type": "timestamp", "schema": {"is_index": False}},
         ]
 
@@ -311,7 +312,7 @@ def mock_datastore_info(context, data_dict):
 
             # Test fields
             fields = list(g.objects(recordset_ref, SCHEMA.field))
-            assert len(fields) == 3
+            assert len(fields) == 4
 
             for field_datastore in fields_datastore:
                 field_ref = URIRef(f"{resource_id}/records/{field_datastore['id']}")
diff --git a/ckanext/dcat/tests/profiles/croissant/test_validate.py b/ckanext/dcat/tests/profiles/croissant/test_validate.py
index e75709f3..02efd2ff 100644
--- a/ckanext/dcat/tests/profiles/croissant/test_validate.py
+++ b/ckanext/dcat/tests/profiles/croissant/test_validate.py
@@ -57,6 +57,7 @@ def mock_datastore_info(context, data_dict):
             "fields": [
                 {"id": "name", "type": "text", "schema": {"is_index": True}},
                 {"id": "age", "type": "int", "schema": {"is_index": False}},
+                {"id": "temperature", "type": "float", "schema": {"is_index": False}},
                 {"id": "timestamp", "type": "timestamp", "schema": {"is_index": False}},
             ],
         }

From 392f98f1cd946d99c39f1c19e4d8127ceea89a10 Mon Sep 17 00:00:00 2001
From: amercader <amercadero@gmail.com>
Date: Wed, 19 Feb 2025 16:01:38 +0100
Subject: [PATCH 06/12] Use SCHEMA.Date instead of SCHEMA.DateTime for
 timestamps

It sounds really counterintuitive but the validator doesn't like fields
with a dataType of DateTime:

```
[Metadata(Test Croissant dataset) >
RecordSet(568b8ac9-8c69-4475-b35e-d7f812a63c32/records) > Field()] The
field does not specify a valid http://mlcommons.org/croissant/dataType,
neither does any of its predecessor. Got:
[rdflib.term.URIRef('https://schema.org/DateTime')]
```

Looks like the Date / DateTime situation in schema.org is a bit
confusing:

https://github.com/schemaorg/schemaorg/issues/1748
---
 ckanext/dcat/profiles/croissant.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py
index 9c80b147..22fbd5cc 100644
--- a/ckanext/dcat/profiles/croissant.py
+++ b/ckanext/dcat/profiles/croissant.py
@@ -65,7 +65,7 @@
     "int": SCHEMA.Integer,
     "float": SCHEMA.Float,
     "numeric": SCHEMA.Float,
-    "timestamp": SCHEMA.DateTime,
+    "timestamp": SCHEMA.Date,
 }
 
 

From 0291008d80cee1d2ffddb5d863f3b4621c020afd Mon Sep 17 00:00:00 2001
From: amercader <amercadero@gmail.com>
Date: Thu, 20 Feb 2025 12:14:33 +0100
Subject: [PATCH 07/12] Don't fail if field type unknown, handle extra field
 types

---
 ckanext/dcat/profiles/croissant.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py
index 22fbd5cc..3a18d64d 100644
--- a/ckanext/dcat/profiles/croissant.py
+++ b/ckanext/dcat/profiles/croissant.py
@@ -63,9 +63,14 @@
 CROISSANT_FIELD_TYPES = {
     "text": SCHEMA.Text,
     "int": SCHEMA.Integer,
+    "int4": SCHEMA.Integer,
+    "int8": SCHEMA.Integer,
     "float": SCHEMA.Float,
+    "float4": SCHEMA.Float,
+    "float8": SCHEMA.Float,
     "numeric": SCHEMA.Float,
-    "timestamp": SCHEMA.Date,
+    "double precision": SCHEMA.Float,
+    "timestamp": SCHEMA.DateTime,
 }
 
 
@@ -527,14 +532,15 @@ def _recordset_graph(self, dataset_ref, resource_ref, resource_dict):
         self.g.add((recordset_ref, SCHEMA.name, Literal(recordset_ref)))
 
         unique_fields = []
+
         for field in datastore_info["fields"]:
 
             field_ref = URIRef(f"{resource_dict['id']}/records/{field['id']}")
 
             self.g.add((recordset_ref, SCHEMA.field, field_ref))
             self.g.add((field_ref, RDF.type, CR.Field))
-
-            self.g.add((field_ref, CR.dataType, CROISSANT_FIELD_TYPES.get(field["type"])))
+            if field_type := CROISSANT_FIELD_TYPES.get(field["type"]):
+                self.g.add((field_ref, CR.dataType, field_type))
 
             source_ref = BNode()
 

From 21e6b44de794857a7078ffa08edaa1def174dcab Mon Sep 17 00:00:00 2001
From: amercader <amercadero@gmail.com>
Date: Thu, 20 Feb 2025 12:37:29 +0100
Subject: [PATCH 08/12] Revert SCHEMA.DateTime to SCHEMA.Date

---
 ckanext/dcat/profiles/croissant.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py
index 3a18d64d..bc9b23f9 100644
--- a/ckanext/dcat/profiles/croissant.py
+++ b/ckanext/dcat/profiles/croissant.py
@@ -70,7 +70,7 @@
     "float8": SCHEMA.Float,
     "numeric": SCHEMA.Float,
     "double precision": SCHEMA.Float,
-    "timestamp": SCHEMA.DateTime,
+    "timestamp": SCHEMA.Date,
 }
 
 

From 84312ef170812a1295a7ce20274bcaf57a582522 Mon Sep 17 00:00:00 2001
From: amercader <amercadero@gmail.com>
Date: Thu, 20 Feb 2025 12:38:43 +0100
Subject: [PATCH 09/12] Document RecordSet support

---
 docs/croissant.md                             | 84 ++++++++++++++++++-
 .../croissant/full_croissant_dataset.jsonld   | 62 ++++++++++++++
 2 files changed, 145 insertions(+), 1 deletion(-)

diff --git a/docs/croissant.md b/docs/croissant.md
index 81c3314a..8c6b8720 100644
--- a/docs/croissant.md
+++ b/docs/croissant.md
@@ -21,8 +21,15 @@ Once the plugin is enabled, the Croissant output will be embedded in the source
 
     https://{ckan-instance-host}/dataset/{dataset-id}/croissant.jsonld
 
+## Schema mapping
+
 The extension includes a [schema](getting-started.md#schemas) ([`ckanext/dcat/schemas/croissant.yaml`](https://github.com/ckan/ckanext-dcat/tree/master/ckanext/dcat/schemas/croissant.yml)) for sites that want to take advantage of all the entities and properties of the Croissant spec.
 
+This maps CKAN's datasets to [schema.org Datasets](https://docs.mlcommons.org/croissant/docs/croissant-spec.html#dataset-level-information) and resources to [Croissant resources](https://docs.mlcommons.org/croissant/docs/croissant-spec.html#resources), which can have type `FileObject` or `FileSet`. For `FileSet` resources, use the "Sub-resources" repeating subfield to describe the contents of the file set.
+
+Additionally, for resources that have been imported to the CKAN [DataStore](https://docs.ckan.org/en/latest/maintaining/datastore.html), the resource will also expose Croissant's [RecordSet](https://docs.mlcommons.org/croissant/docs/croissant-spec.html#recordset) objects with information about the data fields (e.g. column names and types).
+
+
 ## Customizing
 
 If you want to modify the Croissant output you can [write your own profile](writing-profiles.md) extending the builtin `ckanext.dcat.profiles.croissant.CroissantProfile` class and register it.
@@ -36,7 +43,20 @@ ckanext.dcat.croissant.profiles = my_custom_croissant_profile
 ## Examples
 
 * The [`examples/ckan/ckan_full_dataset_croissant.json`](https://github.com/ckan/ckanext-dcat/tree/master/examples/ckan/ckan_full_dataset_croissant.json) file contains a full CKAN dataset dict that implements the custom Croissant schema.
-* Below is the Croissant serialization resulting from the dataset above:
+* Below is the Croissant serialization resulting from the dataset above, and assuming the resource has a DataStore table associated with the following structure:
+
+```json
+{
+	"fields": [
+		{"id": "name", "type": "text", "schema": {"is_index": true}},
+		{"id": "age", "type": "int", "schema": {"is_index": false}},
+		{"id": "temperature", "type": "float", "schema": {"is_index": false}},
+		{"id": "timestamp", "type": "timestamp", "schema": {"is_index": false}}
+	]
+}
+
+```
+
 
 ```json
 {
@@ -172,6 +192,68 @@ ckanext.dcat.croissant.profiles = my_custom_croissant_profile
         "name": "Test Publisher",
         "url": "https://example.org"
     },
+    "recordSet": {
+      "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records",
+      "@type": "cr:RecordSet",
+      "field": [
+        {
+          "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/temperature",
+          "@type": "cr:Field",
+          "dataType": "Float",
+          "source": {
+            "extract": {
+              "column": "temperature"
+            },
+            "fileObject": {
+              "@id": "my-custom-resource-id"
+            }
+          }
+        },
+        {
+          "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/timestamp",
+          "@type": "cr:Field",
+          "dataType": "Date",
+          "source": {
+            "extract": {
+              "column": "timestamp"
+            },
+            "fileObject": {
+              "@id": "my-custom-resource-id"
+            }
+          }
+        },
+        {
+          "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/name",
+          "@type": "cr:Field",
+          "dataType": "Text",
+          "source": {
+            "extract": {
+              "column": "name"
+            },
+            "fileObject": {
+              "@id": "my-custom-resource-id"
+            }
+          }
+        },
+        {
+          "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/age",
+          "@type": "cr:Field",
+          "dataType": "Integer",
+          "source": {
+            "extract": {
+              "column": "age"
+            },
+            "fileObject": {
+              "@id": "my-custom-resource-id"
+            }
+          }
+        }
+      ],
+      "key": {
+        "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/name"
+      },
+      "name": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records"
+    },
     "sameAs": [
         "https://some.other.catalog/dataset/123",
         "https://yet.another.catalog/dataset/xyz"
diff --git a/examples/croissant/full_croissant_dataset.jsonld b/examples/croissant/full_croissant_dataset.jsonld
index 24eab94f..9811cca7 100644
--- a/examples/croissant/full_croissant_dataset.jsonld
+++ b/examples/croissant/full_croissant_dataset.jsonld
@@ -131,6 +131,68 @@
         "name": "Test Publisher",
         "url": "https://example.org"
     },
+    "recordSet": {
+      "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records",
+      "@type": "cr:RecordSet",
+      "field": [
+        {
+          "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/temperature",
+          "@type": "cr:Field",
+          "dataType": "Float",
+          "source": {
+            "extract": {
+              "column": "temperature"
+            },
+            "fileObject": {
+              "@id": "my-custom-resource-id"
+            }
+          }
+        },
+        {
+          "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/timestamp",
+          "@type": "cr:Field",
+          "dataType": "Date",
+          "source": {
+            "extract": {
+              "column": "timestamp"
+            },
+            "fileObject": {
+              "@id": "my-custom-resource-id"
+            }
+          }
+        },
+        {
+          "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/name",
+          "@type": "cr:Field",
+          "dataType": "Text",
+          "source": {
+            "extract": {
+              "column": "name"
+            },
+            "fileObject": {
+              "@id": "my-custom-resource-id"
+            }
+          }
+        },
+        {
+          "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/age",
+          "@type": "cr:Field",
+          "dataType": "Integer",
+          "source": {
+            "extract": {
+              "column": "age"
+            },
+            "fileObject": {
+              "@id": "my-custom-resource-id"
+            }
+          }
+        }
+      ],
+      "key": {
+        "@id": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records/name"
+      },
+      "name": "568b8ac9-8c69-4475-b35e-d7f812a63c32/records"
+    },
     "sameAs": [
         "https://some.other.catalog/dataset/123",
         "https://yet.another.catalog/dataset/xyz"

From 9199d3e7804923b1868659469aa0f9ec33e5b5a6 Mon Sep 17 00:00:00 2001
From: amercader <amercadero@gmail.com>
Date: Mon, 24 Feb 2025 12:31:53 +0100
Subject: [PATCH 10/12] Update croissant profile per @Reikyo suggestions

---
 ckanext/dcat/profiles/croissant.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py
index bc9b23f9..912f36a5 100644
--- a/ckanext/dcat/profiles/croissant.py
+++ b/ckanext/dcat/profiles/croissant.py
@@ -527,36 +527,32 @@ def _recordset_graph(self, dataset_ref, resource_ref, resource_dict):
 
         self.g.add((recordset_ref, RDF.type, CR.RecordSet))
 
-        #        self.g.add((recordset_ref, RDF.type, SCHEMA.Text))
-
-        self.g.add((recordset_ref, SCHEMA.name, Literal(recordset_ref)))
-
         unique_fields = []
 
         for field in datastore_info["fields"]:
 
             field_ref = URIRef(f"{resource_dict['id']}/records/{field['id']}")
 
-            self.g.add((recordset_ref, SCHEMA.field, field_ref))
+            self.g.add((recordset_ref, CR.field, field_ref))
             self.g.add((field_ref, RDF.type, CR.Field))
             if field_type := CROISSANT_FIELD_TYPES.get(field["type"]):
                 self.g.add((field_ref, CR.dataType, field_type))
 
             source_ref = BNode()
 
-            self.g.add((field_ref, SCHEMA.source, source_ref))
-            self.g.add((source_ref, SCHEMA.fileObject, resource_ref))
+            self.g.add((field_ref, CR.source, source_ref))
+            self.g.add((source_ref, CR.fileObject, resource_ref))
 
             extract_ref = BNode()
 
-            self.g.add((source_ref, SCHEMA.extract, extract_ref))
-            self.g.add((extract_ref, SCHEMA.column, Literal(field['id'])))
+            self.g.add((source_ref, CR.extract, extract_ref))
+            self.g.add((extract_ref, CR.column, Literal(field['id'])))
 
             if field["schema"]["is_index"]:
                 unique_fields.append(field_ref)
 
         if unique_fields:
             for unique_field_ref in unique_fields:
-                self.g.add((recordset_ref, SCHEMA.key, unique_field_ref))
+                self.g.add((recordset_ref, CR.key, unique_field_ref))
 
         self.g.add((dataset_ref, CR.recordSet, recordset_ref))

From 86eff8c55212caa31ec0348b61f181a3c01dec96 Mon Sep 17 00:00:00 2001
From: amercader <amercadero@gmail.com>
Date: Mon, 24 Feb 2025 13:10:55 +0100
Subject: [PATCH 11/12] Update serialization test after profile changes

---
 .../tests/profiles/croissant/test_serialize.py    | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/ckanext/dcat/tests/profiles/croissant/test_serialize.py b/ckanext/dcat/tests/profiles/croissant/test_serialize.py
index 1f4ecc6c..e7c5265c 100644
--- a/ckanext/dcat/tests/profiles/croissant/test_serialize.py
+++ b/ckanext/dcat/tests/profiles/croissant/test_serialize.py
@@ -310,31 +310,30 @@ def mock_datastore_info(context, data_dict):
             recordset_ref = URIRef(f"{resource_id}/records")
             assert self._triple(g, dataset_ref, CR.recordSet, recordset_ref)
             assert self._triple(g, recordset_ref, RDF.type, CR.RecordSet)
-            assert self._triple(g, recordset_ref, SCHEMA.name, str(recordset_ref))
 
             # Test fields
-            fields = list(g.objects(recordset_ref, SCHEMA.field))
+            fields = list(g.objects(recordset_ref, CR.field))
             assert len(fields) == 4
 
             for field_datastore in fields_datastore:
                 field_ref = URIRef(f"{resource_id}/records/{field_datastore['id']}")
 
-                assert self._triple(g, recordset_ref, SCHEMA.field, field_ref)
+                assert self._triple(g, recordset_ref, CR.field, field_ref)
 
                 assert self._triple(
                     g, field_ref, CR.dataType, CROISSANT_FIELD_TYPES.get(field_datastore["type"])
                 )
 
-                source_ref = list(g.objects(field_ref, SCHEMA.source))[0]
+                source_ref = list(g.objects(field_ref, CR.source))[0]
 
-                assert self._triple(g, source_ref, SCHEMA.fileObject, resource_ref)
+                assert self._triple(g, source_ref, CR.fileObject, resource_ref)
 
-                extract_ref = list(g.objects(source_ref, SCHEMA.extract))[0]
+                extract_ref = list(g.objects(source_ref, CR.extract))[0]
 
-                assert self._triple(g, extract_ref, SCHEMA.column, field_datastore["id"])
+                assert self._triple(g, extract_ref, CR.column, field_datastore["id"])
 
             assert self._triple(
-                g, recordset_ref, SCHEMA.key, URIRef(f"{resource_id}/records/name")
+                g, recordset_ref, CR.key, URIRef(f"{resource_id}/records/name")
             )
 
     @pytest.mark.usefixtures("with_plugins", "clean_db")

From 7c1d81084340d07d924b002f312a6cbf26eaac74 Mon Sep 17 00:00:00 2001
From: amercader <amercadero@gmail.com>
Date: Mon, 24 Feb 2025 14:13:18 +0100
Subject: [PATCH 12/12] Use mimetype for encodingFormat if present

---
 ckanext/dcat/profiles/croissant.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/ckanext/dcat/profiles/croissant.py b/ckanext/dcat/profiles/croissant.py
index 912f36a5..ad325701 100644
--- a/ckanext/dcat/profiles/croissant.py
+++ b/ckanext/dcat/profiles/croissant.py
@@ -439,11 +439,7 @@ def _resource_list_fields_graph(self, resource_ref, resource_dict):
         self._add_list_triples_from_dict(resource_dict, resource_ref, items)
 
     def _resource_format_graph(self, resource_ref, resource_dict):
-        if resource_dict.get("format"):
-            self.g.add(
-                (resource_ref, SCHEMA.encodingFormat, Literal(resource_dict["format"]))
-            )
-        elif resource_dict.get("mimetype"):
+        if resource_dict.get("mimetype"):
             self.g.add(
                 (
                     resource_ref,
@@ -451,6 +447,11 @@ def _resource_format_graph(self, resource_ref, resource_dict):
                     Literal(resource_dict["mimetype"]),
                 )
             )
+        elif resource_dict.get("format"):
+            self.g.add(
+                (resource_ref, SCHEMA.encodingFormat, Literal(resource_dict["format"]))
+            )
+
 
     def _resource_url_graph(self, resource_ref, resource_dict):
         if (resource_dict.get("type") == "fileObject") and resource_dict.get("url"):