U/jrbogart/in place mod #180

Merged · 17 commits · Mar 12, 2025
10 changes: 9 additions & 1 deletion CHANGELOG.md
@@ -1,3 +1,11 @@
## Version 1.2.0

- Make `dataset.relative_path` nullable
- Adjust code to set `dataset.relative_path` to null for datasets of
  location types `external` and `meta_only`
- Add documentation for in-place database upgrades
- Add script for performing the db upgrade

## Version 1.1.1

- Move `get_keyword_list()` function to the `Query()` class.
@@ -111,7 +119,7 @@ list the proper join would not be made. This has been corrected.
## Version 0.5.3

- Update the `schema.yaml` file to include unique constraints and table
  indexes.
- Update the unique constraints for the dataset table to be `owner`,
`owner_type`, `name`, `version`, `version_suffix`.

61 changes: 61 additions & 0 deletions docs/source/database_upgrades.rst
@@ -0,0 +1,61 @@

Database Upgrades
=================

.. _database_upgrades:


Creating an alternate database
******************************

If it doesn't already exist, create an alternate database from ``psql`` using
the admin account:

.. code-block:: bash

$ psql desc_data_registry reg_admin
desc_data_registry=# create database alt_db;

and add an entry to your ``.pgpass`` for the ``reg_admin`` account and the
``alt_db`` database.
If you expect to use standard ``dataregistry`` utilities for your updates
(recommended), you'll also need an alternate config file for connecting to
the alternate database as ``reg_admin``.
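
A ``.pgpass`` entry has the form ``host:port:database:user:password``. For
example (the hostname and password below are placeholders, not values from
this repo):

.. code-block:: bash

   $ # Hypothetical entry for the alt_db database as reg_admin
   $ echo "dbserver.example.org:5432:alt_db:reg_admin:PASSWORD" >> ~/.pgpass
   $ chmod 600 ~/.pgpass   # libpq ignores .pgpass unless only the owner can read it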

Dump the active database
************************

Dump both schemas and data from the ``desc_data_registry`` database:

.. code-block:: bash

$ pg_dump -U reg_admin desc_data_registry --schema=lsst_desc_production --file=production_dump.sql
$ pg_dump -U reg_admin desc_data_registry --schema=lsst_desc_working --file=working_dump.sql

See the ``pg_dump`` documentation for a description of all options. For
example, you might want to use a different output format than the default
plain-text one used here.

Restore to alt_db database
**************************

.. code-block:: bash

$ psql -U reg_admin -X --set ON_ERROR_STOP=on alt_db < production_dump.sql
$ psql -U reg_admin -X --set ON_ERROR_STOP=on alt_db < working_dump.sql

Depending on the format of your dumped data, you might instead need to use the
``pg_restore`` program rather than ``psql`` to do the restore.
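
For example, if the dumps had been made with ``--format=custom``, the restore
might look like this instead (a sketch, assuming the same account and database
names as above):

.. code-block:: bash

   $ pg_dump -U reg_admin desc_data_registry --schema=lsst_desc_production --format=custom --file=production_dump.dump
   $ pg_restore -U reg_admin -d alt_db --exit-on-error production_dump.dump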

Test and apply for real
***********************

If the update involves changes to the schema you'll need a script to implement
them and also add an entry to the ``provenance`` table. You'll also need
to update the database version as stored in
``dataregistry/src/dataregistry/schema/schema_version.py``.
If the update involves changes to entries stored in the database, you'll need
a script for that as well (or, if more convenient, use a single script for
both). See examples in the dataregistry GitHub repo under
``dataregistry/scripts/schema_migration/`` and the sketch below.
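
For instance, the migration in this PR amounts to DDL and data updates of the
following shape (a sketch only; the real script also records the new version
in the ``provenance`` table):

.. code-block:: bash

   $ psql -U reg_admin alt_db
   alt_db=# alter table lsst_desc_working.dataset alter column relative_path drop not null;
   alt_db=# update lsst_desc_working.dataset set relative_path = NULL
   alt_db-#     where location_type in ('external', 'meta_only');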

Run your script(s) in ``alt_db``, fix any issues, then run them in the
``desc_data_registry``. Once you're satisfied everything is OK,
delete the copies of the schemas in ``alt_db``.
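
Deleting the copies can be done from ``psql``, assuming the schema names used
above:

.. code-block:: bash

   $ psql -U reg_admin alt_db
   alt_db=# drop schema lsst_desc_production cascade;
   alt_db=# drop schema lsst_desc_working cascade;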
3 changes: 2 additions & 1 deletion docs/source/index.rst
@@ -25,7 +25,7 @@ Getting started

This documentation is to help you get set up using the ``dataregistry`` Python
package; covering installation, how to register datasets, and how to query for
them.

.. toctree::
:maxdepth: 2
@@ -61,6 +61,7 @@ them.
dev_notes_spin
dev_notes_database
installation_locally
database_upgrades

.. toctree::
:maxdepth: 2
27 changes: 11 additions & 16 deletions scripts/create_registry_schema.py
@@ -18,6 +18,12 @@
load_preset_keywords,
DEFAULT_NAMESPACE,
)
from dataregistry.schema.schema_version import (
_DB_VERSION_MAJOR,
_DB_VERSION_MINOR,
_DB_VERSION_PATCH,
_DB_VERSION_COMMENT
)
import logging

"""
@@ -266,17 +272,6 @@ def _BuildTable(schema, table_name, has_production, production):
return Model


- # ----------------
- # Database version
- # ----------------
-
- # The following should be adjusted whenever there is a change to the structure
- # of the database tables.
- _DB_VERSION_MAJOR = 3
- _DB_VERSION_MINOR = 3
- _DB_VERSION_PATCH = 0
- _DB_VERSION_COMMENT = "Remove `is_overwritten`, `replace_date` and `replace_uid` columns, the information is in `status`"

# ----------------------------
# Parse command line arguments
# ----------------------------
@@ -330,7 +325,7 @@ def _BuildTable(schema, table_name, has_production, production):
logging_level=logging.DEBUG)

if db_connection.dialect == "sqlite":
print(f"Creating sqlite database...")
print("Creating sqlite database...")
schema = None
elif schema == prod_schema:
print(f"Creating production schema {prod_schema}...")
@@ -357,9 +352,9 @@ def _BuildTable(schema, table_name, has_production, production):
except Exception:
raise RuntimeError("production schema does not exist or is ill-formed")
if (
result["db_version_major"][0]
!= _DB_VERSION_MAJOR | int(result["db_version_minor"][0])
> _DB_VERSION_MINOR
(result["db_version_major"][0]
!= _DB_VERSION_MAJOR) or (int(result["db_version_minor"][0])
> _DB_VERSION_MINOR)
):
raise RuntimeError("production schema version incompatible")

@@ -395,7 +390,7 @@ def _BuildTable(schema, table_name, has_production, production):
):
privs = "SELECT"
else:
privs = f"SELECT, INSERT, UPDATE"
privs = "SELECT, INSERT, UPDATE"
select_prv = (
f"GRANT {privs} ON ALL TABLES IN SCHEMA {schema} to {acct}"
)
69 changes: 69 additions & 0 deletions scripts/schema_migration/update_relative_path_null_constraint.py
@@ -0,0 +1,69 @@
import os
import argparse
from sqlalchemy import text
from dataregistry.db_basic import DbConnection
Review comment (Collaborator):
Should we list on the docs page a change log history of these kinds of changes?

Reply (Collaborator, author):
Perhaps. I didn't attempt to get this ready for a release (updating version, adding anything to CHANGELOG.md). With more than one person working in the repo I suspect it might be easier to manage if upgrades involving version changes are separated into two PRs:

  • code updates (maybe also change version to a value like 1.2.3rc1 so it's clear it's not a release?)
  • anything having specifically to do with release, which maybe is just setting version and adding to CHANGELOG.md

from dataregistry.db_basic import _insert_provenance
from dataregistry.schema.schema_version import (
_DB_VERSION_MAJOR,
_DB_VERSION_MINOR,
_DB_VERSION_PATCH,
_DB_VERSION_COMMENT
)

parser = argparse.ArgumentParser(
description="Update specified schema, using specified config, to make dataset.relative_path nullable",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument("schema",
help="name of schema whose tables are to be modified.")

home = os.getenv('HOME')
alt_config = os.path.join(home, '.config_100_alt_admin')
parser.add_argument("--config", help="Path to the data registry config file. Determines database (regular or alt) to be modified", default=alt_config)
parser.add_argument("--steps", choices=['mod_schema', 'mod_data', 'both'],
default='mod_schema')
args = parser.parse_args()

if args.schema.endswith('production'):
assoc_production = args.schema
entry_mode = 'production'
elif args.schema.endswith('working'):
assoc_production = args.schema.replace('working', 'production')
entry_mode = 'working'
else:
raise ValueError('Schema name must end with "production" or "working"')

query_mode = entry_mode

db_connection = DbConnection(schema=args.schema, config_file=args.config,
entry_mode=entry_mode, query_mode=query_mode)

if args.steps in ['mod_schema', 'both']:
# Update the schema:
alter_col = f"alter table {args.schema}.dataset alter column relative_path drop not null"

print("To be executed: ", alter_col)
with db_connection.engine.connect() as conn:
conn.execute(text(alter_col))
conn.commit()

# If we got this far add a row to the provenance table
_insert_provenance(
db_connection,
_DB_VERSION_MAJOR,
_DB_VERSION_MINOR,
_DB_VERSION_PATCH,
"MIGRATE",
comment=_DB_VERSION_COMMENT,
associated_production=assoc_production
)

if args.steps in ['mod_data', 'both']:
# Update entries which should have relative_path set to NULL, now that
# it's possible
upd_col = f" update {args.schema}.dataset set relative_path = NULL where location_type in ('external', 'meta_only')"

print("to be executed: ", upd_col)
with db_connection.engine.connect() as conn:
conn.execute(text(upd_col))
conn.commit()
2 changes: 1 addition & 1 deletion src/dataregistry/_version.py
@@ -1 +1 @@
__version__ = "1.1.1"
__version__ = "1.2.0"
16 changes: 13 additions & 3 deletions src/dataregistry/registrar/dataset.py
@@ -95,6 +95,11 @@ def _validate_register_inputs(
"External datasets require either a url or contact_email"
)

# For location_type "external" or "meta_only", relative_path
# must be None
if kwargs_dict["location_type"] in ["external", "meta_only"]:
kwargs_dict["relative_path"] = None
Review comment (Collaborator):
After _validate_register_inputs, back in the register function, there is the relative_path from name auto generation. As the relative_path gets set to None before then, is the auto generation then undoing this? (line 454 of register.py)

Reply (Collaborator, author):
Indeed. That's what should be changed rather than _validate_register_inputs. Will fix.

# Make sure the user-passed `relative_path` is legal
# Only needed for `register` function, `replace` has no `relative_path`
# argument as this cannot be changed from the original `register`
@@ -397,7 +402,7 @@ def register(
Absolute location of dataset to copy into the data registry.

If None, dataset should already be at correct relative_path within
- the data registry.
+ the data registry (for datasets of location_type "dataregistry").
owner** : str, optional
owner_type** : str, optional
execution_name** : str, optional
@@ -422,7 +427,8 @@
contact_email**: str, optional
test_production: boolean, default False. Set to True for testing
code for production owner_type
- relative_path** : str, optional
+ relative_path** : str, optional. Always None for datasets with
+   location_type "external" or "meta_only"
kwargs_dict : dict
Stores all the keyword arguments passed to this function (and
defaults). Automatically generated by the decorator, do not pass
@@ -443,7 +449,11 @@
self._compute_version_string(name, version, kwargs_dict)

# If `relative_path` not passed, automatically generate it
if kwargs_dict["relative_path"] is None:
# But for location types "external" and "meta_only" it should
# be None
if kwargs_dict["location_type"] in ["external", "meta_only"]:
kwargs_dict["relative_path"] = None
elif kwargs_dict["relative_path"] is None:
kwargs_dict["relative_path"] = _relpath_from_name(
name, kwargs_dict["version_string"], kwargs_dict["old_location"]
)
1 change: 0 additions & 1 deletion src/dataregistry/schema/schema.yaml
@@ -341,7 +341,6 @@ tables:
relative_path:
type: "String"
description: "Relative path storing the data, relative to `<root_dir>`. If None, generated from the `name` and `version_string`"
- nullable: False
cli_optional: True
version_major:
type: "Integer"
15 changes: 15 additions & 0 deletions src/dataregistry/schema/schema_version.py
@@ -0,0 +1,15 @@
'''
Store major, minor, and patch for the db version expected by this code
version, as well as the associated comment for the provenance table.
These quantities should be updated whenever there is a change to the schema
structure.
The information is only needed when creating a new schema or when
modifying the schema in place.
'''
_DB_VERSION_MAJOR = 3
_DB_VERSION_MINOR = 4
_DB_VERSION_PATCH = 0
_DB_VERSION_COMMENT = "Allow dataset.relative_path to be NULL"

__all__ = ["_DB_VERSION_MAJOR", "_DB_VERSION_MINOR", "_DB_VERSION_PATCH",
"_DB_VERSION_COMMENT"]