diff --git a/CHANGELOG.md b/CHANGELOG.md
index a49e3cf..53eaf51 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,11 @@
+## Version 1.2.0
+
+- Make `dataset.relative_path` nullable
+- Adjust code to set `dataset.relative_path` to null for datasets of
+  location type `external` and `meta_only`
+- Add documentation for in-place database upgrades
+- Add script for performing the db upgrade
+
 ## Version 1.1.1
 
 - Move `get_keyword_list()` function to the `Query()` class.
@@ -111,7 +119,7 @@ list the proper join would not be made.  This has been corrected.
 ## Version 0.5.3
 
 - Update the `schema.yaml` file to include unique constraints and table
-  indexes. 
+  indexes.
 - Update the unique constraints for the dataset table to be `owner`,
   `owner_type`, `name`, `version`, `version_suffix`.
 
diff --git a/docs/source/database_upgrades.rst b/docs/source/database_upgrades.rst
new file mode 100644
index 0000000..d82a2db
--- /dev/null
+++ b/docs/source/database_upgrades.rst
@@ -0,0 +1,61 @@
+
+Database Upgrades
+=================
+
+.. _database_upgrades:
+
+
+Creating an alternate database
+******************************
+
+If it doesn't already exist, create an alternate database from ``psql`` using
+the admin account:
+
+.. code-block:: bash
+
+   $ psql desc_data_registry reg_admin
+   desc_data_registry=# create database alt_db;
+
+and add an entry to your ``.pgpass`` for the ``reg_admin`` account and the
+``alt_db`` database.  If you expect to use standard ``dataregistry`` utilities
+for your updates (recommended), you'll also need an alternate config file for
+connecting to the alternate database as ``reg_admin``.
+
+Dump the active database
+************************
+
+Dump both schemas and data from the ``desc_data_registry`` database:
+
+.. code-block:: bash
+
+   $ pg_dump -U reg_admin desc_data_registry --schema=lsst_desc_production --file=production_dump.sql
+   $ pg_dump -U reg_admin desc_data_registry --schema=lsst_desc_working --file=working_dump.sql
+
+See the ``pg_dump`` documentation for a description of all options.  For
+example, you might want to use a different format than the default plain-text
+one used here.
+
+Restore to alt_db database
+**************************
+
+.. code-block:: bash
+
+   $ psql -U reg_admin -X --set ON_ERROR_STOP=on alt_db < production_dump.sql
+   $ psql -U reg_admin -X --set ON_ERROR_STOP=on alt_db < working_dump.sql
+
+Depending on the format of your dumped data, you might instead need to use the
+``pg_restore`` program rather than ``psql`` to do the restore.
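+
+For example, if you dumped in ``pg_dump``'s custom format, you must restore
+with ``pg_restore`` rather than ``psql`` (flags and file name here are
+illustrative):
+
+.. code-block:: bash
+
+   $ pg_dump -U reg_admin -Fc desc_data_registry --schema=lsst_desc_production --file=production_dump.custom
+   $ pg_restore -U reg_admin -d alt_db production_dump.custom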
+
+Test and apply for real
+***********************
+
+If the update involves changes to the schema, you'll need a script to
+implement them and also to add an entry to the ``provenance`` table.  You'll
+also need to update the database version as stored in
+``dataregistry/src/dataregistry/schema/schema_version.py``.
+If the update involves changes to entries stored in the database, you'll need
+a script for that as well (or, if more convenient, use a single script for
+both).  See examples in the dataregistry GitHub repo under
+``dataregistry/scripts/schema_migration/``.
+
+Run your script(s) in ``alt_db``, fix any issues, then run them in the
+``desc_data_registry``.  Once you're satisfied everything is ok, delete the
+copy of the schemas in ``alt_db``.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 94e1678..9e3761a 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -25,7 +25,7 @@ Getting started
 
 This documentation is to help you get set up using the ``dataregistry`` Python
 package; covering installation, how to register datasets, and how to query for
-them. 
+them.
 
 .. toctree::
    :maxdepth: 2
@@ -61,6 +61,7 @@ them.
    dev_notes_spin
    dev_notes_database
    installation_locally
+   database_upgrades
 
 .. toctree::
    :maxdepth: 2
diff --git a/scripts/create_registry_schema.py b/scripts/create_registry_schema.py
index e37786f..db12064 100644
--- a/scripts/create_registry_schema.py
+++ b/scripts/create_registry_schema.py
@@ -18,6 +18,12 @@
     load_preset_keywords,
     DEFAULT_NAMESPACE,
 )
+from dataregistry.schema.schema_version import (
+    _DB_VERSION_MAJOR,
+    _DB_VERSION_MINOR,
+    _DB_VERSION_PATCH,
+    _DB_VERSION_COMMENT
+)
 import logging
 
 """
@@ -266,17 +272,6 @@ def _BuildTable(schema, table_name, has_production, production):
     return Model
 
 
-# ----------------
-# Database version
-# ----------------
-
-# The following should be adjusted whenever there is a change to the structure
-# of the database tables.
-_DB_VERSION_MAJOR = 3
-_DB_VERSION_MINOR = 3
-_DB_VERSION_PATCH = 0
-_DB_VERSION_COMMENT = "Remove `is_overwritten`, `replace_date` and `replace_uid` columns, the information is in `status`"
-
 # ----------------------------
 # Parse command line arguments
 # ----------------------------
@@ -330,7 +325,7 @@ def _BuildTable(schema, table_name, has_production, production):
         logging_level=logging.DEBUG)
 
     if db_connection.dialect == "sqlite":
-        print(f"Creating sqlite database...")
+        print("Creating sqlite database...")
         schema = None
     elif schema == prod_schema:
         print(f"Creating production schema {prod_schema}...")
@@ -357,9 +352,9 @@ def _BuildTable(schema, table_name, has_production, production):
         except Exception:
             raise RuntimeError("production schema does not exist or is ill-formed")
         if (
-            result["db_version_major"][0]
-            != _DB_VERSION_MAJOR | int(result["db_version_minor"][0])
-            > _DB_VERSION_MINOR
+            (result["db_version_major"][0] != _DB_VERSION_MAJOR)
+            or (int(result["db_version_minor"][0]) > _DB_VERSION_MINOR)
         ):
             raise RuntimeError("production schema version incompatible")
 
@@ -395,7 +390,7 @@ def _BuildTable(schema, table_name, has_production, production):
             ):
                 privs = "SELECT"
             else:
-                privs = f"SELECT, INSERT, UPDATE"
+                privs = "SELECT, INSERT, UPDATE"
             select_prv = (
                 f"GRANT {privs} ON ALL TABLES IN SCHEMA {schema} to {acct}"
             )
diff --git a/scripts/schema_migration/update_relative_path_null_constraint.py b/scripts/schema_migration/update_relative_path_null_constraint.py
new file mode 100644
index 0000000..f6c94f7
--- /dev/null
+++ b/scripts/schema_migration/update_relative_path_null_constraint.py
@@ -0,0 +1,69 @@
+import os
+import argparse
+from sqlalchemy import text
+from dataregistry.db_basic import DbConnection
+from dataregistry.db_basic import _insert_provenance
+from dataregistry.schema.schema_version import (
+    _DB_VERSION_MAJOR,
+    _DB_VERSION_MINOR,
+    _DB_VERSION_PATCH,
+    _DB_VERSION_COMMENT
+)
+
+parser = argparse.ArgumentParser(
+    description="Update specified schema, using specified config, to make dataset.relative_path nullable",
+    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+)
+parser.add_argument("schema",
+                    help="Name of the schema whose tables are to be modified.")
+
+home = os.getenv('HOME')
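+# Default config file for connecting to the alternate database as reg_admin.
+# The file name below is site-specific (an assumption); adjust it for your
+# own setup.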
+alt_config = os.path.join(home, '.config_100_alt_admin')
+parser.add_argument("--config",
+                    help="Path to the data registry config file. Determines which database (regular or alt) is to be modified",
+                    default=alt_config)
+parser.add_argument("--steps", choices=['mod_schema', 'mod_data', 'both'],
+                    default='mod_schema')
+args = parser.parse_args()
+
+if args.schema.endswith('production'):
+    assoc_production = args.schema
+    entry_mode = 'production'
+elif args.schema.endswith('working'):
+    assoc_production = args.schema.replace('working', 'production')
+    entry_mode = 'working'
+else:
+    raise ValueError('Schema name must end with "production" or "working"')
+
+query_mode = entry_mode
+
+db_connection = DbConnection(schema=args.schema, config_file=args.config,
+                             entry_mode=entry_mode, query_mode=query_mode)
+
+if args.steps in ['mod_schema', 'both']:
+    # Update the schema:
+    alter_col = f"alter table {args.schema}.dataset alter column relative_path drop not null"
+
+    print("To be executed: ", alter_col)
+    with db_connection.engine.connect() as conn:
+        conn.execute(text(alter_col))
+        conn.commit()
+
+    # If we got this far, add a row to the provenance table
+    _insert_provenance(
+        db_connection,
+        _DB_VERSION_MAJOR,
+        _DB_VERSION_MINOR,
+        _DB_VERSION_PATCH,
+        "MIGRATE",
+        comment=_DB_VERSION_COMMENT,
+        associated_production=assoc_production
+    )
+
+if args.steps in ['mod_data', 'both']:
+    # Update entries which should have relative_path set to NULL, now that
+    # it's possible
+    upd_col = f"update {args.schema}.dataset set relative_path = NULL where location_type in ('external', 'meta_only')"
+
+    print("To be executed: ", upd_col)
+    with db_connection.engine.connect() as conn:
+        conn.execute(text(upd_col))
+        conn.commit()
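+
+# Example use (schema name illustrative): rehearse against the alternate
+# database first, then rerun with --config pointing at the production config:
+#     python update_relative_path_null_constraint.py lsst_desc_working --steps both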
diff --git a/src/dataregistry/_version.py b/src/dataregistry/_version.py
index a82b376..c68196d 100644
--- a/src/dataregistry/_version.py
+++ b/src/dataregistry/_version.py
@@ -1 +1 @@
-__version__ = "1.1.1"
+__version__ = "1.2.0"
diff --git a/src/dataregistry/registrar/dataset.py b/src/dataregistry/registrar/dataset.py
index 80924af..754ff05 100644
--- a/src/dataregistry/registrar/dataset.py
+++ b/src/dataregistry/registrar/dataset.py
@@ -95,6 +95,11 @@ def _validate_register_inputs(
                 "External datasets require either a url or contact_email"
             )
 
+        # For location_type "external" or "meta_only", relative_path
+        # must be None
+        if kwargs_dict["location_type"] in ["external", "meta_only"]:
+            kwargs_dict["relative_path"] = None
+
         # Make sure the user passed `relative_path` is legal
         # Only needed for `register` function, `replace` has no `relative_path`
         # argument as this cannot be changed from the original `register`
@@ -397,7 +402,7 @@ def register(
             Absolute location of dataset to copy into the data registry.
 
             If None, dataset should already be at correct relative_path within
-            the data registry.
+            the data registry (for datasets of location_type "dataregistry").
         owner** : str, optional
         owner_type** : str, optional
         execution_name** : str, optional
@@ -422,7 +427,8 @@ def register(
         contact_email**: str, optional
         test_production: boolean, default False. Set to True for testing
             code for production owner_type
-        relative_path** : str, optional
+        relative_path** : str, optional. Always None for datasets with
+            location_type "external" or "meta_only"
         kwargs_dict : dict
             Stores all the keyword arguments passed to this function (and
             defaults). Automatically generated by the decorator, do not pass
@@ -443,7 +449,11 @@ def register(
             self._compute_version_string(name, version, kwargs_dict)
 
         # If `relative_path` not passed, automatically generate it
-        if kwargs_dict["relative_path"] is None:
+        # But for location types "external" and "meta_only" it should
+        # be None
+        if kwargs_dict["location_type"] in ["external", "meta_only"]:
+            kwargs_dict["relative_path"] = None
+        elif kwargs_dict["relative_path"] is None:
             kwargs_dict["relative_path"] = _relpath_from_name(
                 name, kwargs_dict["version_string"], kwargs_dict["old_location"]
             )
diff --git a/src/dataregistry/schema/schema.yaml b/src/dataregistry/schema/schema.yaml
index c0f44f3..a797407 100644
--- a/src/dataregistry/schema/schema.yaml
+++ b/src/dataregistry/schema/schema.yaml
@@ -341,7 +341,6 @@ tables:
     relative_path:
       type: "String"
      description: "Relative path storing the data, relative to ``. If None, generated from the `name` and `version_string`"
-      nullable: False
       cli_optional: True
     version_major:
       type: "Integer"
diff --git a/src/dataregistry/schema/schema_version.py b/src/dataregistry/schema/schema_version.py
new file mode 100644
index 0000000..1fb025e
--- /dev/null
+++ b/src/dataregistry/schema/schema_version.py
@@ -0,0 +1,15 @@
+'''
+Store major, minor, patch for the db version expected by this code version,
+as well as the associated comment for the provenance table.
+These quantities should be updated whenever there is a change to the schema
+structure.
+The information is only needed when creating a new schema or when
+modifying the schema in place.
+'''
+_DB_VERSION_MAJOR = 3
+_DB_VERSION_MINOR = 4
+_DB_VERSION_PATCH = 0
+_DB_VERSION_COMMENT = "Allow dataset.relative_path to be NULL"
+
+__all__ = ["_DB_VERSION_MAJOR", "_DB_VERSION_MINOR", "_DB_VERSION_PATCH",
+           "_DB_VERSION_COMMENT"]
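To illustrate the net effect of these changes for users (a sketch only: the
dataset name and url are hypothetical, and `DataRegistry()` is assumed to pick
up a default config and schema):

    from dataregistry import DataRegistry

    datareg = DataRegistry()  # assumed default connection setup

    # External datasets carry no files inside the registry, so as of this
    # release their relative_path is stored as NULL instead of an
    # auto-generated path.
    datareg.Registrar.dataset.register(
        "my_external_dataset",
        "1.0.0",
        location_type="external",
        url="https://example.org/catalog",
    )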