From a3677e00273873bf37318775c371cc3c558282fc Mon Sep 17 00:00:00 2001 From: bailliekova Date: Tue, 15 Oct 2024 09:39:31 -0500 Subject: [PATCH 01/16] update modles --- gerrydb_meta/models.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gerrydb_meta/models.py b/gerrydb_meta/models.py index bd184d2..2e9b059 100644 --- a/gerrydb_meta/models.py +++ b/gerrydb_meta/models.py @@ -549,16 +549,17 @@ class ColumnValue(Base): __tablename__ = "column_value" __table_args__ = (UniqueConstraint("col_id", "geo_id", "valid_from"),) - val_id: Mapped[int] = mapped_column(BigInteger, primary_key=True) col_id: Mapped[int] = mapped_column( Integer, ForeignKey("column.col_id"), nullable=False, + primary_key=True, ) geo_id: Mapped[int] = mapped_column( Integer, ForeignKey("geography.geo_id"), nullable=False, + primary_key=True, ) meta_id: Mapped[int] = mapped_column( Integer, ForeignKey("meta.meta_id"), nullable=False @@ -566,6 +567,7 @@ class ColumnValue(Base): valid_from: Mapped[datetime] = mapped_column( DateTime(timezone=True), nullable=False, + primary_key=True, ) valid_to: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=True) @@ -573,7 +575,6 @@ class ColumnValue(Base): val_int: Mapped[int] = mapped_column(BigInteger, nullable=True) val_str: Mapped[str] = mapped_column(Text, nullable=True) val_bool: Mapped[bool] = mapped_column(Boolean, nullable=True) - val_json: Mapped[Any] = mapped_column(postgresql.JSONB, nullable=True) meta: Mapped[ObjectMeta] = relationship("ObjectMeta") From 85b9d5706f74c9dcc3836e4ab460b09e5b515638 Mon Sep 17 00:00:00 2001 From: bailliekova Date: Tue, 15 Oct 2024 12:25:05 -0500 Subject: [PATCH 02/16] update set column value and tests --- gerrydb_meta/crud/column.py | 11 ++++++----- tests/api/test_column_value.py | 3 +-- tests/crud/test_column.py | 3 --- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/gerrydb_meta/crud/column.py b/gerrydb_meta/crud/column.py index 170cc38..014ed3c 100644 --- a/gerrydb_meta/crud/column.py +++ b/gerrydb_meta/crud/column.py @@ -202,8 +202,8 @@ def set_values( # Add the new column values and invalidate the old ones where present. geo_ids = [geo.geo_id for geo, _ in values] - with_values = ( - db.query(models.ColumnValue.val_id) + with_tuples = ( + db.query(models.ColumnValue.col_id,models.ColumnValue.geo_id,models.ColumnValue.valid_from) .filter( models.ColumnValue.col_id == col.col_id, models.ColumnValue.geo_id.in_(geo_ids), @@ -211,6 +211,8 @@ def set_values( ) .all() ) + + with_values=["_".join([str(val) for val in tup]) for tup in with_tuples] with db.begin(nested=True): db.execute(insert(models.ColumnValue), rows) @@ -220,9 +222,8 @@ def set_values( db.execute( update(models.ColumnValue) .where( - models.ColumnValue.val_id.in_( - val.val_id for val in with_values - ), + "_".join([str(models.ColumnValue.col_id), str(models.ColumnValue.geo_id), str(models.ColumnValue.valid_from)]) + in with_values ) .values(valid_to=now) ) diff --git a/tests/api/test_column_value.py b/tests/api/test_column_value.py index 5689169..ef006f0 100644 --- a/tests/api/test_column_value.py +++ b/tests/api/test_column_value.py @@ -18,9 +18,8 @@ (ColumnType.FLOAT, 1.0, float("inf")), (ColumnType.BOOL, True, False), (ColumnType.STR, "", "abc"), - (ColumnType.JSON, {"key": "value"}, [1, 2, "3"]), ], - ids=("int", "float", "bool", "str", "json"), + ids=("int", "float", "bool", "str"), ) def test_api_column_value_set__two_geos(ctx_public_namespace_read_write, typed_vals): col_type, val1, val2 = typed_vals diff --git a/tests/crud/test_column.py b/tests/crud/test_column.py index 4815fcb..2ff335e 100644 --- a/tests/crud/test_column.py +++ b/tests/crud/test_column.py @@ -204,9 +204,6 @@ def test_crud_column_set_values(db_with_meta): assert cols_list[0].val_bool is None assert cols_list[1].val_bool is None - assert cols_list[0].val_json is None - assert cols_list[1].val_json is None - def test_crud_column_patch(db_with_meta): db, meta = db_with_meta From 5a27fa82fee5c4ea24d908d9ac461c665320507c Mon Sep 17 00:00:00 2001 From: bailliekova Date: Tue, 15 Oct 2024 12:25:18 -0500 Subject: [PATCH 03/16] updated poetry.lock file --- poetry.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index aa4026c..1c89047 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "alembic" @@ -1659,4 +1659,4 @@ watchdog = ["watchdog (>=2.3)"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "2f0751dcf294170c7afc8ca9e23e0868e796401dd14a19f4b5d03ff3670be7ca" +content-hash = "0b26ea671bd11c1b848083cd337fda53edc29b642b40a4502f097be19a1bc6ad" From 2d6df9fa343975b9f83df50e648677d3cff5e76b Mon Sep 17 00:00:00 2001 From: bailliekova Date: Tue, 15 Oct 2024 12:29:41 -0500 Subject: [PATCH 04/16] fix formatting --- gerrydb_meta/crud/column.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/gerrydb_meta/crud/column.py b/gerrydb_meta/crud/column.py index 014ed3c..129e477 100644 --- a/gerrydb_meta/crud/column.py +++ b/gerrydb_meta/crud/column.py @@ -203,7 +203,11 @@ def set_values( # Add the new column values and invalidate the old ones where present. geo_ids = [geo.geo_id for geo, _ in values] with_tuples = ( - db.query(models.ColumnValue.col_id,models.ColumnValue.geo_id,models.ColumnValue.valid_from) + db.query( + models.ColumnValue.col_id, + models.ColumnValue.geo_id, + models.ColumnValue.valid_from, + ) .filter( models.ColumnValue.col_id == col.col_id, models.ColumnValue.geo_id.in_(geo_ids), @@ -211,8 +215,8 @@ def set_values( ) .all() ) - - with_values=["_".join([str(val) for val in tup]) for tup in with_tuples] + + with_values = ["_".join([str(val) for val in tup]) for tup in with_tuples] with db.begin(nested=True): db.execute(insert(models.ColumnValue), rows) @@ -222,8 +226,14 @@ def set_values( db.execute( update(models.ColumnValue) .where( - "_".join([str(models.ColumnValue.col_id), str(models.ColumnValue.geo_id), str(models.ColumnValue.valid_from)]) - in with_values + "_".join( + [ + str(models.ColumnValue.col_id), + str(models.ColumnValue.geo_id), + str(models.ColumnValue.valid_from), + ] + ) + in with_values ) .values(valid_to=now) ) From 147cca54f679dc9befabb291d8694ef629e75237 Mon Sep 17 00:00:00 2001 From: bailliekova Date: Mon, 18 Nov 2024 14:32:36 -0600 Subject: [PATCH 05/16] just file with todo --- gerrydb_meta/crud/geography.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gerrydb_meta/crud/geography.py b/gerrydb_meta/crud/geography.py index fa600e2..dbf7d3b 100644 --- a/gerrydb_meta/crud/geography.py +++ b/gerrydb_meta/crud/geography.py @@ -60,6 +60,9 @@ def create_bulk( "Cannot create geographies with duplicate paths.", paths=[path for path in paths if paths.count(path) > 1], ) + + #THIS IS WHERE WE MAKE GEOIDS, ANNA! + #TODO: also create a column_value empty partition for the geoid. (after the foreign key exists, obvs) with db.begin(nested=True): geos = list( From 4d880654cb9a31a1f843f9d4f1d7f55632b8862d Mon Sep 17 00:00:00 2001 From: bailliekova Date: Thu, 23 Jan 2025 03:43:00 -0600 Subject: [PATCH 06/16] in flight view work --- gerrydb_meta/crud/view.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/gerrydb_meta/crud/view.py b/gerrydb_meta/crud/view.py index c61f043..0c77af0 100644 --- a/gerrydb_meta/crud/view.py +++ b/gerrydb_meta/crud/view.py @@ -478,6 +478,40 @@ def render(self, db: Session, *, view: models.View) -> ViewRenderContext: ), ) + def _agg_select(col)->str: + select_str=f""" + + MAX(case when col_id = {col.col_id} then {COLUMN_TYPE_TO_VALUE_COLUMN[col.type]} else null) as {col.canonical_ref.path} + """ + return select_str + + def _column_sub(cols, geoids, view_at,)-> str: + COLUMN_VALUE_TABLE_NAME = "column_value" + + agg_selects=[] + for col in cols: + agg_selects.append( + f""" + + MAX(case when col_id = {col.col_id} then {COLUMN_TYPE_TO_VALUE_COLUMN[col.type]} else null) as {col.canonical_ref.path} + """ + ) + + + + subquery= f""" + SELECT + geoid, + {",\n".join([_agg_select(col) for col in cols]) + } + FROM {COLUMN_VALUE_TABLE_NAME} + WHERE column_id in ({",".join([col.id for col in cols])}) + AND valid_from <= {view_at} + AND (valid_to is NONE OR valid_to >= {view_at}) + """ + + return subquery + def _geo_meta( self, db: Session, view: models.View ) -> tuple[dict[str, int], dict[int, models.ObjectMeta]]: From beba27181b9902f7f1450070f226e593e7bcbd93 Mon Sep 17 00:00:00 2001 From: bailliekova Date: Wed, 29 Jan 2025 01:45:03 -0600 Subject: [PATCH 07/16] partitioning by geo_id --- gerrydb_meta/crud/column.py | 9 ++++++++- gerrydb_meta/crud/geography.py | 12 +++++++----- gerrydb_meta/models.py | 16 ++++++++++++---- gerrydb_meta/utils.py | 7 +++++++ 4 files changed, 34 insertions(+), 10 deletions(-) create mode 100644 gerrydb_meta/utils.py diff --git a/gerrydb_meta/crud/column.py b/gerrydb_meta/crud/column.py index 129e477..703c901 100644 --- a/gerrydb_meta/crud/column.py +++ b/gerrydb_meta/crud/column.py @@ -5,13 +5,14 @@ from datetime import datetime, timezone from typing import Any, Collection, Tuple -from sqlalchemy import exc, insert, update +from sqlalchemy import exc, insert, update, text from sqlalchemy.orm import Session from gerrydb_meta import models, schemas from gerrydb_meta.crud.base import NamespacedCRBase, normalize_path from gerrydb_meta.enums import ColumnType from gerrydb_meta.exceptions import ColumnValueTypeError, CreateValueError +from gerrydb_meta.utils import create_column_value_partition_text log = logging.getLogger() @@ -168,6 +169,7 @@ def set_values( Raises: ColumnValueTypeError: If column types do not match expected types. """ + table_name=models.ColumnValue.__table__.name val_column = COLUMN_TYPE_TO_VALUE_COLUMN[col.type] now = datetime.now(timezone.utc) @@ -202,6 +204,11 @@ def set_values( # Add the new column values and invalidate the old ones where present. geo_ids = [geo.geo_id for geo, _ in values] + + #make sure partitions exist for all geos + for geo_id in set(geo_ids): + db.execute(create_column_value_partition_text(geo_id=geo_id)) + with_tuples = ( db.query( models.ColumnValue.col_id, diff --git a/gerrydb_meta/crud/geography.py b/gerrydb_meta/crud/geography.py index dbf7d3b..b09a047 100644 --- a/gerrydb_meta/crud/geography.py +++ b/gerrydb_meta/crud/geography.py @@ -7,13 +7,14 @@ from typing import Collection from geoalchemy2.elements import WKBElement -from sqlalchemy import and_, insert, or_, update +from sqlalchemy import and_, insert, or_, update, text from sqlalchemy.exc import StatementError from sqlalchemy.orm import Session from gerrydb_meta import models, schemas from gerrydb_meta.crud.base import NamespacedCRBase, normalize_path from gerrydb_meta.exceptions import BulkCreateError, BulkPatchError +from gerrydb_meta.utils import create_column_value_partition_text log = logging.getLogger() @@ -61,9 +62,6 @@ def create_bulk( paths=[path for path in paths if paths.count(path) > 1], ) - #THIS IS WHERE WE MAKE GEOIDS, ANNA! - #TODO: also create a column_value empty partition for the geoid. (after the foreign key exists, obvs) - with db.begin(nested=True): geos = list( db.scalars( @@ -80,6 +78,8 @@ def create_bulk( ], ) ) + for geo in geos: + db.execute(create_column_value_partition_text(geo_id=geo.geo_id)) try: geo_versions = list( @@ -112,9 +112,11 @@ def create_bulk( ) raise BulkCreateError( "Failed to insert geometries. Geometries must be encoded in WKB format." - ) from ex + ) from ex etag = self._update_etag(db, namespace) + + db.flush() return list(zip(geos, geo_versions)), etag diff --git a/gerrydb_meta/models.py b/gerrydb_meta/models.py index 2e9b059..cc53ec9 100644 --- a/gerrydb_meta/models.py +++ b/gerrydb_meta/models.py @@ -5,7 +5,7 @@ from uuid import UUID, uuid4 from geoalchemy2 import Geography as SqlGeography -from sqlalchemy import JSON, BigInteger, Boolean, CheckConstraint, DateTime +from sqlalchemy import JSON, BigInteger, Boolean, CheckConstraint, DateTime, text, event from sqlalchemy import Enum as SqlEnum from sqlalchemy import ( ForeignKey, @@ -17,8 +17,9 @@ UniqueConstraint, ) from sqlalchemy.dialects import postgresql -from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship +from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship, Session from sqlalchemy.sql import func +from sqlalchemy.types import CHAR from gerrydb_meta.enums import ( ColumnKind, @@ -27,8 +28,10 @@ ScopeType, ViewRenderStatus, ) +from gerrydb_meta.utils import create_column_value_partition_text -metadata_obj = MetaData(schema="gerrydb") +SCHEMA= "gerrydb" +metadata_obj = MetaData(schema=SCHEMA) class Base(DeclarativeBase): @@ -371,6 +374,10 @@ def full_path(self): """Path with namespace prefix.""" return f"/{self.namespace.path}/{self.path}" +@event.listens_for(Geography, "after_insert") +def create_geo_partition_in_column_value(mapper, connection, geo): + geo_id=geo.geo_id + Session.object_session(geo).execute(create_column_value_partition_text(geo_id=geo_id)) class GeoImport(Base): __tablename__ = "geo_import" @@ -547,7 +554,8 @@ class ColumnSetMember(Base): class ColumnValue(Base): __tablename__ = "column_value" - __table_args__ = (UniqueConstraint("col_id", "geo_id", "valid_from"),) + __table_args__ = (UniqueConstraint("col_id", "geo_id", "valid_from"), + {"postgresql_partition_by": "LIST (geo_id)" }) col_id: Mapped[int] = mapped_column( Integer, diff --git a/gerrydb_meta/utils.py b/gerrydb_meta/utils.py new file mode 100644 index 0000000..2f97df4 --- /dev/null +++ b/gerrydb_meta/utils.py @@ -0,0 +1,7 @@ +from sqlalchemy import text +from gerrydb_meta import models + +def create_column_value_partition_text(geo_id: int): + table_name=models.ColumnValue.__table__.name + sql=f"CREATE TABLE IF NOT EXISTS {models.SCHEMA}_{table_name}_{geo_id} PARTITION OF {models.SCHEMA}.{table_name} FOR VALUES IN ({geo_id})" + return text(sql) \ No newline at end of file From 6927fcbc3f6fed2b7685e0a848785e8f26d715dc Mon Sep 17 00:00:00 2001 From: bailliekova Date: Wed, 29 Jan 2025 01:59:09 -0600 Subject: [PATCH 08/16] clean up unused assigned vars and add test --- gerrydb_meta/crud/column.py | 1 - gerrydb_meta/models.py | 1 - gerrydb_meta/utils.py | 2 +- tests/test_utils.py | 9 +++++++++ 4 files changed, 10 insertions(+), 3 deletions(-) create mode 100644 tests/test_utils.py diff --git a/gerrydb_meta/crud/column.py b/gerrydb_meta/crud/column.py index 703c901..0b8789b 100644 --- a/gerrydb_meta/crud/column.py +++ b/gerrydb_meta/crud/column.py @@ -169,7 +169,6 @@ def set_values( Raises: ColumnValueTypeError: If column types do not match expected types. """ - table_name=models.ColumnValue.__table__.name val_column = COLUMN_TYPE_TO_VALUE_COLUMN[col.type] now = datetime.now(timezone.utc) diff --git a/gerrydb_meta/models.py b/gerrydb_meta/models.py index cc53ec9..c6b5e92 100644 --- a/gerrydb_meta/models.py +++ b/gerrydb_meta/models.py @@ -19,7 +19,6 @@ from sqlalchemy.dialects import postgresql from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship, Session from sqlalchemy.sql import func -from sqlalchemy.types import CHAR from gerrydb_meta.enums import ( ColumnKind, diff --git a/gerrydb_meta/utils.py b/gerrydb_meta/utils.py index 2f97df4..998a04c 100644 --- a/gerrydb_meta/utils.py +++ b/gerrydb_meta/utils.py @@ -3,5 +3,5 @@ def create_column_value_partition_text(geo_id: int): table_name=models.ColumnValue.__table__.name - sql=f"CREATE TABLE IF NOT EXISTS {models.SCHEMA}_{table_name}_{geo_id} PARTITION OF {models.SCHEMA}.{table_name} FOR VALUES IN ({geo_id})" + sql=f"CREATE TABLE IF NOT EXISTS {models.SCHEMA}.{table_name}_{geo_id} PARTITION OF {models.SCHEMA}.{table_name} FOR VALUES IN ({geo_id})" return text(sql) \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..01fc5d9 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,9 @@ +from gerrydb_meta.utils import create_column_value_partition_text +from sqlalchemy import text + +def test_create_column_value_partition_text(): + geo_id=42 + got=create_column_value_partition_text(geo_id=geo_id) + wanted=text("CREATE TABLE IF NOT EXISTS gerrydb.column_value_42 PARTITION OF gerrydb.column_value FOR VALUES IN (42)") + #different object instances, so compare string form + assert str(got)==str(wanted) \ No newline at end of file From d11a4002ba781c4e3f1ad567d2f0a757427e4ad6 Mon Sep 17 00:00:00 2001 From: bailliekova Date: Wed, 29 Jan 2025 02:02:46 -0600 Subject: [PATCH 09/16] fix formatting --- gerrydb_meta/crud/column.py | 2 +- gerrydb_meta/crud/geography.py | 6 ++---- gerrydb_meta/models.py | 16 +++++++++++----- gerrydb_meta/utils.py | 7 ++++--- tests/test_utils.py | 13 ++++++++----- 5 files changed, 26 insertions(+), 18 deletions(-) diff --git a/gerrydb_meta/crud/column.py b/gerrydb_meta/crud/column.py index 0b8789b..6e4bb52 100644 --- a/gerrydb_meta/crud/column.py +++ b/gerrydb_meta/crud/column.py @@ -204,7 +204,7 @@ def set_values( # Add the new column values and invalidate the old ones where present. geo_ids = [geo.geo_id for geo, _ in values] - #make sure partitions exist for all geos + # make sure partitions exist for all geos for geo_id in set(geo_ids): db.execute(create_column_value_partition_text(geo_id=geo_id)) diff --git a/gerrydb_meta/crud/geography.py b/gerrydb_meta/crud/geography.py index b09a047..8fdfcbd 100644 --- a/gerrydb_meta/crud/geography.py +++ b/gerrydb_meta/crud/geography.py @@ -61,7 +61,7 @@ def create_bulk( "Cannot create geographies with duplicate paths.", paths=[path for path in paths if paths.count(path) > 1], ) - + with db.begin(nested=True): geos = list( db.scalars( @@ -112,11 +112,9 @@ def create_bulk( ) raise BulkCreateError( "Failed to insert geometries. Geometries must be encoded in WKB format." - ) from ex + ) from ex etag = self._update_etag(db, namespace) - - db.flush() return list(zip(geos, geo_versions)), etag diff --git a/gerrydb_meta/models.py b/gerrydb_meta/models.py index c6b5e92..1ebd586 100644 --- a/gerrydb_meta/models.py +++ b/gerrydb_meta/models.py @@ -29,7 +29,7 @@ ) from gerrydb_meta.utils import create_column_value_partition_text -SCHEMA= "gerrydb" +SCHEMA = "gerrydb" metadata_obj = MetaData(schema=SCHEMA) @@ -373,10 +373,14 @@ def full_path(self): """Path with namespace prefix.""" return f"/{self.namespace.path}/{self.path}" + @event.listens_for(Geography, "after_insert") def create_geo_partition_in_column_value(mapper, connection, geo): - geo_id=geo.geo_id - Session.object_session(geo).execute(create_column_value_partition_text(geo_id=geo_id)) + geo_id = geo.geo_id + Session.object_session(geo).execute( + create_column_value_partition_text(geo_id=geo_id) + ) + class GeoImport(Base): __tablename__ = "geo_import" @@ -553,8 +557,10 @@ class ColumnSetMember(Base): class ColumnValue(Base): __tablename__ = "column_value" - __table_args__ = (UniqueConstraint("col_id", "geo_id", "valid_from"), - {"postgresql_partition_by": "LIST (geo_id)" }) + __table_args__ = ( + UniqueConstraint("col_id", "geo_id", "valid_from"), + {"postgresql_partition_by": "LIST (geo_id)"}, + ) col_id: Mapped[int] = mapped_column( Integer, diff --git a/gerrydb_meta/utils.py b/gerrydb_meta/utils.py index 998a04c..a6c9099 100644 --- a/gerrydb_meta/utils.py +++ b/gerrydb_meta/utils.py @@ -1,7 +1,8 @@ from sqlalchemy import text from gerrydb_meta import models + def create_column_value_partition_text(geo_id: int): - table_name=models.ColumnValue.__table__.name - sql=f"CREATE TABLE IF NOT EXISTS {models.SCHEMA}.{table_name}_{geo_id} PARTITION OF {models.SCHEMA}.{table_name} FOR VALUES IN ({geo_id})" - return text(sql) \ No newline at end of file + table_name = models.ColumnValue.__table__.name + sql = f"CREATE TABLE IF NOT EXISTS {models.SCHEMA}.{table_name}_{geo_id} PARTITION OF {models.SCHEMA}.{table_name} FOR VALUES IN ({geo_id})" + return text(sql) diff --git a/tests/test_utils.py b/tests/test_utils.py index 01fc5d9..ea47a19 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,9 +1,12 @@ from gerrydb_meta.utils import create_column_value_partition_text from sqlalchemy import text + def test_create_column_value_partition_text(): - geo_id=42 - got=create_column_value_partition_text(geo_id=geo_id) - wanted=text("CREATE TABLE IF NOT EXISTS gerrydb.column_value_42 PARTITION OF gerrydb.column_value FOR VALUES IN (42)") - #different object instances, so compare string form - assert str(got)==str(wanted) \ No newline at end of file + geo_id = 42 + got = create_column_value_partition_text(geo_id=geo_id) + wanted = text( + "CREATE TABLE IF NOT EXISTS gerrydb.column_value_42 PARTITION OF gerrydb.column_value FOR VALUES IN (42)" + ) + # different object instances, so compare string form + assert str(got) == str(wanted) From 9ca3669232440de8efec3a6b103954f2d7250f74 Mon Sep 17 00:00:00 2001 From: bailliekova Date: Sat, 1 Feb 2025 10:07:30 -0600 Subject: [PATCH 10/16] switch partitioning to column --- gerrydb_meta/crud/column.py | 8 +++++--- gerrydb_meta/crud/geography.py | 3 --- gerrydb_meta/models.py | 2 +- gerrydb_meta/utils.py | 4 ++-- tests/test_utils.py | 4 ++-- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/gerrydb_meta/crud/column.py b/gerrydb_meta/crud/column.py index 6e4bb52..d5465db 100644 --- a/gerrydb_meta/crud/column.py +++ b/gerrydb_meta/crud/column.py @@ -81,6 +81,9 @@ def create( canonical_ref.col_id = col.col_id db.flush() + #create partition + db.execute(create_column_value_partition_text(column_id=col.col_id)) + # Create additional aliases (non-canonical references) to the column. if obj_in.aliases: self._add_aliases( @@ -204,9 +207,8 @@ def set_values( # Add the new column values and invalidate the old ones where present. geo_ids = [geo.geo_id for geo, _ in values] - # make sure partitions exist for all geos - for geo_id in set(geo_ids): - db.execute(create_column_value_partition_text(geo_id=geo_id)) + # make sure partition exists for column + db.execute(create_column_value_partition_text(column_id=col.col_id)) with_tuples = ( db.query( diff --git a/gerrydb_meta/crud/geography.py b/gerrydb_meta/crud/geography.py index 8fdfcbd..1464c16 100644 --- a/gerrydb_meta/crud/geography.py +++ b/gerrydb_meta/crud/geography.py @@ -14,7 +14,6 @@ from gerrydb_meta import models, schemas from gerrydb_meta.crud.base import NamespacedCRBase, normalize_path from gerrydb_meta.exceptions import BulkCreateError, BulkPatchError -from gerrydb_meta.utils import create_column_value_partition_text log = logging.getLogger() @@ -78,8 +77,6 @@ def create_bulk( ], ) ) - for geo in geos: - db.execute(create_column_value_partition_text(geo_id=geo.geo_id)) try: geo_versions = list( diff --git a/gerrydb_meta/models.py b/gerrydb_meta/models.py index 1ebd586..49b24de 100644 --- a/gerrydb_meta/models.py +++ b/gerrydb_meta/models.py @@ -559,7 +559,7 @@ class ColumnValue(Base): __tablename__ = "column_value" __table_args__ = ( UniqueConstraint("col_id", "geo_id", "valid_from"), - {"postgresql_partition_by": "LIST (geo_id)"}, + {"postgresql_partition_by": "LIST (col_id)"}, ) col_id: Mapped[int] = mapped_column( diff --git a/gerrydb_meta/utils.py b/gerrydb_meta/utils.py index a6c9099..f17d018 100644 --- a/gerrydb_meta/utils.py +++ b/gerrydb_meta/utils.py @@ -2,7 +2,7 @@ from gerrydb_meta import models -def create_column_value_partition_text(geo_id: int): +def create_column_value_partition_text(column_id: int): table_name = models.ColumnValue.__table__.name - sql = f"CREATE TABLE IF NOT EXISTS {models.SCHEMA}.{table_name}_{geo_id} PARTITION OF {models.SCHEMA}.{table_name} FOR VALUES IN ({geo_id})" + sql = f"CREATE TABLE IF NOT EXISTS {models.SCHEMA}.{table_name}_{column_id} PARTITION OF {models.SCHEMA}.{table_name} FOR VALUES IN ({column_id})" return text(sql) diff --git a/tests/test_utils.py b/tests/test_utils.py index ea47a19..32870fc 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,8 +3,8 @@ def test_create_column_value_partition_text(): - geo_id = 42 - got = create_column_value_partition_text(geo_id=geo_id) + column_id = 42 + got = create_column_value_partition_text(column_id=column_id) wanted = text( "CREATE TABLE IF NOT EXISTS gerrydb.column_value_42 PARTITION OF gerrydb.column_value FOR VALUES IN (42)" ) From e2288a13ac8108af2bf7c72c883c0270f95a1c47 Mon Sep 17 00:00:00 2001 From: bailliekova Date: Sat, 1 Feb 2025 10:09:28 -0600 Subject: [PATCH 11/16] reformatted --- gerrydb_meta/crud/column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gerrydb_meta/crud/column.py b/gerrydb_meta/crud/column.py index d5465db..452ed76 100644 --- a/gerrydb_meta/crud/column.py +++ b/gerrydb_meta/crud/column.py @@ -81,7 +81,7 @@ def create( canonical_ref.col_id = col.col_id db.flush() - #create partition + # create partition db.execute(create_column_value_partition_text(column_id=col.col_id)) # Create additional aliases (non-canonical references) to the column. From c1faef5d7c7cce148ab3c4d871e62a47765d750a Mon Sep 17 00:00:00 2001 From: bailliekova Date: Sat, 1 Feb 2025 10:11:05 -0600 Subject: [PATCH 12/16] remove unused import --- gerrydb_meta/crud/geography.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gerrydb_meta/crud/geography.py b/gerrydb_meta/crud/geography.py index 1464c16..fa600e2 100644 --- a/gerrydb_meta/crud/geography.py +++ b/gerrydb_meta/crud/geography.py @@ -7,7 +7,7 @@ from typing import Collection from geoalchemy2.elements import WKBElement -from sqlalchemy import and_, insert, or_, update, text +from sqlalchemy import and_, insert, or_, update from sqlalchemy.exc import StatementError from sqlalchemy.orm import Session From e4feac7362cc257fa0704f0ce2e82e900b0d8005 Mon Sep 17 00:00:00 2001 From: bailliekova Date: Sat, 1 Feb 2025 10:40:16 -0600 Subject: [PATCH 13/16] remove unneeded trigger --- gerrydb_meta/models.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/gerrydb_meta/models.py b/gerrydb_meta/models.py index 49b24de..79a4cbf 100644 --- a/gerrydb_meta/models.py +++ b/gerrydb_meta/models.py @@ -374,14 +374,6 @@ def full_path(self): return f"/{self.namespace.path}/{self.path}" -@event.listens_for(Geography, "after_insert") -def create_geo_partition_in_column_value(mapper, connection, geo): - geo_id = geo.geo_id - Session.object_session(geo).execute( - create_column_value_partition_text(geo_id=geo_id) - ) - - class GeoImport(Base): __tablename__ = "geo_import" From 0e3879bc1945c72e037d292a395b760ff398484c Mon Sep 17 00:00:00 2001 From: bailliekova Date: Thu, 13 Feb 2025 10:14:55 -0600 Subject: [PATCH 14/16] update black and do some linting --- alembic/versions/6898afa765ca_create_view_schema.py | 2 +- alembic/versions/7367a058533d_create_authn_authz.py | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/alembic/versions/6898afa765ca_create_view_schema.py b/alembic/versions/6898afa765ca_create_view_schema.py index df9558f..79800ec 100644 --- a/alembic/versions/6898afa765ca_create_view_schema.py +++ b/alembic/versions/6898afa765ca_create_view_schema.py @@ -1,7 +1,7 @@ """Create view schema Revision ID: 6898afa765ca -Revises: 8dd630f55d05 +Revises: 8dd630f55d05 Create Date: 2023-03-21 16:54:56.498038 """ diff --git a/alembic/versions/7367a058533d_create_authn_authz.py b/alembic/versions/7367a058533d_create_authn_authz.py index dc76081..670e60b 100644 --- a/alembic/versions/7367a058533d_create_authn_authz.py +++ b/alembic/versions/7367a058533d_create_authn_authz.py @@ -1,7 +1,7 @@ """Create authentication/authorization models Revision ID: 7367a058533d -Revises: +Revises: Create Date: 2023-03-21 17:25:22.277920 """ diff --git a/pyproject.toml b/pyproject.toml index 19df272..3a945e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ networkx = "^3.3.0" [tool.poetry.dev-dependencies] pytest = "^7.2.1" -black = "^22.6.0" +black = "^25.1.0" isort = "^5.10.1" alembic = "^1.9.2" networkx = "^3.3.0" From 843cb4d4defe18c52d851136bb973ab1ac316cdb Mon Sep 17 00:00:00 2001 From: bailliekova Date: Tue, 25 Feb 2025 02:50:17 -0600 Subject: [PATCH 15/16] fix syntax errors --- gerrydb_meta/crud/view.py | 99 +++++++++++++++------------------------ 1 file changed, 38 insertions(+), 61 deletions(-) diff --git a/gerrydb_meta/crud/view.py b/gerrydb_meta/crud/view.py index 0c77af0..f08b332 100644 --- a/gerrydb_meta/crud/view.py +++ b/gerrydb_meta/crud/view.py @@ -14,6 +14,7 @@ from sqlalchemy import Sequence, cast, exc, func, label, or_, select, union from sqlalchemy.dialects import postgresql from sqlalchemy.orm import Session +from sqlalchemy.sql import text, column from gerrydb_meta import models, schemas from gerrydb_meta.crud.base import NamespacedCRBase, normalize_path @@ -358,7 +359,7 @@ def get_cached_render( .order_by(models.ViewRender.created_at.desc()) .first() ) - + def render(self, db: Session, *, view: models.View) -> ViewRenderContext: """Generates queries to retrieve view data. @@ -374,29 +375,39 @@ def render(self, db: Session, *, view: models.View) -> ViewRenderContext: .subquery() ) geo_sub = select(models.Geography.geo_id, models.Geography.path).subquery() + - # Generate subqueries for joining tabular data. - column_subs = [] - column_labels = [] - for alias, col in columns.items(): - value_col = COLUMN_TYPE_TO_VALUE_COLUMN[col.type] - column_sub = ( - select( - models.ColumnValue.geo_id, - getattr(models.ColumnValue, value_col).label(alias), - ) - .filter( - models.ColumnValue.col_id == col.col_id, - models.ColumnValue.valid_from <= view.at, - or_( - models.ColumnValue.valid_to.is_(None), - models.ColumnValue.valid_to >= view.at, - ), - ) - .subquery() + COLUMN_VALUE_TABLE_NAME = "column_value" + + agg_selects=[] + column_labels=[] + col_ids=[] + for _, col in columns.items(): + agg_selects.append( + f""" + + MAX(case when col_id = {col.col_id} then {COLUMN_TYPE_TO_VALUE_COLUMN[col.type]} else null end) as {col.canonical_ref.path} + """ ) - column_subs.append(column_sub) - column_labels.append(column_sub.c[col.canonical_ref.path]) + column_labels.append(column(col.canonical_ref.path)) + col_ids.append(str(col.col_id)) + + agg_select=",\n".join(agg_selects) + col_where=f"({",".join(col_ids)})" + + column_sub=text(f""" + SELECT + geo_id, + {agg_select} + FROM gerrydb.{COLUMN_VALUE_TABLE_NAME} + WHERE col_id in {col_where} + AND valid_from <= '{view.at}' + AND (valid_to is NULL OR valid_to >= '{view.at}') + GROUP BY geo_id + """ + ).columns(models.GeoVersion.geo_id, + *column_labels + ).subquery() timestamp_clauses = [ models.GeoVersion.valid_from <= view.at, @@ -418,13 +429,12 @@ def render(self, db: Session, *, view: models.View) -> ViewRenderContext: ) .join(geo_sub, geo_sub.c.geo_id == models.GeoVersion.geo_id) ) - - # Join tabular data. - for column_sub in column_subs: - geo_query = geo_query.join( - column_sub, column_sub.c.geo_id == models.GeoVersion.geo_id - ) + + geo_query = geo_query.join( + column_sub, column_sub.c.geo_id == models.GeoVersion.geo_id + ) geo_query = geo_query.where(*timestamp_clauses) + print(str(geo_query.compile(dialect=postgresql.dialect(),compile_kwargs={"literal_binds": True}))) internal_point_query = ( select( @@ -478,39 +488,6 @@ def render(self, db: Session, *, view: models.View) -> ViewRenderContext: ), ) - def _agg_select(col)->str: - select_str=f""" - - MAX(case when col_id = {col.col_id} then {COLUMN_TYPE_TO_VALUE_COLUMN[col.type]} else null) as {col.canonical_ref.path} - """ - return select_str - - def _column_sub(cols, geoids, view_at,)-> str: - COLUMN_VALUE_TABLE_NAME = "column_value" - - agg_selects=[] - for col in cols: - agg_selects.append( - f""" - - MAX(case when col_id = {col.col_id} then {COLUMN_TYPE_TO_VALUE_COLUMN[col.type]} else null) as {col.canonical_ref.path} - """ - ) - - - - subquery= f""" - SELECT - geoid, - {",\n".join([_agg_select(col) for col in cols]) - } - FROM {COLUMN_VALUE_TABLE_NAME} - WHERE column_id in ({",".join([col.id for col in cols])}) - AND valid_from <= {view_at} - AND (valid_to is NONE OR valid_to >= {view_at}) - """ - - return subquery def _geo_meta( self, db: Session, view: models.View From b51aeca8b251b6ce4517cf02c5a0a801463eaca6 Mon Sep 17 00:00:00 2001 From: bailliekova Date: Tue, 25 Feb 2025 04:00:14 -0600 Subject: [PATCH 16/16] sqlalchemy version of agg functions --- gerrydb_meta/crud/view.py | 56 +++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 31 deletions(-) diff --git a/gerrydb_meta/crud/view.py b/gerrydb_meta/crud/view.py index f08b332..8e07f97 100644 --- a/gerrydb_meta/crud/view.py +++ b/gerrydb_meta/crud/view.py @@ -11,7 +11,7 @@ from geoalchemy2 import Geometry from geoalchemy2 import func as geo_func -from sqlalchemy import Sequence, cast, exc, func, label, or_, select, union +from sqlalchemy import Sequence, cast, exc, func, label, or_, select, union from sqlalchemy.dialects import postgresql from sqlalchemy.orm import Session from sqlalchemy.sql import text, column @@ -372,42 +372,25 @@ def render(self, db: Session, *, view: models.View) -> ViewRenderContext: models.GeoSetMember.geo_id, ) .filter(models.GeoSetMember.set_version_id == view.set_version_id) - .subquery() + .subquery("members_sub") ) - geo_sub = select(models.Geography.geo_id, models.Geography.path).subquery() - - - COLUMN_VALUE_TABLE_NAME = "column_value" + geo_sub = select(models.Geography.geo_id, models.Geography.path).subquery("geo_sub") agg_selects=[] column_labels=[] col_ids=[] for _, col in columns.items(): - agg_selects.append( - f""" - - MAX(case when col_id = {col.col_id} then {COLUMN_TYPE_TO_VALUE_COLUMN[col.type]} else null end) as {col.canonical_ref.path} - """ - ) + agg_selects.append(func.max(column(COLUMN_TYPE_TO_VALUE_COLUMN[col.type])).filter(models.ColumnValue.col_id==col.col_id).label(col.canonical_ref.path)) column_labels.append(column(col.canonical_ref.path)) - col_ids.append(str(col.col_id)) - - agg_select=",\n".join(agg_selects) - col_where=f"({",".join(col_ids)})" - - column_sub=text(f""" - SELECT - geo_id, - {agg_select} - FROM gerrydb.{COLUMN_VALUE_TABLE_NAME} - WHERE col_id in {col_where} - AND valid_from <= '{view.at}' - AND (valid_to is NULL OR valid_to >= '{view.at}') - GROUP BY geo_id - """ - ).columns(models.GeoVersion.geo_id, - *column_labels - ).subquery() + col_ids.append(col.col_id) + + column_sub= select(models.ColumnValue.geo_id,*agg_selects).where( + models.ColumnValue.col_id.in_(col_ids), + models.ColumnValue.valid_from <= view.at, + or_( + models.ColumnValue.valid_to.is_(None), + models.ColumnValue.valid_to >= view.at, + )).group_by(models.ColumnValue.geo_id).subquery("column_value") timestamp_clauses = [ models.GeoVersion.valid_from <= view.at, @@ -417,6 +400,18 @@ def render(self, db: Session, *, view: models.View) -> ViewRenderContext: ), ] + ## included for reference: a version without subqueries. + # geo_query=( + # select(models.Geography.path, + # models.GeoVersion.geography, + # *column_labels, + # ).join(models.GeoSetMember, models.GeoSetMember.geo_id==models.Geography.geo_id) + # .join(models.GeoVersion, models.GeoSetMember.geo_id==models.GeoVersion.geo_id) + # .join(column_sub, column_sub.c.geo_id==models.Geography.geo_id) + # .where(models.GeoSetMember.set_version_id == view.set_version_id, *timestamp_clauses) + # ) + + geo_query = ( select( geo_sub.c.path, @@ -434,7 +429,6 @@ def render(self, db: Session, *, view: models.View) -> ViewRenderContext: column_sub, column_sub.c.geo_id == models.GeoVersion.geo_id ) geo_query = geo_query.where(*timestamp_clauses) - print(str(geo_query.compile(dialect=postgresql.dialect(),compile_kwargs={"literal_binds": True}))) internal_point_query = ( select(