diff --git a/.dockerignore b/.dockerignore index d256b21af1d5..a6e11805e92c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -21,3 +21,4 @@ !workspace_hack/ !neon_local/ !scripts/ninstall.sh +!vm-cgconfig.conf diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index 71fced23c25f..13b44f40520f 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -2,11 +2,11 @@ storage: vars: bucket_name: neon-prod-storage-ap-southeast-1 bucket_region: ap-southeast-1 - console_mgmt_base_url: http://console-release.local + console_mgmt_base_url: http://neon-internal-api.aws.neon.tech broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events + metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml index 83d4f6f37db8..2236dcbc06c7 100644 --- a/.github/ansible/prod.eu-central-1.hosts.yaml +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -2,11 +2,11 @@ storage: vars: bucket_name: neon-prod-storage-eu-central-1 bucket_region: eu-central-1 - console_mgmt_base_url: http://console-release.local + console_mgmt_base_url: http://neon-internal-api.aws.neon.tech broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events + metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml index 7f7601cd3928..56bece3e7752 100644 --- a/.github/ansible/prod.us-east-2.hosts.yaml +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -2,11 +2,11 @@ storage: vars: bucket_name: neon-prod-storage-us-east-2 bucket_region: us-east-2 - console_mgmt_base_url: http://console-release.local + console_mgmt_base_url: http://neon-internal-api.aws.neon.tech broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events + metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 9cad79b986ab..f03e2d943586 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -2,11 +2,11 @@ storage: vars: bucket_name: neon-prod-storage-us-west-2 bucket_region: us-west-2 - console_mgmt_base_url: http://console-release.local + console_mgmt_base_url: http://neon-internal-api.aws.neon.tech broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events + metric_collection_endpoint: 
http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml deleted file mode 100644 index ecb847bd6131..000000000000 --- a/.github/ansible/production.hosts.yaml +++ /dev/null @@ -1,40 +0,0 @@ ---- -storage: - vars: - console_mgmt_base_url: http://console-release.local - bucket_name: zenith-storage-oregon - bucket_region: us-west-2 - broker_endpoint: http://storage-broker.prod.local:50051 - pageserver_config_stub: - pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events - metric_collection_interval: 10min - remote_storage: - bucket_name: "{{ bucket_name }}" - bucket_region: "{{ bucket_region }}" - prefix_in_bucket: "{{ inventory_hostname }}" - safekeeper_s3_prefix: prod-1/wal - hostname_suffix: ".local" - remote_user: admin - sentry_environment: production - - children: - pageservers: - hosts: - zenith-1-ps-2: - console_region_id: aws-us-west-2 - zenith-1-ps-3: - console_region_id: aws-us-west-2 - zenith-1-ps-4: - console_region_id: aws-us-west-2 - zenith-1-ps-5: - console_region_id: aws-us-west-2 - - safekeepers: - hosts: - zenith-1-sk-1: - console_region_id: aws-us-west-2 - zenith-1-sk-2: - console_region_id: aws-us-west-2 - zenith-1-sk-4: - console_region_id: aws-us-west-2 diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index f28dc8e07b54..b5377957049a 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -2,12 +2,17 @@ storage: vars: bucket_name: neon-dev-storage-eu-west-1 bucket_region: eu-west-1 - console_mgmt_base_url: http://console-staging.local + console_mgmt_base_url: http://neon-internal-api.aws.neon.build broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events + metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "20m" + threshold: "20m" remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 9a1a0952822a..cd8f832af017 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -2,12 +2,17 @@ storage: vars: bucket_name: neon-staging-storage-us-east-2 bucket_region: us-east-2 - console_mgmt_base_url: http://console-staging.local + console_mgmt_base_url: http://neon-internal-api.aws.neon.build broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events + metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "20m" + threshold: "20m" remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml 
b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index c49b8d20097a..ecf57554d971 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-staging.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2" domain: "*.eu-west-1.aws.neon.build" sentryEnvironment: "staging" wssPort: 8443 - metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" metricCollectionInterval: "1min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index 157ae66ed1af..91ddd07eae5d 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -10,7 +10,7 @@ settings: uri: "https://console.stage.neon.tech/psql_session/" domain: "pg.neon.build" sentryEnvironment: "staging" - metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" metricCollectionInterval: "1min" # -- Additional labels for neon-proxy-link pods diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index 99b67d75c1c8..6ec18ff388a9 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-staging.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2" domain: "*.cloud.stage.neon.tech" sentryEnvironment: "staging" wssPort: 8443 - metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" metricCollectionInterval: "1min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index 764bb25b646a..9b250fce6eee 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-staging.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2" domain: "*.us-east-2.aws.neon.build" sentryEnvironment: "staging" wssPort: 8443 - metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" metricCollectionInterval: "1min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index a640d468b34c..389da354638e 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ 
b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-release.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.ap-southeast-1.aws.neon.tech" sentryEnvironment: "production" wssPort: 8443 - metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" metricCollectionInterval: "10min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index c9430877de84..7e16ac2d3d7e 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-release.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.eu-central-1.aws.neon.tech" sentryEnvironment: "production" wssPort: 8443 - metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" metricCollectionInterval: "10min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 677df6a5be1f..05e41e7a97c4 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-release.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-east-2.aws.neon.tech" sentryEnvironment: "production" wssPort: 8443 - metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" metricCollectionInterval: "10min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml index 3a5cde4b01a3..e67a3e44618d 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-release.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.cloud.neon.tech" sentryEnvironment: "production" wssPort: 8443 - metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" metricCollectionInterval: "10min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index 919a0d503cf8..5dc23b282e06 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -6,11 +6,11 @@ 
image: settings: authBackend: "console" - authEndpoint: "http://console-release.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-west-2.aws.neon.tech" sentryEnvironment: "production" wssPort: 8443 - metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" metricCollectionInterval: "10min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/production.neon-storage-broker.yaml b/.github/helm-values/production.neon-storage-broker.yaml deleted file mode 100644 index aa64081da3f0..000000000000 --- a/.github/helm-values/production.neon-storage-broker.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# Helm chart values for neon-storage-broker -podLabels: - neon_env: production - neon_service: storage-broker - -# Use L4 LB -service: - # service.annotations -- Annotations to add to the service - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet - # assign service to this name at external-dns - external-dns.alpha.kubernetes.io/hostname: storage-broker.prod.local - # service.type -- Service type - type: LoadBalancer - # service.port -- broker listen port - port: 50051 - -ingress: - enabled: false - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack - -extraManifests: - - apiVersion: operator.victoriametrics.com/v1beta1 - kind: VMServiceScrape - metadata: - name: "{{ include \"neon-storage-broker.fullname\" . 
}}" - labels: - helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} - app.kubernetes.io/name: neon-storage-broker - app.kubernetes.io/instance: neon-storage-broker - app.kubernetes.io/version: "{{ .Chart.AppVersion }}" - app.kubernetes.io/managed-by: Helm - namespace: "{{ .Release.Namespace }}" - spec: - selector: - matchLabels: - app.kubernetes.io/name: "neon-storage-broker" - endpoints: - - port: broker - path: /metrics - interval: 10s - scrapeTimeout: 10s - namespaceSelector: - matchNames: - - "{{ .Release.Namespace }}" - -settings: - sentryEnvironment: "production" diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 27b7f54856f8..d16d221cc4ca 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -611,34 +611,31 @@ jobs: run: shell: sh -eu {0} env: - VM_INFORMANT_VERSION: 0.1.1 + VM_BUILDER_VERSION: v0.4.6 steps: - - name: Downloading latest vm-builder + - name: Checkout + uses: actions/checkout@v1 + with: + fetch-depth: 0 + + - name: Downloading vm-builder run: | - curl -L https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder + curl -L https://github.com/neondatabase/neonvm/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder chmod +x vm-builder - name: Pulling compute-node image run: | docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - - name: Downloading VM informant version ${{ env.VM_INFORMANT_VERSION }} - run: | - curl -fL https://github.com/neondatabase/autoscaling/releases/download/${{ env.VM_INFORMANT_VERSION }}/vm-informant -o vm-informant - chmod +x vm-informant - - - name: Adding VM informant to compute-node image + - name: Building VM compute-node rootfs run: | - ID=$(docker create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}) - docker cp vm-informant $ID:/bin/vm-informant - docker commit $ID temp-vm-compute-node - docker rm -f $ID + docker build -t temp-vm-compute-node --build-arg SRC_IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -f Dockerfile.vm-compute-node . 
- name: Build vm image run: | # note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images - ./vm-builder -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + ./vm-builder -use-inittab -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - name: Pushing vm-compute-node image run: | diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml index f4ce7e9afa34..540d187274a1 100644 --- a/.github/workflows/deploy-prod.yml +++ b/.github/workflows/deploy-prod.yml @@ -165,80 +165,3 @@ jobs: - name: Deploy storage-broker run: helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - # Deploy to old account below - - deploy: - runs-on: prod - container: - image: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - options: --user root --privileged - if: inputs.deployStorage && inputs.disclamerAcknowledged - defaults: - run: - shell: bash - environment: - name: prod-old - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - ref: ${{ inputs.branch }} - - - name: Redeploy - run: | - export DOCKER_TAG=${{ inputs.dockerTag }} - cd "$(pwd)/.github/ansible" - - ./get_binaries.sh - - eval $(ssh-agent) - echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key - echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub - chmod 0600 ssh-key - ssh-add ssh-key - rm -f ssh-key ssh-key-cert.pub - ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater - ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i production.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version - - # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ansible/collections': Permission denied - - name: Cleanup ansible folder - run: rm -rf ~/.ansible - - deploy-storage-broker: - name: deploy storage broker on old staging and old prod - runs-on: [ self-hosted, gen3, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - if: inputs.deployStorageBroker && inputs.disclamerAcknowledged - defaults: - run: - shell: bash - environment: - name: prod-old - env: - KUBECONFIG: .kubeconfig - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - ref: ${{ inputs.branch }} - - - name: Store kubeconfig file - run: | - echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - - name: Add neon helm chart - run: helm repo add neondatabase https://neondatabase.github.io/helm-charts - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/production.neon-storage-broker.yaml --set 
image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - - name: Cleanup helm folder - run: rm -rf ~/.cache diff --git a/Cargo.lock b/Cargo.lock index 67e54d38337e..d154b4eaea2f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2110,6 +2110,16 @@ version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" +[[package]] +name = "mime_guess" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" +dependencies = [ + "mime", + "unicase", +] + [[package]] name = "minimal-lexical" version = "0.2.1" @@ -2884,6 +2894,7 @@ dependencies = [ "md5", "metrics", "once_cell", + "opentelemetry", "parking_lot", "pin-project-lite", "pq_proto", @@ -2892,6 +2903,8 @@ dependencies = [ "rcgen", "regex", "reqwest", + "reqwest-middleware", + "reqwest-tracing", "routerify", "rstest", "rustls", @@ -2909,7 +2922,9 @@ dependencies = [ "tokio-postgres-rustls", "tokio-rustls", "tracing", + "tracing-opentelemetry", "tracing-subscriber", + "tracing-utils", "url", "utils", "uuid", @@ -3079,6 +3094,7 @@ dependencies = [ "js-sys", "log", "mime", + "mime_guess", "once_cell", "percent-encoding", "pin-project-lite", @@ -3098,6 +3114,36 @@ dependencies = [ "winreg", ] +[[package]] +name = "reqwest-middleware" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a1c03e9011a8c59716ad13115550469e081e2e9892656b0ba6a47c907921894" +dependencies = [ + "anyhow", + "async-trait", + "http", + "reqwest", + "serde", + "task-local-extensions", + "thiserror", +] + +[[package]] +name = "reqwest-tracing" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b739d87a6b2cf4743968ad2b4cef648fbe0204c19999509824425babb2097bce" +dependencies = [ + "async-trait", + "opentelemetry", + "reqwest", + "reqwest-middleware", + "task-local-extensions", + "tracing", + "tracing-opentelemetry", +] + [[package]] name = "ring" version = "0.16.20" @@ -3790,6 +3836,15 @@ dependencies = [ "xattr", ] +[[package]] +name = "task-local-extensions" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4167afbec18ae012de40f8cf1b9bf48420abb390678c34821caa07d924941cc4" +dependencies = [ + "tokio", +] + [[package]] name = "tempfile" version = "3.3.0" @@ -3809,6 +3864,8 @@ name = "tenant_size_model" version = "0.1.0" dependencies = [ "anyhow", + "serde", + "serde_json", "workspace_hack", ] @@ -4358,6 +4415,15 @@ dependencies = [ "libc", ] +[[package]] +name = "unicase" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" +dependencies = [ + "version_check", +] + [[package]] name = "unicode-bidi" version = "0.3.10" diff --git a/Cargo.toml b/Cargo.toml index 4e4667f253af..99a3f560262a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -76,6 +76,8 @@ prost = "0.11" rand = "0.8" regex = "1.4" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } +reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] } +reqwest-middleware = "0.2.0" routerify = "3" rpds = "0.12.0" rustls = "0.20" diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 5c58f4baaa79..1d6c2f354f5c 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node 
@@ -11,7 +11,7 @@ FROM debian:bullseye-slim AS build-deps RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \ - libicu-dev + libicu-dev libxslt1-dev ######################################################################################### # @@ -23,7 +23,8 @@ FROM build-deps AS pg-build ARG PG_VERSION COPY vendor/postgres-${PG_VERSION} postgres RUN cd postgres && \ - ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu && \ + ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu \ + --with-libxml --with-libxslt && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ # Install headers @@ -34,7 +35,8 @@ RUN cd postgres && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control ######################################################################################### # @@ -50,15 +52,15 @@ RUN apt update && \ libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ protobuf-c-compiler xsltproc -RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz && \ - tar zxvf SFCGAL-v1.3.10.tar.gz && \ - cd SFCGAL-v1.3.10 && cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \ +# SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2 +RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ + mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ + cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \ DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ make clean && cp -R /sfcgal/* / -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ - tar xvzf postgis-3.3.1.tar.gz && \ - cd postgis-3.3.1 && \ +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \ + mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \ ./autogen.sh && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ @@ -83,29 +85,14 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ FROM build-deps AS plv8-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils + apt install -y ninja-build python3-dev libncurses5 binutils clang -# https://github.com/plv8/plv8/issues/475: -# v8 uses gold for linking and sets `--thread-count=4` which breaks -# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) -# Install newer gold version manually as debian-testing binutils version updates -# libc version, which in turn breaks other extension built against non-testing libc. 
-RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ - tar xvzf binutils-2.38.tar.gz && \ - cd binutils-2.38 && \ - cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ - cd ../bfd && ./configure && make bfdver.h && \ - cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ - cp /usr/local/bin/ld.gold /usr/bin/gold - -# Sed is used to patch for https://github.com/plv8/plv8/issues/503 -RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ - tar xvzf v3.1.4.tar.gz && \ - cd plv8-3.1.4 && \ +RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \ + mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ - sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ + find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control @@ -126,20 +113,17 @@ RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2 && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ && rm /tmp/cmake-install.sh -RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ - tar xvzf h3.tgz && \ - cd h3-4.0.1 && \ - mkdir build && \ - cd build && \ +RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ + mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \ + mkdir build && cd build && \ cmake .. -DCMAKE_BUILD_TYPE=Release && \ make -j $(getconf _NPROCESSORS_ONLN) && \ DESTDIR=/h3 make install && \ cp -R /h3/usr / && \ rm -rf build -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \ - tar xvzf h3-pg.tgz && \ - cd h3-pg-4.0.1 && \ +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \ + mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -155,9 +139,8 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 FROM build-deps AS unit-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \ - tar xvzf 7.7.tar.gz && \ - cd postgresql-unit-7.7 && \ +RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ + mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ # unit extension's "create extension" script relies on absolute install path to fill some reference tables. 
@@ -176,12 +159,27 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz & FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN git clone --branch v0.4.0 https://github.com/pgvector/pgvector.git && \ - cd pgvector && \ +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \ + mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control +######################################################################################### +# +# Layer "pgjwt-pg-build" +# compile pgjwt extension +# +######################################################################################### +FROM build-deps AS pgjwt-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +# 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021 +RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ + mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -196,6 +194,7 @@ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /h3/usr / COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -258,6 +257,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb # libicu67, locales for collations (including ICU) # libossp-uuid16 for extension ossp-uuid # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS +# libxml2, libxslt1.1 for xml2 RUN apt update && \ apt install --no-install-recommends -y \ locales \ @@ -269,6 +269,8 @@ RUN apt update && \ libproj19 \ libprotobuf-c1 \ libsfcgal1 \ + libxml2 \ + libxslt1.1 \ gdb && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 diff --git a/Dockerfile.vm-compute-node b/Dockerfile.vm-compute-node new file mode 100644 index 000000000000..af3bfb359005 --- /dev/null +++ b/Dockerfile.vm-compute-node @@ -0,0 +1,32 @@ +# Note: this file *mostly* just builds on Dockerfile.compute-node + +ARG SRC_IMAGE +ARG VM_INFORMANT_VERSION=v0.1.6 + +# Pull VM informant and set up inittab +FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant + +RUN set -e \ + && rm -f /etc/inittab \ + && touch /etc/inittab + +ADD vm-cgconfig.conf /etc/cgconfig.conf +RUN set -e \ + && echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \ + && echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant --auto-restart --cgroup=neon-postgres'" >> /etc/inittab + +# Combine, starting from non-VM compute node image. 
+FROM $SRC_IMAGE as base + +# Temporarily set user back to root so we can run apt update and adduser +USER root +RUN apt update && \ + apt install --no-install-recommends -y \ + cgroup-tools +RUN adduser vm-informant --disabled-password --no-create-home +USER postgres + +COPY --from=informant /etc/inittab /etc/inittab +COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant + +ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"] diff --git a/Makefile b/Makefile index 92a453268482..e04a82c7c9d1 100644 --- a/Makefile +++ b/Makefile @@ -136,9 +136,15 @@ neon-pg-ext-%: postgres-% .PHONY: neon-pg-ext-clean-% neon-pg-ext-clean-%: - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_test_utils-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean .PHONY: neon-pg-ext neon-pg-ext: \ diff --git a/README.md b/README.md index 29389e7a5d34..819693f1f3ef 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,11 @@ dnf install flex bison readline-devel zlib-devel openssl-devel \ libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \ protobuf-devel ``` +* On Arch based systems, these packages are needed: +```bash +pacman -S base-devel readline zlib libseccomp openssl clang \ +postgresql-libs cmake postgresql protobuf +``` 2. [Install Rust](https://www.rust-lang.org/tools/install) ``` @@ -83,9 +88,10 @@ cd neon # The preferred and default is to make a debug build. This will create a # demonstrably slower build than a release build. For a release build, -# use "BUILD_TYPE=release make -j`nproc`" +# use "BUILD_TYPE=release make -j`nproc` -s" +# Remove -s for the verbose build log -make -j`nproc` +make -j`nproc` -s ``` #### Building on OSX @@ -99,9 +105,10 @@ cd neon # The preferred and default is to make a debug build. This will create a # demonstrably slower build than a release build. 
For a release build, -# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" +# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu` -s" +# Remove -s for the verbose build log -make -j`sysctl -n hw.logicalcpu` +make -j`sysctl -n hw.logicalcpu` -s ``` #### Dependency installation notes diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 2c426620205d..49cf1cd3473d 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -44,7 +44,6 @@ use tracing::{error, info}; use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus}; use compute_tools::http::api::launch_http_server; -use compute_tools::informant::spawn_vm_informant_if_present; use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; @@ -141,8 +140,6 @@ fn main() -> Result<()> { // requests, while configuration is still in progress. let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread"); let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread"); - // Also spawn the thread responsible for handling the VM informant -- if it's present - let _vm_informant_handle = spawn_vm_informant_if_present().expect("cannot launch VM informant"); // Start Postgres let mut delay_exit = false; diff --git a/compute_tools/src/informant.rs b/compute_tools/src/informant.rs deleted file mode 100644 index 8a6e3ab43a3d..000000000000 --- a/compute_tools/src/informant.rs +++ /dev/null @@ -1,50 +0,0 @@ -use std::path::Path; -use std::process; -use std::thread; -use std::time::Duration; -use tracing::{info, warn}; - -use anyhow::{Context, Result}; - -const VM_INFORMANT_PATH: &str = "/bin/vm-informant"; -const RESTART_INFORMANT_AFTER_MILLIS: u64 = 5000; - -/// Launch a thread to start the VM informant if it's present (and restart, on failure) -pub fn spawn_vm_informant_if_present() -> Result>> { - let exists = Path::new(VM_INFORMANT_PATH) - .try_exists() - .context("could not check if path exists")?; - - if !exists { - return Ok(None); - } - - Ok(Some( - thread::Builder::new() - .name("run-vm-informant".into()) - .spawn(move || run_informant())?, - )) -} - -fn run_informant() -> ! { - let restart_wait = Duration::from_millis(RESTART_INFORMANT_AFTER_MILLIS); - - info!("starting VM informant"); - - loop { - let mut cmd = process::Command::new(VM_INFORMANT_PATH); - // Block on subprocess: - let result = cmd.status(); - - match result { - Err(e) => warn!("failed to run VM informant at {VM_INFORMANT_PATH:?}: {e}"), - Ok(status) if !status.success() => { - warn!("{VM_INFORMANT_PATH} exited with code {status:?}, retrying") - } - Ok(_) => info!("{VM_INFORMANT_PATH} ended gracefully (unexpectedly). 
Retrying"), - } - - // Wait before retrying - thread::sleep(restart_wait); - } -} diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index a71b92f91a8e..aee6b53e6a19 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -8,7 +8,6 @@ pub mod http; #[macro_use] pub mod logger; pub mod compute; -pub mod informant; pub mod monitor; pub mod params; pub mod pg_helpers; diff --git a/docs/settings.md b/docs/settings.md index 58d32157a3db..817f97d8ba7a 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -16,7 +16,7 @@ listen_http_addr = '127.0.0.1:9898' checkpoint_distance = '268435456' # in bytes checkpoint_timeout = '10m' -gc_period = '100 s' +gc_period = '1 hour' gc_horizon = '67108864' max_file_descriptors = '100' @@ -101,7 +101,7 @@ away. #### gc_period -Interval at which garbage collection is triggered. Default is 100 s. +Interval at which garbage collection is triggered. Default is 1 hour. #### image_creation_threshold @@ -109,7 +109,7 @@ L0 delta layer threshold for L1 image layer creation. Default is 3. #### pitr_interval -WAL retention duration for PITR branching. Default is 30 days. +WAL retention duration for PITR branching. Default is 7 days. #### walreceiver_connect_timeout diff --git a/docs/synthetic-size.md b/docs/synthetic-size.md index 8378efc84256..407d7b525ac7 100644 --- a/docs/synthetic-size.md +++ b/docs/synthetic-size.md @@ -37,7 +37,7 @@ The synthetic size is designed to: ## Terms & assumptions -- logical size is the size of a database *at a given point in +- logical size is the size of a branch *at a given point in time*. It's the total size of all tables in all databases, as you see with "\l+" in psql for example, plus the Postgres SLRUs and some small amount of metadata. NOTE that currently, Neon does not include diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index c5e4dbd1f0d9..b7995c840cb9 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -75,27 +75,36 @@ impl StartupMessageParams { /// taking into account all escape sequences but leaving them as-is. /// [`None`] means that there's no `options` in [`Self`]. pub fn options_raw(&self) -> Option> { + self.get("options").map(Self::parse_options_raw) + } + + /// Split command-line options according to PostgreSQL's logic, + /// applying all escape sequences (using owned strings as needed). + /// [`None`] means that there's no `options` in [`Self`]. + pub fn options_escaped(&self) -> Option>> { + self.get("options").map(Self::parse_options_escaped) + } + + /// Split command-line options according to PostgreSQL's logic, + /// taking into account all escape sequences but leaving them as-is. + pub fn parse_options_raw(input: &str) -> impl Iterator { // See `postgres: pg_split_opts`. let mut last_was_escape = false; - let iter = self - .get("options")? + input .split(move |c: char| { // We split by non-escaped whitespace symbols. let should_split = c.is_ascii_whitespace() && !last_was_escape; last_was_escape = c == '\\' && !last_was_escape; should_split }) - .filter(|s| !s.is_empty()); - - Some(iter) + .filter(|s| !s.is_empty()) } /// Split command-line options according to PostgreSQL's logic, /// applying all escape sequences (using owned strings as needed). - /// [`None`] means that there's no `options` in [`Self`]. - pub fn options_escaped(&self) -> Option>> { + pub fn parse_options_escaped(input: &str) -> impl Iterator> { // See `postgres: pg_split_opts`. 
- let iter = self.options_raw()?.map(|s| { + Self::parse_options_raw(input).map(|s| { let mut preserve_next_escape = false; let escape = |c| { // We should remove '\\' unless it's preceded by '\\'. @@ -108,9 +117,12 @@ impl StartupMessageParams { true => Cow::Owned(s.replace(escape, "")), false => Cow::Borrowed(s), } - }); + }) + } - Some(iter) + /// Iterate through key-value pairs in an arbitrary order. + pub fn iter(&self) -> impl Iterator { + self.params.iter().map(|(k, v)| (k.as_str(), v.as_str())) } // This function is mostly useful in tests. diff --git a/libs/tenant_size_model/Cargo.toml b/libs/tenant_size_model/Cargo.toml index a5f0160f35f8..15e78932a83d 100644 --- a/libs/tenant_size_model/Cargo.toml +++ b/libs/tenant_size_model/Cargo.toml @@ -7,5 +7,7 @@ license.workspace = true [dependencies] anyhow.workspace = true +serde.workspace = true +serde_json.workspace = true workspace_hack.workspace = true diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs new file mode 100644 index 000000000000..093b053675a7 --- /dev/null +++ b/libs/tenant_size_model/src/calculation.rs @@ -0,0 +1,219 @@ +use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel}; + +// +// *-g--*---D---> +// / +// / +// / *---b----*-B---> +// / / +// / / +// -----*--e---*-----f----* C +// E \ +// \ +// *--a---*---A--> +// +// If A and B need to be retained, is it cheaper to store +// snapshot at C+a+b, or snapshots at A and B ? +// +// If D also needs to be retained, which is cheaper: +// +// 1. E+g+e+f+a+b +// 2. D+C+a+b +// 3. D+A+B + +/// [`Segment`] which has had it's size calculated. +#[derive(Clone, Debug)] +struct SegmentSize { + method: SegmentMethod, + + // calculated size of this subtree, using this method + accum_size: u64, + + seg_id: usize, + children: Vec, +} + +struct SizeAlternatives { + // cheapest alternative if parent is available. + incremental: SegmentSize, + + // cheapest alternative if parent node is not available + non_incremental: Option, +} + +impl StorageModel { + pub fn calculate(&self) -> SizeResult { + // Build adjacency list. 'child_list' is indexed by segment id. Each entry + // contains a list of all child segments of the segment. + let mut roots: Vec = Vec::new(); + let mut child_list: Vec> = Vec::new(); + child_list.resize(self.segments.len(), Vec::new()); + + for (seg_id, seg) in self.segments.iter().enumerate() { + if let Some(parent_id) = seg.parent { + child_list[parent_id].push(seg_id); + } else { + roots.push(seg_id); + } + } + + let mut segment_results = Vec::new(); + segment_results.resize( + self.segments.len(), + SegmentSizeResult { + method: SegmentMethod::Skipped, + accum_size: 0, + }, + ); + + let mut total_size = 0; + for root in roots { + if let Some(selected) = self.size_here(root, &child_list).non_incremental { + StorageModel::fill_selected_sizes(&selected, &mut segment_results); + total_size += selected.accum_size; + } else { + // Couldn't find any way to get this root. Error? + } + } + + SizeResult { + total_size, + segments: segment_results, + } + } + + fn fill_selected_sizes(selected: &SegmentSize, result: &mut Vec) { + result[selected.seg_id] = SegmentSizeResult { + method: selected.method, + accum_size: selected.accum_size, + }; + // recurse to children + for child in selected.children.iter() { + StorageModel::fill_selected_sizes(child, result); + } + } + + // + // This is the core of the sizing calculation. 
+ // + // This is a recursive function, that for each Segment calculates the best way + // to reach all the Segments that are marked as needed in this subtree, under two + // different conditions: + // a) when the parent of this segment is available (as a snaphot or through WAL), and + // b) when the parent of this segment is not available. + // + fn size_here(&self, seg_id: usize, child_list: &Vec>) -> SizeAlternatives { + let seg = &self.segments[seg_id]; + // First figure out the best way to get each child + let mut children = Vec::new(); + for child_id in &child_list[seg_id] { + children.push(self.size_here(*child_id, child_list)) + } + + // Method 1. If this node is not needed, we can skip it as long as we + // take snapshots later in each sub-tree + let snapshot_later = if !seg.needed { + let mut snapshot_later = SegmentSize { + seg_id, + method: SegmentMethod::Skipped, + accum_size: 0, + children: Vec::new(), + }; + + let mut possible = true; + for child in children.iter() { + if let Some(non_incremental) = &child.non_incremental { + snapshot_later.accum_size += non_incremental.accum_size; + snapshot_later.children.push(non_incremental.clone()) + } else { + possible = false; + break; + } + } + if possible { + Some(snapshot_later) + } else { + None + } + } else { + None + }; + + // Method 2. Get a snapshot here. This assumed to be possible, if the 'size' of + // this Segment was given. + let snapshot_here = if !seg.needed || seg.parent.is_none() { + if let Some(snapshot_size) = seg.size { + let mut snapshot_here = SegmentSize { + seg_id, + method: SegmentMethod::SnapshotHere, + accum_size: snapshot_size, + children: Vec::new(), + }; + for child in children.iter() { + snapshot_here.accum_size += child.incremental.accum_size; + snapshot_here.children.push(child.incremental.clone()) + } + Some(snapshot_here) + } else { + None + } + } else { + None + }; + + // Method 3. Use WAL to get here from parent + let wal_here = { + let mut wal_here = SegmentSize { + seg_id, + method: SegmentMethod::Wal, + accum_size: if let Some(parent_id) = seg.parent { + seg.lsn - self.segments[parent_id].lsn + } else { + 0 + }, + children: Vec::new(), + }; + for child in children { + wal_here.accum_size += child.incremental.accum_size; + wal_here.children.push(child.incremental) + } + wal_here + }; + + // If the parent is not available, what's the cheapest method involving + // a snapshot here or later? + let mut cheapest_non_incremental: Option = None; + if let Some(snapshot_here) = snapshot_here { + cheapest_non_incremental = Some(snapshot_here); + } + if let Some(snapshot_later) = snapshot_later { + // Use <=, to prefer skipping if the size is equal + if let Some(parent) = &cheapest_non_incremental { + if snapshot_later.accum_size <= parent.accum_size { + cheapest_non_incremental = Some(snapshot_later); + } + } else { + cheapest_non_incremental = Some(snapshot_later); + } + } + + // And what's the cheapest method, if the parent is available? + let cheapest_incremental = if let Some(cheapest_non_incremental) = &cheapest_non_incremental + { + // Is it cheaper to use a snapshot here or later, anyway? 
+ // Use <, to prefer Wal over snapshot if the cost is the same + if wal_here.accum_size < cheapest_non_incremental.accum_size { + wal_here + } else { + cheapest_non_incremental.clone() + } + } else { + wal_here + }; + + SizeAlternatives { + incremental: cheapest_incremental, + non_incremental: cheapest_non_incremental, + } + } +} diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs index b156e1be9d52..c151e3b42cde 100644 --- a/libs/tenant_size_model/src/lib.rs +++ b/libs/tenant_size_model/src/lib.rs @@ -1,401 +1,70 @@ -use std::borrow::Cow; -use std::collections::HashMap; - -use anyhow::Context; - -/// Pricing model or history size builder. +//! Synthetic size calculation + +mod calculation; +pub mod svg; + +/// StorageModel is the input to the synthetic size calculation. It represents +/// a tree of timelines, with just the information that's needed for the +/// calculation. This doesn't track timeline names or where each timeline +/// begins and ends, for example. Instead, it consists of "points of interest" +/// on the timelines. A point of interest could be the timeline start or end point, +/// the oldest point on a timeline that needs to be retained because of PITR +/// cutoff, or snapshot points named by the user. For each such point, and the +/// edge connecting the points (implicit in Segment), we store information about +/// whether we need to be able to recover to the point, and if known, the logical +/// size at the point. /// -/// Maintains knowledge of the branches and their modifications. Generic over the branch name key -/// type. -pub struct Storage { - segments: Vec, - - /// Mapping from the branch name to the index of a segment describing it's latest state. - branches: HashMap, +/// The segments must form a well-formed tree, with no loops. +#[derive(serde::Serialize)] +pub struct StorageModel { + pub segments: Vec, } -/// Snapshot of a branch. -#[derive(Clone, Debug, Eq, PartialEq)] +/// Segment represents one point in the tree of branches, *and* the edge that leads +/// to it (if any). We don't need separate structs for points and edges, because each +/// point can have only one parent. +/// +/// When 'needed' is true, it means that we need to be able to reconstruct +/// any version between 'parent.lsn' and 'lsn'. If you want to represent that only +/// a single point is needed, create two Segments with the same lsn, and mark only +/// the child as needed. +/// +#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)] pub struct Segment { /// Previous segment index into ['Storage::segments`], if any. - parent: Option, - - /// Description of how did we get to this state. - /// - /// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when - /// modifying a branch directly. - pub op: Cow<'static, str>, - - /// LSN before this state - start_lsn: u64, + pub parent: Option, - /// LSN at this state - pub end_lsn: u64, + /// LSN at this point + pub lsn: u64, - /// Logical size before this state - start_size: u64, + /// Logical size at this node, if known. + pub size: Option, - /// Logical size at this state. Can be None in the last Segment of a branch. 
- pub end_size: Option, - - /// Indices to [`Storage::segments`] - /// - /// FIXME: this could be an Option - children_after: Vec, - - /// Determined by `retention_period` given to [`Storage::calculate`] + /// If true, the segment from parent to this node is needed by `retention_period` pub needed: bool, } -// -// -// -// -// *-g--*---D---> -// / -// / -// / *---b----*-B---> -// / / -// / / -// -----*--e---*-----f----* C -// E \ -// \ -// *--a---*---A--> -// -// If A and B need to be retained, is it cheaper to store -// snapshot at C+a+b, or snapshots at A and B ? -// -// If D also needs to be retained, which is cheaper: -// -// 1. E+g+e+f+a+b -// 2. D+C+a+b -// 3. D+A+B - -/// [`Segment`] which has had it's size calculated. -pub struct SegmentSize { - pub seg_id: usize, - - pub method: SegmentMethod, - - this_size: u64, +/// Result of synthetic size calculation. Returned by StorageModel::calculate() +pub struct SizeResult { + pub total_size: u64, - pub children: Vec, + // This has same length as the StorageModel::segments vector in the input. + // Each entry in this array corresponds to the entry with same index in + // StorageModel::segments. + pub segments: Vec, } -impl SegmentSize { - fn total(&self) -> u64 { - self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total()) - } - - pub fn total_children(&self) -> u64 { - if self.method == SnapshotAfter { - self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total()) - } else { - self.children.iter().fold(0, |acc, x| acc + x.total()) - } - } +#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)] +pub struct SegmentSizeResult { + pub method: SegmentMethod, + // calculated size of this subtree, using this method + pub accum_size: u64, } /// Different methods to retain history from a particular state -#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[derive(Clone, Copy, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)] pub enum SegmentMethod { - SnapshotAfter, - Wal, - WalNeeded, + SnapshotHere, // A logical snapshot is needed after this segment + Wal, // Keep WAL leading up to this node Skipped, } - -use SegmentMethod::*; - -impl Storage { - /// Creates a new storage with the given default branch name. - pub fn new(initial_branch: K) -> Storage { - let init_segment = Segment { - op: "".into(), - needed: false, - parent: None, - start_lsn: 0, - end_lsn: 0, - start_size: 0, - end_size: Some(0), - children_after: Vec::new(), - }; - - Storage { - segments: vec![init_segment], - branches: HashMap::from([(initial_branch, 0)]), - } - } - - /// Advances the branch with a new point, at given LSN. 
- pub fn insert_point( - &mut self, - branch: &Q, - op: Cow<'static, str>, - lsn: u64, - size: Option, - ) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") }; - let newseg_id = self.segments.len(); - let lastseg = &mut self.segments[lastseg_id]; - - assert!(lsn > lastseg.end_lsn); - - let Some(start_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") }; - - let newseg = Segment { - op, - parent: Some(lastseg_id), - start_lsn: lastseg.end_lsn, - end_lsn: lsn, - start_size, - end_size: size, - children_after: Vec::new(), - needed: false, - }; - lastseg.children_after.push(newseg_id); - - self.segments.push(newseg); - *self.branches.get_mut(branch).expect("read already") = newseg_id; - - Ok(()) - } - - /// Advances the branch with the named operation, by the relative LSN and logical size bytes. - pub fn modify_branch( - &mut self, - branch: &Q, - op: Cow<'static, str>, - lsn_bytes: u64, - size_bytes: i64, - ) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") }; - let newseg_id = self.segments.len(); - let lastseg = &mut self.segments[lastseg_id]; - - let Some(last_end_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") }; - - let newseg = Segment { - op, - parent: Some(lastseg_id), - start_lsn: lastseg.end_lsn, - end_lsn: lastseg.end_lsn + lsn_bytes, - start_size: last_end_size, - end_size: Some((last_end_size as i64 + size_bytes) as u64), - children_after: Vec::new(), - needed: false, - }; - lastseg.children_after.push(newseg_id); - - self.segments.push(newseg); - *self.branches.get_mut(branch).expect("read already") = newseg_id; - Ok(()) - } - - pub fn insert(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - self.modify_branch(branch, "insert".into(), bytes, bytes as i64) - } - - pub fn update(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - self.modify_branch(branch, "update".into(), bytes, 0i64) - } - - pub fn delete(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64)) - } - - pub fn branch(&mut self, parent: &Q, name: K) -> anyhow::Result<()> - where - K: std::borrow::Borrow + std::fmt::Debug, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - // Find the right segment - let branchseg_id = *self.branches.get(parent).with_context(|| { - format!( - "should had found the parent {:?} by key. 
in branches {:?}", - parent, self.branches - ) - })?; - - let _branchseg = &mut self.segments[branchseg_id]; - - // Create branch name for it - self.branches.insert(name, branchseg_id); - Ok(()) - } - - pub fn calculate(&mut self, retention_period: u64) -> anyhow::Result { - // Phase 1: Mark all the segments that need to be retained - for (_branch, &last_seg_id) in self.branches.iter() { - let last_seg = &self.segments[last_seg_id]; - let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period); - let mut seg_id = last_seg_id; - loop { - let seg = &mut self.segments[seg_id]; - if seg.end_lsn < cutoff_lsn { - break; - } - seg.needed = true; - if let Some(prev_seg_id) = seg.parent { - seg_id = prev_seg_id; - } else { - break; - } - } - } - - // Phase 2: For each oldest segment in a chain that needs to be retained, - // calculate if we should store snapshot or WAL - self.size_from_snapshot_later(0) - } - - fn size_from_wal(&self, seg_id: usize) -> anyhow::Result { - let seg = &self.segments[seg_id]; - - let this_size = seg.end_lsn - seg.start_lsn; - - let mut children = Vec::new(); - - // try both ways - for &child_id in seg.children_after.iter() { - // try each child both ways - let child = &self.segments[child_id]; - let p1 = self.size_from_wal(child_id)?; - - let p = if !child.needed { - let p2 = self.size_from_snapshot_later(child_id)?; - if p1.total() < p2.total() { - p1 - } else { - p2 - } - } else { - p1 - }; - children.push(p); - } - Ok(SegmentSize { - seg_id, - method: if seg.needed { WalNeeded } else { Wal }, - this_size, - children, - }) - } - - fn size_from_snapshot_later(&self, seg_id: usize) -> anyhow::Result { - // If this is needed, then it's time to do the snapshot and continue - // with wal method. - let seg = &self.segments[seg_id]; - //eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed); - if seg.needed { - let mut children = Vec::new(); - - for &child_id in seg.children_after.iter() { - // try each child both ways - let child = &self.segments[child_id]; - let p1 = self.size_from_wal(child_id)?; - - let p = if !child.needed { - let p2 = self.size_from_snapshot_later(child_id)?; - if p1.total() < p2.total() { - p1 - } else { - p2 - } - } else { - p1 - }; - children.push(p); - } - Ok(SegmentSize { - seg_id, - method: WalNeeded, - this_size: seg.start_size, - children, - }) - } else { - // If any of the direct children are "needed", need to be able to reconstruct here - let mut children_needed = false; - for &child in seg.children_after.iter() { - let seg = &self.segments[child]; - if seg.needed { - children_needed = true; - break; - } - } - - let method1 = if !children_needed { - let mut children = Vec::new(); - for child in seg.children_after.iter() { - children.push(self.size_from_snapshot_later(*child)?); - } - Some(SegmentSize { - seg_id, - method: Skipped, - this_size: 0, - children, - }) - } else { - None - }; - - // If this a junction, consider snapshotting here - let method2 = if children_needed || seg.children_after.len() >= 2 { - let mut children = Vec::new(); - for child in seg.children_after.iter() { - children.push(self.size_from_wal(*child)?); - } - let Some(this_size) = seg.end_size else { anyhow::bail!("no end_size at junction {seg_id}") }; - Some(SegmentSize { - seg_id, - method: SnapshotAfter, - this_size, - children, - }) - } else { - None - }; - - Ok(match (method1, method2) { - (None, None) => anyhow::bail!( - "neither method was applicable: children_after={}, children_needed={}", - seg.children_after.len(), - 
children_needed - ), - (Some(method), None) => method, - (None, Some(method)) => method, - (Some(method1), Some(method2)) => { - if method1.total() < method2.total() { - method1 - } else { - method2 - } - } - }) - } - } - - pub fn into_segments(self) -> Vec { - self.segments - } -} diff --git a/libs/tenant_size_model/src/main.rs b/libs/tenant_size_model/src/main.rs deleted file mode 100644 index e32dd055f491..000000000000 --- a/libs/tenant_size_model/src/main.rs +++ /dev/null @@ -1,269 +0,0 @@ -//! Tenant size model testing ground. -//! -//! Has a number of scenarios and a `main` for invoking these by number, calculating the history -//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios -//! into pngs. - -use tenant_size_model::{Segment, SegmentSize, Storage}; - -// Main branch only. Some updates on it. -fn scenario_1() -> anyhow::Result<(Vec, SegmentSize)> { - // Create main branch - let mut storage = Storage::new("main"); - - // Bulk load 5 GB of data to it - storage.insert("main", 5_000)?; - - // Stream of updates - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - let size = storage.calculate(1000)?; - - Ok((storage.into_segments(), size)) -} - -// Main branch only. Some updates on it. -fn scenario_2() -> anyhow::Result<(Vec, SegmentSize)> { - // Create main branch - let mut storage = Storage::new("main"); - - // Bulk load 5 GB of data to it - storage.insert("main", 5_000)?; - - // Stream of updates - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - // Branch - storage.branch("main", "child")?; - storage.update("child", 1_000)?; - - // More updates on parent - storage.update("main", 1_000)?; - - let size = storage.calculate(1000)?; - - Ok((storage.into_segments(), size)) -} - -// Like 2, but more updates on main -fn scenario_3() -> anyhow::Result<(Vec, SegmentSize)> { - // Create main branch - let mut storage = Storage::new("main"); - - // Bulk load 5 GB of data to it - storage.insert("main", 5_000)?; - - // Stream of updates - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - // Branch - storage.branch("main", "child")?; - storage.update("child", 1_000)?; - - // More updates on parent - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - let size = storage.calculate(1000)?; - - Ok((storage.into_segments(), size)) -} - -// Diverged branches -fn scenario_4() -> anyhow::Result<(Vec, SegmentSize)> { - // Create main branch - let mut storage = Storage::new("main"); - - // Bulk load 5 GB of data to it - storage.insert("main", 5_000)?; - - // Stream of updates - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - // Branch - storage.branch("main", "child")?; - storage.update("child", 1_000)?; - - // More updates on parent - for _ in 0..8 { - storage.update("main", 1_000)?; - } - - let size = storage.calculate(1000)?; - - Ok((storage.into_segments(), size)) -} - -fn scenario_5() -> anyhow::Result<(Vec, SegmentSize)> { - let mut storage = Storage::new("a"); - storage.insert("a", 5000)?; - storage.branch("a", "b")?; - storage.update("b", 4000)?; - storage.update("a", 2000)?; - storage.branch("a", "c")?; - storage.insert("c", 4000)?; - storage.insert("a", 2000)?; - - let size = storage.calculate(5000)?; - - Ok((storage.into_segments(), size)) -} - -fn scenario_6() -> anyhow::Result<(Vec, SegmentSize)> { - use std::borrow::Cow; - - const NO_OP: Cow<'static, str> = Cow::Borrowed(""); - - let branches = [ - Some(0x7ff1edab8182025f15ae33482edb590a_u128), - Some(0xb1719e044db05401a05a2ed588a3ad3f), - 
Some(0xb68d6691c895ad0a70809470020929ef), - ]; - - // compared to other scenarios, this one uses bytes instead of kB - - let mut storage = Storage::new(None); - - storage.branch(&None, branches[0])?; // at 0 - storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128)?; // at 108951064 - storage.branch(&branches[0], branches[1])?; // at 108951064 - storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392)?; // at 124511472 - storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904)?; // at 283415424 - storage.branch(&branches[0], branches[2])?; // at 283415424 - storage.modify_branch(&branches[2], NO_OP, 15906192, 8192)?; // at 299321616 - storage.modify_branch(&branches[0], NO_OP, 18909976, 32768)?; // at 302325400 - - let size = storage.calculate(100_000)?; - - Ok((storage.into_segments(), size)) -} - -fn main() { - let args: Vec = std::env::args().collect(); - - let scenario = if args.len() < 2 { "1" } else { &args[1] }; - - let (segments, size) = match scenario { - "1" => scenario_1(), - "2" => scenario_2(), - "3" => scenario_3(), - "4" => scenario_4(), - "5" => scenario_5(), - "6" => scenario_6(), - other => { - eprintln!("invalid scenario {}", other); - std::process::exit(1); - } - } - .unwrap(); - - graphviz_tree(&segments, &size); -} - -fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) { - use tenant_size_model::SegmentMethod::*; - - let seg_id = node.seg_id; - let seg = segments.get(seg_id).unwrap(); - let lsn = seg.end_lsn; - let size = seg.end_size.unwrap_or(0); - let method = node.method; - - println!(" {{"); - println!(" node [width=0.1 height=0.1 shape=oval]"); - - let tenant_size = node.total_children(); - - let penwidth = if seg.needed { 6 } else { 3 }; - let x = match method { - SnapshotAfter => - format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"), - Wal => - format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"), - WalNeeded => - format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"), - Skipped => - format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"), - }; - - println!(" \"seg{seg_id}\" [{x}]"); - println!(" }}"); - - // Recurse. 
Much of the data is actually on the edge - for child in node.children.iter() { - let child_id = child.seg_id; - graphviz_recurse(segments, child); - - let edge_color = match child.method { - SnapshotAfter => "gray", - Wal => "black", - WalNeeded => "black", - Skipped => "gray", - }; - - println!(" {{"); - println!(" edge [] "); - print!(" \"seg{seg_id}\" -> \"seg{child_id}\" ["); - print!("color={edge_color}"); - if child.method == WalNeeded { - print!(" penwidth=6"); - } - if child.method == Wal { - print!(" penwidth=3"); - } - - let next = segments.get(child_id).unwrap(); - - if next.op.is_empty() { - print!( - " label=\"{} / {}\"", - next.end_lsn - seg.end_lsn, - (next.end_size.unwrap_or(0) as i128 - seg.end_size.unwrap_or(0) as i128) - ); - } else { - print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn); - } - println!("]"); - println!(" }}"); - } -} - -fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) { - println!("digraph G {{"); - println!(" fontname=\"Helvetica,Arial,sans-serif\""); - println!(" node [fontname=\"Helvetica,Arial,sans-serif\"]"); - println!(" edge [fontname=\"Helvetica,Arial,sans-serif\"]"); - println!(" graph [center=1 rankdir=LR]"); - println!(" edge [dir=none]"); - - graphviz_recurse(segments, tree); - - println!("}}"); -} - -#[test] -fn scenarios_return_same_size() { - type ScenarioFn = fn() -> anyhow::Result<(Vec, SegmentSize)>; - let truths: &[(u32, ScenarioFn, _)] = &[ - (line!(), scenario_1, 8000), - (line!(), scenario_2, 9000), - (line!(), scenario_3, 13000), - (line!(), scenario_4, 16000), - (line!(), scenario_5, 17000), - (line!(), scenario_6, 333_792_000), - ]; - - for (line, scenario, expected) in truths { - let (_, size) = scenario().unwrap(); - assert_eq!(*expected, size.total_children(), "scenario on line {line}"); - } -} diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs new file mode 100644 index 000000000000..f26d3aa79d1a --- /dev/null +++ b/libs/tenant_size_model/src/svg.rs @@ -0,0 +1,193 @@ +use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel}; +use std::fmt::Write; + +const SVG_WIDTH: f32 = 500.0; + +struct SvgDraw<'a> { + storage: &'a StorageModel, + branches: &'a [String], + seg_to_branch: &'a [usize], + sizes: &'a [SegmentSizeResult], + + // layout + xscale: f32, + min_lsn: u64, + seg_coordinates: Vec<(f32, f32)>, +} + +fn draw_legend(result: &mut String) -> anyhow::Result<()> { + writeln!( + result, + "" + )?; + writeln!(result, "logical snapshot")?; + writeln!( + result, + "" + )?; + writeln!( + result, + "WAL within retention period" + )?; + writeln!( + result, + "" + )?; + writeln!( + result, + "WAL retained to avoid copy" + )?; + writeln!( + result, + "" + )?; + writeln!(result, "WAL not retained")?; + Ok(()) +} + +pub fn draw_svg( + storage: &StorageModel, + branches: &[String], + seg_to_branch: &[usize], + sizes: &SizeResult, +) -> anyhow::Result { + let mut draw = SvgDraw { + storage, + branches, + seg_to_branch, + sizes: &sizes.segments, + + xscale: 0.0, + min_lsn: 0, + seg_coordinates: Vec::new(), + }; + + let mut result = String::new(); + + writeln!(result, "")?; + + draw.calculate_svg_layout(); + + // Draw the tree + for (seg_id, _seg) in storage.segments.iter().enumerate() { + draw.draw_seg_phase1(seg_id, &mut result)?; + } + + // Draw snapshots + for (seg_id, _seg) in storage.segments.iter().enumerate() { + draw.draw_seg_phase2(seg_id, &mut result)?; + } + + draw_legend(&mut result)?; + + write!(result, "")?; + + Ok(result) +} + +impl<'a> SvgDraw<'a> { + fn 
calculate_svg_layout(&mut self) { + // Find x scale + let segments = &self.storage.segments; + let min_lsn = segments.iter().map(|s| s.lsn).fold(u64::MAX, std::cmp::min); + let max_lsn = segments.iter().map(|s| s.lsn).fold(0, std::cmp::max); + + // Start with 1 pixel = 1 byte. Double the scale until it fits into the image + let mut xscale = 1.0; + while (max_lsn - min_lsn) as f32 / xscale > SVG_WIDTH { + xscale *= 2.0; + } + + // Layout the timelines on Y dimension. + // TODO + let mut y = 100.0; + let mut branch_y_coordinates = Vec::new(); + for _branch in self.branches { + branch_y_coordinates.push(y); + y += 40.0; + } + + // Calculate coordinates for each point + let seg_coordinates = std::iter::zip(segments, self.seg_to_branch) + .map(|(seg, branch_id)| { + let x = (seg.lsn - min_lsn) as f32 / xscale; + let y = branch_y_coordinates[*branch_id]; + (x, y) + }) + .collect(); + + self.xscale = xscale; + self.min_lsn = min_lsn; + self.seg_coordinates = seg_coordinates; + } + + /// Draws lines between points + fn draw_seg_phase1(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> { + let seg = &self.storage.segments[seg_id]; + + let wal_bytes = if let Some(parent_id) = seg.parent { + seg.lsn - self.storage.segments[parent_id].lsn + } else { + 0 + }; + + let style = match self.sizes[seg_id].method { + SegmentMethod::SnapshotHere => "stroke-width=\"1\" stroke=\"gray\"", + SegmentMethod::Wal if seg.needed && wal_bytes > 0 => { + "stroke-width=\"6\" stroke=\"black\"" + } + SegmentMethod::Wal => "stroke-width=\"3\" stroke=\"black\"", + SegmentMethod::Skipped => "stroke-width=\"1\" stroke=\"gray\"", + }; + if let Some(parent_id) = seg.parent { + let (x1, y1) = self.seg_coordinates[parent_id]; + let (x2, y2) = self.seg_coordinates[seg_id]; + + writeln!( + result, + "", + )?; + writeln!( + result, + " {wal_bytes} bytes of WAL (seg {seg_id})" + )?; + writeln!(result, "")?; + } else { + // draw a little dash to mark the starting point of this branch + let (x, y) = self.seg_coordinates[seg_id]; + let (x1, y1) = (x, y - 5.0); + let (x2, y2) = (x, y + 5.0); + + writeln!( + result, + "", + )?; + writeln!(result, " (seg {seg_id})")?; + writeln!(result, "")?; + } + + Ok(()) + } + + /// Draw circles where snapshots are taken + fn draw_seg_phase2(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> { + let seg = &self.storage.segments[seg_id]; + + // draw a snapshot point if it's needed + let (coord_x, coord_y) = self.seg_coordinates[seg_id]; + if self.sizes[seg_id].method == SegmentMethod::SnapshotHere { + writeln!( + result, + "", + )?; + writeln!( + result, + " logical size {}", + seg.size.unwrap() + )?; + write!(result, "")?; + } + + Ok(()) + } +} diff --git a/libs/tenant_size_model/tests/tests.rs b/libs/tenant_size_model/tests/tests.rs new file mode 100644 index 000000000000..7660d41c56e7 --- /dev/null +++ b/libs/tenant_size_model/tests/tests.rs @@ -0,0 +1,313 @@ +//! Tenant size model tests. + +use tenant_size_model::{Segment, SizeResult, StorageModel}; + +use std::collections::HashMap; + +struct ScenarioBuilder { + segments: Vec, + + /// Mapping from the branch name to the index of a segment describing its latest state. + branches: HashMap, +} + +impl ScenarioBuilder { + /// Creates a new storage with the given default branch name. 
+ pub fn new(initial_branch: &str) -> ScenarioBuilder { + let init_segment = Segment { + parent: None, + lsn: 0, + size: Some(0), + needed: false, // determined later + }; + + ScenarioBuilder { + segments: vec![init_segment], + branches: HashMap::from([(initial_branch.into(), 0)]), + } + } + + /// Advances the branch with the named operation, by the relative LSN and logical size bytes. + pub fn modify_branch(&mut self, branch: &str, lsn_bytes: u64, size_bytes: i64) { + let lastseg_id = *self.branches.get(branch).unwrap(); + let newseg_id = self.segments.len(); + let lastseg = &mut self.segments[lastseg_id]; + + let newseg = Segment { + parent: Some(lastseg_id), + lsn: lastseg.lsn + lsn_bytes, + size: Some((lastseg.size.unwrap() as i64 + size_bytes) as u64), + needed: false, + }; + + self.segments.push(newseg); + *self.branches.get_mut(branch).expect("read already") = newseg_id; + } + + pub fn insert(&mut self, branch: &str, bytes: u64) { + self.modify_branch(branch, bytes, bytes as i64); + } + + pub fn update(&mut self, branch: &str, bytes: u64) { + self.modify_branch(branch, bytes, 0i64); + } + + pub fn _delete(&mut self, branch: &str, bytes: u64) { + self.modify_branch(branch, bytes, -(bytes as i64)); + } + + /// Panics if the parent branch cannot be found. + pub fn branch(&mut self, parent: &str, name: &str) { + // Find the right segment + let branchseg_id = *self + .branches + .get(parent) + .expect("should had found the parent by key"); + let _branchseg = &mut self.segments[branchseg_id]; + + // Create branch name for it + self.branches.insert(name.to_string(), branchseg_id); + } + + pub fn calculate(&mut self, retention_period: u64) -> (StorageModel, SizeResult) { + // Phase 1: Mark all the segments that need to be retained + for (_branch, &last_seg_id) in self.branches.iter() { + let last_seg = &self.segments[last_seg_id]; + let cutoff_lsn = last_seg.lsn.saturating_sub(retention_period); + let mut seg_id = last_seg_id; + loop { + let seg = &mut self.segments[seg_id]; + if seg.lsn <= cutoff_lsn { + break; + } + seg.needed = true; + if let Some(prev_seg_id) = seg.parent { + seg_id = prev_seg_id; + } else { + break; + } + } + } + + // Perform the calculation + let storage_model = StorageModel { + segments: self.segments.clone(), + }; + let size_result = storage_model.calculate(); + (storage_model, size_result) + } +} + +// Main branch only. Some updates on it. +#[test] +fn scenario_1() { + // Create main branch + let mut scenario = ScenarioBuilder::new("main"); + + // Bulk load 5 GB of data to it + scenario.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // Calculate the synthetic size with retention horizon 1000 + let (_model, result) = scenario.calculate(1000); + + // The end of the branch is at LSN 10000. Need to retain + // a logical snapshot at LSN 9000, plus the WAL between 9000-10000. + // The logical snapshot has size 5000. + assert_eq!(result.total_size, 5000 + 1000); +} + +// Main branch only. Some updates on it. 
+#[test] +fn scenario_2() { + // Create main branch + let mut scenario = ScenarioBuilder::new("main"); + + // Bulk load 5 GB of data to it + scenario.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // Branch + scenario.branch("main", "child"); + scenario.update("child", 1_000); + + // More updates on parent + scenario.update("main", 1_000); + + // + // The history looks like this now: + // + // 10000 11000 + // *----*----*--------------* main + // | + // | 11000 + // +-------------- child + // + // + // With retention horizon 1000, we need to retain logical snapshot + // at the branch point, size 5000, and the WAL from 10000-11000 on + // both branches. + let (_model, result) = scenario.calculate(1000); + + assert_eq!(result.total_size, 5000 + 1000 + 1000); +} + +// Like 2, but more updates on main +#[test] +fn scenario_3() { + // Create main branch + let mut scenario = ScenarioBuilder::new("main"); + + // Bulk load 5 GB of data to it + scenario.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // Branch + scenario.branch("main", "child"); + scenario.update("child", 1_000); + + // More updates on parent + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // + // The history looks like this now: + // + // 10000 15000 + // *----*----*------------------------------------* main + // | + // | 11000 + // +-------------- child + // + // + // With retention horizon 1000, it's still cheapest to retain + // - snapshot at branch point (size 5000) + // - WAL on child between 10000-11000 + // - WAL on main between 10000-15000 + // + // This is in total 5000 + 1000 + 5000 + // + let (_model, result) = scenario.calculate(1000); + + assert_eq!(result.total_size, 5000 + 1000 + 5000); +} + +// Diverged branches +#[test] +fn scenario_4() { + // Create main branch + let mut scenario = ScenarioBuilder::new("main"); + + // Bulk load 5 GB of data to it + scenario.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // Branch + scenario.branch("main", "child"); + scenario.update("child", 1_000); + + // More updates on parent + for _ in 0..8 { + scenario.update("main", 1_000); + } + + // + // The history looks like this now: + // + // 10000 18000 + // *----*----*------------------------------------* main + // | + // | 11000 + // +-------------- child + // + // + // With retention horizon 1000, it's now cheapest to retain + // separate snapshots on both branches: + // - snapshot on main branch at LSN 17000 (size 5000) + // - WAL on main between 17000-18000 + // - snapshot on child branch at LSN 10000 (size 5000) + // - WAL on child between 10000-11000 + // + // This is in total 5000 + 1000 + 5000 + 1000 = 12000 + // + // (If we used the the method from the previous scenario, and + // kept only snapshot at the branch point, we'd need to keep + // all the WAL between 10000-18000 on the main branch, so + // the total size would be 5000 + 1000 + 8000 = 14000. 
The + // calculation always picks the cheapest alternative) + + let (_model, result) = scenario.calculate(1000); + + assert_eq!(result.total_size, 5000 + 1000 + 5000 + 1000); +} + +#[test] +fn scenario_5() { + let mut scenario = ScenarioBuilder::new("a"); + scenario.insert("a", 5000); + scenario.branch("a", "b"); + scenario.update("b", 4000); + scenario.update("a", 2000); + scenario.branch("a", "c"); + scenario.insert("c", 4000); + scenario.insert("a", 2000); + + let (_model, result) = scenario.calculate(1000); + + assert_eq!(result.total_size, 17000); +} + +#[test] +fn scenario_6() { + let branches = [ + "7ff1edab8182025f15ae33482edb590a", + "b1719e044db05401a05a2ed588a3ad3f", + "0xb68d6691c895ad0a70809470020929ef", + ]; + + // compared to other scenarios, this one uses bytes instead of kB + + let mut scenario = ScenarioBuilder::new(""); + + scenario.branch("", branches[0]); // at 0 + scenario.modify_branch(branches[0], 108951064, 43696128); // at 108951064 + scenario.branch(branches[0], branches[1]); // at 108951064 + scenario.modify_branch(branches[1], 15560408, -1851392); // at 124511472 + scenario.modify_branch(branches[0], 174464360, -1531904); // at 283415424 + scenario.branch(branches[0], branches[2]); // at 283415424 + scenario.modify_branch(branches[2], 15906192, 8192); // at 299321616 + scenario.modify_branch(branches[0], 18909976, 32768); // at 302325400 + + let (model, result) = scenario.calculate(100_000); + + // FIXME: We previously calculated 333_792_000. But with this PR, we get + // a much lower number. At a quick look at the model output and the + // calculations here, the new result seems correct to me. + eprintln!( + " MODEL: {}", + serde_json::to_string(&model.segments).unwrap() + ); + eprintln!( + "RESULT: {}", + serde_json::to_string(&result.segments).unwrap() + ); + + assert_eq!(result.total_size, 136_236_928); +} diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 02684d3d16c1..f770622a601f 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -45,3 +45,115 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> { Ok(()) } + +/// Disable the default rust panic hook by using `set_hook`. +/// +/// For neon binaries, the assumption is that tracing is configured before with [`init`], after +/// that sentry is configured (if needed). sentry will install it's own on top of this, always +/// processing the panic before we log it. +/// +/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr). +/// If the assumptions about the initialization order are not held, use +/// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be +/// lost. +#[must_use] +pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard { + std::panic::set_hook(Box::new(tracing_panic_hook)); + TracingPanicHookGuard::new() +} + +/// Drop guard which restores the std panic hook on drop. +/// +/// Tracing should not be used when it's not configured, but we cannot really latch on to any +/// imaginary lifetime of tracing. +pub struct TracingPanicHookGuard { + act: bool, +} + +impl TracingPanicHookGuard { + fn new() -> Self { + TracingPanicHookGuard { act: true } + } + + /// Make this hook guard not do anything when dropped. + pub fn forget(&mut self) { + self.act = false; + } +} + +impl Drop for TracingPanicHookGuard { + fn drop(&mut self) { + if self.act { + let _ = std::panic::take_hook(); + } + } +} + +/// Named symbol for our panic hook, which logs the panic. 
+fn tracing_panic_hook(info: &std::panic::PanicInfo) { + // following rust 1.66.1 std implementation: + // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288 + let location = info.location(); + + let msg = match info.payload().downcast_ref::<&'static str>() { + Some(s) => *s, + None => match info.payload().downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + + let thread = std::thread::current(); + let thread = thread.name().unwrap_or(""); + let backtrace = std::backtrace::Backtrace::capture(); + + let _entered = if let Some(location) = location { + tracing::error_span!("panic", %thread, location = %PrettyLocation(location)) + } else { + // very unlikely to hit here, but the guarantees of std could change + tracing::error_span!("panic", %thread) + } + .entered(); + + if backtrace.status() == std::backtrace::BacktraceStatus::Captured { + // this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really + // get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to + // string, maybe even to a TLS one but tracing already does that. + tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}"); + } else { + tracing::error!("{msg}"); + } + + // ensure that we log something on the panic if this hook is left after tracing has been + // unconfigured. worst case when teardown is racing the panic is to log the panic twice. + tracing::dispatcher::get_default(|d| { + if let Some(_none) = d.downcast_ref::() { + let location = location.map(PrettyLocation); + log_panic_to_stderr(thread, msg, location, &backtrace); + } + }); +} + +#[cold] +fn log_panic_to_stderr( + thread: &str, + msg: &str, + location: Option>, + backtrace: &std::backtrace::Backtrace, +) { + eprintln!("panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}"); +} + +struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>); + +impl std::fmt::Display for PrettyLocation<'_, '_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column()) + } +} + +impl std::fmt::Debug for PrettyLocation<'_, '_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + ::fmt(self, f) + } +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 50eefa8c7787..01a2c85d7496 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -88,6 +88,13 @@ fn main() -> anyhow::Result<()> { } }; + // Initialize logging, which must be initialized before the custom panic hook is installed. + logging::init(conf.log_format)?; + + // mind the order required here: 1. logging, 2. panic_hook, 3. sentry. + // disarming this hook on pageserver, because we never tear down tracing. + logging::replace_panic_hook_with_tracing_panic_hook().forget(); + // initialize sentry if SENTRY_DSN is provided let _sentry_guard = init_sentry( Some(GIT_VERSION.into()), @@ -210,9 +217,6 @@ fn start_pageserver( launch_ts: &'static LaunchTimestamp, conf: &'static PageServerConf, ) -> anyhow::Result<()> { - // Initialize logging - logging::init(conf.log_format)?; - // Print version and launch timestamp to the log, // and expose them as prometheus metrics. // A changed version string indicates changed software. 
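The pageserver change above depends on the initialization order described in the new logging helpers: tracing first, the panic hook second, sentry last. A minimal sketch of that ordering for a hypothetical binary follows; only logging::init and replace_panic_hook_with_tracing_panic_hook come from this patch, while the function name and the LogFormat variant are illustrative assumptions.

    use utils::logging::{self, LogFormat};

    fn init_observability() -> anyhow::Result<()> {
        // 1. Tracing/logging first, so the panic hook has a subscriber to write to.
        logging::init(LogFormat::Plain)?;

        // 2. Panic hook second. `forget()` keeps it installed for the whole process
        //    lifetime, which is what pageserver wants since it never tears down tracing.
        logging::replace_panic_hook_with_tracing_panic_hook().forget();

        // 3. Sentry last (init_sentry(...) in the binaries), so its wrapper hook runs
        //    before ours and the panic reaches sentry before it is logged.
        Ok(())
    }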
diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index a730d393395e..8916d4f1c978 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -25,7 +25,7 @@ const REMOTE_STORAGE_SIZE: &str = "remote_storage_size"; const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size"; #[serde_as] -#[derive(Serialize)] +#[derive(Serialize, Debug)] struct Ids { #[serde_as(as = "DisplayFromStr")] tenant_id: TenantId, @@ -75,7 +75,7 @@ pub async fn collect_metrics( // define client here to reuse it for all requests let client = reqwest::Client::new(); let mut cached_metrics: HashMap = HashMap::new(); - let mut prev_iteration_time: Option = None; + let mut prev_iteration_time: std::time::Instant = std::time::Instant::now(); loop { tokio::select! { @@ -86,11 +86,11 @@ pub async fn collect_metrics( _ = ticker.tick() => { // send cached metrics every cached_metric_collection_interval - let send_cached = prev_iteration_time - .map(|x| x.elapsed() >= cached_metric_collection_interval) - .unwrap_or(false); + let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval; - prev_iteration_time = Some(std::time::Instant::now()); + if send_cached { + prev_iteration_time = std::time::Instant::now(); + } collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await; } @@ -287,6 +287,12 @@ pub async fn collect_metrics_iteration( } } else { error!("metrics endpoint refused the sent metrics: {:?}", res); + for metric in chunk_to_send.iter() { + // Report if the metric value is suspiciously large + if metric.value > (1u64 << 40) { + error!("potentially abnormal metric value: {:?}", metric); + } + } } } Err(err) => { diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index fc271fe83b7b..e68ceb2dc6cf 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -437,6 +437,13 @@ paths: type: boolean description: | When true, skip calculation and only provide the model inputs (for debugging). Defaults to false. + - name: retention_period + in: query + required: false + schema: + type: integer + description: | + Override the default retention period (in bytes) used for size calculation. get: description: | Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes). 
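A rough sketch of how a client could exercise the new retention_period query parameter against the renamed /v1/tenant/:tenant_id/synthetic_size endpoint (see the routes.rs changes below). The base URL, tenant id, and the use of reqwest/serde_json/anyhow are assumptions for illustration, not part of the patch.

    // Requires the `reqwest` crate (with the `json` feature), `serde_json` and `anyhow`.
    async fn fetch_synthetic_size(
        base_url: &str, // pageserver HTTP listen address, e.g. "http://127.0.0.1:<http_port>"
        tenant_id: &str,
    ) -> anyhow::Result<serde_json::Value> {
        let client = reqwest::Client::new();
        let url = format!("{base_url}/v1/tenant/{tenant_id}/synthetic_size");
        let resp = client
            .get(url)
            // Override the cutoff used for the calculation; it only takes effect
            // if it is shorter than the tenant's real GC cutoff.
            .query(&[("retention_period", "100000")])
            .send()
            .await?
            .error_for_status()?;
        Ok(resp.json().await?)
    }

Sending an "Accept: text/html" header instead of expecting JSON returns the HTML/SVG visualization produced by synthetic_size_html_response in the routes.rs changes below.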
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6a9232e097aa..d2d9f24efbe8 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -7,19 +7,21 @@ use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; use remote_storage::GenericRemoteStorage; +use tenant_size_model::{SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, - TimelineCreateRequest, TimelineInfo, + TimelineCreateRequest, TimelineGcRequest, TimelineInfo, }; use crate::context::{DownloadBehavior, RequestContext}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; use crate::tenant::mgr::TenantMapInsertError; +use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::{PageReconstructError, Timeline}; use crate::{config::PageServerConf, tenant::mgr}; @@ -38,7 +40,7 @@ use utils::{ // Imports only used for testing APIs #[cfg(feature = "testing")] -use super::models::{ConfigureFailpointsRequest, TimelineGcRequest}; +use super::models::ConfigureFailpointsRequest; struct State { conf: &'static PageServerConf, @@ -479,11 +481,19 @@ async fn tenant_status(request: Request) -> Result, ApiErro /// to debug any of the calculations. Requires `tenant_id` request parameter, supports /// `inputs_only=true|false` (default false) which supports debugging failure to calculate model /// values. +/// +/// 'retention_period' query parameter overrides the cutoff that is used to calculate the size +/// (only if it is shorter than the real cutoff). +/// +/// Note: we don't update the cached size and prometheus metric here. +/// The retention period might be different, and it's nice to have a method to just calculate it +/// without modifying anything anyway. async fn tenant_size_handler(request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - let inputs_only: Option = parse_query_param(&request, "inputs_only")?; + let retention_period: Option = parse_query_param(&request, "retention_period")?; + let headers = request.headers(); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let tenant = mgr::get_tenant(tenant_id, true) @@ -492,24 +502,29 @@ async fn tenant_size_handler(request: Request) -> Result, A // this can be long operation let inputs = tenant - .gather_size_inputs(&ctx) + .gather_size_inputs(retention_period, &ctx) .await .map_err(ApiError::InternalServerError)?; - let size = if !inputs_only.unwrap_or(false) { - Some( - tenant - .calc_and_update_cached_synthetic_size(&inputs) - .map_err(ApiError::InternalServerError)?, - ) - } else { - None - }; + let mut sizes = None; + if !inputs_only.unwrap_or(false) { + let storage_model = inputs + .calculate_model() + .map_err(ApiError::InternalServerError)?; + let size = storage_model.calculate(); - /// Private response type with the additional "unstable" `inputs` field. - /// - /// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is - /// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`. 
+ // If request header expects html, return html + if headers["Accept"] == "text/html" { + return synthetic_size_html_response(inputs, storage_model, size); + } + sizes = Some(size); + } else if headers["Accept"] == "text/html" { + return Err(ApiError::BadRequest(anyhow!( + "inputs_only parameter is incompatible with html output request" + ))); + } + + /// The type resides in the pageserver not to expose `ModelInputs`. #[serde_with::serde_as] #[derive(serde::Serialize)] struct TenantHistorySize { @@ -519,6 +534,9 @@ async fn tenant_size_handler(request: Request) -> Result, A /// /// Will be none if `?inputs_only=true` was given. size: Option, + /// Size of each segment used in the model. + /// Will be null if `?inputs_only=true` was given. + segment_sizes: Option>, inputs: crate::tenant::size::ModelInputs, } @@ -526,7 +544,8 @@ async fn tenant_size_handler(request: Request) -> Result, A StatusCode::OK, TenantHistorySize { id: tenant_id, - size, + size: sizes.as_ref().map(|x| x.total_size), + segment_sizes: sizes.map(|x| x.segments), inputs, }, ) @@ -591,6 +610,62 @@ async fn evict_timeline_layer_handler(request: Request) -> Result Result, ApiError> { + let mut timeline_ids: Vec = Vec::new(); + let mut timeline_map: HashMap = HashMap::new(); + for (index, ti) in inputs.timeline_inputs.iter().enumerate() { + timeline_map.insert(ti.timeline_id, index); + timeline_ids.push(ti.timeline_id.to_string()); + } + let seg_to_branch: Vec = inputs + .segments + .iter() + .map(|seg| *timeline_map.get(&seg.timeline_id).unwrap()) + .collect(); + + let svg = + tenant_size_model::svg::draw_svg(&storage_model, &timeline_ids, &seg_to_branch, &sizes) + .map_err(ApiError::InternalServerError)?; + + let mut response = String::new(); + + use std::fmt::Write; + write!(response, "\n\n").unwrap(); + write!(response, "
<div>\n{svg}\n</div>").unwrap();
+    writeln!(response, "Project size: {}", sizes.total_size).unwrap();
+    writeln!(response, "<pre>").unwrap();
+    writeln!(
+        response,
+        "{}",
+        serde_json::to_string_pretty(&inputs).unwrap()
+    )
+    .unwrap();
+    writeln!(
+        response,
+        "{}",
+        serde_json::to_string_pretty(&sizes.segments).unwrap()
+    )
+    .unwrap();
+    writeln!(response, "
").unwrap(); + write!(response, "\n\n").unwrap(); + + html_response(StatusCode::OK, response) +} + +pub fn html_response(status: StatusCode, data: String) -> Result, ApiError> { + let response = Response::builder() + .status(status) + .header(hyper::header::CONTENT_TYPE, "text/html") + .body(Body::from(data.as_bytes().to_vec())) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + Ok(response) +} + // Helper function to standardize the error messages we produce on bad durations // // Intended to be used with anyhow's `with_context`, e.g.: @@ -850,7 +925,6 @@ async fn failpoints_handler(mut request: Request) -> Result } // Run GC immediately on given timeline. -#[cfg(feature = "testing")] async fn timeline_gc_handler(mut request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -954,6 +1028,17 @@ async fn active_timeline_of_active_tenant( .map_err(ApiError::NotFound) } +async fn always_panic_handler(req: Request) -> Result, ApiError> { + // Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook(). + // For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it. + // Use catch_unwind to ensure that tokio nor hyper are distracted by our panic. + let query = req.uri().query(); + let _ = std::panic::catch_unwind(|| { + panic!("unconditional panic for testing panic hook integration; request query: {query:?}") + }); + json_response(StatusCode::NO_CONTENT, ()) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -1019,7 +1104,7 @@ pub fn make_router( .get("/v1/tenant", tenant_list_handler) .post("/v1/tenant", tenant_create_handler) .get("/v1/tenant/:tenant_id", tenant_status) - .get("/v1/tenant/:tenant_id/size", tenant_size_handler) + .get("/v1/tenant/:tenant_id/synthetic_size", tenant_size_handler) .put("/v1/tenant/config", update_tenant_config_handler) .get("/v1/tenant/:tenant_id/config", get_tenant_config_handler) .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler) @@ -1038,7 +1123,7 @@ pub fn make_router( ) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", - testing_api!("run timeline GC", timeline_gc_handler), + timeline_gc_handler, ) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/compact", @@ -1072,5 +1157,6 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", evict_timeline_layer_handler, ) + .get("/v1/panic", always_panic_handler) .any(handler_404)) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 23210b98d5bd..9e9c98ad62af 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2418,6 +2418,9 @@ impl Tenant { #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] pub async fn gather_size_inputs( &self, + // `max_retention_period` overrides the cutoff that is used to calculate the size + // (only if it is shorter than the real cutoff). + max_retention_period: Option, ctx: &RequestContext, ) -> anyhow::Result { let logical_sizes_at_once = self @@ -2425,32 +2428,41 @@ impl Tenant { .concurrent_tenant_size_logical_size_queries .inner(); - // TODO: Having a single mutex block concurrent reads is unfortunate, but since the queries - // are for testing/experimenting, we tolerate this. + // TODO: Having a single mutex block concurrent reads is not great for performance. 
+ // + // But the only case where we need to run multiple of these at once is when we + // request a size for a tenant manually via API, while another background calculation + // is in progress (which is not a common case). // // See more for on the issue #2748 condenced out of the initial PR review. let mut shared_cache = self.cached_logical_sizes.lock().await; - size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache, ctx).await + size::gather_inputs( + self, + logical_sizes_at_once, + max_retention_period, + &mut shared_cache, + ctx, + ) + .await } - /// Calculate synthetic tenant size + /// Calculate synthetic tenant size and cache the result. /// This is periodically called by background worker. /// result is cached in tenant struct #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result { - let inputs = self.gather_size_inputs(ctx).await?; + let inputs = self.gather_size_inputs(None, ctx).await?; - self.calc_and_update_cached_synthetic_size(&inputs) - } - - /// Calculate synthetic size , cache it and set metric value - pub fn calc_and_update_cached_synthetic_size( - &self, - inputs: &size::ModelInputs, - ) -> anyhow::Result { let size = inputs.calculate()?; + self.set_cached_synthetic_size(size); + + Ok(size) + } + + /// Cache given synthetic size and update the metric value + pub fn set_cached_synthetic_size(&self, size: u64) { self.cached_synthetic_tenant_size .store(size, Ordering::Relaxed); @@ -2458,8 +2470,6 @@ impl Tenant { .get_metric_with_label_values(&[&self.tenant_id.to_string()]) .unwrap() .set(size); - - Ok(size) } pub fn get_cached_synthetic_size(&self) -> u64 { diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index e446e34f4e0f..8d7d9c6f8f05 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -154,6 +154,12 @@ where expected: &Arc, new: Arc, ) -> anyhow::Result>> { + fail::fail_point!("layermap-replace-notfound", |_| Ok( + // this is not what happens if an L0 layer was not found a anyhow error but perhaps + // that should be changed. this is good enough to show a replacement failure. + Replacement::NotFound + )); + self.layer_map.replace_historic_noflush(expected, new) } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index a74dfdea04ae..a44cb02b4dd1 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -540,13 +540,11 @@ where } } -#[cfg(feature = "testing")] use { crate::repository::GcResult, pageserver_api::models::TimelineGcRequest, utils::http::error::ApiError, }; -#[cfg(feature = "testing")] pub async fn immediate_gc( tenant_id: TenantId, timeline_id: TimelineId, diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 985b480a764c..7049a0bd665a 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -571,14 +571,15 @@ impl RemoteTimelineClient { Ok(()) } - /// /// Launch a delete operation in the background. /// + /// The operation does not modify local state but assumes the local files have already been + /// deleted, and is used to mirror those changes to remote. + /// /// Note: This schedules an index file upload before the deletions. The /// deletion won't actually be performed, until any previously scheduled /// upload operations, and the index file upload, have completed /// succesfully. 
- /// pub fn schedule_layer_file_deletion( self: &Arc, names: &[LayerFileName], diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 2fed4f88b3cc..a41889f16d2c 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -1,8 +1,9 @@ use std::cmp; +use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use anyhow::Context; +use anyhow::{bail, Context}; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; @@ -10,35 +11,80 @@ use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use super::Tenant; +use crate::tenant::Timeline; use utils::id::TimelineId; use utils::lsn::Lsn; use tracing::*; +use tenant_size_model::{Segment, StorageModel}; + /// Inputs to the actual tenant sizing model /// /// Implements [`serde::Serialize`] but is not meant to be part of the public API, instead meant to /// be a transferrable format between execution environments and developer. +/// +/// This tracks more information than the actual StorageModel that calculation +/// needs. We will convert this into a StorageModel when it's time to perform +/// the calculation. +/// #[serde_with::serde_as] #[derive(Debug, serde::Serialize, serde::Deserialize)] pub struct ModelInputs { - updates: Vec, - retention_period: u64, - - /// Relevant lsns per timeline. - /// - /// This field is not required for deserialization purposes, which is mostly used in tests. The - /// LSNs explain the outcome (updates) but are not needed in size calculation. - #[serde_as(as = "HashMap")] - #[serde(default)] - timeline_inputs: HashMap, + pub segments: Vec, + pub timeline_inputs: Vec, +} + +/// A [`Segment`], with some extra information for display purposes +#[serde_with::serde_as] +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub struct SegmentMeta { + pub segment: Segment, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub timeline_id: TimelineId, + pub kind: LsnKind, +} + +impl SegmentMeta { + fn size_needed(&self) -> bool { + match self.kind { + LsnKind::BranchStart => { + // If we don't have a later GcCutoff point on this branch, and + // no ancestor, calculate size for the branch start point. + self.segment.needed && self.segment.parent.is_none() + } + LsnKind::BranchPoint => true, + LsnKind::GcCutOff => true, + LsnKind::BranchEnd => false, + } + } +} + +#[derive( + Debug, Clone, Copy, Eq, Ord, PartialEq, PartialOrd, serde::Serialize, serde::Deserialize, +)] +pub enum LsnKind { + /// A timeline starting here + BranchStart, + /// A child timeline branches off from here + BranchPoint, + /// GC cutoff point + GcCutOff, + /// Last record LSN + BranchEnd, } /// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as /// part of [`ModelInputs`] from the HTTP api, explaining the inputs. 
#[serde_with::serde_as] #[derive(Debug, serde::Serialize, serde::Deserialize)] -struct TimelineInputs { +pub struct TimelineInputs { + #[serde_as(as = "serde_with::DisplayFromStr")] + pub timeline_id: TimelineId, + + #[serde_as(as = "Option")] + pub ancestor_id: Option, + #[serde_as(as = "serde_with::DisplayFromStr")] ancestor_lsn: Lsn, #[serde_as(as = "serde_with::DisplayFromStr")] @@ -49,118 +95,14 @@ struct TimelineInputs { horizon_cutoff: Lsn, #[serde_as(as = "serde_with::DisplayFromStr")] pitr_cutoff: Lsn, + + /// Cutoff point based on GC settings #[serde_as(as = "serde_with::DisplayFromStr")] next_gc_cutoff: Lsn, -} - -// Adjust BranchFrom sorting so that we always process ancestor -// before descendants. This is needed to correctly calculate size of -// descendant timelines. -// -// Note that we may have multiple BranchFroms at the same LSN, so we -// need to sort them in the tree order. -// -// see updates_sort_with_branches_at_same_lsn test below -fn sort_updates_in_tree_order(updates: Vec) -> anyhow::Result> { - let mut sorted_updates = Vec::with_capacity(updates.len()); - let mut known_timelineids = HashSet::new(); - let mut i = 0; - while i < updates.len() { - let curr_upd = &updates[i]; - - if let Command::BranchFrom(parent_id) = curr_upd.command { - let parent_id = match parent_id { - Some(parent_id) if known_timelineids.contains(&parent_id) => { - // we have already processed ancestor - // process this BranchFrom Update normally - known_timelineids.insert(curr_upd.timeline_id); - sorted_updates.push(*curr_upd); - i += 1; - continue; - } - None => { - known_timelineids.insert(curr_upd.timeline_id); - sorted_updates.push(*curr_upd); - i += 1; - continue; - } - Some(parent_id) => parent_id, - }; - - let mut j = i; - - // we have not processed ancestor yet. - // there is a chance that it is at the same Lsn - if !known_timelineids.contains(&parent_id) { - let mut curr_lsn_branchfroms: HashMap> = - HashMap::new(); - - // inspect all branchpoints at the same lsn - while j < updates.len() && updates[j].lsn == curr_upd.lsn { - let lookahead_upd = &updates[j]; - j += 1; - - if let Command::BranchFrom(lookahead_parent_id) = lookahead_upd.command { - match lookahead_parent_id { - Some(lookahead_parent_id) - if !known_timelineids.contains(&lookahead_parent_id) => - { - // we have not processed ancestor yet - // store it for later - let es = - curr_lsn_branchfroms.entry(lookahead_parent_id).or_default(); - es.push((lookahead_upd.timeline_id, j)); - } - _ => { - // we have already processed ancestor - // process this BranchFrom Update normally - known_timelineids.insert(lookahead_upd.timeline_id); - sorted_updates.push(*lookahead_upd); - } - } - } - } - - // process BranchFroms in the tree order - // check that we don't have a cycle if somet entry is orphan - // (this should not happen, but better to be safe) - let mut processed_some_entry = true; - while processed_some_entry { - processed_some_entry = false; - - curr_lsn_branchfroms.retain(|parent_id, branchfroms| { - if known_timelineids.contains(parent_id) { - for (timeline_id, j) in branchfroms { - known_timelineids.insert(*timeline_id); - sorted_updates.push(updates[*j - 1]); - } - processed_some_entry = true; - false - } else { - true - } - }); - } - - if !curr_lsn_branchfroms.is_empty() { - // orphans are expected to be rare and transient between tenant reloads - // for example, an broken ancestor without the child branch being broken. 
- anyhow::bail!( - "orphan branch(es) detected in BranchFroms: {curr_lsn_branchfroms:?}" - ); - } - } - assert!(j > i); - i = j; - } else { - // not a BranchFrom, keep the same order - sorted_updates.push(*curr_upd); - i += 1; - } - } - - Ok(sorted_updates) + /// Cutoff point calculated from the user-supplied 'max_retention_period' + #[serde_as(as = "Option")] + retention_param_cutoff: Option, } /// Gathers the inputs for the tenant sizing model. @@ -181,257 +123,264 @@ fn sort_updates_in_tree_order(updates: Vec) -> anyhow::Result, + max_retention_period: Option, logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, ctx: &RequestContext, ) -> anyhow::Result { - // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to - // our advantage with `?` error handling. - let mut joinset = tokio::task::JoinSet::new(); - // refresh is needed to update gc related pitr_cutoff and horizon_cutoff tenant .refresh_gc_info(ctx) .await .context("Failed to refresh gc_info before gathering inputs")?; - let timelines = tenant.list_timelines(); + // Collect information about all the timelines + let mut timelines = tenant.list_timelines(); if timelines.is_empty() { // perhaps the tenant has just been created, and as such doesn't have any data yet return Ok(ModelInputs { - updates: vec![], - retention_period: 0, - timeline_inputs: HashMap::default(), + segments: vec![], + timeline_inputs: Vec::new(), }); } - // record the used/inserted cache keys here, to remove extras not to start leaking - // after initial run the cache should be quite stable, but live timelines will eventually - // require new lsns to be inspected. - let mut needed_cache = HashSet::<(TimelineId, Lsn)>::new(); + // Filter out timelines that are not active + // + // There may be a race when a timeline is dropped, + // but it is unlikely to cause any issues. In the worst case, + // the calculation will error out. + timelines.retain(|t| t.is_active()); + + // Build a map of branch points. + let mut branchpoints: HashMap> = HashMap::new(); + for timeline in timelines.iter() { + if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { + branchpoints + .entry(ancestor_id) + .or_default() + .insert(timeline.get_ancestor_lsn()); + } + } - let mut updates = Vec::new(); + // These become the final result. + let mut timeline_inputs = Vec::with_capacity(timelines.len()); + let mut segments: Vec = Vec::new(); - // record the per timeline values useful to debug the model inputs, also used to track - // ancestor_lsn without keeping a hold of Timeline - let mut timeline_inputs = HashMap::with_capacity(timelines.len()); + // + // Build Segments representing each timeline. As we do that, also remember + // the branchpoints and branch startpoints in 'branchpoint_segments' and + // 'branchstart_segments' + // - // used to determine the `retention_period` for the size model - let mut max_cutoff_distance = None; + // BranchPoint segments of each timeline + // (timeline, branchpoint LSN) -> segment_id + let mut branchpoint_segments: HashMap<(TimelineId, Lsn), usize> = HashMap::new(); - // mapping from (TimelineId, Lsn) => if this branch point has been handled already via - // GcInfo::retain_lsns or if it needs to have its logical_size calculated. 
- let mut referenced_branch_froms = HashMap::<(TimelineId, Lsn), bool>::new(); - - for timeline in timelines { - if !timeline.is_active() { - anyhow::bail!( - "timeline {} is not active, cannot calculate tenant_size now", - timeline.timeline_id - ); - } + // timeline, Branchpoint seg id, (ancestor, ancestor LSN) + type BranchStartSegment = (TimelineId, usize, Option<(TimelineId, Lsn)>); + let mut branchstart_segments: Vec = Vec::new(); + for timeline in timelines.iter() { + let timeline_id = timeline.timeline_id; let last_record_lsn = timeline.get_last_record_lsn(); + let ancestor_lsn = timeline.get_ancestor_lsn(); - let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = { - // there's a race between the update (holding tenant.gc_lock) and this read but it - // might not be an issue, because it's not for Timeline::gc - let gc_info = timeline.gc_info.read().unwrap(); - - // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a - // new gc run, which we have no control over. however differently from `Timeline::gc` - // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not - // actually removing files. - let next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff); - - // the minimum where we should find the next_gc_cutoff for our calculations. - // - // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we - // want to query any logical size before initdb_lsn. - let cutoff_minimum = cmp::max(timeline.get_ancestor_lsn(), timeline.initdb_lsn); - - let maybe_cutoff = if next_gc_cutoff > cutoff_minimum { - Some((next_gc_cutoff, LsnKind::GcCutOff)) - } else { - None - }; - - // this assumes there are no other lsns than the branchpoints - let lsns = gc_info - .retain_lsns - .iter() - .inspect(|&&lsn| { - trace!( - timeline_id=%timeline.timeline_id, - "retained lsn: {lsn:?}, is_before_ancestor_lsn={}", - lsn < timeline.get_ancestor_lsn() - ) - }) - .filter(|&&lsn| lsn > timeline.get_ancestor_lsn()) - .copied() - .map(|lsn| (lsn, LsnKind::BranchPoint)) - .chain(maybe_cutoff) - .collect::>(); - - ( - lsns, - gc_info.horizon_cutoff, - gc_info.pitr_cutoff, - next_gc_cutoff, - ) + // there's a race between the update (holding tenant.gc_lock) and this read but it + // might not be an issue, because it's not for Timeline::gc + let gc_info = timeline.gc_info.read().unwrap(); + + // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a + // new gc run, which we have no control over. however differently from `Timeline::gc` + // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not + // actually removing files. + let mut next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff); + + // If the caller provided a shorter retention period, use that instead of the GC cutoff. 
+ let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { + let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period)); + if next_gc_cutoff < param_cutoff { + next_gc_cutoff = param_cutoff; + } + Some(param_cutoff) + } else { + None }; - // update this to have a retention_period later for the tenant_size_model - // tenant_size_model compares this to the last segments start_lsn - if let Some(cutoff_distance) = last_record_lsn.checked_sub(next_gc_cutoff) { - match max_cutoff_distance.as_mut() { - Some(max) => { - *max = std::cmp::max(*max, cutoff_distance); - } - _ => { - max_cutoff_distance = Some(cutoff_distance); - } - } + // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we + // want to query any logical size before initdb_lsn. + let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn); + + // Build "interesting LSNs" on this timeline + let mut lsns: Vec<(Lsn, LsnKind)> = gc_info + .retain_lsns + .iter() + .filter(|&&lsn| lsn > ancestor_lsn) + .copied() + // this assumes there are no other retain_lsns than the branchpoints + .map(|lsn| (lsn, LsnKind::BranchPoint)) + .collect::>(); + + // Add branch points we collected earlier, just in case there were any that were + // not present in retain_lsns. We will remove any duplicates below later. + if let Some(this_branchpoints) = branchpoints.get(&timeline_id) { + lsns.extend( + this_branchpoints + .iter() + .map(|lsn| (*lsn, LsnKind::BranchPoint)), + ) } - // all timelines branch from something, because it might be impossible to pinpoint - // which is the tenant_size_model's "default" branch. - - let ancestor_lsn = timeline.get_ancestor_lsn(); + // Add a point for the GC cutoff + let branch_start_needed = next_gc_cutoff <= branch_start_lsn; + if !branch_start_needed { + lsns.push((next_gc_cutoff, LsnKind::GcCutOff)); + } - updates.push(Update { - lsn: ancestor_lsn, - command: Command::BranchFrom(timeline.get_ancestor_timeline_id()), + lsns.sort_unstable(); + lsns.dedup(); + + // + // Create Segments for the interesting points. + // + + // Timeline start point + let ancestor = timeline + .get_ancestor_timeline_id() + .map(|ancestor_id| (ancestor_id, ancestor_lsn)); + branchstart_segments.push((timeline_id, segments.len(), ancestor)); + segments.push(SegmentMeta { + segment: Segment { + parent: None, // filled in later + lsn: branch_start_lsn.0, + size: None, // filled in later + needed: branch_start_needed, + }, timeline_id: timeline.timeline_id, + kind: LsnKind::BranchStart, }); - if let Some(parent_timeline_id) = timeline.get_ancestor_timeline_id() { - // refresh_gc_info will update branchpoints and pitr_cutoff but only do it for branches - // which are over gc_horizon. for example, a "main" branch which never received any - // updates apart from initdb not have branch points recorded. 
- referenced_branch_froms - .entry((parent_timeline_id, timeline.get_ancestor_lsn())) - .or_default(); - } - - for (lsn, _kind) in &interesting_lsns { - // mark this visited so don't need to re-process this parent - *referenced_branch_froms - .entry((timeline.timeline_id, *lsn)) - .or_default() = true; - - if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) { - updates.push(Update { - lsn: *lsn, - timeline_id: timeline.timeline_id, - command: Command::Update(*size), - }); - - needed_cache.insert((timeline.timeline_id, *lsn)); - } else { - let timeline = Arc::clone(&timeline); - let parallel_size_calcs = Arc::clone(limit); - let ctx = ctx.attached_child(); - joinset.spawn(calculate_logical_size( - parallel_size_calcs, - timeline, - *lsn, - ctx, - )); + // GC cutoff point, and any branch points, i.e. points where + // other timelines branch off from this timeline. + let mut parent = segments.len() - 1; + for (lsn, kind) in lsns { + if kind == LsnKind::BranchPoint { + branchpoint_segments.insert((timeline_id, lsn), segments.len()); } + segments.push(SegmentMeta { + segment: Segment { + parent: Some(parent), + lsn: lsn.0, + size: None, + needed: lsn > next_gc_cutoff, + }, + timeline_id: timeline.timeline_id, + kind, + }); + parent += 1; } - timeline_inputs.insert( - timeline.timeline_id, - TimelineInputs { - ancestor_lsn, - last_record: last_record_lsn, - // this is not used above, because it might not have updated recently enough - latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), - horizon_cutoff, - pitr_cutoff, - next_gc_cutoff, + // Current end of the timeline + segments.push(SegmentMeta { + segment: Segment { + parent: Some(parent), + lsn: last_record_lsn.0, + size: None, // Filled in later, if necessary + needed: true, }, - ); + timeline_id: timeline.timeline_id, + kind: LsnKind::BranchEnd, + }); + + timeline_inputs.push(TimelineInputs { + timeline_id: timeline.timeline_id, + ancestor_id: timeline.get_ancestor_timeline_id(), + ancestor_lsn, + last_record: last_record_lsn, + // this is not used above, because it might not have updated recently enough + latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), + horizon_cutoff: gc_info.horizon_cutoff, + pitr_cutoff: gc_info.pitr_cutoff, + next_gc_cutoff, + retention_param_cutoff, + }); } - // iterate over discovered branch points and make sure we are getting logical sizes at those - // points. - for ((timeline_id, lsn), handled) in referenced_branch_froms.iter() { - if *handled { - continue; + // We now have all segments from the timelines in 'segments'. The timelines + // haven't been linked to each other yet, though. Do that. + for (_timeline_id, seg_id, ancestor) in branchstart_segments { + // Look up the branch point + if let Some(ancestor) = ancestor { + let parent_id = *branchpoint_segments.get(&ancestor).unwrap(); + segments[seg_id].segment.parent = Some(parent_id); } + } - let timeline_id = *timeline_id; - let lsn = *lsn; + // We left the 'size' field empty in all of the Segments so far. + // Now find logical sizes for all of the points that might need or benefit from them. + fill_logical_sizes(&timelines, &mut segments, limit, logical_size_cache, ctx).await?; - match timeline_inputs.get(&timeline_id) { - Some(inputs) if inputs.ancestor_lsn == lsn => { - // we don't need an update at this branch point which is also point where - // timeline_id branch was branched from. 
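The hunk above replaces the old Update/Command list with a flat vector of segments: each timeline contributes a BranchStart, any BranchPoint/GcCutOff entries, and a BranchEnd, and child BranchStart segments are re-parented onto the parent's BranchPoint in a final linking pass. A rough, hedged illustration of the resulting layout for a hypothetical tenant with one child branch (the field names mirror the Segment fields used in this patch; the struct here is a local stand-in, and the LSNs, indexes and `needed` flags are invented):

```rust
// Local stand-in for tenant_size_model::Segment, reduced to the fields the patch uses.
#[derive(Debug, Clone)]
struct Segment {
    parent: Option<usize>, // index into the same Vec, like the branchpoint_segments values
    lsn: u64,
    size: Option<u64>,     // left as None here; fill_logical_sizes supplies it later
    needed: bool,          // "inside the retention window"
}

fn main() {
    // Main timeline: BranchStart -> GcCutOff -> BranchPoint -> BranchEnd (indexes 0..=3),
    // child timeline branched at LSN 0x300: BranchStart -> GcCutOff -> BranchEnd (4..=6).
    let mut segments = vec![
        Segment { parent: None,    lsn: 0x100, size: None, needed: false }, // 0 BranchStart (main)
        Segment { parent: Some(0), lsn: 0x250, size: None, needed: false }, // 1 GcCutOff    (main)
        Segment { parent: Some(1), lsn: 0x300, size: None, needed: true  }, // 2 BranchPoint (main)
        Segment { parent: Some(2), lsn: 0x400, size: None, needed: true  }, // 3 BranchEnd   (main)
        Segment { parent: None,    lsn: 0x300, size: None, needed: false }, // 4 BranchStart (child)
        Segment { parent: Some(4), lsn: 0x350, size: None, needed: false }, // 5 GcCutOff    (child)
        Segment { parent: Some(5), lsn: 0x500, size: None, needed: true  }, // 6 BranchEnd   (child)
    ];
    // The final linking pass: the child's BranchStart is attached to the parent's
    // BranchPoint that was recorded under (parent timeline, LSN 0x300).
    segments[4].parent = Some(2);
    assert_eq!(segments[4].lsn, segments[2].lsn);
    println!("{segments:#?}");
}
```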
- continue; - } - Some(_) => {} - None => { - // we should have this because we have iterated through all of the timelines - anyhow::bail!("missing timeline_input for {timeline_id}") - } - } + Ok(ModelInputs { + segments, + timeline_inputs, + }) +} - if let Some(size) = logical_size_cache.get(&(timeline_id, lsn)) { - updates.push(Update { - lsn, - timeline_id, - command: Command::Update(*size), - }); +/// Augment 'segments' with logical sizes +/// +/// this will probably conflict with on-demand downloaded layers, or at least force them all +/// to be downloaded +/// +async fn fill_logical_sizes( + timelines: &[Arc], + segments: &mut [SegmentMeta], + limit: &Arc, + logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, + ctx: &RequestContext, +) -> anyhow::Result<()> { + let timeline_hash: HashMap> = HashMap::from_iter( + timelines + .iter() + .map(|timeline| (timeline.timeline_id, Arc::clone(timeline))), + ); - needed_cache.insert((timeline_id, lsn)); - } else { - let timeline = tenant - .get_timeline(timeline_id, false) - .context("find referenced ancestor timeline")?; - let parallel_size_calcs = Arc::clone(limit); - joinset.spawn(calculate_logical_size( - parallel_size_calcs, - timeline.clone(), - lsn, - ctx.attached_child(), - )); - - if let Some(parent_id) = timeline.get_ancestor_timeline_id() { - // we should not find new ones because we iterated tenants all timelines - anyhow::ensure!( - timeline_inputs.contains_key(&parent_id), - "discovered new timeline {parent_id} (parent of {timeline_id})" - ); - } - }; - } + // record the used/inserted cache keys here, to remove extras not to start leaking + // after initial run the cache should be quite stable, but live timelines will eventually + // require new lsns to be inspected. + let mut sizes_needed = HashMap::<(TimelineId, Lsn), Option>::new(); - // finally add in EndOfBranch for all timelines where their last_record_lsn is not a branch - // point. this is needed by the model. - for (timeline_id, inputs) in timeline_inputs.iter() { - let lsn = inputs.last_record; + // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to + // our advantage with `?` error handling. + let mut joinset = tokio::task::JoinSet::new(); - if referenced_branch_froms.contains_key(&(*timeline_id, lsn)) { - // this means that the (timeline_id, last_record_lsn) represents a branch point - // we do not want to add EndOfBranch updates for these points because it doesn't fit - // into the current tenant_size_model. + // For each point that would benefit from having a logical size available, + // spawn a Task to fetch it, unless we have it cached already. 
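As a standalone illustration of the spawn-unless-cached pattern that `fill_logical_sizes` applies in the loop that follows (a sketch only, assuming tokio and anyhow as elsewhere in the crate; the `u64` keys and the `expensive_size` stand-in are invented):

```rust
use std::collections::HashMap;

// Stand-in for the per-(timeline, lsn) logical size calculation.
async fn expensive_size(key: u64) -> u64 {
    key * 2
}

// Consult the cache, spawn JoinSet tasks only for the misses, join them all,
// then prune the cache down to the keys that are still referenced.
async fn fill(keys: &[u64], cache: &mut HashMap<u64, u64>) -> anyhow::Result<HashMap<u64, u64>> {
    let mut results = HashMap::new();
    let mut joinset = tokio::task::JoinSet::new();
    for &key in keys {
        if let Some(&hit) = cache.get(&key) {
            results.insert(key, hit);
        } else {
            joinset.spawn(async move { (key, expensive_size(key).await) });
        }
    }
    while let Some(joined) = joinset.join_next().await {
        let (key, size) = joined?; // a JoinError surfaces here, as in the real code
        cache.insert(key, size);
        results.insert(key, size);
    }
    cache.retain(|key, _| results.contains_key(key)); // mirrors logical_size_cache.retain
    Ok(results)
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let mut cache = HashMap::from([(1u64, 2u64)]);
    let sizes = fill(&[1, 2, 3], &mut cache).await?;
    assert_eq!(sizes[&3], 6);
    Ok(())
}
```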
+ for seg in segments.iter() { + if !seg.size_needed() { continue; } - if lsn > inputs.ancestor_lsn { - // all timelines also have an end point if they have made any progress - updates.push(Update { - lsn, - command: Command::EndOfBranch, - timeline_id: *timeline_id, - }); + let timeline_id = seg.timeline_id; + let lsn = Lsn(seg.segment.lsn); + + if let Entry::Vacant(e) = sizes_needed.entry((timeline_id, lsn)) { + let cached_size = logical_size_cache.get(&(timeline_id, lsn)).cloned(); + if cached_size.is_none() { + let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap()); + let parallel_size_calcs = Arc::clone(limit); + let ctx = ctx.attached_child(); + joinset.spawn(calculate_logical_size( + parallel_size_calcs, + timeline, + lsn, + ctx, + )); + } + e.insert(cached_size); } } + // Perform the size lookups let mut have_any_error = false; - while let Some(res) = joinset.join_next().await { // each of these come with Result, JoinError> // because of spawn + spawn_blocking @@ -460,19 +409,13 @@ pub(super) async fn gather_inputs( debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated"); logical_size_cache.insert((timeline.timeline_id, lsn), size); - needed_cache.insert((timeline.timeline_id, lsn)); - - updates.push(Update { - lsn, - timeline_id: timeline.timeline_id, - command: Command::Update(size), - }); + sizes_needed.insert((timeline.timeline_id, lsn), Some(size)); } } } // prune any keys not needed anymore; we record every used key and added key. - logical_size_cache.retain(|key, _| needed_cache.contains(key)); + logical_size_cache.retain(|key, _| sizes_needed.contains_key(key)); if have_any_error { // we cannot complete this round, because we are missing data. @@ -480,105 +423,47 @@ pub(super) async fn gather_inputs( anyhow::bail!("failed to calculate some logical_sizes"); } - // the data gathered to updates is per lsn, regardless of the branch, so we can use it to - // our advantage, not requiring a sorted container or graph walk. - // - // for branch points, which come as multiple updates at the same LSN, the Command::Update - // is needed before a branch is made out of that branch Command::BranchFrom. this is - // handled by the variant order in `Command`. - // - updates.sort_unstable(); + // Insert the looked up sizes to the Segments + for seg in segments.iter_mut() { + if !seg.size_needed() { + continue; + } - // And another sort to handle Command::BranchFrom ordering - // in case when there are multiple branches at the same LSN. - let sorted_updates = sort_updates_in_tree_order(updates)?; + let timeline_id = seg.timeline_id; + let lsn = Lsn(seg.segment.lsn); - let retention_period = match max_cutoff_distance { - Some(max) => max.0, - None => { - anyhow::bail!("the first branch should have a gc_cutoff after it's branch point at 0") + if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) { + seg.segment.size = Some(*size); + } else { + bail!("could not find size at {} in timeline {}", lsn, timeline_id); } - }; - - Ok(ModelInputs { - updates: sorted_updates, - retention_period, - timeline_inputs, - }) + } + Ok(()) } impl ModelInputs { - pub fn calculate(&self) -> anyhow::Result { - // Option is used for "naming" the branches because it is assumed to be - // impossible to always determine the a one main branch. 
- let mut storage = tenant_size_model::Storage::>::new(None); - - for update in &self.updates { - let Update { - lsn, - command: op, - timeline_id, - } = update; - - let Lsn(now) = *lsn; - match op { - Command::Update(sz) => { - storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz))?; - } - Command::EndOfBranch => { - storage.insert_point(&Some(*timeline_id), "".into(), now, None)?; - } - Command::BranchFrom(parent) => { - // This branch command may fail if it cannot find a parent to branch from. - storage.branch(parent, Some(*timeline_id))?; - } - } - } + pub fn calculate_model(&self) -> anyhow::Result { + // Convert SegmentMetas into plain Segments + let storage = StorageModel { + segments: self + .segments + .iter() + .map(|seg| seg.segment.clone()) + .collect(), + }; - Ok(storage.calculate(self.retention_period)?.total_children()) + Ok(storage) } -} -/// A point of interest in the tree of branches -#[serde_with::serde_as] -#[derive( - Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize, -)] -struct Update { - #[serde_as(as = "serde_with::DisplayFromStr")] - lsn: utils::lsn::Lsn, - command: Command, - #[serde_as(as = "serde_with::DisplayFromStr")] - timeline_id: TimelineId, -} - -#[serde_with::serde_as] -#[derive(PartialOrd, PartialEq, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize)] -#[serde(rename_all = "snake_case")] -enum Command { - Update(u64), - BranchFrom(#[serde_as(as = "Option")] Option), - EndOfBranch, -} + // calculate total project size + pub fn calculate(&self) -> anyhow::Result { + let storage = self.calculate_model()?; + let sizes = storage.calculate(); -impl std::fmt::Debug for Command { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // custom one-line implementation makes it more enjoyable to read {:#?} avoiding 3 - // linebreaks - match self { - Self::Update(arg0) => write!(f, "Update({arg0})"), - Self::BranchFrom(arg0) => write!(f, "BranchFrom({arg0:?})"), - Self::EndOfBranch => write!(f, "EndOfBranch"), - } + Ok(sizes.total_size) } } -#[derive(Debug, Clone, Copy)] -enum LsnKind { - BranchPoint, - GcCutOff, -} - /// Newtype around the tuple that carries the timeline at lsn logical size calculation. 
struct TimelineAtLsnSizeResult( Arc, @@ -604,227 +489,230 @@ async fn calculate_logical_size( Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) } -#[test] -fn updates_sort() { - use std::str::FromStr; - use utils::id::TimelineId; - use utils::lsn::Lsn; - - let ids = [ - TimelineId::from_str("7ff1edab8182025f15ae33482edb590a").unwrap(), - TimelineId::from_str("b1719e044db05401a05a2ed588a3ad3f").unwrap(), - TimelineId::from_str("b68d6691c895ad0a70809470020929ef").unwrap(), - ]; - - // try through all permutations - let ids = [ - [&ids[0], &ids[1], &ids[2]], - [&ids[0], &ids[2], &ids[1]], - [&ids[1], &ids[0], &ids[2]], - [&ids[1], &ids[2], &ids[0]], - [&ids[2], &ids[0], &ids[1]], - [&ids[2], &ids[1], &ids[0]], - ]; - - for ids in ids { - // apply a fixture which uses a permutation of ids - let commands = [ - Update { - lsn: Lsn(0), - command: Command::BranchFrom(None), - timeline_id: *ids[0], - }, - Update { - lsn: Lsn::from_str("0/67E7618").unwrap(), - command: Command::Update(43696128), - timeline_id: *ids[0], - }, - Update { - lsn: Lsn::from_str("0/67E7618").unwrap(), - command: Command::BranchFrom(Some(*ids[0])), - timeline_id: *ids[1], - }, - Update { - lsn: Lsn::from_str("0/76BE4F0").unwrap(), - command: Command::Update(41844736), - timeline_id: *ids[1], - }, - Update { - lsn: Lsn::from_str("0/10E49380").unwrap(), - command: Command::Update(42164224), - timeline_id: *ids[0], - }, - Update { - lsn: Lsn::from_str("0/10E49380").unwrap(), - command: Command::BranchFrom(Some(*ids[0])), - timeline_id: *ids[2], - }, - Update { - lsn: Lsn::from_str("0/11D74910").unwrap(), - command: Command::Update(42172416), - timeline_id: *ids[2], - }, - Update { - lsn: Lsn::from_str("0/12051E98").unwrap(), - command: Command::Update(42196992), - timeline_id: *ids[0], - }, - ]; - - let mut sorted = commands; - - // these must sort in the same order, regardless of how the ids sort - // which is why the timeline_id is the last field - sorted.sort_unstable(); - - assert_eq!(commands, sorted, "{:#?} vs. {:#?}", commands, sorted); - } -} - #[test] fn verify_size_for_multiple_branches() { // this is generated from integration test test_tenant_size_with_multiple_branches, but this way // it has the stable lsn's // - // timelineinputs have been left out, because those explain the inputs, but don't participate - // in further size calculations. - let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072}"#; - + // The timeline_inputs don't participate in the size calculation, and are here just to explain + // the inputs. 
+ let doc = r#" +{ + "segments": [ + { + "segment": { + "parent": 9, + "lsn": 26033560, + "size": null, + "needed": false + }, + "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", + "kind": "BranchStart" + }, + { + "segment": { + "parent": 0, + "lsn": 35720400, + "size": 25206784, + "needed": false + }, + "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", + "kind": "GcCutOff" + }, + { + "segment": { + "parent": 1, + "lsn": 35851472, + "size": null, + "needed": true + }, + "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", + "kind": "BranchEnd" + }, + { + "segment": { + "parent": 7, + "lsn": 24566168, + "size": null, + "needed": false + }, + "timeline_id": "454626700469f0a9914949b9d018e876", + "kind": "BranchStart" + }, + { + "segment": { + "parent": 3, + "lsn": 25261936, + "size": 26050560, + "needed": false + }, + "timeline_id": "454626700469f0a9914949b9d018e876", + "kind": "GcCutOff" + }, + { + "segment": { + "parent": 4, + "lsn": 25393008, + "size": null, + "needed": true + }, + "timeline_id": "454626700469f0a9914949b9d018e876", + "kind": "BranchEnd" + }, + { + "segment": { + "parent": null, + "lsn": 23694408, + "size": null, + "needed": false + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "BranchStart" + }, + { + "segment": { + "parent": 6, + "lsn": 24566168, + "size": 25739264, + "needed": false + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "BranchPoint" + }, + { + "segment": { + "parent": 7, + "lsn": 25902488, + "size": 26402816, + "needed": false + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "GcCutOff" + }, + { + "segment": { + "parent": 8, + "lsn": 26033560, + "size": 26468352, + "needed": true + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "BranchPoint" + }, + { + "segment": { + "parent": 9, + "lsn": 26033560, + "size": null, + "needed": true + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "BranchEnd" + } + ], + "timeline_inputs": [ + { + "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", + "ancestor_lsn": "0/18D3D98", + "last_record": "0/2230CD0", + "latest_gc_cutoff": "0/1698C48", + "horizon_cutoff": "0/2210CD0", + "pitr_cutoff": "0/2210CD0", + "next_gc_cutoff": "0/2210CD0", + "retention_param_cutoff": null + }, + { + "timeline_id": "454626700469f0a9914949b9d018e876", + "ancestor_lsn": "0/176D998", + "last_record": "0/1837770", + "latest_gc_cutoff": "0/1698C48", + "horizon_cutoff": "0/1817770", + "pitr_cutoff": "0/1817770", + "next_gc_cutoff": "0/1817770", + "retention_param_cutoff": null + }, + { + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "ancestor_lsn": "0/0", + "last_record": "0/18D3D98", + "latest_gc_cutoff": "0/1698C48", + "horizon_cutoff": "0/18B3D98", + "pitr_cutoff": "0/18B3D98", + "next_gc_cutoff": "0/18B3D98", + "retention_param_cutoff": null + } + ] +} +"#; let inputs: ModelInputs = serde_json::from_str(doc).unwrap(); - assert_eq!(inputs.calculate().unwrap(), 36_409_872); + assert_eq!(inputs.calculate().unwrap(), 37_851_408); } #[test] -fn updates_sort_with_branches_at_same_lsn() { - use std::str::FromStr; - use Command::{BranchFrom, EndOfBranch}; - - macro_rules! 
lsn { - ($e:expr) => { - Lsn::from_str($e).unwrap() - }; +fn verify_size_for_one_branch() { + let doc = r#" +{ + "segments": [ + { + "segment": { + "parent": null, + "lsn": 0, + "size": null, + "needed": false + }, + "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", + "kind": "BranchStart" + }, + { + "segment": { + "parent": 0, + "lsn": 305547335776, + "size": 220054675456, + "needed": false + }, + "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", + "kind": "GcCutOff" + }, + { + "segment": { + "parent": 1, + "lsn": 305614444640, + "size": null, + "needed": true + }, + "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", + "kind": "BranchEnd" } + ], + "timeline_inputs": [ + { + "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", + "ancestor_lsn": "0/0", + "last_record": "47/280A5860", + "latest_gc_cutoff": "47/240A5860", + "horizon_cutoff": "47/240A5860", + "pitr_cutoff": "47/240A5860", + "next_gc_cutoff": "47/240A5860", + "retention_param_cutoff": "0/0" + } + ] +}"#; + + let model: ModelInputs = serde_json::from_str(doc).unwrap(); + + let res = model.calculate_model().unwrap().calculate(); - let ids = [ - TimelineId::from_str("00000000000000000000000000000000").unwrap(), - TimelineId::from_str("11111111111111111111111111111111").unwrap(), - TimelineId::from_str("22222222222222222222222222222222").unwrap(), - TimelineId::from_str("33333333333333333333333333333333").unwrap(), - TimelineId::from_str("44444444444444444444444444444444").unwrap(), - ]; - - // issue https://github.com/neondatabase/neon/issues/3179 - let commands = vec![ - Update { - lsn: lsn!("0/0"), - command: BranchFrom(None), - timeline_id: ids[0], - }, - Update { - lsn: lsn!("0/169AD58"), - command: Command::Update(25387008), - timeline_id: ids[0], - }, - // next three are wrongly sorted, because - // ids[1] is branched from before ids[1] exists - // and ids[2] is branched from before ids[2] exists - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[1])), - timeline_id: ids[3], - }, - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[0])), - timeline_id: ids[2], - }, - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[2])), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CA85B8"), - command: Command::Update(28925952), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CD85B8"), - command: Command::Update(29024256), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CD85B8"), - command: BranchFrom(Some(ids[1])), - timeline_id: ids[4], - }, - Update { - lsn: lsn!("0/22DCE70"), - command: Command::Update(32546816), - timeline_id: ids[3], - }, - Update { - lsn: lsn!("0/230CE70"), - command: EndOfBranch, - timeline_id: ids[3], - }, - ]; - - let expected = vec![ - Update { - lsn: lsn!("0/0"), - command: BranchFrom(None), - timeline_id: ids[0], - }, - Update { - lsn: lsn!("0/169AD58"), - command: Command::Update(25387008), - timeline_id: ids[0], - }, - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[0])), - timeline_id: ids[2], - }, - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[2])), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[1])), - timeline_id: ids[3], - }, - Update { - lsn: lsn!("0/1CA85B8"), - command: Command::Update(28925952), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CD85B8"), - command: Command::Update(29024256), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CD85B8"), - command: BranchFrom(Some(ids[1])), - timeline_id: ids[4], - }, - Update { - lsn: 
lsn!("0/22DCE70"), - command: Command::Update(32546816), - timeline_id: ids[3], - }, - Update { - lsn: lsn!("0/230CE70"), - command: EndOfBranch, - timeline_id: ids[3], - }, - ]; - - let sorted_commands = sort_updates_in_tree_order(commands).unwrap(); - - assert_eq!(sorted_commands, expected); + println!("calculated synthetic size: {}", res.total_size); + println!("result: {:?}", serde_json::to_string(&res.segments)); + + use utils::lsn::Lsn; + let latest_gc_cutoff_lsn: Lsn = "47/240A5860".parse().unwrap(); + let last_lsn: Lsn = "47/280A5860".parse().unwrap(); + println!( + "latest_gc_cutoff lsn 47/240A5860 is {}, last_lsn lsn 47/280A5860 is {}", + u64::from(latest_gc_cutoff_lsn), + u64::from(last_lsn) + ); + assert_eq!(res.total_size, 220121784320); } diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 51bb4dcc2afe..8465a9933918 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -49,6 +49,17 @@ pub struct RemoteLayer { access_stats: LayerAccessStats, pub(crate) ongoing_download: Arc, + + /// Has `LayerMap::replace` failed for this (true) or not (false). + /// + /// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`. + /// The field is used to mark a RemoteLayer permanently (until restart or ignore+load) + /// unprocessable, because a LayerMap::replace failed. + /// + /// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids + /// a possible fast loop between `Timeline::get_reconstruct_data` and + /// `Timeline::download_remote_layer`, which also logs. + pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool, } impl std::fmt::Debug for RemoteLayer { @@ -207,6 +218,7 @@ impl RemoteLayer { file_name: fname.to_owned().into(), layer_metadata: layer_metadata.clone(), ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + download_replacement_failure: std::sync::atomic::AtomicBool::default(), access_stats, } } @@ -228,6 +240,7 @@ impl RemoteLayer { file_name: fname.to_owned().into(), layer_metadata: layer_metadata.clone(), ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + download_replacement_failure: std::sync::atomic::AtomicBool::default(), access_stats, } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index b126545ee433..e9ce52d1ab37 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -3,7 +3,7 @@ use std::ops::ControlFlow; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, Instant}; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; @@ -11,6 +11,7 @@ use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::mgr; use crate::tenant::{Tenant, TenantState}; +use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::TenantId; @@ -53,37 +54,55 @@ async fn compaction_loop(tenant_id: TenantId) { info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + let cancel = task_mgr::shutdown_token(); let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); + let mut first = true; loop { trace!("waking up"); let tenant = tokio::select! 
{ - _ = task_mgr::shutdown_watcher() => { + _ = cancel.cancelled() => { info!("received cancellation request"); - return; + return; }, tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { ControlFlow::Break(()) => return, ControlFlow::Continue(tenant) => tenant, }, - }; + }; + + let period = tenant.get_compaction_period(); - let mut sleep_duration = tenant.get_compaction_period(); - if sleep_duration == Duration::ZERO { + // TODO: we shouldn't need to await to find tenant and this could be moved outside of + // loop + if first { + first = false; + if random_init_delay(period, &cancel).await.is_err() { + break; + } + } + + let started_at = Instant::now(); + + let sleep_duration = if period == Duration::ZERO { info!("automatic compaction is disabled"); // check again in 10 seconds, in case it's been enabled again. - sleep_duration = Duration::from_secs(10); + Duration::from_secs(10) } else { // Run compaction if let Err(e) = tenant.compaction_iteration(&ctx).await { - sleep_duration = wait_duration; - error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration); + error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration); + wait_duration + } else { + period } - } + }; + + warn_when_period_overrun(started_at.elapsed(), period, "compaction"); // Sleep tokio::select! { - _ = task_mgr::shutdown_watcher() => { + _ = cancel.cancelled() => { info!("received cancellation request during idling"); break; }, @@ -105,14 +124,16 @@ async fn gc_loop(tenant_id: TenantId) { info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + let cancel = task_mgr::shutdown_token(); // GC might require downloading, to find the cutoff LSN that corresponds to the // cutoff specified as time. let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); + let mut first = true; loop { trace!("waking up"); let tenant = tokio::select! { - _ = task_mgr::shutdown_watcher() => { + _ = cancel.cancelled() => { info!("received cancellation request"); return; }, @@ -122,27 +143,38 @@ async fn gc_loop(tenant_id: TenantId) { }, }; - let gc_period = tenant.get_gc_period(); + let period = tenant.get_gc_period(); + + if first { + first = false; + if random_init_delay(period, &cancel).await.is_err() { + break; + } + } + + let started_at = Instant::now(); + let gc_horizon = tenant.get_gc_horizon(); - let mut sleep_duration = gc_period; - if sleep_duration == Duration::ZERO { + let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 { info!("automatic GC is disabled"); // check again in 10 seconds, in case it's been enabled again. - sleep_duration = Duration::from_secs(10); + Duration::from_secs(10) } else { // Run gc - if gc_horizon > 0 { - if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await - { - sleep_duration = wait_duration; - error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration); - } + let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await; + if let Err(e) = res { + error!("Gc failed, retrying in {:?}: {e:?}", wait_duration); + wait_duration + } else { + period } - } + }; + + warn_when_period_overrun(started_at.elapsed(), period, "gc"); // Sleep tokio::select! 
{ - _ = task_mgr::shutdown_watcher() => { + _ = cancel.cancelled() => { info!("received cancellation request during idling"); break; }, @@ -197,3 +229,49 @@ async fn wait_for_active_tenant( } } } + +#[derive(thiserror::Error, Debug)] +#[error("cancelled")] +pub(crate) struct Cancelled; + +/// Provide a random delay for background task initialization. +/// +/// This delay prevents a thundering herd of background tasks and will likely keep them running on +/// different periods for more stable load. +pub(crate) async fn random_init_delay( + period: Duration, + cancel: &CancellationToken, +) -> Result<(), Cancelled> { + use rand::Rng; + + let d = { + let mut rng = rand::thread_rng(); + + // gen_range asserts that the range cannot be empty, which it could be because period can + // be set to zero to disable gc or compaction, so lets set it to be at least 10s. + let period = std::cmp::max(period, Duration::from_secs(10)); + + // semi-ok default as the source of jitter + rng.gen_range(Duration::ZERO..=period) + }; + + tokio::select! { + _ = cancel.cancelled() => Err(Cancelled), + _ = tokio::time::sleep(d) => Ok(()), + } +} + +pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) { + // Duration::ZERO will happen because it's the "disable [bgtask]" value. + if elapsed >= period && period != Duration::ZERO { + // humantime does no significant digits clamping whereas Duration's debug is a bit more + // intelligent. however it makes sense to keep the "configuration format" for period, even + // though there's no way to output the actual config value. + warn!( + ?elapsed, + period = %humantime::format_duration(period), + task, + "task iteration took longer than the configured period" + ); + } +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5a829e42e54c..f2b0a985097e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -613,7 +613,10 @@ impl Timeline { self.flush_frozen_layers_and_wait().await } + /// Outermost timeline compaction operation; downloads needed layers. pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> { + const ROUNDS: usize = 2; + let last_record_lsn = self.get_last_record_lsn(); // Last record Lsn could be zero in case the timeline was just created @@ -622,6 +625,86 @@ impl Timeline { return Ok(()); } + // retry two times to allow first round to find layers which need to be downloaded, then + // download them, then retry compaction + for round in 0..ROUNDS { + // should we error out with the most specific error? + let last_round = round == ROUNDS - 1; + + let res = self.compact_inner(ctx).await; + + // If `create_image_layers' or `compact_level0` scheduled any + // uploads or deletions, but didn't update the index file yet, + // do it now. + // + // This isn't necessary for correctness, the remote state is + // consistent without the uploads and deletions, and we would + // update the index file on next flush iteration too. But it + // could take a while until that happens. + // + // Additionally, only do this on the terminal round before sleeping. 
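Stepping back to the tasks.rs additions above for a moment: a hedged sketch of how `random_init_delay` and `warn_when_period_overrun` are meant to wrap a periodic loop inside the pageserver crate (the loop body and task name are placeholders; the helpers, the `Cancelled` error and `CancellationToken` come from the patch):

```rust
use std::time::{Duration, Instant};
use tokio_util::sync::CancellationToken;

// Same imports the patched eviction task uses.
use crate::tenant::tasks::{random_init_delay, warn_when_period_overrun};

async fn do_iteration() { /* placeholder for compaction / gc / eviction work */ }

async fn periodic_loop(period: Duration, cancel: CancellationToken) {
    // Jitter the first wake-up over [0, period] so many tenants don't start in lockstep.
    if random_init_delay(period, &cancel).await.is_err() {
        return; // cancelled while waiting out the initial delay
    }
    loop {
        let started_at = Instant::now();
        do_iteration().await;
        // One warning line if the iteration itself consumed the whole period.
        warn_when_period_overrun(started_at.elapsed(), period, "example-task");
        // (the real loops substitute a 10s sleep when the period is zero / disabled)
        tokio::select! {
            _ = cancel.cancelled() => break,
            _ = tokio::time::sleep(period) => {}
        }
    }
}
```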
+ if last_round { + if let Some(remote_client) = &self.remote_client { + remote_client.schedule_index_upload_for_file_changes()?; + } + } + + let rls = match res { + Ok(()) => return Ok(()), + Err(CompactionError::DownloadRequired(rls)) if !last_round => { + // this can be done at most one time before exiting, waiting + rls + } + Err(CompactionError::DownloadRequired(rls)) => { + anyhow::bail!("Compaction requires downloading multiple times (last was {} layers), possibly battling against eviction", rls.len()) + } + Err(CompactionError::Other(e)) => { + return Err(e); + } + }; + + // this path can be visited in the second round of retrying, if first one found that we + // must first download some remote layers + let total = rls.len(); + + let mut downloads = rls + .into_iter() + .map(|rl| self.download_remote_layer(rl)) + .collect::>(); + + let mut failed = 0; + + let cancelled = task_mgr::shutdown_watcher(); + tokio::pin!(cancelled); + + loop { + tokio::select! { + _ = &mut cancelled => anyhow::bail!("Cancelled while downloading remote layers"), + res = downloads.next() => { + match res { + Some(Ok(())) => {}, + Some(Err(e)) => { + warn!("Downloading remote layer for compaction failed: {e:#}"); + failed += 1; + } + None => break, + } + } + } + } + + if failed != 0 { + anyhow::bail!("{failed} out of {total} layers failed to download, retrying later"); + } + + // if everything downloaded fine, lets try again + } + + unreachable!("retry loop exits") + } + + /// Compaction which might need to be retried after downloading remote layers. + async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> { // // High level strategy for compaction / image creation: // @@ -660,7 +743,7 @@ impl Timeline { // Is the timeline being deleted? let state = *self.state.borrow(); if state == TimelineState::Stopping { - anyhow::bail!("timeline is Stopping"); + return Err(anyhow::anyhow!("timeline is Stopping").into()); } let target_file_size = self.get_checkpoint_distance(); @@ -680,7 +763,8 @@ impl Timeline { // "enough". let layer_paths_to_upload = self .create_image_layers(&partitioning, lsn, false, ctx) - .await?; + .await + .map_err(anyhow::Error::from)?; if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { remote_client.schedule_layer_file_upload(&path, &layer_metadata)?; @@ -692,18 +776,6 @@ impl Timeline { self.compact_level0(&layer_removal_cs, target_file_size, ctx) .await?; timer.stop_and_record(); - - // If `create_image_layers' or `compact_level0` scheduled any - // uploads or deletions, but didn't update the index file yet, - // do it now. - // - // This isn't necessary for correctness, the remote state is - // consistent without the uploads and deletions, and we would - // update the index file on next flush iteration too. But it - // could take a while until that happens. - if let Some(remote_client) = &self.remote_client { - remote_client.schedule_index_upload_for_file_changes()?; - } } Err(err) => { // no partitioning? 
This is normal, if the timeline was just created @@ -2536,10 +2608,13 @@ impl Timeline { ) -> anyhow::Result<(KeyPartitioning, Lsn)> { { let partitioning_guard = self.partitioning.lock().unwrap(); - if partitioning_guard.1 != Lsn(0) - && lsn.0 - partitioning_guard.1 .0 <= self.repartition_threshold - { - // no repartitioning needed + let distance = lsn.0 - partitioning_guard.1 .0; + if partitioning_guard.1 != Lsn(0) && distance <= self.repartition_threshold { + debug!( + distance, + threshold = self.repartition_threshold, + "no repartitioning needed" + ); return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); } } @@ -2557,8 +2632,12 @@ impl Timeline { // Is it time to create a new image layer for the given partition? fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result { + let threshold = self.get_image_creation_threshold(); + let layers = self.layers.read().unwrap(); + let mut max_deltas = 0; + for part_range in &partition.ranges { let image_coverage = layers.image_coverage(part_range, lsn)?; for (img_range, last_img) in image_coverage { @@ -2580,21 +2659,25 @@ impl Timeline { // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed // after we read last_record_lsn, which is passed here in the 'lsn' argument. if img_lsn < lsn { - let threshold = self.get_image_creation_threshold(); let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?; - debug!( - "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", - img_range.start, img_range.end, num_deltas, img_lsn, lsn - ); + max_deltas = max_deltas.max(num_deltas); if num_deltas >= threshold { + debug!( + "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", + img_range.start, img_range.end, num_deltas, img_lsn, lsn + ); return Ok(true); } } } } + debug!( + max_deltas, + "none of the partitioned ranges had >= {threshold} deltas" + ); Ok(false) } @@ -2707,25 +2790,55 @@ impl Timeline { Ok(layer_paths_to_upload) } } + #[derive(Default)] struct CompactLevel0Phase1Result { new_layers: Vec, deltas_to_compact: Vec>, } +/// Top-level failure to compact. +#[derive(Debug)] +enum CompactionError { + /// L0 compaction requires layers to be downloaded. + /// + /// This should not happen repeatedly, but will be retried once by top-level + /// `Timeline::compact`. + DownloadRequired(Vec>), + /// Compaction cannot be done right now; page reconstruction and so on. + Other(anyhow::Error), +} + +impl From for CompactionError { + fn from(value: anyhow::Error) -> Self { + CompactionError::Other(value) + } +} + impl Timeline { + /// Level0 files first phase of compaction, explained in the [`compact_inner`] comment. + /// + /// This method takes the `_layer_removal_cs` guard to highlight it required downloads are + /// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the + /// start of level0 files compaction, the on-demand download should be revisited as well. async fn compact_level0_phase1( &self, + _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, target_file_size: u64, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let layers = self.layers.read().unwrap(); let mut level0_deltas = layers.get_level0_deltas()?; drop(layers); // Only compact if enough layers have accumulated. 
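The new `CompactionError` above exists so that the retry logic in `compact` can tell "needs downloads first" apart from a hard failure, while `?` and `.into()` on anyhow errors keep working through the `From` impl. A reduced, self-contained sketch of that pattern (the payload type and layer name are invented for the sketch):

```rust
#[derive(Debug)]
enum CompactionError {
    // retryable: the caller should download these layers and call again
    DownloadRequired(Vec<String>), // simplified payload for the sketch
    Other(anyhow::Error),
}

impl From<anyhow::Error> for CompactionError {
    fn from(value: anyhow::Error) -> Self {
        CompactionError::Other(value)
    }
}

fn phase1(need_download: bool, stopping: bool) -> Result<(), CompactionError> {
    if need_download {
        return Err(CompactionError::DownloadRequired(vec!["layer-A".into()]));
    }
    if stopping {
        // anyhow errors are routed through the From impl, as in the patch
        return Err(anyhow::anyhow!("timeline is Stopping").into());
    }
    Ok(())
}

fn main() {
    match phase1(true, false) {
        Err(CompactionError::DownloadRequired(layers)) => {
            println!("download {} layer(s), then retry compaction", layers.len());
        }
        Err(CompactionError::Other(e)) => println!("compaction failed: {e:#}"),
        Ok(()) => println!("compacted"),
    }
}
```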
- if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() { - return Ok(Default::default()); + let threshold = self.get_compaction_threshold(); + if level0_deltas.is_empty() || level0_deltas.len() < threshold { + debug!( + level0_deltas = level0_deltas.len(), + threshold, "too few deltas to compact" + ); + return Ok(CompactLevel0Phase1Result::default()); } // Gather the files to compact in this iteration. @@ -2761,6 +2874,24 @@ impl Timeline { end: deltas_to_compact.last().unwrap().get_lsn_range().end, }; + let remotes = deltas_to_compact + .iter() + .filter(|l| l.is_remote_layer()) + .inspect(|l| info!("compact requires download of {}", l.filename().file_name())) + .map(|l| { + l.clone() + .downcast_remote_layer() + .expect("just checked it is remote layer") + }) + .collect::>(); + + if !remotes.is_empty() { + // caller is holding the lock to layer_removal_cs, and we don't want to download while + // holding that; in future download_remote_layer might take it as well. this is + // regardless of earlier image creation downloading on-demand, while holding the lock. + return Err(CompactionError::DownloadRequired(remotes)); + } + info!( "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", lsn_range.start, @@ -2768,9 +2899,11 @@ impl Timeline { deltas_to_compact.len(), level0_deltas.len() ); + for l in deltas_to_compact.iter() { info!("compact includes {}", l.filename().file_name()); } + // We don't need the original list of layers anymore. Drop it so that // we don't accidentally use it later in the function. drop(level0_deltas); @@ -2940,7 +3073,9 @@ impl Timeline { } fail_point!("delta-layer-writer-fail-before-finish", |_| { - anyhow::bail!("failpoint delta-layer-writer-fail-before-finish"); + return Err( + anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into(), + ); }); writer.as_mut().unwrap().put_value(key, lsn, value)?; @@ -2959,7 +3094,7 @@ impl Timeline { // Fsync all the layer files and directory using multiple threads to // minimize latency. - par_fsync::par_fsync(&layer_paths)?; + par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?; layer_paths.pop().unwrap(); } @@ -2981,11 +3116,13 @@ impl Timeline { layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, target_file_size: u64, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), CompactionError> { let CompactLevel0Phase1Result { new_layers, deltas_to_compact, - } = self.compact_level0_phase1(target_file_size, ctx).await?; + } = self + .compact_level0_phase1(layer_removal_cs, target_file_size, ctx) + .await?; if new_layers.is_empty() && deltas_to_compact.is_empty() { // nothing to do @@ -3009,7 +3146,12 @@ impl Timeline { for l in new_layers { let new_delta_path = l.path(); - let metadata = new_delta_path.metadata()?; + let metadata = new_delta_path.metadata().with_context(|| { + format!( + "read file metadata for new created layer {}", + new_delta_path.display() + ) + })?; if let Some(remote_client) = &self.remote_client { remote_client.schedule_layer_file_upload( @@ -3243,7 +3385,7 @@ impl Timeline { let mut layers_to_remove = Vec::new(); - // Scan all on-disk layers in the timeline. + // Scan all layers in the timeline (remote or on-disk). // // Garbage collect the layer if all conditions are satisfied: // 1. 
it is older than cutoff LSN; @@ -3482,14 +3624,26 @@ impl Timeline { &self, remote_layer: Arc, ) -> anyhow::Result<()> { + use std::sync::atomic::Ordering::Relaxed; + let permit = match Arc::clone(&remote_layer.ongoing_download) .acquire_owned() .await { Ok(permit) => permit, Err(_closed) => { - info!("download of layer has already finished"); - return Ok(()); + if remote_layer.download_replacement_failure.load(Relaxed) { + // this path will be hit often, in case there are upper retries. however + // hitting this error will prevent a busy loop between get_reconstruct_data and + // download, so an error is prefered. + // + // TODO: we really should poison the timeline, but panicking is not yet + // supported. Related: https://github.com/neondatabase/neon/issues/3621 + anyhow::bail!("an earlier download succeeded but LayerMap::replace failed") + } else { + info!("download of layer has already finished"); + return Ok(()); + } } }; @@ -3525,8 +3679,8 @@ impl Timeline { { use crate::tenant::layer_map::Replacement; let l: Arc = remote_layer.clone(); - match updates.replace_historic(&l, new_layer) { - Ok(Replacement::Replaced { .. }) => { /* expected */ } + let failure = match updates.replace_historic(&l, new_layer) { + Ok(Replacement::Replaced { .. }) => false, Ok(Replacement::NotFound) => { // TODO: the downloaded file should probably be removed, otherwise // it will be added to the layermap on next load? we should @@ -3534,6 +3688,7 @@ impl Timeline { // // See: https://github.com/neondatabase/neon/issues/3533 error!("replacing downloaded layer into layermap failed because layer was not found"); + true } Ok(Replacement::RemovalBuffered) => { unreachable!("current implementation does not remove anything") @@ -3552,12 +3707,32 @@ impl Timeline { ?other, "replacing downloaded layer into layermap failed because another layer was found instead of expected" ); + true } Err(e) => { // this is a precondition failure, the layer filename derived // attributes didn't match up, which doesn't seem likely. - error!("replacing downloaded layer into layermap failed: {e:#?}") + error!("replacing downloaded layer into layermap failed: {e:#?}"); + true } + }; + + if failure { + // mark the remote layer permanently failed; the timeline is most + // likely unusable after this. sadly we cannot just poison the layermap + // lock with panic, because that would create an issue with shutdown. + // + // this does not change the retry semantics on failed downloads. + // + // use of Relaxed is valid because closing of the semaphore gives + // happens-before and wakes up any waiters; we write this value before + // and any waiters (or would be waiters) will load it after closing + // semaphore. + // + // See: https://github.com/neondatabase/neon/issues/3533 + remote_layer + .download_replacement_failure + .store(true, Relaxed); } } updates.flush(); @@ -3583,7 +3758,7 @@ impl Timeline { drop(permit); Ok(()) - }, + }.in_current_span(), ); receiver.await.context("download task cancelled")? 
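Before moving on to the eviction task: the `download_replacement_failure` flag added above pairs with the pre-existing `ongoing_download` semaphore so that later callers of `download_remote_layer` can distinguish "already downloaded" from "downloaded, but the layer map swap failed". A condensed, hedged sketch of that pairing (the struct, function and `replace_fails` knob are local to the sketch, not pageserver APIs):

```rust
use std::sync::atomic::{AtomicBool, Ordering::Relaxed};
use std::sync::Arc;
use tokio::sync::Semaphore;

struct DownloadState {
    ongoing: Arc<Semaphore>, // 1 permit; closed once the download path has run
    poisoned: AtomicBool,    // set before closing when the layer swap failed
}

async fn download_once(state: &DownloadState, replace_fails: bool) -> anyhow::Result<()> {
    let _permit = match Arc::clone(&state.ongoing).acquire_owned().await {
        Ok(permit) => permit,
        Err(_closed) => {
            // a previous caller already finished; check how it ended
            if state.poisoned.load(Relaxed) {
                anyhow::bail!("an earlier download succeeded but the layer swap failed");
            }
            return Ok(()); // nothing to do
        }
    };
    // ... perform the download and try to swap the new layer into the map ...
    if replace_fails {
        // the patch relies on the semaphore close for ordering, so the flag
        // itself can be stored with Relaxed before closing
        state.poisoned.store(true, Relaxed);
    }
    state.ongoing.close(); // from now on acquire_owned() returns Err(...)
    Ok(())
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let state = DownloadState {
        ongoing: Arc::new(Semaphore::new(1)),
        poisoned: AtomicBool::new(false),
    };
    download_once(&state, true).await?; // first call "fails" the swap
    assert!(download_once(&state, false).await.is_err()); // later callers see the poison
    Ok(())
}
```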
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index e3e7ce4c9df9..2aad0ef0f344 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -41,9 +41,23 @@ impl Timeline { #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))] async fn eviction_task(self: Arc, cancel: CancellationToken) { + use crate::tenant::tasks::random_init_delay; + { + let policy = self.get_eviction_policy(); + let period = match policy { + EvictionPolicy::LayerAccessThreshold(lat) => lat.period, + EvictionPolicy::NoEviction => Duration::from_secs(10), + }; + if random_init_delay(period, &cancel).await.is_err() { + info!("shutting down"); + return; + } + } + loop { let policy = self.get_eviction_policy(); let cf = self.eviction_iteration(&policy, cancel.clone()).await; + match cf { ControlFlow::Break(()) => break, ControlFlow::Continue(sleep_until) => { @@ -78,13 +92,7 @@ impl Timeline { ControlFlow::Continue(()) => (), } let elapsed = start.elapsed(); - if elapsed > p.period { - warn!( - configured_period = %humantime::format_duration(p.period), - last_period = %humantime::format_duration(elapsed), - "this eviction period took longer than the configured period" - ); - } + crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction"); ControlFlow::Continue(start + p.period) } } @@ -100,7 +108,6 @@ impl Timeline { #[allow(dead_code)] #[derive(Debug, Default)] struct EvictionStats { - not_considered_due_to_clock_skew: usize, candidates: usize, evicted: usize, errors: usize, @@ -129,9 +136,21 @@ impl Timeline { let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, Err(_e) => { - // NB: don't log the error. If there are many layers and the system clock - // is skewed, we'd be flooding the log. - stats.not_considered_due_to_clock_skew += 1; + // We reach here if `now` < `last_activity_ts`, which can legitimately + // happen if there is an access between us getting `now`, and us getting + // the access stats from the layer. + // + // The other reason why it can happen is system clock skew because + // SystemTime::now() is not monotonic, so, even if there is no access + // to the layer after we get `now` at the beginning of this function, + // it could be that `now` < `last_activity_ts`. + // + // To distinguish the cases, we would need to record `Instant`s in the + // access stats (i.e., monotonic timestamps), but then, the timestamps + // values in the access stats would need to be `Instant`'s, and hence + // they would be meaningless outside of the pageserver process. + // At the time of writing, the trade-off is that access stats are more + // valuable than detecting clock skew. 
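The comment above hinges on `SystemTime` not being monotonic: `duration_since` reports `Err` whenever the supposedly earlier timestamp is ahead of `now`, which is exactly the case the eviction loop now skips without counting a separate statistic. A minimal demonstration of that standard-library behaviour:

```rust
use std::time::{Duration, SystemTime};

fn main() {
    let now = SystemTime::now();
    // an access recorded "after" now, e.g. due to a race or clock adjustment
    let last_activity_ts = now + Duration::from_secs(5);
    match now.duration_since(last_activity_ts) {
        Ok(idle) => println!("layer idle for {idle:?}"),
        // this is the branch the eviction loop hits and simply skips the layer
        Err(e) => println!("last activity is {:?} ahead of now, skipping", e.duration()),
    }
}
```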
continue; } }; @@ -188,8 +207,9 @@ impl Timeline { } } } - if stats.not_considered_due_to_clock_skew > 0 || stats.errors > 0 || stats.not_evictable > 0 - { + if stats.candidates == stats.not_evictable { + debug!(stats=?stats, "eviction iteration complete"); + } else if stats.errors > 0 || stats.not_evictable > 0 { warn!(stats=?stats, "eviction iteration complete"); } else { info!(stats=?stats, "eviction iteration complete"); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index c943bf0a276f..21c6ede27e93 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -27,8 +27,7 @@ use std::fs::OpenOptions; use std::io::prelude::*; use std::io::{Error, ErrorKind}; use std::ops::{Deref, DerefMut}; -use std::os::fd::RawFd; -use std::os::unix::io::AsRawFd; +use std::os::unix::io::{AsRawFd, RawFd}; use std::os::unix::prelude::CommandExt; use std::path::PathBuf; use std::process::Stdio; diff --git a/poetry.lock b/poetry.lock index 7e80b1e10ada..bc2c56d74c61 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1238,7 +1238,6 @@ category = "main" optional = false python-versions = "*" files = [ - {file = "junit-xml-1.9.tar.gz", hash = "sha256:de16a051990d4e25a3982b2dd9e89d671067548718866416faec14d9de56db9f"}, {file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"}, ] @@ -1309,66 +1308,63 @@ files = [ [[package]] name = "moto" -version = "3.1.18" -description = "A library that allows your python tests to easily mock out the boto library" +version = "4.1.2" +description = "" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "moto-3.1.18-py3-none-any.whl", hash = "sha256:b6eb096e7880c46ac44d6d90988c0043e31462115cfdc913a0ee8f470bd9555c"}, - {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"}, + {file = "moto-4.1.2-py2.py3-none-any.whl", hash = "sha256:1b361ece638c74a657325378a259276f368aafce2f8be84f8143e69fa93ce8ec"}, + {file = "moto-4.1.2.tar.gz", hash = "sha256:63431733d2a02c7bd652ad71ec1da442a0e0d580cbac5eeb50d440a2ce066eac"}, ] [package.dependencies] aws-xray-sdk = {version = ">=0.93,<0.96 || >0.96", optional = true, markers = "extra == \"server\""} boto3 = ">=1.9.201" botocore = ">=1.12.201" -cfn-lint = {version = ">=0.4.0", optional = true, markers = "extra == \"server\""} +cfn-lint = {version = ">=0.40.0", optional = true, markers = "extra == \"server\""} cryptography = ">=3.3.1" docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""} ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""} -flask = {version = "<2.2.0", optional = true, markers = "extra == \"server\""} +flask = {version = "<2.2.0 || >2.2.0,<2.2.1 || >2.2.1", optional = true, markers = "extra == \"server\""} flask-cors = {version = "*", optional = true, markers = "extra == \"server\""} graphql-core = {version = "*", optional = true, markers = "extra == \"server\""} -idna = {version = ">=2.5,<4", optional = true, markers = "extra == \"server\""} Jinja2 = ">=2.10.1" jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""} -MarkupSafe = "!=2.0.0a1" openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""} pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""} python-dateutil = ">=2.1,<3.0.0" python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, 
markers = "extra == \"server\""} -pytz = "*" PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} requests = ">=2.5" -responses = ">=0.9.0" +responses = ">=0.13.0" setuptools = {version = "*", optional = true, markers = "extra == \"server\""} sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""} -werkzeug = ">=0.5,<2.2.0" +werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] apigatewayv2 = ["PyYAML (>=5.1)"] appsync = ["graphql-core"] awslambda = ["docker (>=2.5.1)"] batch = ["docker (>=2.5.1)"] -cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] ds = ["sshpubkeys (>=3.1.0)"] dynamodb = ["docker (>=2.5.1)"] -dynamodb2 = ["docker (>=2.5.1)"] dynamodbstreams = ["docker (>=2.5.1)"] ebs = ["sshpubkeys (>=3.1.0)"] ec2 = ["sshpubkeys (>=3.1.0)"] efs = ["sshpubkeys (>=3.1.0)"] +eks = ["sshpubkeys (>=3.1.0)"] glue = ["pyparsing (>=3.0.7)"] iotdata = ["jsondiff (>=1.1.2)"] route53resolver = ["sshpubkeys (>=3.1.0)"] s3 = ["PyYAML (>=5.1)"] -server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (<2.2.0)", "flask-cors", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] -ssm = ["PyYAML (>=5.1)", "dataclasses"] +server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +ssm = ["PyYAML (>=5.1)"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] [[package]] @@ -1716,7 +1712,6 @@ python-versions = ">=3.6" files = [ {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"}, {file = 
"psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"}, @@ -1750,7 +1745,6 @@ files = [ {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"}, {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"}, @@ -1762,7 +1756,6 @@ files = [ {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"}, @@ -1795,7 +1788,18 @@ category = "main" optional = false python-versions = "*" files = [ + {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"}, + {file = "pyasn1-0.4.8-py2.5.egg", hash = 
"sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"}, + {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"}, + {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"}, {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, + {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"}, + {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"}, + {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"}, + {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"}, + {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"}, + {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"}, + {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"}, {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, ] @@ -2004,8 +2008,8 @@ files = [ [package.dependencies] pytest = [ - {version = ">=5.0", markers = "python_version < \"3.10\""}, {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, + {version = ">=5.0", markers = "python_version < \"3.10\""}, ] [[package]] @@ -2082,18 +2086,6 @@ cryptography = ["cryptography (>=3.4.0)"] pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"] pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"] -[[package]] -name = "pytz" -version = "2022.1" -description = "World timezone definitions, modern and historical" -category = "main" -optional = false -python-versions = "*" -files = [ - {file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"}, - {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"}, -] - [[package]] name = "pywin32" version = "301" @@ -2129,13 +2121,6 @@ files = [ {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, - {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, - {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, @@ -2449,16 +2434,19 @@ test = ["websockets"] [[package]] name = "werkzeug" -version = "2.1.2" +version = "2.2.3" description = "The comprehensive WSGI web application library." category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "Werkzeug-2.1.2-py3-none-any.whl", hash = "sha256:72a4b735692dd3135217911cbeaa1be5fa3f62bffb8745c5215420a03dc55255"}, - {file = "Werkzeug-2.1.2.tar.gz", hash = "sha256:1ce08e8093ed67d638d63879fd1ba3735817f7a80de3674d293f5984f25fb6e6"}, + {file = "Werkzeug-2.2.3-py3-none-any.whl", hash = "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"}, + {file = "Werkzeug-2.2.3.tar.gz", hash = "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe"}, ] +[package.dependencies] +MarkupSafe = ">=2.1.1" + [package.extras] watchdog = ["watchdog"] @@ -2655,4 +2643,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "7563a38912963d8cf20c99acb06fe55623e65b799c4b88d37dc672e5384c96a3" +content-hash = "3038940781ef59d1ed28cedf46120ad6623e21e602c38ad3c359428d79fa1efd" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 152c83e4a0b0..030a5f1d6e47 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -28,6 +28,7 @@ itertools.workspace = true md5.workspace = true metrics.workspace = true once_cell.workspace = true +opentelemetry.workspace = true parking_lot.workspace = true pin-project-lite.workspace = true pq_proto.workspace = true @@ -35,6 +36,8 @@ prometheus.workspace = true rand.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["json"] } +reqwest-middleware.workspace = true +reqwest-tracing.workspace = true routerify.workspace = true rustls-pemfile.workspace = true rustls.workspace = true @@ -48,8 +51,10 @@ thiserror.workspace = true tls-listener.workspace = true tokio-postgres.workspace = true tokio-rustls.workspace = true -tokio.workspace = true +tokio = { workspace = true, features = ["signal"] } +tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true +tracing-utils.workspace = true tracing.workspace = true url.workspace = true utils.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 50afbd2a27e2..b8599adaeb6b 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -1,11 +1,11 @@ mod classic; - +mod hacks; mod link; -use futures::TryFutureExt; + pub use link::LinkAuthError; use crate::{ - auth::{self, AuthFlow, ClientCredentials}, + auth::{self, ClientCredentials}, console::{ self, provider::{CachedNodeInfo, ConsoleReqExtra}, @@ -13,9 +13,10 @@ use crate::{ }, stream, url, }; +use futures::TryFutureExt; use 
std::borrow::Cow; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{info, warn}; +use tracing::info; /// A product of successful authentication. pub struct AuthSuccess { @@ -105,97 +106,49 @@ impl<'a, T, E> BackendType<'a, Result> { } } -// TODO: get rid of explicit lifetimes in this block (there's a bug in rustc). -// Read more: https://github.com/rust-lang/rust/issues/99190 -// Alleged fix: https://github.com/rust-lang/rust/pull/89056 -impl<'l> BackendType<'l, ClientCredentials<'_>> { - /// Do something special if user didn't provide the `project` parameter. - async fn try_password_hack<'a>( - &'a mut self, - extra: &'a ConsoleReqExtra<'a>, - client: &'a mut stream::PqStream, - ) -> auth::Result>> { - use BackendType::*; - - // If there's no project so far, that entails that client doesn't - // support SNI or other means of passing the project name. - // We now expect to see a very specific payload in the place of password. - let fetch_magic_payload = |client| async { - warn!("project name not specified, resorting to the password hack auth flow"); - let payload = AuthFlow::new(client) - .begin(auth::PasswordHack) - .await? - .authenticate() - .await?; - - info!(project = &payload.project, "received missing parameter"); - auth::Result::Ok(payload) - }; - - // If we want to use cleartext password flow, we can read the password - // from the client and pretend that it's a magic payload (PasswordHack hack). - let fetch_plaintext_password = |client| async { - info!("using cleartext password flow"); - let payload = AuthFlow::new(client) - .begin(auth::CleartextPassword) - .await? - .authenticate() - .await?; - - auth::Result::Ok(auth::password_hack::PasswordHackPayload { - project: String::new(), - password: payload, - }) - }; - - // TODO: find a proper way to merge those very similar blocks. - let (mut node, password) = match self { - Console(api, creds) if creds.project.is_none() => { - let payload = fetch_magic_payload(client).await?; - creds.project = Some(payload.project.into()); - let node = api.wake_compute(extra, creds).await?; - - (node, payload.password) - } - // This is a hack to allow cleartext password in secure connections (wss). - Console(api, creds) if creds.use_cleartext_password_flow => { - let payload = fetch_plaintext_password(client).await?; - let node = api.wake_compute(extra, creds).await?; - - (node, payload.password) - } - Postgres(api, creds) if creds.project.is_none() => { - let payload = fetch_magic_payload(client).await?; - creds.project = Some(payload.project.into()); - let node = api.wake_compute(extra, creds).await?; +/// True to its name, this function encapsulates our current auth trade-offs. +/// Here, we choose the appropriate auth flow based on circumstances. +async fn auth_quirks( + api: &impl console::Api, + extra: &ConsoleReqExtra<'_>, + creds: &mut ClientCredentials<'_>, + client: &mut stream::PqStream, + allow_cleartext: bool, +) -> auth::Result> { + // If there's no project so far, that entails that client doesn't + // support SNI or other means of passing the endpoint (project) name. + // We now expect to see a very specific payload in the place of password. + if creds.project.is_none() { + // Password will be checked by the compute node later. + return hacks::password_hack(api, extra, creds, client).await; + } - (node, payload.password) - } - _ => return Ok(None), - }; + // Password hack should set the project name. + // TODO: make `creds.project` more type-safe. 
+ assert!(creds.project.is_some()); - node.config.password(password); - Ok(Some(AuthSuccess { - reported_auth_ok: false, - value: node, - })) + // Perform cleartext auth if we're allowed to do that. + // Currently, we use it for websocket connections (latency). + if allow_cleartext { + // Password will be checked by the compute node later. + return hacks::cleartext_hack(api, extra, creds, client).await; } + // Finally, proceed with the main auth flow (SCRAM-based). + classic::authenticate(api, extra, creds, client).await +} + +impl BackendType<'_, ClientCredentials<'_>> { /// Authenticate the client via the requested backend, possibly using credentials. - pub async fn authenticate<'a>( + #[tracing::instrument(fields(allow_cleartext), skip_all)] + pub async fn authenticate( &mut self, - extra: &'a ConsoleReqExtra<'a>, - client: &'a mut stream::PqStream, + extra: &ConsoleReqExtra<'_>, + client: &mut stream::PqStream, + allow_cleartext: bool, ) -> auth::Result> { use BackendType::*; - // Handle cases when `project` is missing in `creds`. - // TODO: type safety: return `creds` with irrefutable `project`. - if let Some(res) = self.try_password_hack(extra, client).await? { - info!("user successfully authenticated (using the password hack)"); - return Ok(res); - } - let res = match self { Console(api, creds) => { info!( @@ -204,20 +157,24 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> { "performing authentication using the console" ); - assert!(creds.project.is_some()); - classic::handle_user(api.as_ref(), extra, creds, client).await? + let api = api.as_ref(); + auth_quirks(api, extra, creds, client, allow_cleartext).await? } Postgres(api, creds) => { - info!("performing mock authentication using a local postgres instance"); + info!( + user = creds.user, + project = creds.project(), + "performing authentication using a local postgres instance" + ); - assert!(creds.project.is_some()); - classic::handle_user(api.as_ref(), extra, creds, client).await? + let api = api.as_ref(); + auth_quirks(api, extra, creds, client, allow_cleartext).await? } // NOTE: this auth backend doesn't use client credentials. Link(url) => { info!("performing link authentication"); - link::handle_user(url, client) + link::authenticate(url, client) .await? .map(CachedNodeInfo::new_uncached) } @@ -229,9 +186,9 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> { /// When applicable, wake the compute node, gaining its connection info in the process. /// The link auth flow doesn't support this, so we return [`None`] in that case. 
- pub async fn wake_compute<'a>( + pub async fn wake_compute( &self, - extra: &'a ConsoleReqExtra<'a>, + extra: &ConsoleReqExtra<'_>, ) -> Result, console::errors::WakeComputeError> { use BackendType::*; diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index eefef6e9b498..6753e7ed7fb1 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -9,7 +9,7 @@ use crate::{ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; -pub(super) async fn handle_user( +pub(super) async fn authenticate( api: &impl console::Api, extra: &ConsoleReqExtra<'_>, creds: &ClientCredentials<'_>, diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs new file mode 100644 index 000000000000..f710581cb2d4 --- /dev/null +++ b/proxy/src/auth/backend/hacks.rs @@ -0,0 +1,66 @@ +use super::AuthSuccess; +use crate::{ + auth::{self, AuthFlow, ClientCredentials}, + console::{ + self, + provider::{CachedNodeInfo, ConsoleReqExtra}, + }, + stream, +}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::{info, warn}; + +/// Compared to [SCRAM](crate::scram), cleartext password auth saves +/// one round trip and *expensive* computations (>= 4096 HMAC iterations). +/// These properties are beneficial for serverless JS workers, so we +/// use this mechanism for websocket connections. +pub async fn cleartext_hack( + api: &impl console::Api, + extra: &ConsoleReqExtra<'_>, + creds: &mut ClientCredentials<'_>, + client: &mut stream::PqStream, +) -> auth::Result> { + warn!("cleartext auth flow override is enabled, proceeding"); + let password = AuthFlow::new(client) + .begin(auth::CleartextPassword) + .await? + .authenticate() + .await?; + + let mut node = api.wake_compute(extra, creds).await?; + node.config.password(password); + + // Report tentative success; compute node will check the password anyway. + Ok(AuthSuccess { + reported_auth_ok: false, + value: node, + }) +} + +/// Workaround for clients which don't provide an endpoint (project) name. +/// Very similar to [`cleartext_hack`], but there's a specific password format. +pub async fn password_hack( + api: &impl console::Api, + extra: &ConsoleReqExtra<'_>, + creds: &mut ClientCredentials<'_>, + client: &mut stream::PqStream, +) -> auth::Result> { + warn!("project not specified, resorting to the password hack auth flow"); + let payload = AuthFlow::new(client) + .begin(auth::PasswordHack) + .await? + .authenticate() + .await?; + + info!(project = &payload.project, "received missing parameter"); + creds.project = Some(payload.project.into()); + + let mut node = api.wake_compute(extra, creds).await?; + node.config.password(payload.password); + + // Report tentative success; compute node will check the password anyway.
+ Ok(AuthSuccess { + reported_auth_ok: false, + value: node, + }) +} diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 5d0049c95712..7175a23dc108 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -53,7 +53,7 @@ pub fn new_psql_session_id() -> String { hex::encode(rand::random::<[u8; 8]>()) } -pub(super) async fn handle_user( +pub(super) async fn authenticate( link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result> { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 968104f05876..c556c33197f4 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -11,12 +11,16 @@ pub enum ClientCredsParseError { #[error("Parameter '{0}' is missing in startup packet.")] MissingKey(&'static str), - #[error("Inconsistent project name inferred from SNI ('{}') and project option ('{}').", .domain, .option)] + #[error( + "Inconsistent project name inferred from \ + SNI ('{}') and project option ('{}').", + .domain, .option, + )] InconsistentProjectNames { domain: String, option: String }, #[error( "SNI ('{}') inconsistently formatted with respect to common name ('{}'). \ - SNI should be formatted as '.{}'.", + SNI should be formatted as '.{}'.", .sni, .cn, .cn, )] InconsistentSni { sni: String, cn: String }, @@ -34,9 +38,6 @@ pub struct ClientCredentials<'a> { pub user: &'a str, // TODO: this is a severe misnomer! We should think of a new name ASAP. pub project: Option>, - /// If `True`, we'll use the old cleartext password flow. This is used for - /// websocket connections, which want to minimize the number of round trips. - pub use_cleartext_password_flow: bool, } impl ClientCredentials<'_> { @@ -51,7 +52,6 @@ impl<'a> ClientCredentials<'a> { params: &'a StartupMessageParams, sni: Option<&str>, common_name: Option<&str>, - use_cleartext_password_flow: bool, ) -> Result { use ClientCredsParseError::*; @@ -96,18 +96,9 @@ impl<'a> ClientCredentials<'a> { } .transpose()?; - info!( - user = user, - project = project.as_deref(), - use_cleartext_password_flow = use_cleartext_password_flow, - "credentials" - ); - - Ok(Self { - user, - project, - use_cleartext_password_flow, - }) + info!(user, project = project.as_deref(), "credentials"); + + Ok(Self { user, project }) } } @@ -131,7 +122,7 @@ mod tests { // According to postgresql, only `user` should be required. 
let options = StartupMessageParams::new([("user", "john_doe")]); - let creds = ClientCredentials::parse(&options, None, None, false)?; + let creds = ClientCredentials::parse(&options, None, None)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project, None); @@ -146,7 +137,7 @@ ("foo", "bar"), // should be ignored ]); - let creds = ClientCredentials::parse(&options, None, None, false)?; + let creds = ClientCredentials::parse(&options, None, None)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project, None); @@ -160,7 +151,7 @@ let sni = Some("foo.localhost"); let common_name = Some("localhost"); - let creds = ClientCredentials::parse(&options, sni, common_name, false)?; + let creds = ClientCredentials::parse(&options, sni, common_name)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("foo")); @@ -174,7 +165,7 @@ ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let creds = ClientCredentials::parse(&options, None, None, false)?; + let creds = ClientCredentials::parse(&options, None, None)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("bar")); @@ -188,7 +179,7 @@ let sni = Some("baz.localhost"); let common_name = Some("localhost"); - let creds = ClientCredentials::parse(&options, sni, common_name, false)?; + let creds = ClientCredentials::parse(&options, sni, common_name)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("baz")); @@ -203,8 +194,7 @@ let sni = Some("second.localhost"); let common_name = Some("localhost"); - let err = - ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail"); + let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"); match err { InconsistentProjectNames { domain, option } => { assert_eq!(option, "first"); @@ -221,8 +211,7 @@ let sni = Some("project.localhost"); let common_name = Some("example.com"); - let err = - ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail"); + let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"); match err { InconsistentSni { sni, cn } => { assert_eq!(sni, "project.localhost"); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 3f5eb3caff80..2e12d9ee26fd 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -77,14 +77,9 @@ impl ConnCfg { self.dbname(dbname); } - if let Some(options) = params.options_raw() { - // We must drop all proxy-specific parameters. - #[allow(unstable_name_collisions)] - let options: String = options - .filter(|opt| !opt.starts_with("project=")) - .intersperse(" ") // TODO: use impl from std once it's stabilized - .collect(); - + // Don't add `options` if they were only used for specifying a project. + // Connection pools don't support `options`, because they affect backend startup. + if let Some(options) = filtered_options(params) { self.options(&options); } @@ -225,3 +220,46 @@ impl ConnCfg { Ok(connection) } } + +/// Retrieve `options` from a startup message, dropping all proxy-specific flags. +fn filtered_options(params: &StartupMessageParams) -> Option { + #[allow(unstable_name_collisions)] + let options: String = params + .options_raw()? + .filter(|opt| !opt.starts_with("project=")) + .intersperse(" ") // TODO: use impl from std once it's stabilized + .collect(); + + // Don't even bother with empty options.
+ if options.is_empty() { + return None; + } + + Some(options) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_filtered_options() { + // Empty options is unlikely to be useful anyway. + let params = StartupMessageParams::new([("options", "")]); + assert_eq!(filtered_options(¶ms), None); + + // It's likely that clients will only use options to specify endpoint/project. + let params = StartupMessageParams::new([("options", "project=foo")]); + assert_eq!(filtered_options(¶ms), None); + + // Same, because unescaped whitespaces are no-op. + let params = StartupMessageParams::new([("options", " project=foo ")]); + assert_eq!(filtered_options(¶ms).as_deref(), None); + + let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]); + assert_eq!(filtered_options(¶ms).as_deref(), Some(r"\ \ ")); + + let params = StartupMessageParams::new([("options", "project = foo")]); + assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); + } +} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 5e285f3625d9..600db7f8eca2 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -8,6 +8,7 @@ pub struct ProxyConfig { pub metric_collection: Option, } +#[derive(Debug)] pub struct MetricCollectionConfig { pub endpoint: reqwest::Url, pub interval: Duration, diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index 51a117d3b782..c00c06fbb7c1 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -5,10 +5,7 @@ use crate::{ use anyhow::Context; use once_cell::sync::Lazy; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use std::{ - net::{TcpListener, TcpStream}, - thread, -}; +use std::{net::TcpStream, thread}; use tracing::{error, info, info_span}; use utils::{ postgres_backend::{self, AuthType, PostgresBackend}, @@ -34,23 +31,24 @@ pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::N CPLANE_WAITERS.notify(psql_session_id, msg) } -/// Console management API listener thread. +/// Console management API listener task. /// It spawns console response handlers needed for the link auth. -pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> { +pub async fn task_main(listener: tokio::net::TcpListener) -> anyhow::Result<()> { scopeguard::defer! { info!("mgmt has shut down"); } - listener - .set_nonblocking(false) - .context("failed to set listener to blocking")?; - loop { - let (socket, peer_addr) = listener.accept().context("failed to accept a new client")?; + let (socket, peer_addr) = listener.accept().await?; info!("accepted connection from {peer_addr}"); + + let socket = socket.into_std()?; socket .set_nodelay(true) .context("failed to set client socket option")?; + socket + .set_nonblocking(false) + .context("failed to set client socket option")?; // TODO: replace with async tasks. thread::spawn(move || { diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 7621aba19b24..80cd94d48366 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -11,8 +11,10 @@ use async_trait::async_trait; use std::sync::Arc; pub mod errors { - use crate::error::{io_error, UserFacingError}; - use reqwest::StatusCode as HttpStatusCode; + use crate::{ + error::{io_error, UserFacingError}, + http, + }; use thiserror::Error; /// A go-to error message which doesn't leak any detail. @@ -24,7 +26,7 @@ pub mod errors { /// Error returned by the console itself. 
#[error("{REQUEST_FAILED} with {}: {}", .status, .text)] Console { - status: HttpStatusCode, + status: http::StatusCode, text: Box, }, @@ -35,7 +37,7 @@ pub mod errors { impl ApiError { /// Returns HTTP status code if it's the reason for failure. - pub fn http_status_code(&self) -> Option { + pub fn http_status_code(&self) -> Option { use ApiError::*; match self { Console { status, .. } => Some(*status), @@ -51,15 +53,15 @@ pub mod errors { // To minimize risks, only select errors are forwarded to users. // Ask @neondatabase/control-plane for review before adding more. Console { status, .. } => match *status { - HttpStatusCode::NOT_FOUND => { + http::StatusCode::NOT_FOUND => { // Status 404: failed to get a project-related resource. format!("{REQUEST_FAILED}: endpoint cannot be found") } - HttpStatusCode::NOT_ACCEPTABLE => { + http::StatusCode::NOT_ACCEPTABLE => { // Status 406: endpoint is disabled (we don't allow connections). format!("{REQUEST_FAILED}: endpoint is disabled") } - HttpStatusCode::LOCKED => { + http::StatusCode::LOCKED => { // Status 423: project might be in maintenance mode (or bad state). format!("{REQUEST_FAILED}: endpoint is temporary unavailable") } @@ -70,13 +72,18 @@ pub mod errors { } } - // Helps eliminate graceless `.map_err` calls without introducing another ctor. impl From for ApiError { fn from(e: reqwest::Error) -> Self { io_error(e).into() } } + impl From for ApiError { + fn from(e: reqwest_middleware::Error) -> Self { + io_error(e).into() + } + } + #[derive(Debug, Error)] pub enum GetAuthInfoError { // We shouldn't include the actual secret here. diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 4eca025d2d99..3644db17f7ea 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -8,7 +8,6 @@ use super::{ use crate::{auth::ClientCredentials, compute, http, scram}; use async_trait::async_trait; use futures::TryFutureExt; -use reqwest::StatusCode as HttpStatusCode; use tracing::{error, info, info_span, warn, Instrument}; #[derive(Clone)] @@ -52,7 +51,7 @@ impl Api { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. Err(e) => match e.http_status_code() { - Some(HttpStatusCode::NOT_FOUND) => return Ok(None), + Some(http::StatusCode::NOT_FOUND) => return Ok(None), _otherwise => return Err(e.into()), }, }; @@ -154,7 +153,7 @@ impl super::Api for Api { /// Parse http response body, taking status code into account. async fn parse_body serde::Deserialize<'a>>( - response: reqwest::Response, + response: http::Response, ) -> Result { let status = response.status(); if status.is_success() { diff --git a/proxy/src/http.rs b/proxy/src/http.rs index e847edc8bd68..a54415780089 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -1,7 +1,24 @@ +//! HTTP client and server impls. +//! Other modules should use stuff from this module instead of +//! directly relying on deps like `reqwest` (think loose coupling). + pub mod server; pub mod websocket; +pub use reqwest::{Request, Response, StatusCode}; +pub use reqwest_middleware::{ClientWithMiddleware, Error}; + use crate::url::ApiUrl; +use reqwest_middleware::RequestBuilder; + +/// This is the preferred way to create new http clients, +/// because it takes care of observability (OpenTelemetry). +/// We deliberately don't want to replace this with a public static. 
+pub fn new_client() -> ClientWithMiddleware { + reqwest_middleware::ClientBuilder::new(reqwest::Client::new()) + .with(reqwest_tracing::TracingMiddleware::default()) + .build() +} /// Thin convenience wrapper for an API provided by an http endpoint. #[derive(Debug, Clone)] @@ -9,13 +26,17 @@ pub struct Endpoint { /// API's base URL. endpoint: ApiUrl, /// Connection manager with built-in pooling. - client: reqwest::Client, + client: ClientWithMiddleware, } impl Endpoint { /// Construct a new HTTP endpoint wrapper. - pub fn new(endpoint: ApiUrl, client: reqwest::Client) -> Self { - Self { endpoint, client } + /// Http client is not constructed under the hood so that it can be shared. + pub fn new(endpoint: ApiUrl, client: impl Into) -> Self { + Self { + endpoint, + client: client.into(), + } } #[inline(always)] @@ -23,19 +44,16 @@ impl Endpoint { &self.endpoint } - /// Return a [builder](reqwest::RequestBuilder) for a `GET` request, + /// Return a [builder](RequestBuilder) for a `GET` request, /// appending a single `path` segment to the base endpoint URL. - pub fn get(&self, path: &str) -> reqwest::RequestBuilder { + pub fn get(&self, path: &str) -> RequestBuilder { let mut url = self.endpoint.clone(); url.path_segments_mut().push(path); self.client.get(url.into_inner()) } /// Execute a [request](reqwest::Request). - pub async fn execute( - &self, - request: reqwest::Request, - ) -> Result { + pub async fn execute(&self, request: Request) -> Result { self.client.execute(request).await } } @@ -43,11 +61,12 @@ impl Endpoint { #[cfg(test)] mod tests { use super::*; + use reqwest::Client; #[test] fn optional_query_params() -> anyhow::Result<()> { let url = "http://example.com".parse()?; - let endpoint = Endpoint::new(url, reqwest::Client::new()); + let endpoint = Endpoint::new(url, Client::new()); // Validate that this pattern makes sense. let req = endpoint @@ -66,7 +85,7 @@ mod tests { #[test] fn uuid_params() -> anyhow::Result<()> { let url = "http://example.com".parse()?; - let endpoint = Endpoint::new(url, reqwest::Client::new()); + let endpoint = Endpoint::new(url, Client::new()); let req = endpoint .get("frobnicate") diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index d4235c2c3848..1757652a9065 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -186,8 +186,8 @@ async fn ws_handler( } pub async fn task_main( - ws_listener: TcpListener, config: &'static ProxyConfig, + ws_listener: TcpListener, ) -> anyhow::Result<()> { scopeguard::defer! { info!("websocket server has shut down"); diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs new file mode 100644 index 000000000000..0c8c2858b900 --- /dev/null +++ b/proxy/src/logging.rs @@ -0,0 +1,47 @@ +use tracing_opentelemetry::OpenTelemetryLayer; +use tracing_subscriber::{ + filter::{EnvFilter, LevelFilter}, + prelude::*, +}; + +/// Initialize logging and OpenTelemetry tracing and exporter. +/// +/// Logging can be configured using `RUST_LOG` environment variable. +/// +/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up +/// configuration from environment variables. For example, to change the +/// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. 
+/// See +pub async fn init() -> anyhow::Result { + let env_filter = EnvFilter::builder() + .with_default_directive(LevelFilter::INFO.into()) + .from_env_lossy(); + + let fmt_layer = tracing_subscriber::fmt::layer() + .with_ansi(atty::is(atty::Stream::Stderr)) + .with_writer(std::io::stderr) + .with_target(false); + + let otlp_layer = tracing_utils::init_tracing("proxy") + .await + .map(OpenTelemetryLayer::new); + + tracing_subscriber::registry() + .with(env_filter) + .with(otlp_layer) + .with(fmt_layer) + .try_init()?; + + Ok(LoggingGuard) +} + +pub struct LoggingGuard; + +impl Drop for LoggingGuard { + fn drop(&mut self) { + // Shutdown trace pipeline gracefully, so that it has a chance to send any + // pending traces before we exit. + tracing::info!("shutting down the tracing machinery"); + tracing_utils::shutdown_tracing(); + } +} diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 8812f77b622f..85478da3bcc1 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -12,6 +12,7 @@ mod config; mod console; mod error; mod http; +mod logging; mod metrics; mod parse; mod proxy; @@ -27,7 +28,7 @@ use config::ProxyConfig; use futures::FutureExt; use std::{borrow::Cow, future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; -use tracing::{info, info_span, Instrument}; +use tracing::{info, warn}; use utils::{project_git_version, sentry_init::init_sentry}; project_git_version!(GIT_VERSION); @@ -41,8 +42,8 @@ async fn flatten_err( #[tokio::main] async fn main() -> anyhow::Result<()> { - // First, initialize logging and troubleshooting subsystems. - init_tracing(); + let _logging_guard = logging::init().await?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); info!("Version: {GIT_VERSION}"); @@ -60,16 +61,17 @@ async fn main() -> anyhow::Result<()> { let mgmt_address: SocketAddr = args.get_one::("mgmt").unwrap().parse()?; info!("Starting mgmt on {mgmt_address}"); - let mgmt_listener = TcpListener::bind(mgmt_address).await?.into_std()?; + let mgmt_listener = TcpListener::bind(mgmt_address).await?; let proxy_address: SocketAddr = args.get_one::("proxy").unwrap().parse()?; info!("Starting proxy on {proxy_address}"); let proxy_listener = TcpListener::bind(proxy_address).await?; let mut tasks = vec![ + tokio::spawn(handle_signals()), tokio::spawn(http::server::task_main(http_listener)), tokio::spawn(proxy::task_main(config, proxy_listener)), - tokio::task::spawn_blocking(move || console::mgmt::thread_main(mgmt_listener)), + tokio::spawn(console::mgmt::task_main(mgmt_listener)), ]; if let Some(wss_address) = args.get_one::("wss") { @@ -78,48 +80,50 @@ async fn main() -> anyhow::Result<()> { let wss_listener = TcpListener::bind(wss_address).await?; tasks.push(tokio::spawn(http::websocket::task_main( - wss_listener, config, + wss_listener, ))); } - // TODO: refactor. - if let Some(metric_collection) = &config.metric_collection { - let hostname = hostname::get()? - .into_string() - .map_err(|e| anyhow::anyhow!("failed to get hostname {e:?}"))?; - - tasks.push(tokio::spawn( - metrics::collect_metrics( - &metric_collection.endpoint, - metric_collection.interval, - hostname, - ) - .instrument(info_span!("collect_metrics")), - )); + if let Some(metrics_config) = &config.metric_collection { + tasks.push(tokio::spawn(metrics::task_main(metrics_config))); } - // This will block until all tasks have completed. - // Furthermore, the first one to fail will cancel the rest. 
+ // This combinator will block until either all tasks complete or + // one of them finishes with an error (others will be cancelled). let tasks = tasks.into_iter().map(flatten_err); let _: Vec<()> = futures::future::try_join_all(tasks).await?; Ok(()) } -/// Tracing is used for logging and telemetry. -fn init_tracing() { - tracing_subscriber::fmt() - .with_env_filter({ - // This filter will examine the `RUST_LOG` env variable. - use tracing_subscriber::filter::{EnvFilter, LevelFilter}; - EnvFilter::builder() - .with_default_directive(LevelFilter::INFO.into()) - .from_env_lossy() - }) - .with_ansi(atty::is(atty::Stream::Stdout)) - .with_target(false) - .init(); +/// Handle unix signals appropriately. +async fn handle_signals() -> anyhow::Result<()> { + use tokio::signal::unix::{signal, SignalKind}; + + let mut hangup = signal(SignalKind::hangup())?; + let mut interrupt = signal(SignalKind::interrupt())?; + let mut terminate = signal(SignalKind::terminate())?; + + loop { + tokio::select! { + // Hangup is commonly used for config reload. + _ = hangup.recv() => { + warn!("received SIGHUP; config reload is not supported"); + } + // Shut down the whole application. + _ = interrupt.recv() => { + warn!("received SIGINT, exiting immediately"); + bail!("interrupted"); + } + // TODO: Don't accept new proxy connections. + // TODO: Shut down once all existing connections have been closed. + _ = terminate.recv() => { + warn!("received SIGTERM, exiting immediately"); + bail!("terminated"); + } + } + } } /// ProxyConfig is created at proxy startup, and lives forever. @@ -161,7 +165,7 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> })); let url = args.get_one::("auth-endpoint").unwrap().parse()?; - let endpoint = http::Endpoint::new(url, reqwest::Client::new()); + let endpoint = http::Endpoint::new(url, http::new_client()); let api = console::provider::neon::Api::new(endpoint, caches); auth::BackendType::Console(Cow::Owned(api), ()) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index d9aa4aec8c0f..3b2834687258 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,12 +1,11 @@ -//! //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. -//! +use crate::{config::MetricCollectionConfig, http}; use chrono::{DateTime, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use serde::Serialize; -use std::{collections::HashMap, time::Duration}; -use tracing::{debug, error, log::info, trace}; +use std::collections::HashMap; +use tracing::{debug, error, info, instrument, trace}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; @@ -19,48 +18,41 @@ const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; /// so while the project-id is unique across regions the whole pipeline will work correctly /// because we enrich the event with project_id in the control-plane endpoint. /// -#[derive(Eq, Hash, PartialEq, Serialize)] +#[derive(Eq, Hash, PartialEq, Serialize, Debug)] pub struct Ids { pub endpoint_id: String, } -pub async fn collect_metrics( - metric_collection_endpoint: &reqwest::Url, - metric_collection_interval: Duration, - hostname: String, -) -> anyhow::Result<()> { +pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<()> { + info!("metrics collector config: {config:?}"); scopeguard::defer!
{ - info!("collect_metrics has shut down"); + info!("metrics collector has shut down"); } - let mut ticker = tokio::time::interval(metric_collection_interval); - - info!( - "starting collect_metrics. metric_collection_endpoint: {}", - metric_collection_endpoint - ); - - // define client here to reuse it for all requests - let client = reqwest::Client::new(); + let http_client = http::new_client(); let mut cached_metrics: HashMap)> = HashMap::new(); + let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); + let mut ticker = tokio::time::interval(config.interval); loop { - tokio::select! { - _ = ticker.tick() => { - - match collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, hostname.clone()).await - { - Err(e) => { - error!("Failed to send consumption metrics: {} ", e); - }, - Ok(_) => { trace!("collect_metrics_iteration completed successfully") }, - } - } + ticker.tick().await; + + let res = collect_metrics_iteration( + &http_client, + &mut cached_metrics, + &config.endpoint, + &hostname, + ) + .await; + + match res { + Err(e) => error!("failed to send consumption metrics: {e} "), + Ok(_) => trace!("periodic metrics collection completed successfully"), } } } -pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { +fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { let mut current_metrics: Vec<(Ids, (u64, DateTime))> = Vec::new(); let metrics = prometheus::default_registry().gather(); @@ -99,11 +91,12 @@ pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { current_metrics } -pub async fn collect_metrics_iteration( - client: &reqwest::Client, +#[instrument(skip_all)] +async fn collect_metrics_iteration( + client: &http::ClientWithMiddleware, cached_metrics: &mut HashMap)>, metric_collection_endpoint: &reqwest::Url, - hostname: String, + hostname: &str, ) -> anyhow::Result<()> { info!( "starting collect_metrics_iteration. metric_collection_endpoint: {}", @@ -134,7 +127,7 @@ pub async fn collect_metrics_iteration( stop_time: *curr_time, }, metric: PROXY_IO_BYTES_PER_CLIENT, - idempotency_key: idempotency_key(hostname.clone()), + idempotency_key: idempotency_key(hostname.to_owned()), value, extra: Ids { endpoint_id: curr_key.endpoint_id.clone(), @@ -190,6 +183,12 @@ pub async fn collect_metrics_iteration( } } else { error!("metrics endpoint refused the sent metrics: {:?}", res); + for metric in chunk.iter() { + // Report if the metric value is suspiciously large + if metric.value > (1u64 << 40) { + error!("potentially abnormal metric value: {:?}", metric); + } + } } } Ok(()) diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index a622a35e6d6e..964204781283 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -17,7 +17,7 @@ use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{error, info, warn}; /// Number of times we should retry the `/proxy_wake_compute` http request. const NUM_RETRIES_WAKE_COMPUTE: usize = 1; @@ -91,13 +91,13 @@ pub async fn task_main( .unwrap_or_else(|e| { // Acknowledge that the task has finished with an error. error!("per-client task finished with an error: {e:#}"); - }) - .instrument(info_span!("client", session = format_args!("{session_id}"))), + }), ); } } // TODO(tech debt): unite this with its twin below. 
+#[tracing::instrument(fields(session_id), skip_all)] pub async fn handle_ws_client( config: &'static ProxyConfig, cancel_map: &CancelMap, @@ -127,7 +127,7 @@ pub async fn handle_ws_client( let result = config .auth_backend .as_ref() - .map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_name, true)) + .map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_name)) .transpose(); async { result }.or_else(|e| stream.throw_error(e)).await? @@ -135,10 +135,11 @@ pub async fn handle_ws_client( let client = Client::new(stream, creds, ¶ms, session_id); cancel_map - .with_session(|session| client.connect_to_db(session)) + .with_session(|session| client.connect_to_db(session, true)) .await } +#[tracing::instrument(fields(session_id), skip_all)] async fn handle_client( config: &'static ProxyConfig, cancel_map: &CancelMap, @@ -165,7 +166,7 @@ async fn handle_client( let result = config .auth_backend .as_ref() - .map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name, false)) + .map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name)) .transpose(); async { result }.or_else(|e| stream.throw_error(e)).await? @@ -173,7 +174,7 @@ async fn handle_client( let client = Client::new(stream, creds, ¶ms, session_id); cancel_map - .with_session(|session| client.connect_to_db(session)) + .with_session(|session| client.connect_to_db(session, false)) .await } @@ -401,7 +402,11 @@ impl<'a, S> Client<'a, S> { impl Client<'_, S> { /// Let the client authenticate and connect to the designated compute node. - async fn connect_to_db(self, session: cancellation::Session<'_>) -> anyhow::Result<()> { + async fn connect_to_db( + self, + session: cancellation::Session<'_>, + allow_cleartext: bool, + ) -> anyhow::Result<()> { let Self { mut stream, mut creds, @@ -416,10 +421,12 @@ impl Client<'_, S> { let auth_result = async { // `&mut stream` doesn't let us merge those 2 lines. - let res = creds.authenticate(&extra, &mut stream).await; + let res = creds + .authenticate(&extra, &mut stream, allow_cleartext) + .await; + async { res }.or_else(|e| stream.throw_error(e)).await } - .instrument(info_span!("auth")) .await?; let AuthSuccess { diff --git a/pyproject.toml b/pyproject.toml index d3d3948b9ad4..415f7f1ae703 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,12 +19,12 @@ types-requests = "^2.28.5" types-psycopg2 = "^2.9.18" boto3 = "^1.26.16" boto3-stubs = {extras = ["s3"], version = "^1.26.16"} -moto = {version = "^3.0.0", extras = ["server"]} +moto = {extras = ["server"], version = "^4.1.2"} backoff = "^1.11.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" -Werkzeug = "2.1.2" +Werkzeug = "^2.2.3" pytest-order = "^1.0.1" allure-pytest = "^2.10.0" pytest-asyncio = "^0.19.0" diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 1a068412c8cf..683050e9cd50 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -126,7 +126,12 @@ fn main() -> anyhow::Result<()> { return Ok(()); } + // important to keep the order of: + // 1. init logging + // 2. tracing panic hook + // 3. 
sentry logging::init(LogFormat::from_config(&args.log_format)?)?; + logging::replace_panic_hook_with_tracing_panic_hook().forget(); info!("version: {GIT_VERSION}"); let args_workdir = &args.datadir; diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index c73206b7dcb1..1a0d261184a4 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -424,12 +424,16 @@ async fn http1_handler( #[tokio::main] async fn main() -> Result<(), Box> { - // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - let args = Args::parse(); + // important to keep the order of: + // 1. init logging + // 2. tracing panic hook + // 3. sentry logging::init(LogFormat::from_config(&args.log_format)?)?; + logging::replace_panic_hook_with_tracing_panic_hook().forget(); + // initialize sentry if SENTRY_DSN is provided + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); info!("version: {GIT_VERSION}"); ::metrics::set_build_info_metric(GIT_VERSION); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b35252243edd..63196609ccb7 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1217,7 +1217,7 @@ def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[st """ Returns the tenant size, together with the model inputs as the second tuple item. """ - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/size") + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size") self.verbose_error(res) res = res.json() assert isinstance(res, dict) @@ -1228,6 +1228,16 @@ def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[st assert type(inputs) is dict return (size, inputs) + def tenant_size_debug(self, tenant_id: TenantId) -> str: + """ + Returns the tenant size debug info, as an HTML string + """ + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size", + headers={"Accept": "text/html"}, + ) + return res.text + def timeline_list( self, tenant_id: TenantId, @@ -1278,6 +1288,7 @@ def timeline_detail( timeline_id: TimelineId, include_non_incremental_logical_size: bool = False, include_timeline_dir_layer_file_size_sum: bool = False, + **kwargs, ) -> Dict[Any, Any]: params = {} if include_non_incremental_logical_size: @@ -1288,6 +1299,7 @@ def timeline_detail( res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", params=params, + **kwargs, ) self.verbose_error(res) res_json = res.json() @@ -2067,6 +2079,7 @@ def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional ".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs ".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock() ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress + ".*task iteration took longer than the configured period.*", ] def start( @@ -2517,15 +2530,17 @@ def __exit__( tb: Optional[TracebackType], ): if self._popen is not None: - # NOTE the process will die when we're done with tests anyway, because - # it's a child process. This is mostly to clean up in between different tests. 
- self._popen.kill() + self._popen.terminate() + try: + self._popen.wait(timeout=5) + except subprocess.TimeoutExpired: + log.warn("failed to gracefully terminate proxy; killing") + self._popen.kill() @staticmethod async def activate_link_auth( local_vanilla_pg, proxy_with_metric_collector, psql_session_id, create_user=True ): - pg_user = "proxy" if create_user: diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 3551f27cad56..5ee94de32dcb 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -1,19 +1,25 @@ # It's possible to run any regular test with the local fs remote storage via # env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... +import time +from collections import defaultdict from pathlib import Path +from typing import Any, DefaultDict, Dict import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, + PageserverApiException, RemoteStorageKind, assert_tenant_status, available_remote_storages, + wait_for_last_flush_lsn, wait_for_last_record_lsn, wait_for_sk_commit_lsn_to_reach_remote_storage, wait_for_upload, wait_until, + wait_until_tenant_state, ) from fixtures.types import Lsn from fixtures.utils import query_scalar @@ -212,6 +218,9 @@ def get_resident_physical_size(): log.info(filled_size) assert filled_current_physical == filled_size, "we don't yet do layer eviction" + # Wait until generated image layers are uploaded to S3 + time.sleep(3) + env.pageserver.stop() # remove all the layer files @@ -449,3 +458,233 @@ def get_resident_physical_size(): pg_old = env.postgres.create_start(branch_name="main") with pg_old.cursor() as cur: assert query_scalar(cur, "select count(*) from testtab") == table_len + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3]) +def test_compaction_downloads_on_demand_without_image_creation( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + """ + Create a few layers, then evict, then make sure compaction runs successfully. + """ + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_compaction_downloads_on_demand_without_image_creation", + ) + + env = neon_env_builder.init_start() + + conf = { + # Disable background GC & compaction + "gc_period": "0s", + "compaction_period": "0s", + # unused, because manual will be called after each table + "checkpoint_distance": 100 * 1024**2, + # this will be updated later on to allow manual compaction outside of checkpoints + "compaction_threshold": 100, + # repartitioning parameter, not required here + "image_creation_threshold": 100, + # repartitioning parameter, not required here + "compaction_target_size": 128 * 1024**2, + # pitr_interval and gc_horizon are not interesting because we don't run gc + } + + # Override defaults, to create more layers + tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf)) + env.initial_tenant = tenant_id + pageserver_http = env.pageserver.http_client() + + with env.postgres.create_start("main") as pg: + # no particular reason to create the layers like this, but we are sure + not to hit the image_creation_threshold here.
+ with pg.cursor() as cur: + cur.execute("create table a as select id::bigint from generate_series(1, 204800) s(id)") + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + with pg.cursor() as cur: + cur.execute("update a set id = -id") + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + assert not layers.in_memory_layers, "no inmemory layers expected after post-commit checkpoint" + assert len(layers.historic_layers) == 1 + 2, "should have initdb layer and 2 deltas" + + for layer in layers.historic_layers: + log.info(f"pre-compact: {layer}") + pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name) + + env.neon_cli.config_tenant(tenant_id, {"compaction_threshold": "3"}) + + pageserver_http.timeline_compact(tenant_id, timeline_id) + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + for layer in layers.historic_layers: + log.info(f"post compact: {layer}") + assert len(layers.historic_layers) == 1, "should have compacted to single layer" + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3]) +def test_compaction_downloads_on_demand_with_image_creation( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + """ + Create layers, compact with high image_creation_threshold, then run final compaction with all layers evicted. + + Due to current implementation, this will make image creation on-demand download layers, but we cannot really + directly test for it. + """ + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_compaction_downloads_on_demand", + ) + + env = neon_env_builder.init_start() + + conf = { + # Disable background GC & compaction + "gc_period": "0s", + "compaction_period": "0s", + # repartitioning threshold is this / 10, but it doesn't really seem to matter + "checkpoint_distance": 50 * 1024**2, + "compaction_threshold": 3, + # important: keep this high for the data ingestion + "image_creation_threshold": 100, + # repartitioning parameter, unused + "compaction_target_size": 128 * 1024**2, + # pitr_interval and gc_horizon are not interesting because we don't run gc + } + + # Override defaults, to create more layers + tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf)) + env.initial_tenant = tenant_id + pageserver_http = env.pageserver.http_client() + + pg = env.postgres.create_start("main") + + # no particular reason to create the layers like this, but we are sure + not to hit the image_creation_threshold here.
+ with pg.cursor() as cur: + cur.execute("create table a (id bigserial primary key, some_value bigint not null)") + cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)") + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + for i in range(0, 2): + for j in range(0, 3): + # create a minimal amount of "delta difficulty" for this table + with pg.cursor() as cur: + cur.execute("update a set some_value = -some_value + %s", (j,)) + + with pg.cursor() as cur: + # vacuuming should aid to reuse keys, though it's not really important + # with image_creation_threshold=1 which we will use on the last compaction + cur.execute("vacuum") + + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + + if i == 1 and j == 2: + # last iteration; stop before checkpoint to avoid leaving an inmemory layer + pg.stop_and_destroy() + + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + # images should not yet be created, because threshold is too high, + # but these will be reshuffled to L1 layers + pageserver_http.timeline_compact(tenant_id, timeline_id) + + for _ in range(0, 20): + # loop in case flushing is still in progress + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + if not layers.in_memory_layers: + break + time.sleep(0.2) + + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + assert not layers.in_memory_layers, "no inmemory layers expected after post-commit checkpoint" + + kinds_before: DefaultDict[str, int] = defaultdict(int) + + for layer in layers.historic_layers: + kinds_before[layer.kind] += 1 + pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name) + + assert dict(kinds_before) == {"Delta": 4} + + # now having evicted all layers, reconfigure to have lower image creation + # threshold to expose image creation to downloading all of the needed + # layers -- threshold of 2 would sound more reasonable, but keeping it as 1 + # to be less flaky + env.neon_cli.config_tenant(tenant_id, {"image_creation_threshold": "1"}) + + pageserver_http.timeline_compact(tenant_id, timeline_id) + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + kinds_after: DefaultDict[str, int] = defaultdict(int) + for layer in layers.historic_layers: + kinds_after[layer.kind] += 1 + + assert dict(kinds_after) == {"Delta": 4, "Image": 1} + + +def stringify(conf: Dict[str, Any]) -> Dict[str, str]: + return dict(map(lambda x: (x[0], str(x[1])), conf.items())) + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_ondemand_download_failure_to_replace( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + """ + Make sure that we fail on being unable to replace a RemoteLayer instead of for example livelocking. 
+ + See: https://github.com/neondatabase/neon/issues/3533 + """ + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_ondemand_download_failure_to_replace", + ) + + # disable gc and compaction via default tenant config because config is lost while detaching + # so that compaction will not be the one to download the layer but the http handler is + neon_env_builder.pageserver_config_override = ( + """tenant_config={gc_period = "0s", compaction_period = "0s"}""" + ) + + env = neon_env_builder.init_start() + + tenant_id, timeline_id = env.neon_cli.create_tenant() + + env.initial_tenant = tenant_id + pageserver_http = env.pageserver.http_client() + + lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"]) + + wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn) + + # remove layers so that they will be redownloaded + pageserver_http.tenant_detach(tenant_id) + pageserver_http.tenant_attach(tenant_id) + + wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) + pageserver_http.configure_failpoints(("layermap-replace-notfound", "return")) + + # requesting details with non-incremental size should trigger a download of the only layer + # this will need to be adjusted if an index for logical sizes is ever implemented + with pytest.raises(PageserverApiException): + # error message is not useful + pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2) + + actual_message = ( + ".* ERROR .*replacing downloaded layer into layermap failed because layer was not found" + ) + assert env.pageserver.log_contains(actual_message) is not None + env.pageserver.allowed_errors.append(actual_message) + + env.pageserver.allowed_errors.append( + ".* ERROR .*Error processing HTTP request: InternalServerError\\(get local timeline info" + ) + # this might get to run and attempt on-demand, but not always + env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'") + + # if the above returned, then we didn't have a livelock, and all is well diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index bb3bca878263..8c2996f4917b 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,13 +1,13 @@ -from typing import Any, List, Tuple +from pathlib import Path +from typing import List, Tuple import pytest from fixtures.log_helper import log -from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.types import Lsn -def test_empty_tenant_size(neon_simple_env: NeonEnv): +def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): env = neon_simple_env (tenant_id, _) = env.neon_cli.create_tenant() http_client = env.pageserver.http_client() @@ -18,6 +18,9 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv): main_branch_name = "main" + branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0] + assert branch_name == main_branch_name + with env.postgres.create_start( main_branch_name, tenant_id=tenant_id, @@ -39,12 +42,44 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv): size, inputs = http_client.tenant_size_and_modelinputs(tenant_id) assert size == initial_size, "tenant_size should not be affected by shutdown of compute" - expected_commands: List[Any] = [{"branch_from": None}, "end_of_branch"] - actual_commands: List[Any] = list(map(lambda x: x["command"], inputs["updates"])) # 
type: ignore - assert actual_commands == expected_commands + expected_inputs = { + "segments": [ + { + "segment": {"parent": None, "lsn": 23694408, "size": 25362432, "needed": True}, + "timeline_id": f"{main_timeline_id}", + "kind": "BranchStart", + }, + { + "segment": {"parent": 0, "lsn": 23694528, "size": None, "needed": True}, + "timeline_id": f"{main_timeline_id}", + "kind": "BranchEnd", + }, + ], + "timeline_inputs": [ + { + "timeline_id": f"{main_timeline_id}", + "ancestor_id": None, + "ancestor_lsn": "0/0", + "last_record": "0/1698CC0", + "latest_gc_cutoff": "0/1698C48", + "horizon_cutoff": "0/0", + "pitr_cutoff": "0/0", + "next_gc_cutoff": "0/0", + "retention_param_cutoff": None, + } + ], + } + expected_inputs = mask_model_inputs(expected_inputs) + actual_inputs = mask_model_inputs(inputs) + + assert expected_inputs == actual_inputs + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) -def test_branched_empty_timeline_size(neon_simple_env: NeonEnv): + +def test_branched_empty_timeline_size(neon_simple_env: NeonEnv, test_output_dir: Path): """ Issue found in production. Because the ancestor branch was under gc_horizon, the branchpoint was "dangling" and the computation could not be @@ -75,8 +110,12 @@ def test_branched_empty_timeline_size(neon_simple_env: NeonEnv): assert size_after_branching > initial_size + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) + -def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv): +def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv, test_output_dir: Path): """ More general version of test_branched_empty_timeline_size @@ -128,9 +167,13 @@ def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv): size_after_writes = http_client.tenant_size(tenant_id) assert size_after_writes > initial_size + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) + @pytest.mark.skip("This should work, but is left out because assumed covered by other tests") -def test_branch_point_within_horizon(neon_simple_env: NeonEnv): +def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = 15 @@ -167,9 +210,13 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv): assert size_before_branching < size_after + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) + @pytest.mark.skip("This should work, but is left out because assumed covered by other tests") -def test_parent_within_horizon(neon_simple_env: NeonEnv): +def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = 5 @@ -179,7 +226,7 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv): """ env = neon_simple_env - gc_horizon = 200_000 + gc_horizon = 5_000 (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)}) http_client = env.pageserver.http_client() @@ -212,9 +259,13 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv): assert size_before_branching < size_after + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + 
size_debug_file.write(size_debug) + @pytest.mark.skip("This should work, but is left out because assumed covered by other tests") -def test_only_heads_within_horizon(neon_simple_env: NeonEnv): +def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = small @@ -253,8 +304,14 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv): latest_size = size_now + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) -def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): + +def test_single_branch_get_tenant_size_grows( + neon_env_builder: NeonEnvBuilder, test_output_dir: Path +): """ Operate on single branch reading the tenants size after each transaction. """ @@ -279,7 +336,20 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): collected_responses: List[Tuple[Lsn, int]] = [] + size_debug_file = open(test_output_dir / "size_debug.html", "w") + + def check_size_change(current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev: int): + if current_lsn - initdb_lsn > gc_horizon: + assert ( + size >= prev + ), "tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size" + else: + assert ( + size > prev + ), "tenant_size should grow, because we continue to add WAL to initial snapshot size" + with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg: + initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) with pg.cursor() as cur: cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL)") @@ -297,13 +367,19 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) - size = http_client.tenant_size(tenant_id) + size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) + + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) if len(collected_responses) > 0: prev = collected_responses[-1][1] if size == 0: assert prev == 0 else: + # branch start shouldn't be past gc_horizon yet + # thus the size should grow as we insert more data + assert current_lsn - initdb_lsn <= gc_horizon assert size > prev collected_responses.append((current_lsn, size)) @@ -323,9 +399,15 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) - size = http_client.tenant_size(tenant_id) + size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) + + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) + prev = collected_responses[-1][1] - assert size > prev, "tenant_size should grow with updates" + + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) + collected_responses.append((current_lsn, size)) while True: @@ -340,9 +422,9 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): size = http_client.tenant_size(tenant_id) prev = collected_responses[-1][1] - assert ( - size > prev - ), "even though rows have been deleted, the tenant_size should increase" + + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) + collected_responses.append((current_lsn, size)) with pg.cursor() as cur: @@ -352,7 +434,9 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): size = http_client.tenant_size(tenant_id) 
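# Aside (illustrative sketch, not part of the patch): check_size_change() above only requires
# strictly growing sizes while the branch head is within gc_horizon of initdb_lsn; past that
# point, per its own assertion message, only gc_horizon worth of WAL is added on top of the
# initial snapshot size, so consecutive samples may be equal. A small numeric illustration
# with made-up values:
initdb_lsn_sketch = 0x0169_8CC0                        # hypothetical initdb LSN
gc_horizon_sketch = 64 * 1024**2                       # hypothetical 64 MiB horizon
current_lsn_sketch = initdb_lsn_sketch + 16 * 1024**2  # head 16 MiB past initdb
# still inside the horizon: the strict "size > prev" branch applies
assert current_lsn_sketch - initdb_lsn_sketch <= gc_horizon_sketch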
prev = collected_responses[-1][1] - assert size > prev, "dropping table grows tenant_size" + + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) + collected_responses.append((current_lsn, size)) # this isn't too many lines to forget for a while. observed while @@ -364,24 +448,17 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start() + size_debug_file.close() + size_after = http_client.tenant_size(tenant_id) prev = collected_responses[-1][1] assert size_after == prev, "size after restarting pageserver should not have changed" - ps_metrics = parse_metrics(http_client.get_metrics(), "pageserver") - tenant_metric_filter = { - "tenant_id": str(tenant_id), - } - tenant_size_metric = int( - ps_metrics.query_one("pageserver_tenant_synthetic_size", filter=tenant_metric_filter).value - ) - - assert tenant_size_metric == size_after, "API size value should be equal to metric size value" - - -def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder): +def test_get_tenant_size_with_multiple_branches( + neon_env_builder: NeonEnvBuilder, test_output_dir: Path +): """ Reported size goes up while branches or rows are being added, goes down after removing branches. """ @@ -481,6 +558,10 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder size_after = http_client.tenant_size(tenant_id) assert size_after == size_after_thinning_branch + size_debug_file_before = open(test_output_dir / "size_debug_before.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file_before.write(size_debug) + # teardown, delete branches, and the size should be going down http_client.timeline_delete(tenant_id, first_branch_timeline_id) @@ -493,3 +574,38 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder assert size_after_deleting_second < size_after_continuing_on_main assert size_after_deleting_second > size_after_first_branch + + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) + + +# Helper for tests that compare timeline_inputs +# We don't want to compare the exact values, because they can be unstable +# and cause flaky tests. So replace the values with useful invariants. 
+def mask_model_inputs(x): + if isinstance(x, dict): + newx = {} + for k, v in x.items(): + if k == "size": + if v is None or v == 0: + # no change + newx[k] = v + elif v < 0: + newx[k] = "<0" + else: + newx[k] = ">0" + elif k.endswith("lsn") or k.endswith("cutoff") or k == "last_record": + if v is None or v == 0 or v == "0/0": + # no change + newx[k] = v + else: + newx[k] = "masked" + else: + newx[k] = mask_model_inputs(v) + return newx + elif isinstance(x, list): + newlist = [mask_model_inputs(v) for v in x] + return newlist + else: + return x diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 6da6a4d4463c..769bc10280b8 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -280,6 +280,7 @@ def test_tenant_upgrades_index_json_from_v0( timeline_file.seek(0) json.dump(v0_index_part, timeline_file) + timeline_file.truncate(timeline_file.tell()) env.pageserver.start() pageserver_http = env.pageserver.http_client() diff --git a/vm-cgconfig.conf b/vm-cgconfig.conf new file mode 100644 index 000000000000..a2e201708e25 --- /dev/null +++ b/vm-cgconfig.conf @@ -0,0 +1,12 @@ +# Configuration for cgroups in VM compute nodes +group neon-postgres { + perm { + admin { + uid = vm-informant; + } + task { + gid = users; + } + } + memory {} +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 30a6d3a92b1c..c0cf3c56116b 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -38,14 +38,14 @@ prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-syntax = { version = "0.6" } -reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } +reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "multipart", "rustls-tls"] } ring = { version = "0.16", features = ["std"] } rustls = { version = "0.20", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } socket2 = { version = "0.4", default-features = false, features = ["all"] } -tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "sync", "time"] } +tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "sync", "time"] } tokio-util = { version = "0.7", features = ["codec", "io"] } tonic = { version = "0.8", features = ["tls-roots"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] }
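# Aside (illustrative sketch, not part of the patch): the one-line change in
# test_tenants_with_remote_storage.py above adds truncate() after rewriting index_part in
# place. Without it, writing a shorter JSON document over a longer one leaves stale trailing
# bytes that break later parsing. A self-contained demonstration of the same pattern:
import json
import tempfile

with tempfile.TemporaryFile(mode="w+") as timeline_file:
    json.dump({"version": 1, "padding": "x" * 64}, timeline_file)  # older, longer content
    timeline_file.seek(0)
    json.dump({"version": 0}, timeline_file)                       # new, shorter content
    timeline_file.truncate(timeline_file.tell())                   # drop leftover old bytes
    timeline_file.seek(0)
    assert json.load(timeline_file) == {"version": 0}              # parses cleanly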