diff --git a/.dockerignore b/.dockerignore index d256b21af1d5..a6e11805e92c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -21,3 +21,4 @@ !workspace_hack/ !neon_local/ !scripts/ninstall.sh +!vm-cgconfig.conf diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index 71fced23c25f..13b44f40520f 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -2,11 +2,11 @@ storage: vars: bucket_name: neon-prod-storage-ap-southeast-1 bucket_region: ap-southeast-1 - console_mgmt_base_url: http://console-release.local + console_mgmt_base_url: http://neon-internal-api.aws.neon.tech broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events + metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml index 83d4f6f37db8..2236dcbc06c7 100644 --- a/.github/ansible/prod.eu-central-1.hosts.yaml +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -2,11 +2,11 @@ storage: vars: bucket_name: neon-prod-storage-eu-central-1 bucket_region: eu-central-1 - console_mgmt_base_url: http://console-release.local + console_mgmt_base_url: http://neon-internal-api.aws.neon.tech broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events + metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml index 7f7601cd3928..56bece3e7752 100644 --- a/.github/ansible/prod.us-east-2.hosts.yaml +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -2,11 +2,11 @@ storage: vars: bucket_name: neon-prod-storage-us-east-2 bucket_region: us-east-2 - console_mgmt_base_url: http://console-release.local + console_mgmt_base_url: http://neon-internal-api.aws.neon.tech broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events + metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 9cad79b986ab..f03e2d943586 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -2,11 +2,11 @@ storage: vars: bucket_name: neon-prod-storage-us-west-2 bucket_region: us-west-2 - console_mgmt_base_url: http://console-release.local + console_mgmt_base_url: http://neon-internal-api.aws.neon.tech broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events + metric_collection_endpoint: 
http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml deleted file mode 100644 index ecb847bd6131..000000000000 --- a/.github/ansible/production.hosts.yaml +++ /dev/null @@ -1,40 +0,0 @@ ---- -storage: - vars: - console_mgmt_base_url: http://console-release.local - bucket_name: zenith-storage-oregon - bucket_region: us-west-2 - broker_endpoint: http://storage-broker.prod.local:50051 - pageserver_config_stub: - pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events - metric_collection_interval: 10min - remote_storage: - bucket_name: "{{ bucket_name }}" - bucket_region: "{{ bucket_region }}" - prefix_in_bucket: "{{ inventory_hostname }}" - safekeeper_s3_prefix: prod-1/wal - hostname_suffix: ".local" - remote_user: admin - sentry_environment: production - - children: - pageservers: - hosts: - zenith-1-ps-2: - console_region_id: aws-us-west-2 - zenith-1-ps-3: - console_region_id: aws-us-west-2 - zenith-1-ps-4: - console_region_id: aws-us-west-2 - zenith-1-ps-5: - console_region_id: aws-us-west-2 - - safekeepers: - hosts: - zenith-1-sk-1: - console_region_id: aws-us-west-2 - zenith-1-sk-2: - console_region_id: aws-us-west-2 - zenith-1-sk-4: - console_region_id: aws-us-west-2 diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index f28dc8e07b54..b5377957049a 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -2,12 +2,17 @@ storage: vars: bucket_name: neon-dev-storage-eu-west-1 bucket_region: eu-west-1 - console_mgmt_base_url: http://console-staging.local + console_mgmt_base_url: http://neon-internal-api.aws.neon.build broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events + metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "20m" + threshold: "20m" remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 9a1a0952822a..cd8f832af017 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -2,12 +2,17 @@ storage: vars: bucket_name: neon-staging-storage-us-east-2 bucket_region: us-east-2 - console_mgmt_base_url: http://console-staging.local + console_mgmt_base_url: http://neon-internal-api.aws.neon.build broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events + metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "20m" + threshold: "20m" remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml 
b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index c49b8d20097a..ecf57554d971 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-staging.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2" domain: "*.eu-west-1.aws.neon.build" sentryEnvironment: "staging" wssPort: 8443 - metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" metricCollectionInterval: "1min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index 157ae66ed1af..91ddd07eae5d 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -10,7 +10,7 @@ settings: uri: "https://console.stage.neon.tech/psql_session/" domain: "pg.neon.build" sentryEnvironment: "staging" - metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" metricCollectionInterval: "1min" # -- Additional labels for neon-proxy-link pods diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index 99b67d75c1c8..6ec18ff388a9 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-staging.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2" domain: "*.cloud.stage.neon.tech" sentryEnvironment: "staging" wssPort: 8443 - metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" metricCollectionInterval: "1min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index 764bb25b646a..9b250fce6eee 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-staging.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2" domain: "*.us-east-2.aws.neon.build" sentryEnvironment: "staging" wssPort: 8443 - metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" metricCollectionInterval: "1min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index a640d468b34c..389da354638e 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ 
b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-release.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.ap-southeast-1.aws.neon.tech" sentryEnvironment: "production" wssPort: 8443 - metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" metricCollectionInterval: "10min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index c9430877de84..7e16ac2d3d7e 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-release.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.eu-central-1.aws.neon.tech" sentryEnvironment: "production" wssPort: 8443 - metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" metricCollectionInterval: "10min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 677df6a5be1f..05e41e7a97c4 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-release.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-east-2.aws.neon.tech" sentryEnvironment: "production" wssPort: 8443 - metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" metricCollectionInterval: "10min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml index 3a5cde4b01a3..e67a3e44618d 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml @@ -6,11 +6,11 @@ image: settings: authBackend: "console" - authEndpoint: "http://console-release.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.cloud.neon.tech" sentryEnvironment: "production" wssPort: 8443 - metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" metricCollectionInterval: "10min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index 919a0d503cf8..5dc23b282e06 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -6,11 +6,11 @@ 
image: settings: authBackend: "console" - authEndpoint: "http://console-release.local/management/api/v2" + authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-west-2.aws.neon.tech" sentryEnvironment: "production" wssPort: 8443 - metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" metricCollectionInterval: "10min" # -- Additional labels for neon-proxy pods diff --git a/.github/helm-values/production.neon-storage-broker.yaml b/.github/helm-values/production.neon-storage-broker.yaml deleted file mode 100644 index aa64081da3f0..000000000000 --- a/.github/helm-values/production.neon-storage-broker.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# Helm chart values for neon-storage-broker -podLabels: - neon_env: production - neon_service: storage-broker - -# Use L4 LB -service: - # service.annotations -- Annotations to add to the service - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller - service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip - service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet - # assign service to this name at external-dns - external-dns.alpha.kubernetes.io/hostname: storage-broker.prod.local - # service.type -- Service type - type: LoadBalancer - # service.port -- broker listen port - port: 50051 - -ingress: - enabled: false - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack - -extraManifests: - - apiVersion: operator.victoriametrics.com/v1beta1 - kind: VMServiceScrape - metadata: - name: "{{ include \"neon-storage-broker.fullname\" . 
}}" - labels: - helm.sh/chart: neon-storage-broker-{{ .Chart.Version }} - app.kubernetes.io/name: neon-storage-broker - app.kubernetes.io/instance: neon-storage-broker - app.kubernetes.io/version: "{{ .Chart.AppVersion }}" - app.kubernetes.io/managed-by: Helm - namespace: "{{ .Release.Namespace }}" - spec: - selector: - matchLabels: - app.kubernetes.io/name: "neon-storage-broker" - endpoints: - - port: broker - path: /metrics - interval: 10s - scrapeTimeout: 10s - namespaceSelector: - matchNames: - - "{{ .Release.Namespace }}" - -settings: - sentryEnvironment: "production" diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 27b7f54856f8..d16d221cc4ca 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -611,34 +611,31 @@ jobs: run: shell: sh -eu {0} env: - VM_INFORMANT_VERSION: 0.1.1 + VM_BUILDER_VERSION: v0.4.6 steps: - - name: Downloading latest vm-builder + - name: Checkout + uses: actions/checkout@v1 + with: + fetch-depth: 0 + + - name: Downloading vm-builder run: | - curl -L https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder + curl -L https://github.com/neondatabase/neonvm/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder chmod +x vm-builder - name: Pulling compute-node image run: | docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - - name: Downloading VM informant version ${{ env.VM_INFORMANT_VERSION }} - run: | - curl -fL https://github.com/neondatabase/autoscaling/releases/download/${{ env.VM_INFORMANT_VERSION }}/vm-informant -o vm-informant - chmod +x vm-informant - - - name: Adding VM informant to compute-node image + - name: Building VM compute-node rootfs run: | - ID=$(docker create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}) - docker cp vm-informant $ID:/bin/vm-informant - docker commit $ID temp-vm-compute-node - docker rm -f $ID + docker build -t temp-vm-compute-node --build-arg SRC_IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -f Dockerfile.vm-compute-node . 
- name: Build vm image run: | # note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images - ./vm-builder -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + ./vm-builder -use-inittab -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - name: Pushing vm-compute-node image run: | diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml index f4ce7e9afa34..540d187274a1 100644 --- a/.github/workflows/deploy-prod.yml +++ b/.github/workflows/deploy-prod.yml @@ -165,80 +165,3 @@ jobs: - name: Deploy storage-broker run: helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - # Deploy to old account below - - deploy: - runs-on: prod - container: - image: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - options: --user root --privileged - if: inputs.deployStorage && inputs.disclamerAcknowledged - defaults: - run: - shell: bash - environment: - name: prod-old - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - ref: ${{ inputs.branch }} - - - name: Redeploy - run: | - export DOCKER_TAG=${{ inputs.dockerTag }} - cd "$(pwd)/.github/ansible" - - ./get_binaries.sh - - eval $(ssh-agent) - echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key - echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub - chmod 0600 ssh-key - ssh-add ssh-key - rm -f ssh-key ssh-key-cert.pub - ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater - ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i production.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version - - # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ansible/collections': Permission denied - - name: Cleanup ansible folder - run: rm -rf ~/.ansible - - deploy-storage-broker: - name: deploy storage broker on old staging and old prod - runs-on: [ self-hosted, gen3, small ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - if: inputs.deployStorageBroker && inputs.disclamerAcknowledged - defaults: - run: - shell: bash - environment: - name: prod-old - env: - KUBECONFIG: .kubeconfig - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - ref: ${{ inputs.branch }} - - - name: Store kubeconfig file - run: | - echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - - name: Add neon helm chart - run: helm repo add neondatabase https://neondatabase.github.io/helm-charts - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/production.neon-storage-broker.yaml --set 
image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - - name: Cleanup helm folder - run: rm -rf ~/.cache diff --git a/Cargo.lock b/Cargo.lock index 67e54d38337e..d154b4eaea2f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2110,6 +2110,16 @@ version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" +[[package]] +name = "mime_guess" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" +dependencies = [ + "mime", + "unicase", +] + [[package]] name = "minimal-lexical" version = "0.2.1" @@ -2884,6 +2894,7 @@ dependencies = [ "md5", "metrics", "once_cell", + "opentelemetry", "parking_lot", "pin-project-lite", "pq_proto", @@ -2892,6 +2903,8 @@ dependencies = [ "rcgen", "regex", "reqwest", + "reqwest-middleware", + "reqwest-tracing", "routerify", "rstest", "rustls", @@ -2909,7 +2922,9 @@ dependencies = [ "tokio-postgres-rustls", "tokio-rustls", "tracing", + "tracing-opentelemetry", "tracing-subscriber", + "tracing-utils", "url", "utils", "uuid", @@ -3079,6 +3094,7 @@ dependencies = [ "js-sys", "log", "mime", + "mime_guess", "once_cell", "percent-encoding", "pin-project-lite", @@ -3098,6 +3114,36 @@ dependencies = [ "winreg", ] +[[package]] +name = "reqwest-middleware" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a1c03e9011a8c59716ad13115550469e081e2e9892656b0ba6a47c907921894" +dependencies = [ + "anyhow", + "async-trait", + "http", + "reqwest", + "serde", + "task-local-extensions", + "thiserror", +] + +[[package]] +name = "reqwest-tracing" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b739d87a6b2cf4743968ad2b4cef648fbe0204c19999509824425babb2097bce" +dependencies = [ + "async-trait", + "opentelemetry", + "reqwest", + "reqwest-middleware", + "task-local-extensions", + "tracing", + "tracing-opentelemetry", +] + [[package]] name = "ring" version = "0.16.20" @@ -3790,6 +3836,15 @@ dependencies = [ "xattr", ] +[[package]] +name = "task-local-extensions" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4167afbec18ae012de40f8cf1b9bf48420abb390678c34821caa07d924941cc4" +dependencies = [ + "tokio", +] + [[package]] name = "tempfile" version = "3.3.0" @@ -3809,6 +3864,8 @@ name = "tenant_size_model" version = "0.1.0" dependencies = [ "anyhow", + "serde", + "serde_json", "workspace_hack", ] @@ -4358,6 +4415,15 @@ dependencies = [ "libc", ] +[[package]] +name = "unicase" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" +dependencies = [ + "version_check", +] + [[package]] name = "unicode-bidi" version = "0.3.10" diff --git a/Cargo.toml b/Cargo.toml index 4e4667f253af..99a3f560262a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -76,6 +76,8 @@ prost = "0.11" rand = "0.8" regex = "1.4" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } +reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] } +reqwest-middleware = "0.2.0" routerify = "3" rpds = "0.12.0" rustls = "0.20" diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 5c58f4baaa79..1d6c2f354f5c 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node 
@@ -11,7 +11,7 @@ FROM debian:bullseye-slim AS build-deps RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \ - libicu-dev + libicu-dev libxslt1-dev ######################################################################################### # @@ -23,7 +23,8 @@ FROM build-deps AS pg-build ARG PG_VERSION COPY vendor/postgres-${PG_VERSION} postgres RUN cd postgres && \ - ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu && \ + ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu \ + --with-libxml --with-libxslt && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ # Install headers @@ -34,7 +35,8 @@ RUN cd postgres && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control ######################################################################################### # @@ -50,15 +52,15 @@ RUN apt update && \ libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ protobuf-c-compiler xsltproc -RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz && \ - tar zxvf SFCGAL-v1.3.10.tar.gz && \ - cd SFCGAL-v1.3.10 && cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \ +# SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2 +RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \ + mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \ + cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \ DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \ make clean && cp -R /sfcgal/* / -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ - tar xvzf postgis-3.3.1.tar.gz && \ - cd postgis-3.3.1 && \ +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \ + mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \ ./autogen.sh && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ @@ -83,29 +85,14 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ FROM build-deps AS plv8-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils + apt install -y ninja-build python3-dev libncurses5 binutils clang -# https://github.com/plv8/plv8/issues/475: -# v8 uses gold for linking and sets `--thread-count=4` which breaks -# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) -# Install newer gold version manually as debian-testing binutils version updates -# libc version, which in turn breaks other extension built against non-testing libc. 
-RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ - tar xvzf binutils-2.38.tar.gz && \ - cd binutils-2.38 && \ - cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ - cd ../bfd && ./configure && make bfdver.h && \ - cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ - cp /usr/local/bin/ld.gold /usr/bin/gold - -# Sed is used to patch for https://github.com/plv8/plv8/issues/503 -RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ - tar xvzf v3.1.4.tar.gz && \ - cd plv8-3.1.4 && \ +RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \ + mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ - sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ + find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control @@ -126,20 +113,17 @@ RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2 && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ && rm /tmp/cmake-install.sh -RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ - tar xvzf h3.tgz && \ - cd h3-4.0.1 && \ - mkdir build && \ - cd build && \ +RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \ + mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \ + mkdir build && cd build && \ cmake .. -DCMAKE_BUILD_TYPE=Release && \ make -j $(getconf _NPROCESSORS_ONLN) && \ DESTDIR=/h3 make install && \ cp -R /h3/usr / && \ rm -rf build -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \ - tar xvzf h3-pg.tgz && \ - cd h3-pg-4.0.1 && \ +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \ + mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -155,9 +139,8 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 FROM build-deps AS unit-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \ - tar xvzf 7.7.tar.gz && \ - cd postgresql-unit-7.7 && \ +RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \ + mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ # unit extension's "create extension" script relies on absolute install path to fill some reference tables. 
@@ -176,12 +159,27 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz & FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN git clone --branch v0.4.0 https://github.com/pgvector/pgvector.git && \ - cd pgvector && \ +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \ + mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control +######################################################################################### +# +# Layer "pgjwt-pg-build" +# compile pgjwt extension +# +######################################################################################### +FROM build-deps AS pgjwt-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +# 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021 +RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \ + mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -196,6 +194,7 @@ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /h3/usr / COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -258,6 +257,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb # libicu67, locales for collations (including ICU) # libossp-uuid16 for extension ossp-uuid # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS +# libxml2, libxslt1.1 for xml2 RUN apt update && \ apt install --no-install-recommends -y \ locales \ @@ -269,6 +269,8 @@ RUN apt update && \ libproj19 \ libprotobuf-c1 \ libsfcgal1 \ + libxml2 \ + libxslt1.1 \ gdb && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 diff --git a/Dockerfile.vm-compute-node b/Dockerfile.vm-compute-node new file mode 100644 index 000000000000..af3bfb359005 --- /dev/null +++ b/Dockerfile.vm-compute-node @@ -0,0 +1,32 @@ +# Note: this file *mostly* just builds on Dockerfile.compute-node + +ARG SRC_IMAGE +ARG VM_INFORMANT_VERSION=v0.1.6 + +# Pull VM informant and set up inittab +FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant + +RUN set -e \ + && rm -f /etc/inittab \ + && touch /etc/inittab + +ADD vm-cgconfig.conf /etc/cgconfig.conf +RUN set -e \ + && echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \ + && echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant --auto-restart --cgroup=neon-postgres'" >> /etc/inittab + +# Combine, starting from non-VM compute node image. 
+FROM $SRC_IMAGE as base + +# Temporarily set user back to root so we can run apt update and adduser +USER root +RUN apt update && \ + apt install --no-install-recommends -y \ + cgroup-tools +RUN adduser vm-informant --disabled-password --no-create-home +USER postgres + +COPY --from=informant /etc/inittab /etc/inittab +COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant + +ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"] diff --git a/Makefile b/Makefile index 92a453268482..e04a82c7c9d1 100644 --- a/Makefile +++ b/Makefile @@ -136,9 +136,15 @@ neon-pg-ext-%: postgres-% .PHONY: neon-pg-ext-clean-% neon-pg-ext-clean-%: - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean - $(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_test_utils-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ + -C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean .PHONY: neon-pg-ext neon-pg-ext: \ diff --git a/README.md b/README.md index 29389e7a5d34..819693f1f3ef 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,11 @@ dnf install flex bison readline-devel zlib-devel openssl-devel \ libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \ protobuf-devel ``` +* On Arch based systems, these packages are needed: +```bash +pacman -S base-devel readline zlib libseccomp openssl clang \ +postgresql-libs cmake postgresql protobuf +``` 2. [Install Rust](https://www.rust-lang.org/tools/install) ``` @@ -83,9 +88,10 @@ cd neon # The preferred and default is to make a debug build. This will create a # demonstrably slower build than a release build. For a release build, -# use "BUILD_TYPE=release make -j`nproc`" +# use "BUILD_TYPE=release make -j`nproc` -s" +# Remove -s for the verbose build log -make -j`nproc` +make -j`nproc` -s ``` #### Building on OSX @@ -99,9 +105,10 @@ cd neon # The preferred and default is to make a debug build. This will create a # demonstrably slower build than a release build. 
For a release build, -# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`" +# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu` -s" +# Remove -s for the verbose build log -make -j`sysctl -n hw.logicalcpu` +make -j`sysctl -n hw.logicalcpu` -s ``` #### Dependency installation notes diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 2c426620205d..49cf1cd3473d 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -44,7 +44,6 @@ use tracing::{error, info}; use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus}; use compute_tools::http::api::launch_http_server; -use compute_tools::informant::spawn_vm_informant_if_present; use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; @@ -141,8 +140,6 @@ fn main() -> Result<()> { // requests, while configuration is still in progress. let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread"); let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread"); - // Also spawn the thread responsible for handling the VM informant -- if it's present - let _vm_informant_handle = spawn_vm_informant_if_present().expect("cannot launch VM informant"); // Start Postgres let mut delay_exit = false; diff --git a/compute_tools/src/informant.rs b/compute_tools/src/informant.rs deleted file mode 100644 index 8a6e3ab43a3d..000000000000 --- a/compute_tools/src/informant.rs +++ /dev/null @@ -1,50 +0,0 @@ -use std::path::Path; -use std::process; -use std::thread; -use std::time::Duration; -use tracing::{info, warn}; - -use anyhow::{Context, Result}; - -const VM_INFORMANT_PATH: &str = "/bin/vm-informant"; -const RESTART_INFORMANT_AFTER_MILLIS: u64 = 5000; - -/// Launch a thread to start the VM informant if it's present (and restart, on failure) -pub fn spawn_vm_informant_if_present() -> Result>> { - let exists = Path::new(VM_INFORMANT_PATH) - .try_exists() - .context("could not check if path exists")?; - - if !exists { - return Ok(None); - } - - Ok(Some( - thread::Builder::new() - .name("run-vm-informant".into()) - .spawn(move || run_informant())?, - )) -} - -fn run_informant() -> ! { - let restart_wait = Duration::from_millis(RESTART_INFORMANT_AFTER_MILLIS); - - info!("starting VM informant"); - - loop { - let mut cmd = process::Command::new(VM_INFORMANT_PATH); - // Block on subprocess: - let result = cmd.status(); - - match result { - Err(e) => warn!("failed to run VM informant at {VM_INFORMANT_PATH:?}: {e}"), - Ok(status) if !status.success() => { - warn!("{VM_INFORMANT_PATH} exited with code {status:?}, retrying") - } - Ok(_) => info!("{VM_INFORMANT_PATH} ended gracefully (unexpectedly). 
Retrying"), - } - - // Wait before retrying - thread::sleep(restart_wait); - } -} diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index a71b92f91a8e..aee6b53e6a19 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -8,7 +8,6 @@ pub mod http; #[macro_use] pub mod logger; pub mod compute; -pub mod informant; pub mod monitor; pub mod params; pub mod pg_helpers; diff --git a/docs/settings.md b/docs/settings.md index 58d32157a3db..817f97d8ba7a 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -16,7 +16,7 @@ listen_http_addr = '127.0.0.1:9898' checkpoint_distance = '268435456' # in bytes checkpoint_timeout = '10m' -gc_period = '100 s' +gc_period = '1 hour' gc_horizon = '67108864' max_file_descriptors = '100' @@ -101,7 +101,7 @@ away. #### gc_period -Interval at which garbage collection is triggered. Default is 100 s. +Interval at which garbage collection is triggered. Default is 1 hour. #### image_creation_threshold @@ -109,7 +109,7 @@ L0 delta layer threshold for L1 image layer creation. Default is 3. #### pitr_interval -WAL retention duration for PITR branching. Default is 30 days. +WAL retention duration for PITR branching. Default is 7 days. #### walreceiver_connect_timeout diff --git a/docs/synthetic-size.md b/docs/synthetic-size.md index 8378efc84256..407d7b525ac7 100644 --- a/docs/synthetic-size.md +++ b/docs/synthetic-size.md @@ -37,7 +37,7 @@ The synthetic size is designed to: ## Terms & assumptions -- logical size is the size of a database *at a given point in +- logical size is the size of a branch *at a given point in time*. It's the total size of all tables in all databases, as you see with "\l+" in psql for example, plus the Postgres SLRUs and some small amount of metadata. NOTE that currently, Neon does not include diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index c5e4dbd1f0d9..b7995c840cb9 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -75,27 +75,36 @@ impl StartupMessageParams { /// taking into account all escape sequences but leaving them as-is. /// [`None`] means that there's no `options` in [`Self`]. pub fn options_raw(&self) -> Option> { + self.get("options").map(Self::parse_options_raw) + } + + /// Split command-line options according to PostgreSQL's logic, + /// applying all escape sequences (using owned strings as needed). + /// [`None`] means that there's no `options` in [`Self`]. + pub fn options_escaped(&self) -> Option>> { + self.get("options").map(Self::parse_options_escaped) + } + + /// Split command-line options according to PostgreSQL's logic, + /// taking into account all escape sequences but leaving them as-is. + pub fn parse_options_raw(input: &str) -> impl Iterator { // See `postgres: pg_split_opts`. let mut last_was_escape = false; - let iter = self - .get("options")? + input .split(move |c: char| { // We split by non-escaped whitespace symbols. let should_split = c.is_ascii_whitespace() && !last_was_escape; last_was_escape = c == '\\' && !last_was_escape; should_split }) - .filter(|s| !s.is_empty()); - - Some(iter) + .filter(|s| !s.is_empty()) } /// Split command-line options according to PostgreSQL's logic, /// applying all escape sequences (using owned strings as needed). - /// [`None`] means that there's no `options` in [`Self`]. - pub fn options_escaped(&self) -> Option>> { + pub fn parse_options_escaped(input: &str) -> impl Iterator> { // See `postgres: pg_split_opts`. 
- let iter = self.options_raw()?.map(|s| { + Self::parse_options_raw(input).map(|s| { let mut preserve_next_escape = false; let escape = |c| { // We should remove '\\' unless it's preceded by '\\'. @@ -108,9 +117,12 @@ impl StartupMessageParams { true => Cow::Owned(s.replace(escape, "")), false => Cow::Borrowed(s), } - }); + }) + } - Some(iter) + /// Iterate through key-value pairs in an arbitrary order. + pub fn iter(&self) -> impl Iterator { + self.params.iter().map(|(k, v)| (k.as_str(), v.as_str())) } // This function is mostly useful in tests. diff --git a/libs/tenant_size_model/Cargo.toml b/libs/tenant_size_model/Cargo.toml index a5f0160f35f8..15e78932a83d 100644 --- a/libs/tenant_size_model/Cargo.toml +++ b/libs/tenant_size_model/Cargo.toml @@ -7,5 +7,7 @@ license.workspace = true [dependencies] anyhow.workspace = true +serde.workspace = true +serde_json.workspace = true workspace_hack.workspace = true diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs new file mode 100644 index 000000000000..093b053675a7 --- /dev/null +++ b/libs/tenant_size_model/src/calculation.rs @@ -0,0 +1,219 @@ +use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel}; + +// +// *-g--*---D---> +// / +// / +// / *---b----*-B---> +// / / +// / / +// -----*--e---*-----f----* C +// E \ +// \ +// *--a---*---A--> +// +// If A and B need to be retained, is it cheaper to store +// snapshot at C+a+b, or snapshots at A and B ? +// +// If D also needs to be retained, which is cheaper: +// +// 1. E+g+e+f+a+b +// 2. D+C+a+b +// 3. D+A+B + +/// [`Segment`] which has had it's size calculated. +#[derive(Clone, Debug)] +struct SegmentSize { + method: SegmentMethod, + + // calculated size of this subtree, using this method + accum_size: u64, + + seg_id: usize, + children: Vec, +} + +struct SizeAlternatives { + // cheapest alternative if parent is available. + incremental: SegmentSize, + + // cheapest alternative if parent node is not available + non_incremental: Option, +} + +impl StorageModel { + pub fn calculate(&self) -> SizeResult { + // Build adjacency list. 'child_list' is indexed by segment id. Each entry + // contains a list of all child segments of the segment. + let mut roots: Vec = Vec::new(); + let mut child_list: Vec> = Vec::new(); + child_list.resize(self.segments.len(), Vec::new()); + + for (seg_id, seg) in self.segments.iter().enumerate() { + if let Some(parent_id) = seg.parent { + child_list[parent_id].push(seg_id); + } else { + roots.push(seg_id); + } + } + + let mut segment_results = Vec::new(); + segment_results.resize( + self.segments.len(), + SegmentSizeResult { + method: SegmentMethod::Skipped, + accum_size: 0, + }, + ); + + let mut total_size = 0; + for root in roots { + if let Some(selected) = self.size_here(root, &child_list).non_incremental { + StorageModel::fill_selected_sizes(&selected, &mut segment_results); + total_size += selected.accum_size; + } else { + // Couldn't find any way to get this root. Error? + } + } + + SizeResult { + total_size, + segments: segment_results, + } + } + + fn fill_selected_sizes(selected: &SegmentSize, result: &mut Vec) { + result[selected.seg_id] = SegmentSizeResult { + method: selected.method, + accum_size: selected.accum_size, + }; + // recurse to children + for child in selected.children.iter() { + StorageModel::fill_selected_sizes(child, result); + } + } + + // + // This is the core of the sizing calculation. 
+ // + // This is a recursive function, that for each Segment calculates the best way + // to reach all the Segments that are marked as needed in this subtree, under two + // different conditions: + // a) when the parent of this segment is available (as a snaphot or through WAL), and + // b) when the parent of this segment is not available. + // + fn size_here(&self, seg_id: usize, child_list: &Vec>) -> SizeAlternatives { + let seg = &self.segments[seg_id]; + // First figure out the best way to get each child + let mut children = Vec::new(); + for child_id in &child_list[seg_id] { + children.push(self.size_here(*child_id, child_list)) + } + + // Method 1. If this node is not needed, we can skip it as long as we + // take snapshots later in each sub-tree + let snapshot_later = if !seg.needed { + let mut snapshot_later = SegmentSize { + seg_id, + method: SegmentMethod::Skipped, + accum_size: 0, + children: Vec::new(), + }; + + let mut possible = true; + for child in children.iter() { + if let Some(non_incremental) = &child.non_incremental { + snapshot_later.accum_size += non_incremental.accum_size; + snapshot_later.children.push(non_incremental.clone()) + } else { + possible = false; + break; + } + } + if possible { + Some(snapshot_later) + } else { + None + } + } else { + None + }; + + // Method 2. Get a snapshot here. This assumed to be possible, if the 'size' of + // this Segment was given. + let snapshot_here = if !seg.needed || seg.parent.is_none() { + if let Some(snapshot_size) = seg.size { + let mut snapshot_here = SegmentSize { + seg_id, + method: SegmentMethod::SnapshotHere, + accum_size: snapshot_size, + children: Vec::new(), + }; + for child in children.iter() { + snapshot_here.accum_size += child.incremental.accum_size; + snapshot_here.children.push(child.incremental.clone()) + } + Some(snapshot_here) + } else { + None + } + } else { + None + }; + + // Method 3. Use WAL to get here from parent + let wal_here = { + let mut wal_here = SegmentSize { + seg_id, + method: SegmentMethod::Wal, + accum_size: if let Some(parent_id) = seg.parent { + seg.lsn - self.segments[parent_id].lsn + } else { + 0 + }, + children: Vec::new(), + }; + for child in children { + wal_here.accum_size += child.incremental.accum_size; + wal_here.children.push(child.incremental) + } + wal_here + }; + + // If the parent is not available, what's the cheapest method involving + // a snapshot here or later? + let mut cheapest_non_incremental: Option = None; + if let Some(snapshot_here) = snapshot_here { + cheapest_non_incremental = Some(snapshot_here); + } + if let Some(snapshot_later) = snapshot_later { + // Use <=, to prefer skipping if the size is equal + if let Some(parent) = &cheapest_non_incremental { + if snapshot_later.accum_size <= parent.accum_size { + cheapest_non_incremental = Some(snapshot_later); + } + } else { + cheapest_non_incremental = Some(snapshot_later); + } + } + + // And what's the cheapest method, if the parent is available? + let cheapest_incremental = if let Some(cheapest_non_incremental) = &cheapest_non_incremental + { + // Is it cheaper to use a snapshot here or later, anyway? 
+ // Use <, to prefer Wal over snapshot if the cost is the same + if wal_here.accum_size < cheapest_non_incremental.accum_size { + wal_here + } else { + cheapest_non_incremental.clone() + } + } else { + wal_here + }; + + SizeAlternatives { + incremental: cheapest_incremental, + non_incremental: cheapest_non_incremental, + } + } +} diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs index b156e1be9d52..c151e3b42cde 100644 --- a/libs/tenant_size_model/src/lib.rs +++ b/libs/tenant_size_model/src/lib.rs @@ -1,401 +1,70 @@ -use std::borrow::Cow; -use std::collections::HashMap; - -use anyhow::Context; - -/// Pricing model or history size builder. +//! Synthetic size calculation + +mod calculation; +pub mod svg; + +/// StorageModel is the input to the synthetic size calculation. It represents +/// a tree of timelines, with just the information that's needed for the +/// calculation. This doesn't track timeline names or where each timeline +/// begins and ends, for example. Instead, it consists of "points of interest" +/// on the timelines. A point of interest could be the timeline start or end point, +/// the oldest point on a timeline that needs to be retained because of PITR +/// cutoff, or snapshot points named by the user. For each such point, and the +/// edge connecting the points (implicit in Segment), we store information about +/// whether we need to be able to recover to the point, and if known, the logical +/// size at the point. /// -/// Maintains knowledge of the branches and their modifications. Generic over the branch name key -/// type. -pub struct Storage { - segments: Vec, - - /// Mapping from the branch name to the index of a segment describing it's latest state. - branches: HashMap, +/// The segments must form a well-formed tree, with no loops. +#[derive(serde::Serialize)] +pub struct StorageModel { + pub segments: Vec, } -/// Snapshot of a branch. -#[derive(Clone, Debug, Eq, PartialEq)] +/// Segment represents one point in the tree of branches, *and* the edge that leads +/// to it (if any). We don't need separate structs for points and edges, because each +/// point can have only one parent. +/// +/// When 'needed' is true, it means that we need to be able to reconstruct +/// any version between 'parent.lsn' and 'lsn'. If you want to represent that only +/// a single point is needed, create two Segments with the same lsn, and mark only +/// the child as needed. +/// +#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)] pub struct Segment { /// Previous segment index into ['Storage::segments`], if any. - parent: Option, - - /// Description of how did we get to this state. - /// - /// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when - /// modifying a branch directly. - pub op: Cow<'static, str>, - - /// LSN before this state - start_lsn: u64, + pub parent: Option, - /// LSN at this state - pub end_lsn: u64, + /// LSN at this point + pub lsn: u64, - /// Logical size before this state - start_size: u64, + /// Logical size at this node, if known. + pub size: Option, - /// Logical size at this state. Can be None in the last Segment of a branch. 
- pub end_size: Option, - - /// Indices to [`Storage::segments`] - /// - /// FIXME: this could be an Option - children_after: Vec, - - /// Determined by `retention_period` given to [`Storage::calculate`] + /// If true, the segment from parent to this node is needed by `retention_period` pub needed: bool, } -// -// -// -// -// *-g--*---D---> -// / -// / -// / *---b----*-B---> -// / / -// / / -// -----*--e---*-----f----* C -// E \ -// \ -// *--a---*---A--> -// -// If A and B need to be retained, is it cheaper to store -// snapshot at C+a+b, or snapshots at A and B ? -// -// If D also needs to be retained, which is cheaper: -// -// 1. E+g+e+f+a+b -// 2. D+C+a+b -// 3. D+A+B - -/// [`Segment`] which has had it's size calculated. -pub struct SegmentSize { - pub seg_id: usize, - - pub method: SegmentMethod, - - this_size: u64, +/// Result of synthetic size calculation. Returned by StorageModel::calculate() +pub struct SizeResult { + pub total_size: u64, - pub children: Vec, + // This has same length as the StorageModel::segments vector in the input. + // Each entry in this array corresponds to the entry with same index in + // StorageModel::segments. + pub segments: Vec, } -impl SegmentSize { - fn total(&self) -> u64 { - self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total()) - } - - pub fn total_children(&self) -> u64 { - if self.method == SnapshotAfter { - self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total()) - } else { - self.children.iter().fold(0, |acc, x| acc + x.total()) - } - } +#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)] +pub struct SegmentSizeResult { + pub method: SegmentMethod, + // calculated size of this subtree, using this method + pub accum_size: u64, } /// Different methods to retain history from a particular state -#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[derive(Clone, Copy, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)] pub enum SegmentMethod { - SnapshotAfter, - Wal, - WalNeeded, + SnapshotHere, // A logical snapshot is needed after this segment + Wal, // Keep WAL leading up to this node Skipped, } - -use SegmentMethod::*; - -impl Storage { - /// Creates a new storage with the given default branch name. - pub fn new(initial_branch: K) -> Storage { - let init_segment = Segment { - op: "".into(), - needed: false, - parent: None, - start_lsn: 0, - end_lsn: 0, - start_size: 0, - end_size: Some(0), - children_after: Vec::new(), - }; - - Storage { - segments: vec![init_segment], - branches: HashMap::from([(initial_branch, 0)]), - } - } - - /// Advances the branch with a new point, at given LSN. 
- pub fn insert_point( - &mut self, - branch: &Q, - op: Cow<'static, str>, - lsn: u64, - size: Option, - ) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") }; - let newseg_id = self.segments.len(); - let lastseg = &mut self.segments[lastseg_id]; - - assert!(lsn > lastseg.end_lsn); - - let Some(start_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") }; - - let newseg = Segment { - op, - parent: Some(lastseg_id), - start_lsn: lastseg.end_lsn, - end_lsn: lsn, - start_size, - end_size: size, - children_after: Vec::new(), - needed: false, - }; - lastseg.children_after.push(newseg_id); - - self.segments.push(newseg); - *self.branches.get_mut(branch).expect("read already") = newseg_id; - - Ok(()) - } - - /// Advances the branch with the named operation, by the relative LSN and logical size bytes. - pub fn modify_branch( - &mut self, - branch: &Q, - op: Cow<'static, str>, - lsn_bytes: u64, - size_bytes: i64, - ) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") }; - let newseg_id = self.segments.len(); - let lastseg = &mut self.segments[lastseg_id]; - - let Some(last_end_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") }; - - let newseg = Segment { - op, - parent: Some(lastseg_id), - start_lsn: lastseg.end_lsn, - end_lsn: lastseg.end_lsn + lsn_bytes, - start_size: last_end_size, - end_size: Some((last_end_size as i64 + size_bytes) as u64), - children_after: Vec::new(), - needed: false, - }; - lastseg.children_after.push(newseg_id); - - self.segments.push(newseg); - *self.branches.get_mut(branch).expect("read already") = newseg_id; - Ok(()) - } - - pub fn insert(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - self.modify_branch(branch, "insert".into(), bytes, bytes as i64) - } - - pub fn update(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - self.modify_branch(branch, "update".into(), bytes, 0i64) - } - - pub fn delete(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> - where - K: std::borrow::Borrow, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64)) - } - - pub fn branch(&mut self, parent: &Q, name: K) -> anyhow::Result<()> - where - K: std::borrow::Borrow + std::fmt::Debug, - Q: std::hash::Hash + Eq + std::fmt::Debug, - { - // Find the right segment - let branchseg_id = *self.branches.get(parent).with_context(|| { - format!( - "should had found the parent {:?} by key. 
in branches {:?}", - parent, self.branches - ) - })?; - - let _branchseg = &mut self.segments[branchseg_id]; - - // Create branch name for it - self.branches.insert(name, branchseg_id); - Ok(()) - } - - pub fn calculate(&mut self, retention_period: u64) -> anyhow::Result { - // Phase 1: Mark all the segments that need to be retained - for (_branch, &last_seg_id) in self.branches.iter() { - let last_seg = &self.segments[last_seg_id]; - let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period); - let mut seg_id = last_seg_id; - loop { - let seg = &mut self.segments[seg_id]; - if seg.end_lsn < cutoff_lsn { - break; - } - seg.needed = true; - if let Some(prev_seg_id) = seg.parent { - seg_id = prev_seg_id; - } else { - break; - } - } - } - - // Phase 2: For each oldest segment in a chain that needs to be retained, - // calculate if we should store snapshot or WAL - self.size_from_snapshot_later(0) - } - - fn size_from_wal(&self, seg_id: usize) -> anyhow::Result { - let seg = &self.segments[seg_id]; - - let this_size = seg.end_lsn - seg.start_lsn; - - let mut children = Vec::new(); - - // try both ways - for &child_id in seg.children_after.iter() { - // try each child both ways - let child = &self.segments[child_id]; - let p1 = self.size_from_wal(child_id)?; - - let p = if !child.needed { - let p2 = self.size_from_snapshot_later(child_id)?; - if p1.total() < p2.total() { - p1 - } else { - p2 - } - } else { - p1 - }; - children.push(p); - } - Ok(SegmentSize { - seg_id, - method: if seg.needed { WalNeeded } else { Wal }, - this_size, - children, - }) - } - - fn size_from_snapshot_later(&self, seg_id: usize) -> anyhow::Result { - // If this is needed, then it's time to do the snapshot and continue - // with wal method. - let seg = &self.segments[seg_id]; - //eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed); - if seg.needed { - let mut children = Vec::new(); - - for &child_id in seg.children_after.iter() { - // try each child both ways - let child = &self.segments[child_id]; - let p1 = self.size_from_wal(child_id)?; - - let p = if !child.needed { - let p2 = self.size_from_snapshot_later(child_id)?; - if p1.total() < p2.total() { - p1 - } else { - p2 - } - } else { - p1 - }; - children.push(p); - } - Ok(SegmentSize { - seg_id, - method: WalNeeded, - this_size: seg.start_size, - children, - }) - } else { - // If any of the direct children are "needed", need to be able to reconstruct here - let mut children_needed = false; - for &child in seg.children_after.iter() { - let seg = &self.segments[child]; - if seg.needed { - children_needed = true; - break; - } - } - - let method1 = if !children_needed { - let mut children = Vec::new(); - for child in seg.children_after.iter() { - children.push(self.size_from_snapshot_later(*child)?); - } - Some(SegmentSize { - seg_id, - method: Skipped, - this_size: 0, - children, - }) - } else { - None - }; - - // If this a junction, consider snapshotting here - let method2 = if children_needed || seg.children_after.len() >= 2 { - let mut children = Vec::new(); - for child in seg.children_after.iter() { - children.push(self.size_from_wal(*child)?); - } - let Some(this_size) = seg.end_size else { anyhow::bail!("no end_size at junction {seg_id}") }; - Some(SegmentSize { - seg_id, - method: SnapshotAfter, - this_size, - children, - }) - } else { - None - }; - - Ok(match (method1, method2) { - (None, None) => anyhow::bail!( - "neither method was applicable: children_after={}, children_needed={}", - seg.children_after.len(), - 
children_needed - ), - (Some(method), None) => method, - (None, Some(method)) => method, - (Some(method1), Some(method2)) => { - if method1.total() < method2.total() { - method1 - } else { - method2 - } - } - }) - } - } - - pub fn into_segments(self) -> Vec { - self.segments - } -} diff --git a/libs/tenant_size_model/src/main.rs b/libs/tenant_size_model/src/main.rs deleted file mode 100644 index e32dd055f491..000000000000 --- a/libs/tenant_size_model/src/main.rs +++ /dev/null @@ -1,269 +0,0 @@ -//! Tenant size model testing ground. -//! -//! Has a number of scenarios and a `main` for invoking these by number, calculating the history -//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios -//! into pngs. - -use tenant_size_model::{Segment, SegmentSize, Storage}; - -// Main branch only. Some updates on it. -fn scenario_1() -> anyhow::Result<(Vec, SegmentSize)> { - // Create main branch - let mut storage = Storage::new("main"); - - // Bulk load 5 GB of data to it - storage.insert("main", 5_000)?; - - // Stream of updates - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - let size = storage.calculate(1000)?; - - Ok((storage.into_segments(), size)) -} - -// Main branch only. Some updates on it. -fn scenario_2() -> anyhow::Result<(Vec, SegmentSize)> { - // Create main branch - let mut storage = Storage::new("main"); - - // Bulk load 5 GB of data to it - storage.insert("main", 5_000)?; - - // Stream of updates - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - // Branch - storage.branch("main", "child")?; - storage.update("child", 1_000)?; - - // More updates on parent - storage.update("main", 1_000)?; - - let size = storage.calculate(1000)?; - - Ok((storage.into_segments(), size)) -} - -// Like 2, but more updates on main -fn scenario_3() -> anyhow::Result<(Vec, SegmentSize)> { - // Create main branch - let mut storage = Storage::new("main"); - - // Bulk load 5 GB of data to it - storage.insert("main", 5_000)?; - - // Stream of updates - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - // Branch - storage.branch("main", "child")?; - storage.update("child", 1_000)?; - - // More updates on parent - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - let size = storage.calculate(1000)?; - - Ok((storage.into_segments(), size)) -} - -// Diverged branches -fn scenario_4() -> anyhow::Result<(Vec, SegmentSize)> { - // Create main branch - let mut storage = Storage::new("main"); - - // Bulk load 5 GB of data to it - storage.insert("main", 5_000)?; - - // Stream of updates - for _ in 0..5 { - storage.update("main", 1_000)?; - } - - // Branch - storage.branch("main", "child")?; - storage.update("child", 1_000)?; - - // More updates on parent - for _ in 0..8 { - storage.update("main", 1_000)?; - } - - let size = storage.calculate(1000)?; - - Ok((storage.into_segments(), size)) -} - -fn scenario_5() -> anyhow::Result<(Vec, SegmentSize)> { - let mut storage = Storage::new("a"); - storage.insert("a", 5000)?; - storage.branch("a", "b")?; - storage.update("b", 4000)?; - storage.update("a", 2000)?; - storage.branch("a", "c")?; - storage.insert("c", 4000)?; - storage.insert("a", 2000)?; - - let size = storage.calculate(5000)?; - - Ok((storage.into_segments(), size)) -} - -fn scenario_6() -> anyhow::Result<(Vec, SegmentSize)> { - use std::borrow::Cow; - - const NO_OP: Cow<'static, str> = Cow::Borrowed(""); - - let branches = [ - Some(0x7ff1edab8182025f15ae33482edb590a_u128), - Some(0xb1719e044db05401a05a2ed588a3ad3f), - 
Some(0xb68d6691c895ad0a70809470020929ef), - ]; - - // compared to other scenarios, this one uses bytes instead of kB - - let mut storage = Storage::new(None); - - storage.branch(&None, branches[0])?; // at 0 - storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128)?; // at 108951064 - storage.branch(&branches[0], branches[1])?; // at 108951064 - storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392)?; // at 124511472 - storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904)?; // at 283415424 - storage.branch(&branches[0], branches[2])?; // at 283415424 - storage.modify_branch(&branches[2], NO_OP, 15906192, 8192)?; // at 299321616 - storage.modify_branch(&branches[0], NO_OP, 18909976, 32768)?; // at 302325400 - - let size = storage.calculate(100_000)?; - - Ok((storage.into_segments(), size)) -} - -fn main() { - let args: Vec = std::env::args().collect(); - - let scenario = if args.len() < 2 { "1" } else { &args[1] }; - - let (segments, size) = match scenario { - "1" => scenario_1(), - "2" => scenario_2(), - "3" => scenario_3(), - "4" => scenario_4(), - "5" => scenario_5(), - "6" => scenario_6(), - other => { - eprintln!("invalid scenario {}", other); - std::process::exit(1); - } - } - .unwrap(); - - graphviz_tree(&segments, &size); -} - -fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) { - use tenant_size_model::SegmentMethod::*; - - let seg_id = node.seg_id; - let seg = segments.get(seg_id).unwrap(); - let lsn = seg.end_lsn; - let size = seg.end_size.unwrap_or(0); - let method = node.method; - - println!(" {{"); - println!(" node [width=0.1 height=0.1 shape=oval]"); - - let tenant_size = node.total_children(); - - let penwidth = if seg.needed { 6 } else { 3 }; - let x = match method { - SnapshotAfter => - format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"), - Wal => - format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"), - WalNeeded => - format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"), - Skipped => - format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"), - }; - - println!(" \"seg{seg_id}\" [{x}]"); - println!(" }}"); - - // Recurse. 
Much of the data is actually on the edge - for child in node.children.iter() { - let child_id = child.seg_id; - graphviz_recurse(segments, child); - - let edge_color = match child.method { - SnapshotAfter => "gray", - Wal => "black", - WalNeeded => "black", - Skipped => "gray", - }; - - println!(" {{"); - println!(" edge [] "); - print!(" \"seg{seg_id}\" -> \"seg{child_id}\" ["); - print!("color={edge_color}"); - if child.method == WalNeeded { - print!(" penwidth=6"); - } - if child.method == Wal { - print!(" penwidth=3"); - } - - let next = segments.get(child_id).unwrap(); - - if next.op.is_empty() { - print!( - " label=\"{} / {}\"", - next.end_lsn - seg.end_lsn, - (next.end_size.unwrap_or(0) as i128 - seg.end_size.unwrap_or(0) as i128) - ); - } else { - print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn); - } - println!("]"); - println!(" }}"); - } -} - -fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) { - println!("digraph G {{"); - println!(" fontname=\"Helvetica,Arial,sans-serif\""); - println!(" node [fontname=\"Helvetica,Arial,sans-serif\"]"); - println!(" edge [fontname=\"Helvetica,Arial,sans-serif\"]"); - println!(" graph [center=1 rankdir=LR]"); - println!(" edge [dir=none]"); - - graphviz_recurse(segments, tree); - - println!("}}"); -} - -#[test] -fn scenarios_return_same_size() { - type ScenarioFn = fn() -> anyhow::Result<(Vec, SegmentSize)>; - let truths: &[(u32, ScenarioFn, _)] = &[ - (line!(), scenario_1, 8000), - (line!(), scenario_2, 9000), - (line!(), scenario_3, 13000), - (line!(), scenario_4, 16000), - (line!(), scenario_5, 17000), - (line!(), scenario_6, 333_792_000), - ]; - - for (line, scenario, expected) in truths { - let (_, size) = scenario().unwrap(); - assert_eq!(*expected, size.total_children(), "scenario on line {line}"); - } -} diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs new file mode 100644 index 000000000000..f26d3aa79d1a --- /dev/null +++ b/libs/tenant_size_model/src/svg.rs @@ -0,0 +1,193 @@ +use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel}; +use std::fmt::Write; + +const SVG_WIDTH: f32 = 500.0; + +struct SvgDraw<'a> { + storage: &'a StorageModel, + branches: &'a [String], + seg_to_branch: &'a [usize], + sizes: &'a [SegmentSizeResult], + + // layout + xscale: f32, + min_lsn: u64, + seg_coordinates: Vec<(f32, f32)>, +} + +fn draw_legend(result: &mut String) -> anyhow::Result<()> { + writeln!( + result, + "" + )?; + writeln!(result, "logical snapshot")?; + writeln!( + result, + "" + )?; + writeln!( + result, + "WAL within retention period" + )?; + writeln!( + result, + "" + )?; + writeln!( + result, + "WAL retained to avoid copy" + )?; + writeln!( + result, + "" + )?; + writeln!(result, "WAL not retained")?; + Ok(()) +} + +pub fn draw_svg( + storage: &StorageModel, + branches: &[String], + seg_to_branch: &[usize], + sizes: &SizeResult, +) -> anyhow::Result { + let mut draw = SvgDraw { + storage, + branches, + seg_to_branch, + sizes: &sizes.segments, + + xscale: 0.0, + min_lsn: 0, + seg_coordinates: Vec::new(), + }; + + let mut result = String::new(); + + writeln!(result, "")?; + + draw.calculate_svg_layout(); + + // Draw the tree + for (seg_id, _seg) in storage.segments.iter().enumerate() { + draw.draw_seg_phase1(seg_id, &mut result)?; + } + + // Draw snapshots + for (seg_id, _seg) in storage.segments.iter().enumerate() { + draw.draw_seg_phase2(seg_id, &mut result)?; + } + + draw_legend(&mut result)?; + + write!(result, "")?; + + Ok(result) +} + +impl<'a> SvgDraw<'a> { + fn 
calculate_svg_layout(&mut self) { + // Find x scale + let segments = &self.storage.segments; + let min_lsn = segments.iter().map(|s| s.lsn).fold(u64::MAX, std::cmp::min); + let max_lsn = segments.iter().map(|s| s.lsn).fold(0, std::cmp::max); + + // Start with 1 pixel = 1 byte. Double the scale until it fits into the image + let mut xscale = 1.0; + while (max_lsn - min_lsn) as f32 / xscale > SVG_WIDTH { + xscale *= 2.0; + } + + // Layout the timelines on Y dimension. + // TODO + let mut y = 100.0; + let mut branch_y_coordinates = Vec::new(); + for _branch in self.branches { + branch_y_coordinates.push(y); + y += 40.0; + } + + // Calculate coordinates for each point + let seg_coordinates = std::iter::zip(segments, self.seg_to_branch) + .map(|(seg, branch_id)| { + let x = (seg.lsn - min_lsn) as f32 / xscale; + let y = branch_y_coordinates[*branch_id]; + (x, y) + }) + .collect(); + + self.xscale = xscale; + self.min_lsn = min_lsn; + self.seg_coordinates = seg_coordinates; + } + + /// Draws lines between points + fn draw_seg_phase1(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> { + let seg = &self.storage.segments[seg_id]; + + let wal_bytes = if let Some(parent_id) = seg.parent { + seg.lsn - self.storage.segments[parent_id].lsn + } else { + 0 + }; + + let style = match self.sizes[seg_id].method { + SegmentMethod::SnapshotHere => "stroke-width=\"1\" stroke=\"gray\"", + SegmentMethod::Wal if seg.needed && wal_bytes > 0 => { + "stroke-width=\"6\" stroke=\"black\"" + } + SegmentMethod::Wal => "stroke-width=\"3\" stroke=\"black\"", + SegmentMethod::Skipped => "stroke-width=\"1\" stroke=\"gray\"", + }; + if let Some(parent_id) = seg.parent { + let (x1, y1) = self.seg_coordinates[parent_id]; + let (x2, y2) = self.seg_coordinates[seg_id]; + + writeln!( + result, + "", + )?; + writeln!( + result, + " {wal_bytes} bytes of WAL (seg {seg_id})" + )?; + writeln!(result, "")?; + } else { + // draw a little dash to mark the starting point of this branch + let (x, y) = self.seg_coordinates[seg_id]; + let (x1, y1) = (x, y - 5.0); + let (x2, y2) = (x, y + 5.0); + + writeln!( + result, + "", + )?; + writeln!(result, " (seg {seg_id})")?; + writeln!(result, "")?; + } + + Ok(()) + } + + /// Draw circles where snapshots are taken + fn draw_seg_phase2(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> { + let seg = &self.storage.segments[seg_id]; + + // draw a snapshot point if it's needed + let (coord_x, coord_y) = self.seg_coordinates[seg_id]; + if self.sizes[seg_id].method == SegmentMethod::SnapshotHere { + writeln!( + result, + "", + )?; + writeln!( + result, + " logical size {}", + seg.size.unwrap() + )?; + write!(result, "")?; + } + + Ok(()) + } +} diff --git a/libs/tenant_size_model/tests/tests.rs b/libs/tenant_size_model/tests/tests.rs new file mode 100644 index 000000000000..7660d41c56e7 --- /dev/null +++ b/libs/tenant_size_model/tests/tests.rs @@ -0,0 +1,313 @@ +//! Tenant size model tests. + +use tenant_size_model::{Segment, SizeResult, StorageModel}; + +use std::collections::HashMap; + +struct ScenarioBuilder { + segments: Vec, + + /// Mapping from the branch name to the index of a segment describing its latest state. + branches: HashMap, +} + +impl ScenarioBuilder { + /// Creates a new storage with the given default branch name. 
+ pub fn new(initial_branch: &str) -> ScenarioBuilder { + let init_segment = Segment { + parent: None, + lsn: 0, + size: Some(0), + needed: false, // determined later + }; + + ScenarioBuilder { + segments: vec![init_segment], + branches: HashMap::from([(initial_branch.into(), 0)]), + } + } + + /// Advances the branch with the named operation, by the relative LSN and logical size bytes. + pub fn modify_branch(&mut self, branch: &str, lsn_bytes: u64, size_bytes: i64) { + let lastseg_id = *self.branches.get(branch).unwrap(); + let newseg_id = self.segments.len(); + let lastseg = &mut self.segments[lastseg_id]; + + let newseg = Segment { + parent: Some(lastseg_id), + lsn: lastseg.lsn + lsn_bytes, + size: Some((lastseg.size.unwrap() as i64 + size_bytes) as u64), + needed: false, + }; + + self.segments.push(newseg); + *self.branches.get_mut(branch).expect("read already") = newseg_id; + } + + pub fn insert(&mut self, branch: &str, bytes: u64) { + self.modify_branch(branch, bytes, bytes as i64); + } + + pub fn update(&mut self, branch: &str, bytes: u64) { + self.modify_branch(branch, bytes, 0i64); + } + + pub fn _delete(&mut self, branch: &str, bytes: u64) { + self.modify_branch(branch, bytes, -(bytes as i64)); + } + + /// Panics if the parent branch cannot be found. + pub fn branch(&mut self, parent: &str, name: &str) { + // Find the right segment + let branchseg_id = *self + .branches + .get(parent) + .expect("should had found the parent by key"); + let _branchseg = &mut self.segments[branchseg_id]; + + // Create branch name for it + self.branches.insert(name.to_string(), branchseg_id); + } + + pub fn calculate(&mut self, retention_period: u64) -> (StorageModel, SizeResult) { + // Phase 1: Mark all the segments that need to be retained + for (_branch, &last_seg_id) in self.branches.iter() { + let last_seg = &self.segments[last_seg_id]; + let cutoff_lsn = last_seg.lsn.saturating_sub(retention_period); + let mut seg_id = last_seg_id; + loop { + let seg = &mut self.segments[seg_id]; + if seg.lsn <= cutoff_lsn { + break; + } + seg.needed = true; + if let Some(prev_seg_id) = seg.parent { + seg_id = prev_seg_id; + } else { + break; + } + } + } + + // Perform the calculation + let storage_model = StorageModel { + segments: self.segments.clone(), + }; + let size_result = storage_model.calculate(); + (storage_model, size_result) + } +} + +// Main branch only. Some updates on it. +#[test] +fn scenario_1() { + // Create main branch + let mut scenario = ScenarioBuilder::new("main"); + + // Bulk load 5 GB of data to it + scenario.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // Calculate the synthetic size with retention horizon 1000 + let (_model, result) = scenario.calculate(1000); + + // The end of the branch is at LSN 10000. Need to retain + // a logical snapshot at LSN 9000, plus the WAL between 9000-10000. + // The logical snapshot has size 5000. + assert_eq!(result.total_size, 5000 + 1000); +} + +// Main branch only. Some updates on it. 
+#[test] +fn scenario_2() { + // Create main branch + let mut scenario = ScenarioBuilder::new("main"); + + // Bulk load 5 GB of data to it + scenario.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // Branch + scenario.branch("main", "child"); + scenario.update("child", 1_000); + + // More updates on parent + scenario.update("main", 1_000); + + // + // The history looks like this now: + // + // 10000 11000 + // *----*----*--------------* main + // | + // | 11000 + // +-------------- child + // + // + // With retention horizon 1000, we need to retain logical snapshot + // at the branch point, size 5000, and the WAL from 10000-11000 on + // both branches. + let (_model, result) = scenario.calculate(1000); + + assert_eq!(result.total_size, 5000 + 1000 + 1000); +} + +// Like 2, but more updates on main +#[test] +fn scenario_3() { + // Create main branch + let mut scenario = ScenarioBuilder::new("main"); + + // Bulk load 5 GB of data to it + scenario.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // Branch + scenario.branch("main", "child"); + scenario.update("child", 1_000); + + // More updates on parent + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // + // The history looks like this now: + // + // 10000 15000 + // *----*----*------------------------------------* main + // | + // | 11000 + // +-------------- child + // + // + // With retention horizon 1000, it's still cheapest to retain + // - snapshot at branch point (size 5000) + // - WAL on child between 10000-11000 + // - WAL on main between 10000-15000 + // + // This is in total 5000 + 1000 + 5000 + // + let (_model, result) = scenario.calculate(1000); + + assert_eq!(result.total_size, 5000 + 1000 + 5000); +} + +// Diverged branches +#[test] +fn scenario_4() { + // Create main branch + let mut scenario = ScenarioBuilder::new("main"); + + // Bulk load 5 GB of data to it + scenario.insert("main", 5_000); + + // Stream of updates + for _ in 0..5 { + scenario.update("main", 1_000); + } + + // Branch + scenario.branch("main", "child"); + scenario.update("child", 1_000); + + // More updates on parent + for _ in 0..8 { + scenario.update("main", 1_000); + } + + // + // The history looks like this now: + // + // 10000 18000 + // *----*----*------------------------------------* main + // | + // | 11000 + // +-------------- child + // + // + // With retention horizon 1000, it's now cheapest to retain + // separate snapshots on both branches: + // - snapshot on main branch at LSN 17000 (size 5000) + // - WAL on main between 17000-18000 + // - snapshot on child branch at LSN 10000 (size 5000) + // - WAL on child between 10000-11000 + // + // This is in total 5000 + 1000 + 5000 + 1000 = 12000 + // + // (If we used the the method from the previous scenario, and + // kept only snapshot at the branch point, we'd need to keep + // all the WAL between 10000-18000 on the main branch, so + // the total size would be 5000 + 1000 + 8000 = 14000. 
The + // calculation always picks the cheapest alternative) + + let (_model, result) = scenario.calculate(1000); + + assert_eq!(result.total_size, 5000 + 1000 + 5000 + 1000); +} + +#[test] +fn scenario_5() { + let mut scenario = ScenarioBuilder::new("a"); + scenario.insert("a", 5000); + scenario.branch("a", "b"); + scenario.update("b", 4000); + scenario.update("a", 2000); + scenario.branch("a", "c"); + scenario.insert("c", 4000); + scenario.insert("a", 2000); + + let (_model, result) = scenario.calculate(1000); + + assert_eq!(result.total_size, 17000); +} + +#[test] +fn scenario_6() { + let branches = [ + "7ff1edab8182025f15ae33482edb590a", + "b1719e044db05401a05a2ed588a3ad3f", + "0xb68d6691c895ad0a70809470020929ef", + ]; + + // compared to other scenarios, this one uses bytes instead of kB + + let mut scenario = ScenarioBuilder::new(""); + + scenario.branch("", branches[0]); // at 0 + scenario.modify_branch(branches[0], 108951064, 43696128); // at 108951064 + scenario.branch(branches[0], branches[1]); // at 108951064 + scenario.modify_branch(branches[1], 15560408, -1851392); // at 124511472 + scenario.modify_branch(branches[0], 174464360, -1531904); // at 283415424 + scenario.branch(branches[0], branches[2]); // at 283415424 + scenario.modify_branch(branches[2], 15906192, 8192); // at 299321616 + scenario.modify_branch(branches[0], 18909976, 32768); // at 302325400 + + let (model, result) = scenario.calculate(100_000); + + // FIXME: We previously calculated 333_792_000. But with this PR, we get + // a much lower number. At a quick look at the model output and the + // calculations here, the new result seems correct to me. + eprintln!( + " MODEL: {}", + serde_json::to_string(&model.segments).unwrap() + ); + eprintln!( + "RESULT: {}", + serde_json::to_string(&result.segments).unwrap() + ); + + assert_eq!(result.total_size, 136_236_928); +} diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 02684d3d16c1..f770622a601f 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -45,3 +45,115 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> { Ok(()) } + +/// Disable the default rust panic hook by using `set_hook`. +/// +/// For neon binaries, the assumption is that tracing is configured before with [`init`], after +/// that sentry is configured (if needed). sentry will install it's own on top of this, always +/// processing the panic before we log it. +/// +/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr). +/// If the assumptions about the initialization order are not held, use +/// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be +/// lost. +#[must_use] +pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard { + std::panic::set_hook(Box::new(tracing_panic_hook)); + TracingPanicHookGuard::new() +} + +/// Drop guard which restores the std panic hook on drop. +/// +/// Tracing should not be used when it's not configured, but we cannot really latch on to any +/// imaginary lifetime of tracing. +pub struct TracingPanicHookGuard { + act: bool, +} + +impl TracingPanicHookGuard { + fn new() -> Self { + TracingPanicHookGuard { act: true } + } + + /// Make this hook guard not do anything when dropped. + pub fn forget(&mut self) { + self.act = false; + } +} + +impl Drop for TracingPanicHookGuard { + fn drop(&mut self) { + if self.act { + let _ = std::panic::take_hook(); + } + } +} + +/// Named symbol for our panic hook, which logs the panic. 
+fn tracing_panic_hook(info: &std::panic::PanicInfo) { + // following rust 1.66.1 std implementation: + // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288 + let location = info.location(); + + let msg = match info.payload().downcast_ref::<&'static str>() { + Some(s) => *s, + None => match info.payload().downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + + let thread = std::thread::current(); + let thread = thread.name().unwrap_or(""); + let backtrace = std::backtrace::Backtrace::capture(); + + let _entered = if let Some(location) = location { + tracing::error_span!("panic", %thread, location = %PrettyLocation(location)) + } else { + // very unlikely to hit here, but the guarantees of std could change + tracing::error_span!("panic", %thread) + } + .entered(); + + if backtrace.status() == std::backtrace::BacktraceStatus::Captured { + // this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really + // get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to + // string, maybe even to a TLS one but tracing already does that. + tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}"); + } else { + tracing::error!("{msg}"); + } + + // ensure that we log something on the panic if this hook is left after tracing has been + // unconfigured. worst case when teardown is racing the panic is to log the panic twice. + tracing::dispatcher::get_default(|d| { + if let Some(_none) = d.downcast_ref::() { + let location = location.map(PrettyLocation); + log_panic_to_stderr(thread, msg, location, &backtrace); + } + }); +} + +#[cold] +fn log_panic_to_stderr( + thread: &str, + msg: &str, + location: Option>, + backtrace: &std::backtrace::Backtrace, +) { + eprintln!("panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}"); +} + +struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>); + +impl std::fmt::Display for PrettyLocation<'_, '_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column()) + } +} + +impl std::fmt::Debug for PrettyLocation<'_, '_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + ::fmt(self, f) + } +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 50eefa8c7787..01a2c85d7496 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -88,6 +88,13 @@ fn main() -> anyhow::Result<()> { } }; + // Initialize logging, which must be initialized before the custom panic hook is installed. + logging::init(conf.log_format)?; + + // mind the order required here: 1. logging, 2. panic_hook, 3. sentry. + // disarming this hook on pageserver, because we never tear down tracing. + logging::replace_panic_hook_with_tracing_panic_hook().forget(); + // initialize sentry if SENTRY_DSN is provided let _sentry_guard = init_sentry( Some(GIT_VERSION.into()), @@ -210,9 +217,6 @@ fn start_pageserver( launch_ts: &'static LaunchTimestamp, conf: &'static PageServerConf, ) -> anyhow::Result<()> { - // Initialize logging - logging::init(conf.log_format)?; - // Print version and launch timestamp to the log, // and expose them as prometheus metrics. // A changed version string indicates changed software. 
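The pageserver change above depends on the initialization order described in the new logging helpers: tracing first, the panic hook second, sentry last. A minimal sketch of that ordering for a hypothetical binary follows; only logging::init and replace_panic_hook_with_tracing_panic_hook come from this patch, while the function name and the LogFormat variant are illustrative assumptions.

    use utils::logging::{self, LogFormat};

    fn init_observability() -> anyhow::Result<()> {
        // 1. Tracing/logging first, so the panic hook has a subscriber to write to.
        logging::init(LogFormat::Plain)?;

        // 2. Panic hook second. `forget()` keeps it installed for the whole process
        //    lifetime, which is what pageserver wants since it never tears down tracing.
        logging::replace_panic_hook_with_tracing_panic_hook().forget();

        // 3. Sentry last (init_sentry(...) in the binaries), so its wrapper hook runs
        //    before ours and the panic reaches sentry before it is logged.
        Ok(())
    }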
diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index a730d393395e..8916d4f1c978 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -25,7 +25,7 @@ const REMOTE_STORAGE_SIZE: &str = "remote_storage_size"; const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size"; #[serde_as] -#[derive(Serialize)] +#[derive(Serialize, Debug)] struct Ids { #[serde_as(as = "DisplayFromStr")] tenant_id: TenantId, @@ -75,7 +75,7 @@ pub async fn collect_metrics( // define client here to reuse it for all requests let client = reqwest::Client::new(); let mut cached_metrics: HashMap = HashMap::new(); - let mut prev_iteration_time: Option = None; + let mut prev_iteration_time: std::time::Instant = std::time::Instant::now(); loop { tokio::select! { @@ -86,11 +86,11 @@ pub async fn collect_metrics( _ = ticker.tick() => { // send cached metrics every cached_metric_collection_interval - let send_cached = prev_iteration_time - .map(|x| x.elapsed() >= cached_metric_collection_interval) - .unwrap_or(false); + let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval; - prev_iteration_time = Some(std::time::Instant::now()); + if send_cached { + prev_iteration_time = std::time::Instant::now(); + } collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await; } @@ -287,6 +287,12 @@ pub async fn collect_metrics_iteration( } } else { error!("metrics endpoint refused the sent metrics: {:?}", res); + for metric in chunk_to_send.iter() { + // Report if the metric value is suspiciously large + if metric.value > (1u64 << 40) { + error!("potentially abnormal metric value: {:?}", metric); + } + } } } Err(err) => { diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index fc271fe83b7b..e68ceb2dc6cf 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -437,6 +437,13 @@ paths: type: boolean description: | When true, skip calculation and only provide the model inputs (for debugging). Defaults to false. + - name: retention_period + in: query + required: false + schema: + type: integer + description: | + Override the default retention period (in bytes) used for size calculation. get: description: | Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes). 
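A rough sketch of how a client could exercise the new retention_period query parameter against the renamed /v1/tenant/:tenant_id/synthetic_size endpoint (see the routes.rs changes below). The base URL, tenant id, and the use of reqwest/serde_json/anyhow are assumptions for illustration, not part of the patch.

    // Requires the `reqwest` crate (with the `json` feature), `serde_json` and `anyhow`.
    async fn fetch_synthetic_size(
        base_url: &str, // pageserver HTTP listen address, e.g. "http://127.0.0.1:<http_port>"
        tenant_id: &str,
    ) -> anyhow::Result<serde_json::Value> {
        let client = reqwest::Client::new();
        let url = format!("{base_url}/v1/tenant/{tenant_id}/synthetic_size");
        let resp = client
            .get(url)
            // Override the cutoff used for the calculation; it only takes effect
            // if it is shorter than the tenant's real GC cutoff.
            .query(&[("retention_period", "100000")])
            .send()
            .await?
            .error_for_status()?;
        Ok(resp.json().await?)
    }

Sending an "Accept: text/html" header instead of expecting JSON returns the HTML/SVG visualization produced by synthetic_size_html_response in the routes.rs changes below.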
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 6a9232e097aa..d2d9f24efbe8 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -7,19 +7,21 @@ use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; use remote_storage::GenericRemoteStorage; +use tenant_size_model::{SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, - TimelineCreateRequest, TimelineInfo, + TimelineCreateRequest, TimelineGcRequest, TimelineInfo, }; use crate::context::{DownloadBehavior, RequestContext}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; use crate::tenant::mgr::TenantMapInsertError; +use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::{PageReconstructError, Timeline}; use crate::{config::PageServerConf, tenant::mgr}; @@ -38,7 +40,7 @@ use utils::{ // Imports only used for testing APIs #[cfg(feature = "testing")] -use super::models::{ConfigureFailpointsRequest, TimelineGcRequest}; +use super::models::ConfigureFailpointsRequest; struct State { conf: &'static PageServerConf, @@ -479,11 +481,19 @@ async fn tenant_status(request: Request) -> Result, ApiErro /// to debug any of the calculations. Requires `tenant_id` request parameter, supports /// `inputs_only=true|false` (default false) which supports debugging failure to calculate model /// values. +/// +/// 'retention_period' query parameter overrides the cutoff that is used to calculate the size +/// (only if it is shorter than the real cutoff). +/// +/// Note: we don't update the cached size and prometheus metric here. +/// The retention period might be different, and it's nice to have a method to just calculate it +/// without modifying anything anyway. async fn tenant_size_handler(request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; - let inputs_only: Option = parse_query_param(&request, "inputs_only")?; + let retention_period: Option = parse_query_param(&request, "retention_period")?; + let headers = request.headers(); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let tenant = mgr::get_tenant(tenant_id, true) @@ -492,24 +502,29 @@ async fn tenant_size_handler(request: Request) -> Result, A // this can be long operation let inputs = tenant - .gather_size_inputs(&ctx) + .gather_size_inputs(retention_period, &ctx) .await .map_err(ApiError::InternalServerError)?; - let size = if !inputs_only.unwrap_or(false) { - Some( - tenant - .calc_and_update_cached_synthetic_size(&inputs) - .map_err(ApiError::InternalServerError)?, - ) - } else { - None - }; + let mut sizes = None; + if !inputs_only.unwrap_or(false) { + let storage_model = inputs + .calculate_model() + .map_err(ApiError::InternalServerError)?; + let size = storage_model.calculate(); - /// Private response type with the additional "unstable" `inputs` field. - /// - /// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is - /// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`. 
+ // If request header expects html, return html + if headers["Accept"] == "text/html" { + return synthetic_size_html_response(inputs, storage_model, size); + } + sizes = Some(size); + } else if headers["Accept"] == "text/html" { + return Err(ApiError::BadRequest(anyhow!( + "inputs_only parameter is incompatible with html output request" + ))); + } + + /// The type resides in the pageserver not to expose `ModelInputs`. #[serde_with::serde_as] #[derive(serde::Serialize)] struct TenantHistorySize { @@ -519,6 +534,9 @@ async fn tenant_size_handler(request: Request) -> Result, A /// /// Will be none if `?inputs_only=true` was given. size: Option, + /// Size of each segment used in the model. + /// Will be null if `?inputs_only=true` was given. + segment_sizes: Option>, inputs: crate::tenant::size::ModelInputs, } @@ -526,7 +544,8 @@ async fn tenant_size_handler(request: Request) -> Result, A StatusCode::OK, TenantHistorySize { id: tenant_id, - size, + size: sizes.as_ref().map(|x| x.total_size), + segment_sizes: sizes.map(|x| x.segments), inputs, }, ) @@ -591,6 +610,62 @@ async fn evict_timeline_layer_handler(request: Request) -> Result Result, ApiError> { + let mut timeline_ids: Vec = Vec::new(); + let mut timeline_map: HashMap = HashMap::new(); + for (index, ti) in inputs.timeline_inputs.iter().enumerate() { + timeline_map.insert(ti.timeline_id, index); + timeline_ids.push(ti.timeline_id.to_string()); + } + let seg_to_branch: Vec = inputs + .segments + .iter() + .map(|seg| *timeline_map.get(&seg.timeline_id).unwrap()) + .collect(); + + let svg = + tenant_size_model::svg::draw_svg(&storage_model, &timeline_ids, &seg_to_branch, &sizes) + .map_err(ApiError::InternalServerError)?; + + let mut response = String::new(); + + use std::fmt::Write; + write!(response, "\n\n").unwrap(); + write!(response, "
<div>\n{svg}\n</div>").unwrap();
+    writeln!(response, "Project size: {}", sizes.total_size).unwrap();
+    writeln!(response, "<pre>").unwrap();
+    writeln!(
+        response,
+        "{}",
+        serde_json::to_string_pretty(&inputs).unwrap()
+    )
+    .unwrap();
+    writeln!(
+        response,
+        "{}",
+        serde_json::to_string_pretty(&sizes.segments).unwrap()
+    )
+    .unwrap();
+    writeln!(response, "
").unwrap(); + write!(response, "\n\n").unwrap(); + + html_response(StatusCode::OK, response) +} + +pub fn html_response(status: StatusCode, data: String) -> Result, ApiError> { + let response = Response::builder() + .status(status) + .header(hyper::header::CONTENT_TYPE, "text/html") + .body(Body::from(data.as_bytes().to_vec())) + .map_err(|e| ApiError::InternalServerError(e.into()))?; + Ok(response) +} + // Helper function to standardize the error messages we produce on bad durations // // Intended to be used with anyhow's `with_context`, e.g.: @@ -850,7 +925,6 @@ async fn failpoints_handler(mut request: Request) -> Result } // Run GC immediately on given timeline. -#[cfg(feature = "testing")] async fn timeline_gc_handler(mut request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -954,6 +1028,17 @@ async fn active_timeline_of_active_tenant( .map_err(ApiError::NotFound) } +async fn always_panic_handler(req: Request) -> Result, ApiError> { + // Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook(). + // For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it. + // Use catch_unwind to ensure that tokio nor hyper are distracted by our panic. + let query = req.uri().query(); + let _ = std::panic::catch_unwind(|| { + panic!("unconditional panic for testing panic hook integration; request query: {query:?}") + }); + json_response(StatusCode::NO_CONTENT, ()) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -1019,7 +1104,7 @@ pub fn make_router( .get("/v1/tenant", tenant_list_handler) .post("/v1/tenant", tenant_create_handler) .get("/v1/tenant/:tenant_id", tenant_status) - .get("/v1/tenant/:tenant_id/size", tenant_size_handler) + .get("/v1/tenant/:tenant_id/synthetic_size", tenant_size_handler) .put("/v1/tenant/config", update_tenant_config_handler) .get("/v1/tenant/:tenant_id/config", get_tenant_config_handler) .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler) @@ -1038,7 +1123,7 @@ pub fn make_router( ) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", - testing_api!("run timeline GC", timeline_gc_handler), + timeline_gc_handler, ) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/compact", @@ -1072,5 +1157,6 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", evict_timeline_layer_handler, ) + .get("/v1/panic", always_panic_handler) .any(handler_404)) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 23210b98d5bd..9e9c98ad62af 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2418,6 +2418,9 @@ impl Tenant { #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] pub async fn gather_size_inputs( &self, + // `max_retention_period` overrides the cutoff that is used to calculate the size + // (only if it is shorter than the real cutoff). + max_retention_period: Option, ctx: &RequestContext, ) -> anyhow::Result { let logical_sizes_at_once = self @@ -2425,32 +2428,41 @@ impl Tenant { .concurrent_tenant_size_logical_size_queries .inner(); - // TODO: Having a single mutex block concurrent reads is unfortunate, but since the queries - // are for testing/experimenting, we tolerate this. + // TODO: Having a single mutex block concurrent reads is not great for performance. 
+ // + // But the only case where we need to run multiple of these at once is when we + // request a size for a tenant manually via API, while another background calculation + // is in progress (which is not a common case). // // See more for on the issue #2748 condenced out of the initial PR review. let mut shared_cache = self.cached_logical_sizes.lock().await; - size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache, ctx).await + size::gather_inputs( + self, + logical_sizes_at_once, + max_retention_period, + &mut shared_cache, + ctx, + ) + .await } - /// Calculate synthetic tenant size + /// Calculate synthetic tenant size and cache the result. /// This is periodically called by background worker. /// result is cached in tenant struct #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result { - let inputs = self.gather_size_inputs(ctx).await?; + let inputs = self.gather_size_inputs(None, ctx).await?; - self.calc_and_update_cached_synthetic_size(&inputs) - } - - /// Calculate synthetic size , cache it and set metric value - pub fn calc_and_update_cached_synthetic_size( - &self, - inputs: &size::ModelInputs, - ) -> anyhow::Result { let size = inputs.calculate()?; + self.set_cached_synthetic_size(size); + + Ok(size) + } + + /// Cache given synthetic size and update the metric value + pub fn set_cached_synthetic_size(&self, size: u64) { self.cached_synthetic_tenant_size .store(size, Ordering::Relaxed); @@ -2458,8 +2470,6 @@ impl Tenant { .get_metric_with_label_values(&[&self.tenant_id.to_string()]) .unwrap() .set(size); - - Ok(size) } pub fn get_cached_synthetic_size(&self) -> u64 { diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index e446e34f4e0f..8d7d9c6f8f05 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -154,6 +154,12 @@ where expected: &Arc, new: Arc, ) -> anyhow::Result>> { + fail::fail_point!("layermap-replace-notfound", |_| Ok( + // this is not what happens if an L0 layer was not found a anyhow error but perhaps + // that should be changed. this is good enough to show a replacement failure. + Replacement::NotFound + )); + self.layer_map.replace_historic_noflush(expected, new) } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index a74dfdea04ae..a44cb02b4dd1 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -540,13 +540,11 @@ where } } -#[cfg(feature = "testing")] use { crate::repository::GcResult, pageserver_api::models::TimelineGcRequest, utils::http::error::ApiError, }; -#[cfg(feature = "testing")] pub async fn immediate_gc( tenant_id: TenantId, timeline_id: TimelineId, diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 985b480a764c..7049a0bd665a 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -571,14 +571,15 @@ impl RemoteTimelineClient { Ok(()) } - /// /// Launch a delete operation in the background. /// + /// The operation does not modify local state but assumes the local files have already been + /// deleted, and is used to mirror those changes to remote. + /// /// Note: This schedules an index file upload before the deletions. The /// deletion won't actually be performed, until any previously scheduled /// upload operations, and the index file upload, have completed /// succesfully. 
- /// pub fn schedule_layer_file_deletion( self: &Arc, names: &[LayerFileName], diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 2fed4f88b3cc..a41889f16d2c 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -1,8 +1,9 @@ use std::cmp; +use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::sync::Arc; -use anyhow::Context; +use anyhow::{bail, Context}; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; @@ -10,35 +11,80 @@ use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use super::Tenant; +use crate::tenant::Timeline; use utils::id::TimelineId; use utils::lsn::Lsn; use tracing::*; +use tenant_size_model::{Segment, StorageModel}; + /// Inputs to the actual tenant sizing model /// /// Implements [`serde::Serialize`] but is not meant to be part of the public API, instead meant to /// be a transferrable format between execution environments and developer. +/// +/// This tracks more information than the actual StorageModel that calculation +/// needs. We will convert this into a StorageModel when it's time to perform +/// the calculation. +/// #[serde_with::serde_as] #[derive(Debug, serde::Serialize, serde::Deserialize)] pub struct ModelInputs { - updates: Vec, - retention_period: u64, - - /// Relevant lsns per timeline. - /// - /// This field is not required for deserialization purposes, which is mostly used in tests. The - /// LSNs explain the outcome (updates) but are not needed in size calculation. - #[serde_as(as = "HashMap")] - #[serde(default)] - timeline_inputs: HashMap, + pub segments: Vec, + pub timeline_inputs: Vec, +} + +/// A [`Segment`], with some extra information for display purposes +#[serde_with::serde_as] +#[derive(Debug, serde::Serialize, serde::Deserialize)] +pub struct SegmentMeta { + pub segment: Segment, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub timeline_id: TimelineId, + pub kind: LsnKind, +} + +impl SegmentMeta { + fn size_needed(&self) -> bool { + match self.kind { + LsnKind::BranchStart => { + // If we don't have a later GcCutoff point on this branch, and + // no ancestor, calculate size for the branch start point. + self.segment.needed && self.segment.parent.is_none() + } + LsnKind::BranchPoint => true, + LsnKind::GcCutOff => true, + LsnKind::BranchEnd => false, + } + } +} + +#[derive( + Debug, Clone, Copy, Eq, Ord, PartialEq, PartialOrd, serde::Serialize, serde::Deserialize, +)] +pub enum LsnKind { + /// A timeline starting here + BranchStart, + /// A child timeline branches off from here + BranchPoint, + /// GC cutoff point + GcCutOff, + /// Last record LSN + BranchEnd, } /// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as /// part of [`ModelInputs`] from the HTTP api, explaining the inputs. 
#[serde_with::serde_as] #[derive(Debug, serde::Serialize, serde::Deserialize)] -struct TimelineInputs { +pub struct TimelineInputs { + #[serde_as(as = "serde_with::DisplayFromStr")] + pub timeline_id: TimelineId, + + #[serde_as(as = "Option")] + pub ancestor_id: Option, + #[serde_as(as = "serde_with::DisplayFromStr")] ancestor_lsn: Lsn, #[serde_as(as = "serde_with::DisplayFromStr")] @@ -49,118 +95,14 @@ struct TimelineInputs { horizon_cutoff: Lsn, #[serde_as(as = "serde_with::DisplayFromStr")] pitr_cutoff: Lsn, + + /// Cutoff point based on GC settings #[serde_as(as = "serde_with::DisplayFromStr")] next_gc_cutoff: Lsn, -} - -// Adjust BranchFrom sorting so that we always process ancestor -// before descendants. This is needed to correctly calculate size of -// descendant timelines. -// -// Note that we may have multiple BranchFroms at the same LSN, so we -// need to sort them in the tree order. -// -// see updates_sort_with_branches_at_same_lsn test below -fn sort_updates_in_tree_order(updates: Vec) -> anyhow::Result> { - let mut sorted_updates = Vec::with_capacity(updates.len()); - let mut known_timelineids = HashSet::new(); - let mut i = 0; - while i < updates.len() { - let curr_upd = &updates[i]; - - if let Command::BranchFrom(parent_id) = curr_upd.command { - let parent_id = match parent_id { - Some(parent_id) if known_timelineids.contains(&parent_id) => { - // we have already processed ancestor - // process this BranchFrom Update normally - known_timelineids.insert(curr_upd.timeline_id); - sorted_updates.push(*curr_upd); - i += 1; - continue; - } - None => { - known_timelineids.insert(curr_upd.timeline_id); - sorted_updates.push(*curr_upd); - i += 1; - continue; - } - Some(parent_id) => parent_id, - }; - - let mut j = i; - - // we have not processed ancestor yet. - // there is a chance that it is at the same Lsn - if !known_timelineids.contains(&parent_id) { - let mut curr_lsn_branchfroms: HashMap> = - HashMap::new(); - - // inspect all branchpoints at the same lsn - while j < updates.len() && updates[j].lsn == curr_upd.lsn { - let lookahead_upd = &updates[j]; - j += 1; - - if let Command::BranchFrom(lookahead_parent_id) = lookahead_upd.command { - match lookahead_parent_id { - Some(lookahead_parent_id) - if !known_timelineids.contains(&lookahead_parent_id) => - { - // we have not processed ancestor yet - // store it for later - let es = - curr_lsn_branchfroms.entry(lookahead_parent_id).or_default(); - es.push((lookahead_upd.timeline_id, j)); - } - _ => { - // we have already processed ancestor - // process this BranchFrom Update normally - known_timelineids.insert(lookahead_upd.timeline_id); - sorted_updates.push(*lookahead_upd); - } - } - } - } - - // process BranchFroms in the tree order - // check that we don't have a cycle if somet entry is orphan - // (this should not happen, but better to be safe) - let mut processed_some_entry = true; - while processed_some_entry { - processed_some_entry = false; - - curr_lsn_branchfroms.retain(|parent_id, branchfroms| { - if known_timelineids.contains(parent_id) { - for (timeline_id, j) in branchfroms { - known_timelineids.insert(*timeline_id); - sorted_updates.push(updates[*j - 1]); - } - processed_some_entry = true; - false - } else { - true - } - }); - } - - if !curr_lsn_branchfroms.is_empty() { - // orphans are expected to be rare and transient between tenant reloads - // for example, an broken ancestor without the child branch being broken. 
- anyhow::bail!( - "orphan branch(es) detected in BranchFroms: {curr_lsn_branchfroms:?}" - ); - } - } - assert!(j > i); - i = j; - } else { - // not a BranchFrom, keep the same order - sorted_updates.push(*curr_upd); - i += 1; - } - } - - Ok(sorted_updates) + /// Cutoff point calculated from the user-supplied 'max_retention_period' + #[serde_as(as = "Option")] + retention_param_cutoff: Option, } /// Gathers the inputs for the tenant sizing model. @@ -181,257 +123,264 @@ fn sort_updates_in_tree_order(updates: Vec) -> anyhow::Result, + max_retention_period: Option, logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, ctx: &RequestContext, ) -> anyhow::Result { - // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to - // our advantage with `?` error handling. - let mut joinset = tokio::task::JoinSet::new(); - // refresh is needed to update gc related pitr_cutoff and horizon_cutoff tenant .refresh_gc_info(ctx) .await .context("Failed to refresh gc_info before gathering inputs")?; - let timelines = tenant.list_timelines(); + // Collect information about all the timelines + let mut timelines = tenant.list_timelines(); if timelines.is_empty() { // perhaps the tenant has just been created, and as such doesn't have any data yet return Ok(ModelInputs { - updates: vec![], - retention_period: 0, - timeline_inputs: HashMap::default(), + segments: vec![], + timeline_inputs: Vec::new(), }); } - // record the used/inserted cache keys here, to remove extras not to start leaking - // after initial run the cache should be quite stable, but live timelines will eventually - // require new lsns to be inspected. - let mut needed_cache = HashSet::<(TimelineId, Lsn)>::new(); + // Filter out timelines that are not active + // + // There may be a race when a timeline is dropped, + // but it is unlikely to cause any issues. In the worst case, + // the calculation will error out. + timelines.retain(|t| t.is_active()); + + // Build a map of branch points. + let mut branchpoints: HashMap> = HashMap::new(); + for timeline in timelines.iter() { + if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { + branchpoints + .entry(ancestor_id) + .or_default() + .insert(timeline.get_ancestor_lsn()); + } + } - let mut updates = Vec::new(); + // These become the final result. + let mut timeline_inputs = Vec::with_capacity(timelines.len()); + let mut segments: Vec = Vec::new(); - // record the per timeline values useful to debug the model inputs, also used to track - // ancestor_lsn without keeping a hold of Timeline - let mut timeline_inputs = HashMap::with_capacity(timelines.len()); + // + // Build Segments representing each timeline. As we do that, also remember + // the branchpoints and branch startpoints in 'branchpoint_segments' and + // 'branchstart_segments' + // - // used to determine the `retention_period` for the size model - let mut max_cutoff_distance = None; + // BranchPoint segments of each timeline + // (timeline, branchpoint LSN) -> segment_id + let mut branchpoint_segments: HashMap<(TimelineId, Lsn), usize> = HashMap::new(); - // mapping from (TimelineId, Lsn) => if this branch point has been handled already via - // GcInfo::retain_lsns or if it needs to have its logical_size calculated. 
- let mut referenced_branch_froms = HashMap::<(TimelineId, Lsn), bool>::new(); - - for timeline in timelines { - if !timeline.is_active() { - anyhow::bail!( - "timeline {} is not active, cannot calculate tenant_size now", - timeline.timeline_id - ); - } + // timeline, Branchpoint seg id, (ancestor, ancestor LSN) + type BranchStartSegment = (TimelineId, usize, Option<(TimelineId, Lsn)>); + let mut branchstart_segments: Vec = Vec::new(); + for timeline in timelines.iter() { + let timeline_id = timeline.timeline_id; let last_record_lsn = timeline.get_last_record_lsn(); + let ancestor_lsn = timeline.get_ancestor_lsn(); - let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = { - // there's a race between the update (holding tenant.gc_lock) and this read but it - // might not be an issue, because it's not for Timeline::gc - let gc_info = timeline.gc_info.read().unwrap(); - - // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a - // new gc run, which we have no control over. however differently from `Timeline::gc` - // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not - // actually removing files. - let next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff); - - // the minimum where we should find the next_gc_cutoff for our calculations. - // - // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we - // want to query any logical size before initdb_lsn. - let cutoff_minimum = cmp::max(timeline.get_ancestor_lsn(), timeline.initdb_lsn); - - let maybe_cutoff = if next_gc_cutoff > cutoff_minimum { - Some((next_gc_cutoff, LsnKind::GcCutOff)) - } else { - None - }; - - // this assumes there are no other lsns than the branchpoints - let lsns = gc_info - .retain_lsns - .iter() - .inspect(|&&lsn| { - trace!( - timeline_id=%timeline.timeline_id, - "retained lsn: {lsn:?}, is_before_ancestor_lsn={}", - lsn < timeline.get_ancestor_lsn() - ) - }) - .filter(|&&lsn| lsn > timeline.get_ancestor_lsn()) - .copied() - .map(|lsn| (lsn, LsnKind::BranchPoint)) - .chain(maybe_cutoff) - .collect::>(); - - ( - lsns, - gc_info.horizon_cutoff, - gc_info.pitr_cutoff, - next_gc_cutoff, - ) + // there's a race between the update (holding tenant.gc_lock) and this read but it + // might not be an issue, because it's not for Timeline::gc + let gc_info = timeline.gc_info.read().unwrap(); + + // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a + // new gc run, which we have no control over. however differently from `Timeline::gc` + // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not + // actually removing files. + let mut next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff); + + // If the caller provided a shorter retention period, use that instead of the GC cutoff. 
+ let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period { + let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period)); + if next_gc_cutoff < param_cutoff { + next_gc_cutoff = param_cutoff; + } + Some(param_cutoff) + } else { + None }; - // update this to have a retention_period later for the tenant_size_model - // tenant_size_model compares this to the last segments start_lsn - if let Some(cutoff_distance) = last_record_lsn.checked_sub(next_gc_cutoff) { - match max_cutoff_distance.as_mut() { - Some(max) => { - *max = std::cmp::max(*max, cutoff_distance); - } - _ => { - max_cutoff_distance = Some(cutoff_distance); - } - } + // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we + // want to query any logical size before initdb_lsn. + let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn); + + // Build "interesting LSNs" on this timeline + let mut lsns: Vec<(Lsn, LsnKind)> = gc_info + .retain_lsns + .iter() + .filter(|&&lsn| lsn > ancestor_lsn) + .copied() + // this assumes there are no other retain_lsns than the branchpoints + .map(|lsn| (lsn, LsnKind::BranchPoint)) + .collect::>(); + + // Add branch points we collected earlier, just in case there were any that were + // not present in retain_lsns. We will remove any duplicates below later. + if let Some(this_branchpoints) = branchpoints.get(&timeline_id) { + lsns.extend( + this_branchpoints + .iter() + .map(|lsn| (*lsn, LsnKind::BranchPoint)), + ) } - // all timelines branch from something, because it might be impossible to pinpoint - // which is the tenant_size_model's "default" branch. - - let ancestor_lsn = timeline.get_ancestor_lsn(); + // Add a point for the GC cutoff + let branch_start_needed = next_gc_cutoff <= branch_start_lsn; + if !branch_start_needed { + lsns.push((next_gc_cutoff, LsnKind::GcCutOff)); + } - updates.push(Update { - lsn: ancestor_lsn, - command: Command::BranchFrom(timeline.get_ancestor_timeline_id()), + lsns.sort_unstable(); + lsns.dedup(); + + // + // Create Segments for the interesting points. + // + + // Timeline start point + let ancestor = timeline + .get_ancestor_timeline_id() + .map(|ancestor_id| (ancestor_id, ancestor_lsn)); + branchstart_segments.push((timeline_id, segments.len(), ancestor)); + segments.push(SegmentMeta { + segment: Segment { + parent: None, // filled in later + lsn: branch_start_lsn.0, + size: None, // filled in later + needed: branch_start_needed, + }, timeline_id: timeline.timeline_id, + kind: LsnKind::BranchStart, }); - if let Some(parent_timeline_id) = timeline.get_ancestor_timeline_id() { - // refresh_gc_info will update branchpoints and pitr_cutoff but only do it for branches - // which are over gc_horizon. for example, a "main" branch which never received any - // updates apart from initdb not have branch points recorded. 
- referenced_branch_froms - .entry((parent_timeline_id, timeline.get_ancestor_lsn())) - .or_default(); - } - - for (lsn, _kind) in &interesting_lsns { - // mark this visited so don't need to re-process this parent - *referenced_branch_froms - .entry((timeline.timeline_id, *lsn)) - .or_default() = true; - - if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) { - updates.push(Update { - lsn: *lsn, - timeline_id: timeline.timeline_id, - command: Command::Update(*size), - }); - - needed_cache.insert((timeline.timeline_id, *lsn)); - } else { - let timeline = Arc::clone(&timeline); - let parallel_size_calcs = Arc::clone(limit); - let ctx = ctx.attached_child(); - joinset.spawn(calculate_logical_size( - parallel_size_calcs, - timeline, - *lsn, - ctx, - )); + // GC cutoff point, and any branch points, i.e. points where + // other timelines branch off from this timeline. + let mut parent = segments.len() - 1; + for (lsn, kind) in lsns { + if kind == LsnKind::BranchPoint { + branchpoint_segments.insert((timeline_id, lsn), segments.len()); } + segments.push(SegmentMeta { + segment: Segment { + parent: Some(parent), + lsn: lsn.0, + size: None, + needed: lsn > next_gc_cutoff, + }, + timeline_id: timeline.timeline_id, + kind, + }); + parent += 1; } - timeline_inputs.insert( - timeline.timeline_id, - TimelineInputs { - ancestor_lsn, - last_record: last_record_lsn, - // this is not used above, because it might not have updated recently enough - latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), - horizon_cutoff, - pitr_cutoff, - next_gc_cutoff, + // Current end of the timeline + segments.push(SegmentMeta { + segment: Segment { + parent: Some(parent), + lsn: last_record_lsn.0, + size: None, // Filled in later, if necessary + needed: true, }, - ); + timeline_id: timeline.timeline_id, + kind: LsnKind::BranchEnd, + }); + + timeline_inputs.push(TimelineInputs { + timeline_id: timeline.timeline_id, + ancestor_id: timeline.get_ancestor_timeline_id(), + ancestor_lsn, + last_record: last_record_lsn, + // this is not used above, because it might not have updated recently enough + latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), + horizon_cutoff: gc_info.horizon_cutoff, + pitr_cutoff: gc_info.pitr_cutoff, + next_gc_cutoff, + retention_param_cutoff, + }); } - // iterate over discovered branch points and make sure we are getting logical sizes at those - // points. - for ((timeline_id, lsn), handled) in referenced_branch_froms.iter() { - if *handled { - continue; + // We now have all segments from the timelines in 'segments'. The timelines + // haven't been linked to each other yet, though. Do that. + for (_timeline_id, seg_id, ancestor) in branchstart_segments { + // Look up the branch point + if let Some(ancestor) = ancestor { + let parent_id = *branchpoint_segments.get(&ancestor).unwrap(); + segments[seg_id].segment.parent = Some(parent_id); } + } - let timeline_id = *timeline_id; - let lsn = *lsn; + // We left the 'size' field empty in all of the Segments so far. + // Now find logical sizes for all of the points that might need or benefit from them. + fill_logical_sizes(&timelines, &mut segments, limit, logical_size_cache, ctx).await?; - match timeline_inputs.get(&timeline_id) { - Some(inputs) if inputs.ancestor_lsn == lsn => { - // we don't need an update at this branch point which is also point where - // timeline_id branch was branched from. 
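The hunk above replaces the old Update/Command list with a flat vector of segments: each timeline contributes a BranchStart, any BranchPoint/GcCutOff entries, and a BranchEnd, and child BranchStart segments are re-parented onto the parent's BranchPoint in a final linking pass. A rough, hedged illustration of the resulting layout for a hypothetical tenant with one child branch (the field names mirror the Segment fields used in this patch; the struct here is a local stand-in, and the LSNs, indexes and `needed` flags are invented):

```rust
// Local stand-in for tenant_size_model::Segment, reduced to the fields the patch uses.
#[derive(Debug, Clone)]
struct Segment {
    parent: Option<usize>, // index into the same Vec, like the branchpoint_segments values
    lsn: u64,
    size: Option<u64>,     // left as None here; fill_logical_sizes supplies it later
    needed: bool,          // "inside the retention window"
}

fn main() {
    // Main timeline: BranchStart -> GcCutOff -> BranchPoint -> BranchEnd (indexes 0..=3),
    // child timeline branched at LSN 0x300: BranchStart -> GcCutOff -> BranchEnd (4..=6).
    let mut segments = vec![
        Segment { parent: None,    lsn: 0x100, size: None, needed: false }, // 0 BranchStart (main)
        Segment { parent: Some(0), lsn: 0x250, size: None, needed: false }, // 1 GcCutOff    (main)
        Segment { parent: Some(1), lsn: 0x300, size: None, needed: true  }, // 2 BranchPoint (main)
        Segment { parent: Some(2), lsn: 0x400, size: None, needed: true  }, // 3 BranchEnd   (main)
        Segment { parent: None,    lsn: 0x300, size: None, needed: false }, // 4 BranchStart (child)
        Segment { parent: Some(4), lsn: 0x350, size: None, needed: false }, // 5 GcCutOff    (child)
        Segment { parent: Some(5), lsn: 0x500, size: None, needed: true  }, // 6 BranchEnd   (child)
    ];
    // The final linking pass: the child's BranchStart is attached to the parent's
    // BranchPoint that was recorded under (parent timeline, LSN 0x300).
    segments[4].parent = Some(2);
    assert_eq!(segments[4].lsn, segments[2].lsn);
    println!("{segments:#?}");
}
```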
- continue; - } - Some(_) => {} - None => { - // we should have this because we have iterated through all of the timelines - anyhow::bail!("missing timeline_input for {timeline_id}") - } - } + Ok(ModelInputs { + segments, + timeline_inputs, + }) +} - if let Some(size) = logical_size_cache.get(&(timeline_id, lsn)) { - updates.push(Update { - lsn, - timeline_id, - command: Command::Update(*size), - }); +/// Augment 'segments' with logical sizes +/// +/// this will probably conflict with on-demand downloaded layers, or at least force them all +/// to be downloaded +/// +async fn fill_logical_sizes( + timelines: &[Arc], + segments: &mut [SegmentMeta], + limit: &Arc, + logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, + ctx: &RequestContext, +) -> anyhow::Result<()> { + let timeline_hash: HashMap> = HashMap::from_iter( + timelines + .iter() + .map(|timeline| (timeline.timeline_id, Arc::clone(timeline))), + ); - needed_cache.insert((timeline_id, lsn)); - } else { - let timeline = tenant - .get_timeline(timeline_id, false) - .context("find referenced ancestor timeline")?; - let parallel_size_calcs = Arc::clone(limit); - joinset.spawn(calculate_logical_size( - parallel_size_calcs, - timeline.clone(), - lsn, - ctx.attached_child(), - )); - - if let Some(parent_id) = timeline.get_ancestor_timeline_id() { - // we should not find new ones because we iterated tenants all timelines - anyhow::ensure!( - timeline_inputs.contains_key(&parent_id), - "discovered new timeline {parent_id} (parent of {timeline_id})" - ); - } - }; - } + // record the used/inserted cache keys here, to remove extras not to start leaking + // after initial run the cache should be quite stable, but live timelines will eventually + // require new lsns to be inspected. + let mut sizes_needed = HashMap::<(TimelineId, Lsn), Option>::new(); - // finally add in EndOfBranch for all timelines where their last_record_lsn is not a branch - // point. this is needed by the model. - for (timeline_id, inputs) in timeline_inputs.iter() { - let lsn = inputs.last_record; + // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to + // our advantage with `?` error handling. + let mut joinset = tokio::task::JoinSet::new(); - if referenced_branch_froms.contains_key(&(*timeline_id, lsn)) { - // this means that the (timeline_id, last_record_lsn) represents a branch point - // we do not want to add EndOfBranch updates for these points because it doesn't fit - // into the current tenant_size_model. + // For each point that would benefit from having a logical size available, + // spawn a Task to fetch it, unless we have it cached already. 
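As a standalone illustration of the spawn-unless-cached pattern that `fill_logical_sizes` applies in the loop that follows (a sketch only, assuming tokio and anyhow as elsewhere in the crate; the `u64` keys and the `expensive_size` stand-in are invented):

```rust
use std::collections::HashMap;

// Stand-in for the per-(timeline, lsn) logical size calculation.
async fn expensive_size(key: u64) -> u64 {
    key * 2
}

// Consult the cache, spawn JoinSet tasks only for the misses, join them all,
// then prune the cache down to the keys that are still referenced.
async fn fill(keys: &[u64], cache: &mut HashMap<u64, u64>) -> anyhow::Result<HashMap<u64, u64>> {
    let mut results = HashMap::new();
    let mut joinset = tokio::task::JoinSet::new();
    for &key in keys {
        if let Some(&hit) = cache.get(&key) {
            results.insert(key, hit);
        } else {
            joinset.spawn(async move { (key, expensive_size(key).await) });
        }
    }
    while let Some(joined) = joinset.join_next().await {
        let (key, size) = joined?; // a JoinError surfaces here, as in the real code
        cache.insert(key, size);
        results.insert(key, size);
    }
    cache.retain(|key, _| results.contains_key(key)); // mirrors logical_size_cache.retain
    Ok(results)
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let mut cache = HashMap::from([(1u64, 2u64)]);
    let sizes = fill(&[1, 2, 3], &mut cache).await?;
    assert_eq!(sizes[&3], 6);
    Ok(())
}
```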
+ for seg in segments.iter() { + if !seg.size_needed() { continue; } - if lsn > inputs.ancestor_lsn { - // all timelines also have an end point if they have made any progress - updates.push(Update { - lsn, - command: Command::EndOfBranch, - timeline_id: *timeline_id, - }); + let timeline_id = seg.timeline_id; + let lsn = Lsn(seg.segment.lsn); + + if let Entry::Vacant(e) = sizes_needed.entry((timeline_id, lsn)) { + let cached_size = logical_size_cache.get(&(timeline_id, lsn)).cloned(); + if cached_size.is_none() { + let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap()); + let parallel_size_calcs = Arc::clone(limit); + let ctx = ctx.attached_child(); + joinset.spawn(calculate_logical_size( + parallel_size_calcs, + timeline, + lsn, + ctx, + )); + } + e.insert(cached_size); } } + // Perform the size lookups let mut have_any_error = false; - while let Some(res) = joinset.join_next().await { // each of these come with Result, JoinError> // because of spawn + spawn_blocking @@ -460,19 +409,13 @@ pub(super) async fn gather_inputs( debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated"); logical_size_cache.insert((timeline.timeline_id, lsn), size); - needed_cache.insert((timeline.timeline_id, lsn)); - - updates.push(Update { - lsn, - timeline_id: timeline.timeline_id, - command: Command::Update(size), - }); + sizes_needed.insert((timeline.timeline_id, lsn), Some(size)); } } } // prune any keys not needed anymore; we record every used key and added key. - logical_size_cache.retain(|key, _| needed_cache.contains(key)); + logical_size_cache.retain(|key, _| sizes_needed.contains_key(key)); if have_any_error { // we cannot complete this round, because we are missing data. @@ -480,105 +423,47 @@ pub(super) async fn gather_inputs( anyhow::bail!("failed to calculate some logical_sizes"); } - // the data gathered to updates is per lsn, regardless of the branch, so we can use it to - // our advantage, not requiring a sorted container or graph walk. - // - // for branch points, which come as multiple updates at the same LSN, the Command::Update - // is needed before a branch is made out of that branch Command::BranchFrom. this is - // handled by the variant order in `Command`. - // - updates.sort_unstable(); + // Insert the looked up sizes to the Segments + for seg in segments.iter_mut() { + if !seg.size_needed() { + continue; + } - // And another sort to handle Command::BranchFrom ordering - // in case when there are multiple branches at the same LSN. - let sorted_updates = sort_updates_in_tree_order(updates)?; + let timeline_id = seg.timeline_id; + let lsn = Lsn(seg.segment.lsn); - let retention_period = match max_cutoff_distance { - Some(max) => max.0, - None => { - anyhow::bail!("the first branch should have a gc_cutoff after it's branch point at 0") + if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) { + seg.segment.size = Some(*size); + } else { + bail!("could not find size at {} in timeline {}", lsn, timeline_id); } - }; - - Ok(ModelInputs { - updates: sorted_updates, - retention_period, - timeline_inputs, - }) + } + Ok(()) } impl ModelInputs { - pub fn calculate(&self) -> anyhow::Result { - // Option is used for "naming" the branches because it is assumed to be - // impossible to always determine the a one main branch. 
- let mut storage = tenant_size_model::Storage::>::new(None); - - for update in &self.updates { - let Update { - lsn, - command: op, - timeline_id, - } = update; - - let Lsn(now) = *lsn; - match op { - Command::Update(sz) => { - storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz))?; - } - Command::EndOfBranch => { - storage.insert_point(&Some(*timeline_id), "".into(), now, None)?; - } - Command::BranchFrom(parent) => { - // This branch command may fail if it cannot find a parent to branch from. - storage.branch(parent, Some(*timeline_id))?; - } - } - } + pub fn calculate_model(&self) -> anyhow::Result { + // Convert SegmentMetas into plain Segments + let storage = StorageModel { + segments: self + .segments + .iter() + .map(|seg| seg.segment.clone()) + .collect(), + }; - Ok(storage.calculate(self.retention_period)?.total_children()) + Ok(storage) } -} -/// A point of interest in the tree of branches -#[serde_with::serde_as] -#[derive( - Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize, -)] -struct Update { - #[serde_as(as = "serde_with::DisplayFromStr")] - lsn: utils::lsn::Lsn, - command: Command, - #[serde_as(as = "serde_with::DisplayFromStr")] - timeline_id: TimelineId, -} - -#[serde_with::serde_as] -#[derive(PartialOrd, PartialEq, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize)] -#[serde(rename_all = "snake_case")] -enum Command { - Update(u64), - BranchFrom(#[serde_as(as = "Option")] Option), - EndOfBranch, -} + // calculate total project size + pub fn calculate(&self) -> anyhow::Result { + let storage = self.calculate_model()?; + let sizes = storage.calculate(); -impl std::fmt::Debug for Command { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // custom one-line implementation makes it more enjoyable to read {:#?} avoiding 3 - // linebreaks - match self { - Self::Update(arg0) => write!(f, "Update({arg0})"), - Self::BranchFrom(arg0) => write!(f, "BranchFrom({arg0:?})"), - Self::EndOfBranch => write!(f, "EndOfBranch"), - } + Ok(sizes.total_size) } } -#[derive(Debug, Clone, Copy)] -enum LsnKind { - BranchPoint, - GcCutOff, -} - /// Newtype around the tuple that carries the timeline at lsn logical size calculation. 
struct TimelineAtLsnSizeResult( Arc, @@ -604,227 +489,230 @@ async fn calculate_logical_size( Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) } -#[test] -fn updates_sort() { - use std::str::FromStr; - use utils::id::TimelineId; - use utils::lsn::Lsn; - - let ids = [ - TimelineId::from_str("7ff1edab8182025f15ae33482edb590a").unwrap(), - TimelineId::from_str("b1719e044db05401a05a2ed588a3ad3f").unwrap(), - TimelineId::from_str("b68d6691c895ad0a70809470020929ef").unwrap(), - ]; - - // try through all permutations - let ids = [ - [&ids[0], &ids[1], &ids[2]], - [&ids[0], &ids[2], &ids[1]], - [&ids[1], &ids[0], &ids[2]], - [&ids[1], &ids[2], &ids[0]], - [&ids[2], &ids[0], &ids[1]], - [&ids[2], &ids[1], &ids[0]], - ]; - - for ids in ids { - // apply a fixture which uses a permutation of ids - let commands = [ - Update { - lsn: Lsn(0), - command: Command::BranchFrom(None), - timeline_id: *ids[0], - }, - Update { - lsn: Lsn::from_str("0/67E7618").unwrap(), - command: Command::Update(43696128), - timeline_id: *ids[0], - }, - Update { - lsn: Lsn::from_str("0/67E7618").unwrap(), - command: Command::BranchFrom(Some(*ids[0])), - timeline_id: *ids[1], - }, - Update { - lsn: Lsn::from_str("0/76BE4F0").unwrap(), - command: Command::Update(41844736), - timeline_id: *ids[1], - }, - Update { - lsn: Lsn::from_str("0/10E49380").unwrap(), - command: Command::Update(42164224), - timeline_id: *ids[0], - }, - Update { - lsn: Lsn::from_str("0/10E49380").unwrap(), - command: Command::BranchFrom(Some(*ids[0])), - timeline_id: *ids[2], - }, - Update { - lsn: Lsn::from_str("0/11D74910").unwrap(), - command: Command::Update(42172416), - timeline_id: *ids[2], - }, - Update { - lsn: Lsn::from_str("0/12051E98").unwrap(), - command: Command::Update(42196992), - timeline_id: *ids[0], - }, - ]; - - let mut sorted = commands; - - // these must sort in the same order, regardless of how the ids sort - // which is why the timeline_id is the last field - sorted.sort_unstable(); - - assert_eq!(commands, sorted, "{:#?} vs. {:#?}", commands, sorted); - } -} - #[test] fn verify_size_for_multiple_branches() { // this is generated from integration test test_tenant_size_with_multiple_branches, but this way // it has the stable lsn's // - // timelineinputs have been left out, because those explain the inputs, but don't participate - // in further size calculations. - let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072}"#; - + // The timeline_inputs don't participate in the size calculation, and are here just to explain + // the inputs. 
+ let doc = r#" +{ + "segments": [ + { + "segment": { + "parent": 9, + "lsn": 26033560, + "size": null, + "needed": false + }, + "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", + "kind": "BranchStart" + }, + { + "segment": { + "parent": 0, + "lsn": 35720400, + "size": 25206784, + "needed": false + }, + "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", + "kind": "GcCutOff" + }, + { + "segment": { + "parent": 1, + "lsn": 35851472, + "size": null, + "needed": true + }, + "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", + "kind": "BranchEnd" + }, + { + "segment": { + "parent": 7, + "lsn": 24566168, + "size": null, + "needed": false + }, + "timeline_id": "454626700469f0a9914949b9d018e876", + "kind": "BranchStart" + }, + { + "segment": { + "parent": 3, + "lsn": 25261936, + "size": 26050560, + "needed": false + }, + "timeline_id": "454626700469f0a9914949b9d018e876", + "kind": "GcCutOff" + }, + { + "segment": { + "parent": 4, + "lsn": 25393008, + "size": null, + "needed": true + }, + "timeline_id": "454626700469f0a9914949b9d018e876", + "kind": "BranchEnd" + }, + { + "segment": { + "parent": null, + "lsn": 23694408, + "size": null, + "needed": false + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "BranchStart" + }, + { + "segment": { + "parent": 6, + "lsn": 24566168, + "size": 25739264, + "needed": false + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "BranchPoint" + }, + { + "segment": { + "parent": 7, + "lsn": 25902488, + "size": 26402816, + "needed": false + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "GcCutOff" + }, + { + "segment": { + "parent": 8, + "lsn": 26033560, + "size": 26468352, + "needed": true + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "BranchPoint" + }, + { + "segment": { + "parent": 9, + "lsn": 26033560, + "size": null, + "needed": true + }, + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "kind": "BranchEnd" + } + ], + "timeline_inputs": [ + { + "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce", + "ancestor_lsn": "0/18D3D98", + "last_record": "0/2230CD0", + "latest_gc_cutoff": "0/1698C48", + "horizon_cutoff": "0/2210CD0", + "pitr_cutoff": "0/2210CD0", + "next_gc_cutoff": "0/2210CD0", + "retention_param_cutoff": null + }, + { + "timeline_id": "454626700469f0a9914949b9d018e876", + "ancestor_lsn": "0/176D998", + "last_record": "0/1837770", + "latest_gc_cutoff": "0/1698C48", + "horizon_cutoff": "0/1817770", + "pitr_cutoff": "0/1817770", + "next_gc_cutoff": "0/1817770", + "retention_param_cutoff": null + }, + { + "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", + "ancestor_lsn": "0/0", + "last_record": "0/18D3D98", + "latest_gc_cutoff": "0/1698C48", + "horizon_cutoff": "0/18B3D98", + "pitr_cutoff": "0/18B3D98", + "next_gc_cutoff": "0/18B3D98", + "retention_param_cutoff": null + } + ] +} +"#; let inputs: ModelInputs = serde_json::from_str(doc).unwrap(); - assert_eq!(inputs.calculate().unwrap(), 36_409_872); + assert_eq!(inputs.calculate().unwrap(), 37_851_408); } #[test] -fn updates_sort_with_branches_at_same_lsn() { - use std::str::FromStr; - use Command::{BranchFrom, EndOfBranch}; - - macro_rules! 
lsn { - ($e:expr) => { - Lsn::from_str($e).unwrap() - }; +fn verify_size_for_one_branch() { + let doc = r#" +{ + "segments": [ + { + "segment": { + "parent": null, + "lsn": 0, + "size": null, + "needed": false + }, + "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", + "kind": "BranchStart" + }, + { + "segment": { + "parent": 0, + "lsn": 305547335776, + "size": 220054675456, + "needed": false + }, + "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", + "kind": "GcCutOff" + }, + { + "segment": { + "parent": 1, + "lsn": 305614444640, + "size": null, + "needed": true + }, + "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", + "kind": "BranchEnd" } + ], + "timeline_inputs": [ + { + "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd", + "ancestor_lsn": "0/0", + "last_record": "47/280A5860", + "latest_gc_cutoff": "47/240A5860", + "horizon_cutoff": "47/240A5860", + "pitr_cutoff": "47/240A5860", + "next_gc_cutoff": "47/240A5860", + "retention_param_cutoff": "0/0" + } + ] +}"#; + + let model: ModelInputs = serde_json::from_str(doc).unwrap(); + + let res = model.calculate_model().unwrap().calculate(); - let ids = [ - TimelineId::from_str("00000000000000000000000000000000").unwrap(), - TimelineId::from_str("11111111111111111111111111111111").unwrap(), - TimelineId::from_str("22222222222222222222222222222222").unwrap(), - TimelineId::from_str("33333333333333333333333333333333").unwrap(), - TimelineId::from_str("44444444444444444444444444444444").unwrap(), - ]; - - // issue https://github.com/neondatabase/neon/issues/3179 - let commands = vec![ - Update { - lsn: lsn!("0/0"), - command: BranchFrom(None), - timeline_id: ids[0], - }, - Update { - lsn: lsn!("0/169AD58"), - command: Command::Update(25387008), - timeline_id: ids[0], - }, - // next three are wrongly sorted, because - // ids[1] is branched from before ids[1] exists - // and ids[2] is branched from before ids[2] exists - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[1])), - timeline_id: ids[3], - }, - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[0])), - timeline_id: ids[2], - }, - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[2])), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CA85B8"), - command: Command::Update(28925952), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CD85B8"), - command: Command::Update(29024256), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CD85B8"), - command: BranchFrom(Some(ids[1])), - timeline_id: ids[4], - }, - Update { - lsn: lsn!("0/22DCE70"), - command: Command::Update(32546816), - timeline_id: ids[3], - }, - Update { - lsn: lsn!("0/230CE70"), - command: EndOfBranch, - timeline_id: ids[3], - }, - ]; - - let expected = vec![ - Update { - lsn: lsn!("0/0"), - command: BranchFrom(None), - timeline_id: ids[0], - }, - Update { - lsn: lsn!("0/169AD58"), - command: Command::Update(25387008), - timeline_id: ids[0], - }, - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[0])), - timeline_id: ids[2], - }, - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[2])), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/169AD58"), - command: BranchFrom(Some(ids[1])), - timeline_id: ids[3], - }, - Update { - lsn: lsn!("0/1CA85B8"), - command: Command::Update(28925952), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CD85B8"), - command: Command::Update(29024256), - timeline_id: ids[1], - }, - Update { - lsn: lsn!("0/1CD85B8"), - command: BranchFrom(Some(ids[1])), - timeline_id: ids[4], - }, - Update { - lsn: 
lsn!("0/22DCE70"), - command: Command::Update(32546816), - timeline_id: ids[3], - }, - Update { - lsn: lsn!("0/230CE70"), - command: EndOfBranch, - timeline_id: ids[3], - }, - ]; - - let sorted_commands = sort_updates_in_tree_order(commands).unwrap(); - - assert_eq!(sorted_commands, expected); + println!("calculated synthetic size: {}", res.total_size); + println!("result: {:?}", serde_json::to_string(&res.segments)); + + use utils::lsn::Lsn; + let latest_gc_cutoff_lsn: Lsn = "47/240A5860".parse().unwrap(); + let last_lsn: Lsn = "47/280A5860".parse().unwrap(); + println!( + "latest_gc_cutoff lsn 47/240A5860 is {}, last_lsn lsn 47/280A5860 is {}", + u64::from(latest_gc_cutoff_lsn), + u64::from(last_lsn) + ); + assert_eq!(res.total_size, 220121784320); } diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 51bb4dcc2afe..8465a9933918 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -49,6 +49,17 @@ pub struct RemoteLayer { access_stats: LayerAccessStats, pub(crate) ongoing_download: Arc, + + /// Has `LayerMap::replace` failed for this (true) or not (false). + /// + /// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`. + /// The field is used to mark a RemoteLayer permanently (until restart or ignore+load) + /// unprocessable, because a LayerMap::replace failed. + /// + /// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids + /// a possible fast loop between `Timeline::get_reconstruct_data` and + /// `Timeline::download_remote_layer`, which also logs. + pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool, } impl std::fmt::Debug for RemoteLayer { @@ -207,6 +218,7 @@ impl RemoteLayer { file_name: fname.to_owned().into(), layer_metadata: layer_metadata.clone(), ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + download_replacement_failure: std::sync::atomic::AtomicBool::default(), access_stats, } } @@ -228,6 +240,7 @@ impl RemoteLayer { file_name: fname.to_owned().into(), layer_metadata: layer_metadata.clone(), ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), + download_replacement_failure: std::sync::atomic::AtomicBool::default(), access_stats, } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index b126545ee433..e9ce52d1ab37 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -3,7 +3,7 @@ use std::ops::ControlFlow; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, Instant}; use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; @@ -11,6 +11,7 @@ use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::mgr; use crate::tenant::{Tenant, TenantState}; +use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::TenantId; @@ -53,37 +54,55 @@ async fn compaction_loop(tenant_id: TenantId) { info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + let cancel = task_mgr::shutdown_token(); let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); + let mut first = true; loop { trace!("waking up"); let tenant = tokio::select! 
{ - _ = task_mgr::shutdown_watcher() => { + _ = cancel.cancelled() => { info!("received cancellation request"); - return; + return; }, tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { ControlFlow::Break(()) => return, ControlFlow::Continue(tenant) => tenant, }, - }; + }; + + let period = tenant.get_compaction_period(); - let mut sleep_duration = tenant.get_compaction_period(); - if sleep_duration == Duration::ZERO { + // TODO: we shouldn't need to await to find tenant and this could be moved outside of + // loop + if first { + first = false; + if random_init_delay(period, &cancel).await.is_err() { + break; + } + } + + let started_at = Instant::now(); + + let sleep_duration = if period == Duration::ZERO { info!("automatic compaction is disabled"); // check again in 10 seconds, in case it's been enabled again. - sleep_duration = Duration::from_secs(10); + Duration::from_secs(10) } else { // Run compaction if let Err(e) = tenant.compaction_iteration(&ctx).await { - sleep_duration = wait_duration; - error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration); + error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration); + wait_duration + } else { + period } - } + }; + + warn_when_period_overrun(started_at.elapsed(), period, "compaction"); // Sleep tokio::select! { - _ = task_mgr::shutdown_watcher() => { + _ = cancel.cancelled() => { info!("received cancellation request during idling"); break; }, @@ -105,14 +124,16 @@ async fn gc_loop(tenant_id: TenantId) { info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + let cancel = task_mgr::shutdown_token(); // GC might require downloading, to find the cutoff LSN that corresponds to the // cutoff specified as time. let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); + let mut first = true; loop { trace!("waking up"); let tenant = tokio::select! { - _ = task_mgr::shutdown_watcher() => { + _ = cancel.cancelled() => { info!("received cancellation request"); return; }, @@ -122,27 +143,38 @@ async fn gc_loop(tenant_id: TenantId) { }, }; - let gc_period = tenant.get_gc_period(); + let period = tenant.get_gc_period(); + + if first { + first = false; + if random_init_delay(period, &cancel).await.is_err() { + break; + } + } + + let started_at = Instant::now(); + let gc_horizon = tenant.get_gc_horizon(); - let mut sleep_duration = gc_period; - if sleep_duration == Duration::ZERO { + let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 { info!("automatic GC is disabled"); // check again in 10 seconds, in case it's been enabled again. - sleep_duration = Duration::from_secs(10); + Duration::from_secs(10) } else { // Run gc - if gc_horizon > 0 { - if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await - { - sleep_duration = wait_duration; - error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration); - } + let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await; + if let Err(e) = res { + error!("Gc failed, retrying in {:?}: {e:?}", wait_duration); + wait_duration + } else { + period } - } + }; + + warn_when_period_overrun(started_at.elapsed(), period, "gc"); // Sleep tokio::select! 
{ - _ = task_mgr::shutdown_watcher() => { + _ = cancel.cancelled() => { info!("received cancellation request during idling"); break; }, @@ -197,3 +229,49 @@ async fn wait_for_active_tenant( } } } + +#[derive(thiserror::Error, Debug)] +#[error("cancelled")] +pub(crate) struct Cancelled; + +/// Provide a random delay for background task initialization. +/// +/// This delay prevents a thundering herd of background tasks and will likely keep them running on +/// different periods for more stable load. +pub(crate) async fn random_init_delay( + period: Duration, + cancel: &CancellationToken, +) -> Result<(), Cancelled> { + use rand::Rng; + + let d = { + let mut rng = rand::thread_rng(); + + // gen_range asserts that the range cannot be empty, which it could be because period can + // be set to zero to disable gc or compaction, so lets set it to be at least 10s. + let period = std::cmp::max(period, Duration::from_secs(10)); + + // semi-ok default as the source of jitter + rng.gen_range(Duration::ZERO..=period) + }; + + tokio::select! { + _ = cancel.cancelled() => Err(Cancelled), + _ = tokio::time::sleep(d) => Ok(()), + } +} + +pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) { + // Duration::ZERO will happen because it's the "disable [bgtask]" value. + if elapsed >= period && period != Duration::ZERO { + // humantime does no significant digits clamping whereas Duration's debug is a bit more + // intelligent. however it makes sense to keep the "configuration format" for period, even + // though there's no way to output the actual config value. + warn!( + ?elapsed, + period = %humantime::format_duration(period), + task, + "task iteration took longer than the configured period" + ); + } +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5a829e42e54c..f2b0a985097e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -613,7 +613,10 @@ impl Timeline { self.flush_frozen_layers_and_wait().await } + /// Outermost timeline compaction operation; downloads needed layers. pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> { + const ROUNDS: usize = 2; + let last_record_lsn = self.get_last_record_lsn(); // Last record Lsn could be zero in case the timeline was just created @@ -622,6 +625,86 @@ impl Timeline { return Ok(()); } + // retry two times to allow first round to find layers which need to be downloaded, then + // download them, then retry compaction + for round in 0..ROUNDS { + // should we error out with the most specific error? + let last_round = round == ROUNDS - 1; + + let res = self.compact_inner(ctx).await; + + // If `create_image_layers' or `compact_level0` scheduled any + // uploads or deletions, but didn't update the index file yet, + // do it now. + // + // This isn't necessary for correctness, the remote state is + // consistent without the uploads and deletions, and we would + // update the index file on next flush iteration too. But it + // could take a while until that happens. + // + // Additionally, only do this on the terminal round before sleeping. 
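Stepping back to the tasks.rs additions above for a moment: a hedged sketch of how `random_init_delay` and `warn_when_period_overrun` are meant to wrap a periodic loop inside the pageserver crate (the loop body and task name are placeholders; the helpers, the `Cancelled` error and `CancellationToken` come from the patch):

```rust
use std::time::{Duration, Instant};
use tokio_util::sync::CancellationToken;

// Same imports the patched eviction task uses.
use crate::tenant::tasks::{random_init_delay, warn_when_period_overrun};

async fn do_iteration() { /* placeholder for compaction / gc / eviction work */ }

async fn periodic_loop(period: Duration, cancel: CancellationToken) {
    // Jitter the first wake-up over [0, period] so many tenants don't start in lockstep.
    if random_init_delay(period, &cancel).await.is_err() {
        return; // cancelled while waiting out the initial delay
    }
    loop {
        let started_at = Instant::now();
        do_iteration().await;
        // One warning line if the iteration itself consumed the whole period.
        warn_when_period_overrun(started_at.elapsed(), period, "example-task");
        // (the real loops substitute a 10s sleep when the period is zero / disabled)
        tokio::select! {
            _ = cancel.cancelled() => break,
            _ = tokio::time::sleep(period) => {}
        }
    }
}
```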
+ if last_round { + if let Some(remote_client) = &self.remote_client { + remote_client.schedule_index_upload_for_file_changes()?; + } + } + + let rls = match res { + Ok(()) => return Ok(()), + Err(CompactionError::DownloadRequired(rls)) if !last_round => { + // this can be done at most one time before exiting, waiting + rls + } + Err(CompactionError::DownloadRequired(rls)) => { + anyhow::bail!("Compaction requires downloading multiple times (last was {} layers), possibly battling against eviction", rls.len()) + } + Err(CompactionError::Other(e)) => { + return Err(e); + } + }; + + // this path can be visited in the second round of retrying, if first one found that we + // must first download some remote layers + let total = rls.len(); + + let mut downloads = rls + .into_iter() + .map(|rl| self.download_remote_layer(rl)) + .collect::>(); + + let mut failed = 0; + + let cancelled = task_mgr::shutdown_watcher(); + tokio::pin!(cancelled); + + loop { + tokio::select! { + _ = &mut cancelled => anyhow::bail!("Cancelled while downloading remote layers"), + res = downloads.next() => { + match res { + Some(Ok(())) => {}, + Some(Err(e)) => { + warn!("Downloading remote layer for compaction failed: {e:#}"); + failed += 1; + } + None => break, + } + } + } + } + + if failed != 0 { + anyhow::bail!("{failed} out of {total} layers failed to download, retrying later"); + } + + // if everything downloaded fine, lets try again + } + + unreachable!("retry loop exits") + } + + /// Compaction which might need to be retried after downloading remote layers. + async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> { // // High level strategy for compaction / image creation: // @@ -660,7 +743,7 @@ impl Timeline { // Is the timeline being deleted? let state = *self.state.borrow(); if state == TimelineState::Stopping { - anyhow::bail!("timeline is Stopping"); + return Err(anyhow::anyhow!("timeline is Stopping").into()); } let target_file_size = self.get_checkpoint_distance(); @@ -680,7 +763,8 @@ impl Timeline { // "enough". let layer_paths_to_upload = self .create_image_layers(&partitioning, lsn, false, ctx) - .await?; + .await + .map_err(anyhow::Error::from)?; if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { remote_client.schedule_layer_file_upload(&path, &layer_metadata)?; @@ -692,18 +776,6 @@ impl Timeline { self.compact_level0(&layer_removal_cs, target_file_size, ctx) .await?; timer.stop_and_record(); - - // If `create_image_layers' or `compact_level0` scheduled any - // uploads or deletions, but didn't update the index file yet, - // do it now. - // - // This isn't necessary for correctness, the remote state is - // consistent without the uploads and deletions, and we would - // update the index file on next flush iteration too. But it - // could take a while until that happens. - if let Some(remote_client) = &self.remote_client { - remote_client.schedule_index_upload_for_file_changes()?; - } } Err(err) => { // no partitioning? 
This is normal, if the timeline was just created @@ -2536,10 +2608,13 @@ impl Timeline { ) -> anyhow::Result<(KeyPartitioning, Lsn)> { { let partitioning_guard = self.partitioning.lock().unwrap(); - if partitioning_guard.1 != Lsn(0) - && lsn.0 - partitioning_guard.1 .0 <= self.repartition_threshold - { - // no repartitioning needed + let distance = lsn.0 - partitioning_guard.1 .0; + if partitioning_guard.1 != Lsn(0) && distance <= self.repartition_threshold { + debug!( + distance, + threshold = self.repartition_threshold, + "no repartitioning needed" + ); return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); } } @@ -2557,8 +2632,12 @@ impl Timeline { // Is it time to create a new image layer for the given partition? fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result { + let threshold = self.get_image_creation_threshold(); + let layers = self.layers.read().unwrap(); + let mut max_deltas = 0; + for part_range in &partition.ranges { let image_coverage = layers.image_coverage(part_range, lsn)?; for (img_range, last_img) in image_coverage { @@ -2580,21 +2659,25 @@ impl Timeline { // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed // after we read last_record_lsn, which is passed here in the 'lsn' argument. if img_lsn < lsn { - let threshold = self.get_image_creation_threshold(); let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?; - debug!( - "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", - img_range.start, img_range.end, num_deltas, img_lsn, lsn - ); + max_deltas = max_deltas.max(num_deltas); if num_deltas >= threshold { + debug!( + "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", + img_range.start, img_range.end, num_deltas, img_lsn, lsn + ); return Ok(true); } } } } + debug!( + max_deltas, + "none of the partitioned ranges had >= {threshold} deltas" + ); Ok(false) } @@ -2707,25 +2790,55 @@ impl Timeline { Ok(layer_paths_to_upload) } } + #[derive(Default)] struct CompactLevel0Phase1Result { new_layers: Vec, deltas_to_compact: Vec>, } +/// Top-level failure to compact. +#[derive(Debug)] +enum CompactionError { + /// L0 compaction requires layers to be downloaded. + /// + /// This should not happen repeatedly, but will be retried once by top-level + /// `Timeline::compact`. + DownloadRequired(Vec>), + /// Compaction cannot be done right now; page reconstruction and so on. + Other(anyhow::Error), +} + +impl From for CompactionError { + fn from(value: anyhow::Error) -> Self { + CompactionError::Other(value) + } +} + impl Timeline { + /// Level0 files first phase of compaction, explained in the [`compact_inner`] comment. + /// + /// This method takes the `_layer_removal_cs` guard to highlight it required downloads are + /// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the + /// start of level0 files compaction, the on-demand download should be revisited as well. async fn compact_level0_phase1( &self, + _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, target_file_size: u64, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let layers = self.layers.read().unwrap(); let mut level0_deltas = layers.get_level0_deltas()?; drop(layers); // Only compact if enough layers have accumulated. 
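The new `CompactionError` above exists so that the retry logic in `compact` can tell "needs downloads first" apart from a hard failure, while `?` and `.into()` on anyhow errors keep working through the `From` impl. A reduced, self-contained sketch of that pattern (the payload type and layer name are invented for the sketch):

```rust
#[derive(Debug)]
enum CompactionError {
    // retryable: the caller should download these layers and call again
    DownloadRequired(Vec<String>), // simplified payload for the sketch
    Other(anyhow::Error),
}

impl From<anyhow::Error> for CompactionError {
    fn from(value: anyhow::Error) -> Self {
        CompactionError::Other(value)
    }
}

fn phase1(need_download: bool, stopping: bool) -> Result<(), CompactionError> {
    if need_download {
        return Err(CompactionError::DownloadRequired(vec!["layer-A".into()]));
    }
    if stopping {
        // anyhow errors are routed through the From impl, as in the patch
        return Err(anyhow::anyhow!("timeline is Stopping").into());
    }
    Ok(())
}

fn main() {
    match phase1(true, false) {
        Err(CompactionError::DownloadRequired(layers)) => {
            println!("download {} layer(s), then retry compaction", layers.len());
        }
        Err(CompactionError::Other(e)) => println!("compaction failed: {e:#}"),
        Ok(()) => println!("compacted"),
    }
}
```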
- if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() { - return Ok(Default::default()); + let threshold = self.get_compaction_threshold(); + if level0_deltas.is_empty() || level0_deltas.len() < threshold { + debug!( + level0_deltas = level0_deltas.len(), + threshold, "too few deltas to compact" + ); + return Ok(CompactLevel0Phase1Result::default()); } // Gather the files to compact in this iteration. @@ -2761,6 +2874,24 @@ impl Timeline { end: deltas_to_compact.last().unwrap().get_lsn_range().end, }; + let remotes = deltas_to_compact + .iter() + .filter(|l| l.is_remote_layer()) + .inspect(|l| info!("compact requires download of {}", l.filename().file_name())) + .map(|l| { + l.clone() + .downcast_remote_layer() + .expect("just checked it is remote layer") + }) + .collect::>(); + + if !remotes.is_empty() { + // caller is holding the lock to layer_removal_cs, and we don't want to download while + // holding that; in future download_remote_layer might take it as well. this is + // regardless of earlier image creation downloading on-demand, while holding the lock. + return Err(CompactionError::DownloadRequired(remotes)); + } + info!( "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)", lsn_range.start, @@ -2768,9 +2899,11 @@ impl Timeline { deltas_to_compact.len(), level0_deltas.len() ); + for l in deltas_to_compact.iter() { info!("compact includes {}", l.filename().file_name()); } + // We don't need the original list of layers anymore. Drop it so that // we don't accidentally use it later in the function. drop(level0_deltas); @@ -2940,7 +3073,9 @@ impl Timeline { } fail_point!("delta-layer-writer-fail-before-finish", |_| { - anyhow::bail!("failpoint delta-layer-writer-fail-before-finish"); + return Err( + anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into(), + ); }); writer.as_mut().unwrap().put_value(key, lsn, value)?; @@ -2959,7 +3094,7 @@ impl Timeline { // Fsync all the layer files and directory using multiple threads to // minimize latency. - par_fsync::par_fsync(&layer_paths)?; + par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?; layer_paths.pop().unwrap(); } @@ -2981,11 +3116,13 @@ impl Timeline { layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, target_file_size: u64, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), CompactionError> { let CompactLevel0Phase1Result { new_layers, deltas_to_compact, - } = self.compact_level0_phase1(target_file_size, ctx).await?; + } = self + .compact_level0_phase1(layer_removal_cs, target_file_size, ctx) + .await?; if new_layers.is_empty() && deltas_to_compact.is_empty() { // nothing to do @@ -3009,7 +3146,12 @@ impl Timeline { for l in new_layers { let new_delta_path = l.path(); - let metadata = new_delta_path.metadata()?; + let metadata = new_delta_path.metadata().with_context(|| { + format!( + "read file metadata for new created layer {}", + new_delta_path.display() + ) + })?; if let Some(remote_client) = &self.remote_client { remote_client.schedule_layer_file_upload( @@ -3243,7 +3385,7 @@ impl Timeline { let mut layers_to_remove = Vec::new(); - // Scan all on-disk layers in the timeline. + // Scan all layers in the timeline (remote or on-disk). // // Garbage collect the layer if all conditions are satisfied: // 1. 
it is older than cutoff LSN; @@ -3482,14 +3624,26 @@ impl Timeline { &self, remote_layer: Arc, ) -> anyhow::Result<()> { + use std::sync::atomic::Ordering::Relaxed; + let permit = match Arc::clone(&remote_layer.ongoing_download) .acquire_owned() .await { Ok(permit) => permit, Err(_closed) => { - info!("download of layer has already finished"); - return Ok(()); + if remote_layer.download_replacement_failure.load(Relaxed) { + // this path will be hit often, in case there are upper retries. however + // hitting this error will prevent a busy loop between get_reconstruct_data and + // download, so an error is prefered. + // + // TODO: we really should poison the timeline, but panicking is not yet + // supported. Related: https://github.com/neondatabase/neon/issues/3621 + anyhow::bail!("an earlier download succeeded but LayerMap::replace failed") + } else { + info!("download of layer has already finished"); + return Ok(()); + } } }; @@ -3525,8 +3679,8 @@ impl Timeline { { use crate::tenant::layer_map::Replacement; let l: Arc = remote_layer.clone(); - match updates.replace_historic(&l, new_layer) { - Ok(Replacement::Replaced { .. }) => { /* expected */ } + let failure = match updates.replace_historic(&l, new_layer) { + Ok(Replacement::Replaced { .. }) => false, Ok(Replacement::NotFound) => { // TODO: the downloaded file should probably be removed, otherwise // it will be added to the layermap on next load? we should @@ -3534,6 +3688,7 @@ impl Timeline { // // See: https://github.com/neondatabase/neon/issues/3533 error!("replacing downloaded layer into layermap failed because layer was not found"); + true } Ok(Replacement::RemovalBuffered) => { unreachable!("current implementation does not remove anything") @@ -3552,12 +3707,32 @@ impl Timeline { ?other, "replacing downloaded layer into layermap failed because another layer was found instead of expected" ); + true } Err(e) => { // this is a precondition failure, the layer filename derived // attributes didn't match up, which doesn't seem likely. - error!("replacing downloaded layer into layermap failed: {e:#?}") + error!("replacing downloaded layer into layermap failed: {e:#?}"); + true } + }; + + if failure { + // mark the remote layer permanently failed; the timeline is most + // likely unusable after this. sadly we cannot just poison the layermap + // lock with panic, because that would create an issue with shutdown. + // + // this does not change the retry semantics on failed downloads. + // + // use of Relaxed is valid because closing of the semaphore gives + // happens-before and wakes up any waiters; we write this value before + // and any waiters (or would be waiters) will load it after closing + // semaphore. + // + // See: https://github.com/neondatabase/neon/issues/3533 + remote_layer + .download_replacement_failure + .store(true, Relaxed); } } updates.flush(); @@ -3583,7 +3758,7 @@ impl Timeline { drop(permit); Ok(()) - }, + }.in_current_span(), ); receiver.await.context("download task cancelled")? 
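Before moving on to the eviction task: the `download_replacement_failure` flag added above pairs with the pre-existing `ongoing_download` semaphore so that later callers of `download_remote_layer` can distinguish "already downloaded" from "downloaded, but the layer map swap failed". A condensed, hedged sketch of that pairing (the struct, function and `replace_fails` knob are local to the sketch, not pageserver APIs):

```rust
use std::sync::atomic::{AtomicBool, Ordering::Relaxed};
use std::sync::Arc;
use tokio::sync::Semaphore;

struct DownloadState {
    ongoing: Arc<Semaphore>, // 1 permit; closed once the download path has run
    poisoned: AtomicBool,    // set before closing when the layer swap failed
}

async fn download_once(state: &DownloadState, replace_fails: bool) -> anyhow::Result<()> {
    let _permit = match Arc::clone(&state.ongoing).acquire_owned().await {
        Ok(permit) => permit,
        Err(_closed) => {
            // a previous caller already finished; check how it ended
            if state.poisoned.load(Relaxed) {
                anyhow::bail!("an earlier download succeeded but the layer swap failed");
            }
            return Ok(()); // nothing to do
        }
    };
    // ... perform the download and try to swap the new layer into the map ...
    if replace_fails {
        // the patch relies on the semaphore close for ordering, so the flag
        // itself can be stored with Relaxed before closing
        state.poisoned.store(true, Relaxed);
    }
    state.ongoing.close(); // from now on acquire_owned() returns Err(...)
    Ok(())
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let state = DownloadState {
        ongoing: Arc::new(Semaphore::new(1)),
        poisoned: AtomicBool::new(false),
    };
    download_once(&state, true).await?; // first call "fails" the swap
    assert!(download_once(&state, false).await.is_err()); // later callers see the poison
    Ok(())
}
```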
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index e3e7ce4c9df9..2aad0ef0f344 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -41,9 +41,23 @@ impl Timeline { #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))] async fn eviction_task(self: Arc, cancel: CancellationToken) { + use crate::tenant::tasks::random_init_delay; + { + let policy = self.get_eviction_policy(); + let period = match policy { + EvictionPolicy::LayerAccessThreshold(lat) => lat.period, + EvictionPolicy::NoEviction => Duration::from_secs(10), + }; + if random_init_delay(period, &cancel).await.is_err() { + info!("shutting down"); + return; + } + } + loop { let policy = self.get_eviction_policy(); let cf = self.eviction_iteration(&policy, cancel.clone()).await; + match cf { ControlFlow::Break(()) => break, ControlFlow::Continue(sleep_until) => { @@ -78,13 +92,7 @@ impl Timeline { ControlFlow::Continue(()) => (), } let elapsed = start.elapsed(); - if elapsed > p.period { - warn!( - configured_period = %humantime::format_duration(p.period), - last_period = %humantime::format_duration(elapsed), - "this eviction period took longer than the configured period" - ); - } + crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction"); ControlFlow::Continue(start + p.period) } } @@ -100,7 +108,6 @@ impl Timeline { #[allow(dead_code)] #[derive(Debug, Default)] struct EvictionStats { - not_considered_due_to_clock_skew: usize, candidates: usize, evicted: usize, errors: usize, @@ -129,9 +136,21 @@ impl Timeline { let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, Err(_e) => { - // NB: don't log the error. If there are many layers and the system clock - // is skewed, we'd be flooding the log. - stats.not_considered_due_to_clock_skew += 1; + // We reach here if `now` < `last_activity_ts`, which can legitimately + // happen if there is an access between us getting `now`, and us getting + // the access stats from the layer. + // + // The other reason why it can happen is system clock skew because + // SystemTime::now() is not monotonic, so, even if there is no access + // to the layer after we get `now` at the beginning of this function, + // it could be that `now` < `last_activity_ts`. + // + // To distinguish the cases, we would need to record `Instant`s in the + // access stats (i.e., monotonic timestamps), but then, the timestamps + // values in the access stats would need to be `Instant`'s, and hence + // they would be meaningless outside of the pageserver process. + // At the time of writing, the trade-off is that access stats are more + // valuable than detecting clock skew. 
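The comment above hinges on `SystemTime` not being monotonic: `duration_since` reports `Err` whenever the supposedly earlier timestamp is ahead of `now`, which is exactly the case the eviction loop now skips without counting a separate statistic. A minimal demonstration of that standard-library behaviour:

```rust
use std::time::{Duration, SystemTime};

fn main() {
    let now = SystemTime::now();
    // an access recorded "after" now, e.g. due to a race or clock adjustment
    let last_activity_ts = now + Duration::from_secs(5);
    match now.duration_since(last_activity_ts) {
        Ok(idle) => println!("layer idle for {idle:?}"),
        // this is the branch the eviction loop hits and simply skips the layer
        Err(e) => println!("last activity is {:?} ahead of now, skipping", e.duration()),
    }
}
```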
continue; } }; @@ -188,8 +207,9 @@ impl Timeline { } } } - if stats.not_considered_due_to_clock_skew > 0 || stats.errors > 0 || stats.not_evictable > 0 - { + if stats.candidates == stats.not_evictable { + debug!(stats=?stats, "eviction iteration complete"); + } else if stats.errors > 0 || stats.not_evictable > 0 { warn!(stats=?stats, "eviction iteration complete"); } else { info!(stats=?stats, "eviction iteration complete"); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index c943bf0a276f..21c6ede27e93 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -27,8 +27,7 @@ use std::fs::OpenOptions; use std::io::prelude::*; use std::io::{Error, ErrorKind}; use std::ops::{Deref, DerefMut}; -use std::os::fd::RawFd; -use std::os::unix::io::AsRawFd; +use std::os::unix::io::{AsRawFd, RawFd}; use std::os::unix::prelude::CommandExt; use std::path::PathBuf; use std::process::Stdio; diff --git a/poetry.lock b/poetry.lock index 7e80b1e10ada..bc2c56d74c61 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1238,7 +1238,6 @@ category = "main" optional = false python-versions = "*" files = [ - {file = "junit-xml-1.9.tar.gz", hash = "sha256:de16a051990d4e25a3982b2dd9e89d671067548718866416faec14d9de56db9f"}, {file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"}, ] @@ -1309,66 +1308,63 @@ files = [ [[package]] name = "moto" -version = "3.1.18" -description = "A library that allows your python tests to easily mock out the boto library" +version = "4.1.2" +description = "" category = "main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "moto-3.1.18-py3-none-any.whl", hash = "sha256:b6eb096e7880c46ac44d6d90988c0043e31462115cfdc913a0ee8f470bd9555c"}, - {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"}, + {file = "moto-4.1.2-py2.py3-none-any.whl", hash = "sha256:1b361ece638c74a657325378a259276f368aafce2f8be84f8143e69fa93ce8ec"}, + {file = "moto-4.1.2.tar.gz", hash = "sha256:63431733d2a02c7bd652ad71ec1da442a0e0d580cbac5eeb50d440a2ce066eac"}, ] [package.dependencies] aws-xray-sdk = {version = ">=0.93,<0.96 || >0.96", optional = true, markers = "extra == \"server\""} boto3 = ">=1.9.201" botocore = ">=1.12.201" -cfn-lint = {version = ">=0.4.0", optional = true, markers = "extra == \"server\""} +cfn-lint = {version = ">=0.40.0", optional = true, markers = "extra == \"server\""} cryptography = ">=3.3.1" docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""} ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""} -flask = {version = "<2.2.0", optional = true, markers = "extra == \"server\""} +flask = {version = "<2.2.0 || >2.2.0,<2.2.1 || >2.2.1", optional = true, markers = "extra == \"server\""} flask-cors = {version = "*", optional = true, markers = "extra == \"server\""} graphql-core = {version = "*", optional = true, markers = "extra == \"server\""} -idna = {version = ">=2.5,<4", optional = true, markers = "extra == \"server\""} Jinja2 = ">=2.10.1" jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""} -MarkupSafe = "!=2.0.0a1" openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""} pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""} python-dateutil = ">=2.1,<3.0.0" python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, 
markers = "extra == \"server\""} -pytz = "*" PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} requests = ">=2.5" -responses = ">=0.9.0" +responses = ">=0.13.0" setuptools = {version = "*", optional = true, markers = "extra == \"server\""} sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""} -werkzeug = ">=0.5,<2.2.0" +werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1" xmltodict = "*" [package.extras] -all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] apigatewayv2 = ["PyYAML (>=5.1)"] appsync = ["graphql-core"] awslambda = ["docker (>=2.5.1)"] batch = ["docker (>=2.5.1)"] -cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"] ds = ["sshpubkeys (>=3.1.0)"] dynamodb = ["docker (>=2.5.1)"] -dynamodb2 = ["docker (>=2.5.1)"] dynamodbstreams = ["docker (>=2.5.1)"] ebs = ["sshpubkeys (>=3.1.0)"] ec2 = ["sshpubkeys (>=3.1.0)"] efs = ["sshpubkeys (>=3.1.0)"] +eks = ["sshpubkeys (>=3.1.0)"] glue = ["pyparsing (>=3.0.7)"] iotdata = ["jsondiff (>=1.1.2)"] route53resolver = ["sshpubkeys (>=3.1.0)"] s3 = ["PyYAML (>=5.1)"] -server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (<2.2.0)", "flask-cors", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] -ssm = ["PyYAML (>=5.1)", "dataclasses"] +server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"] +ssm = ["PyYAML (>=5.1)"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] [[package]] @@ -1716,7 +1712,6 @@ python-versions = ">=3.6" files = [ {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"}, {file = 
"psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"}, - {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"}, {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"}, @@ -1750,7 +1745,6 @@ files = [ {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"}, {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"}, - {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"}, @@ -1762,7 +1756,6 @@ files = [ {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"}, {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"}, - {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"}, {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"}, @@ -1795,7 +1788,18 @@ category = "main" optional = false python-versions = "*" files = [ + {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"}, + {file = "pyasn1-0.4.8-py2.5.egg", hash = 
"sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"}, + {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"}, + {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"}, {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"}, + {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"}, + {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"}, + {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"}, + {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"}, + {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"}, + {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"}, + {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"}, {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"}, ] @@ -2004,8 +2008,8 @@ files = [ [package.dependencies] pytest = [ - {version = ">=5.0", markers = "python_version < \"3.10\""}, {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, + {version = ">=5.0", markers = "python_version < \"3.10\""}, ] [[package]] @@ -2082,18 +2086,6 @@ cryptography = ["cryptography (>=3.4.0)"] pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"] pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"] -[[package]] -name = "pytz" -version = "2022.1" -description = "World timezone definitions, modern and historical" -category = "main" -optional = false -python-versions = "*" -files = [ - {file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"}, - {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"}, -] - [[package]] name = "pywin32" version = "301" @@ -2129,13 +2121,6 @@ files = [ {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, - {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, - {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, @@ -2449,16 +2434,19 @@ test = ["websockets"] [[package]] name = "werkzeug" -version = "2.1.2" +version = "2.2.3" description = "The comprehensive WSGI web application library." category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "Werkzeug-2.1.2-py3-none-any.whl", hash = "sha256:72a4b735692dd3135217911cbeaa1be5fa3f62bffb8745c5215420a03dc55255"}, - {file = "Werkzeug-2.1.2.tar.gz", hash = "sha256:1ce08e8093ed67d638d63879fd1ba3735817f7a80de3674d293f5984f25fb6e6"}, + {file = "Werkzeug-2.2.3-py3-none-any.whl", hash = "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"}, + {file = "Werkzeug-2.2.3.tar.gz", hash = "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe"}, ] +[package.dependencies] +MarkupSafe = ">=2.1.1" + [package.extras] watchdog = ["watchdog"] @@ -2655,4 +2643,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "7563a38912963d8cf20c99acb06fe55623e65b799c4b88d37dc672e5384c96a3" +content-hash = "3038940781ef59d1ed28cedf46120ad6623e21e602c38ad3c359428d79fa1efd" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 152c83e4a0b0..030a5f1d6e47 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -28,6 +28,7 @@ itertools.workspace = true md5.workspace = true metrics.workspace = true once_cell.workspace = true +opentelemetry.workspace = true parking_lot.workspace = true pin-project-lite.workspace = true pq_proto.workspace = true @@ -35,6 +36,8 @@ prometheus.workspace = true rand.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["json"] } +reqwest-middleware.workspace = true +reqwest-tracing.workspace = true routerify.workspace = true rustls-pemfile.workspace = true rustls.workspace = true @@ -48,8 +51,10 @@ thiserror.workspace = true tls-listener.workspace = true tokio-postgres.workspace = true tokio-rustls.workspace = true -tokio.workspace = true +tokio = { workspace = true, features = ["signal"] } +tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true +tracing-utils.workspace = true tracing.workspace = true url.workspace = true utils.workspace = true diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 50afbd2a27e2..b8599adaeb6b 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -1,11 +1,11 @@ mod classic; - +mod hacks; mod link; -use futures::TryFutureExt; + pub use link::LinkAuthError; use crate::{ - auth::{self, AuthFlow, ClientCredentials}, + auth::{self, ClientCredentials}, console::{ self, provider::{CachedNodeInfo, ConsoleReqExtra}, @@ -13,9 +13,10 @@ use crate::{ }, stream, url, }; +use futures::TryFutureExt; use 
std::borrow::Cow; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{info, warn}; +use tracing::info; /// A product of successful authentication. pub struct AuthSuccess { @@ -105,97 +106,49 @@ impl<'a, T, E> BackendType<'a, Result> { } } -// TODO: get rid of explicit lifetimes in this block (there's a bug in rustc). -// Read more: https://github.com/rust-lang/rust/issues/99190 -// Alleged fix: https://github.com/rust-lang/rust/pull/89056 -impl<'l> BackendType<'l, ClientCredentials<'_>> { - /// Do something special if user didn't provide the `project` parameter. - async fn try_password_hack<'a>( - &'a mut self, - extra: &'a ConsoleReqExtra<'a>, - client: &'a mut stream::PqStream, - ) -> auth::Result>> { - use BackendType::*; - - // If there's no project so far, that entails that client doesn't - // support SNI or other means of passing the project name. - // We now expect to see a very specific payload in the place of password. - let fetch_magic_payload = |client| async { - warn!("project name not specified, resorting to the password hack auth flow"); - let payload = AuthFlow::new(client) - .begin(auth::PasswordHack) - .await? - .authenticate() - .await?; - - info!(project = &payload.project, "received missing parameter"); - auth::Result::Ok(payload) - }; - - // If we want to use cleartext password flow, we can read the password - // from the client and pretend that it's a magic payload (PasswordHack hack). - let fetch_plaintext_password = |client| async { - info!("using cleartext password flow"); - let payload = AuthFlow::new(client) - .begin(auth::CleartextPassword) - .await? - .authenticate() - .await?; - - auth::Result::Ok(auth::password_hack::PasswordHackPayload { - project: String::new(), - password: payload, - }) - }; - - // TODO: find a proper way to merge those very similar blocks. - let (mut node, password) = match self { - Console(api, creds) if creds.project.is_none() => { - let payload = fetch_magic_payload(client).await?; - creds.project = Some(payload.project.into()); - let node = api.wake_compute(extra, creds).await?; - - (node, payload.password) - } - // This is a hack to allow cleartext password in secure connections (wss). - Console(api, creds) if creds.use_cleartext_password_flow => { - let payload = fetch_plaintext_password(client).await?; - let node = api.wake_compute(extra, creds).await?; - - (node, payload.password) - } - Postgres(api, creds) if creds.project.is_none() => { - let payload = fetch_magic_payload(client).await?; - creds.project = Some(payload.project.into()); - let node = api.wake_compute(extra, creds).await?; +/// True to its name, this function encapsulates our current auth trade-offs. +/// Here, we choose the appropriate auth flow based on circumstances. +async fn auth_quirks( + api: &impl console::Api, + extra: &ConsoleReqExtra<'_>, + creds: &mut ClientCredentials<'_>, + client: &mut stream::PqStream, + allow_cleartext: bool, +) -> auth::Result> { + // If there's no project so far, that entails that client doesn't + // support SNI or other means of passing the endpoint (project) name. + // We now expect to see a very specific payload in the place of password. + if creds.project.is_none() { + // Password will be checked by the compute node later. + return hacks::password_hack(api, extra, creds, client).await; + } - (node, payload.password) - } - _ => return Ok(None), - }; + // Password hack should set the project name. + // TODO: make `creds.project` more type-safe. 
+ assert!(creds.project.is_some()); - node.config.password(password); - Ok(Some(AuthSuccess { - reported_auth_ok: false, - value: node, - })) + // Perform cleartext auth if we're allowed to do that. + // Currently, we use it for websocket connections (latency). + if allow_cleartext { + // Password will be checked by the compute node later. + return hacks::cleartext_hack(api, extra, creds, client).await; } + // Finally, proceed with the main auth flow (SCRAM-based). + classic::authenticate(api, extra, creds, client).await +} + +impl BackendType<'_, ClientCredentials<'_>> { /// Authenticate the client via the requested backend, possibly using credentials. - pub async fn authenticate<'a>( + #[tracing::instrument(fields(allow_cleartext), skip_all)] + pub async fn authenticate( &mut self, - extra: &'a ConsoleReqExtra<'a>, - client: &'a mut stream::PqStream, + extra: &ConsoleReqExtra<'_>, + client: &mut stream::PqStream, + allow_cleartext: bool, ) -> auth::Result> { use BackendType::*; - // Handle cases when `project` is missing in `creds`. - // TODO: type safety: return `creds` with irrefutable `project`. - if let Some(res) = self.try_password_hack(extra, client).await? { - info!("user successfully authenticated (using the password hack)"); - return Ok(res); - } - let res = match self { Console(api, creds) => { info!( @@ -204,20 +157,24 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> { "performing authentication using the console" ); - assert!(creds.project.is_some()); - classic::handle_user(api.as_ref(), extra, creds, client).await? + let api = api.as_ref(); + auth_quirks(api, extra, creds, client, allow_cleartext).await? } Postgres(api, creds) => { - info!("performing mock authentication using a local postgres instance"); + info!( + user = creds.user, + project = creds.project(), + "performing authentication using a local postgres instance" + ); - assert!(creds.project.is_some()); - classic::handle_user(api.as_ref(), extra, creds, client).await? + let api = api.as_ref(); + auth_quirks(api, extra, creds, client, allow_cleartext).await? } // NOTE: this auth backend doesn't use client credentials. Link(url) => { info!("performing link authentication"); - link::handle_user(url, client) + link::authenticate(url, client) .await? .map(CachedNodeInfo::new_uncached) } @@ -229,9 +186,9 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> { /// When applicable, wake the compute node, gaining its connection info in the process. /// The link auth flow doesn't support this, so we return [`None`] in that case. 
- pub async fn wake_compute<'a>( + pub async fn wake_compute( &self, - extra: &'a ConsoleReqExtra<'a>, + extra: &ConsoleReqExtra<'_>, ) -> Result, console::errors::WakeComputeError> { use BackendType::*; diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index eefef6e9b498..6753e7ed7fb1 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -9,7 +9,7 @@ use crate::{ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; -pub(super) async fn handle_user( +pub(super) async fn authenticate( api: &impl console::Api, extra: &ConsoleReqExtra<'_>, creds: &ClientCredentials<'_>, diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs new file mode 100644 index 000000000000..f710581cb2d4 --- /dev/null +++ b/proxy/src/auth/backend/hacks.rs @@ -0,0 +1,66 @@ +use super::AuthSuccess; +use crate::{ + auth::{self, AuthFlow, ClientCredentials}, + console::{ + self, + provider::{CachedNodeInfo, ConsoleReqExtra}, + }, + stream, +}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tracing::{info, warn}; + +/// Compared to [SCRAM](crate::scram), cleartext password auth saves +/// one round trip and *expensive* computations (>= 4096 HMAC iterations). +/// These properties are beneficial for serverless JS workers, so we +/// use this mechanism for websocket connections. +pub async fn cleartext_hack( + api: &impl console::Api, + extra: &ConsoleReqExtra<'_>, + creds: &mut ClientCredentials<'_>, + client: &mut stream::PqStream, +) -> auth::Result> { + warn!("cleartext auth flow override is enabled, proceeding"); + let password = AuthFlow::new(client) + .begin(auth::CleartextPassword) + .await? + .authenticate() + .await?; + + let mut node = api.wake_compute(extra, creds).await?; + node.config.password(password); + + // Report tentative success; compute node will check the password anyway. + Ok(AuthSuccess { + reported_auth_ok: false, + value: node, + }) +} + +/// Workaround for clients which don't provide an endpoint (project) name. +/// Very similar to [`cleartext_hack`], but there's a specific password format. +pub async fn password_hack( + api: &impl console::Api, + extra: &ConsoleReqExtra<'_>, + creds: &mut ClientCredentials<'_>, + client: &mut stream::PqStream, +) -> auth::Result> { + warn!("project not specified, resorting to the password hack auth flow"); + let payload = AuthFlow::new(client) + .begin(auth::PasswordHack) + .await? + .authenticate() + .await?; + + info!(project = &payload.project, "received missing parameter"); + creds.project = Some(payload.project.into()); + + let mut node = api.wake_compute(extra, creds).await?; + node.config.password(payload.password); + + // Report tentative success; compute node will check the password anyway.
+ Ok(AuthSuccess { + reported_auth_ok: false, + value: node, + }) +} diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 5d0049c95712..7175a23dc108 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -53,7 +53,7 @@ pub fn new_psql_session_id() -> String { hex::encode(rand::random::<[u8; 8]>()) } -pub(super) async fn handle_user( +pub(super) async fn authenticate( link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result> { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 968104f05876..c556c33197f4 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -11,12 +11,16 @@ pub enum ClientCredsParseError { #[error("Parameter '{0}' is missing in startup packet.")] MissingKey(&'static str), - #[error("Inconsistent project name inferred from SNI ('{}') and project option ('{}').", .domain, .option)] + #[error( + "Inconsistent project name inferred from \ + SNI ('{}') and project option ('{}').", + .domain, .option, + )] InconsistentProjectNames { domain: String, option: String }, #[error( "SNI ('{}') inconsistently formatted with respect to common name ('{}'). \ - SNI should be formatted as '.{}'.", + SNI should be formatted as '.{}'.", .sni, .cn, .cn, )] InconsistentSni { sni: String, cn: String }, @@ -34,9 +38,6 @@ pub struct ClientCredentials<'a> { pub user: &'a str, // TODO: this is a severe misnomer! We should think of a new name ASAP. pub project: Option>, - /// If `True`, we'll use the old cleartext password flow. This is used for - /// websocket connections, which want to minimize the number of round trips. - pub use_cleartext_password_flow: bool, } impl ClientCredentials<'_> { @@ -51,7 +52,6 @@ impl<'a> ClientCredentials<'a> { params: &'a StartupMessageParams, sni: Option<&str>, common_name: Option<&str>, - use_cleartext_password_flow: bool, ) -> Result { use ClientCredsParseError::*; @@ -96,18 +96,9 @@ impl<'a> ClientCredentials<'a> { } .transpose()?; - info!( - user = user, - project = project.as_deref(), - use_cleartext_password_flow = use_cleartext_password_flow, - "credentials" - ); - - Ok(Self { - user, - project, - use_cleartext_password_flow, - }) + info!(user, project = project.as_deref(), "credentials"); + + Ok(Self { user, project }) } } @@ -131,7 +122,7 @@ mod tests { // According to postgresql, only `user` should be required. 
let options = StartupMessageParams::new([("user", "john_doe")]); - let creds = ClientCredentials::parse(&options, None, None, false)?; + let creds = ClientCredentials::parse(&options, None, None)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project, None); @@ -146,7 +137,7 @@ ("foo", "bar"), // should be ignored ]); - let creds = ClientCredentials::parse(&options, None, None, false)?; + let creds = ClientCredentials::parse(&options, None, None)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project, None); @@ -160,7 +151,7 @@ let sni = Some("foo.localhost"); let common_name = Some("localhost"); - let creds = ClientCredentials::parse(&options, sni, common_name, false)?; + let creds = ClientCredentials::parse(&options, sni, common_name)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("foo")); @@ -174,7 +165,7 @@ ("options", "-ckey=1 project=bar -c geqo=off"), ]); - let creds = ClientCredentials::parse(&options, None, None, false)?; + let creds = ClientCredentials::parse(&options, None, None)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("bar")); @@ -188,7 +179,7 @@ let sni = Some("baz.localhost"); let common_name = Some("localhost"); - let creds = ClientCredentials::parse(&options, sni, common_name, false)?; + let creds = ClientCredentials::parse(&options, sni, common_name)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("baz")); @@ -203,8 +194,7 @@ let sni = Some("second.localhost"); let common_name = Some("localhost"); - let err = - ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail"); + let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"); match err { InconsistentProjectNames { domain, option } => { assert_eq!(option, "first"); @@ -221,8 +211,7 @@ let sni = Some("project.localhost"); let common_name = Some("example.com"); - let err = - ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail"); + let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"); match err { InconsistentSni { sni, cn } => { assert_eq!(sni, "project.localhost"); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 3f5eb3caff80..2e12d9ee26fd 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -77,14 +77,9 @@ impl ConnCfg { self.dbname(dbname); } - if let Some(options) = params.options_raw() { - // We must drop all proxy-specific parameters. - #[allow(unstable_name_collisions)] - let options: String = options - .filter(|opt| !opt.starts_with("project=")) - .intersperse(" ") // TODO: use impl from std once it's stabilized - .collect(); - + // Don't add `options` if they were only used for specifying a project. + // Connection pools don't support `options`, because they affect backend startup. + if let Some(options) = filtered_options(params) { self.options(&options); } @@ -225,3 +220,46 @@ impl ConnCfg { Ok(connection) } } + +/// Retrieve `options` from a startup message, dropping all proxy-specific flags. +fn filtered_options(params: &StartupMessageParams) -> Option { + #[allow(unstable_name_collisions)] + let options: String = params + .options_raw()? + .filter(|opt| !opt.starts_with("project=")) + .intersperse(" ") // TODO: use impl from std once it's stabilized + .collect(); + + // Don't even bother with empty options.
+ if options.is_empty() { + return None; + } + + Some(options) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_filtered_options() { + // Empty options is unlikely to be useful anyway. + let params = StartupMessageParams::new([("options", "")]); + assert_eq!(filtered_options(¶ms), None); + + // It's likely that clients will only use options to specify endpoint/project. + let params = StartupMessageParams::new([("options", "project=foo")]); + assert_eq!(filtered_options(¶ms), None); + + // Same, because unescaped whitespaces are no-op. + let params = StartupMessageParams::new([("options", " project=foo ")]); + assert_eq!(filtered_options(¶ms).as_deref(), None); + + let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]); + assert_eq!(filtered_options(¶ms).as_deref(), Some(r"\ \ ")); + + let params = StartupMessageParams::new([("options", "project = foo")]); + assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); + } +} diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 5e285f3625d9..600db7f8eca2 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -8,6 +8,7 @@ pub struct ProxyConfig { pub metric_collection: Option, } +#[derive(Debug)] pub struct MetricCollectionConfig { pub endpoint: reqwest::Url, pub interval: Duration, diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index 51a117d3b782..c00c06fbb7c1 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -5,10 +5,7 @@ use crate::{ use anyhow::Context; use once_cell::sync::Lazy; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use std::{ - net::{TcpListener, TcpStream}, - thread, -}; +use std::{net::TcpStream, thread}; use tracing::{error, info, info_span}; use utils::{ postgres_backend::{self, AuthType, PostgresBackend}, @@ -34,23 +31,24 @@ pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::N CPLANE_WAITERS.notify(psql_session_id, msg) } -/// Console management API listener thread. +/// Console management API listener task. /// It spawns console response handlers needed for the link auth. -pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> { +pub async fn task_main(listener: tokio::net::TcpListener) -> anyhow::Result<()> { scopeguard::defer! { info!("mgmt has shut down"); } - listener - .set_nonblocking(false) - .context("failed to set listener to blocking")?; - loop { - let (socket, peer_addr) = listener.accept().context("failed to accept a new client")?; + let (socket, peer_addr) = listener.accept().await?; info!("accepted connection from {peer_addr}"); + + let socket = socket.into_std()?; socket .set_nodelay(true) .context("failed to set client socket option")?; + socket + .set_nonblocking(false) + .context("failed to set client socket option")?; // TODO: replace with async tasks. thread::spawn(move || { diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 7621aba19b24..80cd94d48366 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -11,8 +11,10 @@ use async_trait::async_trait; use std::sync::Arc; pub mod errors { - use crate::error::{io_error, UserFacingError}; - use reqwest::StatusCode as HttpStatusCode; + use crate::{ + error::{io_error, UserFacingError}, + http, + }; use thiserror::Error; /// A go-to error message which doesn't leak any detail. @@ -24,7 +26,7 @@ pub mod errors { /// Error returned by the console itself. 
#[error("{REQUEST_FAILED} with {}: {}", .status, .text)] Console { - status: HttpStatusCode, + status: http::StatusCode, text: Box, }, @@ -35,7 +37,7 @@ pub mod errors { impl ApiError { /// Returns HTTP status code if it's the reason for failure. - pub fn http_status_code(&self) -> Option { + pub fn http_status_code(&self) -> Option { use ApiError::*; match self { Console { status, .. } => Some(*status), @@ -51,15 +53,15 @@ pub mod errors { // To minimize risks, only select errors are forwarded to users. // Ask @neondatabase/control-plane for review before adding more. Console { status, .. } => match *status { - HttpStatusCode::NOT_FOUND => { + http::StatusCode::NOT_FOUND => { // Status 404: failed to get a project-related resource. format!("{REQUEST_FAILED}: endpoint cannot be found") } - HttpStatusCode::NOT_ACCEPTABLE => { + http::StatusCode::NOT_ACCEPTABLE => { // Status 406: endpoint is disabled (we don't allow connections). format!("{REQUEST_FAILED}: endpoint is disabled") } - HttpStatusCode::LOCKED => { + http::StatusCode::LOCKED => { // Status 423: project might be in maintenance mode (or bad state). format!("{REQUEST_FAILED}: endpoint is temporary unavailable") } @@ -70,13 +72,18 @@ pub mod errors { } } - // Helps eliminate graceless `.map_err` calls without introducing another ctor. impl From for ApiError { fn from(e: reqwest::Error) -> Self { io_error(e).into() } } + impl From for ApiError { + fn from(e: reqwest_middleware::Error) -> Self { + io_error(e).into() + } + } + #[derive(Debug, Error)] pub enum GetAuthInfoError { // We shouldn't include the actual secret here. diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 4eca025d2d99..3644db17f7ea 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -8,7 +8,6 @@ use super::{ use crate::{auth::ClientCredentials, compute, http, scram}; use async_trait::async_trait; use futures::TryFutureExt; -use reqwest::StatusCode as HttpStatusCode; use tracing::{error, info, info_span, warn, Instrument}; #[derive(Clone)] @@ -52,7 +51,7 @@ impl Api { Ok(body) => body, // Error 404 is special: it's ok not to have a secret. Err(e) => match e.http_status_code() { - Some(HttpStatusCode::NOT_FOUND) => return Ok(None), + Some(http::StatusCode::NOT_FOUND) => return Ok(None), _otherwise => return Err(e.into()), }, }; @@ -154,7 +153,7 @@ impl super::Api for Api { /// Parse http response body, taking status code into account. async fn parse_body serde::Deserialize<'a>>( - response: reqwest::Response, + response: http::Response, ) -> Result { let status = response.status(); if status.is_success() { diff --git a/proxy/src/http.rs b/proxy/src/http.rs index e847edc8bd68..a54415780089 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -1,7 +1,24 @@ +//! HTTP client and server impls. +//! Other modules should use stuff from this module instead of +//! directly relying on deps like `reqwest` (think loose coupling). + pub mod server; pub mod websocket; +pub use reqwest::{Request, Response, StatusCode}; +pub use reqwest_middleware::{ClientWithMiddleware, Error}; + use crate::url::ApiUrl; +use reqwest_middleware::RequestBuilder; + +/// This is the preferred way to create new http clients, +/// because it takes care of observability (OpenTelemetry). +/// We deliberately don't want to replace this with a public static. 
+pub fn new_client() -> ClientWithMiddleware { + reqwest_middleware::ClientBuilder::new(reqwest::Client::new()) + .with(reqwest_tracing::TracingMiddleware::default()) + .build() +} /// Thin convenience wrapper for an API provided by an http endpoint. #[derive(Debug, Clone)] @@ -9,13 +26,17 @@ pub struct Endpoint { /// API's base URL. endpoint: ApiUrl, /// Connection manager with built-in pooling. - client: reqwest::Client, + client: ClientWithMiddleware, } impl Endpoint { /// Construct a new HTTP endpoint wrapper. - pub fn new(endpoint: ApiUrl, client: reqwest::Client) -> Self { - Self { endpoint, client } + /// Http client is not constructed under the hood so that it can be shared. + pub fn new(endpoint: ApiUrl, client: impl Into) -> Self { + Self { + endpoint, + client: client.into(), + } } #[inline(always)] @@ -23,19 +44,16 @@ impl Endpoint { &self.endpoint } - /// Return a [builder](reqwest::RequestBuilder) for a `GET` request, + /// Return a [builder](RequestBuilder) for a `GET` request, /// appending a single `path` segment to the base endpoint URL. - pub fn get(&self, path: &str) -> reqwest::RequestBuilder { + pub fn get(&self, path: &str) -> RequestBuilder { let mut url = self.endpoint.clone(); url.path_segments_mut().push(path); self.client.get(url.into_inner()) } /// Execute a [request](reqwest::Request). - pub async fn execute( - &self, - request: reqwest::Request, - ) -> Result { + pub async fn execute(&self, request: Request) -> Result { self.client.execute(request).await } } @@ -43,11 +61,12 @@ impl Endpoint { #[cfg(test)] mod tests { use super::*; + use reqwest::Client; #[test] fn optional_query_params() -> anyhow::Result<()> { let url = "http://example.com".parse()?; - let endpoint = Endpoint::new(url, reqwest::Client::new()); + let endpoint = Endpoint::new(url, Client::new()); // Validate that this pattern makes sense. let req = endpoint @@ -66,7 +85,7 @@ mod tests { #[test] fn uuid_params() -> anyhow::Result<()> { let url = "http://example.com".parse()?; - let endpoint = Endpoint::new(url, reqwest::Client::new()); + let endpoint = Endpoint::new(url, Client::new()); let req = endpoint .get("frobnicate") diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index d4235c2c3848..1757652a9065 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -186,8 +186,8 @@ async fn ws_handler( } pub async fn task_main( - ws_listener: TcpListener, config: &'static ProxyConfig, + ws_listener: TcpListener, ) -> anyhow::Result<()> { scopeguard::defer! { info!("websocket server has shut down"); diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs new file mode 100644 index 000000000000..0c8c2858b900 --- /dev/null +++ b/proxy/src/logging.rs @@ -0,0 +1,47 @@ +use tracing_opentelemetry::OpenTelemetryLayer; +use tracing_subscriber::{ + filter::{EnvFilter, LevelFilter}, + prelude::*, +}; + +/// Initialize logging and OpenTelemetry tracing and exporter. +/// +/// Logging can be configured using `RUST_LOG` environment variable. +/// +/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up +/// configuration from environment variables. For example, to change the +/// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. 
+/// See +pub async fn init() -> anyhow::Result { + let env_filter = EnvFilter::builder() + .with_default_directive(LevelFilter::INFO.into()) + .from_env_lossy(); + + let fmt_layer = tracing_subscriber::fmt::layer() + .with_ansi(atty::is(atty::Stream::Stderr)) + .with_writer(std::io::stderr) + .with_target(false); + + let otlp_layer = tracing_utils::init_tracing("proxy") + .await + .map(OpenTelemetryLayer::new); + + tracing_subscriber::registry() + .with(env_filter) + .with(otlp_layer) + .with(fmt_layer) + .try_init()?; + + Ok(LoggingGuard) +} + +pub struct LoggingGuard; + +impl Drop for LoggingGuard { + fn drop(&mut self) { + // Shutdown trace pipeline gracefully, so that it has a chance to send any + // pending traces before we exit. + tracing::info!("shutting down the tracing machinery"); + tracing_utils::shutdown_tracing(); + } +} diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 8812f77b622f..85478da3bcc1 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -12,6 +12,7 @@ mod config; mod console; mod error; mod http; +mod logging; mod metrics; mod parse; mod proxy; @@ -27,7 +28,7 @@ use config::ProxyConfig; use futures::FutureExt; use std::{borrow::Cow, future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; -use tracing::{info, info_span, Instrument}; +use tracing::{info, warn}; use utils::{project_git_version, sentry_init::init_sentry}; project_git_version!(GIT_VERSION); @@ -41,8 +42,8 @@ async fn flatten_err( #[tokio::main] async fn main() -> anyhow::Result<()> { - // First, initialize logging and troubleshooting subsystems. - init_tracing(); + let _logging_guard = logging::init().await?; + let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); info!("Version: {GIT_VERSION}"); @@ -60,16 +61,17 @@ async fn main() -> anyhow::Result<()> { let mgmt_address: SocketAddr = args.get_one::("mgmt").unwrap().parse()?; info!("Starting mgmt on {mgmt_address}"); - let mgmt_listener = TcpListener::bind(mgmt_address).await?.into_std()?; + let mgmt_listener = TcpListener::bind(mgmt_address).await?; let proxy_address: SocketAddr = args.get_one::("proxy").unwrap().parse()?; info!("Starting proxy on {proxy_address}"); let proxy_listener = TcpListener::bind(proxy_address).await?; let mut tasks = vec![ + tokio::spawn(handle_signals()), tokio::spawn(http::server::task_main(http_listener)), tokio::spawn(proxy::task_main(config, proxy_listener)), - tokio::task::spawn_blocking(move || console::mgmt::thread_main(mgmt_listener)), + tokio::spawn(console::mgmt::task_main(mgmt_listener)), ]; if let Some(wss_address) = args.get_one::("wss") { @@ -78,48 +80,50 @@ async fn main() -> anyhow::Result<()> { let wss_listener = TcpListener::bind(wss_address).await?; tasks.push(tokio::spawn(http::websocket::task_main( - wss_listener, config, + wss_listener, ))); } - // TODO: refactor. - if let Some(metric_collection) = &config.metric_collection { - let hostname = hostname::get()? - .into_string() - .map_err(|e| anyhow::anyhow!("failed to get hostname {e:?}"))?; - - tasks.push(tokio::spawn( - metrics::collect_metrics( - &metric_collection.endpoint, - metric_collection.interval, - hostname, - ) - .instrument(info_span!("collect_metrics")), - )); + if let Some(metrics_config) = &config.metric_collection { + tasks.push(tokio::spawn(metrics::task_main(metrics_config))); } - // This will block until all tasks have completed. - // Furthermore, the first one to fail will cancel the rest. 
+ // This combinator will block until either all tasks complete or + // one of them finishes with an error (others will be cancelled). let tasks = tasks.into_iter().map(flatten_err); let _: Vec<()> = futures::future::try_join_all(tasks).await?; Ok(()) } -/// Tracing is used for logging and telemetry. -fn init_tracing() { - tracing_subscriber::fmt() - .with_env_filter({ - // This filter will examine the `RUST_LOG` env variable. - use tracing_subscriber::filter::{EnvFilter, LevelFilter}; - EnvFilter::builder() - .with_default_directive(LevelFilter::INFO.into()) - .from_env_lossy() - }) - .with_ansi(atty::is(atty::Stream::Stdout)) - .with_target(false) - .init(); +/// Handle unix signals appropriately. +async fn handle_signals() -> anyhow::Result<()> { + use tokio::signal::unix::{signal, SignalKind}; + + let mut hangup = signal(SignalKind::hangup())?; + let mut interrupt = signal(SignalKind::interrupt())?; + let mut terminate = signal(SignalKind::terminate())?; + + loop { + tokio::select! { + // Hangup is commonly used for config reload. + _ = hangup.recv() => { + warn!("received SIGHUP; config reload is not supported"); + } + // Shut down the whole application. + _ = interrupt.recv() => { + warn!("received SIGINT, exiting immediately"); + bail!("interrupted"); + } + // TODO: Don't accept new proxy connections. + // TODO: Shut down once all existing connections have been closed. + _ = terminate.recv() => { + warn!("received SIGTERM, exiting immediately"); + bail!("terminated"); + } + } + } } /// ProxyConfig is created at proxy startup, and lives forever. @@ -161,7 +165,7 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> })); let url = args.get_one::("auth-endpoint").unwrap().parse()?; - let endpoint = http::Endpoint::new(url, reqwest::Client::new()); + let endpoint = http::Endpoint::new(url, http::new_client()); let api = console::provider::neon::Api::new(endpoint, caches); auth::BackendType::Console(Cow::Owned(api), ()) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index d9aa4aec8c0f..3b2834687258 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,12 +1,11 @@ -//! //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. -//! +use crate::{config::MetricCollectionConfig, http}; use chrono::{DateTime, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use serde::Serialize; -use std::{collections::HashMap, time::Duration}; -use tracing::{debug, error, log::info, trace}; +use std::collections::HashMap; +use tracing::{debug, error, info, instrument, trace}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; @@ -19,48 +18,41 @@ const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; /// so while the project-id is unique across regions the whole pipeline will work correctly /// because we enrich the event with project_id in the control-plane endpoint. /// -#[derive(Eq, Hash, PartialEq, Serialize)] +#[derive(Eq, Hash, PartialEq, Serialize, Debug)] pub struct Ids { pub endpoint_id: String, } -pub async fn collect_metrics( - metric_collection_endpoint: &reqwest::Url, - metric_collection_interval: Duration, - hostname: String, -) -> anyhow::Result<()> { +pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<()> { + info!("metrics collector config: {config:?}"); scopeguard::defer!
{ - info!("collect_metrics has shut down"); + info!("metrics collector has shut down"); } - let mut ticker = tokio::time::interval(metric_collection_interval); - - info!( - "starting collect_metrics. metric_collection_endpoint: {}", - metric_collection_endpoint - ); - - // define client here to reuse it for all requests - let client = reqwest::Client::new(); + let http_client = http::new_client(); let mut cached_metrics: HashMap)> = HashMap::new(); + let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); + let mut ticker = tokio::time::interval(config.interval); loop { - tokio::select! { - _ = ticker.tick() => { - - match collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, hostname.clone()).await - { - Err(e) => { - error!("Failed to send consumption metrics: {} ", e); - }, - Ok(_) => { trace!("collect_metrics_iteration completed successfully") }, - } - } + ticker.tick().await; + + let res = collect_metrics_iteration( + &http_client, + &mut cached_metrics, + &config.endpoint, + &hostname, + ) + .await; + + match res { + Err(e) => error!("failed to send consumption metrics: {e} "), + Ok(_) => trace!("periodic metrics collection completed successfully"), } } } -pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { +fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { let mut current_metrics: Vec<(Ids, (u64, DateTime))> = Vec::new(); let metrics = prometheus::default_registry().gather(); @@ -99,11 +91,12 @@ pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { current_metrics } -pub async fn collect_metrics_iteration( - client: &reqwest::Client, +#[instrument(skip_all)] +async fn collect_metrics_iteration( + client: &http::ClientWithMiddleware, cached_metrics: &mut HashMap)>, metric_collection_endpoint: &reqwest::Url, - hostname: String, + hostname: &str, ) -> anyhow::Result<()> { info!( "starting collect_metrics_iteration. metric_collection_endpoint: {}", @@ -134,7 +127,7 @@ pub async fn collect_metrics_iteration( stop_time: *curr_time, }, metric: PROXY_IO_BYTES_PER_CLIENT, - idempotency_key: idempotency_key(hostname.clone()), + idempotency_key: idempotency_key(hostname.to_owned()), value, extra: Ids { endpoint_id: curr_key.endpoint_id.clone(), @@ -190,6 +183,12 @@ pub async fn collect_metrics_iteration( } } else { error!("metrics endpoint refused the sent metrics: {:?}", res); + for metric in chunk.iter() { + // Report if the metric value is suspiciously large + if metric.value > (1u64 << 40) { + error!("potentially abnormal metric value: {:?}", metric); + } + } } } Ok(()) diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index a622a35e6d6e..964204781283 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -17,7 +17,7 @@ use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{error, info, warn}; /// Number of times we should retry the `/proxy_wake_compute` http request. const NUM_RETRIES_WAKE_COMPUTE: usize = 1; @@ -91,13 +91,13 @@ pub async fn task_main( .unwrap_or_else(|e| { // Acknowledge that the task has finished with an error. error!("per-client task finished with an error: {e:#}"); - }) - .instrument(info_span!("client", session = format_args!("{session_id}"))), + }), ); } } // TODO(tech debt): unite this with its twin below. 
+#[tracing::instrument(fields(session_id), skip_all)] pub async fn handle_ws_client( config: &'static ProxyConfig, cancel_map: &CancelMap, @@ -127,7 +127,7 @@ pub async fn handle_ws_client( let result = config .auth_backend .as_ref() - .map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_name, true)) + .map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_name)) .transpose(); async { result }.or_else(|e| stream.throw_error(e)).await? @@ -135,10 +135,11 @@ pub async fn handle_ws_client( let client = Client::new(stream, creds, ¶ms, session_id); cancel_map - .with_session(|session| client.connect_to_db(session)) + .with_session(|session| client.connect_to_db(session, true)) .await } +#[tracing::instrument(fields(session_id), skip_all)] async fn handle_client( config: &'static ProxyConfig, cancel_map: &CancelMap, @@ -165,7 +166,7 @@ async fn handle_client( let result = config .auth_backend .as_ref() - .map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name, false)) + .map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name)) .transpose(); async { result }.or_else(|e| stream.throw_error(e)).await? @@ -173,7 +174,7 @@ async fn handle_client( let client = Client::new(stream, creds, ¶ms, session_id); cancel_map - .with_session(|session| client.connect_to_db(session)) + .with_session(|session| client.connect_to_db(session, false)) .await } @@ -401,7 +402,11 @@ impl<'a, S> Client<'a, S> { impl Client<'_, S> { /// Let the client authenticate and connect to the designated compute node. - async fn connect_to_db(self, session: cancellation::Session<'_>) -> anyhow::Result<()> { + async fn connect_to_db( + self, + session: cancellation::Session<'_>, + allow_cleartext: bool, + ) -> anyhow::Result<()> { let Self { mut stream, mut creds, @@ -416,10 +421,12 @@ impl Client<'_, S> { let auth_result = async { // `&mut stream` doesn't let us merge those 2 lines. - let res = creds.authenticate(&extra, &mut stream).await; + let res = creds + .authenticate(&extra, &mut stream, allow_cleartext) + .await; + async { res }.or_else(|e| stream.throw_error(e)).await } - .instrument(info_span!("auth")) .await?; let AuthSuccess { diff --git a/pyproject.toml b/pyproject.toml index d3d3948b9ad4..415f7f1ae703 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,12 +19,12 @@ types-requests = "^2.28.5" types-psycopg2 = "^2.9.18" boto3 = "^1.26.16" boto3-stubs = {extras = ["s3"], version = "^1.26.16"} -moto = {version = "^3.0.0", extras = ["server"]} +moto = {extras = ["server"], version = "^4.1.2"} backoff = "^1.11.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" -Werkzeug = "2.1.2" +Werkzeug = "^2.2.3" pytest-order = "^1.0.1" allure-pytest = "^2.10.0" pytest-asyncio = "^0.19.0" diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 1a068412c8cf..683050e9cd50 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -126,7 +126,12 @@ fn main() -> anyhow::Result<()> { return Ok(()); } + // important to keep the order of: + // 1. init logging + // 2. tracing panic hook + // 3. 
sentry logging::init(LogFormat::from_config(&args.log_format)?)?; + logging::replace_panic_hook_with_tracing_panic_hook().forget(); info!("version: {GIT_VERSION}"); let args_workdir = &args.datadir; diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index c73206b7dcb1..1a0d261184a4 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -424,12 +424,16 @@ async fn http1_handler( #[tokio::main] async fn main() -> Result<(), Box> { - // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); - let args = Args::parse(); + // important to keep the order of: + // 1. init logging + // 2. tracing panic hook + // 3. sentry logging::init(LogFormat::from_config(&args.log_format)?)?; + logging::replace_panic_hook_with_tracing_panic_hook().forget(); + // initialize sentry if SENTRY_DSN is provided + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); info!("version: {GIT_VERSION}"); ::metrics::set_build_info_metric(GIT_VERSION); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index b35252243edd..63196609ccb7 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1217,7 +1217,7 @@ def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[st """ Returns the tenant size, together with the model inputs as the second tuple item. """ - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/size") + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size") self.verbose_error(res) res = res.json() assert isinstance(res, dict) @@ -1228,6 +1228,16 @@ def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[st assert type(inputs) is dict return (size, inputs) + def tenant_size_debug(self, tenant_id: TenantId) -> str: + """ + Returns the tenant size debug info, as an HTML string + """ + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size", + headers={"Accept": "text/html"}, + ) + return res.text + def timeline_list( self, tenant_id: TenantId, @@ -1278,6 +1288,7 @@ def timeline_detail( timeline_id: TimelineId, include_non_incremental_logical_size: bool = False, include_timeline_dir_layer_file_size_sum: bool = False, + **kwargs, ) -> Dict[Any, Any]: params = {} if include_non_incremental_logical_size: @@ -1288,6 +1299,7 @@ def timeline_detail( res = self.get( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", params=params, + **kwargs, ) self.verbose_error(res) res_json = res.json() @@ -2067,6 +2079,7 @@ def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional ".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs ".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock() ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress + ".*task iteration took longer than the configured period.*", ] def start( @@ -2517,15 +2530,17 @@ def __exit__( tb: Optional[TracebackType], ): if self._popen is not None: - # NOTE the process will die when we're done with tests anyway, because - # it's a child process. This is mostly to clean up in between different tests. 
- self._popen.kill() + self._popen.terminate() + try: + self._popen.wait(timeout=5) + except subprocess.TimeoutExpired: + log.warn("failed to gracefully terminate proxy; killing") + self._popen.kill() @staticmethod async def activate_link_auth( local_vanilla_pg, proxy_with_metric_collector, psql_session_id, create_user=True ): - pg_user = "proxy" if create_user: diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 3551f27cad56..5ee94de32dcb 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -1,19 +1,25 @@ # It's possible to run any regular test with the local fs remote storage via # env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ...... +import time +from collections import defaultdict from pathlib import Path +from typing import Any, DefaultDict, Dict import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, + PageserverApiException, RemoteStorageKind, assert_tenant_status, available_remote_storages, + wait_for_last_flush_lsn, wait_for_last_record_lsn, wait_for_sk_commit_lsn_to_reach_remote_storage, wait_for_upload, wait_until, + wait_until_tenant_state, ) from fixtures.types import Lsn from fixtures.utils import query_scalar @@ -212,6 +218,9 @@ def get_resident_physical_size(): log.info(filled_size) assert filled_current_physical == filled_size, "we don't yet do layer eviction" + # Wait until generated image layers are uploaded to S3 + time.sleep(3) + env.pageserver.stop() # remove all the layer files @@ -449,3 +458,233 @@ def get_resident_physical_size(): pg_old = env.postgres.create_start(branch_name="main") with pg_old.cursor() as cur: assert query_scalar(cur, "select count(*) from testtab") == table_len + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3]) +def test_compaction_downloads_on_demand_without_image_creation( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + """ + Create a few layers, then evict, then make sure compaction runs successfully. + """ + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_compaction_downloads_on_demand_without_image_creation", + ) + + env = neon_env_builder.init_start() + + conf = { + # Disable background GC & compaction + "gc_period": "0s", + "compaction_period": "0s", + # unused, because manual will be called after each table + "checkpoint_distance": 100 * 1024**2, + # this will be updated later on to allow manual compaction outside of checkpoints + "compaction_threshold": 100, + # repartitioning parameter, not required here + "image_creation_threshold": 100, + # repartitioning parameter, not required here + "compaction_target_size": 128 * 1024**2, + # pitr_interval and gc_horizon are not interesting because we don't run gc + } + + # Override defaults, to create more layers + tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf)) + env.initial_tenant = tenant_id + pageserver_http = env.pageserver.http_client() + + with env.postgres.create_start("main") as pg: + # no particular reason to create the layers like this, but we are sure + not to hit the image_creation_threshold here.
+ with pg.cursor() as cur: + cur.execute("create table a as select id::bigint from generate_series(1, 204800) s(id)") + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + with pg.cursor() as cur: + cur.execute("update a set id = -id") + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + assert not layers.in_memory_layers, "no inmemory layers expected after post-commit checkpoint" + assert len(layers.historic_layers) == 1 + 2, "should have initdb layer and 2 deltas" + + for layer in layers.historic_layers: + log.info(f"pre-compact: {layer}") + pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name) + + env.neon_cli.config_tenant(tenant_id, {"compaction_threshold": "3"}) + + pageserver_http.timeline_compact(tenant_id, timeline_id) + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + for layer in layers.historic_layers: + log.info(f"post compact: {layer}") + assert len(layers.historic_layers) == 1, "should have compacted to single layer" + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3]) +def test_compaction_downloads_on_demand_with_image_creation( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + """ + Create layers, compact with high image_creation_threshold, then run final compaction with all layers evicted. + + Due to current implementation, this will make image creation on-demand download layers, but we cannot really + directly test for it. + """ + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_compaction_downloads_on_demand", + ) + + env = neon_env_builder.init_start() + + conf = { + # Disable background GC & compaction + "gc_period": "0s", + "compaction_period": "0s", + # repartitioning threshold is this / 10, but it doesn't really seem to matter + "checkpoint_distance": 50 * 1024**2, + "compaction_threshold": 3, + # important: keep this high for the data ingestion + "image_creation_threshold": 100, + # repartitioning parameter, unused + "compaction_target_size": 128 * 1024**2, + # pitr_interval and gc_horizon are not interesting because we don't run gc + } + + # Override defaults, to create more layers + tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf)) + env.initial_tenant = tenant_id + pageserver_http = env.pageserver.http_client() + + pg = env.postgres.create_start("main") + + # no particular reason to create the layers like this, but we are sure + not to hit the image_creation_threshold here.
+ with pg.cursor() as cur: + cur.execute("create table a (id bigserial primary key, some_value bigint not null)") + cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)") + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + for i in range(0, 2): + for j in range(0, 3): + # create a minimal amount of "delta difficulty" for this table + with pg.cursor() as cur: + cur.execute("update a set some_value = -some_value + %s", (j,)) + + with pg.cursor() as cur: + # vacuuming should aid to reuse keys, though it's not really important + # with image_creation_threshold=1 which we will use on the last compaction + cur.execute("vacuum") + + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + + if i == 1 and j == 2: + # last iteration; stop before checkpoint to avoid leaving an inmemory layer + pg.stop_and_destroy() + + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + + # images should not yet be created, because threshold is too high, + # but these will be reshuffled to L1 layers + pageserver_http.timeline_compact(tenant_id, timeline_id) + + for _ in range(0, 20): + # loop in case flushing is still in progress + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + if not layers.in_memory_layers: + break + time.sleep(0.2) + + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + assert not layers.in_memory_layers, "no inmemory layers expected after post-commit checkpoint" + + kinds_before: DefaultDict[str, int] = defaultdict(int) + + for layer in layers.historic_layers: + kinds_before[layer.kind] += 1 + pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name) + + assert dict(kinds_before) == {"Delta": 4} + + # now having evicted all layers, reconfigure to have lower image creation + # threshold to expose image creation to downloading all of the needed + # layers -- threshold of 2 would sound more reasonable, but keeping it as 1 + # to be less flaky + env.neon_cli.config_tenant(tenant_id, {"image_creation_threshold": "1"}) + + pageserver_http.timeline_compact(tenant_id, timeline_id) + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + kinds_after: DefaultDict[str, int] = defaultdict(int) + for layer in layers.historic_layers: + kinds_after[layer.kind] += 1 + + assert dict(kinds_after) == {"Delta": 4, "Image": 1} + + +def stringify(conf: Dict[str, Any]) -> Dict[str, str]: + return dict(map(lambda x: (x[0], str(x[1])), conf.items())) + + +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS]) +def test_ondemand_download_failure_to_replace( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + """ + Make sure that we fail on being unable to replace a RemoteLayer instead of for example livelocking. 
+ + See: https://github.com/neondatabase/neon/issues/3533 + """ + + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_ondemand_download_failure_to_replace", + ) + + # disable gc and compaction via default tenant config because config is lost while detaching + # so that compaction will not be the one to download the layer but the http handler is + neon_env_builder.pageserver_config_override = ( + """tenant_config={gc_period = "0s", compaction_period = "0s"}""" + ) + + env = neon_env_builder.init_start() + + tenant_id, timeline_id = env.neon_cli.create_tenant() + + env.initial_tenant = tenant_id + pageserver_http = env.pageserver.http_client() + + lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"]) + + wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn) + + # remove layers so that they will be redownloaded + pageserver_http.tenant_detach(tenant_id) + pageserver_http.tenant_attach(tenant_id) + + wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5) + pageserver_http.configure_failpoints(("layermap-replace-notfound", "return")) + + # requesting details with non-incremental size should trigger a download of the only layer + # this will need to be adjusted if an index for logical sizes is ever implemented + with pytest.raises(PageserverApiException): + # error message is not useful + pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2) + + actual_message = ( + ".* ERROR .*replacing downloaded layer into layermap failed because layer was not found" + ) + assert env.pageserver.log_contains(actual_message) is not None + env.pageserver.allowed_errors.append(actual_message) + + env.pageserver.allowed_errors.append( + ".* ERROR .*Error processing HTTP request: InternalServerError\\(get local timeline info" + ) + # this might get to run and attempt on-demand, but not always + env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'") + + # if the above returned, then we didn't have a livelock, and all is well diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index bb3bca878263..8c2996f4917b 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,13 +1,13 @@ -from typing import Any, List, Tuple +from pathlib import Path +from typing import List, Tuple import pytest from fixtures.log_helper import log -from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.types import Lsn -def test_empty_tenant_size(neon_simple_env: NeonEnv): +def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): env = neon_simple_env (tenant_id, _) = env.neon_cli.create_tenant() http_client = env.pageserver.http_client() @@ -18,6 +18,9 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv): main_branch_name = "main" + branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0] + assert branch_name == main_branch_name + with env.postgres.create_start( main_branch_name, tenant_id=tenant_id, @@ -39,12 +42,44 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv): size, inputs = http_client.tenant_size_and_modelinputs(tenant_id) assert size == initial_size, "tenant_size should not be affected by shutdown of compute" - expected_commands: List[Any] = [{"branch_from": None}, "end_of_branch"] - actual_commands: List[Any] = list(map(lambda x: x["command"], inputs["updates"])) # 
type: ignore - assert actual_commands == expected_commands + expected_inputs = { + "segments": [ + { + "segment": {"parent": None, "lsn": 23694408, "size": 25362432, "needed": True}, + "timeline_id": f"{main_timeline_id}", + "kind": "BranchStart", + }, + { + "segment": {"parent": 0, "lsn": 23694528, "size": None, "needed": True}, + "timeline_id": f"{main_timeline_id}", + "kind": "BranchEnd", + }, + ], + "timeline_inputs": [ + { + "timeline_id": f"{main_timeline_id}", + "ancestor_id": None, + "ancestor_lsn": "0/0", + "last_record": "0/1698CC0", + "latest_gc_cutoff": "0/1698C48", + "horizon_cutoff": "0/0", + "pitr_cutoff": "0/0", + "next_gc_cutoff": "0/0", + "retention_param_cutoff": None, + } + ], + } + expected_inputs = mask_model_inputs(expected_inputs) + actual_inputs = mask_model_inputs(inputs) + + assert expected_inputs == actual_inputs + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) -def test_branched_empty_timeline_size(neon_simple_env: NeonEnv): + +def test_branched_empty_timeline_size(neon_simple_env: NeonEnv, test_output_dir: Path): """ Issue found in production. Because the ancestor branch was under gc_horizon, the branchpoint was "dangling" and the computation could not be @@ -75,8 +110,12 @@ def test_branched_empty_timeline_size(neon_simple_env: NeonEnv): assert size_after_branching > initial_size + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) + -def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv): +def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv, test_output_dir: Path): """ More general version of test_branched_empty_timeline_size @@ -128,9 +167,13 @@ def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv): size_after_writes = http_client.tenant_size(tenant_id) assert size_after_writes > initial_size + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) + @pytest.mark.skip("This should work, but is left out because assumed covered by other tests") -def test_branch_point_within_horizon(neon_simple_env: NeonEnv): +def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = 15 @@ -167,9 +210,13 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv): assert size_before_branching < size_after + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) + @pytest.mark.skip("This should work, but is left out because assumed covered by other tests") -def test_parent_within_horizon(neon_simple_env: NeonEnv): +def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = 5 @@ -179,7 +226,7 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv): """ env = neon_simple_env - gc_horizon = 200_000 + gc_horizon = 5_000 (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)}) http_client = env.pageserver.http_client() @@ -212,9 +259,13 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv): assert size_before_branching < size_after + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + 
size_debug_file.write(size_debug) + @pytest.mark.skip("This should work, but is left out because assumed covered by other tests") -def test_only_heads_within_horizon(neon_simple_env: NeonEnv): +def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = small @@ -253,8 +304,14 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv): latest_size = size_now + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) -def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): + +def test_single_branch_get_tenant_size_grows( + neon_env_builder: NeonEnvBuilder, test_output_dir: Path +): """ Operate on single branch reading the tenants size after each transaction. """ @@ -279,7 +336,20 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): collected_responses: List[Tuple[Lsn, int]] = [] + size_debug_file = open(test_output_dir / "size_debug.html", "w") + + def check_size_change(current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev: int): + if current_lsn - initdb_lsn > gc_horizon: + assert ( + size >= prev + ), "tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size" + else: + assert ( + size > prev + ), "tenant_size should grow, because we continue to add WAL to initial snapshot size" + with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg: + initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) with pg.cursor() as cur: cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL)") @@ -297,13 +367,19 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) - size = http_client.tenant_size(tenant_id) + size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) + + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) if len(collected_responses) > 0: prev = collected_responses[-1][1] if size == 0: assert prev == 0 else: + # branch start shouldn't be past gc_horizon yet + # thus the size should grow as we insert more data + assert current_lsn - initdb_lsn <= gc_horizon assert size > prev collected_responses.append((current_lsn, size)) @@ -323,9 +399,15 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) - size = http_client.tenant_size(tenant_id) + size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) + + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) + prev = collected_responses[-1][1] - assert size > prev, "tenant_size should grow with updates" + + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) + collected_responses.append((current_lsn, size)) while True: @@ -340,9 +422,9 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): size = http_client.tenant_size(tenant_id) prev = collected_responses[-1][1] - assert ( - size > prev - ), "even though rows have been deleted, the tenant_size should increase" + + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) + collected_responses.append((current_lsn, size)) with pg.cursor() as cur: @@ -352,7 +434,9 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): size = http_client.tenant_size(tenant_id) 
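# Aside (illustrative sketch, not part of the patch): check_size_change() above only requires
# strictly growing sizes while the branch head is within gc_horizon of initdb_lsn; past that
# point, per its own assertion message, only gc_horizon worth of WAL is added on top of the
# initial snapshot size, so consecutive samples may be equal. A small numeric illustration
# with made-up values:
initdb_lsn_sketch = 0x0169_8CC0                        # hypothetical initdb LSN
gc_horizon_sketch = 64 * 1024**2                       # hypothetical 64 MiB horizon
current_lsn_sketch = initdb_lsn_sketch + 16 * 1024**2  # head 16 MiB past initdb
# still inside the horizon: the strict "size > prev" branch applies
assert current_lsn_sketch - initdb_lsn_sketch <= gc_horizon_sketch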
prev = collected_responses[-1][1] - assert size > prev, "dropping table grows tenant_size" + + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) + collected_responses.append((current_lsn, size)) # this isn't too many lines to forget for a while. observed while @@ -364,24 +448,17 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() env.pageserver.start() + size_debug_file.close() + size_after = http_client.tenant_size(tenant_id) prev = collected_responses[-1][1] assert size_after == prev, "size after restarting pageserver should not have changed" - ps_metrics = parse_metrics(http_client.get_metrics(), "pageserver") - tenant_metric_filter = { - "tenant_id": str(tenant_id), - } - tenant_size_metric = int( - ps_metrics.query_one("pageserver_tenant_synthetic_size", filter=tenant_metric_filter).value - ) - - assert tenant_size_metric == size_after, "API size value should be equal to metric size value" - - -def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder): +def test_get_tenant_size_with_multiple_branches( + neon_env_builder: NeonEnvBuilder, test_output_dir: Path +): """ Reported size goes up while branches or rows are being added, goes down after removing branches. """ @@ -481,6 +558,10 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder size_after = http_client.tenant_size(tenant_id) assert size_after == size_after_thinning_branch + size_debug_file_before = open(test_output_dir / "size_debug_before.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file_before.write(size_debug) + # teardown, delete branches, and the size should be going down http_client.timeline_delete(tenant_id, first_branch_timeline_id) @@ -493,3 +574,38 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder assert size_after_deleting_second < size_after_continuing_on_main assert size_after_deleting_second > size_after_first_branch + + size_debug_file = open(test_output_dir / "size_debug.html", "w") + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) + + +# Helper for tests that compare timeline_inputs +# We don't want to compare the exact values, because they can be unstable +# and cause flaky tests. So replace the values with useful invariants. 
+def mask_model_inputs(x): + if isinstance(x, dict): + newx = {} + for k, v in x.items(): + if k == "size": + if v is None or v == 0: + # no change + newx[k] = v + elif v < 0: + newx[k] = "<0" + else: + newx[k] = ">0" + elif k.endswith("lsn") or k.endswith("cutoff") or k == "last_record": + if v is None or v == 0 or v == "0/0": + # no change + newx[k] = v + else: + newx[k] = "masked" + else: + newx[k] = mask_model_inputs(v) + return newx + elif isinstance(x, list): + newlist = [mask_model_inputs(v) for v in x] + return newlist + else: + return x diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 6da6a4d4463c..769bc10280b8 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -280,6 +280,7 @@ def test_tenant_upgrades_index_json_from_v0( timeline_file.seek(0) json.dump(v0_index_part, timeline_file) + timeline_file.truncate(timeline_file.tell()) env.pageserver.start() pageserver_http = env.pageserver.http_client() diff --git a/vm-cgconfig.conf b/vm-cgconfig.conf new file mode 100644 index 000000000000..a2e201708e25 --- /dev/null +++ b/vm-cgconfig.conf @@ -0,0 +1,12 @@ +# Configuration for cgroups in VM compute nodes +group neon-postgres { + perm { + admin { + uid = vm-informant; + } + task { + gid = users; + } + } + memory {} +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 30a6d3a92b1c..c0cf3c56116b 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -38,14 +38,14 @@ prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-syntax = { version = "0.6" } -reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } +reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "multipart", "rustls-tls"] } ring = { version = "0.16", features = ["std"] } rustls = { version = "0.20", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } socket2 = { version = "0.4", default-features = false, features = ["all"] } -tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "sync", "time"] } +tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "sync", "time"] } tokio-util = { version = "0.7", features = ["codec", "io"] } tonic = { version = "0.8", features = ["tls-roots"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] }
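# Aside (illustrative sketch, not part of the patch): the one-line change in
# test_tenants_with_remote_storage.py above adds truncate() after rewriting index_part in
# place. Without it, writing a shorter JSON document over a longer one leaves stale trailing
# bytes that break later parsing. A self-contained demonstration of the same pattern:
import json
import tempfile

with tempfile.TemporaryFile(mode="w+") as timeline_file:
    json.dump({"version": 1, "padding": "x" * 64}, timeline_file)  # older, longer content
    timeline_file.seek(0)
    json.dump({"version": 0}, timeline_file)                       # new, shorter content
    timeline_file.truncate(timeline_file.tell())                   # drop leftover old bytes
    timeline_file.seek(0)
    assert json.load(timeline_file) == {"version": 0}              # parses cleanly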