# values.yaml (forked from NVIDIA/k8s-device-plugin)

# Plugin configuration
# Only one of "name" or "map" should ever be set for a given deployment.
# Use "name" to point to an external ConfigMap with a list of configurations, or to make the Chart
# create a ConfigMap for you if "create" is True.
# Use "map" to build an integrated ConfigMap from a set of configurations as
# part of this helm chart. An example of setting "map" might be:
# config:
#   map:
#     default: |-
#       version: v1
#       flags:
#         migStrategy: none
#     mig-single: |-
#       version: v1
#       flags:
#         migStrategy: single
#     mig-mixed: |-
#       version: v1
#       flags:
#         migStrategy: mixed
config:
  # ConfigMap name if pulling from an external ConfigMap
  name: "nos-device-plugin-configs"
  # If true, the ConfigMap containing the plugin configuration files is created by the Chart,
  # initialized with an empty default configuration.
  # Otherwise, the Chart expects a ConfigMap named .Values.config.name to already exist.
  create: true
  # Set of named configs to build an integrated ConfigMap from
  map: {}
  # Default config name within the ConfigMap
  default: ""
  # List of fallback strategies to attempt if no config is selected and no default is provided
  fallbackStrategies: ["named", "single"]
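
# Illustrative only: a minimal sketch of the external ConfigMap that
# .Values.config.name could point at when "create" is false. The namespace is
# an assumption, and the config payload simply reuses the schema shown in the
# example above; adjust both to your installation.
#
# apiVersion: v1
# kind: ConfigMap
# metadata:
#   name: nos-device-plugin-configs
#   namespace: nebuly-nos   # assumed namespace
# data:
#   default: |-
#     version: v1
#     flags:
#       migStrategy: none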
legacyDaemonsetAPI: null
compatWithCPUManager: null
migStrategy: null
failOnInitError: null
deviceListStrategy: null
deviceIDStrategy: null
nvidiaDriverRoot: null
gdsEnabled: null
mofedEnabled: null
fullnameOverride: ""
namespaceOverride: ""
selectorLabelsOverride: {}
allowDefaultNamespace: false
imagePullSecrets: []
image:
  repository: ghcr.io/nebuly-ai/k8s-device-plugin
  pullPolicy: IfNotPresent
  # Overrides the image tag whose default is the chart appVersion.
  tag: ""
mps:
  enabled: true
  # The ID of the user used to run the MPS server.
  # All the containers requesting GPU resources must run as this user.
  userID: 1000
  image:
    repository: ghcr.io/nebuly-ai/nvidia-mps-server
    pullPolicy: IfNotPresent
    tag: "0.0.1"
updateStrategy:
  type: RollingUpdate
podAnnotations: {}
podSecurityContext: {}
securityContext: {}
resources: {}
nodeSelector:
  nos.nebuly.com/gpu-partitioning: "mps"
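
# Illustrative only: the device plugin pods are scheduled only on nodes that
# carry the label selected above. On a nos-managed cluster the label is set on
# the node object; the node name below is a placeholder:
#
# apiVersion: v1
# kind: Node
# metadata:
#   name: gpu-node-1
#   labels:
#     nos.nebuly.com/gpu-partitioning: "mps"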
affinity: {}
tolerations:
  # This toleration is deprecated. Kept here for backward compatibility
  # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
  - key: CriticalAddonsOnly
    operator: Exists
  - key: nvidia.com/gpu
    operator: Exists
    effect: NoSchedule
  - key: "kubernetes.azure.com/scalesetpriority"
    operator: "Equal"
    value: "spot"
    effect: "NoSchedule"
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
runtimeClassName: null
# Subcharts
nfd:
  nameOverride: node-feature-discovery
  enableNodeFeatureApi: false
  master:
    serviceAccount:
      name: node-feature-discovery
      create: true
    config:
      extraLabelNs: ["nvidia.com"]
  worker:
    tolerations:
      - key: "node-role.kubernetes.io/master"
        operator: "Equal"
        value: ""
        effect: "NoSchedule"
      - key: "nvidia.com/gpu"
        operator: "Equal"
        value: "present"
        effect: "NoSchedule"
    config:
      sources:
        pci:
          # PCI device classes to consider: 02 = network controller,
          # 0200 = Ethernet, 0207 = InfiniBand, 0300 = VGA-compatible
          # display controller, 0302 = 3D controller (most datacenter GPUs).
          deviceClassWhitelist:
            - "02"
            - "0200"
            - "0207"
            - "0300"
            - "0302"
          deviceLabelFields:
            - vendor
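
# Illustrative only: with deviceLabelFields restricted to "vendor", NFD
# publishes per-vendor PCI labels on matching nodes, e.g. for NVIDIA devices
# (vendor ID 10de):
#
#   feature.node.kubernetes.io/pci-10de.present: "true"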
gfd:
  enabled: false
  nameOverride: gpu-feature-discovery
  namespaceOverride: ""