From ca7386aaefe6c0c0376298c78e12fc75770fdb2d Mon Sep 17 00:00:00 2001 From: Chad Metcalf Date: Thu, 3 Mar 2022 10:49:20 -0800 Subject: [PATCH 1/2] Adding a gpu node group. --- eks-cluster.yaml | 71 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/eks-cluster.yaml b/eks-cluster.yaml index b6ba331..8e52e8a 100644 --- a/eks-cluster.yaml +++ b/eks-cluster.yaml @@ -122,6 +122,77 @@ managedNodeGroups: # or use a custom list instanceTypes: ["m6i.xlarge", "m6i.2xlarge"] + - name: gpu + desiredCapacity: 1 + minSize: 1 + maxSize: 10 + # because of AWS addons + disableIMDSv1: false + # Please configure the size of the volume and additional features + # https://eksctl.io/usage/schema/#nodeGroups-volumeType + # https://aws.amazon.com/es/ebs/pricing/ + volumeSize: 300 + volumeType: gp3 + volumeIOPS: 6000 + volumeThroughput: 500 + ebsOptimized: true + # Use private subnets for nodes + # https://eksctl.io/usage/vpc-networking/#use-private-subnets-for-initial-nodegroup + privateNetworking: true + amiFamily: AmazonLinux2 + + tags: + # EC2 tags required for cluster-autoscaler auto-discovery + k8s.io/cluster-autoscaler/enabled: "true" + k8s.io/cluster-autoscaler/gitpod: "owned" + + # GPU autoscale tags + k8s.io/cluster-autoscaler/node-template/taint/dedicated: nvidia.com/gpu=true + k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu: 'true' + + labels: + nvidia.com/gpu: 'true' + k8s.amazonaws.com/accelerator: nvidia-tesla + + iam: + attachPolicyARNs: &attachPolicyARNs + - arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly + - arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy + - arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy + - arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess + - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore + withAddonPolicies: &withAddonPolicies + albIngress: true + autoScaler: true + cloudWatch: true + certManager: true + ebs: true + # Using custom AMI images require the definition of overrideBootstrapCommand + # to ensure that nodes are able to join the cluster https://eksctl.io/usage/custom-ami-support/ + #overrideBootstrapCommand: | + # #!/bin/bash +# + # export CLUSTERNAME=gitpod + # export NODEGROUP=gpu +# + # declare -a LABELS=( + # eks.amazonaws.com/nodegroup="${NODEGROUP}" + # gitpod.io/workload_workspace_services=true + # gitpod.io/workload_workspace_regular=true + # gitpod.io/workload_workspace_headless=true + # ) +# + # export KUBELET_EXTRA_ARGS="$(printf -- "--max-pods=110 --node-labels=%s" $(IFS=$','; echo "${LABELS[*]}"))" + # /etc/eks/bootstrap.sh ${CLUSTERNAME} + + spot: false + + # gpu: 1 + # vCPUs: 4 + # memory: 61Gib + instanceTypes: ["p2.xlarge"] + + - name: services desiredCapacity: 1 minSize: 1 From 15ca22d809b8f0b42339104f6dc4ed9de05c0370 Mon Sep 17 00:00:00 2001 From: "Cornelius A. Ludmann" Date: Wed, 13 Apr 2022 08:42:07 +0000 Subject: [PATCH 2/2] [wip] GPU support --- .env.example | 9 +-- Makefile | 2 + eks-cluster.yaml | 148 +++++++++++++++++++++------------------ nvidia-device-plugin.yml | 38 ++++++++++ setup.sh | 16 +++-- 5 files changed, 137 insertions(+), 76 deletions(-) create mode 100644 nvidia-device-plugin.yml diff --git a/.env.example b/.env.example index 56caa6b..25aaddd 100644 --- a/.env.example +++ b/.env.example @@ -1,19 +1,20 @@ # Base domain -DOMAIN=mygitpod.example.com +DOMAIN=clu-aws.gitpod-self-hosted.com # AWS Certificate Manager certificate # Setting this value implies TLS termination in the load balancer -CERTIFICATE_ARN=arn:aws:acm:::certificate/ +# CERTIFICATE_ARN=arn:aws:acm:eu-central-1:691173103445:certificate/674853cf-ae61-41d9-8d64-6e28adfe1b28 +CERTIFICATE_ARN=arn:aws:acm:us-west-2:691173103445:certificate/77786715-c224-4d7f-9bae-4d7c8e302807 # The AWS credentials profile name (optional) # Leave empty or remove if you only set up the default one -AWS_PROFILE=ekspod +AWS_PROFILE= # The Route53 Zone ID (optional) # If the DNS domain is managed by and you want to enable external-dns, please set the route53 zone ID # This enables the update of the DNS records required to get gitpod running using the Ingress rule # definition as the source of truth. -ROUTE53_ZONEID=XXXXXXXXX +ROUTE53_ZONEID=Z09446753JOFN4T9C9LWJ # The name of the S3 bucket where the container images that gitpod creates are stored # If there is no value we create a new bucket with the name "container-registry--" diff --git a/Makefile b/Makefile index 4591012..e45c67c 100644 --- a/Makefile +++ b/Makefile @@ -31,6 +31,8 @@ DOCKER_RUN_CMD = docker run -it \ --volume ${PWD}/gitpod-config.yaml:/gitpod/gitpod-config.yaml \ --volume ${PWD}/cdk-outputs.json:/gitpod/cdk-outputs.json \ --volume ${HOME}/.aws:/root/.aws \ + --volume ${PWD}/setup.sh:/gitpod/setup.sh \ + --volume ${PWD}/nvidia-device-plugin.yml:/gitpod/nvidia-device-plugin.yml \ ${IMG} $(1) install: ## Install Gitpod diff --git a/eks-cluster.yaml b/eks-cluster.yaml index 8e52e8a..0ffb1f1 100644 --- a/eks-cluster.yaml +++ b/eks-cluster.yaml @@ -7,7 +7,7 @@ metadata: # and k8s.io/cluster-autoscaler/: "owned" # cluster-autoscaler will not be require additional labels in a future release. # https://github.com/kubernetes/autoscaler/pull/3968 - name: gitpod + name: gitpod-clu # Template, please change region: us-west-2 version: "1.21" @@ -77,12 +77,22 @@ managedNodeGroups: # Use private subnets for nodes # https://eksctl.io/usage/vpc-networking/#use-private-subnets-for-initial-nodegroup privateNetworking: true - ami: ami-009935ddbb32a7f3c + # ami: ami-009935ddbb32a7f3c + ami: ami-06045aa686f46dd58 tags: # EC2 tags required for cluster-autoscaler auto-discovery k8s.io/cluster-autoscaler/enabled: "true" k8s.io/cluster-autoscaler/gitpod: "owned" + + # GPU autoscale tags + k8s.io/cluster-autoscaler/node-template/taint/dedicated: nvidia.com/gpu=true + k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu: 'true' + + labels: + nvidia.com/gpu: 'true' + k8s.amazonaws.com/accelerator: nvidia-tesla + iam: attachPolicyARNs: &attachPolicyARNs - arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly @@ -101,7 +111,7 @@ managedNodeGroups: overrideBootstrapCommand: | #!/bin/bash - export CLUSTERNAME=gitpod + export CLUSTERNAME=gitpod-clu export NODEGROUP=workspaces declare -a LABELS=( @@ -109,6 +119,8 @@ managedNodeGroups: gitpod.io/workload_workspace_services=true gitpod.io/workload_workspace_regular=true gitpod.io/workload_workspace_headless=true + nvidia.com/gpu=true + k8s.amazonaws.com/accelerator=nvidia-tesla ) export KUBELET_EXTRA_ARGS="$(printf -- "--max-pods=110 --node-labels=%s" $(IFS=$','; echo "${LABELS[*]}"))" @@ -120,77 +132,77 @@ managedNodeGroups: # vCPUs: 8 # memory: 64Gib # or use a custom list - instanceTypes: ["m6i.xlarge", "m6i.2xlarge"] + instanceTypes: ["p2.xlarge"] - - name: gpu - desiredCapacity: 1 - minSize: 1 - maxSize: 10 - # because of AWS addons - disableIMDSv1: false - # Please configure the size of the volume and additional features - # https://eksctl.io/usage/schema/#nodeGroups-volumeType - # https://aws.amazon.com/es/ebs/pricing/ - volumeSize: 300 - volumeType: gp3 - volumeIOPS: 6000 - volumeThroughput: 500 - ebsOptimized: true - # Use private subnets for nodes - # https://eksctl.io/usage/vpc-networking/#use-private-subnets-for-initial-nodegroup - privateNetworking: true - amiFamily: AmazonLinux2 +# - name: gpu +# desiredCapacity: 1 +# minSize: 1 +# maxSize: 10 +# # because of AWS addons +# disableIMDSv1: false +# # Please configure the size of the volume and additional features +# # https://eksctl.io/usage/schema/#nodeGroups-volumeType +# # https://aws.amazon.com/es/ebs/pricing/ +# volumeSize: 300 +# volumeType: gp3 +# volumeIOPS: 6000 +# volumeThroughput: 500 +# ebsOptimized: true +# # Use private subnets for nodes +# # https://eksctl.io/usage/vpc-networking/#use-private-subnets-for-initial-nodegroup +# privateNetworking: true +# amiFamily: AmazonLinux2 - tags: - # EC2 tags required for cluster-autoscaler auto-discovery - k8s.io/cluster-autoscaler/enabled: "true" - k8s.io/cluster-autoscaler/gitpod: "owned" +# tags: +# # EC2 tags required for cluster-autoscaler auto-discovery +# k8s.io/cluster-autoscaler/enabled: "true" +# k8s.io/cluster-autoscaler/gitpod: "owned" - # GPU autoscale tags - k8s.io/cluster-autoscaler/node-template/taint/dedicated: nvidia.com/gpu=true - k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu: 'true' +# # GPU autoscale tags +# k8s.io/cluster-autoscaler/node-template/taint/dedicated: nvidia.com/gpu=true +# k8s.io/cluster-autoscaler/node-template/label/nvidia.com/gpu: 'true' - labels: - nvidia.com/gpu: 'true' - k8s.amazonaws.com/accelerator: nvidia-tesla +# labels: +# nvidia.com/gpu: 'true' +# k8s.amazonaws.com/accelerator: nvidia-tesla - iam: - attachPolicyARNs: &attachPolicyARNs - - arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly - - arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy - - arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy - - arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess - - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore - withAddonPolicies: &withAddonPolicies - albIngress: true - autoScaler: true - cloudWatch: true - certManager: true - ebs: true - # Using custom AMI images require the definition of overrideBootstrapCommand - # to ensure that nodes are able to join the cluster https://eksctl.io/usage/custom-ami-support/ - #overrideBootstrapCommand: | - # #!/bin/bash -# - # export CLUSTERNAME=gitpod - # export NODEGROUP=gpu -# - # declare -a LABELS=( - # eks.amazonaws.com/nodegroup="${NODEGROUP}" - # gitpod.io/workload_workspace_services=true - # gitpod.io/workload_workspace_regular=true - # gitpod.io/workload_workspace_headless=true - # ) -# - # export KUBELET_EXTRA_ARGS="$(printf -- "--max-pods=110 --node-labels=%s" $(IFS=$','; echo "${LABELS[*]}"))" - # /etc/eks/bootstrap.sh ${CLUSTERNAME} +# iam: +# attachPolicyARNs: &attachPolicyARNs +# - arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly +# - arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy +# - arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy +# - arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess +# - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore +# withAddonPolicies: &withAddonPolicies +# albIngress: true +# autoScaler: true +# cloudWatch: true +# certManager: true +# ebs: true +# # Using custom AMI images require the definition of overrideBootstrapCommand +# # to ensure that nodes are able to join the cluster https://eksctl.io/usage/custom-ami-support/ +# #overrideBootstrapCommand: | +# # #!/bin/bash +# # +# # export CLUSTERNAME=gitpod +# # export NODEGROUP=gpu +# # +# # declare -a LABELS=( +# # eks.amazonaws.com/nodegroup="${NODEGROUP}" +# # gitpod.io/workload_workspace_services=true +# # gitpod.io/workload_workspace_regular=true +# # gitpod.io/workload_workspace_headless=true +# # ) +# # +# # export KUBELET_EXTRA_ARGS="$(printf -- "--max-pods=110 --node-labels=%s" $(IFS=$','; echo "${LABELS[*]}"))" +# # /etc/eks/bootstrap.sh ${CLUSTERNAME} - spot: false +# spot: false - # gpu: 1 - # vCPUs: 4 - # memory: 61Gib - instanceTypes: ["p2.xlarge"] +# # gpu: 1 +# # vCPUs: 4 +# # memory: 61Gib +# instanceTypes: ["p2.xlarge"] - name: services @@ -224,7 +236,7 @@ managedNodeGroups: overrideBootstrapCommand: | #!/bin/bash - export CLUSTERNAME=gitpod + export CLUSTERNAME=gitpod-clu export NODEGROUP=services declare -a LABELS=( diff --git a/nvidia-device-plugin.yml b/nvidia-device-plugin.yml new file mode 100644 index 0000000..92ffc3f --- /dev/null +++ b/nvidia-device-plugin.yml @@ -0,0 +1,38 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-device-plugin-daemonset + namespace: kube-system +spec: + selector: + matchLabels: + name: nvidia-device-plugin-ds + template: + metadata: + # Mark this pod as a critical add-on; when enabled, the critical add-on scheduler + # reserves resources for critical add-on pods so that they can be rescheduled after + # a failure. This annotation works in tandem with the toleration below. + annotations: + scheduler.alpha.kubernetes.io/critical-pod: "" + labels: + name: nvidia-device-plugin-ds + spec: + tolerations: + # Allow this pod to be rescheduled while the node is in "critical add-ons only" mode. + # This, along with the annotation above marks this pod as a critical add-on. + - key: CriticalAddonsOnly + operator: Exists + containers: + - image: nvidia/k8s-device-plugin:1.10 + name: nvidia-device-plugin-ctr + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugin + mountPath: /var/lib/kubelet/device-plugins + volumes: + - name: device-plugin + hostPath: + path: /var/lib/kubelet/device-plugins diff --git a/setup.sh b/setup.sh index df8f20f..547ea74 100755 --- a/setup.sh +++ b/setup.sh @@ -102,6 +102,11 @@ function install() { # Install Calico. kubectl apply -f https://docs.projectcalico.org/manifests/calico-vxlan.yaml + # Apply NVIDIA Kubernetes device plugin as daemon set + # https://aws.amazon.com/blogs/compute/running-gpu-accelerated-kubernetes-workloads-on-p3-and-p2-ec2-instances-with-amazon-eks/ + #kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v1.10/nvidia-device-plugin.yml + kubectl apply -f /gitpod/nvidia-device-plugin.yml + # Create secret with container registry credentials if [ -n "${IMAGE_PULL_SECRET_FILE}" ] && [ -f "${IMAGE_PULL_SECRET_FILE}" ]; then kubectl create secret generic gitpod-image-pull-secret \ @@ -204,7 +209,7 @@ EOF kubectl replace --force -f - local CONFIG_FILE="${DIR}/gitpod-config.yaml" - gitpod-installer init > "${CONFIG_FILE}" + # gitpod-installer init > "${CONFIG_FILE}" yq e -i ".certificate.name = \"https-certificates\"" "${CONFIG_FILE}" yq e -i ".domain = \"${DOMAIN}\"" "${CONFIG_FILE}" @@ -213,11 +218,14 @@ EOF yq e -i ".database.external.certificate.kind = \"secret\"" "${CONFIG_FILE}" yq e -i ".database.external.certificate.name = \"${MYSQL_GITPOD_SECRET}\"" "${CONFIG_FILE}" yq e -i '.workspace.runtime.containerdRuntimeDir = "/var/lib/containerd/io.containerd.runtime.v2.task/k8s.io"' "${CONFIG_FILE}" - yq e -i ".containerRegistry.s3storage.bucket = \"${CONTAINER_REGISTRY_BUCKET}\"" "${CONFIG_FILE}" - yq e -i ".containerRegistry.s3storage.certificate.kind = \"secret\"" "${CONFIG_FILE}" - yq e -i ".containerRegistry.s3storage.certificate.name = \"${SECRET_STORAGE}\"" "${CONFIG_FILE}" + # yq e -i ".containerRegistry.s3storage.bucket = \"${CONTAINER_REGISTRY_BUCKET}\"" "${CONFIG_FILE}" + # yq e -i ".containerRegistry.s3storage.certificate.kind = \"secret\"" "${CONFIG_FILE}" + # yq e -i ".containerRegistry.s3storage.certificate.name = \"${SECRET_STORAGE}\"" "${CONFIG_FILE}" yq e -i ".workspace.runtime.fsShiftMethod = \"shiftfs\"" "${CONFIG_FILE}" + # GPU + yq e -i ".workspace.resources.limits.\"nvidia.com/gpu\" = \"1\"" "${CONFIG_FILE}" + gitpod-installer \ render \ --config="${CONFIG_FILE}" > gitpod.yaml