From 20323fa4c14b3388a5caa92a5172fd69333cfd5e Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Thu, 18 Apr 2024 16:12:36 -0600 Subject: [PATCH 1/5] Add retries to auto.ipxe kernel/initrd downloads: This will help overcome transient network issues. Signed-off-by: Jacob Weinstock --- internal/ipxe/script/auto_test.go | 48 +++++++++++++++++++++++++++---- internal/ipxe/script/hook.go | 24 ++++++++++++++-- internal/ipxe/script/ipxe_test.go | 24 ++++++++++++++-- 3 files changed, 84 insertions(+), 12 deletions(-) diff --git a/internal/ipxe/script/auto_test.go b/internal/ipxe/script/auto_test.go index 82a87a58..18486816 100644 --- a/internal/ipxe/script/auto_test.go +++ b/internal/ipxe/script/auto_test.go @@ -33,13 +33,31 @@ echo Loading the Tinkerbell Hook iPXE script... set arch x86_64 set download-url http://location:8080/to/kernel/and/initrd +set idx:int32 0 +:retry_kernel kernel ${download-url}/vmlinuz-${arch} tink_worker_image=quay.io/tinkerbell/tink-worker:v0.8.0 tinkerbell=packet \ facility=onprem syslog_host=1.2.3.4 grpc_authority=1.2.3.4:42113 tinkerbell_tls=false worker_id=3c:ec:ef:4c:4f:54 hw_addr=3c:ec:ef:4c:4f:54 \ -modules=loop,squashfs,sd-mod,usb-storage intel_iommu=on iommu=pt initrd=initramfs-${arch} console=tty0 console=ttyS1,115200 +modules=loop,squashfs,sd-mod,usb-storage intel_iommu=on iommu=pt initrd=initramfs-${arch} console=tty0 console=ttyS1,115200 || iseq ${idx} 10 && goto kernel-error || inc idx && goto retry_kernel -initrd ${download-url}/initramfs-${arch} +set idx:int32 0 +:retry_initrd +initrd ${download-url}/initramfs-${arch} || iseq ${idx} 10 && goto initrd-error || inc idx && goto retry_initrd -boot +set idx:int32 0 +:retry_boot +boot || iseq ${idx} 10 && goto boot-error || inc idx && goto retry_boot + +:kernel-error +echo Failed to load kernel +exit + +:initrd-error +echo Failed to load initrd +exit + +:boot-error +echo Failed to boot +exit `, }, "with vlan": { @@ -63,13 +81,31 @@ echo Loading the Tinkerbell Hook iPXE script... set arch x86_64 set download-url http://location:8080/to/kernel/and/initrd +set idx:int32 0 +:retry_kernel kernel ${download-url}/vmlinuz-${arch} vlan_id=16 tink_worker_image=quay.io/tinkerbell/tink-worker:v0.8.0 tinkerbell=packet \ facility=onprem syslog_host=1.2.3.4 grpc_authority=1.2.3.4:42113 tinkerbell_tls=false worker_id=3c:ec:ef:4c:4f:54 hw_addr=3c:ec:ef:4c:4f:54 \ -modules=loop,squashfs,sd-mod,usb-storage intel_iommu=on iommu=pt initrd=initramfs-${arch} console=tty0 console=ttyS1,115200 +modules=loop,squashfs,sd-mod,usb-storage intel_iommu=on iommu=pt initrd=initramfs-${arch} console=tty0 console=ttyS1,115200 || iseq ${idx} 10 && goto kernel-error || inc idx && goto retry_kernel + +set idx:int32 0 +:retry_initrd +initrd ${download-url}/initramfs-${arch} || iseq ${idx} 10 && goto initrd-error || inc idx && goto retry_initrd + +set idx:int32 0 +:retry_boot +boot || iseq ${idx} 10 && goto boot-error || inc idx && goto retry_boot + +:kernel-error +echo Failed to load kernel +exit -initrd ${download-url}/initramfs-${arch} +:initrd-error +echo Failed to load initrd +exit -boot +:boot-error +echo Failed to boot +exit `, }, "parse error": { diff --git a/internal/ipxe/script/hook.go b/internal/ipxe/script/hook.go index a9ba4ed1..52a9b3da 100644 --- a/internal/ipxe/script/hook.go +++ b/internal/ipxe/script/hook.go @@ -11,13 +11,31 @@ echo Debug TraceID: {{ .TraceID }} set arch {{ .Arch }} set download-url {{ .DownloadURL }} +set idx:int32 0 +:retry_kernel kernel ${download-url}/vmlinuz-${arch} {{- if ne .VLANID "" }} vlan_id={{ .VLANID }} {{- end }} {{- range .ExtraKernelParams}} {{.}} {{- end}} \ facility={{ .Facility }} syslog_host={{ .SyslogHost }} grpc_authority={{ .TinkGRPCAuthority }} tinkerbell_tls={{ .TinkerbellTLS }} worker_id={{ .WorkerID }} hw_addr={{ .HWAddr }} \ -modules=loop,squashfs,sd-mod,usb-storage intel_iommu=on iommu=pt initrd=initramfs-${arch} console=tty0 console=ttyS1,115200 +modules=loop,squashfs,sd-mod,usb-storage intel_iommu=on iommu=pt initrd=initramfs-${arch} console=tty0 console=ttyS1,115200 || iseq ${idx} 10 && goto kernel-error || inc idx && goto retry_kernel -initrd ${download-url}/initramfs-${arch} +set idx:int32 0 +:retry_initrd +initrd ${download-url}/initramfs-${arch} || iseq ${idx} 10 && goto initrd-error || inc idx && goto retry_initrd -boot +set idx:int32 0 +:retry_boot +boot || iseq ${idx} 10 && goto boot-error || inc idx && goto retry_boot + +:kernel-error +echo Failed to load kernel +exit + +:initrd-error +echo Failed to load initrd +exit + +:boot-error +echo Failed to boot +exit ` // Hook holds the values used to generate the iPXE script that loads the Hook OS. diff --git a/internal/ipxe/script/ipxe_test.go b/internal/ipxe/script/ipxe_test.go index a3ceb169..76139def 100644 --- a/internal/ipxe/script/ipxe_test.go +++ b/internal/ipxe/script/ipxe_test.go @@ -51,13 +51,31 @@ echo Loading the Tinkerbell Hook iPXE script... set arch x86_64 set download-url http://127.1.1.1 +set idx:int32 0 +:retry_kernel kernel ${download-url}/vmlinuz-${arch} vlan_id=1234 \ facility=onprem syslog_host= grpc_authority= tinkerbell_tls=false worker_id=00:01:02:03:04:05 hw_addr=00:01:02:03:04:05 \ -modules=loop,squashfs,sd-mod,usb-storage intel_iommu=on iommu=pt initrd=initramfs-${arch} console=tty0 console=ttyS1,115200 +modules=loop,squashfs,sd-mod,usb-storage intel_iommu=on iommu=pt initrd=initramfs-${arch} console=tty0 console=ttyS1,115200 || iseq ${idx} 10 && goto kernel-error || inc idx && goto retry_kernel -initrd ${download-url}/initramfs-${arch} +set idx:int32 0 +:retry_initrd +initrd ${download-url}/initramfs-${arch} || iseq ${idx} 10 && goto initrd-error || inc idx && goto retry_initrd -boot +set idx:int32 0 +:retry_boot +boot || iseq ${idx} 10 && goto boot-error || inc idx && goto retry_boot + +:kernel-error +echo Failed to load kernel +exit + +:initrd-error +echo Failed to load initrd +exit + +:boot-error +echo Failed to boot +exit ` tests := map[string]struct { want string From cf4aad5d1e4765b832504f8f15dcce53b3008b40 Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Mon, 13 May 2024 11:02:12 -0600 Subject: [PATCH 2/5] Make iPXE script kernel, initrd retries configurable: This allows for transient network issues to not cause the iPXE script to fail downloading kernel and initramfs. Signed-off-by: Jacob Weinstock --- cmd/smee/flag.go | 1 + cmd/smee/flag_test.go | 1 + cmd/smee/main.go | 2 ++ internal/ipxe/script/hook.go | 1 + internal/ipxe/script/ipxe.go | 2 ++ 5 files changed, 7 insertions(+) diff --git a/cmd/smee/flag.go b/cmd/smee/flag.go index aad0adce..5632ece8 100644 --- a/cmd/smee/flag.go +++ b/cmd/smee/flag.go @@ -106,6 +106,7 @@ func ipxeHTTPScriptFlags(c *config, fs *flag.FlagSet) { fs.StringVar(&c.ipxeHTTPScript.hookURL, "osie-url", "", "[http] URL where OSIE (HookOS) images are located") fs.StringVar(&c.ipxeHTTPScript.tinkServer, "tink-server", "", "[http] IP:Port for the Tink server") fs.BoolVar(&c.ipxeHTTPScript.tinkServerUseTLS, "tink-server-tls", false, "[http] use TLS for Tink server") + fs.IntVar(&c.ipxeHTTPScript.retries, "ipxe-script-retries", 0, "[http] number of retries to attempt when fetching kernel and initrd files in the iPXE script") } func dhcpFlags(c *config, fs *flag.FlagSet) { diff --git a/cmd/smee/flag_test.go b/cmd/smee/flag_test.go index 52c85226..a7394e2e 100644 --- a/cmd/smee/flag_test.go +++ b/cmd/smee/flag_test.go @@ -111,6 +111,7 @@ FLAGS -http-addr [http] local IP:Port to listen on for iPXE HTTP script requests (default "%[1]v:80") -http-ipxe-binary-enabled [http] enable iPXE HTTP binary server (default "true") -http-ipxe-script-enabled [http] enable iPXE HTTP script server (default "true") + -ipxe-script-retries [http] number of retries to attempt when fetching kernel and initrd files in the iPXE script (default "0") -osie-url [http] URL where OSIE (HookOS) images are located -tink-server [http] IP:Port for the Tink server -tink-server-tls [http] use TLS for Tink server (default "false") diff --git a/cmd/smee/main.go b/cmd/smee/main.go index a96e192e..d50c5ceb 100644 --- a/cmd/smee/main.go +++ b/cmd/smee/main.go @@ -83,6 +83,7 @@ type ipxeHTTPScript struct { tinkServerUseTLS bool trustedProxies string disableDiscoverTrustedProxies bool + retries int } type dhcpConfig struct { @@ -219,6 +220,7 @@ func main() { PublicSyslogFQDN: cfg.dhcp.syslogIP, TinkServerTLS: cfg.ipxeHTTPScript.tinkServerUseTLS, TinkServerGRPCAddr: cfg.ipxeHTTPScript.tinkServer, + IPXEScriptRetries: cfg.ipxeHTTPScript.retries, } // serve ipxe script from the "/" URI. handlers["/"] = jh.HandlerFunc() diff --git a/internal/ipxe/script/hook.go b/internal/ipxe/script/hook.go index 52a9b3da..3c966c1c 100644 --- a/internal/ipxe/script/hook.go +++ b/internal/ipxe/script/hook.go @@ -52,4 +52,5 @@ type Hook struct { TraceID string VLANID string // string number between 1-4095 WorkerID string // example 3c:ec:ef:4c:4f:54 or worker1 + Retries int // number of retries to attempt when fetching kernel and initrd files } diff --git a/internal/ipxe/script/ipxe.go b/internal/ipxe/script/ipxe.go index 2e989c9a..dfd22c6c 100644 --- a/internal/ipxe/script/ipxe.go +++ b/internal/ipxe/script/ipxe.go @@ -26,6 +26,7 @@ type Handler struct { PublicSyslogFQDN string TinkServerTLS bool TinkServerGRPCAddr string + IPXEScriptRetries int } type data struct { @@ -227,6 +228,7 @@ func (h *Handler) defaultScript(span trace.Span, hw data) (string, error) { TinkGRPCAuthority: h.TinkServerGRPCAddr, VLANID: hw.VLANID, WorkerID: wID, + Retries: h.IPXEScriptRetries, } if sc := span.SpanContext(); sc.IsSampled() { auto.TraceID = sc.TraceID().String() From 1efd22dec2126587cc54d80a960ec87c6667c16d Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Mon, 13 May 2024 11:09:07 -0600 Subject: [PATCH 3/5] Fix linting issue Signed-off-by: Jacob Weinstock --- internal/ipxe/script/hook.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/ipxe/script/hook.go b/internal/ipxe/script/hook.go index 3c966c1c..75eaa895 100644 --- a/internal/ipxe/script/hook.go +++ b/internal/ipxe/script/hook.go @@ -52,5 +52,5 @@ type Hook struct { TraceID string VLANID string // string number between 1-4095 WorkerID string // example 3c:ec:ef:4c:4f:54 or worker1 - Retries int // number of retries to attempt when fetching kernel and initrd files + Retries int // number of retries to attempt when fetching kernel and initrd files } From 05671ff1c478ea9171fdd6f6796c94e63b14c638 Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Mon, 13 May 2024 11:16:35 -0600 Subject: [PATCH 4/5] Fix tests, add int32 type to retries var: This makes the retries count work in iPXE. Signed-off-by: Jacob Weinstock --- internal/ipxe/script/auto_test.go | 16 ++++++++++------ internal/ipxe/script/hook.go | 7 ++++--- internal/ipxe/script/ipxe_test.go | 8 +++++--- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/internal/ipxe/script/auto_test.go b/internal/ipxe/script/auto_test.go index 18486816..e0363f03 100644 --- a/internal/ipxe/script/auto_test.go +++ b/internal/ipxe/script/auto_test.go @@ -24,6 +24,7 @@ func TestGenerateTemplate(t *testing.T) { Facility: "onprem", ExtraKernelParams: []string{"tink_worker_image=quay.io/tinkerbell/tink-worker:v0.8.0", "tinkerbell=packet"}, HWAddr: "3c:ec:ef:4c:4f:54", + Retries: 10, }, script: HookScript, want: `#!ipxe @@ -32,20 +33,21 @@ echo Loading the Tinkerbell Hook iPXE script... set arch x86_64 set download-url http://location:8080/to/kernel/and/initrd +set retries:int32 10 set idx:int32 0 :retry_kernel kernel ${download-url}/vmlinuz-${arch} tink_worker_image=quay.io/tinkerbell/tink-worker:v0.8.0 tinkerbell=packet \ facility=onprem syslog_host=1.2.3.4 grpc_authority=1.2.3.4:42113 tinkerbell_tls=false worker_id=3c:ec:ef:4c:4f:54 hw_addr=3c:ec:ef:4c:4f:54 \ -modules=loop,squashfs,sd-mod,usb-storage intel_iommu=on iommu=pt initrd=initramfs-${arch} console=tty0 console=ttyS1,115200 || iseq ${idx} 10 && goto kernel-error || inc idx && goto retry_kernel +modules=loop,squashfs,sd-mod,usb-storage intel_iommu=on iommu=pt initrd=initramfs-${arch} console=tty0 console=ttyS1,115200 || iseq ${idx} ${retries} && goto kernel-error || inc idx && goto retry_kernel set idx:int32 0 :retry_initrd -initrd ${download-url}/initramfs-${arch} || iseq ${idx} 10 && goto initrd-error || inc idx && goto retry_initrd +initrd ${download-url}/initramfs-${arch} || iseq ${idx} ${retries} && goto initrd-error || inc idx && goto retry_initrd set idx:int32 0 :retry_boot -boot || iseq ${idx} 10 && goto boot-error || inc idx && goto retry_boot +boot || iseq ${idx} ${retries} && goto boot-error || inc idx && goto retry_boot :kernel-error echo Failed to load kernel @@ -72,6 +74,7 @@ exit ExtraKernelParams: []string{"tink_worker_image=quay.io/tinkerbell/tink-worker:v0.8.0", "tinkerbell=packet"}, HWAddr: "3c:ec:ef:4c:4f:54", VLANID: "16", + Retries: 10, }, script: HookScript, want: `#!ipxe @@ -80,20 +83,21 @@ echo Loading the Tinkerbell Hook iPXE script... set arch x86_64 set download-url http://location:8080/to/kernel/and/initrd +set retries:int32 10 set idx:int32 0 :retry_kernel kernel ${download-url}/vmlinuz-${arch} vlan_id=16 tink_worker_image=quay.io/tinkerbell/tink-worker:v0.8.0 tinkerbell=packet \ facility=onprem syslog_host=1.2.3.4 grpc_authority=1.2.3.4:42113 tinkerbell_tls=false worker_id=3c:ec:ef:4c:4f:54 hw_addr=3c:ec:ef:4c:4f:54 \ -modules=loop,squashfs,sd-mod,usb-storage intel_iommu=on iommu=pt initrd=initramfs-${arch} console=tty0 console=ttyS1,115200 || iseq ${idx} 10 && goto kernel-error || inc idx && goto retry_kernel +modules=loop,squashfs,sd-mod,usb-storage intel_iommu=on iommu=pt initrd=initramfs-${arch} console=tty0 console=ttyS1,115200 || iseq ${idx} ${retries} && goto kernel-error || inc idx && goto retry_kernel set idx:int32 0 :retry_initrd -initrd ${download-url}/initramfs-${arch} || iseq ${idx} 10 && goto initrd-error || inc idx && goto retry_initrd +initrd ${download-url}/initramfs-${arch} || iseq ${idx} ${retries} && goto initrd-error || inc idx && goto retry_initrd set idx:int32 0 :retry_boot -boot || iseq ${idx} 10 && goto boot-error || inc idx && goto retry_boot +boot || iseq ${idx} ${retries} && goto boot-error || inc idx && goto retry_boot :kernel-error echo Failed to load kernel diff --git a/internal/ipxe/script/hook.go b/internal/ipxe/script/hook.go index 75eaa895..71d3b7be 100644 --- a/internal/ipxe/script/hook.go +++ b/internal/ipxe/script/hook.go @@ -10,20 +10,21 @@ echo Debug TraceID: {{ .TraceID }} set arch {{ .Arch }} set download-url {{ .DownloadURL }} +set retries:int32 {{ .Retries }} set idx:int32 0 :retry_kernel kernel ${download-url}/vmlinuz-${arch} {{- if ne .VLANID "" }} vlan_id={{ .VLANID }} {{- end }} {{- range .ExtraKernelParams}} {{.}} {{- end}} \ facility={{ .Facility }} syslog_host={{ .SyslogHost }} grpc_authority={{ .TinkGRPCAuthority }} tinkerbell_tls={{ .TinkerbellTLS }} worker_id={{ .WorkerID }} hw_addr={{ .HWAddr }} \ -modules=loop,squashfs,sd-mod,usb-storage intel_iommu=on iommu=pt initrd=initramfs-${arch} console=tty0 console=ttyS1,115200 || iseq ${idx} 10 && goto kernel-error || inc idx && goto retry_kernel +modules=loop,squashfs,sd-mod,usb-storage intel_iommu=on iommu=pt initrd=initramfs-${arch} console=tty0 console=ttyS1,115200 || iseq ${idx} ${retries} && goto kernel-error || inc idx && goto retry_kernel set idx:int32 0 :retry_initrd -initrd ${download-url}/initramfs-${arch} || iseq ${idx} 10 && goto initrd-error || inc idx && goto retry_initrd +initrd ${download-url}/initramfs-${arch} || iseq ${idx} ${retries} && goto initrd-error || inc idx && goto retry_initrd set idx:int32 0 :retry_boot -boot || iseq ${idx} 10 && goto boot-error || inc idx && goto retry_boot +boot || iseq ${idx} ${retries} && goto boot-error || inc idx && goto retry_boot :kernel-error echo Failed to load kernel diff --git a/internal/ipxe/script/ipxe_test.go b/internal/ipxe/script/ipxe_test.go index 76139def..f4a21a9d 100644 --- a/internal/ipxe/script/ipxe_test.go +++ b/internal/ipxe/script/ipxe_test.go @@ -50,20 +50,21 @@ echo Loading the Tinkerbell Hook iPXE script... set arch x86_64 set download-url http://127.1.1.1 +set retries:int32 10 set idx:int32 0 :retry_kernel kernel ${download-url}/vmlinuz-${arch} vlan_id=1234 \ facility=onprem syslog_host= grpc_authority= tinkerbell_tls=false worker_id=00:01:02:03:04:05 hw_addr=00:01:02:03:04:05 \ -modules=loop,squashfs,sd-mod,usb-storage intel_iommu=on iommu=pt initrd=initramfs-${arch} console=tty0 console=ttyS1,115200 || iseq ${idx} 10 && goto kernel-error || inc idx && goto retry_kernel +modules=loop,squashfs,sd-mod,usb-storage intel_iommu=on iommu=pt initrd=initramfs-${arch} console=tty0 console=ttyS1,115200 || iseq ${idx} ${retries} && goto kernel-error || inc idx && goto retry_kernel set idx:int32 0 :retry_initrd -initrd ${download-url}/initramfs-${arch} || iseq ${idx} 10 && goto initrd-error || inc idx && goto retry_initrd +initrd ${download-url}/initramfs-${arch} || iseq ${idx} ${retries} && goto initrd-error || inc idx && goto retry_initrd set idx:int32 0 :retry_boot -boot || iseq ${idx} 10 && goto boot-error || inc idx && goto retry_boot +boot || iseq ${idx} ${retries} && goto boot-error || inc idx && goto retry_boot :kernel-error echo Failed to load kernel @@ -86,6 +87,7 @@ exit t.Run(name, func(t *testing.T) { h := &Handler{ OSIEURL: "http://127.1.1.1", + IPXEScriptRetries: 10, } d := data{MACAddress: net.HardwareAddr{0x00, 0x01, 0x02, 0x03, 0x04, 0x05}, VLANID: "1234", Facility: "onprem", Arch: "x86_64"} sp := trace.SpanFromContext(context.Background()) From b2a9e18d3d674c6d387eff79698f45abcd93e35f Mon Sep 17 00:00:00 2001 From: Jacob Weinstock Date: Mon, 13 May 2024 11:22:40 -0600 Subject: [PATCH 5/5] Fix linting issue Signed-off-by: Jacob Weinstock --- internal/ipxe/script/ipxe_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/ipxe/script/ipxe_test.go b/internal/ipxe/script/ipxe_test.go index f4a21a9d..28d12868 100644 --- a/internal/ipxe/script/ipxe_test.go +++ b/internal/ipxe/script/ipxe_test.go @@ -86,7 +86,7 @@ exit for name, tt := range tests { t.Run(name, func(t *testing.T) { h := &Handler{ - OSIEURL: "http://127.1.1.1", + OSIEURL: "http://127.1.1.1", IPXEScriptRetries: 10, } d := data{MACAddress: net.HardwareAddr{0x00, 0x01, 0x02, 0x03, 0x04, 0x05}, VLANID: "1234", Facility: "onprem", Arch: "x86_64"}