diff --git a/Microsoft.GA4GH.TES.sln b/Microsoft.GA4GH.TES.sln index fe33fbb7f..fac9b0574 100644 --- a/Microsoft.GA4GH.TES.sln +++ b/Microsoft.GA4GH.TES.sln @@ -45,6 +45,56 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tes.ApiClients", "src\Tes.A EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tes.ApiClients.Tests", "src\Tes.ApiClients.Tests\Tes.ApiClients.Tests.csproj", "{FDD8FB67-8C8D-4D84-8D4C-FEEA644F4745}" EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "VM Monitoring", "VM Monitoring", "{1166E860-5DBB-478F-9A1E-848A74D6D1AD}" + ProjectSection(SolutionItems) = preProject + src\build_tes_remote.sh = src\build_tes_remote.sh + src\create_tes_debug_pool.sh = src\create_tes_debug_pool.sh + src\docker_deploy.sh = src\docker_deploy.sh + src\tes_pool_config.json = src\tes_pool_config.json + EndProjectSection +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "vm_monitor", "vm_monitor", "{DDF9389C-F5D0-4998-A53C-9505608B1FF6}" + ProjectSection(SolutionItems) = preProject + src\vm_monitor\build_tes_deployment_archive.sh = src\vm_monitor\build_tes_deployment_archive.sh + src\vm_monitor\README.md = src\vm_monitor\README.md + EndProjectSection +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "telegraf", "telegraf", "{28A0F8E6-A63B-4378-90B0-773F61CF7C15}" + ProjectSection(SolutionItems) = preProject + src\vm_monitor\telegraf\build_telegraf_remote.sh = src\vm_monitor\telegraf\build_telegraf_remote.sh + src\vm_monitor\telegraf\Dockerfile = src\vm_monitor\telegraf\Dockerfile + src\vm_monitor\telegraf\telegraf = src\vm_monitor\telegraf\telegraf + src\vm_monitor\telegraf\telegraf.dummy.conf = src\vm_monitor\telegraf\telegraf.dummy.conf + EndProjectSection +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "azure_append_blob", "azure_append_blob", "{34E5A67E-9E8A-413C-B437-2E60A0658E93}" + ProjectSection(SolutionItems) = preProject + src\vm_monitor\telegraf\azure_append_blob\azure_append_blob.go = src\vm_monitor\telegraf\azure_append_blob\azure_append_blob.go + src\vm_monitor\telegraf\azure_append_blob\README.md = src\vm_monitor\telegraf\azure_append_blob\README.md + src\vm_monitor\telegraf\azure_append_blob\sample.conf = src\vm_monitor\telegraf\azure_append_blob\sample.conf + EndProjectSection +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "all", "all", "{63D64305-E8F5-48CA-974A-1B59CF8E68FB}" + ProjectSection(SolutionItems) = preProject + src\vm_monitor\telegraf\all\azure_append_blob.go = src\vm_monitor\telegraf\all\azure_append_blob.go + EndProjectSection +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "vm_monitor_scripts", "vm_monitor_scripts", "{63ACE1DA-967C-4416-BD83-2540B84E8FE4}" + ProjectSection(SolutionItems) = preProject + src\vm_monitor\vm_monitor_scripts\clean_log.py = src\vm_monitor\vm_monitor_scripts\clean_log.py + src\vm_monitor\vm_monitor_scripts\collect_azure_vm_perf.sh = src\vm_monitor\vm_monitor_scripts\collect_azure_vm_perf.sh + src\vm_monitor\vm_monitor_scripts\get_batch_agent_values.sh = src\vm_monitor\vm_monitor_scripts\get_batch_agent_values.sh + src\vm_monitor\vm_monitor_scripts\get_imds_and_nvme_metatada.sh = src\vm_monitor\vm_monitor_scripts\get_imds_and_nvme_metatada.sh + src\vm_monitor\vm_monitor_scripts\get_linux_boot_iso_timestamp.sh = src\vm_monitor\vm_monitor_scripts\get_linux_boot_iso_timestamp.sh + src\vm_monitor\vm_monitor_scripts\parse_extended_cpu_info.py = src\vm_monitor\vm_monitor_scripts\parse_extended_cpu_info.py + 
src\vm_monitor\vm_monitor_scripts\parse_imds_and_nvme_metadata.py = src\vm_monitor\vm_monitor_scripts\parse_imds_and_nvme_metadata.py + src\vm_monitor\vm_monitor_scripts\run_telegraf.sh = src\vm_monitor\vm_monitor_scripts\run_telegraf.sh + src\vm_monitor\vm_monitor_scripts\start_vm_node_monitoring.sh = src\vm_monitor\vm_monitor_scripts\start_vm_node_monitoring.sh + src\vm_monitor\vm_monitor_scripts\telegraf_helper.py = src\vm_monitor\vm_monitor_scripts\telegraf_helper.py + src\vm_monitor\vm_monitor_scripts\tes_vm_monitor.continuous.conf = src\vm_monitor\vm_monitor_scripts\tes_vm_monitor.continuous.conf + src\vm_monitor\vm_monitor_scripts\tes_vm_monitor.once.conf = src\vm_monitor\vm_monitor_scripts\tes_vm_monitor.once.conf + EndProjectSection +EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tes.SDK", "src\Tes.SDK\Tes.SDK.csproj", "{9625CB30-4159-4257-9032-B693EDCB5E9B}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tes.SDK.Tests", "src\Tes.SDK.Tests\Tes.SDK.Tests.csproj", "{AE7ADB92-BEC6-4030-B62F-BDBB6AC53CB4}" @@ -119,6 +169,14 @@ Global GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {1166E860-5DBB-478F-9A1E-848A74D6D1AD} = {EF518D6D-FA52-4B12-AF47-0C46C0BC434D} + {DDF9389C-F5D0-4998-A53C-9505608B1FF6} = {1166E860-5DBB-478F-9A1E-848A74D6D1AD} + {28A0F8E6-A63B-4378-90B0-773F61CF7C15} = {DDF9389C-F5D0-4998-A53C-9505608B1FF6} + {34E5A67E-9E8A-413C-B437-2E60A0658E93} = {28A0F8E6-A63B-4378-90B0-773F61CF7C15} + {63D64305-E8F5-48CA-974A-1B59CF8E68FB} = {28A0F8E6-A63B-4378-90B0-773F61CF7C15} + {63ACE1DA-967C-4416-BD83-2540B84E8FE4} = {DDF9389C-F5D0-4998-A53C-9505608B1FF6} + EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {67AD4797-6B55-4CF7-A9BB-A83921479EA7} EndGlobalSection diff --git a/src/Dockerfile-Tes b/src/Dockerfile-Tes index 2f431356d..48610304f 100644 --- a/src/Dockerfile-Tes +++ b/src/Dockerfile-Tes @@ -19,4 +19,5 @@ WORKDIR /app COPY --from=publish /app/publish . 
COPY --from=publish /app/publish-output/tes-runner /app/scripts/tes-runner COPY --from=publish /app/publish-output/tes-runner.md5 /app/scripts/tes-runner.md5 -ENTRYPOINT ["dotnet", "tesapi.dll"] +COPY ./vm_monitor/tes_vm_monitor.tar.gz /app/scripts/tes_vm_monitor.tar.gz +ENTRYPOINT ["dotnet", "tesapi.dll"] \ No newline at end of file diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 9bb26b226..c26c8fcd9 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -634,7 +634,7 @@ public async Task BatchJobContainsExpectedBatchPoolInformation() GuardAssertsWithTesTask(tesTask, () => { - Assert.AreEqual("TES-hostname-edicated1-obkfufnroslrzwlitqbrmjeowu7iuhfm-", tesTask.PoolId[0..^8]); + Assert.AreEqual("TES-hostname-edicated1-hzpeysbtzr46ynk5kabk55lcxmuerypk-", tesTask.PoolId[0..^8]); Assert.AreEqual("VmSizeDedicated1", pool.VmSize); Assert.IsTrue(((BatchScheduler)batchScheduler).TryGetPool(tesTask.PoolId, out _)); }); diff --git a/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs b/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs index 5810e4cd5..2b13ec3e6 100644 --- a/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs +++ b/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs @@ -7,6 +7,7 @@ using System.Threading; using System.Threading.Tasks; using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; using Microsoft.VisualStudio.TestTools.UnitTesting; using Moq; using Newtonsoft.Json; @@ -55,7 +56,7 @@ public void SetUp() .ReturnsAsync(nodeTask); - taskExecutionScriptingManager = new TaskExecutionScriptingManager(storageAccessProviderMock.Object, taskToNodeTaskConverterMock.Object, new NullLogger()); + taskExecutionScriptingManager = new TaskExecutionScriptingManager(storageAccessProviderMock.Object, taskToNodeTaskConverterMock.Object, Options.Create(new()), new NullLogger()); } [TestMethod] diff --git a/src/TesApi.Web/BatchScheduler.BatchPools.cs b/src/TesApi.Web/BatchScheduler.BatchPools.cs index 6772234ea..fd3a6b7c0 100644 --- a/src/TesApi.Web/BatchScheduler.BatchPools.cs +++ b/src/TesApi.Web/BatchScheduler.BatchPools.cs @@ -40,7 +40,7 @@ public partial class BatchScheduler // Generate hash of everything that differentiates this group of pools var displayName = $"{label}:{vmSize}:{isPreemptable}:{identityResourceIds}"; - var hash = SHA1.HashData(Encoding.UTF8.GetBytes(displayName + ":" + this.runnerMD5)).ConvertToBase32().TrimEnd('=').ToLowerInvariant(); // This becomes 32 chars + var hash = SHA1.HashData(Encoding.UTF8.GetBytes($"{displayName}:{runnerMD5}:{disableBatchNodesPublicIpAddress}:{advancedVmPerformanceMonitoring}")).ConvertToBase32().TrimEnd('=').ToLowerInvariant(); // This becomes 32 chars // Build a PoolName that is of legal length, while exposing the most important metadata without requiring user to find DisplayName // Note that the hash covers all necessary parts to make name unique, so limiting the size of the other parts is not expected to appreciably change the risk of collisions. 
Those other parts are for convenience diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 901e03020..dcb5af553 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -56,6 +56,7 @@ public partial class BatchScheduler : IBatchScheduler public const string BatchNodeTaskWorkingDirEnvVar = "$AZ_BATCH_TASK_WORKING_DIR"; internal const string NodeTaskRunnerFilename = "tes-runner"; + internal const string VMPerformanceArchiverFilename = "tes_vm_monitor.tar.gz"; private const string AzureSupportUrl = "https://portal.azure.com/#blade/Microsoft_Azure_Support/HelpAndSupportBlade/newsupportrequest"; private const int PoolKeyLength = 55; // 64 max pool name length - 9 chars generating unique pool names @@ -76,6 +77,7 @@ public partial class BatchScheduler : IBatchScheduler private readonly string batchNodesSubnetId; private readonly bool batchNodesSetContentMd5OnUpload; private readonly bool disableBatchNodesPublicIpAddress; + private readonly bool advancedVmPerformanceMonitoring; private readonly TimeSpan poolLifetime; private readonly TimeSpan taskMaxWallClockTime; private readonly BatchNodeInfo gen2BatchNodeInfo; @@ -150,6 +152,7 @@ public BatchScheduler( this.batchNodesSubnetId = batchNodesOptions.Value.SubnetId; this.batchNodesSetContentMd5OnUpload = batchNodesOptions.Value.ContentMD5; this.disableBatchNodesPublicIpAddress = batchNodesOptions.Value.DisablePublicIpAddress; + this.advancedVmPerformanceMonitoring = batchNodesOptions.Value.AdvancedVmPerformanceMonitoringEnabled; this.poolLifetime = TimeSpan.FromDays(batchSchedulingOptions.Value.PoolRotationForcedDays == 0 ? Options.BatchSchedulingOptions.DefaultPoolRotationForcedDays : batchSchedulingOptions.Value.PoolRotationForcedDays); this.taskMaxWallClockTime = TimeSpan.FromDays(batchSchedulingOptions.Value.TaskMaxWallClockTimeDays == 0 ? Options.BatchSchedulingOptions.DefaultPoolRotationForcedDays : batchSchedulingOptions.Value.TaskMaxWallClockTimeDays); this.defaultStorageAccountName = storageOptions.Value.DefaultAccountName; @@ -415,12 +418,22 @@ public async Task UploadTaskRunnerIfNeeded(CancellationToken cancellationToken) { var blobUri = await storageAccessProvider.GetInternalTesBlobUrlAsync(NodeTaskRunnerFilename, cancellationToken); var blobProperties = await azureProxy.GetBlobPropertiesAsync(blobUri, cancellationToken); - if (!(await File.ReadAllTextAsync(Path.Combine(AppContext.BaseDirectory, $"scripts/{NodeTaskRunnerMD5HashFilename}"), cancellationToken)).Trim().Equals(Convert.ToBase64String(blobProperties?.ContentHash ?? []), StringComparison.OrdinalIgnoreCase)) + if (!runnerMD5.Equals(Convert.ToBase64String(blobProperties?.ContentHash ?? 
[]), StringComparison.Ordinal)) { await azureProxy.UploadBlobFromFileAsync(blobUri, $"scripts/{NodeTaskRunnerFilename}", cancellationToken); } } + /// + public async Task UploadMonitoringScriptIfNeeded(CancellationToken cancellationToken) + { + if (advancedVmPerformanceMonitoring) + { + var blobUri = await storageAccessProvider.GetInternalTesBlobUrlAsync(VMPerformanceArchiverFilename, cancellationToken); + await azureProxy.UploadBlobFromFileAsync(blobUri, $"scripts/{VMPerformanceArchiverFilename}", cancellationToken); + } + } + /// /// Iteratively manages execution of a on Azure Batch until completion or failure /// @@ -1133,6 +1146,13 @@ private enum NodeOS StringBuilder cmd = new("#!/bin/sh\n"); cmd.Append($"mkdir -p {BatchNodeSharedEnvVar} && {CreateWgetDownloadCommand(await storageAccessProvider.GetInternalTesBlobUrlAsync(NodeTaskRunnerFilename, cancellationToken), $"{BatchNodeSharedEnvVar}/{NodeTaskRunnerFilename}", setExecutable: true)}"); + if (advancedVmPerformanceMonitoring) + { + cmd.Append($" && mkdir -p {BatchNodeSharedEnvVar}/vm_monitor && {CreateWgetDownloadCommand(await storageAccessProvider.GetInternalTesBlobUrlAsync(VMPerformanceArchiverFilename, cancellationToken), $"{BatchNodeSharedEnvVar}/vm_monitor/{VMPerformanceArchiverFilename}")}"); + var script = "vm-monitor.sh"; + cmd.Append($" && {CreateWgetDownloadCommand(await UploadScriptAsync(script, new((await ReadScript(script)).Replace("{VMPerformanceArchiverFilename}", VMPerformanceArchiverFilename))), script, setExecutable: true)} && ./{script}"); + } + if (!dockerConfigured) { var packageInstallScript = nodeOs switch diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index 8ccb8d254..6bb3f23fc 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -35,6 +35,14 @@ public interface IBatchScheduler /// This should be called only once after the is created before any other methods are called. Task UploadTaskRunnerIfNeeded(CancellationToken cancellationToken); + /// + /// Stores the compute node performance monitor in the default storage account + /// + /// A for controlling the lifetime of the asynchronous operation. + /// + /// This should be called only once after the is created before any other methods are called. + Task UploadMonitoringScriptIfNeeded(CancellationToken cancellationToken); + /// /// Iteratively schedule a on a batch system until completion or failure /// diff --git a/src/TesApi.Web/Options/BatchNodesOptions.cs b/src/TesApi.Web/Options/BatchNodesOptions.cs index 03a4d6283..4c746488c 100644 --- a/src/TesApi.Web/Options/BatchNodesOptions.cs +++ b/src/TesApi.Web/Options/BatchNodesOptions.cs @@ -30,6 +30,10 @@ public class BatchNodesOptions /// public string GlobalStartTask { get; set; } = string.Empty; /// + /// True to enable advanced VM performance monitoring, False otherwise + /// + public bool AdvancedVmPerformanceMonitoringEnabled { get; set; } = false; + /// /// True to have the runner calculate and provide the blob content MD5 to the storage account, False otherwise. 
/// public bool ContentMD5 { get; set; } = false; diff --git a/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs b/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs index b04170a7d..bd3fdeead 100644 --- a/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs +++ b/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs @@ -7,6 +7,7 @@ using System.Threading; using System.Threading.Tasks; using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; using Newtonsoft.Json; using Tes.Models; using Tes.Runner.Models; @@ -20,6 +21,7 @@ namespace TesApi.Web.Runner public class TaskExecutionScriptingManager { private const string NodeTaskFilename = "runner-task.json"; + private const string BatchScriptFileName = "batch_script"; private static readonly JsonSerializerSettings IndentedSerializerSettings = new() { @@ -37,14 +39,16 @@ public class TaskExecutionScriptingManager private readonly IStorageAccessProvider storageAccessProvider; private readonly TaskToNodeTaskConverter taskToNodeConverter; private readonly ILogger logger; + private readonly bool advancedVmPerformanceMonitoring; /// /// Constructor of TaskExecutionScriptingManager /// /// /// + /// /// - public TaskExecutionScriptingManager(IStorageAccessProvider storageAccessProvider, TaskToNodeTaskConverter taskToNodeConverter, ILogger logger) + public TaskExecutionScriptingManager(IStorageAccessProvider storageAccessProvider, TaskToNodeTaskConverter taskToNodeConverter, IOptions batchNodesOptions, ILogger logger) { ArgumentNullException.ThrowIfNull(storageAccessProvider); ArgumentNullException.ThrowIfNull(taskToNodeConverter); @@ -52,6 +56,7 @@ public TaskExecutionScriptingManager(IStorageAccessProvider storageAccessProvide this.storageAccessProvider = storageAccessProvider; this.taskToNodeConverter = taskToNodeConverter; + this.advancedVmPerformanceMonitoring = batchNodesOptions.Value.AdvancedVmPerformanceMonitoringEnabled; this.logger = logger; } diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index 93576ebc9..99460920a 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -57,6 +57,7 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) { // Delay "starting" Scheduler until this completes to finish initializing BatchScheduler. 
await batchScheduler.UploadTaskRunnerIfNeeded(stoppingToken); + await batchScheduler.UploadMonitoringScriptIfNeeded(stoppingToken); } catch (Exception exc) { diff --git a/src/TesApi.Web/TesApi.Web.csproj b/src/TesApi.Web/TesApi.Web.csproj index 9f7c00890..fad8cf370 100644 --- a/src/TesApi.Web/TesApi.Web.csproj +++ b/src/TesApi.Web/TesApi.Web.csproj @@ -29,6 +29,12 @@ Never + + PreserveNewest + + + PreserveNewest + PreserveNewest diff --git a/src/TesApi.Web/scripts/config-docker.sh b/src/TesApi.Web/scripts/config-docker.sh index a36b95100..d761a000c 100644 --- a/src/TesApi.Web/scripts/config-docker.sh +++ b/src/TesApi.Web/scripts/config-docker.sh @@ -2,6 +2,7 @@ trap "echo Error trapped; exit 0" ERR # set -e will cause any error to exit the script set -e + sudo touch tmp2.json sudo cp /etc/docker/daemon.json tmp1.json || sudo echo {} > tmp1.json sudo chmod a+w tmp?.json; diff --git a/src/TesApi.Web/scripts/config-nvme.sh b/src/TesApi.Web/scripts/config-nvme.sh index fce71994b..8997a9717 100644 --- a/src/TesApi.Web/scripts/config-nvme.sh +++ b/src/TesApi.Web/scripts/config-nvme.sh @@ -5,6 +5,7 @@ trap "echo Error trapped; exit 0" ERR # set -e will cause any error to exit the script set -e + # Get nvme device paths without jq being installed nvme_devices=$(nvme list -o json | grep -oP "\"DevicePath\" : \"\K[^\"]+" || true) nvme_device_count=$(echo $nvme_devices | wc -w) diff --git a/src/TesApi.Web/scripts/tes_vm_monitor.tar.gz b/src/TesApi.Web/scripts/tes_vm_monitor.tar.gz new file mode 100644 index 000000000..f0bc00b5a Binary files /dev/null and b/src/TesApi.Web/scripts/tes_vm_monitor.tar.gz differ diff --git a/src/TesApi.Web/scripts/vm-monitor.sh b/src/TesApi.Web/scripts/vm-monitor.sh new file mode 100644 index 000000000..be750dbe1 --- /dev/null +++ b/src/TesApi.Web/scripts/vm-monitor.sh @@ -0,0 +1,10 @@ +#!/usr/bin/bash +trap "echo Error trapped; exit 0" ERR +# set -e will cause any error to exit the script +set -e + +if [ -f "${AZ_BATCH_NODE_SHARED_DIR}/vm_monitor/{VMPerformanceArchiverFilename}" ]; then + tar zxvf "${AZ_BATCH_NODE_SHARED_DIR}/vm_monitor/{VMPerformanceArchiverFilename}" -C "${AZ_BATCH_NODE_SHARED_DIR}/vm_monitor" start_vm_node_monitoring.sh + chmod +x "${AZ_BATCH_NODE_SHARED_DIR}/vm_monitor/start_vm_node_monitoring.sh" + /usr/bin/bash -c "${AZ_BATCH_NODE_SHARED_DIR}/vm_monitor/start_vm_node_monitoring.sh &" || true +fi; diff --git a/src/build_tes_remote.sh b/src/build_tes_remote.sh new file mode 100644 index 000000000..1a01bcbf5 --- /dev/null +++ b/src/build_tes_remote.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +connect_string=${connect_string:-"ssh batch-explorer-user@20.236.185.167 -p 50000"} +acr_name=${acr_name:-"wdltest"} +resource_group=${resource_group:-"test-coa4-southcentral-rg"} + +dest_dir="/mnt/tes/" + +# Parse the Azure Batch connect string: +username=$(echo "$connect_string" | cut -d'@' -f1 | cut -d' ' -f2) +ip=$(echo "$connect_string" | cut -d'@' -f2 | cut -d' ' -f1) +port=$(echo "$connect_string" | cut -d' ' -f4) + +echo -e "Username: \t$username" +echo -e "IP: \t\t$ip" +echo -e "Port: \t\t$port" + +# rclone current directory to remote server +# shellcheck disable=SC2086 +ssh -p $port $username@$ip "sudo mkdir -p $dest_dir && sudo chmod a+rw $dest_dir" +# shellcheck disable=SC2086 +rclone copy --progress --sftp-host=$ip --sftp-user=$username --sftp-port=$port --sftp-key-file=~/.ssh/id_rsa ../ :sftp:${dest_dir} --progress --multi-thread-streams=30 --transfers=30 --checkers=45 + +# Execute this script on the remote server as root: +# shellcheck disable=SC2087 +# shellcheck
disable=SC2086 +ssh -p $port $username@$ip << EOF sudo -s + # if azure cli is not installed, install it + if [ ! -x "\$(command -v az)" ]; then + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + fi + set -e + az login --identity + cd ${dest_dir}/src && chmod a+x ./deploy-tes-docker-image.sh && ./deploy-tes-docker-image.sh $resource_group $acr_name +EOF diff --git a/src/create_tes_debug_pool.sh b/src/create_tes_debug_pool.sh new file mode 100644 index 000000000..1e68fdda7 --- /dev/null +++ b/src/create_tes_debug_pool.sh @@ -0,0 +1,249 @@ +#!/bin/bash +## This script is for debugging batch_scripts in TES. It will create a DEBUG pool and job in an Azure Batch +## account and then allow you to rapidly execute a task on a node that is already allocated and waiting. +## Turnaround time from running this script to the job running on the node is about 30 seconds. +## +## You must have the Azure CLI installed and logged in to use this script. +## +## To use an already set-up pool JSON, pass in 3 args: SUBSCRIPTION_ID RESOURCE_GROUP WGET_URL +## Make sure you change the managed identity in the tes_pool_config.json +## ['identity']['userAssignedIdentities'][0]['resourceId'] to your own +## +## To create a pool config, pass in 5 args: SUBSCRIPTION_ID RESOURCE_GROUP WGET_URL TEMPLATE_POOL_ID POOL_CONFIG_JSON +## Example: a0e0e744-06b2-4fd3-9230-ebf8ef1ac4c8 test-coa4-southcentral-rg +## https://cromwellsc95a88970e25.blob.core.windows.net/cromwell-executions/test/f7fd31e3-61e7-48b3-b895-8b291bbecbdb/call-hello/tes_task/batch_script +## TES-OY5BKMMX-A1_v2-2m4tvpnjrgv74kjiyxtffht2mqzd2nqn-yhlj3wwu (pool to copy from) +## tes_pool_config.json (dest pool config file) +## +## You can also export and define these in your shell environment, and the script will use them as defaults. +## The values here won't work and are just for illustrative purposes. +## The WGET_URL will have a SAS token added on, so you do not need to add one. +## +## NOTE: You must be familiar with how to remove Azure Batch pools before using this script. The script never sizes down +## the pool, so you must do this manually. Otherwise you will be charged for the VMs in the pool (which will run forever). +## +## Dependencies: Azure CLI, Python3, jq, wget + python dependencies azure-batch, azure-mgmt-batch, azure-identity + +# If these variables are already defined in the environment, use them, otherwise use the defaults in this script +export SUBSCRIPTION_ID=${SUBSCRIPTION_ID:-a0e0e744-06b2-4fd3-9230-ebf8ef1ac4c8} +export RESOURCE_GROUP=${RESOURCE_GROUP:-test-coa4-southcentral-rg} +export WGET_URL=${WGET_URL:-https://cromwellsc95a88970e25.blob.core.windows.net/cromwell-executions/test/f7fd31e3-61e7-48b3-b895-8b291bbecbdb/call-hello/tes_task/batch_script} + +# If we were passed 3 or 5 arguments, use them: +export POOL_CONFIG_JSON="tes_pool_config.json" +if [ $# -eq 3 ]; then + export SUBSCRIPTION_ID=$1 + export RESOURCE_GROUP=$2 + export WGET_URL=$3 +elif [ $# -eq 5 ]; then + export SUBSCRIPTION_ID=$1 + export RESOURCE_GROUP=$2 + export WGET_URL=$3 + export TEMPLATE_POOL_ID=$4 + export POOL_CONFIG_JSON=$5 +fi + +export JOB_ID="DEBUG_TES_JOB" +export POOL_ID="DEBUG_TES_POOL" +export DEBUG_TASK_NAME="debug_task" +export VM_SIZE="Standard_D2s_v3" +export LOW_PRI_TARGET_NODES=1 + +echo "REMINDER: You must manually delete the pool after you are done debugging. The script never sizes down the pool." +echo "REMINDER: This means you will be charged for 1 low-pri node until you delete the pool!"
+echo -e "\n\n" + +echo "Adding a job to run the batch_script in: ${WGET_URL}" +echo -e "Subscription ID: \t$SUBSCRIPTION_ID" +echo -e "Resource Group: \t$RESOURCE_GROUP" +echo -e "Job ID: \t\t$JOB_ID" +echo -e "Pool ID: \t\t$POOL_ID" +echo -e "Task Name: \t\t$DEBUG_TASK_NAME" +echo -e "VM Size: \t\t$VM_SIZE" +echo -e "Pool Config JSON: \t$POOL_CONFIG_JSON" + +# Get the Azure Batch account in the resource group, error if there are more than 1 +function get_az_batch_account { + local RESOURCE_GROUP=$1 + local BATCH_ACCOUNTS=$(az batch account list --resource-group "$RESOURCE_GROUP" --query "[].name" --output tsv) + local NUM_BATCH_ACCOUNTS=$(echo "$BATCH_ACCOUNTS" | wc -l) + if [ "$NUM_BATCH_ACCOUNTS" -ne 1 ]; then + echo "Error: There must be exactly 1 Azure Batch account in the resource group. Found ${NUM_BATCH_ACCOUNTS}." + exit 1 + fi + echo "$BATCH_ACCOUNTS" +} +export BATCH_ACCOUNT_NAME=$(get_az_batch_account "$RESOURCE_GROUP") +echo -e "Azure Batch account: \t$BATCH_ACCOUNT_NAME" +export BATCH_ACCOUNT_URL=$(az batch account show --name "$BATCH_ACCOUNT_NAME" --resource-group "$RESOURCE_GROUP" --query "accountEndpoint" --output tsv) +echo -e "Azure Batch URL: \t$BATCH_ACCOUNT_URL" +echo -e "\n\n" + + +# Generate a user delegation SAS token for the batch_script +function add_sas_token_to_wget_url() { + local WGET_URL=$1 + STORAGE_ACCOUNT_NAME=$(echo "$WGET_URL" | cut -d'.' -f1 | cut -d'/' -f3) + CONTAINER_NAME=$(echo "$WGET_URL" | cut -d'/' -f4) + BLOB_NAME=$(echo "$WGET_URL" | cut -d'/' -f5-) + EXPIRY=$(date -u -d "+1 day" '+%Y-%m-%dT%H:%M:%SZ') + SAS_TOKEN=$(az storage blob generate-sas --account-name "$STORAGE_ACCOUNT_NAME" --container-name "$CONTAINER_NAME" --name "$BLOB_NAME" --permissions r --expiry "$EXPIRY" --https-only --output tsv --auth-mode login --as-user) + echo "${WGET_URL}?${SAS_TOKEN}" +} + +# If there's no SAS token in the WGET_URL, add one +if [[ $WGET_URL != *"?"* ]]; then + echo "Generating SAS token for batch_script URL" + export WGET_URL=$(add_sas_token_to_wget_url "$WGET_URL") +fi + +# Authenticate with Azure Batch account +az batch account login --name "$BATCH_ACCOUNT_NAME" --resource-group "$RESOURCE_GROUP" +if [ -n "$TEMPLATE_POOL_ID" ]; then + echo "Downloading pool config to template file: $POOL_CONFIG_JSON" + az batch pool show --account-name "$BATCH_ACCOUNT_NAME" --pool-id "$TEMPLATE_POOL_ID" > "$POOL_CONFIG_JSON" +fi + +# If the pool doesn't exist, create it +# NOTE: Do not assign a principalID or ClientID to the managed identity otherwise it will not be added to the pool +# If there's already a pool +POOL_RESULT="$(az batch pool show --pool-id "$POOL_ID" --query "id" --output tsv 2>/dev/null)" +if [ "$POOL_RESULT" == "$POOL_ID" ]; then + echo "The pool $POOL_ID already exists." +else + echo "The pool $POOL_ID does not exist or is not in a steady state. Creating the pool..." 
+ # The Azure cli doesn't offer enough support for the type of pool we want so we must use the python SDK + python3 </dev/null 2>&1 || az batch job create --id $JOB_ID --pool-id $POOL_ID + +# Get the task state +TASK_STATE=$(az batch task show --job-id $JOB_ID --task-id $DEBUG_TASK_NAME --query state -o tsv 2>/dev/null) + +# If the task exists and is not running, delete it +if [ "$TASK_STATE" ]; then + if [ "$TASK_STATE" != "running" ]; then + az batch task delete --job-id $JOB_ID --task-id $DEBUG_TASK_NAME --yes + fi +fi + +# Add a task to the job (we must use python3 so we can have the task run as admin) +echo "Adding task $DEBUG_TASK_NAME to job $JOB_ID" +export PRIMARY_KEY=$(az batch account keys list --name "$BATCH_ACCOUNT_NAME" --resource-group "$RESOURCE_GROUP" --query "primary" -o tsv) + +python3 < [IsUsGovernment]" exit 1 @@ -15,59 +20,86 @@ fi IMAGE_NAME=tes DOCKERFILE=Dockerfile-Tes -TAG=$(date +"%Y%m%d%H%M") - +TAG=$(date +%Y-%m-%d-%H-%M-%S) if [[ "$IS_US_GOVERNMENT" == "true" ]]; then ACR_LOGIN_SERVER="${ACR_NAME}.azurecr.us" # Adjusted for US Government cloud else ACR_LOGIN_SERVER="${ACR_NAME}.azurecr.io" # Default for public cloud fi - NEW_IMAGE="${ACR_LOGIN_SERVER}/${IMAGE_NAME}:${TAG}" -docker build -t $NEW_IMAGE -f $DOCKERFILE . --no-cache + +# Do the docker build step: +docker build -t "$NEW_IMAGE" -f "$DOCKERFILE" . +if [ $? -ne 0 ]; then + echo "Docker build failed" + exit 1 +fi echo "Built image: ${NEW_IMAGE}" -az login -az acr login --name $ACR_NAME + echo "Pushing image... ${NEW_IMAGE}" -docker push $NEW_IMAGE +# Check if we're already logged in +az account show > /dev/null 2>&1 +if [ $? -ne 0 ]; then + # We're not logged in, so run the az login command + az login +fi +az acr login --name "$ACR_NAME" +docker push "$NEW_IMAGE" + +echo -e "\n\nYou can manually run: kubectl set image deployment/tes tes=\"$NEW_IMAGE\" -n coa\n\n" + +echo "Attempting to update the AKS cluster with the new image..." +# Get the subscription ID of the resource group: +SUBSCRIPTION_ID=$(az group show --name "$RESOURCE_GROUP_NAME" --query "id" -o tsv | cut -d'/' -f3) +if [ -z "$SUBSCRIPTION_ID" ]; then + echo "Failed to get the subscription ID of the resource group $RESOURCE_GROUP_NAME." + exit 1 +fi + +echo "Setting active subscription to $SUBSCRIPTION_ID" +az account set --subscription "$SUBSCRIPTION_ID" # Get the first AKS cluster name in the specified resource group -AKS_CLUSTER_NAME=$(az aks list --resource-group $RESOURCE_GROUP_NAME --query '[0].name' -o tsv) +AKS_CLUSTER_NAME=$(az aks list --resource-group "$RESOURCE_GROUP_NAME" --subscription "$SUBSCRIPTION_ID" --query '[0].name' -o tsv) if [ -z "$AKS_CLUSTER_NAME" ]; then echo "No AKS cluster found in resource group $RESOURCE_GROUP_NAME." + echo "This identity does not have access to any AKS clusters in the specified resource group. Please make sure the identity has Kubernetes access." 
+ echo "az credential is: $(az account show -o yaml)" exit 1 fi - echo "Found AKS Cluster: $AKS_CLUSTER_NAME" # Get the managed identity client ID used by the AKS cluster -AKS_IDENTITY_CLIENT_ID=$(az aks show \ - --resource-group $RESOURCE_GROUP_NAME \ - --name $AKS_CLUSTER_NAME \ - --query identityProfile.kubeletidentity.clientId \ - -o tsv) +AKS_IDENTITY_CLIENT_ID=$(az aks show --resource-group "$RESOURCE_GROUP_NAME" --subscription "$SUBSCRIPTION_ID" --name "$AKS_CLUSTER_NAME" --query identityProfile.kubeletidentity.clientId -o tsv) +# If there's an error, the above command will return an empty string +if [ -z "$AKS_IDENTITY_CLIENT_ID" ]; then + echo "Failed to get the managed identity client ID used by the AKS cluster $AKS_CLUSTER_NAME." + exit 1 +fi # Get the ACR resource ID -ACR_RESOURCE_ID=$(az acr show \ - --name $ACR_NAME \ - --query id \ - -o tsv) +ACR_RESOURCE_ID=$(az acr show --subscription "$SUBSCRIPTION_ID" --name "$ACR_NAME" --query id -o tsv) +echo "ACR_RESOURCE_ID: $ACR_RESOURCE_ID" # Check if the AcrPull role assignment already exists EXISTING_ASSIGNMENT=$(az role assignment list \ - --assignee $AKS_IDENTITY_CLIENT_ID \ + --assignee "$AKS_IDENTITY_CLIENT_ID" \ --role acrpull \ - --scope $ACR_RESOURCE_ID \ + --scope "$ACR_RESOURCE_ID" \ --query [].id \ -o tsv) +if [ -z "$EXISTING_ASSIGNMENT" ]; then + echo "Failed to get the managed identity client ID used by the AKS cluster $AKS_CLUSTER_NAME." + exit 1 +fi if [ -z "$EXISTING_ASSIGNMENT" ]; then # Assign AcrPull role to the AKS cluster's managed identity for the ACR echo "Assigning AcrPull role to AKS..." az role assignment create \ - --assignee $AKS_IDENTITY_CLIENT_ID \ + --assignee "$AKS_IDENTITY_CLIENT_ID" \ --role acrpull \ - --scope $ACR_RESOURCE_ID + --scope "$ACR_RESOURCE_ID" echo "AcrPull role assigned to the AKS cluster successfully." else echo "AcrPull role assignment already exists. No action required." @@ -75,8 +107,8 @@ fi # Update the AKS cluster with the new TES image echo "Updating AKS with the new image..." -az aks get-credentials --resource-group $RESOURCE_GROUP_NAME --name $AKS_CLUSTER_NAME --overwrite-existing -kubectl set image deployment/tes tes=$NEW_IMAGE -n tes +az aks get-credentials --resource-group "$RESOURCE_GROUP_NAME" --subscription "$SUBSCRIPTION_ID" --name "$AKS_CLUSTER_NAME" --overwrite-existing +kubectl set image deployment/tes tes="$NEW_IMAGE" -n coa echo "Deployment complete for: $NEW_IMAGE" # Get logs of the new TES pod diff --git a/src/deploy-tes-on-azure/Configuration.cs b/src/deploy-tes-on-azure/Configuration.cs index 41af680fe..141bbb1d4 100644 --- a/src/deploy-tes-on-azure/Configuration.cs +++ b/src/deploy-tes-on-azure/Configuration.cs @@ -84,6 +84,7 @@ public abstract class UserAccessibleConfiguration public string TesUsername { get; set; } = "tes"; public string TesPassword { get; set; } public string AadGroupIds { get; set; } + public bool? 
AdvancedVmPerformanceMonitoringEnabled { get; set; } public string DeploymentOrganizationName { get; set; } public string DeploymentOrganizationUrl { get; set; } public string DeploymentContactUri { get; set; } diff --git a/src/deploy-tes-on-azure/Deployer.cs b/src/deploy-tes-on-azure/Deployer.cs index 198ef0031..299bf7c41 100644 --- a/src/deploy-tes-on-azure/Deployer.cs +++ b/src/deploy-tes-on-azure/Deployer.cs @@ -260,6 +260,14 @@ await Execute("Connecting to Azure Services...", async () => throw new ValidationException($"Could not retrieve account names from stored configuration in {storageAccountData.Name}.", displayExample: false); } + // validate update-once settings + { + if (configuration.BatchNodesSubnetId is not null && !string.IsNullOrEmpty(aksValues.TryGetValue("BatchNodesSubnetId", out var subnetId) ? subnetId : null)) + { + throw new ValidationException("'BatchNodesSubnetId' is already set.", displayExample: false); + } + } + if (aksValues.TryGetValue("EnableIngress", out var enableIngress) && aksValues.TryGetValue("TesHostname", out var tesHostname)) { kubernetesManager.TesHostname = tesHostname; @@ -1231,6 +1239,7 @@ private Dictionary ConfigureSettings(string managedIdentityClien // Additional non-personalized settings UpdateSetting(settings, defaults, "BatchNodesSubnetId", configuration.BatchNodesSubnetId); + UpdateSetting(settings, defaults, "AdvancedVmPerformanceMonitoringEnabled", configuration.AdvancedVmPerformanceMonitoringEnabled); UpdateSetting(settings, defaults, "DisableBatchNodesPublicIpAddress", configuration.DisableBatchNodesPublicIpAddress, b => b.GetValueOrDefault().ToString(), configuration.DisableBatchNodesPublicIpAddress.GetValueOrDefault().ToString()); UpdateSetting(settings, defaults, "DeploymentOrganizationName", configuration.DeploymentOrganizationName); UpdateSetting(settings, defaults, "DeploymentOrganizationUrl", configuration.DeploymentOrganizationUrl); @@ -2420,6 +2429,7 @@ void ValidateKubectlInstall(string kubectlPath, string featureName) ThrowIfProvidedForUpdate(configuration.VnetName, nameof(configuration.VnetName)); ThrowIfProvidedForUpdate(configuration.VnetResourceGroupName, nameof(configuration.VnetResourceGroupName)); ThrowIfProvidedForUpdate(configuration.SubnetName, nameof(configuration.SubnetName)); + ThrowIfProvidedForUpdate(configuration.BatchSubnetName, nameof(configuration.BatchSubnetName)); ThrowIfProvidedForUpdate(configuration.Tags, nameof(configuration.Tags)); ThrowIfTagsFormatIsUnacceptable(configuration.Tags, nameof(configuration.Tags)); diff --git a/src/deploy-tes-on-azure/KubernetesManager.cs b/src/deploy-tes-on-azure/KubernetesManager.cs index 288b0069c..f241cefea 100644 --- a/src/deploy-tes-on-azure/KubernetesManager.cs +++ b/src/deploy-tes-on-azure/KubernetesManager.cs @@ -416,6 +416,7 @@ private static void UpdateValuesFromSettings(HelmValues values, Dictionary ValuesToSettings(HelmValues values) ["BatchNodesSubnetId"] = GetValueOrDefault(batchNodes, "subnetId"), ["AksCoANamespace"] = GetValueOrDefault(values.Config, "coaNamespace") as string, ["DisableBatchNodesPublicIpAddress"] = GetValueOrDefault(batchNodes, "disablePublicIpAddress"), + ["AdvancedVmPerformanceMonitoringEnabled"] = GetValueOrDefault(batchNodes, "advancedVmPerformanceMonitoringEnabled"), ["UsePreemptibleVmsOnly"] = GetValueOrDefault(batchScheduling, "usePreemptibleVmsOnly"), ["Gen2BatchImageOffer"] = GetValueOrDefault(batchImageGen2, "offer"), ["Gen2BatchImagePublisher"] = GetValueOrDefault(batchImageGen2, "publisher"), diff --git 
a/src/deploy-tes-on-azure/scripts/env-04-settings.txt b/src/deploy-tes-on-azure/scripts/env-04-settings.txt index 8d9c7f971..8fafecc33 100644 --- a/src/deploy-tes-on-azure/scripts/env-04-settings.txt +++ b/src/deploy-tes-on-azure/scripts/env-04-settings.txt @@ -14,3 +14,4 @@ BatchPrefix={DefaultName} DrsHubUrl= GlobalStartTaskPath=/configuration/start-task.sh GlobalManagedIdentity= +AdvancedVmPerformanceMonitoringEnabled=false diff --git a/src/deploy-tes-on-azure/scripts/helm/templates/tes-deployment.yaml b/src/deploy-tes-on-azure/scripts/helm/templates/tes-deployment.yaml index f8320051e..3e951cdfd 100644 --- a/src/deploy-tes-on-azure/scripts/helm/templates/tes-deployment.yaml +++ b/src/deploy-tes-on-azure/scripts/helm/templates/tes-deployment.yaml @@ -58,6 +58,8 @@ spec: value: {{ .Values.config.batchAccount.accountName }} - name: BatchNodes__SubnetId value: {{ .Values.config.batchNodes.subnetId }} + - name: BatchNodes__AdvancedVmPerformanceMonitoringEnabled + value: {{ .Values.config.batchNodes.advancedVmPerformanceMonitoringEnabled | quote }} - name: BatchNodes__DisablePublicIpAddress value: {{ .Values.config.batchNodes.disablePublicIpAddress | quote }} - name: BatchNodes__GlobalStartTask diff --git a/src/deploy-tes-on-azure/scripts/helm/values-template.yaml b/src/deploy-tes-on-azure/scripts/helm/values-template.yaml index 3f6547a9a..998568384 100644 --- a/src/deploy-tes-on-azure/scripts/helm/values-template.yaml +++ b/src/deploy-tes-on-azure/scripts/helm/values-template.yaml @@ -13,6 +13,7 @@ config: accountName: RUNTIME_PARAMETER batchNodes: subnetId: RUNTIME_PARAMETER + advancedVmPerformanceMonitoringEnabled: RUNTIME_PARAMETER disablePublicIpAddress: RUNTIME_PARAMETER globalStartTask: /configuration/start-task.sh contentMD5: "False" diff --git a/src/docker_deploy.sh b/src/docker_deploy.sh new file mode 100644 index 000000000..383a72328 --- /dev/null +++ b/src/docker_deploy.sh @@ -0,0 +1,116 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +# This script builds a new TES image, pushes it to the ACR, gives AKS AcrPull on the ACR, and updates the TES deployment + +# if we are passed three arguments, use them as the resource group, ACR name, and US Government flag +if [ $# -eq 3 ]; then + RESOURCE_GROUP_NAME=$1 + ACR_NAME=$2 + IS_US_GOVERNMENT=$3 +fi +if [ -z "$RESOURCE_GROUP_NAME" ] || [ -z "$ACR_NAME" ]; then + echo "Usage: $0 [IsUsGovernment]" + exit 1 +fi + +IMAGE_NAME=tes +DOCKERFILE=Dockerfile-Tes +TAG=$(date +%Y-%m-%d-%H-%M-%S) +if [[ "$IS_US_GOVERNMENT" == "true" ]]; then + ACR_LOGIN_SERVER="${ACR_NAME}.azurecr.us" # Adjusted for US Government cloud +else + ACR_LOGIN_SERVER="${ACR_NAME}.azurecr.io" # Default for public cloud +fi +NEW_IMAGE="${ACR_LOGIN_SERVER}/${IMAGE_NAME}:${TAG}" + +# Do the docker build step: +docker build -t "$NEW_IMAGE" -f "$DOCKERFILE" . +echo "Built image: ${NEW_IMAGE}" + +echo "Pushing image... ${NEW_IMAGE}" +# Check if we're already logged in +az account show > /dev/null 2>&1 +if [ $? -ne 0 ]; then + # We're not logged in, so run the az login command + az login +fi +az acr login --name "$ACR_NAME" +docker push "$NEW_IMAGE" + +echo -e "\n\nYou can manually run: kubectl set image deployment/tes tes=\"$NEW_IMAGE\" -n coa\n\n" + +echo "Attempting to update the AKS cluster with the new image..."
+# Get the subscription ID of the resource group: +SUBSCRIPTION_ID=$(az group show --name "$RESOURCE_GROUP_NAME" --query "id" -o tsv | cut -d'/' -f3) +if [ -z "$SUBSCRIPTION_ID" ]; then + echo "Failed to get the subscription ID of the resource group $RESOURCE_GROUP_NAME." + exit 1 +fi + +echo "Setting active subscription to $SUBSCRIPTION_ID" +az account set --subscription "$SUBSCRIPTION_ID" + +# Get the first AKS cluster name in the specified resource group +AKS_CLUSTER_NAME=$(az aks list --resource-group "$RESOURCE_GROUP_NAME" --subscription "$SUBSCRIPTION_ID" --query '[0].name' -o tsv) +if [ -z "$AKS_CLUSTER_NAME" ]; then + echo "No AKS cluster found in resource group $RESOURCE_GROUP_NAME." + echo "This identity does not have access to any AKS clusters in the specified resource group. Please make sure the identity has Kubernetes access." + echo "az credential is: $(az account show -o yaml)" + exit 1 +fi +echo "Found AKS Cluster: $AKS_CLUSTER_NAME" + +# Get the managed identity client ID used by the AKS cluster +AKS_IDENTITY_CLIENT_ID=$(az aks show --resource-group "$RESOURCE_GROUP_NAME" --subscription "$SUBSCRIPTION_ID" --name "$AKS_CLUSTER_NAME" --query identityProfile.kubeletidentity.clientId -o tsv) +# If there's an error, the above command will return an empty string +if [ -z "$AKS_IDENTITY_CLIENT_ID" ]; then + echo "Failed to get the managed identity client ID used by the AKS cluster $AKS_CLUSTER_NAME." + exit 1 +fi + +# Get the ACR resource ID +ACR_RESOURCE_ID=$(az acr show --subscription "$SUBSCRIPTION_ID" --name "$ACR_NAME" --query id -o tsv) +echo "ACR_RESOURCE_ID: $ACR_RESOURCE_ID" + +# Check if the AcrPull role assignment already exists +EXISTING_ASSIGNMENT=$(az role assignment list \ + --assignee "$AKS_IDENTITY_CLIENT_ID" \ + --role acrpull \ + --scope "$ACR_RESOURCE_ID" \ + --query [].id \ + -o tsv) + +if [ -z "$EXISTING_ASSIGNMENT" ]; then + # Assign AcrPull role to the AKS cluster's managed identity for the ACR + echo "Assigning AcrPull role to AKS..." + az role assignment create \ + --assignee "$AKS_IDENTITY_CLIENT_ID" \ + --role acrpull \ + --scope "$ACR_RESOURCE_ID" + echo "AcrPull role assigned to the AKS cluster successfully." +else + echo "AcrPull role assignment already exists. No action required." +fi + +# Update the AKS cluster with the new TES image +echo "Updating AKS with the new image..."
+az aks get-credentials --resource-group "$RESOURCE_GROUP_NAME" --subscription "$SUBSCRIPTION_ID" --name "$AKS_CLUSTER_NAME" --overwrite-existing +kubectl set image deployment/tes tes="$NEW_IMAGE" -n coa +echo "Deployment complete for: $NEW_IMAGE" + +# Get logs of the new TES pod +# kubectl get pods -n tes | awk '{print $1}' | xargs -I {} kubectl logs -n tes {} + +# Run a test task and get it's status (Get these from TesCredentials.json after running deploy-tes-on-azure) +# TesHostname="REMOVED" +# TesPassword="REMOVED" + +# response=$(curl -u "tes:$TesPassword" -H "Content-Type: application/json" -X POST -d '{"resources": {"cpu_cores": 1, "ram_gb": 1},"executors":[{"image":"ubuntu","command":["/bin/sh","-c","cat /proc/sys/kernel/random/uuid"]}]}' "https://$TesHostname/v1/tasks") +# taskId=$(echo $response | jq -r '.id') +# curl -u "tes:$TesPassword" -H "Content-Type: application/json" -X GET "https://$TesHostname/v1/tasks/$taskId?view=full" diff --git a/src/tes_pool_config.json b/src/tes_pool_config.json new file mode 100644 index 000000000..04927c5bd --- /dev/null +++ b/src/tes_pool_config.json @@ -0,0 +1,92 @@ +{ + "allocationState": "steady", + "allocationStateTransitionTime": "2024-03-23T06:46:12.450896+00:00", + "applicationLicenses": null, + "applicationPackageReferences": null, + "autoScaleEvaluationInterval": "0:05:00", + "autoScaleFormula": "$NodeDeallocationOption = taskcompletion;\nlifespan = time() - time(\"Fri, 22 Mar 2024 22:14:45 GMT\");\nspan = TimeInterval_Second * 90;\nstartup = TimeInterval_Minute * 2;\nratio = 10;\n$TargetLowPriorityNodes = (lifespan > startup ? min($PendingTasks.GetSample(span, ratio)) : 1);", + "autoScaleRun": { + "error": null, + "results": "$TargetDedicatedNodes=0;$TargetLowPriorityNodes=0;$NodeDeallocationOption=taskcompletion;lifespan=4d22h14m53.7821283s;ratio=10;span=1m30s;startup=2m", + "timestamp": "2024-03-27T20:29:38.783253+00:00" + }, + "certificateReferences": null, + "cloudServiceConfiguration": null, + "creationTime": "2024-03-22T22:14:46.378367+00:00", + "currentDedicatedNodes": 0, + "currentLowPriorityNodes": 0, + "currentNodeCommunicationMode": null, + "displayName": "OY5BKMMX:Standard_A1_v2:True:/subscriptions/a0e0e744-06b2-4fd3-9230-ebf8ef1ac4c8/resourcegroups/test-coa4-southcentral-rg/providers/Microsoft.ManagedIdentity/userAssignedIdentities/test-coa4-southcentral-rg-identity", + "eTag": "0x8DC4B04BB1CADBE", + "enableAutoScale": true, + "enableInterNodeCommunication": false, + "id": "TES-OY5BKMMX-A1_v2-2m4tvpnjrgv74kjiyxtffht2mqzd2nqn-yhlj3wwu", + "identity": { + "type": "UserAssigned", + "userAssignedIdentities": [ + { + "clientId": "261634f9-6e34-4716-997b-21e5311ea45a", + "principalId": "05ed1543-d6bd-4eda-b414-f4856c30b4f6", + "resourceId": "/subscriptions/a0e0e744-06b2-4fd3-9230-ebf8ef1ac4c8/resourcegroups/test-coa4-southcentral-rg/providers/Microsoft.ManagedIdentity/userAssignedIdentities/test-coa4-southcentral-rg-identity" + } + ] + }, + "lastModified": "2024-03-23T06:44:47.093907+00:00", + "metadata": [ + { + "name": "CoA-TES-HostName", + "value": "OY5BKMMX" + }, + { + "name": "CoA-TES-IsDedicated", + "value": "False" + } + ], + "mountConfiguration": null, + "networkConfiguration": { + "dynamicVnetAssignmentScope": "none", + "enableAcceleratedNetworking": false, + "endpointConfiguration": null, + "publicIpAddressConfiguration": { + "ipAddressIds": null, + "provision": "batchmanaged" + }, + "subnetId": 
"/subscriptions/a0e0e744-06b2-4fd3-9230-ebf8ef1ac4c8/resourceGroups/test-coa4-southcentral-rg/providers/Microsoft.Network/virtualNetworks/cromwellsc-67645/subnets/default" + }, + "odata.metadata": "https://cromwellsc19387.southcentralus.batch.azure.com/$metadata#pools/@Element", + "resizeErrors": null, + "resizeTimeout": "0:15:00", + "startTask": null, + "state": "active", + "stateTransitionTime": "2024-03-22T22:14:46.378367+00:00", + "stats": null, + "targetDedicatedNodes": 0, + "targetLowPriorityNodes": 0, + "targetNodeCommunicationMode": "simplified", + "taskSchedulingPolicy": { + "nodeFillType": "spread" + }, + "taskSlotsPerNode": 1, + "url": "https://cromwellsc19387.southcentralus.batch.azure.com/pools/TES-OY5BKMMX-A1_v2-2m4tvpnjrgv74kjiyxtffht2mqzd2nqn-yhlj3wwu", + "userAccounts": null, + "virtualMachineConfiguration": { + "containerConfiguration": null, + "dataDisks": null, + "diskEncryptionConfiguration": null, + "extensions": null, + "imageReference": { + "exactVersion": null, + "offer": "ubuntu-server-container", + "publisher": "microsoft-azure-batch", + "sku": "20-04-lts", + "version": "latest", + "virtualMachineImageId": null + }, + "licenseType": null, + "nodeAgentSkuId": "batch.node.ubuntu 20.04", + "nodePlacementConfiguration": null, + "osDisk": null, + "windowsConfiguration": null + }, + "vmSize": "standard_a1_v2" +} diff --git a/src/vm_monitor/README.md b/src/vm_monitor/README.md new file mode 100644 index 000000000..e45ae116e --- /dev/null +++ b/src/vm_monitor/README.md @@ -0,0 +1,67 @@ +# TES Performance Metrics Data Collection + +Performance stats are collected on a per Azure Virtual Machine basis using the Telegraf collection agent. Data are written to the `///` directory as an Azure Append Blob. On initialization a snapshot of the system is captured and flushed to the Append Blob, there after performance metrics are aggregated and flushed every 2 minutes (set by `Agent.flush_interval`). + +Telegraf provides a set of input, output, and aggregator plugins. To limit the binary size for Telegraf we custom compile and strip a Telegraf binary going from ~220MB to ~21MB but having only the plugins we explicitly use available. + +Azure Append Blob output plugins are provided by a custom Go Telegraf plugin `azure_append_blob` with some TES specific logic (i.e., this is not intended to be a generic Append Blob plugin) + +Additional one-time stats are collected using Telegraf's shell `exec` plugin and a set of bash and Python3 scripts to extract information from: + +* `/proc/cpuinfo` +* `lscpu` +* system boot time (with sub second accuracy) +* unsupported (hacked) metrics from the Azure Batch Agent logs +* Azure Instance Metadata Service (IMDS) +* `nvme` block devices + +## Telegraf custom binary building + +Use the `build_tes_deployment_archive.sh` script to build a locally setup telegraf repo. Or call `telegraf/build_telegraf_remote.sh` to build on a remote Azure Batch VM. Otherwise you can follow the steps below: + +1. Git clone Telegraf repo +1. Apply patches to add azure_append_blob +1. `make build_tools` to build the Telegraf custom_builder +1. Have Telegraf read our config file to build only the needed plugins. We export a LDFLAG to strip and remove debug information from the resulting static Go binary. This saves ~50% of the executable size. 
`export LDFLAGS="-s -w"; ./tools/custom_builder/custom_builder --config ./azure_append_test.config` + +## Configuring Telegraf dynamically + +Some plugins, like the AMD ROCm GPU and Nvidia-SMI GPU input plugins, have a 'startup_error_behavior' configuration option. When set to `ignore`, the plugin won't capture data if the VM doesn't support that input (e.g. it doesn't have a GPU). + +The `start_vm_node_monitoring.sh` script can be modified to make changes to the Telegraf config file. For example, `infiniband` plugins should only be enabled on InfiniBand-equipped machines. + +Note that the `file` input is used to read a comment-stripped version of the config file into the output log once. + +## Upload .tar.gz data to tes-internal + +Use `build_tes_deployment_archive.sh` to collect the scripts and telegraf binary into a single gzipped archive. Then upload this archive to the `tes-internal` container. A modification to the `batch_script` used to run each task will need to be made to download and start data collection. + +## Collection process + +On download, the `start_vm_node_monitoring.sh` script bootstraps the Telegraf logging process. The intent is that a node which gets multiple tasks will gracefully move from one monitoring session to another. That is, if you have a monitor running for the current task, it will keep going after the current task ends. When a new task is picked up, it will start a new logging session and ask (with increasing insistence) for the previous logger to end. + +If the node terminates, telegraf will attempt to flush data but will most likely fail. This means the recorded node runtime is roughly within 2 minutes (`Agent.flush_interval` + append blob upload time) of the actual node runtime. + +Azure VMs are typically shut down with an ACPI power-off style signal, so there is no time to respond or flush the final set of outputs. IMDS Scheduled Events do not help with deallocation signaling. Azure Batch machines power off immediately before the guest has time to receive a Scheduled Event. + +## What gets run on a Batch Node? + +On the first task run, the `/cromwell-executions/.../tes_task/batch_script` will contain code to download the `tes_vm_monitor.tar.gz` archive built by `build_tes_deployment_archive.sh`. Next, it will extract, chmod, and run the `start_vm_node_monitoring.sh` script in the background. + +### `start_vm_node_monitoring.sh` + +This script performs two main tasks. It is reentrant provided you have a new `TES_TASK_NAME`; otherwise you can end up with multiple screens running with the same `TES_TASK_NAME`. The function `keep_latest_telegraf_screen` will attempt to ask other sessions to end and then kill them. + +1. Extracts the `tes_vm_monitor.tar.gz` archive into `${AZ_BATCH_TASK_DIR}/tes_vm_monitor/` (typically `/mnt/cromwell/tes_vm_monitor`). It makes sure the telegraf binary is executable, along with all the .sh scripts. It will also take the .conf files and create cleaned versions of them (with all comments removed) to use for logging. These `.clean.conf` files are written to the Telegraf output. + +2. Prepares a running environment for the current Telegraf monitoring instance (i.e. serializing variables for the launcher `tes_vm_monitor/run_telegraf.sh` to load), launches the current monitor, and then looks to see if any other monitors are running that should be terminated. + + 1. For other sessions that have the same `TES_TASK_NAME`, it will send them a `ctrl-c` command to gracefully flush and exit, wait 10s for the flush to happen, and then kill the other processes. + + 1.
For sessions that start with `TELEGRAF__` that aren't for the same `TES_TASK_NAME`, it sends `ctrl-c` to ask the program to gracefully flush and exit, waits ~30s, then asks the screen to `quit` and also gracefully exit, waits another 5s, and then terminates the other screen processes using `SIGKILL`. + +### `run_telegraf.sh` + +This script launches two instances of Telegraf, one after another. First it launches the `${ONE_TIME_CONFIG_CLEAN}` version of the config, which should include some one-time system monitoring (i.e., running `collect_tes_perf.sh`, which should run the other IMDS, nvme, lscpu, /proc/cpu, sub-second boot time, etc. scripts). + +Then it launches a version of `${CONTINUOUS_CONFIG_CLEAN}`, which should be the long-running performance monitoring session. `run_telegraf.sh` will attempt to restart telegraf if it dies. There is a `check_iteration_rate` function that does a simple 5 minute sleep if telegraf keeps on dying. diff --git a/src/vm_monitor/build_tes_deployment_archive.sh b/src/vm_monitor/build_tes_deployment_archive.sh new file mode 100644 index 000000000..390dba8bd --- /dev/null +++ b/src/vm_monitor/build_tes_deployment_archive.sh @@ -0,0 +1,113 @@ +#!/bin/bash + +# If an argument for a storage account URL is provided we'll upload the archive to the storage account: +if [ -n "$1" ]; then + # Remove trailing '/' if present: + export STORAGE_ACCOUNT_URL="${1%/}" + echo "Will upload archive to $STORAGE_ACCOUNT_URL" +fi + +# Create the archive for TES Performance monitoring: +if [ -z ${TELEGRAD_BUILD_DIR+x} ]; then + export TELEGRAD_BUILD_DIR="$HOME/telegraf" +fi +if [ -z ${AGENT_PERF_SCRIPT_DIR+x} ]; then + export AGENT_PERF_SCRIPT_DIR="$PWD/vm_monitor_scripts" +fi +ARCHIVE_NAME="tes_vm_monitor" +export TELEGRAF_BUILD_FILENAME="" + +function build_telegraf() { + local return_pwd + return_pwd=$(pwd) + + # Build telegraf from source: + cd "$TELEGRAD_BUILD_DIR" || exit 1 + rm -f ./telegraf + make build_tools + export LDFLAGS="-s -w" + if ! ./tools/custom_builder/custom_builder -tags -config "${AGENT_PERF_SCRIPT_DIR}/tes_vm_monitor.once.conf" -config "${AGENT_PERF_SCRIPT_DIR}/tes_vm_monitor.continuous.conf" -config "${AGENT_PERF_SCRIPT_DIR}/tes_vm_monitor.dummy.conf"; then + echo "Error: Failed to run custom builder" + exit 1 + fi + # Make sure a telegraf binary was built: + if [ ! -f ./telegraf ]; then + echo "Error: telegraf binary not found at $TELEGRAD_BUILD_DIR/telegraf" + exit 1 + fi + # Print stats on telegraf binary size: + ls -lh ./telegraf + # Print stats on compressed size vs built size: + GZIP_SZ=$(gzip --best -c ./telegraf | wc -c) + SIZE=$(stat -c %s ./telegraf) + GZIP_SZ_MB=$(echo "scale=2; $GZIP_SZ / 1024 / 1024" | bc) + SIZE_MB=$(echo "scale=2; $SIZE / 1024 / 1024" | bc) + REDUCTION=$(echo "scale=2; (1 - $GZIP_SZ / $SIZE) * 100" | bc) + echo "telegraf binary size: $SIZE_MB MB, compressed: $GZIP_SZ_MB MB, size reduction: $REDUCTION%" + TELEGRAF_BUILD_FILENAME=$(find "${TELEGRAD_BUILD_DIR}" -maxdepth 1 -type f -name "telegraf" -print0) + cd "$return_pwd" || exit 1 +} + +function build_telegraf_remote(){ + local return_pwd + return_pwd=$(pwd) + + cd ./telegraf || exit 1 + ./build_telegraf_remote.sh + TELEGRAF_BUILD_FILENAME="$PWD/telegraf" + cd "$return_pwd" || exit 1 +} + +# Create a tar archive of both the telegraf binary and the tes scripts: +echo "$TELEGRAD_BUILD_DIR" +build_telegraf_remote + +# Use find to gather the files, and then tar them into an archive. Transform +# the paths to remove the leading directories.
+rm -f "${ARCHIVE_NAME}.tar" +rm -f "${ARCHIVE_NAME}.tar.gz" +# If the telegraf binary was not built, error out: +if [ ! -f "$TELEGRAF_BUILD_FILENAME" ]; then + echo "Error: telegraf binary not found" + exit 1 +fi +( printf "%s\0" "$TELEGRAF_BUILD_FILENAME" ; find "${AGENT_PERF_SCRIPT_DIR}" -type f -print0 ) | tar --null -cvf "${ARCHIVE_NAME}.tar" --transform 's,^.*/,,S' -T - +# Compress using gzip for size and decompression speed: +gzip --best "${ARCHIVE_NAME}.tar" + +# Print stats on archive size: +SIZE=$(stat -c %s "${ARCHIVE_NAME}.tar.gz") +SIZE_MB=$(echo "scale=2; $SIZE / 1024 / 1024" | bc) +echo "Total archive size: $SIZE_MB MB" + +# Print the contents of the archive: +echo -e "\nArchive contents:" +tar -tvf "${ARCHIVE_NAME}.tar.gz" +echo -e "\n${ARCHIVE_NAME}.tar.gz created" + +# If a storage account URL was provided, upload the archive to the storage account: +if [ -n "$STORAGE_ACCOUNT_URL" ]; then + # Remove trailing '/' if present: + STORAGE_ACCOUNT_URL="${STORAGE_ACCOUNT_URL%/}" + echo "Uploading archive to $STORAGE_ACCOUNT_URL/${ARCHIVE_NAME}.tar.gz" + + # # Use azcopy to upload the archive to the storage account: + # azcopy cp "${ARCHIVE_NAME}.tar.gz" "$STORAGE_ACCOUNT_URL/${ARCHIVE_NAME}.tar.gz" + + # Parse the storage account name and container name from the URL, upload with az cli + STORAGE_ACCOUNT_NAME=$(echo "$STORAGE_ACCOUNT_URL" | awk -F/ '{print $3}' | awk -F. '{print $1}') + CONTAINER_NAME=$(echo "$STORAGE_ACCOUNT_URL" | awk -F/ '{print $4}') + az storage blob upload --overwrite --account-name "$STORAGE_ACCOUNT_NAME" --container-name "$CONTAINER_NAME" --name "${ARCHIVE_NAME}.tar.gz" --type block --file "${ARCHIVE_NAME}.tar.gz" --auth-mode login +fi + +# bzip2 saves ~0.5MB, but takes 4s to decompress +# > hyperfine --prepare 'cp ./telegraf.tar.bz2 test.tar.bz2; rm -f test.tar' 'bzip2 -d test.tar.bz2' +# Benchmark 1: bzip2 -d test.tar.bz2 +# Time (mean ± σ): 4.107 s ± 0.162 s [User: 1.481 s, System: 0.165 s] +# Range (min … max): 3.820 s … 4.334 s 10 runs +# +# gzip costs a bit in archive size, but decompresses in 0.6s +# > hyperfine --prepare 'cp ./telegraf.tar.gz test.tar.gz; rm -f test.tar' 'gzip -d test.tar.gz' +# Benchmark 1: gzip -d test.tar.gz +# Time (mean ± σ): 643.0 ms ± 56.6 ms [User: 228.1 ms, System: 36.1 ms] +# Range (min … max): 596.0 ms … 741.5 ms 10 runs diff --git a/src/vm_monitor/telegraf/Dockerfile b/src/vm_monitor/telegraf/Dockerfile new file mode 100644 index 000000000..04ef079ab --- /dev/null +++ b/src/vm_monitor/telegraf/Dockerfile @@ -0,0 +1,43 @@ +# Get the latest telegraf release tag: +FROM ubuntu:latest AS fetcher +RUN apt-get update && apt-get install -y curl jq patch +WORKDIR /app +RUN curl --silent "https://api.github.com/repos/influxdata/telegraf/releases/latest" | jq -r .tag_name > /app/latest_release +RUN echo "Latest telegraf release: $(cat /app/latest_release)" + +# Build a custom telegraf plugin with the Azure Append Blob output plugin enabled +# Also enable changes to the filestat plugin for time filtering +FROM golang:latest AS builder +WORKDIR /app +COPY --from=fetcher /app/latest_release ./latest_release +RUN git clone https://github.com/influxdata/telegraf.git /app/telegraf/ +WORKDIR /app/telegraf +RUN echo "Fetching the latest release: $(cat /app/latest_release)" +RUN git checkout tags/$(cat /app/latest_release) + +# Add the Azure Append Blob patches: +COPY ./azure_append_blob/ /app/telegraf/plugins/outputs/azure_append_blob/ +COPY ./all/azure_append_blob.go /app/telegraf/plugins/outputs/all +# Copy and apply the filestat patch 
(skipping in favor of dedup for now): +# WORKDIR /app/telegraf +# RUN apt-get update && apt-get install -y patch +# COPY ./filestat.diff /app/telegraf/filestat.diff +# RUN patch -p1 < ./filestat.diff + +# NOTE: We explicitly are not building the entire telegraf code base as we only +# need custom_builder + the plugins it finds we need from telegraf.dummy.conf +# RUN make + +# Fix diskio +RUN sed -i 's|sysBlockPath = "/sys/block/" + devName|sysBlockPath = "/sys/class/block/" + devName|g' /app/telegraf/plugins/inputs/diskio/diskio_linux.go +RUN make build_tools + +# Build the custom telegraf binary (copy of configs is here for better docker caching) +COPY ./telegraf.dummy.conf /app/telegraf/telegraf.dummy.conf +COPY ./tes_vm_monitor.once.conf /app/telegraf/tes_vm_monitor.once.conf +COPY ./tes_vm_monitor.continuous.conf /app/telegraf/tes_vm_monitor.continuous.conf +RUN export LDFLAGS="-s -w"; ./tools/custom_builder/custom_builder --config ./tes_vm_monitor.once.conf --config ./tes_vm_monitor.continuous.conf --config ./telegraf.dummy.conf + +# Further shrink the size of telegraf: +WORKDIR /app/telegraf +RUN strip -s -x ./telegraf diff --git a/src/vm_monitor/telegraf/all/azure_append_blob.go b/src/vm_monitor/telegraf/all/azure_append_blob.go new file mode 100644 index 000000000..0767135bd --- /dev/null +++ b/src/vm_monitor/telegraf/all/azure_append_blob.go @@ -0,0 +1,5 @@ +//go:build !custom || outputs || outputs.azure_append_blob + +package all + +import _ "github.com/influxdata/telegraf/plugins/outputs/azure_append_blob" // register plugin diff --git a/src/vm_monitor/telegraf/azure_append_blob/README.md b/src/vm_monitor/telegraf/azure_append_blob/README.md new file mode 100644 index 000000000..b37a30109 --- /dev/null +++ b/src/vm_monitor/telegraf/azure_append_blob/README.md @@ -0,0 +1,60 @@ +# Azure Append Blob Output Plugin + +This plugin writes telegraf metrics to an append blob on an Azure Storage Account. +Plugin was developed using the File Output plugin as a base with additions from +the Azure Monitor output. Note that test does not work and the plugin is very +specific to the GA4GH TES monitoring task it is being used for. This plugin +would require significant time investment to make it an official plugin. + +Note that output + +## Global configuration options + +In addition to the plugin-specific configuration settings, plugins support +additional global and plugin configuration settings. These settings are used to +modify metrics, tags, and field or create aliases and configure ordering, etc. +See the [CONFIGURATION.md][CONFIGURATION.md] for more details. + +[CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins + +## Configuration + +```toml @sample.conf +# Send metrics to an Azure Storage Account using an append blob +[[outputs.azure_append_blob]] + ## Azure Storage Account destination is specified in 4 parts, the storage account, + ## the azure_endpoint (optional), the container name, and the path to where the blobs + ## will be written. By default this plugin assumes it will be writing files called + ## "vm_metrics.%d.json". So output_path should be a directory. + storage_account_name = "myStorageAccountName" + container_name = "data" + output_path = "/workflow/task_name/iteration/" + + ## Use batch serialization format instead of line based delimiting. The + ## batch format allows for the production of non line based output formats and + ## may more efficiently encode and write metrics. + # use_batch_format = false + + ## Data format to output. 
+ ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md + data_format = "influx" + + ## Compress output data with the specified algorithm. + ## If empty, compression will be disabled and files will be plain text. + ## Supported algorithms are "zstd", "gzip" and "zlib". + # compression_algorithm = "" + + ## Compression level for the algorithm above. + ## Please note that different algorithms support different levels: + ## zstd -- supports levels 1, 3, 7 and 11. + ## gzip -- supports levels 0, 1 and 9. + ## zlib -- supports levels 0, 1, and 9. + ## By default the default compression level for each algorithm is used. + # compression_level = -1 + + ## Optionally, if in Azure US Government, China, or other sovereign + ## cloud environment, set the appropriate endpoint + # azure_endpoint = "blob.core.usgovcloudapi.net" +``` diff --git a/src/vm_monitor/telegraf/azure_append_blob/azure_append_blob.go b/src/vm_monitor/telegraf/azure_append_blob/azure_append_blob.go new file mode 100644 index 000000000..77afe98d5 --- /dev/null +++ b/src/vm_monitor/telegraf/azure_append_blob/azure_append_blob.go @@ -0,0 +1,430 @@ +//go:generate ../../../tools/readme_config_includer/generator +package azure_append_blob + +// TODO: Clean-up start-up logic and filename creation. Current process is fairly slow and inefficient. + +import ( + "bytes" + "context" + _ "embed" + "fmt" + "net/url" + "regexp" + "strconv" + "strings" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/internal" + "github.com/influxdata/telegraf/plugins/outputs" + "github.com/influxdata/telegraf/plugins/serializers" + + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/appendblob" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/bloberror" + "github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/container" +) + +//go:embed sample.conf +var sampleConfig string + +type AzureAppendBlob struct { + StorageAccountName string `toml:"storage_account_name"` + ContainerName string `toml:"container_name"` + OutputPath string `toml:"output_path"` + UseBatchFormat bool `toml:"use_batch_format"` + CompressionAlgorithm string `toml:"compression_algorithm"` + CompressionLevel int `toml:"compression_level"` + Log telegraf.Logger `toml:"-"` + AzureEndpoint string `toml:"azure_endpoint"` + + encoder internal.ContentEncoder + serializer serializers.Serializer + cred *azidentity.DefaultAzureCredential + client *appendblob.Client + appendBlobOutputNum int + appendBlobRetryCount int + appendBlobNameTemplate string + containerURL string + appendBlobURL string + baseAppendBlobURL string +} + +const ( + defaultAzureEndpoint = "blob.core.windows.net" + appendBlobBaseName = "vm_perf_metrics" + appendBlobNameBaseTemplate = ".%d.json" + appendBlobWriteRetryLimit = 3 + appendBlobMaxBlocks = 50000 + + urlTemplate = "https://%s.%s/%s" +) + +func (*AzureAppendBlob) SampleConfig() string { + return sampleConfig +} + +func (f *AzureAppendBlob) SetSerializer(serializer serializers.Serializer) { + f.serializer = serializer +} + +func (f *AzureAppendBlob) Init() error { + var err error + f.appendBlobOutputNum = -1 + f.appendBlobRetryCount = 0 + if f.AzureEndpoint == "" { + f.AzureEndpoint = defaultAzureEndpoint + } + if f.StorageAccountName == "" { + return fmt.Errorf("storage_account_name is required") + } + if f.ContainerName == "" { + return fmt.Errorf("container_name is 
required") + } + if f.OutputPath == "" { + return fmt.Errorf("OutputPath is required") + } + + var options []internal.EncodingOption + if f.CompressionAlgorithm == "" { + f.CompressionAlgorithm = "identity" + } + + // Set appendBlob naming convention based on compression algorithm + switch f.CompressionAlgorithm { + case "zstd": + f.appendBlobNameTemplate = strings.Replace(appendBlobNameBaseTemplate, ".json", ".zst", 1) + case "gzip": + f.appendBlobNameTemplate = strings.Replace(appendBlobNameBaseTemplate, ".json", ".gz", 1) + case "zlib": + f.appendBlobNameTemplate = strings.Replace(appendBlobNameBaseTemplate, ".json", ".zlib", 1) + default: + f.appendBlobNameTemplate = appendBlobNameBaseTemplate + } + + if f.CompressionLevel >= 0 { + options = append(options, internal.WithCompressionLevel(f.CompressionLevel)) + } + f.encoder, err = internal.NewContentEncoder(f.CompressionAlgorithm, options...) + + return err +} + +func (f *AzureAppendBlob) Connect() error { + err := f.createAppendBlobLogFile() + return err +} + +func (f *AzureAppendBlob) Close() error { + var err error + return err +} + +func (f *AzureAppendBlob) Write(metrics []telegraf.Metric) error { + var writeErr error + + if f.UseBatchFormat { + octets, err := f.serializer.SerializeBatch(metrics) + if err != nil { + f.Log.Debugf("Could not serialize metric: %v", err) + } + + octets, err = f.encoder.Encode(octets) + if err != nil { + f.Log.Debugf("Could not compress metrics: %v", err) + } + + err = f.WriteToAppendBlob(NewByteReadSeekCloser(octets)) + if err != nil { + f.Log.Debugf("Error writing to file: %v", err) + } + } else { + for _, metric := range metrics { + b, err := f.serializer.Serialize(metric) + if err != nil { + f.Log.Debugf("Could not serialize metric: %v", err) + } + + b, err = f.encoder.Encode(b) + if err != nil { + f.Log.Debugf("Could not compress metrics: %v", err) + } + + err = f.WriteToAppendBlob(NewByteReadSeekCloser(b)) + if err != nil { + f.Log.Debugf("Error writing to write message: %w", err) + } + } + } + + return writeErr +} + +func init() { + outputs.Add("azure_append_blob", func() telegraf.Output { + return &AzureAppendBlob{ + CompressionLevel: -1, + } + }) +} + +// Sanitize the file input from the config, it should start with a slash +func (f *AzureAppendBlob) sanitizeAppendBlobFilename() string { + appendBlobFilename := f.OutputPath + if f.OutputPath[0] != '/' { + appendBlobFilename = "/" + f.OutputPath + } + return appendBlobFilename +} + +func isValidURL(toTest string) error { + _, err := url.ParseRequestURI(toTest) + return err +} + +func (f *AzureAppendBlob) buildBlobURL(outputNumber ...int) error { + appendBlobFilename := f.sanitizeAppendBlobFilename() + + var blobNumber int + if len(outputNumber) > 0 { + blobNumber = outputNumber[0] + } else { + blobNumber = f.appendBlobOutputNum + } + + // Build our append blob URL, the file path is provided as a config input path + potential blob prefix + // We use the currentAppendBlobNum to track the current append blob number + f.containerURL = fmt.Sprintf(urlTemplate, f.StorageAccountName, f.AzureEndpoint, f.ContainerName) + // https://.blob.core.windows.net//"vm_perf_metrics" + f.baseAppendBlobURL = fmt.Sprintf("%s%s%s", f.containerURL, appendBlobFilename, appendBlobBaseName) + // https://.blob.core.windows.net//"vm_perf_metrics.%d.json" + f.appendBlobURL = fmt.Sprintf("%s%s", f.baseAppendBlobURL, fmt.Sprintf(f.appendBlobNameTemplate, blobNumber)) + + // Validate the URL even though we were provided the components to build a blobURL + if err := 
isValidURL(f.appendBlobURL); err != nil { + return fmt.Errorf("Invalid URL: %s, error: %v", f.appendBlobURL, err) + } + return nil +} + +func (f *AzureAppendBlob) appendBlobAvailable(blobURL string) (bool, bool, error) { + blobClient, err := appendblob.NewClient(blobURL, f.cred, nil) + if err != nil { + return false, false, err + } + get, err := blobClient.GetProperties(context.Background(), nil) + if err != nil { + f.Log.Debugf("Blob does not exist at: \"%s\"", blobURL) + return false, true, nil // Blob doesn't exist, we can write to it + } else { + f.Log.Debugf("Blob exists at: \"%s\"", blobURL) + if *get.BlobType != "AppendBlob" { + f.Log.Debugf("Blob is not an append blob, it is a \"%s\"", *get.BlobType) + return true, false, nil // Blob exists but is not an append blob + } else if get.BlobCommittedBlockCount != nil && *get.BlobCommittedBlockCount >= appendBlobMaxBlocks { + f.Log.Debugf("Blob is fully committed") + return true, false, nil // Blob exists but is fully committed + } else { + return true, true, nil // Blob exists, is an append blob, and is not fully committed + } + } +} + +func (f *AzureAppendBlob) createAppendBlobClient(createAppendBlobFile bool) error { + if createAppendBlobFile { + // Check to see if we can write to this append blob + blobExists, blobUsable, err := f.appendBlobAvailable(f.appendBlobURL) + if err != nil { + return err + } + if blobUsable { + f.appendBlobOutputNum = 0 + if blobExists { + f.Log.Debugf("Appending to existing append blob: \"%s\"", f.appendBlobURL) + return f.createAppendBlobClientHelper(false) + } else { + f.Log.Debugf("Creating new append blob: \"%s\"", f.appendBlobURL) + return f.createAppendBlobClientHelper(true) + } + } else { + return fmt.Errorf("unable to create append blob: \"%s\"", f.appendBlobURL) + } + } + return f.createAppendBlobClientHelper(false) +} + +func (f *AzureAppendBlob) createAppendBlobClientHelper(createFile bool) error { + var err error + f.client, err = appendblob.NewClient(f.appendBlobURL, f.cred, nil) + if err != nil { + return err + } + if createFile { + _, err = f.client.Create(context.Background(), nil) + if err != nil { + return err + } + } + return nil +} + +// Called on initialization, figure out if there are already append blobs in the output path, and if so find the latest one +// That is, if we're writing JSON to "vm_perf_metrics.0.json" there are already "vm_perf_metrics.0.json" and "vm_perf_metrics.1.json" +// check to see if we can write to "vm_perf_metrics.1.json" and if not, create "vm_perf_metrics.2.json" +func (f *AzureAppendBlob) findLatestAppendBlob() error { + var err error + + // Start with the current 0th append blob: + err = f.buildBlobURL(0) + if err != nil { + return err + } + + // If the 0th append blob already exists make sure we can write to it: + f.Log.Debugf("Checking for existing 0th append blob: \"%s\"", f.appendBlobURL) + blobExists, blobUsable, err := f.appendBlobAvailable(f.appendBlobURL) + if err == nil && blobUsable { + f.appendBlobOutputNum = 0 + if blobExists { + f.Log.Debugf("0th Block exists, checking for other blocks: \"%s\"", f.appendBlobURL) + } else { + f.Log.Debugf("0th Block does not exist, creating: \"%s\"", f.appendBlobURL) + return f.createAppendBlobClient(true) + } + } + + // More complex case if the 0th blob exists but is not write-able. 
Rather than incrementing our appendBlobOutputNum and trying again, + // we will list the blobs that are available and find the highest number, then write to that blob (or a new blob) + // Start by listing the files in the destination directory: + containerClient, err := container.NewClient(f.containerURL, f.cred, nil) + if err != nil { + return err + } + searchPrefix := strings.Replace(f.baseAppendBlobURL, f.containerURL+"/", "", 1) + f.Log.Debugf("Searching for blobs with prefix: \"%s*\"", searchPrefix) + pager := containerClient.NewListBlobsFlatPager(&container.ListBlobsFlatOptions{ + Prefix: &searchPrefix}) + + // Find the blob with the largest number in its name. + // If there's vm_perf_metrics.0.json and vm_perf_metrics.10.json we'll write to vm_perf_metrics.10.json + maxNum := -1 + for pager.More() { + resp, err := pager.NextPage(context.TODO()) + if err != nil { + return err + } + for _, blob := range resp.Segment.BlobItems { + name := *blob.Name + // f.Log.Debug("Found blob: ", name) + extension := regexp.MustCompile(`\.\w+$`).FindString(f.appendBlobNameTemplate) // Extract the extension from the template + re := regexp.MustCompile(fmt.Sprintf(`^%s\.(\d+)%s.*`, regexp.QuoteMeta(searchPrefix), regexp.QuoteMeta(extension))) + matches := re.FindStringSubmatch(name) + if len(matches) > 1 { + numStr := matches[1] // get the number part + num, _ := strconv.Atoi(numStr) + f.Log.Debugf("Found %d with filename: \"%s\"", num, name) + if num > maxNum { + maxNum = num + } + } + } + } + f.Log.Debugf("Maximum number found: %d", maxNum) + + // If the maxNum is still -1, it means no blob was found, so create one + if maxNum == -1 { + f.appendBlobOutputNum = 0 + f.buildBlobURL() + f.Log.Debugf("No matching blob found, creating a new one: \"%s\"", f.appendBlobURL) + return f.createAppendBlobClient(true) + } else { + // If a blob was found, start writing to the blob with the largest number in its name + // Note that if the last blob is not write-able we'll catch that during our first write and move to the next blob + f.appendBlobOutputNum = maxNum + f.buildBlobURL() + f.Log.Debugf("Found existing blob, appending to [%d]: \"%s\"", f.appendBlobOutputNum, f.appendBlobURL) + return f.createAppendBlobClient(false) + } +} +func (f *AzureAppendBlob) createAppendBlobLogFile() error { + var err error + // Authenticate using an Azure VM provided managed identity, this will fail if the VM is not assigned a managed identity + if f.cred == nil { + f.cred, err = azidentity.NewDefaultAzureCredential(nil) + // if err != nil { + // f.Log.Debugf("Unable to get a default identity credential: %v", err) + // f.cred, err = azidentity.NewManagedIdentityCredential(nil) + // if err != nil { + // f.Log.Debugf("Unable to get a managed identity credential: %v", err) + // return err + // } + // } + if err != nil { + return err + } + } + + if f.appendBlobOutputNum == -1 { + // On the first write do an extensive search for the latest append blob: + return f.findLatestAppendBlob() + } else { + f.buildBlobURL() + f.Log.Warnf("createAppendBlobLogFile called with initialized appendBlobOutputNum: %d, attempting to use current output number", f.appendBlobOutputNum) + return f.createAppendBlobClient(false) + } +} + +func (f *AzureAppendBlob) WriteToAppendBlob(b ByteReadSeekCloser) error { + var err error + // Write to the append blob, if the target is sealed or at the maximum block counts increment to the next blob name: + ctx := context.Background() + _, err = f.client.AppendBlock(ctx, &b, nil) + if err != nil { + if 
bloberror.HasCode(err, bloberror.BlockCountExceedsLimit, bloberror.BlobNotFound, bloberror.InvalidOperation) { + // BlockCountExceedsLimit - We seem to have reached the maximum number of blocks try the next blob name + // BlobNotFound - The blob has been deleted, try next blob name + // InvalidOperation - The blob is likely sealed, try next blob name + f.appendBlobOutputNum++ + f.appendBlobRetryCount = 0 + f.Log.Warnf("Unable to append to current output, moving to output %d. Error was %v", f.appendBlobOutputNum, err) + f.buildBlobURL() + f.createAppendBlobClient(true) + return f.WriteToAppendBlob(b) + } else { + f.appendBlobOutputNum++ + f.appendBlobRetryCount++ + if f.appendBlobRetryCount < appendBlobWriteRetryLimit { + // Retry the write + f.Log.Warnf("Failed to write to append blob, attempt %d. Error was %v", f.appendBlobRetryCount, err) + return f.WriteToAppendBlob(b) + } else { + // Fatal error: + f.Log.Errorf("Failed to write to append blob, attempted %d times: %v", f.appendBlobRetryCount, err) + return err + } + } + } else { + f.appendBlobRetryCount = 0 + } + return err +} + +// Convenience wrapper around bytes.Reader to implement io.ReadSeekCloser for AppendBlock writes: +type ByteReadSeekCloser struct { + *bytes.Reader +} + +func NewByteReadSeekCloser(b []byte) ByteReadSeekCloser { + return ByteReadSeekCloser{bytes.NewReader(b)} +} + +func (b *ByteReadSeekCloser) Read(p []byte) (n int, err error) { + return b.Reader.Read(p) +} + +func (b *ByteReadSeekCloser) Seek(offset int64, whence int) (int64, error) { + return b.Reader.Seek(offset, whence) +} + +func (b *ByteReadSeekCloser) Close() error { + return nil // No op as there's nothing to close +} diff --git a/src/vm_monitor/telegraf/azure_append_blob/sample.conf b/src/vm_monitor/telegraf/azure_append_blob/sample.conf new file mode 100644 index 000000000..934ea81c4 --- /dev/null +++ b/src/vm_monitor/telegraf/azure_append_blob/sample.conf @@ -0,0 +1,37 @@ +# Send metrics to an Azure Storage Account using an append blob +[[outputs.azure_append_blob]] + ## Azure Storage Account destination is specified in 4 parts, the storage account, + ## the azure_endpoint (optional), the container name, and the path to where the blobs + ## will be written. By default this plugin assumes it will be writing files called + ## "vm_metrics.%d.json". So output_path should be a directory. + storage_account_name = "myStorageAccountName" + container_name = "data" + output_path = "/workflow/task_name/iteration/" + + ## Use batch serialization format instead of line based delimiting. The + ## batch format allows for the production of non line based output formats and + ## may more efficiently encode and write metrics. + # use_batch_format = false + + ## Data format to output. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md + data_format = "influx" + + ## Compress output data with the specified algorithm. + ## If empty, compression will be disabled and files will be plain text. + ## Supported algorithms are "zstd", "gzip" and "zlib". + # compression_algorithm = "" + + ## Compression level for the algorithm above. + ## Please note that different algorithms support different levels: + ## zstd -- supports levels 1, 3, 7 and 11. + ## gzip -- supports levels 0, 1 and 9. + ## zlib -- supports levels 0, 1, and 9. + ## By default the default compression level for each algorithm is used. 
+ # compression_level = -1 + + ## Optionally, if in Azure US Government, China, or other sovereign + ## cloud environment, set the appropriate endpoint + # azure_endpoint = "blob.core.usgovcloudapi.net" diff --git a/src/vm_monitor/telegraf/build_telegraf_remote.sh b/src/vm_monitor/telegraf/build_telegraf_remote.sh new file mode 100644 index 000000000..2d3f84a61 --- /dev/null +++ b/src/vm_monitor/telegraf/build_telegraf_remote.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +connect_string=${connect_string:-"ssh batch-explorer-user@20.236.185.167 -p 50000"} +dest_dir="/mnt/telegraf/" + +# Parse the Azure Batch connect string: +username=$(echo "$connect_string" | cut -d'@' -f1 | cut -d' ' -f2) +ip=$(echo "$connect_string" | cut -d'@' -f2 | cut -d' ' -f1) +port=$(echo "$connect_string" | cut -d' ' -f4) + +echo -e "Username: \t$username" +echo -e "IP: \t\t$ip" +echo -e "Port: \t\t$port" + +# rclone current directory to remote server +# shellcheck disable=SC2086 +ssh -p $port $username@$ip "sudo mkdir -p $dest_dir && sudo chmod a+rw $dest_dir" +rm -f ./telegraf +# shellcheck disable=SC2086 +rclone copy --progress --sftp-host=$ip --sftp-user=$username --sftp-port=$port --sftp-key-file=~/.ssh/id_rsa ./ :sftp:${dest_dir} --progress --multi-thread-streams=30 --transfers=30 --checkers=45 +# Copy tes_performance/vm_monitor_scripts/tes_vm_monitor.once.conf tes_performance/vm_monitor_scripts/tes_vm_monitor.continuous.conf to remote +# shellcheck disable=SC2086 +scp -P $port ../vm_monitor_scripts/tes_vm_monitor.once.conf $username@$ip:${dest_dir}tes_vm_monitor.once.conf +# shellcheck disable=SC2086 +scp -P $port ../vm_monitor_scripts/tes_vm_monitor.continuous.conf $username@$ip:${dest_dir}tes_vm_monitor.continuous.conf + +echo -e "\n\nCopying complete" + +# Execute this script on the remote server: +# shellcheck disable=SC2087 +# shellcheck disable=SC2086 +ssh -p $port $username@$ip << EOF + # if azure cli is not installed, install it + if [ ! -x "\$(command -v az)" ]; then + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + fi + set -e + rm -f ${dest_dir}telegraf + cd ${dest_dir} && sudo docker build -t telegraf . + sudo docker rm temp_telegraf || true + sudo docker run -d --name temp_telegraf telegraf + until [ "\$(sudo docker inspect -f \{\{.State.Running\}\} temp_telegraf)"=="true" ]; do + echo "Waiting for telegraf docker image to start" + sleep 1 + done + sudo docker cp temp_telegraf:/app/telegraf/telegraf ${dest_dir}telegraf + sudo docker rm temp_telegraf || true +EOF + +# Copy the telegraf binary from the remote server to the local machine +# shellcheck disable=SC2086 +scp -P $port $username@$ip:${dest_dir}telegraf ./telegraf +# shellcheck disable=SC2181 +if [ $? -eq 0 ]; then + msg="Telegraf binary copied successfully, image size is $(du -h telegraf | cut -f1)" + if [ ! -x "$(command -v gum)" ]; then + echo "${msg}" + else + gum style --foreground 212 --border-foreground 212 --border double --align center --width 50 --margin "1 2" --padding "2 4" "${msg}" + fi +else + msg="Telegraf binary copy failed" + # if gum is not installed just echo the message + if [ ! 
-x "$(command -v gum)" ]; then + echo "${msg}" + else + gum style --foreground 196 --border-foreground 196 --border double --align center --width 50 --margin "1 2" --padding "2 4" "${msg}" + fi + exit 1 +fi diff --git a/src/vm_monitor/telegraf/filestat.diff b/src/vm_monitor/telegraf/filestat.diff new file mode 100644 index 000000000..4752ede54 --- /dev/null +++ b/src/vm_monitor/telegraf/filestat.diff @@ -0,0 +1,119 @@ +diff --git a/plugins/inputs/filestat/README.md b/plugins/inputs/filestat/README.md +--- a/plugins/inputs/filestat/README.md ++++ b/plugins/inputs/filestat/README.md +@@ -23,6 +23,15 @@ See the [CONFIGURATION.md][CONFIGURATION.md] for more details. + + ## If true, read the entire file and calculate an md5 checksum. + md5 = false ++ ++ ## Only count files that have not been touched for at least this ++ ## duration. If mtime is negative, only count files that have been ++ ## touched in this duration. Defaults to "0s". ++ mtime = "0s" ++ ++ ## If true, ignore mtime setting for the first data collection ++ ## this allows for capturing the initial state of the files ++ capture_all_on_first_run = true + ``` + + ## Metrics +diff --git a/plugins/inputs/filestat/filestat.go b/plugins/inputs/filestat/filestat.go +--- a/plugins/inputs/filestat/filestat.go ++++ b/plugins/inputs/filestat/filestat.go +@@ -7,8 +7,10 @@ import ( + "encoding/hex" + "io" + "os" ++ "time" + + "github.com/influxdata/telegraf" ++ "github.com/influxdata/telegraf/config" + "github.com/influxdata/telegraf/internal/globpath" + "github.com/influxdata/telegraf/plugins/inputs" + ) +@@ -17,11 +19,14 @@ import ( + var sampleConfig string + + type FileStat struct { +- Md5 bool +- Files []string ++ Md5 bool ++ Files []string ++ MTime config.Duration `toml:"mtime"` ++ CaptureAllOnFirstRun bool `toml:"capture_all_on_first_run"` + + Log telegraf.Logger + ++ firstRun bool + // maps full file paths to globmatch obj + globs map[string]*globpath.GlobPath + +@@ -33,6 +38,7 @@ type FileStat struct { + + func NewFileStat() *FileStat { + return &FileStat{ ++ firstRun: true, + globs: make(map[string]*globpath.GlobPath), + missingFiles: make(map[string]bool), + filesWithErrors: make(map[string]bool), +@@ -43,8 +49,34 @@ func (*FileStat) SampleConfig() string { + return sampleConfig + } + ++func absDuration(x time.Duration) time.Duration { ++ if x < 0 { ++ return -x ++ } ++ return x ++} ++ ++func (f *FileStat) mtimeFilter(now time.Time, fileInfo os.FileInfo) bool { ++ age := absDuration(time.Duration(f.MTime)) ++ mtime := now.Add(-age) ++ if time.Duration(f.MTime) < 0 { ++ return fileInfo.ModTime().After(mtime) ++ } ++ return fileInfo.ModTime().Before(mtime) ++} ++ + func (f *FileStat) Gather(acc telegraf.Accumulator) error { + var err error ++ var now time.Time ++ MTimeFilterActive := false ++ if time.Duration(f.MTime) != 0 { ++ MTimeFilterActive = true ++ now = time.Now() ++ } ++ if f.firstRun && f.CaptureAllOnFirstRun { ++ MTimeFilterActive = false ++ } ++ f.firstRun = false + + for _, filepath := range f.Files { + // Get the compiled glob object for this filepath +@@ -95,6 +127,9 @@ func (f *FileStat) Gather(acc telegraf.Accumulator) error { + fileName, err) + } + } else { ++ if MTimeFilterActive && !f.mtimeFilter(now, fileInfo) { ++ continue ++ } + f.filesWithErrors[fileName] = false + fields["size_bytes"] = fileInfo.Size() + fields["modification_time"] = fileInfo.ModTime().UnixNano() +diff --git a/plugins/inputs/filestat/sample.conf b/plugins/inputs/filestat/sample.conf +--- a/plugins/inputs/filestat/sample.conf ++++ 
b/plugins/inputs/filestat/sample.conf +@@ -7,3 +7,12 @@ + + ## If true, read the entire file and calculate an md5 checksum. + md5 = false ++ ++ ## Only count files that have not been touched for at least this ++ ## duration. If mtime is negative, only count files that have been ++ ## touched in this duration. Defaults to "0s". ++ mtime = "0s" ++ ++ ## If true, ignore mtime setting for the first data collection ++ ## this allows for capturing the initial state of the files ++ capture_all_on_first_run = true diff --git a/src/vm_monitor/telegraf/telegraf b/src/vm_monitor/telegraf/telegraf new file mode 100644 index 000000000..b5d578231 Binary files /dev/null and b/src/vm_monitor/telegraf/telegraf differ diff --git a/src/vm_monitor/telegraf/telegraf.dummy.conf b/src/vm_monitor/telegraf/telegraf.dummy.conf new file mode 100644 index 000000000..d316d7877 --- /dev/null +++ b/src/vm_monitor/telegraf/telegraf.dummy.conf @@ -0,0 +1,38 @@ +## This is a dummy config to force the custom_builder tools to include a bunch of useful plugins for debugging and other tasks +##! Not a real telegraf config +## Adding in these plugins adds very little size to the telegraf binary +[agent] +[[aggregators.basicstats]] +[[aggregators.histogram]] +[[aggregators.minmax]] +[[aggregators.quantile]] +[[aggregators.valuecounter]] +[[outputs.azure_append_blob]] +# Force outputs.file plugin for dbugging and add CSV, JSON + compression +[[outputs.file]] + files = ["stdout"] + data_format = "csv" + compression_algorithm = "zstd" +[[outputs.file]] + files = ["stdout"] + data_format = "json" + compression_algorithm = "gzip" +[[inputs.file]] +[[inputs.filestat]] +[[inputs.filecount]] +[[inputs.amd_rocm_smi]] +[[inputs.nvidia_smi]] +[[inputs.filecount]] +[[inputs.infiniband]] +[[inputs.exec]] +[[inputs.nstat]] +[[inputs.cpu]] +[[inputs.disk]] +[[inputs.diskio]] +[[inputs.mem]] +[[inputs.net]] +[[inputs.processes]] +[[inputs.swap]] +[[inputs.system]] +[[inputs.docker]] +[[inputs.docker_log]] diff --git a/src/vm_monitor/tes_vm_monitor.tar.gz b/src/vm_monitor/tes_vm_monitor.tar.gz new file mode 100644 index 000000000..f0bc00b5a Binary files /dev/null and b/src/vm_monitor/tes_vm_monitor.tar.gz differ diff --git a/src/vm_monitor/vm_monitor_scripts/clean_log.py b/src/vm_monitor/vm_monitor_scripts/clean_log.py new file mode 100644 index 000000000..bcb53457e --- /dev/null +++ b/src/vm_monitor/vm_monitor_scripts/clean_log.py @@ -0,0 +1,13 @@ +import sys +import json +import re +import string + +# This script uses json.dumps to serialize strings with complex quoting/escaping issues into +# a single line string that can be placed inside double quotes +input_log_line = sys.stdin.read() +# Remove non-printable characters (such as the block character in agent logs) +clean_line = re.sub(f'[^{re.escape(string.printable)}]', '_', input_log_line) +clean_line = json.dumps(clean_line) +clean_line = clean_line.replace("\\n",'') +sys.stdout.write(clean_line.strip()) \ No newline at end of file diff --git a/src/vm_monitor/vm_monitor_scripts/collect_azure_vm_perf.sh b/src/vm_monitor/vm_monitor_scripts/collect_azure_vm_perf.sh new file mode 100644 index 000000000..7f94bd0de --- /dev/null +++ b/src/vm_monitor/vm_monitor_scripts/collect_azure_vm_perf.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +cd "$(dirname "$0")" || exit 1 + +# Expected to be called from Telegraf's exec plugin +# Run and collect all telegraf stats, not all of this information will be useful but it is collected for +# completeness and debugging purposes. 
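Each helper invoked below writes its results to stdout as InfluxDB line protocol, which is the format Telegraf's exec input parses (a measurement name, optional tags, then comma-separated fields with string values double-quoted). The following is a minimal, illustrative Python sketch of that output format only; the measurement, tag, and field names are placeholders and do not correspond to the helpers in this directory.

```python
# Illustrative sketch only: build one InfluxDB line-protocol record of the kind
# Telegraf's exec input parses from a collector's stdout. The measurement and
# field names here are placeholders, not the ones emitted by the scripts below.
import time


def influx_line(measurement, fields, tags=None):
    """Format a single line-protocol record: measurement[,tags] fields timestamp."""
    tag_str = "".join(f",{k}={v}" for k, v in (tags or {}).items())

    def fmt(value):
        if isinstance(value, bool):          # booleans are unquoted true/false
            return "true" if value else "false"
        if isinstance(value, int):           # integers take an 'i' suffix
            return f"{value}i"
        if isinstance(value, float):
            return str(value)
        escaped = str(value).replace('"', '\\"')
        return f'"{escaped}"'                # strings are double-quoted

    field_str = ",".join(f"{k}={fmt(v)}" for k, v in fields.items())
    return f"{measurement}{tag_str} {field_str} {time.time_ns()}"


if __name__ == "__main__":
    # Example record; an exec-driven collector would simply print lines like this.
    print(influx_line("example_vm_boot",
                      {"uptime_seconds": 123.45, "log_found": True},
                      tags={"host": "example-node"}))
```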
+./get_imds_and_nvme_metatada.sh || true +./get_linux_boot_iso_timestamp.sh || true +python3 parse_extended_cpu_info.py || true +# Called last in case batch agent has changed +./get_batch_agent_values.sh || true + diff --git a/src/vm_monitor/vm_monitor_scripts/get_batch_agent_values.sh b/src/vm_monitor/vm_monitor_scripts/get_batch_agent_values.sh new file mode 100644 index 000000000..921352ae3 --- /dev/null +++ b/src/vm_monitor/vm_monitor_scripts/get_batch_agent_values.sh @@ -0,0 +1,91 @@ +#!/bin/bash + +## Collect optional metrics from the Azure Batch agent, note this is not supported by the Azure Batch service +## values are not guaranteed to be accurate and may change in future versions of the Azure Batch agent +## +## See similar early C# version at: +## https://github.com/gabe-microsoft/CromwellOnAzure/blob/e1537f4f1bd8c0e1789c90e7031e97103eb1d5ca/src/TesApi.Web/BatchScheduler.cs + +batchAgentDebugLogPath="/mnt/batch/sys/logs/agent-debug.log" + +unix_timestamp_to_iso_time_string() { + # Convert the numeric unix timestamp to an ISO 8601 timestamp string + iso_timestamp=$(date -u -d @"$1" +"%Y-%m-%dT%H:%M:%S.%NZ") + echo "$iso_timestamp" +} + +win32_timestamp_to_unix_timestamp() { + local win32="$1" + # Subtract the number of seconds between 1601-01-01 and 1970-01-01 + unix=$(echo "scale=9; ($win32 - 116444736000000000) / 10000000" | bc) + echo "$unix" +} + +get_timestamp_or_nan() { + local timestamp="$1" + local unix_timestamp + if [[ -z "$timestamp" ]]; then + echo "nan" + else + unix_timestamp=$(win32_timestamp_to_unix_timestamp "$timestamp") + unix_timestamp=$(unix_timestamp_to_iso_time_string "$unix_timestamp") + echo "$unix_timestamp" + fi +} + + +get_batch_agent_debug_log_values() { + # If the batch agent debug log does not exist, return early + if [[ ! 
-f "$batchAgentDebugLogPath" ]]; then + echo "batch_agent_data batch_log_found=\"false\",batch_allocation_time=\"nan\",batch_vm_name=\"\",batch_pool_name=\"\",batch_boot_time=\"nan\"" + return + fi + + # Extract the logline containing the TVMAllocationTime, PoolName, and TVMName + # logline=$(sudo grep get_entity_with_sas_async $batchAgentDebugLogPath | grep get_entity_with_sas) + logline=$(sudo grep TVMAllocationTime $batchAgentDebugLogPath | grep TVMName | grep PoolName | head -1) + batch_entity_log_line_clean=$(echo "$logline" | python3 clean_log.py) + + # Extract the TVMAllocationTime (this is a win32 timestamp indicating when the VM was allocated by Azure Batch) + TVMAllocationTime=$(echo "$logline" | grep -oP '"TVMAllocationTime":"\K\d+' | head -1) + TVMAllocationTime=$(get_timestamp_or_nan "$TVMAllocationTime") + + # Extract the TVMName (the name of the node given by Azure Batch) + TVMName=$(echo "$logline" | grep -oP '"TVMName":"\K[^"]+' | head -1) + if [[ -z "$TVMName" ]]; then + TVMName="" + fi + + # Extract the PoolName (name of the Batch pool this VM is part of, also available from IMDS) + PoolName=$(echo "$logline" | grep -oP '"PoolName":"\K[^"]+' | head -1) + if [[ -z "$PoolName" ]]; then + PoolName="" + fi + + # Extract the logline containing the TVMBootTime (note single quotes + space unlike the other params) + logline=$(sudo grep TVMBootTime $batchAgentDebugLogPath | head -1) + batch_vmtable_log_line_clean=$(echo "$logline" | python3 clean_log.py) + + TVMBootTime=$(echo "$logline" | grep -oP "'TVMBootTime': '\K\d+") + TVMBootTime=$(get_timestamp_or_nan "$TVMBootTime") + + echo "batch_agent_data batch_log_found=\"true\",batch_allocation_time=\"$TVMAllocationTime\",batch_vm_name=\"$TVMName\",batch_pool_name=\"$PoolName\",batch_boot_time=\"$TVMBootTime\"" + # Output a copy of the log lines we're pulling from the Batch Agent, these are useful for debugging if this breaks in the future + echo "debug_batch_agent_entity_log_line data=$batch_entity_log_line_clean" + echo "debug_batch_agent_vmtable_log_line data=$batch_vmtable_log_line_clean" +} + +get_vm_info() { + # Capture if the system has /sys/firmware/efi, this is a good indicator of UEFI boot (gen2 VMs) + if [[ -d /sys/firmware/efi ]]; then + UEFIBoot="true" + else + UEFIBoot="false" + fi + DMESGOutput=$(dmesg --notime | grep -i "efi: EFI v") + + echo "batch_vm_info vm_has_system_efi=\"$UEFIBoot\",dmesg_efi_version=\"$DMESGOutput\"" +} + +get_batch_agent_debug_log_values +get_vm_info \ No newline at end of file diff --git a/src/vm_monitor/vm_monitor_scripts/get_imds_and_nvme_metatada.sh b/src/vm_monitor/vm_monitor_scripts/get_imds_and_nvme_metatada.sh new file mode 100644 index 000000000..38936dff0 --- /dev/null +++ b/src/vm_monitor/vm_monitor_scripts/get_imds_and_nvme_metatada.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +## Collect data from the Azure Instance Metadata Service (IMDS) +CURL_TIMEOUT=5 + +get_imds_data(){ + imds_versions=$(curl --max-time $CURL_TIMEOUT -s -H Metadata:true --noproxy "*" "http://169.254.169.254/metadata/versions") + + # Make sure version 2023-07-01 is available: + # TODO: As versions are updated this may need to be updated as versions stop being supported + # TODO: update redaction code as versions are updated + if [[ $imds_versions == *"2023-07-01"* ]]; then + version_string="2023-07-01" + else + version_string="2021-12-13" + fi + + # Get instance metadata: + instance_metadata=$(curl --max-time $CURL_TIMEOUT -s -H Metadata:true --noproxy "*" 
"http://169.254.169.254/metadata/instance?api-version=$version_string") + export instance_metadata +} + +get_nvme_data(){ + if command -v nvme >/dev/null 2>&1; then + nvme_metadata=$(sudo nvme list --output-format=json) + export nvme_metadata + fi +} + +get_imds_data +get_nvme_data + +python3 parse_imds_and_nvme_metadata.py diff --git a/src/vm_monitor/vm_monitor_scripts/get_linux_boot_iso_timestamp.sh b/src/vm_monitor/vm_monitor_scripts/get_linux_boot_iso_timestamp.sh new file mode 100644 index 000000000..c67676372 --- /dev/null +++ b/src/vm_monitor/vm_monitor_scripts/get_linux_boot_iso_timestamp.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +## Use /proc/uptime to get the system uptime in seconds (with 2 decimal places) +## This is almost as accurate as using the C function clock_gettime(CLOCK_BOOTTIME, &ts_boot); + +unix_timestamp_to_iso_time_string() { + # Convert the numeric unix timestamp to an ISO 8601 timestamp string + iso_timestamp=$(date -u -d @"$1" +"%Y-%m-%dT%H:%M:%S.%NZ") + echo "$iso_timestamp" +} + +get_linux_boot_iso_timestamp() { + # Get the uptime in seconds (first value from /proc/uptime) + uptime=$(awk '{print $1}' /proc/uptime) + # Get the (numeric) current time in seconds since the Unix epoch + current_time=$(date +%s.%N) + # Subtract the uptime from the current time to get a timestamp + boot_time=$(echo "$current_time - $uptime" | bc) + + # Convert the boot time to an ISO 8601 timestamp + boot_iso_time=$(unix_timestamp_to_iso_time_string "$boot_time") + collection_iso_time=$(unix_timestamp_to_iso_time_string "$current_time") + + # echo "linux_boot iso_boot_timestamp=\"$boot_iso_time\",collection_iso_timestamp=\"$collection_iso_time\" $current_time" + echo "linux_boot iso_boot_timestamp=\"$boot_iso_time\",collection_iso_timestamp=\"$collection_iso_time\"" +} + +# Call the function +get_linux_boot_iso_timestamp diff --git a/src/vm_monitor/vm_monitor_scripts/parse_extended_cpu_info.py b/src/vm_monitor/vm_monitor_scripts/parse_extended_cpu_info.py new file mode 100644 index 000000000..d249c34ba --- /dev/null +++ b/src/vm_monitor/vm_monitor_scripts/parse_extended_cpu_info.py @@ -0,0 +1,119 @@ +import subprocess +import json +# from pprint import pprint +# from rich.pretty import pprint +from telegraf_helper import telegraf_output, flatten_json, convert_to_number + +# Convert cpuinfo to a dictionary of processors with the processor number as the key +# Remove the 'processor' key from the dictionary and convert values to numbers where possible +def parse_processor_info(processor_info): + processor = {} + cpu_id = -1 + for line in processor_info.split('\n'): + if ':' in line: + key, value = line.split(':', 1) + value = convert_to_number(value.strip()) + if key.strip().lower() != "processor": + processor[key.strip()] = value + else: + cpu_id = value + return cpu_id, processor + + +# Group like metadata together into groups of processors +# That is, if you have 80 processors all with the same keys print "cpu: 0, 1, 2 .. 
80" and their shared values +# instead of 80 rows of the same data +def group_processors_by_keys(processor_dict, keys): + processor_group = {} + for processor_id, processor_info in processor_dict.items(): + # Create a new dictionary with just the keys we want to group by + filtered_info = {key: value for key, value in processor_info.items() if key in keys} + # Convert the filtered info to a tuple for grouping + info_tuple = tuple(filtered_info.items()) + # Add the processor to the appropriate group + processor_group.setdefault(info_tuple, []).append(processor_id) + # At the moment we have a dict of tuples with processor_ids as values + # convert to a dict of string processor ids with the processor info as the value + processor_group = {'_'.join([str(i) for i in v]):dict(k) for k,v in processor_group.items()} + # Cleanup the field names inside each processor group: + for cpu_group in processor_group.keys(): + processor_group[cpu_group] = {k.replace(' ', '_').replace('(','').replace(')','').replace(':',''):v for k,v in processor_group[cpu_group].items()} + return processor_group + + +# Group all keys except the ones which vary in every processor +def group_processors_by_key_values(processor_dict): + # Get the keys from the first processor info dictionary + first_processor_info = next(iter(processor_dict.values())) + keys = set(first_processor_info.keys()) - {'processor', 'bogomips', 'initial apicid', 'apicid', 'core id'} + return group_processors_by_keys(processor_dict, keys) + + +def group_processors_by_core_architecture_key_values(processor_dict): + keys = {'cpu cores', 'physical id', 'siblings', 'core id'} + return group_processors_by_keys(processor_dict, keys) + + +def print_telegraf_processor_info(grouped_processor_info, meter_name, prefix=""): + for cpu, values in grouped_processor_info.items(): + results = telegraf_output(meter_name, prefix) + results.output_tag("cpu", cpu) + results.print_structs(flatten_json(values)) + results.print_telegraf_output() + + +def parse_cpuinfo(): + # Process the /proc/cpuinfo data: + command = "cat /proc/cpuinfo" + cpu_info = subprocess.check_output(command, shell=True).decode().strip() + processor_list = cpu_info.split('\n\n') + # Create a dictionary (key as processor number and value as the processor info) + processors = {k:v for k,v in (parse_processor_info(x) for x in processor_list)} + try: + # Create a few different views into the processor data: + processor_info = group_processors_by_key_values(processors) + processor_architecture_info = group_processors_by_core_architecture_key_values(processors) + cpu_perf_info = { + processor_id: { + 'bogomips':processor_info['bogomips'], + 'cpu_MHz':processor_info['cpu MHz'] if 'cpu MHz' in processor_info else None, + } for processor_id, processor_info in processors.items()} + + # Output to telegraf + print_telegraf_processor_info(processor_info, "vm_cpuinfo_metadata") + print_telegraf_processor_info(processor_architecture_info, "vm_cpuinfo_arch_metadata") + print_telegraf_processor_info(cpu_perf_info, "vm_cpuinfo_perf_metadata") + except: + result = telegraf_output("vm_cpuinfo_metadata", "") + result.output("parser_result", "error_processing") + result.print_telegraf_output() + + +def parse_lscpu_info(): + # Process the lscpu output (lscpu is likely easier to use than /proc/cpuinfo): + command = "lscpu -J" + lscpu_info = subprocess.check_output(command, shell=True).decode().strip() + # Convert the lscpu output to a dictionary + lscpu = json.loads(lscpu_info) + meter_name = "vm_lscpu_metadata" + meter_prefix 
= "" + results = telegraf_output(meter_name, meter_prefix) + try: + lscpu_dict = {x['field']:x['data'] for x in lscpu['lscpu']} + # Cleanup the field names: + lscpu_dict = {k.replace(' ', '_').replace('(','').replace(')','').replace(':',''):v for k,v in lscpu_dict.items()} + lscpu_dict = {k:convert_to_number(v) for k,v in lscpu_dict.items()} + results.print_structs(lscpu_dict) + results.output("parser_result", "true") + except: + results.output("parser_result", "error_processing") + results.print_telegraf_output() + + +def main(): + parse_cpuinfo() + parse_lscpu_info() + + +if __name__ == '__main__': + main() diff --git a/src/vm_monitor/vm_monitor_scripts/parse_imds_and_nvme_metadata.py b/src/vm_monitor/vm_monitor_scripts/parse_imds_and_nvme_metadata.py new file mode 100644 index 000000000..0bee1a103 --- /dev/null +++ b/src/vm_monitor/vm_monitor_scripts/parse_imds_and_nvme_metadata.py @@ -0,0 +1,137 @@ +import os +import sys +import json +from datetime import datetime +# For better debugging (rich must be installed) +# from rich.pretty import pprint + +from telegraf_helper import telegraf_output, flatten_json + +def prune_imds_data(imds_data): + redacted_string = "" + # Recursively walk every string, redact any secrets + # (this is a weak-redaction and deletes things like 'disablePasswordAuthentication: false', but it's better than nothing) + # - string values starting with 'ssh-rsa' + # - string values with 'sv=' and 'sig=' (Azure SAS tokens) + # - keys that contain 'secret', 'password', or 'key' + def redact_secrets(data): + if type(data) is str: + if data.startswith('ssh-rsa'): + data = redacted_string + if 'sv=' in data and 'sig=' in data: + data = redacted_string + elif type(data) is dict: + for k, v in data.items(): + if any(word in k.lower() for word in ['secret', 'password', 'key']): + data[k] = redacted_string + else: + data[k] = redact_secrets(v) + elif type(data) is list: + for i, v in enumerate(data): + data[i] = redact_secrets(v) + return data + def key_exists(data, key): + for k in key: + if k not in data: + return False + data = data[k] + return True + # Explicitly drop keys that shouldn't be recorded: + if key_exists(imds_data, ['compute', 'publicKeys']): + imds_data['compute']['publicKeys'] = redacted_string + return redact_secrets(imds_data) + + +def parse_imds_metadata(): + results = telegraf_output('imds_metadata','imds') + try: + instance_metadata = os.getenv('instance_metadata') + if not instance_metadata: + results.output("parser_result", "no_data") + results.print_telegraf_output() + return + imds_data = json.loads(instance_metadata) + # Drop keys that shouldn't be recorded: + imds_data = prune_imds_data(imds_data) + compute = imds_data['compute'] + results.print_output("vm_tags", compute, 'tags') + results.print_output("vm_id", compute, 'vmId') + results.print_output("vm_size", compute, 'vmSize') + results.print_output("zone", compute, 'zone') + results.print_output("vm_location", compute, 'location') + results.print_output("vm_priority", compute, 'priority') + results.print_output("vm_resource_id", compute, 'resourceId') + results.print_output("vm_encryption_at_host", compute, ['securityProfile', 'encryptionAtHost']) + # Print individual important tags: + if 'tagsList' in compute: + tagsList = compute['tagsList'] + # convert "name"/"value" pairs to a dictionary + tagsList = {x['name']: x['value'] for x in tagsList} + results.print_output("batch_account_name", tagsList, 'BatchAccountName') + results.print_output("batch_subscription_id", tagsList, 
'BatchAccountSubscription') + results.print_output("batch_pool_name_fq", tagsList, 'FQPoolName') + results.print_output("batch_pool_name", tagsList, 'PoolName') + results.print_output("vm_low_priority_type", tagsList, 'LowPriorityType') + # Print storage data: + if 'storageProfile' in compute: + storage = compute['storageProfile'] + results.print_output("vm_resource_disk_size", storage, ['resourceDisk', 'size']) + results.print_output("vm_image_offer", storage, ['imageReference', 'offer']) + results.print_output("vm_image_sku", storage, ['imageReference', 'sku']) + if 'osDisk' in storage: + osdisk = storage['osDisk'] + results.print_output("vm_os_disk_size", osdisk, 'diskSizeGB') + results.print_output("vm_os_disk_caching", osdisk, 'caching') + results.print_output("vm_managed_disk_type", osdisk, ['managedDisk', 'storageAccountType']) + results.print_output("vm_os_disk_write_accelerator_enabled", osdisk, 'writeAcceleratorEnabled') + # Print network interfaces data: + if 'network' in imds_data: + results.print_structs(flatten_json(imds_data['network'])) + results.output("parser_result", "true") + # Print the imds JSON struct for debugging purposes + print(f'debug_instance_metadata_json data={json.dumps(json.dumps(imds_data))}') + except: + results.output("parser_result", "error_processing") + results.print_telegraf_output() + + +def parse_nvme_metadata(): + def nvme_error_result(msg): + results = telegraf_output('nvme_metadata','vm_nvme') + results.output("parser_result", msg) + results.print_telegraf_output() + try: + nvme_metadata = os.getenv('nvme_metadata') + if not nvme_metadata: + nvme_error_result("no_data") + return + nvmedata = json.loads(nvme_metadata) + # For every nvme disk print by index their stats: + if 'Devices' not in nvmedata: + nvme_error_result("error_processing") + return + for dev in nvmedata['Devices']: + results = telegraf_output('nvme_metadata','vm_nvme') + results.output("parser_result", "true") + results.output_tag("nvme_disk_index", dev['Index']) + results.output("name", dev['DevicePath']) + results.output("used_bytes", dev['UsedBytes']) + results.output("max_lba", dev['MaximumLBA']) + results.output("physical_size", dev['PhysicalSize']) + results.output("sector_size", dev['SectorSize']) + results.print_telegraf_output() + # Print the nvme_metadata JSON struct for debugging purposes + print(f'debug_nvme_metadata_json data={json.dumps(json.dumps(nvmedata))}') + except: + nvme_error_result("error_processing") + + +def main(): + # Init an object to hold stats output so we can have a single line output for all stats + # This meter is called 'imds_metadata' + parse_imds_metadata() + parse_nvme_metadata() + + +if __name__ == '__main__': + main() diff --git a/src/vm_monitor/vm_monitor_scripts/run_telegraf.sh b/src/vm_monitor/vm_monitor_scripts/run_telegraf.sh new file mode 100644 index 000000000..053dcefd3 --- /dev/null +++ b/src/vm_monitor/vm_monitor_scripts/run_telegraf.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +## This script is run in a screen session. If the telegraf process dies, the process will be restarted. +## However if the screen session quits, or ctrl-c is sent to the sesssion, the script will end. 
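The restart behaviour described in the comment above (relaunch telegraf whenever it exits with a non-zero status, and back off if it is crash-looping) is what the loop and the check_iteration_rate function further down implement. The following is a minimal Python sketch of the same pattern under stated assumptions: the command is a placeholder, the 30-second window and 5-minute backoff mirror the script, and the script's signal handling is omitted.

```python
# Minimal sketch (not the script itself) of the restart-with-backoff pattern:
# restart the child whenever it exits non-zero, but if it dies twice within a
# 30 second window, sleep five minutes before trying again. CMD is a placeholder.
import subprocess
import time

CMD = ["./telegraf", "--config", "telegraf.clean.conf", "--debug"]  # placeholder
RESTART_WINDOW_S = 30   # two failures inside this window trigger the backoff
BACKOFF_S = 300         # five minutes, matching the sleep in the script below


def supervise():
    failures_in_window = 0
    last_exit = time.monotonic()
    while True:
        rc = subprocess.call(CMD)
        if rc == 0:
            # The real script keeps looping until it is signalled; stopping on a
            # clean exit keeps this sketch simple.
            break
        now = time.monotonic()
        if now - last_exit <= RESTART_WINDOW_S:
            failures_in_window += 1
            if failures_in_window >= 2:
                time.sleep(BACKOFF_S)
                failures_in_window = 0
        else:
            failures_in_window = 0
        last_exit = now
        time.sleep(1)  # brief pause before restarting, as in the script


if __name__ == "__main__":
    supervise()
```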
+ +function run_telegraf_once() { + ./telegraf --config "${ONE_TIME_CONFIG_CLEAN}" --once --debug +} + +function run_telegraf_continuous() { + ./telegraf --config "${CONTINUOUS_CONFIG_CLEAN}" --debug +} + +# Source the environment variables from the file specified in the first argument +if [ -z "$1" ]; then + echo "Warning: run_telegraf.sh should be called with an argument specifying the environment file to source." +fi +ENV_FILE="$1" +# shellcheck disable=SC1090 +source "$ENV_FILE" + +echo "Starting telegraf..." +echo "Environment information:" +echo " PERF_SCRIPT_DIR: ${PERF_SCRIPT_DIR}" +echo " ONE_TIME_CONFIG_CLEAN: ${ONE_TIME_CONFIG_CLEAN}" +echo " CONTINUOUS_CONFIG_CLEAN: ${CONTINUOUS_CONFIG_CLEAN}" +echo " PERF_APPEND_BLOB_PATH: ${PERF_APPEND_BLOB_PATH}" +echo " PERF_STORAGE_ACCOUNT_NAME: ${PERF_STORAGE_ACCOUNT_NAME}" +echo " PERF_CONTAINER_NAME: ${PERF_CONTAINER_NAME}" +echo " TASK_WORKING_DIR: ${TASK_WORKING_DIR}" +echo -e "\n\nRunning telegraf once..." + +cd "${PERF_SCRIPT_DIR}" || exit 1 + +# Remove the context boostrap file +if [[ -f "$ENV_FILE" && "$(basename "$ENV_FILE")" == tmp_* ]]; then + rm -f "$ENV_FILE" +fi + +# One time initial run with an immediately flush to append blob: +run_telegraf_once + +echo -e "\n\n\n\n\nRunning telegraf continuously..." + +# Monitor SIGINT (Ctrl-C) to turn off the loop and pass ctrl-c to the currently running telegraf instance +export TELEGRAF_PID="" +trap 'kill -SIGINT $TELEGRAF_PID; LOOP=0' SIGINT +# Monitor SIGTERM to turn off the loop, these are terminal signals (SIGKILL and SIGSTOP cannot be caught) +trap 'kill -SIGTERM $TELEGRAF_PID; LOOP=0' SIGTERM +trap 'kill -SIGHUP $TELEGRAF_PID; LOOP=0' SIGHUP + +# Function to limit telegraf restart rate (to keep resource usage down) +# If telegraf dies twice in 30s, we will sleep for 5 minutes before attempting to restart it +export ITERATION_COUNT=0 +PREV_TIMESTAMP=$(date +%s) +export PREV_TIMESTAMP +export ITERATION_LIMIT=2 +function check_iteration_rate() { + local CURRENT_TIMESTAMP + CURRENT_TIMESTAMP=$(date +%s) + if [ $((CURRENT_TIMESTAMP - PREV_TIMESTAMP)) -le 30 ]; then + ITERATION_COUNT=$((ITERATION_COUNT + 1)) + if [ $ITERATION_COUNT -ge $ITERATION_LIMIT ]; then + echo "Loop iterated ${ITERATION_LIMIT} in less than 30 seconds. Sleeping for 5 minutes..." + sleep 300 + ITERATION_COUNT=0 + fi + else + ITERATION_COUNT=0 + fi + export PREV_TIMESTAMP=$CURRENT_TIMESTAMP +} + +# Run telegraf continuously with ocassional flushes: +LOOP=1 +while [ $LOOP -eq 1 ]; do + run_telegraf_continuous & + TELEGRAF_PID=$! + wait $TELEGRAF_PID + # shellcheck disable=SC2181 + if [ $? -ne 0 ]; then + echo "Telegraf exited with non-zero status. Restarting..." + check_iteration_rate + sleep 1 + fi +done diff --git a/src/vm_monitor/vm_monitor_scripts/start_vm_node_monitoring.sh b/src/vm_monitor/vm_monitor_scripts/start_vm_node_monitoring.sh new file mode 100644 index 000000000..9b36b1b32 --- /dev/null +++ b/src/vm_monitor/vm_monitor_scripts/start_vm_node_monitoring.sh @@ -0,0 +1,289 @@ +#!/bin/bash +## This is the script which sets up the Azure Batch TES pefromance monitoring +## environment. This task can be called several times during the lifetime +## of a VM node, for example when the current task ends and new job is picked up +## by the node. It is intended to keep running (in the background) after a TES +## task ends. It will be prempted by the next TES task or the VM being deallocated +## +## Usage: You should run this task in the background as it may need to perform +## some maintenance/waits before it ends. 
Outside the Azure Batch env you can +## run this script with the following arguments: +## $0 +## +## This script is rentrant, and will stop the previous performance montiors and +## start a new one for the current task. Some error conditions on extremely short +## tasks may cause unexpected behavior. +## +## Monitoring via telegraf is run twice, once for one-time metrics and once for +## continuous monitoring. +## +## TODO: add global task lockfile if needed +## TODO: Optional logging of start_vm_node_monitoring to a file for telegraf to log +## When run as normal task: +## AZ_BATCH_TASK_DIR=/mnt/batch/tasks/workitems/TES_DEBUG_JOB/job-1/debug_task +## AZ_BATCH_TASK_WORKING_DIR=/mnt/batch/tasks/workitems/TES_DEBUG_JOB/job-1/debug_task/wd +## AZ_BATCH_NODE_SHARED_DIR=/mnt/batch/tasks/shared +## When run as start task: +## AZ_BATCH_TASK_DIR=/mnt/batch/tasks/startup +## AZ_BATCH_TASK_WORKING_DIR=/mnt/batch/tasks/startup/wd +## AZ_BATCH_NODE_SHARED_DIR=/mnt/batch/tasks/shared + + +get_envs_from_runner_task_json() { + python3 < " + echo -e "\n\nExiting start_vm_node_monitoring.sh" + exit 1 + fi + echo "Running in Azure Batch task mode" + WORKING_DIR=${AZ_BATCH_NODE_SHARED_DIR}/vm_monitor + TASK_WORKING_DIR=${AZ_BATCH_TASK_WORKING_DIR} + if [[ -f "${AZ_BATCH_TASK_DIR}/runner-task.json" ]]; then + eval "$(get_envs_from_runner_task_json)" + else + echo " runner-task.json not found, performing extraction/setup only" + SETUP_ONLY=1 + fi + + # Add a trailing slash to PERF_APPEND_BLOB_PATH if it doesn't have one: + if [[ ! ${PERF_APPEND_BLOB_PATH} =~ /$ ]]; then + PERF_APPEND_BLOB_PATH="${PERF_APPEND_BLOB_PATH}/" + fi +fi +# WORKING_DIR should not end in a slash: +WORKING_DIR=${WORKING_DIR%/} + +# If start_vm_monitoring.sh is not in ${WORKING_DIR} then note it and update the WORKING_DIR +BASH_SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +if [ "$BASH_SCRIPT_PATH" != "${WORKING_DIR}" ]; then + echo "start_vm_monitoring.sh is not in the expected directory, updating WORKING_DIR to: $BASH_SCRIPT_PATH" + WORKING_DIR=$BASH_SCRIPT_PATH +fi + +PERF_ARCHIVE_FILENAME="${WORKING_DIR}/tes_vm_monitor.tar.gz" +PERF_SCRIPT_DIR="${WORKING_DIR}/scripts" +ONE_TIME_CONFIG="${PERF_SCRIPT_DIR}/tes_vm_monitor.once.conf" +CONTINUOUS_CONFIG="${PERF_SCRIPT_DIR}/tes_vm_monitor.continuous.conf" +ONE_TIME_CONFIG_CLEAN="${ONE_TIME_CONFIG%.conf}.clean.conf" +CONTINUOUS_CONFIG_CLEAN="${CONTINUOUS_CONFIG%.conf}.clean.conf" + +function extract_resources() { + # If the data is already extracted, do nothing: + if [ -d "${PERF_SCRIPT_DIR}" ]; then + echo "Skipping extraction ${PERF_SCRIPT_DIR} already exists." + return + fi + # If the archive is not present, do nothing: + if [ ! -f "${PERF_ARCHIVE_FILENAME}" ]; then + echo "Skipping extraction ${PERF_ARCHIVE_FILENAME} does not exist." 
+ return + fi + + # Extract telegraf and all the scripts: + mkdir -p "${PERF_SCRIPT_DIR}" + tar zxvf "${PERF_ARCHIVE_FILENAME}" -C "${PERF_SCRIPT_DIR}/" + chmod a+x "${PERF_SCRIPT_DIR}/telegraf" + chmod a+x "${PERF_SCRIPT_DIR}/run_telegraf.sh" + chmod a+x "${PERF_SCRIPT_DIR}"/*.sh + + # Create the telegraf compact/cleaned config files: + create_telegraf_configs +} + +# Create 'clean' versions of the telegraf input configs for running telegraf and logging +# This will remove all lines starting with whitespace and a '#' and all blank lines +# Inline comments are left alone +function create_telegraf_configs() { + grep -vE '^\s*#' "${ONE_TIME_CONFIG}" | grep -vE '^\s*$' > "${ONE_TIME_CONFIG_CLEAN}" + grep -vE '^\s*#' "${CONTINUOUS_CONFIG}" | grep -vE '^\s*$' > "${CONTINUOUS_CONFIG_CLEAN}" +} + +# Screen returns a list of all screen sessions, we filter for sessions of the form: +# {screen_pid}.TELEGRAF__{VM_TASK_NAME} +# We ignore the current screen sessions and send a ctrl-C or passed in signal to the rest +function stop_other_telegraf_screens() { + local CURRENT_SCREEN_NAME=${1} + local SIGNAL=${2} + # End existing 'TELEGRAF__*' screen sessions: + for session in $(screen -ls | grep -o '[0-9]*\.TELEGRAF__\S*' | grep -v "${CURRENT_SCREEN_NAME}"); do + # If no signal is given send a Ctrl-C to the session: + if [ -z "${SIGNAL}" ]; then + echo "Sending ctrl-c to existing telegraf session: ${session}" + screen -S "${session}" -X stuff "^C" + else + echo "Quitting existing telegraf session: ${session}" + # shellcheck disable=SC2086 + screen -S "${session}" -X ${SIGNAL} + fi + done +} + +# If we find any screens running which are not the current screen, return 0 +function check_for_other_telegraf_screens() { + local CURRENT_SCREEN_NAME=${1} + for session in $(screen -ls | grep -o '[0-9]*\.TELEGRAF__\S*' | grep -v "${CURRENT_SCREEN_NAME}"); do + return 0 + done + return 1 +} + +# Handle the case where there is more than one CURRENT_SCREEN_NAME session +# we keep the last session to run and kill the rest +# Not to be confused with the code later on which asks screens running with different task names to exit +function keep_latest_telegraf_screen() { + local SCREEN_NAME=${1} + local SESSIONS + local SESSION_COUNT + local PID + # Get a list of all screen sessions with the given name, sorted by creation time: + SESSIONS=$(screen -ls | grep "${SCREEN_NAME}" | awk '{print $1}' | sort -n) + SESSION_COUNT=$(echo "${SESSIONS}" | wc -l) + echo -e "\n\nWARNING: Found ${SESSION_COUNT} telegraf sessions with the name: ${SCREEN_NAME}" + # If there is more than one session with the given name: + if [ "${SESSION_COUNT}" -gt 1 ]; then + KEPT_SESSION=$(echo "${SESSIONS}" | tail -n 1) + echo "Keeping the latest telegraf session: ${KEPT_SESSION}" + # Remove the last session from the list (the latest one): + SESSIONS=$(echo "${SESSIONS}" | head -n -1) + + # Ask session to gracefully quit: + for session in ${SESSIONS}; do + echo "Sending ctrl-c to existing telegraf session: ${session}" + screen -S "${session}" -X stuff "^C" + done + echo "Waiting 5s before checking if any other sessions remain" + # Pause before checking if any other sessions remain (this for loop is a fancy sleep 5 that can return early) + # shellcheck disable=SC2034 + for i in {1..10}; do + # Check if any sessions are still running + if ! 
screen -list | grep -q "${session}"; then + break # If no sessions are running, break the loop + fi + sleep 0.5 # If sessions are still running, sleep for 0.5 seconds + done + + # Kill all remaining sessions (the older ones): + for session in ${SESSIONS}; do + # Check if this session is still running: + if screen -list | grep -q "${session}"; then + echo "Killing old telegraf session: ${session}" + # Get the PID of the screen session: + PID=$(screen -list | grep "${session}" | cut -d'.' -f1) + # Kill the screen session and all its child processes (SIGTERM): + pkill -TERM -P "${PID}" + fi + done + fi +} + +function terminate_other_telegraf_screns(){ + local CURRENT_SCREEN_NAME=${1} + local PID + # End existing 'TELEGRAF__*' screen sessions: + for session in $(screen -ls | grep -o '[0-9]*\.TELEGRAF__\S*' | grep -v "${CURRENT_SCREEN_NAME}"); do + echo "Killing existing telegraf session: ${session}" + # Get the PID of the screen session: + PID=$(screen -list | grep "${session}" | cut -d'.' -f1) + # Kill the screen session and all its child processes (SIGKILL): + pkill -9 -P "${PID}" + done +} +# Extract resources only if we need to and set up the telegraf configs: +extract_resources +if [ $SETUP_ONLY -eq 1 ]; then + echo "Exiting start_vm_node_monitoring.sh, setup complete" + exit 0 +fi + +# Write the environment variables to a file for the run_telegraf.sh script to import: +TASK_ENV_FILE="${PERF_SCRIPT_DIR}/tmp_task_env__${VM_TASK_NAME}.sh" +rm -f "${TASK_ENV_FILE}" +{ + echo "export VM_TASK_NAME=\"${VM_TASK_NAME}\"" + echo "export TASK_WORKING_DIR=\"${TASK_WORKING_DIR}\"" + echo "export PERF_SCRIPT_DIR=\"${PERF_SCRIPT_DIR}\"" + echo "export ONE_TIME_CONFIG_CLEAN=\"${ONE_TIME_CONFIG_CLEAN}\"" + echo "export CONTINUOUS_CONFIG_CLEAN=\"${CONTINUOUS_CONFIG_CLEAN}\"" + echo "export PERF_STORAGE_ACCOUNT_NAME=\"${PERF_STORAGE_ACCOUNT_NAME}\"" + echo "export PERF_CONTAINER_NAME=\"${PERF_CONTAINER_NAME}\"" + echo "export PERF_APPEND_BLOB_PATH=\"${PERF_APPEND_BLOB_PATH}\"" +} > "${TASK_ENV_FILE}" + +# Start the current monitor then stop the others: +SCREEN_NAME="TELEGRAF__${VM_TASK_NAME}" +# Start a new screen session with the current VM_TASK_NAME: +screen -S "${SCREEN_NAME}" -dm bash -c "${PERF_SCRIPT_DIR}/run_telegraf.sh \"${TASK_ENV_FILE}\"" +echo "Started telegraf for task \"${VM_TASK_NAME}\" in screen session \"${SCREEN_NAME}\"" + +# Make sure there is only one telegraf session running for the current VM_TASK_NAME +keep_latest_telegraf_screen "${SCREEN_NAME}" + +# Gracefully end existing 'TELEGRAF__*' screen sessions (SIGINT) with other VM_TASK_NAMEs +if check_for_other_telegraf_screens "${SCREEN_NAME}"; then + echo -e "\n\nStopping other telegraf sessions" + stop_other_telegraf_screens "${SCREEN_NAME}" + echo " Waiting 5s before checking if any other sessions remain" + sleep 5 # Pause before checking if any other sessions remain +fi +# Wait 30s then quit any other telegraf sessions: +if check_for_other_telegraf_screens "${SCREEN_NAME}"; then + echo -e "\n\nWaiting to quit other telegraf sessions" + sleep 30 + # Quit any remaining 'TELEGRAF__*' screen sessions (SIGTERM) with other VM_TASK_NAMEs + stop_other_telegraf_screens "${SCREEN_NAME}" "quit" + echo " Waiting 5s before checking if any other sessions remain" + sleep 5 # Pause before checking if any other sessions remain +fi +if check_for_other_telegraf_screens "${SCREEN_NAME}"; then + echo -e "\n\nWaiting to kill other telegraf sessions" + sleep 5 + # Kill any remaining 'TELEGRAF__*' screen sessions (SIGKILL), SIGKILL should + # only ever be
necessary in cases of a serious bug: + terminate_other_telegraf_screns "${SCREEN_NAME}" +fi +echo -e "\n\nFinished starting telegraf for task \"${VM_TASK_NAME}\"" +echo "Exiting start_vm_node_monitoring.sh" + +exit 0 diff --git a/src/vm_monitor/vm_monitor_scripts/telegraf_helper.py b/src/vm_monitor/vm_monitor_scripts/telegraf_helper.py new file mode 100644 index 000000000..8e1f354ff --- /dev/null +++ b/src/vm_monitor/vm_monitor_scripts/telegraf_helper.py @@ -0,0 +1,94 @@ +import sys + +class telegraf_output: + """Take in dictionary or struct data and store it for eventual printing in the telegraf line protocol format. + If debugging is enabled, output to stderr as well.""" + print_to_stderr = False + + def __init__(self, meter_name="imds_metadata", meter_prefix="imds"): + self.metrics = {} + self.tags = {} + self.meter_name = meter_name + self.meter_prefix = meter_prefix + + def print_structs(self, data): + prefix = f"{self.meter_prefix}_" if self.meter_prefix else "" + for key, val in data.items(): + self.output(prefix + key, val) + + def print_output(self, name, dictionary, keys): + if isinstance(keys, str): + keys = [keys] + value = dictionary + for key in keys: + if key in value: + value = value[key] + else: + value = None + break + self.output(name, value) + + def output_tag(self, name, value): + if self.print_to_stderr: + print(f"TAG: {name}: '{value}'", file=sys.stderr) + self.tags[name] = value + + def output(self, name, value): + if self.print_to_stderr: + print(f"{name}: '{value}'", file=sys.stderr) + self.metrics[name] = convert_to_number(value) + + def print_telegraf_output(self): + # Print the telegraf meter name first + print(f"{self.meter_name}", end='') + # Print a comma, then the tags (or just a space if there are no tags) + if len(self.tags) > 0: + print(",", end='') + sep = "" # Skip the first comma + for key, val in self.tags.items(): + print(f'{sep}{key}="{val}"', end='') + sep = "," + + # Print the final space before the metrics: + print(" ", end='') + sep = "" # Skip the first comma + # Then print every key=value pair + for key, val in self.metrics.items(): + if type(val) is str: + print(f'{sep}{key}="{val}"', end='') + elif type(val) is int: + print(f'{sep}{key}={val}', end='') + elif type(val) is float: + print(f'{sep}{key}={val}', end='') + sep = "," + print("") # Print the newline at the end + + +def flatten_json(y): + out = {} + def flatten(x, name=''): + if type(x) is dict: + for a in x: + flatten(x[a], name + a + '_') + elif type(x) is list: + i = 0 + for a in x: + flatten(a, name + str(i) + '_') + i += 1 + else: + out[name[:-1]] = x + flatten(y) + return out + + +def convert_to_number(value): + # See if we can convert a string value to a number: + if type(value) is str: + try: + numeric_value = float(value) + if numeric_value.is_integer(): + numeric_value = int(numeric_value) + return numeric_value + except ValueError: + pass + return value diff --git a/src/vm_monitor/vm_monitor_scripts/tes_vm_monitor.continuous.conf b/src/vm_monitor/vm_monitor_scripts/tes_vm_monitor.continuous.conf new file mode 100644 index 000000000..0a5d5fc34 --- /dev/null +++ b/src/vm_monitor/vm_monitor_scripts/tes_vm_monitor.continuous.conf @@ -0,0 +1,313 @@ +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "1s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. 
+ round_interval = false # not enabled to avoid write contention + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + ## Exceeding this value triggers a flush + metric_batch_size = 1000 + + ## Maximum number of unwritten metrics per output. Increasing this value + ## allows for longer periods of output downtime without dropping metrics at the + ## cost of higher maximum memory usage. + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. Maximum flush_interval will be + ## flush_interval + flush_jitter + flush_interval = "120s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "15s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "10ms" + + ## Override default hostname, if empty use os.Hostname() + hostname = "" + ## If set to true, do no set the "host" tag in the telegraf agent. + omit_hostname = true + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### +[[outputs.azure_append_blob]] + output_path = "${PERF_APPEND_BLOB_PATH}" + storage_account_name = "${PERF_STORAGE_ACCOUNT_NAME}" + container_name = "${PERF_CONTAINER_NAME}" + use_batch_format = true + data_format = "json" + json_timestamp_units = "1ms" + compression_algorithm = "zstd" +# Send telegraf metrics to stdout (off for continuous monitoring to save cycles) +# [[outputs.file]] +# ## Files to write to, "stdout" is a specially handled file. +# files = ["stdout"] + +# ## Use batch serialization format instead of line based delimiting. The +# ## batch format allows for the production of non line based output formats and +# ## may more efficiently encode and write metrics. +# use_batch_format = true + +# ## The logfile will be rotated when it becomes larger than the specified +# ## size. When set to 0 no size based rotation is performed. +# rotation_max_size = "50MB" + +# ## Data format to output. 
+# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md +# ## https://github.com/influxdata/telegraf/tree/master/plugins/serializers/json +# data_format = "influx" + + +############################################################################### +# AGGREGATORS # +############################################################################### + +# Keep the aggregate basicstats of each metric passing through. +[[aggregators.basicstats]] + ## The period on which to flush & clear the aggregator. + period = "15s" + grace = "5s" + + ## If true, the original metric will be dropped by the + ## aggregator and will not get sent to the output plugins. + drop_original = false + + ## Configures which basic stats to push as fields + # stats = ["count","diff","rate","min","max","mean","non_negative_diff","non_negative_rate","percent_change","stdev","s2","sum","interval"] + ## Metrics to exclude from aggregation (these will go directly to the output plugins) + namedrop = ["filecount", "docker_log", "docker", "disk", "system", "processes", "swap"] + ## Metrics to include in aggregation + namepass = ["cpu", "diskio", "mem", "net"] + name_prefix = "agg." + + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +# Service input plugin for monitoring changes written to output files +# metrics.txt is a nice-to-have, stderr.txt and stdout.txt are for debugging +[[inputs.tail]] + ## NOTE: `once` may not produce output with this plugin + files = ["${TASK_WORKING_DIR}/metrics.txt", "${TASK_WORKING_DIR}/../metrics.txt", "${TASK_WORKING_DIR}/stderr.txt", "${TASK_WORKING_DIR}/stdout.txt"] + ## Read file from beginning. + from_beginning = true + ## Method used to watch for file updates. Can be either "inotify" or "poll". + watch_method = "poll" + data_format = "grok" + grok_patterns = ["%{GREEDYDATA:message}"] + + +# # Count files in a directory +# [[inputs.filecount]] +# interval="120s" +# collection_jitter="10s" +# ## Directories to gather stats about. +# ## This accepts standard unix glob matching rules, but with the addition of +# ## ** as a "super asterisk". ie: +# ## /var/log/** -> recursively find all directories in /var/log and count files in each directory +# ## /var/log/*/* -> find all directories with a parent dir in /var/log and count files in each directory +# ## /var/log -> count all files in /var/log and all of its subdirectories +# directories = ["${TASK_WORKING_DIR}/**"] +# ## Only count files that match the name pattern. Defaults to "*". +# name = "*" +# ## Count files in subdirectories. Defaults to true. +# recursive = true +# ## Only count regular files. Defaults to true. +# regular_only = true +# ## Follow all symlinks while walking the directory tree. Defaults to false. +# follow_symlinks = false +# ## Only count files that are at least this size. If size is +# ## a negative number, only count files that are smaller than the +# ## absolute value of size. Acceptable units are B, KiB, MiB, KB, ... +# ## Without quotes and units, interpreted as size in bytes. +# size = "0B" +# ## Only count files that have not been touched for at least this +# ## duration. If mtime is negative, only count files that have been +# ## touched in this duration. Defaults to "0s".
+# # mtime = "0s" +# # This is set to -3mins for continuous monitoring to avoid spam (and must be greater than the interval) +# mtime = "-3m" + +# Collect file stats on files inside a directory +[[inputs.filestat]] + interval="120s" + collection_jitter="10s" + files = ["${TASK_WORKING_DIR}/**"] + md5 = false + +[[processors.dedup]] + ## Name of the fields is used to check against previous measurement + fieldinclude = ["file", "exists", "size_bytes", "modification_time"] + ## Name of the measurements to let through + namepass = ["filestat"] + ## Allow a full set of outputs once every 30mins + dedup_interval = "30m" + +# Monitor Telegraf process cpu and memory usage +[[inputs.procstat]] + interval="120s" + collection_jitter="10s" + ## executable name (ie, pgrep ) + exe = "telegraf" + tag_with = ["cmdline"] + +[[inputs.cpu]] + interval = "5s" + ## Whether to report per-cpu stats or not + percpu = true + ## Whether to report total system cpu stats or not + totalcpu = true + ## If true, collect raw CPU time metrics + collect_cpu_time = false + ## If true, compute and report the sum of all non-idle CPU states + report_active = false + fieldexclude=["usage_guest", "usage_guest_nice", "usage_irq", "usage_nice", "usage_steal"] + +# Not supported on Azure VMs +# # Provides Linux CPU metrics +# # This plugin ONLY supports Linux +# [[inputs.linux_cpu]] +# ## Path for sysfs filesystem. +# ## See https://www.kernel.org/doc/Documentation/filesystems/sysfs.txt +# ## Defaults: +# # host_sys = "/sys" +# ## CPU metrics collected by the plugin. +# ## Supported options: +# ## "cpufreq", "thermal" +# ## Defaults: +# metrics = ["cpufreq", "thermal"] + +[[inputs.disk]] + interval="60s" + collection_jitter="5s" + ## By default stats will be gathered for all mount points. + ## Ignore mount points by filesystem type. + ignore_fs = ["tmpfs", "devtmpfs", "sysfs", "cgroup", "cgroup2", "devfs", "iso9660", "overlay", "aufs", "squashfs", "vfat"] +# [[inputs.diskio]] +# interval="60s" +# collection_jitter="5s" +# skip_serial_number = true +# ## By default, telegraf will gather stats for all devices including +# ## disk partitions. +# ## Setting devices will restrict the stats to the specified devices. +# # devices = ["sd*", "nvme*", "dm*"] +# devices = ["^sd[a-z]+$", "^nvme[0-9]+n[0-9]+"] +# ## On systems which support it, device metadata can be added in the form of +# ## tags. +# device_tags = ["ID_FS_TYPE", "ID_FS_LABEL"] +[[inputs.diskio]] + interval="5s" + collection_jitter="1s" + skip_serial_number = true + ## By default, telegraf will gather stats for all devices including + ## disk partitions. We really only care about stats at disk level + ## Globs for 'devices' are provided by github.com/gobwas/glob and not regex + ## * LS80s machines already have 10 nvme disks (hence nvme10n1 support) + ## * sdaa is also supported but unlikely to be used + ## * "ID_TYPE = disk" filtering might work but the diskio plugin would have to + ## collect all stats and then filter them out, which is expensive + devices = ["sd[a-z]", "sd[a-z][a-z]", "nvme[0-9]*n[0-9]", "nvme[0-9]*n[0-9]", "md[0-9]"] + ## On systems which support it, device metadata can be added in the form of tags + device_tags = ["ID_TYPE"] +[[inputs.mem]] + interval = "5s" + # no configuration +[[inputs.net]] + interval = "5s" + ## On linux systems telegraf also collects protocol stats. + ## Setting ignore_protocol_stats to true will skip reporting of protocol metrics.
+ ignore_protocol_stats = true +[[inputs.processes]] + interval="60s" + collection_jitter="5s" +[[inputs.swap]] + interval="60s" + collection_jitter="5s" + # no configuration +[[inputs.system]] + interval="60s" + collection_jitter="5s" + fieldexclude = ["uptime_format"] # uptime_format is deprecated +# Read metrics about docker containers +[[inputs.docker]] + interval="60s" + ## Docker Endpoint + ## To use TCP, set endpoint = "tcp://[ip]:[port]" + ## To use environment variables (ie, docker-machine), set endpoint = "ENV" + endpoint = "unix:///var/run/docker.sock" + ## Set to true to collect Swarm metrics(desired_replicas, running_replicas) + ## Note: configure this in one of the manager nodes in a Swarm cluster. + ## configuring in multiple Swarm managers results in duplication of metrics. + gather_services = false + ## Set the source tag for the metrics to the container ID hostname, eg first 12 chars + source_tag = false + ## Containers to include and exclude. Collect all if empty. Globs accepted. + container_name_include = [] + container_name_exclude = [] + ## Timeout for docker list, info, and stats commands + timeout = "5s" + ## Whether to report for each container per-device blkio (8:0, 8:1...), + ## network (eth0, eth1, ...) and cpu (cpu0, cpu1, ...) stats or not. + ## Usage of this setting is discouraged since it will be deprecated in favor of 'perdevice_include'. + ## Default value is 'true' for backwards compatibility, please set it to 'false' so that 'perdevice_include' setting + ## is honored. + perdevice = false + ## Specifies for which classes a per-device metric should be issued + ## Possible values are 'cpu' (cpu0, cpu1, ...), 'blkio' (8:0, 8:1, ...) and 'network' (eth0, eth1, ...) + ## Please note that this setting has no effect if 'perdevice' is set to 'true' + # perdevice_include = ["cpu", "blkio", "network"] + perdevice_include = ["blkio", "network"] + ## Whether to report for each container total blkio and network stats or not. + ## Usage of this setting is discouraged since it will be deprecated in favor of 'total_include'. + ## Default value is 'false' for backwards compatibility, please set it to 'true' so that 'total_include' setting + ## is honored. + total = false + ## Specifies for which classes a total metric should be issued. Total is an aggregated of the 'perdevice' values. + ## Possible values are 'cpu', 'blkio' and 'network' + ## Total 'cpu' is reported directly by Docker daemon, and 'network' and 'blkio' totals are aggregated by this plugin. + ## Please note that this setting has no effect if 'total' is set to 'false' + # total_include = ["cpu", "blkio", "network"] + ## docker labels to include and exclude as tags. Globs accepted. + ## Note that an empty array for both will include all labels as tags + docker_label_include = [] + docker_label_exclude = [] + ## Which environment variables should we use as a tag + tag_env = ["JAVA_HOME", "HEAP_SIZE"] + +[[inputs.docker_log]] + interval="60s" + ## Docker Endpoint + ## To use TCP, set endpoint = "tcp://[ip]:[port]" + ## To use environment variables (ie, docker-machine), set endpoint = "ENV" + endpoint = "unix:///var/run/docker.sock" + ## When true, container logs are read from the beginning; otherwise + ## reading begins at the end of the log. + from_beginning = false + ## Timeout for Docker API calls. + timeout = "5s" + ## Containers to include and exclude. Globs accepted. 
+ ## Note that an empty array for both will include all containers + container_name_include = [] + ## Set the source tag for the metrics to the container ID hostname, eg first 12 chars + source_tag = false diff --git a/src/vm_monitor/vm_monitor_scripts/tes_vm_monitor.once.conf b/src/vm_monitor/vm_monitor_scripts/tes_vm_monitor.once.conf new file mode 100644 index 000000000..69f353333 --- /dev/null +++ b/src/vm_monitor/vm_monitor_scripts/tes_vm_monitor.once.conf @@ -0,0 +1,308 @@ +# Configuration for telegraf agent +[agent] + ## Default data collection interval for all inputs + interval = "1s" + ## Rounds collection interval to 'interval' + ## ie, if interval="10s" then always collect on :00, :10, :20, etc. + round_interval = false ## False for once to ensure collection happens once + + ## Telegraf will send metrics to outputs in batches of at most + ## metric_batch_size metrics. + ## This controls the size of writes that Telegraf sends to output plugins. + ## Exceeding this value triggers a flush + metric_batch_size = 1000 + + ## Maximum number of unwritten metrics per output. Increasing this value + ## allows for longer periods of output downtime without dropping metrics at the + ## cost of higher maximum memory usage. + metric_buffer_limit = 10000 + + ## Collection jitter is used to jitter the collection by a random amount. + ## Each plugin will sleep for a random time within jitter before collecting. + ## This can be used to avoid many plugins querying things like sysfs at the + ## same time, which can have a measurable effect on the system. + collection_jitter = "0s" + + ## Default flushing interval for all outputs. Maximum flush_interval will be + ## flush_interval + flush_jitter + flush_interval = "120s" + ## Jitter the flush interval by a random amount. This is primarily to avoid + ## large write spikes for users running a large number of telegraf instances. + ## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s + flush_jitter = "5s" + + ## By default or when set to "0s", precision will be set to the same + ## timestamp order as the collection interval, with the maximum being 1s. + ## ie, when interval = "10s", precision will be "1s" + ## when interval = "250ms", precision will be "1ms" + ## Precision will NOT be used for service inputs. It is up to each individual + ## service input to set the timestamp at the appropriate precision. + ## Valid time units are "ns", "us" (or "µs"), "ms", "s". + precision = "10ms" + + ## Override default hostname, if empty use os.Hostname() + hostname = "" + ## If set to true, do no set the "host" tag in the telegraf agent. + omit_hostname = true + +# Add a 'once' tag to everything logged by this config (this helps distinguish between once and continuous monitoring) +[global_tags] + vm_monitor_collection = "once" + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + +[[outputs.azure_append_blob]] + output_path = "${PERF_APPEND_BLOB_PATH}" + storage_account_name = "${PERF_STORAGE_ACCOUNT_NAME}" + container_name = "${PERF_CONTAINER_NAME}" + use_batch_format = true + data_format = "json" + json_timestamp_units = "1ms" + compression_algorithm = "zstd" + compression_level = 7 ## Attempt to compress verbose 'once' data more + +# Send telegraf metrics to stdout (only on for the once config) +[[outputs.file]] + ## Files to write to, "stdout" is a specially handled file. 
+ files = ["stdout"] + ## Use batch serialization format instead of line based delimiting. The + ## batch format allows for the production of non line based output formats and + ## may more efficiently encode and write metrics. + use_batch_format = true + ## The logfile will be rotated when it becomes larger than the specified + ## size. When set to 0 no size based rotation is performed. + rotation_max_size = "50MB" + ## Data format to output. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_OUTPUT.md + ## https://github.com/influxdata/telegraf/tree/master/plugins/serializers/json + data_format = "influx" + + +############################################################################### +# INPUT PLUGINS # +############################################################################### + +# Capture the output of mount of real /dev devices +# Intent is to ignore all cgroups, etc. +[[inputs.exec]] + commands = ["/bin/bash -c 'mount | grep ^/dev'"] + timeout = "5s" + data_format = "grok" + grok_patterns = ["^%{UNIXPATH:device} on %{UNIXPATH:mount} type %{WORD:fstype} \\(%{GREEDYDATA:options}\\)"] + name_override = "exec.mount" +[[inputs.exec]] + commands = ["/bin/bash -c 'df -T --exclude-type=squashfs | tail -n +2'"] + timeout = "5s" + data_format = "grok" + grok_patterns = ["^%{NOTSPACE:filesystem}\\s+%{NOTSPACE:type}\\s+%{NUMBER:1K-blocks:int}\\s+%{NUMBER:used:int}\\s+%{NUMBER:available:int}\\s+%{NUMBER:use_percent:int}%\\s+%{GREEDYDATA:mounted_on}"] + name_override = "exec.df" +[[inputs.exec]] + commands = ["/bin/bash -c 'lsblk -P -b -o NAME,SIZE,MAJ:MIN,FSTYPE,MOUNTPOINT,PTTYPE,MODEL,STATE,TYPE | tail -n +2'"] + timeout = "5s" + data_format = "grok" + grok_patterns = ["^NAME=\"%{NOTSPACE:name}\" SIZE=\"%{NUMBER:size:int}\" MAJ:MIN=\"%{DATA:maj_min}\" FSTYPE=\"%{DATA:fstype}\" MOUNTPOINT=\"%{DATA:mountpoint}\" PTTYPE=\"%{DATA:pttype}\" MODEL=\"%{DATA:model}\" STATE=\"%{DATA:state}\" TYPE=\"%{DATA:type}\""] + name_override = "exec.lsblk" +[[inputs.exec]] + commands = ["/bin/bash -c 'env | grep -v -E \"^BASH_FUNC_|^LS_COLORS\"'"] + timeout = "5s" + data_format = "grok" + grok_patterns = ["^%{NOTSPACE:key}=%{GREEDYDATA:value}"] + name_override = "exec.env" + +[[inputs.exec]] + ## Commands array + commands = [ + "/bin/bash ${PERF_SCRIPT_DIR}/collect_azure_vm_perf.sh ${PERF_SCRIPT_DIR}/" + ] + ## Timeout for each command to complete. + timeout = "5s" + ## Data format to consume. 
+ ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" +[[inputs.exec]] + commands = [ + "/bin/sh -c 'echo task_env,tag=VM_TASK_NAME value=\"\\\"${VM_TASK_NAME}\\\"\"'", + "/bin/sh -c 'echo task_env,tag=TASK_WORKING_DIR value=\"\\\"${TASK_WORKING_DIR}\\\"\"'", + "/bin/sh -c 'echo task_env,tag=PERF_SCRIPT_DIR value=\"\\\"${PERF_SCRIPT_DIR}\\\"\"'", + "/bin/sh -c 'echo task_env,tag=ONE_TIME_CONFIG_CLEAN value=\"\\\"${ONE_TIME_CONFIG_CLEAN}\\\"\"'", + "/bin/sh -c 'echo task_env,tag=CONTINUOUS_CONFIG_CLEAN value=\"\\\"${CONTINUOUS_CONFIG_CLEAN}\\\"\"'", + "/bin/sh -c 'echo task_env,tag=PERF_APPEND_BLOB_PATH value=\"\\\"${PERF_APPEND_BLOB_PATH}\\\"\"'", + "/bin/sh -c 'echo task_env,tag=PERF_STORAGE_ACCOUNT_NAME value=\"\\\"${PERF_STORAGE_ACCOUNT_NAME}\\\"\"'", + "/bin/sh -c 'echo task_env,tag=PERF_CONTAINER_NAME value=\"\\\"${PERF_CONTAINER_NAME}\\\"\"'" + ] + timeout = "5s" + # name_suffix = "task_env" + data_format = "influx" +[[inputs.file]] + ## Files to parse each interval. Accept standard unix glob matching rules, + ## as well as ** to match recursive files and directories. + files = ["${PERF_SCRIPT_DIR}/tes_vm_monitor.once.clean.conf"] + name_override = "telegraf_config_once" + data_format = "value" # Name of the field (e.g. "value=") + data_type = "string" + character_encoding = "utf-8" +[[inputs.file]] + ## Files to parse each interval. Accept standard unix glob matching rules, + ## as well as ** to match recursive files and directories. + files = ["${PERF_SCRIPT_DIR}/tes_vm_monitor.continuous.clean.conf"] + name_override = "telegraf_config_continuous" + data_format = "value" # Name of the field (e.g. "value=") + data_type = "string" + character_encoding = "utf-8" + +# Take a quick snapshot of the /mnt state: +[[inputs.filecount]] + ## Directories to gather stats about. + ## This accept standard unit glob matching rules, but with the addition of + ## ** as a "super asterisk". ie: + ## /var/log/** -> recursively find all directories in /var/log and count files in each directories + ## /var/log/*/* -> find all directories with a parent dir in /var/log and count files in each directories + ## /var/log -> count all files in /var/log and all of its subdirectories + directories = ["/mnt/**"] + ## Only count files that match the name pattern. Defaults to "*". + name = "*" + ## Count files in subdirectories. Defaults to true. + recursive = true + ## Only count regular files. Defaults to true. + regular_only = true + ## Follow all symlinks while walking the directory tree. Defaults to false. + follow_symlinks = false + ## Only count files that are at least this size. If size is + ## a negative number, only count files that are smaller than the + ## absolute value of size. Acceptable units are B, KiB, MiB, KB, ... + ## Without quotes and units, interpreted as size in bytes. + size = "0B" + ## Only count files that have not been touched for at least this + ## duration. If mtime is negative, only count files that have been + ## touched in this duration. Defaults to "0s". 
+ mtime = "0s" + +# Collect files stats on files inside the task working directory +[[inputs.filestat]] + # The timeout means that jobs with hundreds of thousands of files may not collect all information + files = ["${TASK_WORKING_DIR}/**"] + md5 = false +[[inputs.cpu]] + ## Whether to report per-cpu stats or not + percpu = true + ## Whether to report total system cpu stats or not + totalcpu = true + ## If true, collect raw CPU time metrics + collect_cpu_time = false + ## If true, compute and report the sum of all non-idle CPU states + report_active = false +# Not supported on Azure VMs +# # Provides Linux CPU metrics +# # This plugin ONLY supports Linux +# [[inputs.linux_cpu]] +# ## Path for sysfs filesystem. +# ## See https://www.kernel.org/doc/Documentation/filesystems/sysfs.txt +# ## Defaults: +# # host_sys = "/sys" +# ## CPU metrics collected by the plugin. +# ## Supported options: +# ## "cpufreq", "thermal" +# ## Defaults: +# metrics = ["cpufreq", "thermal"] +[[inputs.disk]] + interval="30s" + collection_jitter="5s" + ## By default stats will be gathered for all mount points. + ## Ignore mount points by filesystem type. + ignore_fs = ["tmpfs", "devtmpfs", "sysfs", "cgroup", "cgroup2", "devfs", "iso9660", "overlay", "aufs", "squashfs", "vfat"] +[[inputs.diskio]] + interval="5s" + collection_jitter="1s" + ## By default, telegraf will gather stats for all devices including + ## disk partitions. + ## Setting devices will restrict the stats to the specified devices. + devices = ["sd*", "nvme*"] + ## On systems which support it, device metadata can be added in the form of + ## tags. + device_tags = ["ID_FS_TYPE", "ID_FS_LABEL"] +[[inputs.mem]] + # no configuration +[[inputs.net]] + ## On linux systems telegraf also collects protocol stats. + ## Setting ignore_protocol_stats to true will skip reporting of protocol metrics. + ignore_protocol_stats = true +[[inputs.processes]] + interval="60s" + collection_jitter="5s" +[[inputs.swap]] + interval="60s" + collection_jitter="5s" + # no configuration +[[inputs.system]] + interval="60s" + collection_jitter="5s" + fieldexclude = ["uptime_format"] # uptime_format is deprecated +# Read metrics about docker containers +[[inputs.docker]] + interval="60s" + ## Docker Endpoint + ## To use TCP, set endpoint = "tcp://[ip]:[port]" + ## To use environment variables (ie, docker-machine), set endpoint = "ENV" + endpoint = "unix:///var/run/docker.sock" + ## Set to true to collect Swarm metrics(desired_replicas, running_replicas) + ## Note: configure this in one of the manager nodes in a Swarm cluster. + ## configuring in multiple Swarm managers results in duplication of metrics. + gather_services = false + ## Set the source tag for the metrics to the container ID hostname, eg first 12 chars + source_tag = false + ## Containers to include and exclude. Collect all if empty. Globs accepted. + container_name_include = [] + container_name_exclude = [] + ## Timeout for docker list, info, and stats commands + timeout = "5s" + ## Whether to report for each container per-device blkio (8:0, 8:1...), + ## network (eth0, eth1, ...) and cpu (cpu0, cpu1, ...) stats or not. + ## Usage of this setting is discouraged since it will be deprecated in favor of 'perdevice_include'. + ## Default value is 'true' for backwards compatibility, please set it to 'false' so that 'perdevice_include' setting + ## is honored. + perdevice = false + ## Specifies for which classes a per-device metric should be issued + ## Possible values are 'cpu' (cpu0, cpu1, ...), 'blkio' (8:0, 8:1, ...) 
and 'network' (eth0, eth1, ...) + ## Please note that this setting has no effect if 'perdevice' is set to 'true' + # perdevice_include = ["cpu", "blkio", "network"] + perdevice_include = ["blkio", "network"] + ## Whether to report for each container total blkio and network stats or not. + ## Usage of this setting is discouraged since it will be deprecated in favor of 'total_include'. + ## Default value is 'false' for backwards compatibility, please set it to 'true' so that 'total_include' setting + ## is honored. + total = false + ## Specifies for which classes a total metric should be issued. Total is an aggregated of the 'perdevice' values. + ## Possible values are 'cpu', 'blkio' and 'network' + ## Total 'cpu' is reported directly by Docker daemon, and 'network' and 'blkio' totals are aggregated by this plugin. + ## Please note that this setting has no effect if 'total' is set to 'false' + # total_include = ["cpu", "blkio", "network"] + ## docker labels to include and exclude as tags. Globs accepted. + ## Note that an empty array for both will include all labels as tags + docker_label_include = [] + docker_label_exclude = [] + ## Which environment variables should we use as a tag + tag_env = ["JAVA_HOME", "HEAP_SIZE"] + +[[inputs.docker_log]] + interval="60s" + ## Docker Endpoint + ## To use TCP, set endpoint = "tcp://[ip]:[port]" + ## To use environment variables (ie, docker-machine), set endpoint = "ENV" + endpoint = "unix:///var/run/docker.sock" + ## When true, container logs are read from the beginning; otherwise + ## reading begins at the end of the log. + from_beginning = false + ## Timeout for Docker API calls. + timeout = "5s" + ## Containers to include and exclude. Globs accepted. + ## Note that an empty array for both will include all containers + container_name_include = [] + ## Set the source tag for the metrics to the container ID hostname, eg first 12 chars + source_tag = false
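
Reviewer note: the sketch below is a hypothetical usage example of the telegraf_output class and flatten_json helper added in src/vm_monitor/vm_monitor_scripts/telegraf_helper.py; it is not part of the patch. The file name collect_example.py and the sample JSON payload are assumptions for illustration only.

# collect_example.py (hypothetical) -- emit one telegraf line-protocol record
# using the telegraf_helper module added in this change.
import json

from telegraf_helper import telegraf_output, flatten_json

# Illustrative nested metadata; a real collector would read this from IMDS or a file.
sample_json = '{"compute": {"vmSize": "Standard_D4s_v3", "location": "westus2"}}'

meter = telegraf_output(meter_name="imds_metadata", meter_prefix="imds")
meter.output_tag("source", "example")                       # recorded as a telegraf tag
meter.print_structs(flatten_json(json.loads(sample_json)))  # flattened fields gain the "imds_" prefix
meter.print_telegraf_output()                               # prints the record to stdout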
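Reviewer note: create_telegraf_configs() in start_vm_node_monitoring.sh strips full-line comments and blank lines with grep before telegraf runs. A minimal Python equivalent is sketched below for clarity only; the file name and CLI are assumptions and nothing here is part of the patch.

# clean_telegraf_conf.py (hypothetical) -- mirror of the grep pipeline in
# create_telegraf_configs(): drop whole-line comments and blank lines,
# leaving inline comments untouched.
import sys

def clean_config(src_path, dst_path):
    with open(src_path) as src, open(dst_path, "w") as dst:
        for line in src:
            stripped = line.lstrip()
            if stripped == "" or stripped.startswith("#"):
                continue  # skip blank lines and whole-line comments
            dst.write(line)

if __name__ == "__main__":
    clean_config(sys.argv[1], sys.argv[2])  # e.g. tes_vm_monitor.once.conf -> tes_vm_monitor.once.clean.conf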
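Reviewer note: both .conf files rely on telegraf's ${VAR} environment-variable substitution at config load time, with the values exported by the tmp_task_env__*.sh file that start_vm_node_monitoring.sh writes for run_telegraf.sh. The pre-flight check below is a hypothetical helper (not part of the patch) that flags placeholders missing from the environment before telegraf is launched.

# check_conf_env.py (hypothetical) -- list ${VAR} placeholders referenced by a
# telegraf config that are not set in the current environment.
import os
import re
import sys

def missing_placeholders(conf_path):
    with open(conf_path) as conf:
        text = conf.read()
    referenced = set(re.findall(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}", text))
    return sorted(name for name in referenced if name not in os.environ)

if __name__ == "__main__":
    missing = missing_placeholders(sys.argv[1])
    if missing:
        print("Missing environment variables: " + ", ".join(missing), file=sys.stderr)
        sys.exit(1)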