Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update health tests to execute locally per role #356

Merged
merged 31 commits into from
Jan 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
8aafc2d
temp saving changes
arudell Nov 27, 2024
87c556d
save latest changes
arudell Nov 27, 2024
8b54a8a
save latest local tests
arudell Nov 27, 2024
10b1fa5
add logical network ping test
arudell Nov 27, 2024
30bd1c8
update processing of results
arudell Nov 27, 2024
3292323
port tests for nc service fabric
arudell Nov 29, 2024
f37ebb4
update code
arudell Dec 5, 2024
9158eb5
better handle errors w/ tests
arudell Dec 6, 2024
f815f38
add new localized tests
arudell Dec 12, 2024
a5ba728
add new mux functions
arudell Dec 12, 2024
5ce83ca
save latest changes for local tests
arudell Dec 12, 2024
c92654e
save latest changes
arudell Dec 13, 2024
726d3fa
consolidate health tests to single file
arudell Dec 16, 2024
5481a1e
save changes
arudell Dec 16, 2024
5d2c286
save latest changes
arudell Dec 16, 2024
4d1a413
save latest changes
arudell Dec 18, 2024
272da2d
save changes
arudell Dec 20, 2024
55df6ae
Adding SDN Health faults
sbgms Jan 13, 2025
dc23ca0
save latest changes
arudell Jan 15, 2025
c6285e5
save latest changes
arudell Jan 21, 2025
1a3feda
Adding telemetry for runners, some additional bug fixes
sbgms Jan 22, 2025
f4bfe90
Addressing CR comments and some bug fixes
sbgms Jan 23, 2025
5a02743
Merge branch 'main' of https://github.com/microsoft/SdnDiagnostics in…
arudell Jan 23, 2025
0ad3f16
remove duplicate function
arudell Jan 23, 2025
358dce6
add function to detect nuget path location
arudell Jan 23, 2025
83ddfcc
save latest changes
arudell Jan 24, 2025
ce95082
update tests, fix formatting, add new exported functions
arudell Jan 24, 2025
fcef004
fix scenario where cert may be in different x509 format
arudell Jan 24, 2025
10d5d86
add block for non-hci systems
arudell Jan 24, 2025
0750e42
improve logging for cert rotate
arudell Jan 24, 2025
e731e12
enable health faults only for supported systems
arudell Jan 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion src/SdnDiagnostics.psd1
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@
'Copy-SdnFileToComputer',
'Convert-SdnEtwTraceToTxt',
'Debug-SdnFabricInfrastructure',
'Debug-SdnGateway',
'Debug-SdnLoadBalancerMux',
'Debug-SdnNetworkController',
'Debug-SdnServer',
'Disable-SdnRasGatewayTracing',
'Enable-SdnRasGatewayTracing',
'Enable-SdnVipTrace'
Expand Down Expand Up @@ -145,11 +149,29 @@
'Start-SdnMuxCertificateRotation',
'Start-SdnServerCertificateRotation',
'Start-SdnNetshTrace',
'Start-SdnHealthFault',
'Stop-SdnEtwTraceCapture',
'Stop-SdnNetshTrace',
'Test-SdnCertificateRotationConfig',
'Test-SdnClusterServiceState',
'Test-SdnDiagnosticsCleanupTaskEnabled',
'Test-SdnExpressBGP',
'Test-SdnProviderAddressConnectivity'
'Test-SdnHostAgentConnectionStateToApiService',
'Test-SdnEncapOverhead',
'Test-SdnProviderAddressConnectivity',
'Test-SdnProviderNetwork',
'Test-SdnMuxConnectionStateToRouter',
'Test-SdnMuxConnectionStateToSlbManager',
'Test-SdnNetworkControllerApiNameResolution',
'Test-SdnResourceConfigurationState',
'Test-SdnResourceProvisioningState',
'Test-SdnServiceFabricApplicationHealth',
'Test-SdnServiceFabricClusterHealth',
'Test-SdnServiceFabricNodeStatus',
'Test-SdnConfigurationState',
'Test-SdnNonSelfSignedCertificateInTrustedRootStore',
'Test-SdnClusterServiceState',
'Test-SdnServiceState'
)

# Variables to export from this module
Expand Down
15 changes: 6 additions & 9 deletions src/SdnDiagnostics.psm1
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ New-Variable -Name 'SdnDiagnostics' -Scope 'Global' -Force -Value @{
Cache = @{}
EnvironmentInfo = @{
# Defines the cluster configuration type. Supported values are 'ServiceFabric' and 'FailoverCluster'.
# will default to 'ServiceFabric' on module import and updated once environment details have been retrieved
ClusterConfigType = 'ServiceFabric'
FailoverClusterConfig = @{
Name = $null
Expand Down Expand Up @@ -51,14 +50,12 @@ if (Confirm-IsFailoverClusterNC) {
# powershell module paths. We need to import the module from the artifact path
if ($Global:SdnDiagnostics.Config.Mode -ieq 'AzureStackHCI' -and $Global:SdnDiagnostics.EnvironmentInfo.ClusterConfigType -ieq 'FailoverCluster') {
if ($null -ieq (Get-Module -Name 'NetworkControllerFc')) {
if (Get-Command -Name 'Get-AsArtifactPath' -ErrorAction Ignore) {
try {
$nugetPath = Get-AsArtifactPath -NugetName 'Microsoft.AS.Network.Deploy.NC'
Import-Module "$nugetPath\content\Powershell\Roles\NC\NetworkControllerFc" -Global
}
catch {
Write-Warning "Failed to import NetworkControllerFc module. Error: $_"
}
try {
$nugetPath = Get-NugetArtifactPath -NugetName 'Microsoft.AS.Network.Deploy.NC'
Import-Module "$nugetPath\content\Powershell\Roles\NC\NetworkControllerFc" -Global
}
catch {
Write-Warning "Failed to import NetworkControllerFc module. Error: $_"
}
}
}
Expand Down
23 changes: 9 additions & 14 deletions src/modules/SdnDiag.Common.psm1
Original file line number Diff line number Diff line change
Expand Up @@ -1907,16 +1907,11 @@ function Repair-SdnDiagnosticsScheduledTask {
#>

[CmdletBinding()]
param()

switch ($Global:SdnDiagnostics.EnvironmentInfo.ClusterConfigType) {
'FailoverCluster' {
$taskName = "FcDiagnostics"
}
'ServiceFabric' {
$taskName = "SDN Diagnostics Task"
}
}
param(
[Parameter(Mandatory = $true)]
[ValidateSet('FcDiagnostics', 'SDN Diagnostics Task')]
[string]$TaskName
)

try {
$isLoggingEnabled = Get-ItemPropertyValue -Path "HKLM:\Software\Microsoft\NetworkController\Sdn\Diagnostics\Parameters" -Name 'IsLoggingEnabled'
Expand All @@ -1925,21 +1920,21 @@ function Repair-SdnDiagnosticsScheduledTask {
return $null
}

$scheduledTask = Get-ScheduledTask -TaskName $taskName -ErrorAction Stop
$scheduledTask = Get-ScheduledTask -TaskName $TaskName -ErrorAction Stop
if ($scheduledTask) {
# if the scheduled task is disabled, enable it and start it
if ($scheduledTask.State -ieq "Disabled") {
"Enabling scheduled task." | Trace-Output
$scheduledTask | Enable-ScheduledTask -ErrorAction Stop

"Starting scheduled task." | Trace-Output
Get-ScheduledTask -TaskName $taskName | Start-ScheduledTask -ErrorAction Stop
Get-ScheduledTask -TaskName $TaskName | Start-ScheduledTask -ErrorAction Stop
}
else {
"Scheduled task is already enabled." | Trace-Output
}

return (Get-ScheduledTask -TaskName $taskName)
return (Get-ScheduledTask -TaskName $TaskName)
}
else {
"Scheduled task does not exist." | Trace-Output -Level:Warning
Expand Down Expand Up @@ -2288,7 +2283,7 @@ function Confirm-IsCertSelfSigned {
[CmdletBinding()]
param (
[Parameter(Mandatory = $true)]
[System.Security.Cryptography.X509Certificates.X509Certificate2]$Certificate
$Certificate
)

if ($Certificate.Issuer -eq $Certificate.Subject) {
Expand Down
134 changes: 74 additions & 60 deletions src/modules/SdnDiag.Health.Config.psd1
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,62 @@

@{
HealthValidations = @{
'Test-EncapOverhead' = @{
Description = "EncapOverhead/JumboPacket is not configured properly on the Hyper-V Hosts"
Impact = "Intermittent packet loss may occur under certain conditions when routing traffic within the logical network."
PublicDocUrl = "https://learn.microsoft.com/en-us/windows-server/networking/sdn/troubleshoot/troubleshoot-windows-server-software-defined-networking-stack#check-mtu-and-jumbo-frame-support-on-hnv-provider-logical-network"

# COMMON TESTS

'Test-SdnDiagnosticsCleanupTaskEnabled' = @{
Description = "Scheduled task is not enabled on the SDN infrastructure node(s)."
Impact = "Unconstrained log files may grow and consume disk space."
PublicDocUrl = ""
}
'Test-SdnNetworkControllerApiNameResolution' = @{
Description = "Network Controller URL is not resolvable."
Impact = "Calls to Network Controller NB API will fail resulting in policy configuration failures and unable to manage SDN resources."
PublicDocUrl = ""
}
'Test-HostRootStoreNonRootCert' = @{
'Test-SdnNonSelfSignedCertificateInTrustedRootStore' = @{
Description = "Non Root Cert exist in Host Trusted Root CA Store"
Impact = "Network Controller will have issues communicating Host's TCP 6640 and 443 port with certificate error."
PublicDocUrl = ""
}
'Test-MuxBgpConnectionState' = @{
'Test-SdnServiceState' = @{
Description = "Identified service(s) are not running on the SDN infrastructure node(s)."
Impact = "SDN services and functionality will be impacted without the service running."
PublicDocUrl = ""
}

# GATEWAY TESTS


# LOAD BALANCER MUX TESTS

'Test-SdnMuxConnectionStateToRouter' = @{
Description = "One or more Load Balancer Muxes do not have an active BGP connection via TCP port 179 to the switch."
Impact = "Public IP addresses may not be routable as Load Balancer Muxes are not advertising the public IP addresses to the switch."
PublicDocUrl = "https://learn.microsoft.com/en-us/azure-stack/hci/manage/troubleshoot-software-load-balancer"
}
'Test-NetworkControllerCertCredential' = @{
Description = "Network Controller does not have the x509 certificate installed for southbound device(s)."
Impact = "Network Controller will have issues communicating with the southbound device(s)."
'Test-SdnMuxConnectionStateToSlbManager' = @{
Description = "SLB Manager does not have connectivity established to Mux(es) via TCP 8560."
Impact = "SLB Manager will not be able to program VIP:DIP mappings to the Load Balancer Mux(es) which will impact routing of Virtual IPs."
PublicDocUrl = "https://learn.microsoft.com/en-us/azure-stack/hci/manage/troubleshoot-software-load-balancer"
}

# NETWORK CONTROLLER TESTS

'Test-SdnServiceFabricApplicationHealth' = @{
Description = "Network Controller application with Service Fabric is not healthy."
Impact = "Network Controller services and functionality may be impacted."
PublicDocUrl = ""
}
'Test-NetworkInterfaceAPIDuplicateMacAddress' = @{
Description = "Duplicate MAC address detected within the API."
Impact = "Policy configuration failures may be reported by Network Controller when applying policies to the Hyper-v host. Network Interfaces reporting configurationState failure will not be routable."
'Test-SdnServiceFabricClusterHealth' = @{
Description = "Service Fabric cluster for Network Controller is not healthy."
Impact = "Network Controller services and functionality may be impacted."
PublicDocUrl = ""
}
'Test-ProviderNetwork' = @{
Description = "Logical network does not support VXLAN or NVGRE encapsulated traffic"
Impact = "Intermittent packet loss may occur under certain conditions when routing traffic within the logical network."
PublicDocUrl = "https://learn.microsoft.com/en-us/windows-server/networking/sdn/troubleshoot/troubleshoot-windows-server-software-defined-networking-stack#check-mtu-and-jumbo-frame-support-on-hnv-provider-logical-network"
'Test-SdnServiceFabricNodeStatus' = @{
Description = "Service Fabric node(s) are offline and not participating in the cluster."
Impact = "Minimum amount of nodes are required to maintain quorum and cluster availability. Services will be in read-only state if quorum is lost and may result in data loss."
PublicDocUrl = "https://learn.microsoft.com/en-us/azure/service-fabric/service-fabric-disaster-recovery"
}
'Test-ResourceConfigurationState' = @{
Description = "Infrastructure resource configuration is not Succeeded."
Expand All @@ -43,47 +70,30 @@
Impact = "SDN services and functionality may be impacted."
PublicDocUrl = "https://learn.microsoft.com/en-us/windows-server/networking/sdn/troubleshoot/troubleshoot-windows-server-software-defined-networking-stack#hoster-validate-system-health"
}
'Test-ScheduledTaskEnabled' = @{
Description = "Scheduled task is not enabled on the SDN infrastructure node(s)."
Impact = "Unconstrained log files may grow and consume disk space."
PublicDocUrl = ""
}
'Test-ServerHostId' = @{
Description = "HostID is not configured properly on the Hyper-V Hosts"
Impact = "Mismatch of HostId between Hyper-V host(s) and Network Controller will result in policy configuration failures."
PublicDocUrl = "https://learn.microsoft.com/en-us/windows-server/networking/sdn/troubleshoot/troubleshoot-windows-server-software-defined-networking-stack#check-for-corresponding-hostids-and-certificates-between-network-controller-and-each-hyper-v-host"
}
'Test-ServiceFabricApplicationHealth' = @{
Description = "Network Controller application with Service Fabric is not healthy."
Impact = "Network Controller services and functionality may be impacted."
PublicDocUrl = ""
}
'Test-ServiceFabricClusterHealth' = @{
Description = "Service Fabric cluster for Network Controller is not healthy."
Impact = "Network Controller services and functionality may be impacted."
'Test-NetworkInterfaceAPIDuplicateMacAddress' = @{
Description = "Duplicate MAC address detected within the API."
Impact = "Policy configuration failures may be reported by Network Controller when applying policies to the Hyper-v host. Network Interfaces reporting configurationState failure will not be routable."
PublicDocUrl = ""
}
'Test-ServiceFabricNodeStatus' = @{
Description = "Service Fabric node(s) are offline and not participating in the cluster."
Impact = "Minimum amount of nodes are required to maintain quorum and cluster availability. Services will be in read-only state if quorum is lost and may result in data loss."
PublicDocUrl = "https://learn.microsoft.com/en-us/azure/service-fabric/service-fabric-disaster-recovery"
}
'Test-ServiceFabricPartitionDatabaseSize' = @{
Description = "Service Fabric partition database size has exceeded normal size expected."
Impact = "Performance of the Service Fabric Services may occur."
PublicDocUrl = ""

# SERVER TESTS

'Test-SdnEncapOverhead' = @{
Description = "EncapOverhead/JumboPacket is not configured properly on the Hyper-V Hosts"
Impact = "Intermittent packet loss may occur under certain conditions when routing traffic within the logical network."
PublicDocUrl = "https://learn.microsoft.com/en-us/windows-server/networking/sdn/troubleshoot/troubleshoot-windows-server-software-defined-networking-stack#check-mtu-and-jumbo-frame-support-on-hnv-provider-logical-network"
}
'Test-ServiceState' = @{
Description = "Identified service(s) are not running on the SDN infrastructure node(s)."
Impact = "SDN services and functionality will be impacted without the service running."
'Test-SdnHostAgentConnectionStateToApiService' = @{
Description = "Network Controller Host Agent is not connected to the Network Controller API Service."
Impact = "Policy configuration may not be pushed to the Hyper-V host(s) if no southbound connectivity is available."
PublicDocUrl = ""
}
'Test-SlbManagerConnectionToMux' = @{
Description = "SLB Manager is not able to connect to the Mux(es)."
Impact = "SLB Manager will not be able to program VIP:DIP mappings to the Load Balancer Mux(es) which will impact routing of Virtual IPs."
PublicDocUrl = "https://learn.microsoft.com/en-us/azure-stack/hci/manage/troubleshoot-software-load-balancer"
'Test-SdnProviderNetwork' = @{
Description = "Logical network does not support VXLAN or NVGRE encapsulated traffic"
Impact = "Intermittent packet loss may occur under certain conditions when routing traffic within the logical network."
PublicDocUrl = "https://learn.microsoft.com/en-us/windows-server/networking/sdn/troubleshoot/troubleshoot-windows-server-software-defined-networking-stack#check-mtu-and-jumbo-frame-support-on-hnv-provider-logical-network"
}
'Test-VfpDuplicatePort' = @{
'Test-VfpDuplicateMacAddress' = @{
Description = "Duplicate MAC address detected within Virtual Filtering Platform (VFP)."
Impact = "Policy configuration failures may be reported by Network Controller when applying policies to the Hyper-v host. In addition, network traffic may be impacted."
PublicDocUrl = ""
Expand All @@ -93,15 +103,10 @@
Impact = "Policy configuration failures may be reported by Network Controller when applying policies to the Hyper-v host. In addition, network traffic may be impacted for the interfaces that are duplicated."
PublicDocUrl = ""
}
'Test-NcHostAgentConnectionToApiService' = @{
Description = "Network Controller Host Agent is not connected to the Network Controller API Service."
Impact = "Policy configuration may not be pushed to the Hyper-V host(s) if no southbound connectivity is available."
PublicDocUrl = ""
}
'Test-NcUrlNameResolution' = @{
Description = "Network Controller URL is not resolvable."
Impact = "Calls to Network Controller NB API will fail resulting in policy configuration failures and unable to manage SDN resources."
PublicDocUrl = ""
'Test-ServerHostId' = @{
Description = "HostID is not configured properly on the Hyper-V Hosts"
Impact = "Mismatch of HostId between Hyper-V host(s) and Network Controller will result in policy configuration failures."
PublicDocUrl = "https://learn.microsoft.com/en-us/windows-server/networking/sdn/troubleshoot/troubleshoot-windows-server-software-defined-networking-stack#check-for-corresponding-hostids-and-certificates-between-network-controller-and-each-hyper-v-host"
}
}
ConfigurationStateErrorCodes = @{
Expand Down Expand Up @@ -182,4 +187,13 @@
Action = 'See if sufficient bandwidth is available for all VM''s if QOS reservation is used'
}
}

HealthFaultEnabled = $false
HealthFaultSupportedBuilds = @(
'24H2' # Build Number 26100
)
HealthFaultSupportedProducts = @(
'Azure Stack HCI'
'Windows Server 2025 Datacenter'
)
}
Loading
Loading