diff --git a/src/SdnDiagnostics.psd1 b/src/SdnDiagnostics.psd1 index c5813801..aac002d1 100644 --- a/src/SdnDiagnostics.psd1 +++ b/src/SdnDiagnostics.psd1 @@ -55,6 +55,10 @@ 'Copy-SdnFileToComputer', 'Convert-SdnEtwTraceToTxt', 'Debug-SdnFabricInfrastructure', + 'Debug-SdnGateway', + 'Debug-SdnLoadBalancerMux', + 'Debug-SdnNetworkController', + 'Debug-SdnServer', 'Disable-SdnRasGatewayTracing', 'Enable-SdnRasGatewayTracing', 'Enable-SdnVipTrace' @@ -145,11 +149,29 @@ 'Start-SdnMuxCertificateRotation', 'Start-SdnServerCertificateRotation', 'Start-SdnNetshTrace', + 'Start-SdnHealthFault', 'Stop-SdnEtwTraceCapture', 'Stop-SdnNetshTrace', 'Test-SdnCertificateRotationConfig', + 'Test-SdnClusterServiceState', + 'Test-SdnDiagnosticsCleanupTaskEnabled', 'Test-SdnExpressBGP', - 'Test-SdnProviderAddressConnectivity' + 'Test-SdnHostAgentConnectionStateToApiService', + 'Test-SdnEncapOverhead', + 'Test-SdnProviderAddressConnectivity', + 'Test-SdnProviderNetwork', + 'Test-SdnMuxConnectionStateToRouter', + 'Test-SdnMuxConnectionStateToSlbManager', + 'Test-SdnNetworkControllerApiNameResolution', + 'Test-SdnResourceConfigurationState', + 'Test-SdnResourceProvisioningState', + 'Test-SdnServiceFabricApplicationHealth', + 'Test-SdnServiceFabricClusterHealth', + 'Test-SdnServiceFabricNodeStatus', + 'Test-SdnConfigurationState', + 'Test-SdnNonSelfSignedCertificateInTrustedRootStore', + 'Test-SdnClusterServiceState', + 'Test-SdnServiceState' ) # Variables to export from this module diff --git a/src/SdnDiagnostics.psm1 b/src/SdnDiagnostics.psm1 index 62996910..27207c4a 100644 --- a/src/SdnDiagnostics.psm1 +++ b/src/SdnDiagnostics.psm1 @@ -5,7 +5,6 @@ New-Variable -Name 'SdnDiagnostics' -Scope 'Global' -Force -Value @{ Cache = @{} EnvironmentInfo = @{ # defines the cluster configuration type, supported values are 'ServiceFabric', 'FailoverCluster' - # will default to 'ServiceFabric' on module import and updated once environment details have been retrieved ClusterConfigType = 
'ServiceFabric' FailoverClusterConfig = @{ Name = $null @@ -51,14 +50,12 @@ if (Confirm-IsFailoverClusterNC) { # powershell module paths. We need to import the module from the artifact path if ($Global:SdnDiagnostics.Config.Mode -ieq 'AzureStackHCI' -and $Global:SdnDiagnostics.EnvironmentInfo.ClusterConfigType -ieq 'FailoverCluster') { if ($null -ieq (Get-Module -Name 'NetworkControllerFc')) { - if (Get-Command -Name 'Get-AsArtifactPath' -ErrorAction Ignore) { - try { - $nugetPath = Get-AsArtifactPath -NugetName 'Microsoft.AS.Network.Deploy.NC' - Import-Module "$nugetPath\content\Powershell\Roles\NC\NetworkControllerFc" -Global - } - catch { - Write-Warning "Failed to import NetworkControllerFc module. Error: $_" - } + try { + $nugetPath = Get-NugetArtifactPath -NugetName 'Microsoft.AS.Network.Deploy.NC' + Import-Module "$nugetPath\content\Powershell\Roles\NC\NetworkControllerFc" -Global + } + catch { + Write-Warning "Failed to import NetworkControllerFc module. Error: $_" } } } diff --git a/src/modules/SdnDiag.Common.psm1 b/src/modules/SdnDiag.Common.psm1 index 6565a4a1..db662c8b 100644 --- a/src/modules/SdnDiag.Common.psm1 +++ b/src/modules/SdnDiag.Common.psm1 @@ -1909,16 +1909,11 @@ function Repair-SdnDiagnosticsScheduledTask { #> [CmdletBinding()] - param() - - switch ($Global:SdnDiagnostics.EnvironmentInfo.ClusterConfigType) { - 'FailoverCluster' { - $taskName = "FcDiagnostics" - } - 'ServiceFabric' { - $taskName = "SDN Diagnostics Task" - } - } + param( + [Parameter(Mandatory = $true)] + [ValidateSet('FcDiagnostics', 'SDN Diagnostics Task')] + [string]$TaskName + ) try { $isLoggingEnabled = Get-ItemPropertyValue -Path "HKLM:\Software\Microsoft\NetworkController\Sdn\Diagnostics\Parameters" -Name 'IsLoggingEnabled' @@ -1927,7 +1922,7 @@ function Repair-SdnDiagnosticsScheduledTask { return $null } - $scheduledTask = Get-ScheduledTask -TaskName $taskName -ErrorAction Stop + $scheduledTask = Get-ScheduledTask -TaskName $TaskName -ErrorAction Stop if 
($scheduledTask) { # if the scheduled task is disabled, enable it and start it if ($scheduledTask.State -ieq "Disabled") { @@ -1935,13 +1930,13 @@ function Repair-SdnDiagnosticsScheduledTask { $scheduledTask | Enable-ScheduledTask -ErrorAction Stop "Starting scheduled task." | Trace-Output - Get-ScheduledTask -TaskName $taskName | Start-ScheduledTask -ErrorAction Stop + Get-ScheduledTask -TaskName $TaskName | Start-ScheduledTask -ErrorAction Stop } else { "Scheduled task is already enabled." | Trace-Output } - return (Get-ScheduledTask -TaskName $taskName) + return (Get-ScheduledTask -TaskName $TaskName) } else { "Scheduled task does not exist." | Trace-Output -Level:Warning @@ -2290,7 +2285,7 @@ function Confirm-IsCertSelfSigned { [CmdletBinding()] param ( [Parameter(Mandatory = $true)] - [System.Security.Cryptography.X509Certificates.X509Certificate2]$Certificate + $Certificate ) if ($Certificate.Issuer -eq $Certificate.Subject) { diff --git a/src/modules/SdnDiag.Health.Config.psd1 b/src/modules/SdnDiag.Health.Config.psd1 index 4e2c8ec2..2ea38eb3 100644 --- a/src/modules/SdnDiag.Health.Config.psd1 +++ b/src/modules/SdnDiag.Health.Config.psd1 @@ -3,35 +3,62 @@ @{ HealthValidations = @{ - 'Test-EncapOverhead' = @{ - Description = "EncapOverhead/JumboPacket is not configured properly on the Hyper-V Hosts" - Impact = "Intermittent packet loss may occur under certain conditions when routing traffic within the logical network." - PublicDocUrl = "https://learn.microsoft.com/en-us/windows-server/networking/sdn/troubleshoot/troubleshoot-windows-server-software-defined-networking-stack#check-mtu-and-jumbo-frame-support-on-hnv-provider-logical-network" + + # COMMON TESTS + + 'Test-SdnDiagnosticsCleanupTaskEnabled' = @{ + Description = "Scheduled task is not enabled on the SDN infrastructure node(s)." + Impact = "Unconstrained log files may grow and consume disk space." 
+ PublicDocUrl = "" + } + 'Test-SdnNetworkControllerApiNameResolution' = @{ + Description = "Network Controller URL is not resolvable." + Impact = "Calls to Network Controller NB API will fail resulting in policy configuration failures and unable to manage SDN resources." + PublicDocUrl = "" } - 'Test-HostRootStoreNonRootCert' = @{ + 'Test-SdnNonSelfSignedCertificateInTrustedRootStore' = @{ Description = "Non Root Cert exist in Host Trusted Root CA Store" Impact = "Network Controller will have issues communicating Host's TCP 6640 and 443 port with certificate error." PublicDocUrl = "" } - 'Test-MuxBgpConnectionState' = @{ + 'Test-SdnServiceState' = @{ + Description = "Identified service(s) are not running on the SDN infrastructure node(s)." + Impact = "SDN services and functionality will be impacted without the service running." + PublicDocUrl = "" + } + + # GATEWAY TESTS + + + # LOAD BALANCER MUX TESTS + + 'Test-SdnMuxConnectionStateToRouter' = @{ Description = "One or more Load Balancer Muxes do not have an active BGP connection via TCP port 179 to the switch." Impact = "Public IP addresses may not be routable as Load Balancer Muxes are not advertising the public IP addresses to the switch." PublicDocUrl = "https://learn.microsoft.com/en-us/azure-stack/hci/manage/troubleshoot-software-load-balancer" } - 'Test-NetworkControllerCertCredential' = @{ - Description = "Network Controller does not have the x509 certificate installed for southbound device(s)." - Impact = "Network Controller will have issues communicating with the southbound device(s)." + 'Test-SdnMuxConnectionStateToSlbManager' = @{ + Description = "SLB Manager does not have connectivity established to Mux(es) via TCP 8560." + Impact = "SLB Manager will not be able to program VIP:DIP mappings to the Load Balancer Mux(es) which will impact routing of Virtual IPs." 
+ PublicDocUrl = "https://learn.microsoft.com/en-us/azure-stack/hci/manage/troubleshoot-software-load-balancer" + } + + # NETWORK CONTROLLER TESTS + + 'Test-SdnServiceFabricApplicationHealth' = @{ + Description = "Network Controller application with Service Fabric is not healthy." + Impact = "Network Controller services and functionality may be impacted." PublicDocUrl = "" } - 'Test-NetworkInterfaceAPIDuplicateMacAddress' = @{ - Description = "Duplicate MAC address detected within the API." - Impact = "Policy configuration failures may be reported by Network Controller when applying policies to the Hyper-v host. Network Interfaces reporting configurationState failure will not be routable." + 'Test-SdnServiceFabricClusterHealth' = @{ + Description = "Service Fabric cluster for Network Controller is not healthy." + Impact = "Network Controller services and functionality may be impacted." PublicDocUrl = "" } - 'Test-ProviderNetwork' = @{ - Description = "Logical network does not support VXLAN or NVGRE encapsulated traffic" - Impact = "Intermittent packet loss may occur under certain conditions when routing traffic within the logical network." - PublicDocUrl = "https://learn.microsoft.com/en-us/windows-server/networking/sdn/troubleshoot/troubleshoot-windows-server-software-defined-networking-stack#check-mtu-and-jumbo-frame-support-on-hnv-provider-logical-network" + 'Test-SdnServiceFabricNodeStatus' = @{ + Description = "Service Fabric node(s) are offline and not participating in the cluster." + Impact = "Minimum amount of nodes are required to maintain quorum and cluster availability. Services will be in read-only state if quorum is lost and may result in data loss." + PublicDocUrl = "https://learn.microsoft.com/en-us/azure/service-fabric/service-fabric-disaster-recovery" } 'Test-ResourceConfigurationState' = @{ Description = "Infrastructure resource configuration is not Succeeded." @@ -43,47 +70,30 @@ Impact = "SDN services and functionality may be impacted." 
PublicDocUrl = "https://learn.microsoft.com/en-us/windows-server/networking/sdn/troubleshoot/troubleshoot-windows-server-software-defined-networking-stack#hoster-validate-system-health" } - 'Test-ScheduledTaskEnabled' = @{ - Description = "Scheduled task is not enabled on the SDN infrastructure node(s)." - Impact = "Unconstrained log files may grow and consume disk space." - PublicDocUrl = "" - } - 'Test-ServerHostId' = @{ - Description = "HostID is not configured properly on the Hyper-V Hosts" - Impact = "Mismatch of HostId between Hyper-V host(s) and Network Controller will result in policy configuration failures." - PublicDocUrl = "https://learn.microsoft.com/en-us/windows-server/networking/sdn/troubleshoot/troubleshoot-windows-server-software-defined-networking-stack#check-for-corresponding-hostids-and-certificates-between-network-controller-and-each-hyper-v-host" - } - 'Test-ServiceFabricApplicationHealth' = @{ - Description = "Network Controller application with Service Fabric is not healthy." - Impact = "Network Controller services and functionality may be impacted." - PublicDocUrl = "" - } - 'Test-ServiceFabricClusterHealth' = @{ - Description = "Service Fabric cluster for Network Controller is not healthy." - Impact = "Network Controller services and functionality may be impacted." + 'Test-NetworkInterfaceAPIDuplicateMacAddress' = @{ + Description = "Duplicate MAC address detected within the API." + Impact = "Policy configuration failures may be reported by Network Controller when applying policies to the Hyper-v host. Network Interfaces reporting configurationState failure will not be routable." PublicDocUrl = "" } - 'Test-ServiceFabricNodeStatus' = @{ - Description = "Service Fabric node(s) are offline and not participating in the cluster." - Impact = "Minimum amount of nodes are required to maintain quorum and cluster availability. Services will be in read-only state if quorum is lost and may result in data loss." 
- PublicDocUrl = "https://learn.microsoft.com/en-us/azure/service-fabric/service-fabric-disaster-recovery" - } - 'Test-ServiceFabricPartitionDatabaseSize' = @{ - Description = "Service Fabric partition database size has exceeded normal size expected." - Impact = "Performance of the Service Fabric Services may occur." - PublicDocUrl = "" + + # SERVER TESTS + + 'Test-SdnEncapOverhead' = @{ + Description = "EncapOverhead/JumboPacket is not configured properly on the Hyper-V Hosts" + Impact = "Intermittent packet loss may occur under certain conditions when routing traffic within the logical network." + PublicDocUrl = "https://learn.microsoft.com/en-us/windows-server/networking/sdn/troubleshoot/troubleshoot-windows-server-software-defined-networking-stack#check-mtu-and-jumbo-frame-support-on-hnv-provider-logical-network" } - 'Test-ServiceState' = @{ - Description = "Identified service(s) are not running on the SDN infrastructure node(s)." - Impact = "SDN services and functionality will be impacted without the service running." + 'Test-SdnHostAgentConnectionStateToApiService' = @{ + Description = "Network Controller Host Agent is not connected to the Network Controller API Service." + Impact = "Policy configuration may not be pushed to the Hyper-V host(s) if no southbound connectivity is available." PublicDocUrl = "" } - 'Test-SlbManagerConnectionToMux' = @{ - Description = "SLB Manager is not able to connect to the Mux(es)." - Impact = "SLB Manager will not be able to program VIP:DIP mappings to the Load Balancer Mux(es) which will impact routing of Virtual IPs." - PublicDocUrl = "https://learn.microsoft.com/en-us/azure-stack/hci/manage/troubleshoot-software-load-balancer" + 'Test-SdnProviderNetwork' = @{ + Description = "Logical network does not support VXLAN or NVGRE encapsulated traffic" + Impact = "Intermittent packet loss may occur under certain conditions when routing traffic within the logical network." 
+ PublicDocUrl = "https://learn.microsoft.com/en-us/windows-server/networking/sdn/troubleshoot/troubleshoot-windows-server-software-defined-networking-stack#check-mtu-and-jumbo-frame-support-on-hnv-provider-logical-network" } - 'Test-VfpDuplicatePort' = @{ + 'Test-VfpDuplicateMacAddress' = @{ Description = "Duplicate MAC address detected within Virtual Filtering Platform (VFP)." Impact = "Policy configuration failures may be reported by Network Controller when applying policies to the Hyper-v host. In addition, network traffic may be impacted." PublicDocUrl = "" @@ -93,15 +103,10 @@ Impact = "Policy configuration failures may be reported by Network Controller when applying policies to the Hyper-v host. In addition, network traffic may be impacted for the interfaces that are duplicated." PublicDocUrl = "" } - 'Test-NcHostAgentConnectionToApiService' = @{ - Description = "Network Controller Host Agent is not connected to the Network Controller API Service." - Impact = "Policy configuration may not be pushed to the Hyper-V host(s) if no southbound connectivity is available." - PublicDocUrl = "" - } - 'Test-NcUrlNameResolution' = @{ - Description = "Network Controller URL is not resolvable." - Impact = "Calls to Network Controller NB API will fail resulting in policy configuration failures and unable to manage SDN resources." - PublicDocUrl = "" + 'Test-ServerHostId' = @{ + Description = "HostID is not configured properly on the Hyper-V Hosts" + Impact = "Mismatch of HostId between Hyper-V host(s) and Network Controller will result in policy configuration failures." 
+ PublicDocUrl = "https://learn.microsoft.com/en-us/windows-server/networking/sdn/troubleshoot/troubleshoot-windows-server-software-defined-networking-stack#check-for-corresponding-hostids-and-certificates-between-network-controller-and-each-hyper-v-host" } } ConfigurationStateErrorCodes = @{ @@ -182,4 +187,13 @@ Action = 'See if sufficient bandwidth is available for all VM''s if QOS reservation is used' } } + + HealthFaultEnabled = $false + HealthFaultSupportedBuilds = @( + '24H2' # Build Number 26100 + ) + HealthFaultSupportedProducts = @( + 'Azure Stack HCI' + 'Windows Server 2025 Datacenter' + ) } diff --git a/src/modules/SdnDiag.Health.psm1 b/src/modules/SdnDiag.Health.psm1 index 6c577001..f32e25b2 100644 --- a/src/modules/SdnDiag.Health.psm1 +++ b/src/modules/SdnDiag.Health.psm1 @@ -2,1467 +2,1508 @@ # Licensed under the MIT License. Import-Module $PSScriptRoot\SdnDiag.Common.psm1 +Import-Module $PSScriptRoot\SdnDiag.Server.psm1 +Import-Module $PSScriptRoot\SdnDiag.NetworkController.psm1 Import-Module $PSScriptRoot\SdnDiag.NetworkController.FC.psm1 Import-Module $PSScriptRoot\SdnDiag.NetworkController.SF.psm1 Import-Module $PSScriptRoot\SdnDiag.Utilities.psm1 $configurationData = Import-PowerShellDataFile -Path "$PSScriptRoot\SdnDiag.Health.Config.psd1" New-Variable -Name 'SdnDiagnostics_Health' -Scope 'Script' -Force -Value @{ - Cache = @{} + Cache = @{} Config = $configurationData } -########################## -#### CLASSES & ENUMS ##### -########################## - -enum SdnHealthResult { - PASS - FAIL - WARNING +# confirm that the current system is supported to generate health faults +$displayVersion = Get-ItemProperty -Path "HKLM:\SOFTWARE\Microsoft\Windows NT\CurrentVersion" -Name 'DisplayVersion' +$productName = Get-ItemProperty -Path "HKLM:\SOFTWARE\Microsoft\Windows NT\CurrentVersion" -Name 'ProductName' +if ($productName.ProductName -iin $script:SdnDiagnostics_Health.Config.HealthFaultSupportedProducts){ + $productSupported = $true } - -class 
SdnHealth { - [String]$Name = (Get-PSCallStack)[1].Command - [SdnHealthResult]$Result = 'PASS' - [DateTime]$OccurrenceTime = [System.DateTime]::UtcNow - [Object]$Properties - [String[]]$Remediation +if ($displayVersion.DisplayVersion -iin $script:SdnDiagnostics_Health.Config.HealthFaultSupportedBuilds){ + $versionSupported = $true } - -class SdnFabricEnvObject { - [String[]]$ComputerName - [Uri]$NcUrl - [Object]$Role - [Object]$EnvironmentInfo +if ($versionSupported -and $productSupported){ + $script:SdnDiagnostics_Health.Config.HealthFaultEnabled = $true } -class SdnFabricHealthReport { - [DateTime]$OccurrenceTime = [System.DateTime]::UtcNow - [String]$Role - [SdnHealthResult]$Result = 'PASS' - [Object[]]$HealthValidation +########################## +#### CLASSES & ENUMS ##### +########################## + +class SdnFaultInfo { + [datetime] $OccurrenceTime = [System.DateTime]::UtcNow + [string] $KeyFaultingObjectDescription + [string] $KeyFaultingObjectID + [string] $KeyFaultingObjectType + [string] $FaultingObjectLocation + [string] $FaultDescription + [string] $FaultActionRemediation } ########################## -#### ARG COMPLETERS ###### +#### FAULT HELPERS ##### ########################## -$argScriptBlock = @{ - Role = { - param($commandName, $parameterName, $wordToComplete, $commandAst, $fakeBoundParameters) - $result = (Get-SdnFabricInfrastructureResult) - if ([string]::IsNullOrEmpty($wordToComplete)) { - return ($result.Role | Sort-Object -Unique) - } +# pInvoke definition for fault APIs +$signature = @' +[DllImport("hcihealthutils.dll", CharSet = CharSet.Unicode, SetLastError = false)] +public static extern int HciModifyFault( + string entityType, + string entityKey, + string entityDescription, + string entityLocation, + string entityUniqueKey, + uint action, + string faultType, + uint urgency, + string title, + string description, + string actions, + uint flag); + +[DllImport("hcihealthutils.dll", CharSet = CharSet.Unicode, SetLastError = false)] +public 
static extern int HciModifyRelationship( + string entityType, + string entityKey, + string entityDescription, + string entityLocation, + string entityUniqueKey, + uint action, + string parentEntityType, + string parenetEntityKey, + string parentEntityDescription, + string parentEntityLocation, + string parentEntityUniqueKey, + string groupKey, + uint urgency, + uint relationshipType, + uint flag); +'@ + +function ValidateFault { + param( + [SdnFaultInfo] $Fault + ) - return $result.Role | Where-Object {$_.Role -like "*$wordToComplete*"} | Sort-Object + if ([string]::IsNullOrEmpty($Fault.KeyFaultingObjectDescription)) { + throw "KeyFaultingObjectDescription is required" } - Name = { - param($commandName, $parameterName, $wordToComplete, $commandAst, $fakeBoundParameters) - $result = (Get-SdnFabricInfrastructureResult).HealthValidation - if ([string]::IsNullOrEmpty($wordToComplete)) { - return ($result.Name | Sort-Object -Unique) - } - return $result.Name | Where-Object {$_.Name -like "*$wordToComplete*"} | Sort-Object + if ([string]::IsNullOrEmpty($Fault.KeyFaultingObjectID)) { + throw "KeyFaultingObjectID is required" } -} -Register-ArgumentCompleter -CommandName 'Get-SdnFabricInfrastructureResult' -ParameterName 'Role' -ScriptBlock $argScriptBlock.Role -Register-ArgumentCompleter -CommandName 'Get-SdnFabricInfrastructureResult' -ParameterName 'Name' -ScriptBlock $argScriptBlock.Name + if ([string]::IsNullOrEmpty($Fault.KeyFaultingObjectType)) { + throw "KeyFaultingObjectType is required" + } +} +function LogWmiHealthFault { -########################## -####### FUNCTIONS ######## -########################## + <# + .SYNOPSIS + Logs the WMI version of the health fault -function Get-HealthData { - param ( - [Parameter(Mandatory = $true)] - [System.String]$Property, + .PARAMETER fault + The fault to log + #> - [Parameter(Mandatory = $true)] - [System.String]$Id + param( + [object] $fault ) - - $results = $script:SdnDiagnostics_Health.Config[$Property] - return 
($results[$Id]) + Write-Verbose " WmiFault:" + Write-Verbose " (FaultId) $($fault.FaultId)" + Write-Verbose " (FaultingObjectDescription) $($fault.FaultingObjectDescription)" + Write-Verbose " (FaultingObjectLocation) $($fault.FaultingObjectLocation)" + Write-Verbose " (FaultingObjectType) $($fault.FaultingObjectType)" + Write-Verbose " (FaultingObjectUniqueId) $($fault.FaultingObjectUniqueId)" + Write-Verbose " (FaultTime) $($fault.FaultTime)" + Write-Verbose " (FaultType) $($fault.FaultType)" + Write-Verbose " (Reason) $($fault.Reason)" } -function Test-EncapOverhead { +function ConvertFaultListToPsObjectList { + <# - .SYNOPSIS - Retrieves the VMSwitch across servers in the dataplane to confirm that the network interfaces support EncapOverhead or JumboPackets - and that the settings are configured as expected + .SYNOPSIS + Converts a list of faults to a list of PSObjects + (used by ASZ modules to emit telemetry events ) + + .PARAMETER faults + The list of faults to convert #> - [CmdletBinding()] - param ( - [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, + param( + [SdnFaultInfo[]] $faults, - [Parameter(Mandatory = $false)] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $Credential = [System.Management.Automation.PSCredential]::Empty + [ValidateSet("Create", "Delete")] + [string] $faultType ) - [int]$encapOverheadExpectedValue = 160 - [int]$jumboPacketExpectedValue = 1674 # this is default 1514 MTU + 160 encap overhead - $sdnHealthObject = [SdnHealth]::new() - $array = @() - - try { - "Validating the network interfaces across the SDN dataplane support Encap Overhead or Jumbo Packets" | Trace-Output - - $encapOverheadResults = Invoke-PSRemoteCommand -ComputerName $SdnEnvironmentObject.ComputerName -Credential $Credential -Scriptblock {Get-SdnNetAdapterEncapOverheadConfig} - if($null -eq $encapOverheadResults){ - $sdnHealthObject.Result = 'FAIL' + $faultList = @() + foreach ($fault in 
$faults) { + # convert properties of class SdnFaultInfo + $faultList += [PSCustomObject]@{ + OccurrenceTime = $fault.OccurrenceTime + KeyFaultingObjectDescription = $fault.KeyFaultingObjectDescription + KeyFaultingObjectID = $fault.KeyFaultingObjectID + KeyFaultingObjectType = $fault.KeyFaultingObjectType + FaultingObjectLocation = $fault.FaultingObjectLocation + FaultDescription = $fault.FaultDescription + FaultActionRemediation = $fault.FaultActionRemediation + OperationType = $faultType } - else { - foreach($object in ($encapOverheadResults | Group-Object -Property PSComputerName)){ - foreach($interface in $object.Group){ - "[{0}] {1}" -f $object.Name, ($interface | Out-String -Width 4096) | Trace-Output -Level:Verbose + } - if($interface.EncapOverheadEnabled -eq $false -or $interface.EncapOverheadValue -lt $encapOverheadExpectedValue){ - "EncapOverhead settings for {0} on {1} are disabled or not configured correctly" -f $interface.NetworkInterface, $object.Name | Trace-Output -Level:Verbose - $encapDisabled = $true - } + return $faultList +} - if($interface.JumboPacketEnabled -eq $false -or $interface.JumboPacketValue -lt $jumboPacketExpectedValue){ - "JumboPacket settings for {0} on {1} are disabled or not configured correctly" -f $interface.NetworkInterface, $object.Name | Trace-Output -Level:Verbose - $jumboPacketDisabled = $true - } +function ConvertFaultToPsObject { - # if both encapoverhead and jumbo packets are not set, this is indication the physical network cannot support VXLAN encapsulation - # and as such, environment would experience intermittent packet loss - if ($encapDisabled -and $jumboPacketDisabled) { - $sdnHealthObject.Result = 'FAIL' - $sdnHealthObject.Remediation += "Ensure EncapOverhead and JumboPacket for interface {0} on {1} are enabled and configured correctly." 
-f $interface.NetworkInterface, $object.Name + <# + .SYNOPSIS + Converts a fault to a PSObject + (used by ASZ modules to emit telemetry events ) - "EncapOverhead and JumboPacket for interface {0} on {1} are disabled or not configured correctly." -f $interface.NetworkInterface, $object.Name | Trace-Output -Level:Error - } + .PARAMETER healthFault + The fault to convert - $array += $interface - } - } + .PARAMETER faultOpType + The operation type of the fault + #> - $sdnHealthObject.Properties = $array - } + param( + [SdnFaultInfo] $healthFault, - return $sdnHealthObject - } - catch { - $_ | Trace-Exception - $_ | Write-Error + [ValidateSet("Create", "Delete")] + [string] $faultOpType + ) + + # convert properties of class SdnFaultInfo + $faultObject = [PSCustomObject]@{ + OccurrenceTime = $healthFault.OccurrenceTime + KeyFaultingObjectDescription = $healthFault.KeyFaultingObjectDescription + KeyFaultingObjectID = $healthFault.KeyFaultingObjectID + KeyFaultingObjectType = $healthFault.KeyFaultingObjectType + FaultingObjectLocation = $healthFault.FaultingObjectLocation + FaultDescription = $healthFault.FaultDescription + FaultActionRemediation = $healthFault.FaultActionRemediation + OperationType = $faultOpType } + + return $faultObject } -function Test-HostRootStoreNonRootCert { +function LogHealthFault { + <# - .SYNOPSIS - Validate the Cert in Host's Root CA Store to detect if any Non Root Cert exist - #> + .SYNOPSIS + Logs the health fault - [CmdletBinding()] - param ( - [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, + .PARAMETER fault + The fault to log + #> - [Parameter(Mandatory = $false)] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $Credential = [System.Management.Automation.PSCredential]::Empty + param( + [object] $healthFault ) + Write-Verbose " HealthFault:" + Write-Verbose " (KeyFaultingObjectDescription) $($healthFault.KeyFaultingObjectDescription)" + Write-Verbose " 
(KeyFaultingObjectID) $($healthFault.KeyFaultingObjectID)" + Write-Verbose " (KeyFaultingObjectType) $($healthFault.KeyFaultingObjectType)" + Write-Verbose " (FaultingObjectLocation) $($healthFault.FaultingObjectLocation)" + Write-Verbose " (FaultDescription) $($healthFault.FaultDescription)" + Write-Verbose " (FaultActionRemediation) $($healthFault.FaultActionRemediation)" +} - $sdnHealthObject = [SdnHealth]::new() - $array = @() +function LogHealthFaultToEventLog { - try { - "Validating Certificates under Root CA Store" | Trace-Output - - $scriptBlock = { - $nonRootCerts = @() - $rootCerts = Get-ChildItem Cert:LocalMachine\Root - foreach ($rootCert in $rootCerts) { - if (-NOT (Confirm-IsCertSelfSigned -Certificate $rootCert)) { - $certInfo = [PSCustomObject]@{ - Thumbprint = $rootCert.Thumbprint - Subject = $rootCert.Subject - Issuer = $rootCert.Issuer - } + <# + .SYNOPSIS + Logs the health fault to the event log - $nonRootCerts += $certInfo - } - } - return $nonRootCerts - } + .PARAMETER fault + The fault to log + #> - foreach($node in $SdnEnvironmentObject.ComputerName){ - $nonRootCerts = Invoke-PSRemoteCommand -ComputerName $node -Credential $Credential -ScriptBlock $scriptBlock -PassThru - # If any node have Non Root Certs in Trusted Root Store. Issue detected. 
- if($nonRootCerts.Count -gt 0){ - $sdnHealthObject.Result = 'FAIL' + [CmdletBinding()] + param( + [object] $fault, - $object = [PSCustomObject]@{ - ComputerName = $node - NonRootCerts = $nonRootCerts - } + [ValidateSet("Create", "Delete")] + [string] $operation + ) - foreach($nonRootCert in $nonRootCerts) { - $sdnHealthObject.Remediation += "Remove Certificate Thumbprint:{0} Subject:{1} from Host:{2}" -f $nonRootCert.Thumbprint, $nonRootCert.Subject, $node - } + if ([string]::IsNullOrEmpty($operation) ) { + $operation = "" + } + + $eventLogMessage = "SDN HealthServiceHealth Fault: $($fault.FaultDescription)" + $eventLogMessage += "`r`n" + $eventLogMessage += "Faulting Object Description: $($fault.KeyFaultingObjectDescription)" + $eventLogMessage += "`r`n" + $eventLogMessage += "Faulting Object ID: $($fault.KeyFaultingObjectID)" + $eventLogMessage += "`r`n" + $eventLogMessage += "Faulting Object Type: $($fault.KeyFaultingObjectType)" + $eventLogMessage += "`r`n" + $eventLogMessage += "Faulting Object Location: $($fault.FaultingObjectLocation)" + $eventLogMessage += "`r`n" + $eventLogMessage += "Fault Action Remediation: $($fault.FaultActionRemediation)" + $eventLogMessage += "`r`n" + $eventLogMessage += "Fault Operation: $($operation)" + $eventLogJson = (ConvertTo-Json -InputObject $fault -Depth 5) + + $eventInstance = [System.Diagnostics.EventInstance]::new(1, 1) + $evtObject = New-Object System.Diagnostics.EventLog; + $evtObject.Log = $LOG_NAME + $evtObject.Source = $LOG_SOURCE + + Write-Verbose "Source : $($LOG_SOURCE) Log : $($LOG_NAME) Message : $($eventLogMessage)" + $evtObject.WriteEvent($eventInstance, @($eventLogMessage, $eventLogJson, $operation)) +} - $array += $object - } - } +function CreateorUpdateFault { + param( + [SdnFaultInfo] $Fault + ) - $sdnHealthObject.Properties = $array - return $sdnHealthObject - } - catch { - $_ | Trace-Exception - $_ | Write-Error - } + if (-NOT $script:SdnDiagnostics_Health.Config.HealthFaultEnabled) { + return + } + + 
ValidateFault -Fault $Fault + InitFaults + + Write-Verbose "CreateorUpdateFault:" + + LogHealthFault -healthFault $Fault + LogHealthFaultToEventLog -fault $Fault -operation Create + + if ([string]::IsNullOrEmpty($script:subsystemId)) { + $script:subsystemId = (get-storagesubsystem Cluster*).UniqueId + $script:entityTypeSubSystem = "Microsoft.Health.EntityType.Subsystem" + } + $retValue = [Microsoft.NetworkHud.FunctionalTests.Module.HciHealthUtils]::HciModifyFault( ` + $Fault.KeyFaultingObjectDescription, # $entityType, ` + $Fault.KeyFaultingObjectID, # $entityId, ` + $Fault.KeyFaultingObjectDescription, # "E Desc", ` + $Fault.FaultingObjectLocation, # $entityLocation, ` + $Fault.KeyFaultingObjectID, # $entityId, ` + $HCI_MODIFY_FAULT_ACTION_MODIFY, #action ` + $Fault.KeyFaultingObjectType, # $faultType, ` + $HEALTH_URGENCY_UNHEALTHY, # ` + "Fault Title", ` + $Fault.FaultDescription, # fault description + $Fault.FaultActionRemediation, # fault remediation action + $HCI_MODIFY_FAULT_FLAG_NONE) | Out-Null + + $retValue = [Microsoft.NetworkHud.FunctionalTests.Module.HciHealthUtils]::HciModifyRelationship( + $Fault.KeyFaultingObjectDescription, # $entityType, ` + $Fault.KeyFaultingObjectID, # $entityId, ` + $Fault.KeyFaultingObjectDescription, # $entityDescription + $Fault.FaultingObjectLocation, # $entityLocation, ` + $Fault.KeyFaultingObjectID, # $entityId, ` + $HCI_MODIFY_RELATIONSHIP_ACTION_MODIFY, ` + $script:entityTypeSubSystem, ` + $script:subsystemId, ` + $null, ` + $null, ` + $script:subsystemId, ` + "TestGroupKey", ` + $HEALTH_URGENCY_UNHEALTHY, ` + $HEALTH_RELATIONSHIP_COLLECTION, ` + $HCI_MODIFY_RELATIONSHIP_FLAG_NONE) | Out-Null } -function Test-MuxBgpConnectionState { - [CmdletBinding(DefaultParameterSetName = 'RestCredential')] - param ( - [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, +function DeleteFaultBy { + <# + .SYNOPSIS + Deletes a fault by its key properties, those with empty or a * will be ignored while comaprison 
for a broader clear operation - [Parameter(Mandatory = $false)] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $Credential = [System.Management.Automation.PSCredential]::Empty, + .PARAMETER KeyFaultingObjectDescription + The description of the faulting object - [Parameter(Mandatory = $false, ParameterSetName = 'RestCredential')] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $NcRestCredential = [System.Management.Automation.PSCredential]::Empty, + .PARAMETER KeyFaultingObjectID + The unique ID of the faulting object - [Parameter(Mandatory = $true, ParameterSetName = 'RestCertificate')] - [X509Certificate]$NcRestCertificate + .PARAMETER KeyFaultingObjectType + The type of the faulting object + + .PARAMETER FaultingObjectLocation + The location of the faulting object + #> + param( + [string] $KeyFaultingObjectDescription, + [string] $KeyFaultingObjectID, + [string] $KeyFaultingObjectType, + [string] $FaultingObjectLocation, + [switch] $Verbose ) - $ncRestParams = @{ - NcUri = $SdnEnvironmentObject.NcUrl - } - switch ($PSCmdlet.ParameterSetName) { - 'RestCertificate' { - $ncRestParams.Add('NcRestCertificate', $NcRestCertificate) - } - 'RestCredential' { - $ncRestParams.Add('NcRestCredential', $NcRestCredential) - } + if (-NOT $script:SdnDiagnostics_Health.Config.HealthFaultEnabled) { + return } - $sdnHealthObject = [SdnHealth]::new() - $array = @() + Write-Verbose "DeleteFault: " + Write-Verbose "(KeyFaultingObjectDescription) $($KeyFaultingObjectDescription)" + Write-Verbose "(KeyFaultingObjectID) $($KeyFaultingObjectID)" + Write-Verbose "(KeyFaultingObjectType) $($KeyFaultingObjectType)" + Write-Verbose "(FaultingObjectLocation) $($FaultingObjectLocation)" - $netConnectionExistsScriptBlock = { - param([Parameter(Position = 0)][String]$arg0) - $tcpConnection = Get-NetTCPConnection -RemotePort 179 -RemoteAddress $arg0 -ErrorAction SilentlyContinue | Where-Object { $_.State 
-eq "Established" } - if ($tcpConnection) { - return $true - } - } + InitFaults - try { - "Validating the BGP connectivity between LoadBalancerMuxes and Top of Rack (ToR) Switches." | Trace-Output - $loadBalancerMux = Get-SdnLoadBalancerMux @ncRestParams + # get all the system faults + $faults = Get-HealthFault + [bool] $match = $true + [string[]] $matchFaultsId = @() + foreach ($fault in $faults) { + # delete the one(s) that match the filter + # KeyFaultingObjectDescription, KeyFaultingObjectID, KeyFaultingObjectType may be empty , in which case + # we will not consider them for comparison + $match = [string]::IsNullOrEmpty($KeyFaultingObjectDescription) -or $KeyFaultingObjectDescription -eq "*" -or ` + $KeyFaultingObjectDescription -eq $fault.FaultingObjectDescription; - # if no load balancer muxes configured within the environment, return back the health object to caller - if ($null -ieq $loadBalancerMux) { - return $sdnHealthObject - } + Write-Verbose "KeyFaultingObjectDescription $match" - # enumerate through the load balancer muxes in the environment and validate the BGP connection state - foreach ($mux in $loadBalancerMux) { - $virtualServer = Get-SdnResource @ncRestParams -ResourceRef $mux.properties.virtualServer.resourceRef - [string]$virtualServerConnection = $virtualServer.properties.connections[0].managementAddresses - $peerRouters = $mux.properties.routerConfiguration.peerRouterConfigurations.routerIPAddress - foreach ($router in $peerRouters) { - $connectionExists = Invoke-PSRemoteCommand -ComputerName $virtualServerConnection -Credential $Credential -ScriptBlock $netConnectionExistsScriptBlock -ArgumentList $router - if (-NOT $connectionExists) { - "{0} is not connected to {1}" -f $virtualServerConnection, $router | Trace-Output -Level:Error - $sdnHealthObject.Result = 'FAIL' - $sdnHealthObject.Remediation += "Fix BGP Peering between $($virtualServerConnection) and $($router)." 
- - # create a custom object to store the load balancer mux and the router that it is not connected to - # this will be added to the array - $object = [PSCustomObject]@{ - LoadBalancerMux = $virtualServerConnection - TopOfRackSwitch = $router - } + $match = $match -and ([string]::IsNullOrEmpty($KeyFaultingObjectID) -or $KeyFaultingObjectID -eq "*" -or ` + $KeyFaultingObjectID -eq $fault.FaultingObjectUniqueId) + Write-Verbose "KeyFaultingObjectID $match" - $array += $object - } - else { - "{0} is connected to {1}" -f $virtualServerConnection, $router | Trace-Output -Level:Verbose - } - } - } + $match = $match -and ([string]::IsNullOrEmpty($KeyFaultingObjectType) -or $KeyFaultingObjectType -eq "*" -or ` + $KeyFaultingObjectType -eq $fault.FaultingObjectType) + Write-Verbose "KeyFaultingObjectType $match" - # if the array is not empty, add it to the health object - if ($array) { - $sdnHealthObject.Properties = $array + if ($match) { + Write-Verbose "Deleting fault (ID) $($fault.FaultId)" + $matchFaultsId += $fault.FaultId } - - return $sdnHealthObject } - catch { - $_ | Trace-Exception - $_ | Write-Error + if ($matchFaultsId.Count -eq 0) { + Write-Verbose "No faults found to delete" + return + } + else { + Write-Verbose "Found $($matchFaultsId.Count) faults to delete" + } + + foreach ($faultId in $matchFaultsId) { + DeleteFaultById -faultUniqueID $faultId } } -function Test-NcHostAgentConnectionToApiService { +function DeleteFaultById { <# - .SYNOPSIS - Validates the TCP connection between Server and primary replica of Api service within Network Controller. 
- #> - - [CmdletBinding(DefaultParameterSetName = 'RestCredential')] - param ( - [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, - - [Parameter(Mandatory = $false)] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $Credential = [System.Management.Automation.PSCredential]::Empty, - - [Parameter(Mandatory = $false, ParameterSetName = 'RestCredential')] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $NcRestCredential = [System.Management.Automation.PSCredential]::Empty, + .SYNOPSIS + Deletes a fault by its unique ID - [Parameter(Mandatory = $true, ParameterSetName = 'RestCertificate')] - [X509Certificate]$NcRestCertificate + .PARAMETER faultUniqueID + The unique ID of the fault to delete + #> + param( + [string] $faultUniqueID ) - $ncRestParams = @{ - NcUri = $SdnEnvironmentObject.NcUrl + if (-NOT $script:SdnDiagnostics_Health.Config.HealthFaultEnabled) { + return } - switch ($PSCmdlet.ParameterSetName) { - 'RestCertificate' { - $ncRestParams.Add('NcRestCertificate', $NcRestCertificate) - } - 'RestCredential' { - $ncRestParams.Add('NcRestCredential', $NcRestCredential) - } + + if ([string]::IsNullOrEmpty($faultUniqueID)) { + throw "Empty faultID" } - $sdnHealthObject = [SdnHealth]::new() - $array = @() + InitFaults + Write-Verbose "DeleteFaultById $faultId" + $fault = Get-HealthFault | ? 
{ $_.FaultId -eq $faultUniqueID } - $netConnectionExistsScriptBlock = { - $tcpConnection = Get-NetTCPConnection -RemotePort 6640 -ErrorAction SilentlyContinue | Where-Object { $_.State -eq "Established" } - if ($tcpConnection) { - return $true - } + if ($null -eq $fault) { + throw "Fault with ID $faultUniqueID not found" } + else { + LogWmiHealthFault -fault $fault + } + + [Microsoft.NetworkHud.FunctionalTests.Module.HciHealthUtils]::HciModifyFault( ` + $fault.FaultingObjectType, ` + $fault.FaultingObjectUniqueId, ` + "", ` + $fault.FaultingObjectUniqueId, ` + $fault.FaultingObjectUniqueId, ` + $HCI_MODIFY_FAULT_ACTION_REMOVE, ` + $fault.FaultType, ` + $HEALTH_URGENCY_UNHEALTHY, ` + "", ` + "", ` + "", ` + $HCI_MODIFY_FAULT_FLAG_NONE) | Out-Null +} - try { - "Validating connectivity between Server and primary replica of API service within Network Controller" | Trace-Output - $servers = Get-SdnServer @ncRestParams - - # if no load balancer muxes configured within the environment, return back the health object to caller - if ($null -ieq $servers) { - return $sdnHealthObject - } +function ShowFaultSet { + <# + .SYNOPSIS + Shows the fault set - # get the current primary replica of Network Controller - # if we cannot return the primary replica, then something is critically wrong with Network Controller - # in which case we should mark this test as failed and return back to the caller with guidance to fix the SlbManagerService - $primaryReplicaNode = Get-SdnServiceFabricReplica -NetworkController $SdnEnvironmentObject.EnvironmentInfo.NetworkController[0] -ServiceTypeName 'ApiService' -Credential $Credential -Primary - if ($null -ieq $primaryReplicaNode) { - "Unable to return primary replica of ApiService" | Trace-Output -Level:Error - $sdnHealthObject.Result = 'FAIL' - $sdnHealthObject.Remediation = "Fix the primary replica of ApiService within Network Controller." 
- return $sdnHealthObject - } + .PARAMETER faultset + The fault set to show + #> - # enumerate through the servers in the environment and validate the TCP connection state - # we expect the NCHostAgent to have an active connection to ApiService within Network Controller via port 6640, which informs - # Network Controller that the host is operational and ready to receive policy configuration updates - foreach ($server in $servers) { - [System.Array]$connectionAddress = Get-SdnServer @ncRestParams -ResourceId $server.resourceId -ManagementAddressOnly - $connectionExists = Invoke-PSRemoteCommand -ComputerName $connectionAddress[0] -Credential $Credential -ScriptBlock $netConnectionExistsScriptBlock - if (-NOT $connectionExists) { - "{0} is not connected to ApiService of Network Controller" -f $server.resourceRef | Trace-Output -Level:Error - $sdnHealthObject.Result = 'FAIL' - $sdnHealthObject.Remediation += "Ensure NCHostAgent service is started. Investigate and fix TCP connectivity or x509 authentication between $($primaryReplicaNode.ReplicaAddress) and $($server.resourceRef)." 
- - $object = [PSCustomObject]@{ - Server = $server.resourceRef - ApiPrimaryReplica = $primaryReplicaNode.ReplicaAddress - } + param([object[]]$faultset) - $array += $object - } - else { - "{0} is connected to {1}" -f $server.resourceRef, $primaryReplicaNode.ReplicaAddress | Trace-Output -Level:Verbose - } + Write-Verbose "Success Faults (for rest res):" + if ($PSCmdlet.MyInvocation.BoundParameters["Verbose"].IsPresent) { + if ($null -eq $faultset[0] -or $faultset[0].Count -eq 0) { + Write-Verbose "(none)" + return + } + foreach ($faultInst in $faultset[0]) { + LogHealthFault -healthFault $faultInst } - - $sdnHealthObject.Properties = $array - return $sdnHealthObject } - catch { - $_ | Trace-Exception - $_ | Write-Error + + Write-Verbose "Failure Faults (for rest res):" + if ($PSCmdlet.MyInvocation.BoundParameters["Verbose"].IsPresent) { + if ($null -eq $faultset[1] -or $faultset[1].Count -eq 0) { + Write-Verbose "(none)" + return + } + foreach ($faultInst in $faultset[1]) { + LogHealthFault -healthFault $faultInst + } } } -function Test-NcUrlNameResolution { +function UpdateFaultSet { - [CmdletBinding()] - param ( - [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, + <# + .SYNOPSIS + Updates the fault set and returns the health test object - [Parameter(Mandatory = $false)] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $Credential = [System.Management.Automation.PSCredential]::Empty, + .PARAMETER successFaults + The set of faults that were successful - [Parameter(Mandatory = $false)] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $NcRestCredential = [System.Management.Automation.PSCredential]::Empty + .PARAMETER failureFaults + The set of faults that failed + #> + + param( + [object[]]$successFaults, + [object[]]$failureFaults ) - $sdnHealthObject = [SdnHealth]::new() + $healthTest = New-SdnHealthTest - try { - "Validate that the Network 
Controller NB API URL resolves to the correct IP address" | Trace-Output + if ($null -ne $failureFaults -and $failureFaults.Count -gt 0) { + $healthTest.Result = "FAIL" + } - $ncApiReplicaPrimary = Get-SdnServiceFabricReplica -NetworkController $SdnEnvironmentObject.ComputerName[0] -Credential $Credential -ServiceTypeName 'ApiService' -Primary - if ($null -eq $ncApiReplicaPrimary) { - "Unable to find the primary replica for the ApiService" | Trace-Output -Level:Warning - return $sdnHealthObject - } + foreach ($fault in $successFaults) { + DeleteFaultBy -KeyFaultingObjectDescription $fault.KeyFaultingObjectDescription + $convFault = ConvertFaultToPsObject -healthFault $fault -faultType "Delete" + $healthTest.HealthFault += $convFault + } - $networkController = Get-SdnNetworkController -NetworkController $SdnEnvironmentObject.ComputerName[0] -Credential $Credential - if ($null -eq $networkController) { - "Unable to retrieve results from Get-SdnNetworkController" | Trace-Output -Level:Warning - return $sdnHealthObject - } + foreach ($fault in $failureFaults) { + CreateOrUpdateFault -Fault $fault + $convFault = ConvertFaultToPsObject -healthFault $fault -faultType "Create" + $healthTest.HealthFault += $convFault + } - # depending on the configuration returned, will determine if we need to use the RestIPAddress or RestName - $nbApiName = $networkController.ServerCertificate.Subject.Split('=')[1].Trim() + $healthTest +} - if ($networkController.RestIPAddress) { - $expectedIPAddress = $($networkController.RestIPAddress).Split('/')[0].Trim() # we expect to be in IP/CIDR format - "Network Controller is configured with static RestIPAddress: {0}" -f $expectedIPAddress | Trace-Output -Level:Verbose - } - else { - "Network Controller is configured with RestName" | Trace-Output -Level:Verbose - $ncNodeName = $ncApiReplicaPrimary.ReplicaAddress.Split(':')[0].Trim() - $isIpAddress = [System.Net.IPAddress]::TryParse($ncNodeName, [ref]$null) - if ($isIpAddress) { - 
$expectedIPAddress = $ncNodeName.ToString() - } - else { - $dnsResultNetworkControllerNode = Resolve-DnsName -Name $ncNodeName -NoHostsFile -ErrorAction SilentlyContinue - if ($null -ieq $dnsResultNetworkControllerNode) { - "Unable to resolve IP address for {0}" -f $ncNodeName | Trace-Output -Level:Warning - return $sdnHealthObject - } - else { - $expectedIPAddress = $dnsResultNetworkControllerNode.IPAddress - "ApiService replica primary is hosted on {0} with an IP address of {1}" -f $ncApiReplicaPrimary.ReplicaAddress, $expectedIPAddress | Trace-Output -Level:Verbose - } - } - } +function DeleteFault { + <# + .SYNOPSIS + Deletes a fault - # in this scenario, the certificate is using an IP address as the subject, so we will need to compare the IP address to the expected IP address - # if they match, we will return a success - $isIpAddress = [System.Net.IPAddress]::TryParse($nbApiName, [ref]$null) - if ($isIpAddress -and ($nbApiName -ieq $expectedIPAddress)) { - return $sdnHealthObject - } + .PARAMETER Fault + The fault to delete + #> + [CmdletBinding()] + param( + [SdnFaultInfo] $Fault + ) - # perform some DNS resolution to ensure that the NB API URL resolves to the correct IP address - $dnsResult = Resolve-DnsName -Name $nbApiName -NoHostsFile -ErrorAction SilentlyContinue - if ($null -ieq $dnsResult) { - $sdnHealthObject.Result = 'FAIL' + if (-NOT$script:SdnDiagnostics_Health.Config.HealthFaultEnabled) { + return + } - "Unable to resolve DNS name for {0}" -f $nbApiName | Trace-Output -Level:Warning - return $sdnHealthObject - } - elseif ($dnsResult[0].IPAddress -ine $expectedIPAddress) { - $sdnHealthObject.Result = 'FAIL' - $sdnHealthObject.Remediation = 'Ensure that the DNS name for the Network Controller NB API URL resolves to the correct IP address.' 
+ ValidateFault -Fault $Fault + InitFaults - "DNS name for {0} resolves to {1} instead of {2}" -f $nbApiName, $dnsResult[0].IPAddress, $expectedIPAddress | Trace-Output -Level:Warning - return $sdnHealthObject - } + Write-Verbose "DeleteFault $($Fault.KeyFaultingObjectDescription) $($Fault.KeyFaultingObjectID) $($Fault.KeyFaultingObjectType)" - return $sdnHealthObject - } - catch { - $_ | Trace-Exception - $_ | Write-Error + if ([string]::IsNullOrEmpty($script:subsystemId)) { + $script:subsystemId = (get-storagesubsystem Cluster*).UniqueId + $script:entityTypeSubSystem = "Microsoft.Health.EntityType.Subsystem" } + [Microsoft.NetworkHud.FunctionalTests.Module.HciHealthUtils]::HciModifyFault( ` + $Fault.KeyFaultingObjectDescription, # $entityType, ` + $Fault.KeyFaultingObjectID, # $entityId, ` + $Fault.KeyFaultingObjectDescription, # "E Desc", ` + $Fault.FaultingObjectLocation, # $entityLocation, ` + $Fault.KeyFaultingObjectID, # $entityId, ` + $HCI_MODIFY_FAULT_ACTION_REMOVE, #action ` + $Fault.KeyFaultingObjectType, # $faultType, ` + $HEALTH_URGENCY_UNHEALTHY, # ` + "Fault Title", ` + $Fault.FaultDescription, # fault description + $Fault.FaultActionRemediation, # fault remediation action + $HCI_MODIFY_FAULT_FLAG_NONE) | Out-Null } -function Test-NetworkControllerCertCredential { +function Start-HealthFaultsTranscript { <# - .SYNOPSIS - Query the NC Cert credential used to connect to SDN Servers, ensure cert exist. 
+ .SYNOPSIS + Initializes the health runner transcript #> - [CmdletBinding(DefaultParameterSetName = 'RestCredential')] - param ( - [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, + $logLocation = GetLogLocation - [Parameter(Mandatory = $false)] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $Credential = [System.Management.Automation.PSCredential]::Empty, - - [Parameter(Mandatory = $false, ParameterSetName = 'RestCredential')] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $NcRestCredential = [System.Management.Automation.PSCredential]::Empty, - - [Parameter(Mandatory = $true, ParameterSetName = 'RestCertificate')] - [X509Certificate]$NcRestCertificate - ) - - $ncRestParams = @{ - NcUri = $SdnEnvironmentObject.NcUrl + if ($null -eq $logLocation) { + return $false } - switch ($PSCmdlet.ParameterSetName) { - 'RestCertificate' { - $ncRestParams.Add('NcRestCertificate', $NcRestCertificate) - } - 'RestCredential' { - $ncRestParams.Add('NcRestCredential', $NcRestCredential) - } + else { + $fullLogPath = Join-Path -Path $logLocation -ChildPath "SdnHealthTranscript.log" + Start-Transcript -Path $fullLogPath -Append -ErrorAction SilentlyContinue + $script:TranscriptStarted = $true + return $true } +} - $sdnHealthObject = [SdnHealth]::new() - $arrayList = [System.Collections.ArrayList]::new() - - try { - "Validate cert credential resource of SDN Servers. 
Ensure certificate exists on each of the Network Controller " | Trace-Output - - # enumerate each server's conection->credential object into the array - $servers = Get-SdnServer @ncRestParams - $serverCredentialRefs = [System.Collections.Hashtable]::new() - foreach ($server in $servers) { - # find the first connection with credential type of X509Certificate - $serverConnection = $server.properties.connections | Where-Object {$_.credentialType -ieq "X509Certificate" -or $_.credentialType -ieq "X509CertificateSubjectName"} | Select-Object -First 1; - if ($null -ne $serverConnection) { - $credRef = $serverConnection.credential[0].resourceRef - "Adding credential {0} for server {1} for validation" -f $credRef, $serverConnection.managementAddresses[0] | Trace-Output -Level:Verbose - if ($null -ne $credRef) { - if (-NOT $serverCredentialRefs.ContainsKey($credRef)) { - $serverList = [System.Collections.ArrayList]::new() - $serverCredentialRefs.Add($credRef, $serverList) - } - - [void]$serverCredentialRefs[$credRef].Add($server) - } - } - } - - # iterate the credential object to validate certificate on each NC - foreach ($credRef in $serverCredentialRefs.Keys) { - $credObj = Get-SdnResource @ncRestParams -ResourceRef $credRef - if ($null -ne $credObj) { - $thumbPrint = $credObj.properties.value - $scriptBlock = { - param([Parameter(Position = 0)][String]$param1) - - if (-NOT (Test-Path -Path Cert:\LocalMachine\My\$param1)) { - return $false - } - else { - return $true - } - } - - # invoke command on each NC seperately so to record which NC missing certificate - foreach ($nc in $SdnEnvironmentObject.ComputerName) { - "Validating certificate [{0}] on NC {1}" -f $thumbPrint, $nc | Trace-Output -Level:Verbose - $result = Invoke-PSRemoteCommand -ComputerName $nc -Credential $Credential -ScriptBlock $scriptBlock -ArgumentList $thumbPrint - if ($result -ne $true) { - # if any NC missing certificate, it indicate issue detected - $sdnHealthObject.Result = 'FAIL' - 
$sdnHealthObject.Remediation += "Install certificate [$thumbPrint] on Network Controller [$nc]" - $object = [PSCustomObject]@{ - NetworkController = $nc - CertificateMissing = $thumbPrint - AffectedServers = $serverCredentialRefs[$credRef] - } - - [void]$arrayList.Add($object) - } - } - - } - } +function StopHealthRunnerTranscript { + <# + .SYNOPSIS + Stops the health runner transcript + #> - $sdnHealthObject.Properties = $arrayList - return $sdnHealthObject - } - catch { - $_ | Trace-Exception - $_ | Write-Error + if ($script:TranscriptStarted) { + Write-Host "Stopping transcript" + Stop-Transcript -ErrorAction SilentlyContinue + $script:TranscriptStarted = $false } } -function Test-NetworkInterfaceAPIDuplicateMacAddress { +function InitFaults { <# - .SYNOPSIS - Validate there are no adapters within the Network Controller Network Interfaces API that are duplicate. + .SYNOPSIS + Initializes defaults and constants for fault handling #> - [CmdletBinding(DefaultParameterSetName = 'RestCredential')] - param ( - [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, - - [Parameter(Mandatory = $false, ParameterSetName = 'RestCredential')] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $NcRestCredential = [System.Management.Automation.PSCredential]::Empty, - - [Parameter(Mandatory = $true, ParameterSetName = 'RestCertificate')] - [X509Certificate]$NcRestCertificate - ) + [CmdletBinding()] + param() - $ncRestParams = @{ - NcUri = $SdnEnvironmentObject.NcUrl - } - switch ($PSCmdlet.ParameterSetName) { - 'RestCertificate' { - $ncRestParams.Add('NcRestCertificate', $NcRestCertificate) - } - 'RestCredential' { - $ncRestParams.Add('NcRestCredential', $NcRestCredential) - } + Write-Verbose "InitFaults" + if (-not ("Microsoft.NetworkHud.FunctionalTests.Module.HciHealthUtils" -as [type])) { + Add-Type -MemberDefinition $signature -Name "HciHealthUtils" -Namespace "Microsoft.NetworkHud.FunctionalTests.Module" | 
Out-Null + Write-Verbose "Registered HCI fault utilities" } - $sdnHealthObject = [SdnHealth]::new() - $array = @() + New-Variable -Name 'HCI_MODIFY_FAULT_ACTION_MODIFY' -Scope 'Script' -Force -Value 0 + New-Variable -Name 'HCI_MODIFY_FAULT_ACTION_REMOVE' -Scope 'Script' -Force -Value 1 - try { - "Validate no duplicate MAC addresses for network interfaces in Network Controller" | Trace-Output + New-Variable -Name 'HCI_MODIFY_RELATIONSHIP_ACTION_MODIFY' -Scope 'Script' -Force -Value 0 + New-Variable -Name 'HCI_MODIFY_RELATIONSHIP_ACTION_REMOVE' -Scope 'Script' -Force -Value 1 - $networkInterfaces = Get-SdnResource @ncRestParams -Resource:NetworkInterfaces - if($null -eq $networkInterfaces){ - # if there are no network interfaces, then there is nothing to validate - # pass back the health object to the caller - return $sdnHealthObject - } + New-Variable -Name 'HEALTH_RELATIONSHIP_UNKNOWN' -Scope 'Script' -Force -Value 0 + New-Variable -Name 'HEALTH_RELATIONSHIP_COMPOSITION' -Scope 'Script' -Force -Value 1 + New-Variable -Name 'HEALTH_RELATIONSHIP_CONTAINMENT' -Scope 'Script' -Force -Value 2 + New-Variable -Name 'HEALTH_RELATIONSHIP_COLLECTION' -Scope 'Script' -Force -Value 3 - $duplicateObjects = $networkInterfaces.properties | Group-Object -Property privateMacAddress | Where-Object {$_.Count -ge 2} - if($duplicateObjects){ - $sdnHealthObject.Result = 'FAIL' + New-Variable -Name 'HEALTH_URGENCY_UNKNOWN' -Scope 'Script' -Force -Value 255 + New-Variable -Name 'HEALTH_URGENCY_HEALTHY' -Scope 'Script' -Force -Value 0 + New-Variable -Name 'HEALTH_URGENCY_WARNING' -Scope 'Script' -Force -Value 1 + New-Variable -Name 'HEALTH_URGENCY_UNHEALTHY' -Scope 'Script' -Force -Value 2 - # since there can be multiple grouped objects, we need to enumerate each duplicate group - foreach($obj in $duplicateObjects){ - $sdnHealthObject.Remediation += "Remove the duplicate MAC addresses for $($obj.Name) within Network Controller Network Interfaces" + New-Variable -Name 
'HCI_MODIFY_FAULT_FLAG_NONE' -Scope 'Script' -Force -Value 0 + New-Variable -Name 'HCI_MODIFY_RELATIONSHIP_FLAG_NONE' -Scope 'Script' -Force -Value 0 - $duplicateInterfaces = $networkInterfaces | Where-Object {$_.properties.privateMacAddress -eq $obj.Name} - $array += $duplicateInterfaces + New-Variable -Name 'LOG_NAME' -Scope 'Script' -Force -Value 'SdnHealthService' + New-Variable -Name 'LOG_CHANNEL' -Scope 'Script' -Force -Value 'Admin' + New-Variable -Name 'LOG_SOURCE' -Scope 'Script' -Force -Value 'HealthService' - "Located {0} virtual machines associated with MAC address {1}:`r`n`n{2}`r`n" -f $obj.Count, $obj.Name, ` - ($duplicateInterfaces ` - | Select-Object @{n="ResourceRef";e={"`t$($_.resourceRef)"}} ` - | Select-Object -ExpandProperty ResourceRef ` - | Out-String ` - ) | Trace-Output -Level:Error - } + [bool] $eventLogFound = $false + try { + $evtLog = Get-EventLog -LogName $script:LOG_NAME -Source $script:LOG_SOURCE -ErrorAction SilentlyContinue + if ($null -ne $evtLog) { + $eventLogFound = $true } + } + catch { + #get-eventlog throws even on erroraction silentlycontinue + } - $sdnHealthObject.Properties = $array - return $sdnHealthObject + try { + if ($eventLogFound -eq $false) { + New-EventLog -LogName $script:LOG_NAME -Source $script:LOG_SOURCE -ErrorAction SilentlyContinue + } } catch { - $_ | Trace-Exception - $_ | Write-Error + #failure to create event log is non-fatal } } -function Test-ProviderNetwork { +function IsSdnFcClusterServiceRole { + <# - .SYNOPSIS - Performs ICMP tests across the computers defined to confirm that jumbo packets are able to successfully traverse between the provider addresses on each host - #> + .SYNOPSIS + Checks if the provided service role is an SDN cluster service role - [CmdletBinding()] - param ( - [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, + .PARAMETER ServiceName + The name of the service role to check + #> - [Parameter(Mandatory = $false)] - 
[System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $Credential = [System.Management.Automation.PSCredential]::Empty + param([string] $ServiceName) + + # Define the list of valid service roles + $validServiceRoles = @( + "ApiService", + "ControllerService", + "FirewallService", + "FnmService", + "GatewayManager", + "ServiceInsertion", + "VSwitchService" ) - $sdnHealthObject = [SdnHealth]::new() - $array = @() + # Check if the provided service role name is in the list + return $validServiceRoles -contains $ServiceName +} +function IsSdnService { - try { - "Validating Provider Address network has connectivity across the SDN dataplane" | Trace-Output + <# + .SYNOPSIS + Checks if the provided service name is an SDN agent service - $providerAddresses = (Get-SdnProviderAddress -ComputerName $SdnEnvironmentObject.ComputerName -Credential $Credential).ProviderAddress - if ($null -eq $providerAddresses){ - "No provider addresses were found on the hosts." | Trace-Output - } - else { - $connectivityResults = Invoke-PSRemoteCommand -ComputerName $SdnEnvironmentObject.ComputerName -Credential $Credential -Scriptblock { - param([Parameter(Position = 0)][String[]]$param1) - Test-SdnProviderAddressConnectivity -ProviderAddress $param1 - } -ArgumentList $providerAddresses - - foreach($computer in $connectivityResults | Group-Object PSComputerName){ - foreach($destinationAddress in $computer.Group){ - $jumboPacketResult = $destinationAddress | Where-Object {$_.BufferSize -gt 1472} - $standardPacketResult = $destinationAddress | Where-Object {$_.BufferSize -le 1472} - - if($destinationAddress.Status -ine 'Success'){ - $sdnHealthObject.Result = 'FAIL' - - # if both jumbo and standard icmp tests fails, indicates a failure in the physical network - if($jumboPacketResult.Status -ieq 'Failure' -and $standardPacketResult.Status -ieq 'Failure'){ - $remediationMsg = "Ensure ICMP enabled on {0} and {1}. If issue persists, investigate physical network." 
-f $destinationAddress[0].DestinationAddress, $destinationAddress[0].SourceAddress - $sdnHealthObject.Remediation += $remediationMsg - - "Cannot ping {0} from {1} ({2})." ` - -f $destinationAddress[0].DestinationAddress, $computer.Name, $destinationAddress[0].SourceAddress | Trace-Output -Level:Error - } + .PARAMETER serviceName + The name of the service to check + #> - # if standard MTU was success but jumbo MTU was failure, indication that jumbo packets or encap overhead has not been setup and configured - # either on the physical nic or within the physical switches between the provider addresses - if($jumboPacketResult.Status -ieq 'Failure' -and $standardPacketResult.Status -ieq 'Success'){ - $remediationMsg += "Ensure the physical network between {0} and {1} configured to support VXLAN or NVGRE encapsulated packets with minimum MTU of 1660." ` - -f $destinationAddress[0].DestinationAddress, $destinationAddress[0].SourceAddress - $sdnHealthObject.Remediation += $remediationMsg + param([string] $serviceName) - "Cannot send jumbo packets to {0} from {1} ({2})." ` - -f $destinationAddress[0].DestinationAddress, $computer.Name, $destinationAddress[0].SourceAddress | Trace-Output -Level:Error - } - } - else { - "Successfully sent jumbo packet to {0} from {1} ({2})" ` - -f $destinationAddress[0].DestinationAddress, $computer.Name, $destinationAddress[0].SourceAddress | Trace-Output - } + return $serviceName -in @( "NCHostAgent", "SlbHostAgent") +} - $array += $destinationAddress - } - } - } +function IsCurrentNodeClusterOwner { + <# + .SYNOPSIS + Checks if the current node is the owner of the cluster - $sdnHealthObject.Properties = $array - return $sdnHealthObject - } - catch { - $_ | Trace-Exception - $_ | Write-Error + .NOTES + This function is used to determine if the current node is the owner of the cluster. This is used to determine if the current node is the primary node in a cluster. 
+ #> + + $activeNode = Get-ClusterResource -ErrorAction Ignore | Where-Object { $_.OwnerGroup -eq "Cluster Group" -and $_.ResourceType -eq "IP Address" -and $_.Name -eq "Cluster IP Address" } + + if ( $null -eq $activeNode ) { + Write-Verbose "Active $($activeNode.OwnerNode)" + + # todo : generate a fault on failing to generate a fault (or switch to different algorithm for picking the primary node) + return $false } + + return ($activeNode.OwnerNode -eq $env:COMPUTERNAME) } -function Test-ResourceConfigurationState { +function GetFaultFromConfigurationState { <# - .SYNOPSIS - Validate that the configurationState of the resources. - #> - - [CmdletBinding(DefaultParameterSetName = 'RestCredential')] - param ( - [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, + .SYNOPSIS + Generates a fault from the configuration state - [Parameter(Mandatory = $false, ParameterSetName = 'RestCredential')] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $NcRestCredential = [System.Management.Automation.PSCredential]::Empty, + .PARAMETER resources + The resources to generate the fault from + #> - [Parameter(Mandatory = $true, ParameterSetName = 'RestCertificate')] - [X509Certificate]$NcRestCertificate + param( + [object[]] $resources ) - $ncRestParams = @{ - NcUri = $SdnEnvironmentObject.NcUrl - } - switch ($PSCmdlet.ParameterSetName) { - 'RestCertificate' { - $ncRestParams.Add('NcRestCertificate', $NcRestCertificate) - } - 'RestCredential' { - $ncRestParams.Add('NcRestCredential', $NcRestCredential) - } - } + $healthFaults = @() + # successful faults are just a stub holder for the resource + # these are not created, but used for clearing out any older unhealthy states + # these have KeyFaultingObjectType set to string.empty + $successFaults = @() - $sdnHealthObject = [SdnHealth]::new() - $array = @() + foreach ($resource in $resources) { - try { - "Validating configuration state of {0}" -f 
$SdnEnvironmentObject.Role.ResourceName | Trace-Output + ########################################################################################## + ## ServiceState Fault Template (ServerResource) + ########################################################################################## + # $KeyFaultingObjectDescription (SDN ID) : [ResourceRef] + # $KeyFaultingObjectID (ARC ID) : [ResourceMetadataID (if available) else ResourceRef] + # $KeyFaultingObjectType (CODE) : "ConfgiStateCode" (if 2 more errors are found with same other properties will be concat) + # $FaultingObjectLocation (SOURCE) : "Source (if keys of 2 errors collide they will be concatanated)" + # $FaultDescription (MESSAGE) : "ConfigStateMessage (2 or more if errors collide)." + # $FaultActionRemediation (ACTION) : "See for more information on how to resolve this issue." + # * Config state faults issued only from the primary Node + ########################################################################################## - $sdnResources = Get-SdnResource @ncRestParams -Resource $SdnEnvironmentObject.Role.ResourceName - foreach ($object in $sdnResources) { - # if we have a resource that is not in a success state, we will skip validation - # as we do not expect configurationState to be accurate if provisioningState is not Success - if ($object.properties.provisioningState -ine 'Succeeded') { - continue - } + if ($null -ne $resource.Properties.ConfigurationState -and $null -ne $resource.Properties.ConfigurationState.DetailedInfo -and ` + $resource.Properties.ConfigurationState.DetailedInfo.Count -gt 0) { - # examine the configuration state of the resources and display errors to the screen - $errorMessages = @() - switch ($object.properties.configurationState.Status) { - 'Warning' { - # if we already have a failure, we will not change the result to warning - if ($sdnHealthObject.Result -ne 'FAIL') { - $sdnHealthObject.Result = 'WARNING' - } + foreach ($detailedInfo in 
$resource.Properties.ConfigurationState.DetailedInfo) { - $traceLevel = 'Warning' + # supression check for some of the known configuration states + if (IsConfigurationStateSkipped -Source $detailedInfo.Source -Message $detailedInfo.Message -Code $detailedInfo.Code) { + continue } - 'Failure' { - $sdnHealthObject.Result = 'FAIL' - $traceLevel = 'Error' - } + # handle success cases + if ($detailedInfo.Code -eq "Success") { - 'InProgress' { - # if we already have a failure, we will not change the result to warning - if ($sdnHealthObject.Result -ne 'FAIL') { - $sdnHealthObject.Result = 'WARNING' - } + $successFault = [SdnFaultInfo]::new() + $successFault.KeyFaultingObjectDescription = $resource.ResourceRef + $successFault.KeyFaultingObjectID = $resource.ResourceRef + $successFault.KeyFaultingObjectType = [string]::Empty + $successFault.FaultingObjectLocation = [string]::Empty + $successFault.FaultDescription = [string]::Empty + $successFaults += $successFault - $traceLevel = 'Warning' } + else { - 'Uninitialized' { - # in scenarios where state is redundant, we will not fail the test - if ($object.properties.state -ieq 'Redundant') { - # do nothing - } - else { - # if we already have a failure, we will not change the result to warning - if ($sdnHealthObject.Result -ne 'FAIL') { - $sdnHealthObject.Result = 'WARNING' - } + # find any existing overlapping fault + $existingFault = $healthFaults | Where-Object { $_.KeyFaultingObjectDescription -eq $resource.ResourceRef -and ` + $_.KeyFaultingObjectType -eq $detailedInfo.Code } + + if ($null -ne $existingFault) { + + $existingFault.FaultDescription += ("; " + $detailedInfo.Message) + $existingFault.FaultingObjectLocation += ("; " + $detailedInfo.Source) - $traceLevel = 'Warning' } - } + else { - default { - $traceLevel = 'Verbose' - } - } + $healthFault = [SdnFaultInfo]::new() + $healthFault.KeyFaultingObjectDescription = $resource.ResourceRef + $healthFault.KeyFaultingObjectType = $detailedInfo.Code + 
$healthFault.FaultingObjectLocation = $detailedInfo.Source + $healthFault.FaultDescription += $detailedInfo.Message - if ($object.properties.configurationState.detailedInfo) { - foreach ($detail in $object.properties.configurationState.detailedInfo) { - switch ($detail.code) { - 'Success' { - # do nothing + # add resource metadata if available + if ($null -ne $resource.Properties.ResourceMetadata) { + $healthFault.KeyFaultingObjectID = $resource.Properties.ResourceMetadata } - - default { - $errorMessages += $detail.message - try { - $errorDetails = Get-HealthData -Property 'ConfigurationStateErrorCodes' -Id $detail.code - $sdnHealthObject.Remediation += "[{0}] {1}" -f $object.resourceRef, $errorDetails.Action - } - catch { - "Unable to locate remediation actions for {0}" -f $detail.code | Trace-Output -Level:Warning - $remediationString = "[{0}] Examine the configurationState property to determine why configuration failed." -f $object.resourceRef - $sdnHealthObject.Remediation += $remediationString - } + else { + $healthFault.KeyFaultingObjectID = $resource.ResourceRef } } + $healthFaults += $healthFault } - - # print the overall configuration state to screen, with each of the messages that were captured - # as part of the detailedinfo property - if ($errorMessages) { - $msg = "{0} is reporting configurationState status {1}:`n`t- {2}" -f $object.resourceRef, $object.properties.configurationState.Status, ($errorMessages -join "`n`t- ") - } - else { - $msg = "{0} is reporting configurationState status {1}" -f $object.resourceRef, $object.properties.configurationState.Status - } - - $msg | Trace-Output -Level $traceLevel.ToString() } - - $details = [PSCustomObject]@{ - resourceRef = $object.resourceRef - configurationState = $object.properties.configurationState + } + else { + # if configuration state is not available, we will clear out any existing faults + if ($healthFaults.Count -eq 0) { + $successFault = [SdnFaultInfo]::new() + 
$successFault.KeyFaultingObjectDescription = $resource.ResourceRef + $successFault.KeyFaultingObjectType = [string]::Empty + $successFault.FaultingObjectLocation = [string]::Empty + $successFault.FaultDescription = [string]::Empty + $successFault.KeyFaultingObjectID = $resource.ResourceRef + $successFaults += $successFault } - - $array += $details } - - $sdnHealthObject.Properties = $array - return $sdnHealthObject } - catch { - $_ | Trace-Exception - $_ | Write-Error + + foreach ($fault in $healthFaults) { + LogWmiHealthFault -fault $fault } + + @($successFaults, $healthFaults) } -function Test-ResourceProvisioningState { +function IsConfigurationStateSkipped { + <# - .SYNOPSIS - Validate that the provisioningState of the resources. - #> + .SYNOPSIS + Checks if the configuration state should be skipped - [CmdletBinding(DefaultParameterSetName = 'RestCredential')] - param ( - [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, + .PARAMETER Source + The source of the configuration state - [Parameter(Mandatory = $false, ParameterSetName = 'RestCredential')] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $NcRestCredential = [System.Management.Automation.PSCredential]::Empty, + .PARAMETER Message + The message of the configuration state - [Parameter(Mandatory = $true, ParameterSetName = 'RestCertificate')] - [X509Certificate]$NcRestCertificate + .PARAMETER Code + The code of the configuration state + #> + + param( + [string] $Source, + [string] $Message, + [string] $Code ) - $ncRestParams = @{ - NcUri = $SdnEnvironmentObject.NcUrl - } - switch ($PSCmdlet.ParameterSetName) { - 'RestCertificate' { - $ncRestParams.Add('NcRestCertificate', $NcRestCertificate) - } - 'RestCredential' { - $ncRestParams.Add('NcRestCredential', $NcRestCredential) + if ($Source -eq "SoftwareLoadbalancerManager") { + if ($Code -eq "HostNotConnectedToController") { + return $true } } - $sdnHealthObject = [SdnHealth]::new() - 
$array = @() - - try { - "Validating provisioning state of {0}" -f $SdnEnvironmentObject.Role.ResourceName | Trace-Output + $false +} - $sdnResources = Get-SdnResource @ncRestParams -Resource $SdnEnvironmentObject.Role.ResourceName - foreach ($object in $sdnResources) { - # examine the provisioning state of the resources and display errors to the screen - $msg = "{0} is reporting provisioning state: {1}" -f $object.resourceRef, $object.properties.provisioningState +########################## +#### ARG COMPLETERS ###### +########################## - switch ($object.properties.provisioningState) { - 'Failed' { - $sdnHealthObject.Result = 'FAIL' - $msg | Trace-Output -Level:Error +$argScriptBlock = @{ + Role = { + param($commandName, $parameterName, $wordToComplete, $commandAst, $fakeBoundParameters) + $result = (Get-SdnFabricInfrastructureResult) + if ([string]::IsNullOrEmpty($wordToComplete)) { + return ($result.Role | Sort-Object -Unique) + } - $sdnHealthObject.Remediation += "[$($object.resourceRef)] Examine the Network Controller logs to determine why provisioning is $($object.properties.provisioningState)." 
- } + return $result.Role | Where-Object { $_ -like "*$wordToComplete*" } | Sort-Object + } + Name = { + param($commandName, $parameterName, $wordToComplete, $commandAst, $fakeBoundParameters) + $result = (Get-SdnFabricInfrastructureResult).RoleTest.HealthTest + if ([string]::IsNullOrEmpty($wordToComplete)) { + return ($result.Name | Sort-Object -Unique) + } - 'Updating' { - # if we already have a failure, we will not change the result to warning - if ($sdnHealthObject.Result -ne 'FAIL') { - $sdnHealthObject.Result = 'WARNING' - } + return $result | Where-Object { $_.Name -like "*$wordToComplete*" } | Sort-Object + } +} - # since we do not know what operations happened prior to this, we will log a warning - # and ask the user to monitor the provisioningState - $msg | Trace-Output -Level:Warning - $sdnHealthObject.Remediation += "[$($object.resourceRef)] Is reporting $($object.properties.provisioningState). Monitor to ensure that provisioningState moves to Succeeded." - } +Register-ArgumentCompleter -CommandName 'Get-SdnFabricInfrastructureResult' -ParameterName 'Role' -ScriptBlock $argScriptBlock.Role +Register-ArgumentCompleter -CommandName 'Get-SdnFabricInfrastructureResult' -ParameterName 'Name' -ScriptBlock $argScriptBlock.Name - default { - # this should cover scenario where provisioningState is 'Deleting' or Succeeded - $msg | Trace-Output -Level:Verbose - } - } +########################## +####### FUNCTIONS ######## +########################## - $details = [PSCustomObject]@{ - resourceRef = $object.resourceRef - provisioningState = $object.properties.provisioningState - } +function New-SdnHealthTest { + param ( + [Parameter(Mandatory = $false)] + [System.String]$Name = (Get-PSCallStack)[0].Command + ) - $array += $details - } + $object = [PSCustomObject]@{ + Name = $Name + Result = 'PASS' # default to PASS. 
Allowed values are PASS, WARN, FAIL + OccurrenceTime = [System.DateTime]::UtcNow + Properties = @() + Remediation = @() + HealthFault = [PSCustomObject]@() + } + + return $object +} + +function New-SdnRoleHealthReport { + param ( + [Parameter(Mandatory = $true)] + [System.String]$Role + ) - $sdnHealthObject.Properties = $array - return $sdnHealthObject + $object = [PSCustomObject]@{ + Role = $Role + ComputerName = $env:COMPUTERNAME + Result = 'PASS' # default to PASS. Allowed values are PASS, WARN, FAIL + OccurrenceTime = [System.DateTime]::UtcNow + HealthTest = @() # array of New-SdnHealthTest objects } - catch { - $_ | Trace-Exception - $_ | Write-Error + + return $object +} + +function New-SdnFabricHealthReport { + param ( + [Parameter(Mandatory = $true)] + [System.String]$Role + ) + + $object = [PSCustomObject]@{ + OccurrenceTime = [System.DateTime]::UtcNow + Role = $Role + Result = 'PASS' # default to PASS. Allowed values are PASS, WARN, FAIL + RoleTest = @() # array of New-SdnRoleHealthReport objects } + + return $object } -function Test-ScheduledTaskEnabled { - <# - .SYNOPSIS - Ensures the scheduled task responsible for etl compression is enabled and running - #> - [CmdletBinding()] +function Get-HealthData { param ( [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, + [System.String]$Property, - [Parameter(Mandatory = $false)] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $Credential = [System.Management.Automation.PSCredential]::Empty + [Parameter(Mandatory = $true)] + [System.String]$Id ) - $sdnHealthObject = [SdnHealth]::new() - $array = @() - - $scriptBlock = { + $results = $script:SdnDiagnostics_Health.Config[$Property] + return ($results[$Id]) +} - $object = [PSCustomObject]@{ - TaskName = 'SDN Diagnostics Task' - State = $null - } +function Write-HealthValidationInfo { + [CmdletBinding()] + param ( + [Parameter(Mandatory = $true)] + [String]$ComputerName, - try { - # check to 
see if logging is enabled on the registry key - # if it is not, return the object with the state set to 'Logging Disabled' - $isLoggingEnabled = Get-ItemPropertyValue -Path "HKLM:\Software\Microsoft\NetworkController\Sdn\Diagnostics\Parameters" -Name 'IsLoggingEnabled' - if (-NOT $isLoggingEnabled ) { - $object.State = 'Logging Disabled' - return $object - } + [Parameter(Mandatory = $true)] + [String]$Name, - $result = Get-ScheduledTask -TaskName 'SDN Diagnostics Task' -ErrorAction Stop - if ($result) { - $object.State = $result.State.ToString() - return $object - } - } - catch { - # if the scheduled task does not exist, return the object with the state set to 'Not Found' - $object.State = 'Not Found' - return $object - } - } + [Parameter(Mandatory = $false)] + [String[]]$Remediation + ) - try { - $scheduledTaskReady = Invoke-PSRemoteCommand -ComputerName $SdnEnvironmentObject.ComputerName -Credential $Credential -ScriptBlock $scriptBlock -AsJob -PassThru - foreach ($result in $scheduledTaskReady) { - switch ($result.State) { - 'Logging Disabled' { - "SDN Diagnostics Task is not available on {0} because logging is disabled." -f $result.PSComputerName | Trace-Output -Level:Verbose - } - 'Not Found' { - "Unable to locate SDN Diagnostics Task on {0}." -f $result.PSComputerName | Trace-Output -Level:Error - $sdnHealthObject.Result = 'FAIL' - } - 'Disabled' { - "SDN Diagnostics Task is disabled on {0}." -f $result.PSComputerName | Trace-Output -Level:Error - $sdnHealthObject.Result = 'FAIL' - $sdnHealthObject.Remediation += "Use 'Repair-SdnDiagnosticsScheduledTask' to enable the 'SDN Diagnostics Task' scheduled task on $($result.PSComputerName)." - } - default { - "SDN Diagnostics Task is {0} on {1}." 
-f $result.State, $result.PSComputerName | Trace-Output -Level:Verbose - } - } + $details = Get-HealthData -Property 'HealthValidations' -Id $Name - $array += [PSCustomObject]@{ - State = $result.State - Computer = $result.PSComputerName - } - } + $outputString += "`r`n`r`n" + $outputString += "--------------------------`r`n" + $outputString += "[$ComputerName] $Name" + $outputString += "`r`n`r`n" + $outputString += "Description:`t$($details.Description)`r`n" + $outputString += "Impact:`t`t`t$($details.Impact)`r`n" - $sdnHealthObject.Properties = $array - return $sdnHealthObject + if (-NOT [string]::IsNullOrEmpty($Remediation)) { + $outputString += "Remediation:`r`n`t - $($Remediation -join "`r`n`t - ")`r`n" } - catch { - $_ | Trace-Exception - $_ | Write-Error + + if (-NOT [string]::IsNullOrEmpty($details.PublicDocUrl)) { + $outputString += "`r`n" + $outputString += "Additional information can be found at $($details.PublicDocUrl).`r`n" } + + $outputString += "`r`n--------------------------`r`n" + + $outputString | Write-Host -ForegroundColor Yellow } -function Test-ServerHostId { +function Debug-SdnFabricInfrastructure { <# .SYNOPSIS - Queries the NCHostAgent HostID registry key value across the hypervisor hosts to ensure the HostID matches known InstanceID results from NC Servers API. + Executes a series of fabric validation tests to validate the state and health of the underlying components within the SDN fabric. + .PARAMETER NetworkController + Specifies the name or IP address of the network controller node on which this cmdlet operates. The parameter is optional if running on network controller node. + .PARAMETER ComputerName + Type the NetBIOS name, an IP address, or a fully qualified domain name of one or more remote computers. + .PARAMETER Role + The specific SDN role(s) to perform tests and validations for. If ommitted, defaults to all roles. + .PARAMETER Credential + Specifies a user account that has permission to perform this action. 
The default is the current user. + .PARAMETER NcRestCertificate + Specifies the client certificate that is used for a secure web request to Network Controller REST API. + Enter a variable that contains a certificate or a command or expression that gets the certificate. + .PARAMETER NcRestCredential + Specifies a user account that has permission to perform this action against the Network Controller REST API. The default is the current user. + .EXAMPLE + PS> Debug-SdnFabricInfrastructure + .EXAMPLE + PS> Debug-SdnFabricInfrastructure -NetworkController 'NC01' -Credential (Get-Credential) -NcRestCredential (Get-Credential) #> - [CmdletBinding(DefaultParameterSetName = 'RestCredential')] + [CmdletBinding(DefaultParameterSetName = 'Role')] param ( - [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, + [Parameter(Mandatory = $false, ParameterSetName = 'Role')] + [Parameter(Mandatory = $false, ParameterSetName = 'ComputerName')] + [System.String]$NetworkController = $env:COMPUTERNAME, - [Parameter(Mandatory = $false)] + [Parameter(Mandatory = $false, ParameterSetName = 'Role')] + [ValidateSet('Gateway', 'NetworkController', 'Server', 'LoadBalancerMux')] + [String[]]$Role = ('Gateway', 'LoadBalancerMux', 'NetworkController', 'Server'), + + [Parameter(Mandatory = $true, ParameterSetName = 'ComputerName')] + [System.String[]]$ComputerName, + + [Parameter(Mandatory = $false, ParameterSetName = 'Role')] + [Parameter(Mandatory = $false, ParameterSetName = 'ComputerName')] [System.Management.Automation.PSCredential] [System.Management.Automation.Credential()] $Credential = [System.Management.Automation.PSCredential]::Empty, - [Parameter(Mandatory = $false, ParameterSetName = 'RestCredential')] + [Parameter(Mandatory = $false, ParameterSetName = 'Role')] + [Parameter(Mandatory = $false, ParameterSetName = 'ComputerName')] [System.Management.Automation.PSCredential] [System.Management.Automation.Credential()] $NcRestCredential = 
[System.Management.Automation.PSCredential]::Empty, - [Parameter(Mandatory = $true, ParameterSetName = 'RestCertificate')] + [Parameter(Mandatory = $false, ParameterSetName = 'Role')] + [Parameter(Mandatory = $false, ParameterSetName = 'ComputerName')] [X509Certificate]$NcRestCertificate ) - $ncRestParams = @{ - NcUri = $SdnEnvironmentObject.NcUrl + $script:SdnDiagnostics_Health.Cache = $null + $aggregateHealthReport = @() + if (Test-ComputerNameIsLocal -ComputerName $NetworkController) { + Confirm-IsNetworkController } - switch ($PSCmdlet.ParameterSetName) { - 'RestCertificate' { - $ncRestParams.Add('NcRestCertificate', $NcRestCertificate) - } - 'RestCredential' { - $ncRestParams.Add('NcRestCredential', $NcRestCredential) - } + + if ($PSBoundParameters.ContainsKey('NcRestCertificate')) { + $restCredParam = @{ NcRestCertificate = $NcRestCertificate } + } + else { + $restCredParam = @{ NcRestCredential = $NcRestCredential } } - $sdnHealthObject = [SdnHealth]::new() - $array = @() + $environmentInfo = Get-SdnInfrastructureInfo -NetworkController $NetworkController -Credential $Credential @restCredParam + if ($null -eq $environmentInfo) { + throw New-Object System.NullReferenceException("Unable to retrieve environment details") + } try { - "Validating Server HostID registry matches known InstanceIDs from Network Controller Servers API." 
| Trace-Output - - $scriptBlock = { - $result = Get-ItemProperty -Path 'HKLM:\SYSTEM\CurrentControlSet\Services\NcHostAgent\Parameters' -Name 'HostId' -ErrorAction SilentlyContinue - return $result.HostID + # if we opted to specify the ComputerName rather than Role, we need to determine which role + # the computer names are associated with + if ($PSCmdlet.ParameterSetName -ieq 'ComputerName') { + $Role = @() + $ComputerName | ForEach-Object { + $computerRole = $_ | Get-SdnRole -EnvironmentInfo $environmentInfo + if ($computerRole) { + $Role += $computerRole + } + } } - $servers = Get-SdnResource @ncRestParams -Resource $SdnEnvironmentObject.Role.ResourceName - $hostId = Invoke-PSRemoteCommand -ComputerName $SdnEnvironmentObject.ComputerName -Credential $Credential -ScriptBlock $scriptBlock -AsJob -PassThru - foreach($id in $hostId){ - if($id -inotin $servers.instanceId){ - "{0}'s HostID {1} does not match known instanceID results in Network Controller Server REST API" -f $id.PSComputerName, $id | Trace-Output -Level:Error - $sdnHealthObject.Result = 'FAIL' - $sdnHealthObject.Remediation += "Update the HostId registry key on $($id.PSComputerName) to match the InstanceId of the Server resource in Network Controller" - - $object = [PSCustomObject]@{ - HostID = $id - Computer = $id.PSComputerName - } + $Role = $Role | Sort-Object -Unique + foreach ($object in $Role) { + "Processing tests for {0} role" -f $object.ToString() | Trace-Output -Level:Verbose + $config = Get-SdnModuleConfiguration -Role $object.ToString() - $array += $object + $roleHealthReport = New-SdnFabricHealthReport -Role $object.ToString() + $sdnFabricDetails = [PSCustomObject]@{ + ComputerName = $null + NcUrl = $environmentInfo.NcUrl + Role = $config + EnvironmentInfo = $environmentInfo } - else { - "{0}'s HostID {1} matches known InstanceID in Network Controller Server REST API" -f $id.PSComputerName, $id | Trace-Output -Level:Verbose + + # check to see if we were provided a specific computer(s) to 
test against + # otherwise we will want to pick up the node name(s) from the environment info + if ($ComputerName) { + $sdnFabricDetails.ComputerName = $ComputerName } - } + else { + # in scenarios where there are not mux(es) or gateway(s) then we need to gracefully handle this + # and move to the next role for processing + if ($null -ieq $environmentInfo[$object.ToString()]) { + "Unable to locate fabric nodes for {0}. Skipping health tests." -f $object.ToString() | Trace-Output -Level:Warning + continue + } - $sdnHealthObject.Properties = $array - return $sdnHealthObject - } - catch { - $_ | Trace-Exception - $_ | Write-Error - } -} + $sdnFabricDetails.ComputerName = $environmentInfo[$object.ToString()] + } -function Test-ServiceFabricApplicationHealth { - <# - .SYNOPSIS - Validate the health of the Network Controller application within Service Fabric. - #> + $restApiParams = @{ + NcUri = $sdnFabricDetails.NcUrl + } + $restApiParams += $restCredParam - [CmdletBinding()] - param ( - [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, + # before proceeding with tests, ensure that the computer objects we are testing against are running the latest version of SdnDiagnostics + Install-SdnDiagnostics -ComputerName $sdnFabricDetails.ComputerName -Credential $Credential - [Parameter(Mandatory = $false)] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $Credential = [System.Management.Automation.PSCredential]::Empty - ) + $params = @{ + ComputerName = $sdnFabricDetails.ComputerName + Credential = $Credential + ScriptBlock = $null + ArgumentList = @($restApiParams) + } - $sdnHealthObject = [SdnHealth]::new() + switch ($object) { + 'Gateway' { $params.ScriptBlock = { param($boundParams) Debug-SdnGateway @boundParams } } + 'LoadBalancerMux' { $params.ScriptBlock = { param($boundParams) Debug-SdnLoadBalancerMux @boundParams } } + 'NetworkController' { $params.ScriptBlock = { param($boundParams) 
Debug-SdnNetworkController @boundParams } } + 'Server' { $params.ScriptBlock = { param($boundParams) Debug-SdnServer @boundParams } } + } - try { - "Validating the Service Fabric Application Health for Network Controller" | Trace-Output + $healthReport = Invoke-SdnCommand @params - $ncNodes = Get-SdnServiceFabricNode -NetworkController $SdnEnvironmentObject.ComputerName[0] -Credential $credential - if($null -eq $ncNodes){ - throw New-Object System.NullReferenceException("Unable to retrieve service fabric nodes") - } + # evaluate the results of the tests and determine if any completed with Warning or FAIL + # if so, we will want to set the Result of the report to reflect this + foreach ($test in $healthReport) { + if ($test.Result -ieq 'WARN') { + $roleHealthReport.Result = 'WARN' + } + if ($test.Result -ieq 'FAIL') { + $roleHealthReport.Result = 'FAIL' + break + } + } - $applicationHealth = Get-SdnServiceFabricApplicationHealth -NetworkController $SdnEnvironmentObject.ComputerName[0] -Credential $Credential - if ($applicationHealth.AggregatedHealthState -ine 'Ok') { - $sdnHealthObject.Result = 'FAIL' - $sdnHealthObject.Remediation += "Examine the Service Fabric Application Health for Network Controller to determine why the health is not OK." + $roleHealthReport.RoleTest += $healthReport + $aggregateHealthReport += $roleHealthReport } - - return $sdnHealthObject } catch { $_ | Trace-Exception $_ | Write-Error } -} + finally { + if ($aggregateHealthReport) { -function Test-ServiceFabricClusterHealth { - <# - .SYNOPSIS - Validate the health of the Network Controller cluster within Service Fabric. 
- #> + # enumerate all the roles that were tested so we can determine if any completed with Warning or FAIL + $aggregateHealthReport | ForEach-Object { + if ($_.Result -ine 'PASS') { - [CmdletBinding()] - param ( - [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, + # enumerate all the individual role tests performed so we can determine if any completed that are not PASS + $_.RoleTest | ForEach-Object { + $c = $_.ComputerName + $_.HealthTest | ForEach-Object { + + # enum only the health tests that failed + if ($_.Result -ine 'PASS') { + # add the remediation steps to an array list so we can pass it to the Write-HealthValidationInfo function + # otherwise if we pass it directly, it will be treated as a single string + $remediationList = [System.Collections.ArrayList]::new() + $_.Remediation | ForEach-Object { [void]$remediationList.Add($_) } + + Write-HealthValidationInfo -ComputerName $c -Name $_.Name -Remediation $remediationList + } + } + } + } + } - [Parameter(Mandatory = $false)] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $Credential = [System.Management.Automation.PSCredential]::Empty - ) + # save the aggregate health report to cache so we can use it for further analysis + $script:SdnDiagnostics_Health.Cache = $aggregateHealthReport + } + } - $sdnHealthObject = [SdnHealth]::new() + if ($script:SdnDiagnostics_Health.Cache) { + "Results for fabric health have been saved to cache for further analysis. Use 'Get-SdnFabricInfrastructureResult' to examine the results." 
| Trace-Output + return $script:SdnDiagnostics_Health.Cache + } +} - try { - "Validating the Service Fabric Cluster Health for Network Controller" | Trace-Output +function GetLogLocation { - $ncNodes = Get-SdnServiceFabricNode -NetworkController $SdnEnvironmentObject.ComputerName[0] -Credential $credential - if($null -eq $ncNodes){ - throw New-Object System.NullReferenceException("Unable to retrieve service fabric nodes") - } + <# + .SYNOPSIS + Gets the log location file path for SDN Health, returns null if none is set + #> - $clusterHealth = Get-SdnServiceFabricClusterHealth -NetworkController $SdnEnvironmentObject.ComputerName[0] -Credential $Credential - if ($clusterHealth.AggregatedHealthState -ine 'Ok') { - $sdnHealthObject.Result = 'FAIL' - $sdnHealthObject.Remediation += "Examine the Service Fabric Cluster Health for Network Controller to determine why the health is not OK." - } + $RegistryPath = "HKLM:\SOFTWARE\Microsoft\SdnHealth" + $logPath = Get-ItemProperty -Path $RegistryPath -Name LogPath -ErrorAction SilentlyContinue + if ($null -ne $logPath) { + return $logPath.LogPath + } + else { + return $null + } +} +function SetLogLocation { + <# + .SYNOPSIS + Sets the location of the log path for the SDN diagnostics module + + .PARAMETER logPath + The path to the log file + #> + param( + [string] $logPath + ) + + $RegistryPath = "HKLM:\SOFTWARE\Microsoft\SdnHealth" - return $sdnHealthObject + if (-not (Test-Path $RegistryPath)) { + New-Item -Path $RegistryPath -Force | Out-Null } - catch { - $_ | Trace-Exception - $_ | Write-Error + + if ([string]::IsNullOrEmpty($logPath)) { + Remove-ItemProperty -Path $RegistryPath -Name logPath -ErrorAction SilentlyContinue + } + else { + New-ItemProperty -Path $RegistryPath -Name logPath -Value $logPath -Force | Out-Null } } -function Test-ServiceFabricNodeStatus { +function Start-SdnHealthFault { <# - .SYNOPSIS - Validate the health of the Network Controller nodes within Service Fabric. 
+ .SYNOPSIS + Executes a series of fabric validation tests to validate the state and health of the underlying components within the SDN fabric. + #> [CmdletBinding()] param ( - [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, + [Parameter(Mandatory = $false)] + [bool] $Poll = $false, [Parameter(Mandatory = $false)] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $Credential = [System.Management.Automation.PSCredential]::Empty + [int] $PollIntervalSeconds = 30 ) - $sdnHealthObject = [SdnHealth]::new() - + Write-Verbose "Starting SDN Health Faults" + [bool] $transcriptStarted = $false try { - "Validating the Service Fabric Nodes for Network Controller" | Trace-Output - - $ncNodes = Get-SdnServiceFabricNode -NetworkController $SdnEnvironmentObject.ComputerName[0] -Credential $credential - if($null -eq $ncNodes){ - throw New-Object System.NullReferenceException("Unable to retrieve service fabric nodes") - } - foreach ($node in $ncNodes) { - if ($node.NodeStatus -ine 'Up') { - $sdnHealthObject.Result = 'FAIL' - $sdnHealthObject.Remediation = 'Examine the Service Fabric Nodes for Network Controller to determine why the node is not Up.' 
+ # todo : change logpath + $transcriptFile = Join-Path -Path $Env:TEMP -ChildPath "SdnDiag.log" + Start-Transcript -Path $transcriptFile -Append + $transcriptStarted = $true + + do { + + # Test encapoverhead settings + Test-SdnEncapOverhead + + # Test all SDN Services + $validServiceRoles = @( + "ApiService", + "ControllerService", + "FirewallService", + "FnmService", + "GatewayManager", + "ServiceInsertion", + "VSwitchService" + ) + Test-SdnClusterServiceState -ServiceName $validServiceRoles + + # Test all agent services + $agentServices = @( + 'NcHostAgent', + 'SlbHostAgent' + ) + Test-SdnServiceState -ServiceName $agentServices + + # Test certificate related faults + Test-SdnNonSelfSignedCertificateInTrustedRootStore + + # Test tenant configuration states + Test-SdnConfigurationState + + if ($Poll) { + Start-Sleep -Seconds $PollIntervalSeconds } - } - return $sdnHealthObject + } until($Poll -eq $false); } catch { - $_ | Trace-Exception $_ | Write-Error } + finally { + if ($transcriptStarted) { + Stop-Transcript + } + } } -function Test-ServiceFabricPartitionDatabaseSize { +function GetSdnResourceFromNc { <# - .SYNOPSIS - Validate the Service Fabric partition size for each of the services running on Network Controller. + .SYNOPSIS + Wrapper around Get-SdnResource which attempts using different available certificates + NOTE: this is specifically for ASZ env because the nc cmdlets do not work there + + .PARAMETER NcUri + The base URI of the Network Controller. (https://) + + .PARAMETER Resource + The resource to retrieve from the Network Controller. + + .PARAMETER ApiVersion + (optional) The version of the resource to retrieve from the Network Controller. 
+ note: if nothing is specified, v1 is queried + #> [CmdletBinding()] param ( + [Parameter(Mandatory = $false)] + [string] $NcUri, + [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, + [ValidateSet('Servers', 'NetworkInterfaces', 'VirtualNetworks', 'LogicalNetworks')] + [String]$ResourceType, [Parameter(Mandatory = $false)] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $Credential = [System.Management.Automation.PSCredential]::Empty + [String]$ApiVersion = 'v1' ) - $sdnHealthObject = [SdnHealth]::new() - $array = @() - - try { - "Validate the size of the Service Fabric Partition Databases for Network Controller services" | Trace-Output - - $ncNodes = Get-SdnServiceFabricNode -NetworkController $SdnEnvironmentObject.ComputerName[0] -Credential $credential - if($null -eq $ncNodes){ - throw New-Object System.NullReferenceException("Unable to retrieve service fabric nodes") - } - - foreach($node in $ncNodes){ - $ncApp = Invoke-SdnServiceFabricCommand -NetworkController $SdnEnvironmentObject.ComputerName[0] -Credential $Credential -ScriptBlock { - param([Parameter(Position = 0)][String]$param1) + $certs = @() + $certs += $null + $resources = $null + $NcUri = $NcUri.TrimEnd('/') - # The 3>$null 4>$null sends unwanted verbose and debug streams into the bit bucket - $null = Connect-ServiceFabricCluster -TimeoutSec 15 3>$null 4>$null - Get-ServiceFabricDeployedApplication -ApplicationName 'fabric:/NetworkController' -NodeName $param1 - } -ArgumentList @($node.NodeName.ToString()) + $sdnRequestParams = @{ + NcUri = $NcUri + ResourceRef = $ResourceType + ApiVersion = $ApiVersion + NcRestCertificate = $null + } - $ncAppWorkDir = $ncApp.WorkDirectory - if($null -eq $ncAppWorkDir){ - throw New-Object System.NullReferenceException("Unable to retrieve working directory path") + try { + $certs += Get-SdnServerCertificate + [System.Array]::Reverse($certs) + foreach ($cert in $certs) { + if ($null -ieq $cert) { 
+ $sdnRequestParams = @{ + NcUri = $NcUri + ResourceRef = $ResourceType + ApiVersion = $ApiVersion + } } + else { + $sdnRequestParams = @{ + NcUri = $NcUri + ResourceRef = $ResourceType + ApiVersion = $ApiVersion + NcRestCertificate = $cert + } - # Only stateful service have the database file - $ncServices = Get-SdnServiceFabricService -NetworkController $SdnEnvironmentObject.ComputerName[0] -Credential $Credential | Where-Object {$_.ServiceKind -eq "Stateful"} - - foreach ($ncService in $ncServices){ - $replica = Get-SdnServiceFabricReplica -NetworkController $SdnEnvironmentObject.ComputerName[0] -ServiceName $ncService.ServiceName -Credential $Credential | Where-Object {$_.NodeName -eq $node.NodeName} - $imosStorePath = Join-Path -Path $ncAppWorkDir -ChildPath "P_$($replica.PartitionId)\R_$($replica.ReplicaId)\ImosStore" - $imosStoreFile = Invoke-PSRemoteCommand -ComputerName $node.NodeName -Credential $Credential -ScriptBlock { - param([Parameter(Position = 0)][String]$param1) - if (Test-Path -Path $param1) { - return (Get-Item -Path $param1) - } - else { - return $null - } - } -ArgumentList @($imosStorePath) - - if($null -ne $imosStoreFile){ - $formatedByteSize = Format-ByteSize -Bytes $imosStoreFile.Length - - $imosInfo = [PSCustomObject]@{ - Node = $node.NodeName - Service = $ncService.ServiceName - ImosSize = $formatedByteSize.GB - } - - # if the imos database file exceeds 4GB, want to indicate failure as it should not grow to be larger than this size - # need to perform InvariantCulture to ensure that the decimal separator is a period - if([float]::Parse($formatedByteSize.GB, [System.Globalization.NumberStyles]::Float, [System.Globalization.CultureInfo]::InvariantCulture) -gt 4){ - "[{0}] Service {1} is reporting {2} GB in size" -f $node.NodeName, $ncService.ServiceName, $formatedByteSize.GB | Trace-Output -Level:Warning - - $sdnHealthObject.Result = 'FAIL' - $sdnHealthObject.Remediation = "Engage Microsoft CSS for further support" - } - else { - "[{0}] 
Service {1} is reporting {2} GB in size" -f $node.NodeName, $ncService.ServiceName, $formatedByteSize.GB | Trace-Output -Level:Verbose - } + Write-Verbose "Retrieving $NcUri with certificate $($cert.Subject) thumbprint $($cert.Thumbprint)" + } - $array += $imosInfo + try { + $resources = Get-SdnResource @sdnRequestParams + if ($resources) { + Write-Verbose "Retrieved $($resources.Count) resources for $ResourceType" + return $resources + } + } + catch [System.Net.WebException] { + if ( $_.Exception.Response.StatusCode -eq [System.Net.HttpStatusCode]::Unauthorized ) { + continue } else { - "No ImosStore file for service {0} found on node {1} from {2}" -f $ncService.ServiceName, $node.NodeName, $imosStorePath | Trace-Output -Level:Warning + Write-Error $_ + break } } + catch { + Write-Error $_ + # dont try other certificates + break + } } - $sdnHealthObject.Properties = $array - return $sdnHealthObject + return $null } catch { - $_ | Trace-Exception - $_ | Write-Error + Write-Error $_ } } -function Test-ServiceState { +function Get-SdnFabricInfrastructureResult { <# - .SYNOPSIS - Confirms that critical services for gateway are running + .SYNOPSIS + Returns the results that have been saved to cache as part of running Debug-SdnFabricInfrastructure. + .PARAMETER Role + The name of the SDN role that you want to return test results from within the cache. + .PARAMETER Name + The name of the test results you want to examine. 
+ .EXAMPLE + PS> Get-SdnFabricInfrastructureResult + .EXAMPLE + PS> Get-SdnFabricInfrastructureResult -Role Server + .EXAMPLE + PS> Get-SdnFabricInfrastructureResult -Role Server -Name 'Test-SdnServiceState' #> [CmdletBinding()] param ( - [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, + [Parameter(Mandatory = $false)] + [String]$Role, [Parameter(Mandatory = $false)] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $Credential = [System.Management.Automation.PSCredential]::Empty + [System.String]$Name ) - $sdnHealthObject = [SdnHealth]::new() - $array = @() - $serviceStateResults = @() + $cacheResults = $script:SdnDiagnostics_Health.Cache - try { - [string[]]$services = $SdnEnvironmentObject.Role.Properties.Services.Keys - if ([string]::IsNullOrEmpty($services)) { - return $sdnHealthObject + if ($PSBoundParameters.ContainsKey('Role')) { + if ($cacheResults) { + $cacheResults = $cacheResults | Where-Object { $_.Role -eq $Role } } + } - "Validating {0} service state for {1}" -f ($services -join ', '), ($SdnEnvironmentObject.ComputerName -join ', ') | Trace-Output + if ($PSBoundParameters.ContainsKey('Name')) { + if ($cacheResults) { + $cacheResults = $cacheResults.HealthValidation | Where-Object { $_.Name -eq $Name } + } + } - $scriptBlock = { - param([Parameter(Position = 0)][String]$param1) + return $cacheResults +} - $result = Get-Service -Name $param1 -ErrorAction SilentlyContinue - return $result - } +function Debug-SdnNetworkController { + [CmdletBinding(DefaultParameterSetName = 'RestCredential')] + param ( + [Parameter(Mandatory = $true)] + [ValidateScript({ + if ($_.Scheme -ne "http" -and $_.Scheme -ne "https") { + throw New-Object System.FormatException("Parameter is expected to be in http:// or https:// format.") + } + return $true + })] + [Uri]$NcUri, - foreach ($service in $services) { - $serviceStateResults += Invoke-PSRemoteCommand -ComputerName $SdnEnvironmentObject.ComputerName 
-Credential $Credential -Scriptblock $scriptBlock -ArgumentList $service - } + [Parameter(Mandatory = $false, ParameterSetName = 'RestCredential')] + [System.Management.Automation.PSCredential] + [System.Management.Automation.Credential()] + $NcRestCredential = [System.Management.Automation.PSCredential]::Empty, - foreach($result in $serviceStateResults){ - $array += $result + [Parameter(Mandatory = $true, ParameterSetName = 'RestCertificate')] + [X509Certificate]$NcRestCertificate + ) - if($result.Status -ine 'Running'){ - $sdnHealthObject.Result = 'FAIL' - $sdnHealthObject.Remediation += "Start $($result.Name) service on $($result.PSComputerName)" + Confirm-IsNetworkController + $healthReport = New-SdnRoleHealthReport -Role 'NetworkController' - "{0} is {1} on {2}" -f $result.Name, $result.Status, $result.PSComputerName | Trace-Output -Level:Error + try { + # execute tests for network controller, regardless of the cluster type + $healthReport.HealthTest += @( + Test-SdnNonSelfSignedCertificateInTrustedRootStore + ) + + # execute tests based on the cluster type + switch ($Global:SdnDiagnostics.EnvironmentInfo.ClusterConfigType) { + 'FailoverCluster' { + $healthReport.HealthTest += @( + Test-SdnDiagnosticsCleanupTaskEnabled -TaskName 'FcDiagnostics' + ) } - else { - "{0} is {1} on {2}" -f $result.Name, $result.Status, $result.PSComputerName | Trace-Output -Level:Verbose + 'ServiceFabric' { + $config_sf = Get-SdnModuleConfiguration -Role 'NetworkController_SF' + [string[]]$services_sf = $config_sf.properties.services.Keys + $healthReport.HealthTest += @( + Test-SdnDiagnosticsCleanupTaskEnabled -TaskName 'SDN Diagnostics Task' + Test-SdnServiceState -ServiceName $services_sf + Test-SdnServiceFabricApplicationHealth + Test-SdnServiceFabricClusterHealth + Test-SdnServiceFabricNodeStatus + ) } } - $sdnHealthObject.Properties = $array - return $sdnHealthObject + # enumerate all the tests performed so we can determine if any completed with WARN or FAIL + # if any of the 
tests completed with WARN, we will set the aggregate result to WARN + # if any of the tests completed with FAIL, we will set the aggregate result to FAIL and then break out of the foreach loop + # we will skip tests with PASS, as that is the default value + foreach ($test in $healthReport.HealthTest) { + if ($test.Result -eq 'WARN') { + $healthReport.Result = $test.Result + } + elseif ($test.Result -eq 'FAIL') { + $healthReport.Result = $test.Result + break + } + } } catch { $_ | Trace-Exception - $_ | Write-Error + $healthReport.Result = 'FAIL' } -} -function Test-SlbManagerConnectionToMux { - <# - .SYNOPSIS - Validates the TCP connection between LoadBalancerMuxes and primary replica of SlbManager service within Network Controller. - #> + return $healthReport +} +function Debug-SdnServer { [CmdletBinding(DefaultParameterSetName = 'RestCredential')] param ( [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, - - [Parameter(Mandatory = $false)] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $Credential = [System.Management.Automation.PSCredential]::Empty, + [ValidateScript({ + if ($_.Scheme -ne "http" -and $_.Scheme -ne "https") { + throw New-Object System.FormatException("Parameter is expected to be in http:// or https:// format.") + } + return $true + })] + [Uri]$NcUri, [Parameter(Mandatory = $false, ParameterSetName = 'RestCredential')] [System.Management.Automation.PSCredential] @@ -1473,503 +1514,1284 @@ function Test-SlbManagerConnectionToMux { [X509Certificate]$NcRestCertificate ) - $ncRestParams = @{ - NcUri = $SdnEnvironmentObject.NcUrl - } - switch ($PSCmdlet.ParameterSetName) { - 'RestCertificate' { - $ncRestParams.Add('NcRestCertificate', $NcRestCertificate) + Confirm-IsServer + $config = Get-SdnModuleConfiguration -Role 'Server' + [string[]]$services = $config.properties.services.Keys + $healthReport = New-SdnRoleHealthReport -Role 'Server' + + $ncRestParams = $PSBoundParameters + 
$serverResource = Get-SdnResource @ncRestParams -Resource:Servers + + try { + # execute tests based on the cluster type + switch ($Global:SdnDiagnostics.EnvironmentInfo.ClusterConfigType) { + 'ServiceFabric' { + $healthReport.HealthTest += @( + Test-SdnDiagnosticsCleanupTaskEnabled -TaskName 'SDN Diagnostics Task' + ) + } + 'FailoverCluster' { + $healthReport.HealthTest += @( + Test-SdnDiagnosticsCleanupTaskEnabled -TaskName 'FcDiagnostics' + ) + } } - 'RestCredential' { - $ncRestParams.Add('NcRestCredential', $NcRestCredential) + + # these tests are executed locally and have no dependencies on network controller rest API being available + $healthReport.HealthTest += @( + Test-SdnNonSelfSignedCertificateInTrustedRootStore + Test-SdnEncapOverhead + Test-VfpDuplicateMacAddress + Test-VMNetAdapterDuplicateMacAddress + Test-SdnServiceState -ServiceName $services + Test-SdnProviderNetwork + Test-SdnHostAgentConnectionStateToApiService + Test-SdnNetworkControllerApiNameResolution -NcUri $NcUri + ) + + # these tests have dependencies on network controller rest API being available + # and will only be executed if we have been able to get the data from the network controller + if ($serverResource) { + $healthReport.HealthTest += @( + Test-ServerHostId -InstanceId $serverResource.InstanceId + ) + } + + # enumerate all the tests performed so we can determine if any completed with WARN or FAIL + # if any of the tests completed with WARN, we will set the aggregate result to WARN + # if any of the tests completed with FAIL, we will set the aggregate result to FAIL and then break out of the foreach loop + # we will skip tests with PASS, as that is the default value + foreach ($test in $healthReport.HealthTest) { + if ($test.Result -eq 'WARN') { + $healthReport.Result = $test.Result + } + elseif ($test.Result -eq 'FAIL') { + $healthReport.Result = $test.Result + break + } } } + catch { + $_ | Trace-Exception + $healthReport.Result = 'FAIL' + } - $sdnHealthObject = 
[SdnHealth]::new() - $array = @() + return $healthReport +} - $netConnectionExistsScriptBlock = { - $tcpConnection = Get-NetTCPConnection -LocalPort 8560 -ErrorAction SilentlyContinue | Where-Object { $_.State -eq "Established" } - if ($tcpConnection) { - return $true - } - } +function Debug-SdnLoadBalancerMux { + [CmdletBinding(DefaultParameterSetName = 'RestCredential')] + param ( + [Parameter(Mandatory = $true)] + [ValidateScript({ + if ($_.Scheme -ne "http" -and $_.Scheme -ne "https") { + throw New-Object System.FormatException("Parameter is expected to be in http:// or https:// format.") + } + return $true + })] + [Uri]$NcUri, - try { - "Validating connectivity between LoadBalancerMuxes and primary replica of SlbManager service within Network Controller" | Trace-Output - $loadBalancerMux = Get-SdnLoadBalancerMux @ncRestParams + [Parameter(Mandatory = $false, ParameterSetName = 'RestCredential')] + [System.Management.Automation.PSCredential] + [System.Management.Automation.Credential()] + $NcRestCredential = [System.Management.Automation.PSCredential]::Empty, - # if no load balancer muxes configured within the environment, return back the health object to caller - if ($null -ieq $loadBalancerMux) { - return $sdnHealthObject - } + [Parameter(Mandatory = $true, ParameterSetName = 'RestCertificate')] + [X509Certificate]$NcRestCertificate + ) - # get the current primary replica of Network Controller - # if we cannot return the primary replica, then something is critically wrong with Network Controller - # in which case we should mark this test as failed and return back to the caller with guidance to fix the SlbManagerService - $primaryReplicaNode = Get-SdnServiceFabricReplica -NetworkController $SdnEnvironmentObject.EnvironmentInfo.NetworkController[0] -ServiceTypeName 'SlbManagerService' -Credential $NcRestCredential -Primary - if ($null -ieq $primaryReplicaNode) { - "Unable to return primary replica of SlbManagerService" | Trace-Output -Level:Error - 
$sdnHealthObject.Result = 'FAIL' - $sdnHealthObject.Remediation = "Fix the primary replica of SlbManagerService within Network Controller." - return $sdnHealthObject - } + Confirm-IsLoadBalancerMux + $config = Get-SdnModuleConfiguration -Role 'LoadBalancerMux' + [string[]]$services = $config.properties.services.Keys + $healthReport = New-SdnRoleHealthReport -Role 'LoadBalancerMux' - # enumerate through the load balancer muxes in the environment and validate the TCP connection state - # we expect the primary replica for SlbManager within Network Controller to have an active connection for DIP:VIP programming to the Muxes - foreach ($mux in $loadBalancerMux) { - $virtualServer = Get-SdnResource @ncRestParams -ResourceRef $mux.properties.virtualServer.resourceRef - $virtualServerConnection = $virtualServer.properties.connections[0].managementAddresses - $connectionExists = Invoke-PSRemoteCommand -ComputerName $virtualServerConnection -Credential $Credential -ScriptBlock $netConnectionExistsScriptBlock - if (-NOT $connectionExists) { - "{0} is not connected to SlbManager of Network Controller" -f $mux.resourceRef | Trace-Output -Level:Error - $sdnHealthObject.Result = 'FAIL' - $sdnHealthObject.Remediation += "Investigate and fix TCP connectivity or x509 authentication between $($primaryReplicaNode.ReplicaAddress) and $($mux.resourceRef)." 
- - $object = [PSCustomObject]@{ - LoadBalancerMux = $mux.resourceRef - SlbManagerPrimaryReplica = $primaryReplicaNode.ReplicaAddress - } + $ncRestParams = $PSBoundParameters - $array += $object + try { + $muxCertRegKey = Get-ItemProperty -Path "HKLM:\SYSTEM\CurrentControlSet\Services\SlbMux" -Name MuxCert + $virtualServers = Get-SdnResource -Resource VirtualServers @ncRestParams + $muxVirtualServer = $virtualServers | Where-Object { $_.properties.connections.managementaddresses -contains $muxCertRegKey.MuxCert } + $loadBalancerMux = Get-SdnLoadBalancerMux @ncRestParams | Where-Object { $_.properties.virtualserver.resourceRef -ieq $muxVirtualServer.resourceRef } + $peerRouters = $loadBalancerMux.properties.routerConfiguration.peerRouterConfigurations.routerIPAddress + + $healthReport.HealthTest += @( + Test-SdnNonSelfSignedCertificateInTrustedRootStore + Test-SdnServiceState -ServiceName $services + Test-SdnDiagnosticsCleanupTaskEnabled -TaskName 'SDN Diagnostics Task' + Test-SdnMuxConnectionStateToSlbManager + Test-SdnNetworkControllerApiNameResolution -NcUri $NcUri + ) + + # these tests have dependencies on network controller rest API being available + # and will only be executed if we have been able to get the data from the network controller + if ($muxVirtualServer) { + $healthReport.HealthTest += @( + Test-SdnMuxConnectionStateToRouter -RouterIPAddress $peerRouters + ) + } + + # enumerate all the tests performed so we can determine if any completed with WARN or FAIL + # if any of the tests completed with WARN, we will set the aggregate result to WARN + # if any of the tests completed with FAIL, we will set the aggregate result to FAIL and then break out of the foreach loop + # we will skip tests with PASS, as that is the default value + foreach ($test in $healthReport.HealthTest) { + if ($test.Result -eq 'WARN') { + $healthReport.Result = $test.Result } - else { - "{0} is connected to {1}" -f $mux.resourceRef, $primaryReplicaNode.ReplicaAddress | Trace-Output 
-Level:Verbose + elseif ($test.Result -eq 'FAIL') { + $healthReport.Result = $test.Result + break } } - - $sdnHealthObject.Properties = $array - return $sdnHealthObject } catch { $_ | Trace-Exception - $_ | Write-Error + $healthReport.Result = 'FAIL' } -} -function Test-VfpDuplicatePort { - <# - .SYNOPSIS - Validate there are no ports within VFP layer that may have duplicate MAC addresses. - #> + return $healthReport +} - [CmdletBinding()] +function Debug-SdnGateway { + [CmdletBinding(DefaultParameterSetName = 'RestCredential')] param ( - [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, + [Parameter(Mandatory = $true, ParameterSetName = 'RestCredential')] + [Parameter(Mandatory = $true, ParameterSetName = 'RestCertificate')] + [ValidateScript({ + if ($_.Scheme -ne "http" -and $_.Scheme -ne "https") { + throw New-Object System.FormatException("Parameter is expected to be in http:// or https:// format.") + } + return $true + })] + [Uri]$NcUri, - [Parameter(Mandatory = $false)] + [Parameter(Mandatory = $false, ParameterSetName = 'RestCredential')] [System.Management.Automation.PSCredential] [System.Management.Automation.Credential()] - $Credential = [System.Management.Automation.PSCredential]::Empty + $NcRestCredential = [System.Management.Automation.PSCredential]::Empty, + + [Parameter(Mandatory = $true, ParameterSetName = 'RestCertificate')] + [X509Certificate]$NcRestCertificate ) - $sdnHealthObject = [SdnHealth]::new() - $array = @() + Confirm-IsRasGateway + $config = Get-SdnModuleConfiguration -Role 'Gateway' + [string[]]$services = $config.properties.services.Keys + $healthReport = New-SdnRoleHealthReport -Role 'Gateway' + + $ncRestParams = @{ + NcUri = $NcUri + } + switch ($PSCmdlet.ParameterSetName) { + 'RestCredential' { $ncRestParams += @{ NcRestCredential = $NcRestCredential } } + 'RestCertificate' { $ncRestParams += @{ NcRestCertificate = $NcRestCertificate } } + } try { - "Validate no duplicate MAC addresses for ports within 
Virtual Filtering Platform (VFP)" | Trace-Output - - $vfpPorts = Get-SdnVfpVmSwitchPort -ComputerName $SdnEnvironmentObject.ComputerName -Credential $Credential - $duplicateObjects = $vfpPorts | Where-Object {$_.MACaddress -ne '00-00-00-00-00-00' -and $null -ne $_.MacAddress} | Group-Object -Property MacAddress | Where-Object {$_.Count -ge 2} - if($duplicateObjects){ - $array += $duplicateObjects - $sdnHealthObject.Result = 'FAIL' - - # since there can be multiple grouped objects, we need to enumerate each duplicate group - foreach($obj in $duplicateObjects){ - $sdnHealthObject.Remediation += "Remove the duplicate MAC addresses for $($obj.Name) within VFP" - - "Located {0} VFP ports associated with {1}:`r`n`n{2}`r`n" -f $obj.Count, $obj.Name, ` - ($obj.Group ` - | Select-Object @{n="Portname";e={"`t$($_.Portname)"}} ` - | Select-Object -ExpandProperty Portname ` - | Out-String ` - ) | Trace-Output -Level:Error + $healthReport.HealthTest += @( + Test-SdnNonSelfSignedCertificateInTrustedRootStore + Test-SdnDiagnosticsCleanupTaskEnabled -TaskName 'SDN Diagnostics Task' + Test-SdnServiceState -ServiceName $services + ) + + # enumerate all the tests performed so we can determine if any completed with Warning or FAIL + # if any of the tests completed with Warning, we will set the aggregate result to Warning + # if any of the tests completed with FAIL, we will set the aggregate result to FAIL and then break out of the foreach loop + # we will skip tests with PASS, as that is the default value + foreach ($test in $healthReport.HealthTest) { + if ($test.Result -eq 'Warning') { + $healthReport.Result = $test.Result + } + elseif ($test.Result -eq 'FAIL') { + $healthReport.Result = $test.Result + break } } - - $sdnHealthObject.Properties = $array - return $sdnHealthObject } catch { $_ | Trace-Exception - $_ | Write-Error + $healthReport.Result = 'FAIL' } + + return ( $healthReport ) } -function Test-VMNetAdapterDuplicateMacAddress { +################################### +#### 
COMMON HEALTH VALIDATIONS #### +################################### + +function Test-SdnNonSelfSignedCertificateInTrustedRootStore { <# .SYNOPSIS - Validate there are no adapters within hyper-v dataplane that may have duplicate MAC addresses. + Validate the Cert in Host's Root CA Store to detect if any Non Root Cert exist #> [CmdletBinding()] - param ( - [Parameter(Mandatory = $true)] - [SdnFabricEnvObject]$SdnEnvironmentObject, - - [Parameter(Mandatory = $false)] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $Credential = [System.Management.Automation.PSCredential]::Empty - ) + param () - $sdnHealthObject = [SdnHealth]::new() + Write-Verbose "$($PSCmdlet.MyInvocation.MyCommand.Name) invoked" + $sdnHealthTest = New-SdnHealthTest $array = @() try { - "Validate no duplicate MAC addresses for network adapters within Hyper-V" | Trace-Output - - $vmNetAdapters = Get-SdnVMNetworkAdapter -ComputerName $SdnEnvironmentObject.ComputerName -AsJob -PassThru -Timeout 900 -Credential $Credential - $duplicateObjects = $vmNetAdapters | Group-Object -Property MacAddress | Where-Object {$_.Count -ge 2} - if($duplicateObjects){ - $array += $duplicateObjects - $sdnHealthObject.Result = 'FAIL' - - # since there can be multiple grouped objects, we need to enumerate each duplicate group - foreach($obj in $duplicateObjects){ - $sdnHealthObject.Remediation += "Remove the duplicate MAC addresses for $($obj.Name) within Hyper-V" - "Located {0} virtual machines associated with MAC address {1}:`r`n`n{2}`r`n" -f $obj.Count, $obj.Name, ` - ($obj.Group ` - | Select-Object @{n="VMName";e={"`t$($_.VMName)"}} ` - | Select-Object -ExpandProperty VMName ` - | Out-String ` - ) | Trace-Output -Level:Error + $rootCerts = Get-ChildItem -Path 'Cert:LocalMachine\Root' | Where-Object { $_.Issuer -ne $_.Subject } + if ($rootCerts -or $rootCerts.Count -gt 0) { + $sdnHealthTest.Result = 'FAIL' + + $rootCerts | ForEach-Object { + $sdnHealthTest.Remediation += 
"Remove Certificate Thumbprint: $($_.Thumbprint) Subject: $($_.Subject)" + $array += [PSCustomObject]@{ + Thumbprint = $_.Thumbprint + Subject = $_.Subject + Issuer = $_.Issuer + } } } + $sdnHealthTest.Properties = $array + + + ########################################################################################## + ## ServiceState Fault Template + ########################################################################################## + # $KeyFaultingObjectDescription (SDN ID) : [HostName] + # $KeyFaultingObjectID (ARC ID) : [HostName] + # $KeyFaultingObjectType (CODE) : "NonSelfSignedCertificateInTrustedRootStore" + # $FaultingObjectLocation (SOURCE) : "CertificateConfiguration" + # $FaultDescription (MESSAGE) : "A non self signed ceritificate was found in trusted root store. This may lead to authentication problems." + # $FaultActionRemediation (ACTION) : "Investigate and remove certificate with subject [SubjectNamesCsv]" + # * Fault may be issued from each node + ########################################################################################## + if ($null -ne $array.Subject -and $array.Subject.Count -gt 0) { + $subjectNames = [string]::Join(",", $array.Subject) + } + else { + $subjectNames = "" + } + $healthFault = [SdnFaultInfo]::new() + $healthFault.KeyFaultingObjectDescription = $Env:COMPUTERNAME + $healthFault.KeyFaultingObjectID = $Env:COMPUTERNAME + $healthFault.KeyFaultingObjectType = "NonSelfSignedCertificateInTrustedRootStore" + $healthFault.FaultingObjectLocation = "CertificateConfiguration" + $healthFault.FaultDescription = "A non self signed ceritificate was found in trusted root store. This may lead to authentication problems." + $healthFault.FaultActionRemediation = "Investigate and remove certificate with subject(s) $($subjectNames)." 
- $sdnHealthObject.Properties = $array - return $sdnHealthObject + if ( $rootCerts -or $rootCerts.Count -gt 0) { + CreateorUpdateFault -Fault $healthFault + $convFault = ConvertFaultToPsObject -healthFault $healthFault -faultOpType "Create" + $sdnHealthTest.HealthFault += $convFault + } + else { + DeleteFault -Fault $healthFault + $convFault = ConvertFaultToPsObject -healthFault $healthFault -faultOpType "Delete" + $sdnHealthTest.HealthFault += $convFault + } } catch { $_ | Trace-Exception - $_ | Write-Error + $sdnHealthTest.Result = 'FAIL' } + finally { + Write-Verbose "$($PSCmdlet.MyInvocation.MyCommand.Name) exiting" + } + + return $sdnHealthTest } -function Write-HealthValidationInfo { +function Test-SdnServiceState { [CmdletBinding()] param ( - [Parameter(Mandatory = $true)] - [String]$Role, [Parameter(Mandatory = $true)] - [String]$Name, + [String[]]$ServiceName + ) - [Parameter(Mandatory = $false)] - [String[]]$Remediation + Write-Verbose "$($PSCmdlet.MyInvocation.MyCommand.Name) invoked for $($ServiceName)" + $sdnHealthTest = New-SdnHealthTest + $failureDetected = $false + $array = @() + + try { + foreach ($service in $ServiceName) { + $result = Get-Service -Name $service -ErrorAction Ignore + if ($result) { + $array += [PSCustomObject]@{ + ServiceName = $result.Name + Status = $result.Status + } + + if ($result.Status -ine 'Running') { + $failureDetected = $true + $sdnHealthTest.Remediation += "[$service] Start the service" + } + } + else { + $failureDetected = $true + } + + ########################################################################################## + ## ServiceState Fault Template + ########################################################################################## + # $KeyFaultingObjectDescription (SDN ID) : [HostName] + # $KeyFaultingObjectID (ARC ID) : [ServiceName] + # $KeyFaultingObjectType (CODE) : [ServiceDown] + # $FaultingObjectLocation (SOURCE) : [ServiceName] + # $FaultDescription (MESSAGE) : Service [ServiceName] is not 
up. + # $FaultActionRemediation (ACTION) : [ServiceName] Start the service + # *ServiceState faults will be reported from each node + ########################################################################################## + + $healthFault = [SdnFaultInfo]::new() + $healthFault.KeyFaultingObjectDescription = $Env:COMPUTERNAME + $healthFault.KeyFaultingObjectID = $service + $healthFault.KeyFaultingObjectType = "ServiceDown" + $healthFault.FaultingObjectLocation = $service + $healthFault.FaultDescription = "Service $($service) is not up." + $healthFault.FaultActionRemediation = "Start the cluster service role $($service) from failover cluster manager" + + if ($result.Status -ine 'Running') { + Write-Verbose "Creating fault for $($service) status $($result.Status)" + CreateorUpdateFault -Fault $healthFault + $convFault = ConvertFaultToPsObject -healthFault $healthFault -faultOpType "Create" + $sdnHealthTest.HealthFault += $convFault + } + else { + Write-Verbose "No fault(s) on $($service) clearing any existing ones" + DeleteFault -Fault $healthFault + $convFault = ConvertFaultToPsObject -healthFault $healthFault -faultOpType "Delete" + $sdnHealthTest.HealthFault += $convFault + } + } + + if ($failureDetected) { + $sdnHealthTest.Result = 'FAIL' + } + + if ($array) { + $sdnHealthTest.Properties = $array + } + } + catch { + $_ | Trace-Exception + $sdnHealthTest.Result = 'FAIL' + } + finally { + Write-Verbose "$($PSCmdlet.MyInvocation.MyCommand.Name) exiting" + } + + return $sdnHealthTest +} + + +function Test-SdnClusterServiceState { + [CmdletBinding()] + param ( + [Parameter(Mandatory = $true)] + [String[]]$ServiceName + ) + + $isCurrentNodeClusterOwner = IsCurrentNodeClusterOwner + if ($isCurrentNodeClusterOwner -eq $false) { + Write-Verbose "This node is not the cluster owner. Skipping health tests." 
+ return + } + + Write-Verbose "$($PSCmdlet.MyInvocation.MyCommand.Name) invoked" + $sdnHealthTest = New-SdnHealthTest + $failureDetected = $false + $array = @() + + try { + foreach ($service in $ServiceName) { + $result = Get-ClusterGroup -Name $service -ErrorAction Ignore + if ($result) { + $array += [PSCustomObject]@{ + ServiceName = $result.Name + Status = $result.State + } + Write-Verbose "$service state $($result.State)" + if ($result.State -ine 'Online') { + $failureDetected = $true + $sdnHealthTest.Remediation += "[$service] Start the service" + } + + ########################################################################################## + ## FailoverClusterServiceState Fault Template + ########################################################################################## + # $KeyFaultingObjectDescription (SDN ID) : [ServiceName] + # $KeyFaultingObjectID (ARC ID) : [ServiceName] + # $KeyFaultingObjectType (CODE) : ServiceUnavailable + # $FaultingObjectLocation (SOURCE) : [ServiceName] + # $FaultDescription (MESSAGE) : Service [ServiceName] is not up. 
+ # $FaultActionRemediation (ACTION) : [ServiceName] Start the service + # *ServiceState faults will be reported only on one (primary) cluster node + ########################################################################################## + + $healthFault = [SdnFaultInfo]::new() + $healthFault.KeyFaultingObjectDescription = $service + $healthFault.KeyFaultingObjectID = $service + $healthFault.KeyFaultingObjectType = "ServiceUnavailable" + $healthFault.FaultingObjectLocation = $service + $healthFault.FaultDescription = "Service $($service) is $($result.State) on Failover Cluster" + $healthFault.FaultActionRemediation = "Start the cluster service role $($service)" + + if ($result.State -ine 'Online') { + Write-Verbose "Creating fault for $($service)" + CreateorUpdateFault -Fault $healthFault + $convFault = ConvertFaultToPsObject -healthFault $healthFault -faultOpType "Create" + $sdnHealthTest.HealthFault += $convFault + } + else { + Write-Verbose "No fault(s) on $($service)" + DeleteFault -Fault $healthFault + $convFault = ConvertFaultToPsObject -healthFault $healthFault -faultOpType "Delete" + $sdnHealthTest.HealthFault += $convFault + } + } + else { + $sdnHealthTest.Result = 'FAIL' + } + } + + if ($failureDetected) { + $sdnHealthTest.Result = 'FAIL' + } + $sdnHealthTest.Properties = $array + } + catch { + $_ | Trace-Exception + $_ | Write-Error + } + finally { + Write-Verbose "$($PSCmdlet.MyInvocation.MyCommand.Name) exiting" + } + + return $sdnHealthTest +} + +function Test-SdnDiagnosticsCleanupTaskEnabled { + <# + .SYNOPSIS + Ensures the scheduled task responsible for etl compression is enabled and running + #> + + [CmdletBinding()] + param ( + [Parameter(Mandatory = $true)] + [ValidateSet('FcDiagnostics', 'SDN Diagnostics Task')] + [String]$TaskName + ) + + $sdnHealthTest = New-SdnHealthTest + + try { + # check to see if logging is enabled on the registry key + $isLoggingEnabled = Get-ItemPropertyValue -Path 
"HKLM:\Software\Microsoft\NetworkController\Sdn\Diagnostics\Parameters" -Name 'IsLoggingEnabled' -ErrorAction Ignore + + # in this scenario, logging is currently disabled so scheduled task will not be available + if ($isLoggingEnabled ) { + try { + $result = Get-ScheduledTask -TaskName $TaskName -ErrorAction Stop + if ($result.State -ieq 'Disabled') { + $sdnHealthTest.Result = 'FAIL' + $sdnHealthTest.Remediation += "Use 'Repair-SdnDiagnosticsScheduledTask -TaskName $TaskName'." + } + } + catch { + $_ | Trace-Exception + $sdnHealthTest.Result = 'FAIL' + } + } + } + catch { + $_ | Trace-Exception + $sdnHealthTest.Result = 'FAIL' + } + + return $sdnHealthTest +} + +function Test-SdnNetworkControllerApiNameResolution { + <# + .SYNOPSIS + Validates that the Network Controller API is resolvable via DNS + #> + + [CmdletBinding()] + param ( + [Parameter(Mandatory = $true)] + [ValidateScript({ + if ($_.Scheme -ne "http" -and $_.Scheme -ne "https") { + throw New-Object System.FormatException("Parameter is expected to be in http:// or https:// format.") + } + return $true + })] + [Uri]$NcUri + ) + + $sdnHealthTest = New-SdnHealthTest + + try { + # check to see if the Uri is an IP address or a DNS name + # if it is a DNS name, we need to ensure that it is resolvable + # if it is an IP address, we can skip the DNS resolution check + $isIpAddress = [System.Net.IPAddress]::TryParse($NcUri.Host, [ref]$null) + if (-NOT $isIpAddress) { + $dnsResult = Resolve-DnsName -Name $NcUri.Host -ErrorAction Ignore + if ($null -eq $dnsResult) { + $sdnHealthTest.Result = 'FAIL' + $sdnHealthTest.Remediation += "Ensure that the DNS server(s) are reachable and DNS record exists." 
+ } + } + } + catch { + $_ | Trace-Exception + $sdnHealthTest.Result = 'FAIL' + } + + return $sdnHealthTest +} + + +################################### +#### SERVER HEALTH VALIDATIONS #### +################################### + +function Test-SdnEncapOverhead { + <# + .SYNOPSIS + Validate EncapOverhead configuration on the network adapter + #> + + [CmdletBinding()] + param () + + Confirm-IsServer + Write-Verbose "$($PSCmdlet.MyInvocation.MyCommand.Name) invoked" + + [int]$encapOverheadExpectedValue = 160 + [int]$jumboPacketExpectedValue = 1674 # this is default 1514 MTU + 160 encap overhead + $sdnHealthTest = New-SdnHealthTest + [bool] $misconfigurationFound = $false + [string[]] $misconfiguredNics = @() + + try { + $encapOverheadResults = Get-SdnNetAdapterEncapOverheadConfig + if ($null -eq $encapOverheadResults) { + # skip generation of fault if we cannot determine status confidently + $sdnHealthTest.Result = 'FAIL' + } + else { + $encapOverheadResults | ForEach-Object { + # if encapoverhead is not enabled, this is most commonly due to network adapter firmware or driver + # recommendations are to update the firmware and driver to the latest version and make sure not using default inbox drivers + if ($_.EncapOverheadEnabled -eq $false) { + + # in this scenario, encapoverhead is disabled and we have the expected jumbo packet value + # packets will be allowed to traverse the network without being dropped after adding VXLAN/GRE headers + if ($_.JumboPacketValue -ge $jumboPacketExpectedValue) { + # will not do anything as configuring the jumbo packet is viable workaround if encapoverhead is not supported on the network adapter + # this is a PASS scenario + } + + # in this scenario, encapoverhead is disabled and we do not have the expected jumbo packet value + # this will result in a failure on the test as it will result in packets being dropped if we exceed default MTU + if ($_.JumboPacketValue -lt $jumboPacketExpectedValue) { + $sdnHealthTest.Result = 'FAIL' + 
$sdnHealthTest.Remediation += "[$($_.NetAdapterInterfaceDescription)] Ensure the latest firmware and drivers are installed to support EncapOverhead. Configure JumboPacket to $jumboPacketExpectedValue if EncapOverhead is not supported." + $misconfigurationFound = $true + $misconfiguredNics += $_.NetAdapterInterfaceDescription + } + } + + # in this case, the encapoverhead is enabled but the value is less than the expected value + if ($_.EncapOverheadEnabled -and $_.EncapOverheadValue -lt $encapOverheadExpectedValue) { + # do nothing here at this time as may be expected if no workloads deployed to host + # todo: add extended checks once vnet support is available, check against ovsdb + } + + $FAULTNAME = "InvalidEncapOverheadConfiguration" + ########################################################################################## + ## EncapOverhead Fault Template + ########################################################################################## + # $KeyFaultingObjectDescription (SDN ID) : [HostName] + # $KeyFaultingObjectID (ARC ID) : [NetworkAdapterIfDesc] + # $KeyFaultingObjectType (CODE) : InvalidEncapOverheadConfiguration + # $FaultingObjectLocation (SOURCE) : [HostName] + # $FaultDescription (MESSAGE) : EncapOverhead is not enabled or configured correctly for on host . + # $FaultActionRemediation (ACTION) : JumboPacket should be enabled & EncapOverhead must be configured to support SDN. Please check NetworkATC configuration for configuring optimal networking configuration. 
+ # *EncapOverhead Faults will be reported from each node + ########################################################################################## + + $sdnHealthFault = [SdnFaultInfo]::new() + $sdnHealthFault.KeyFaultingObjectDescription = $env:COMPUTERNAME + $sdnHealthFault.KeyFaultingObjectID = $_.NetAdapterInterfaceDescription + $sdnHealthFault.KeyFaultingObjectType = $FAULTNAME + $sdnHealthFault.FaultingObjectLocation = $env:COMPUTERNAME + $sdnHealthFault.FaultDescription = "EncapOverhead is not enabled or configured correctly for $($_.NetAdapterInterfaceDescription) on host $env:COMPUTERNAME." + $sdnHealthFault.FaultActionRemediation = "JumboPacket should be enabled & EncapOverhead must be configured to support SDN. Please check NetworkATC configuration for configuring optimal networking configuration." + + if ($misconfigurationFound -eq $true) { + CreateorUpdateFault -Fault $sdnHealthFault + $sdnHealthTest.HealthFault += ConvertFaultToPsObject -healthFault $sdnHealthFault -faultType "Create" + } + else { + Write-Verbose "No fault(s) on EncapOverhead, clearing any existing ones" + # clear all existing faults for host($FAULTNAME) + # todo: validate multiple hosts reporting the same fault + DeleteFaultBy -KeyFaultingObjectDescription $env:COMPUTERNAME -KeyFaultingObjectType $FAULTNAME + $sdnHealthTest.HealthFault += ConvertFaultToPsObject -healthFault $sdnHealthFault -faultType "Delete" + } + } + } + } + catch { + $_ | Trace-Exception + $sdnHealthTest.Result = 'FAIL' + } + finally { + Write-Verbose "$($PSCmdlet.MyInvocation.MyCommand.Name) exiting" + } + + return $sdnHealthTest +} + +function Test-ServerHostId { + <# + .SYNOPSIS + Queries the NCHostAgent HostID registry key value ensure the HostID matches known InstanceID + #> + + [CmdletBinding()] + param ( + [Parameter(Mandatory = $true)] + [string[]]$InstanceId + ) + + Confirm-IsServer + + $sdnHealthTest = New-SdnHealthTest + $regkeyPath = 'HKLM:\SYSTEM\CurrentControlSet\Services\NcHostAgent\Parameters' + 
+ try { + $regHostId = Get-ItemProperty -Path $regkeyPath -Name 'HostId' -ErrorAction Ignore + if ($null -ieq $regHostId) { + $sdnHealthTest.Result = 'FAIL' + } + else { + if ($regHostId.HostId -inotin $InstanceId) { + $sdnHealthTest.Result = 'FAIL' + $sdnHealthTest.Remediation += "Update the HostId registry under $regkeyPath to match the correct InstanceId from the NC Servers API." + $sdnHealthTest.Properties = [PSCustomObject]@{ + HostID = $regHostId + } + } + } + } + catch { + $_ | Trace-Exception + $sdnHealthTest.Result = 'FAIL' + } + + return $sdnHealthTest +} + +function Test-VfpDuplicateMacAddress { + [CmdletBinding()] + param () + + Confirm-IsServer + $sdnHealthTest = New-SdnHealthTest + + try { + $vfpPorts = Get-SdnVfpVmSwitchPort + $duplicateObjects = $vfpPorts | Where-Object { $_.MACaddress -ne '00-00-00-00-00-00' -and $null -ne $_.MacAddress } | Group-Object -Property MacAddress | Where-Object { $_.Count -ge 2 } + if ($duplicateObjects) { + $sdnHealthTest.Result = 'FAIL' + + $duplicateObjects | ForEach-Object { + $sdnHealthTest.Remediation += "[$($_.Name)] Resolve the duplicate MAC address issue with VFP." + } + } + + $sdnHealthTest.Properties = [PSCustomObject]@{ + DuplicateVfpPorts = $duplicateObjects.Group + VfpPorts = $vfpPorts + } + } + catch { + $_ | Trace-Exception + $sdnHealthTest.Result = 'FAIL' + } + + return $sdnHealthTest +} + +function Test-VMNetAdapterDuplicateMacAddress { + [CmdletBinding()] + param () + + Confirm-IsServer + $sdnHealthTest = New-SdnHealthTest + + try { + $vmNetAdapters = Get-SdnVMNetworkAdapter + $duplicateObjects = $vmNetAdapters | Group-Object -Property MacAddress | Where-Object { $_.Count -ge 2 } + if ($duplicateObjects) { + $sdnHealthTest.Result = 'FAIL' + + $duplicateObjects | ForEach-Object { + $sdnHealthTest.Remediation += "[$($_.Name)] Resolve the duplicate MAC address issue with VMNetworkAdapters." 
+ } + } + + $sdnHealthTest.Properties = [PSCustomObject]@{ + DuplicateVMNetworkAdapters = $duplicateObjects.Group + VMNetworkAdapters = $vmNetAdapters + } + } + catch { + $_ | Trace-Exception + $sdnHealthTest.Result = 'FAIL' + } + + return $sdnHealthTest +} + +function Test-SdnProviderNetwork { + + <# + .SYNOPSIS + Validate the health of the provider network by pinging the provider addresses. + .DESCRIPTION + Tests each unique provider address returned by Get-SdnOvsdbAddressMapping using Test-SdnProviderAddressConnectivity. + The test result is set to FAIL when any destination reports a status other than Success. + #> + + [CmdletBinding()] + param () + + Confirm-IsServer + $sdnHealthTest = New-SdnHealthTest + + # initialized once, outside the loop, so a later successful destination cannot mask an earlier failure + $failureDetected = $false + + try { + $addressMapping = Get-SdnOvsdbAddressMapping + if (-NOT ($null -eq $addressMapping -or $addressMapping.Count -eq 0)) { + $providerAddreses = $addressMapping.ProviderAddress | Sort-Object -Unique + $connectivityResults = Test-SdnProviderAddressConnectivity -ProviderAddress $providerAddreses + + foreach ($destination in $connectivityResults) { + $sourceIPAddress = $destination.SourceAddress[0] + $destinationIPAddress = $destination.DestinationAddress[0] + $jumboPacketResult = $destination | Where-Object { $_.BufferSize -gt 1472 } + $standardPacketResult = $destination | Where-Object { $_.BufferSize -le 1472 } + + if ($destination.Status -ine 'Success') { + $remediationMsg = $null + $failureDetected = $true + + # if both jumbo and standard icmp tests fails, indicates a failure in the physical network + if ($jumboPacketResult.Status -ieq 'Failure' -and $standardPacketResult.Status -ieq 'Failure') { + $remediationMsg = "Unable to ping Provider Addresses. Ensure ICMP enabled on $sourceIPAddress and $destinationIPAddress. If issue persists, investigate physical network." 
+ $sdnHealthTest.Remediation += $remediationMsg + } + + # if standard MTU was success but jumbo MTU was failure, indication that jumbo packets or encap overhead has not been setup and configured + # either on the physical nic or within the physical switches between the provider addresses + if ($jumboPacketResult.Status -ieq 'Failure' -and $standardPacketResult.Status -ieq 'Success') { + $remediationMsg = "Ensure the physical network between $sourceIPAddress and $destinationIPAddress are configured to support VXLAN or NVGRE encapsulated packets with minimum MTU of 1660." + $sdnHealthTest.Remediation += $remediationMsg + } + } + } + } + + if ($failureDetected) { + $sdnHealthTest.Result = 'FAIL' + } + if ($connectivityResults) { + $sdnHealthTest.Properties = [PSCustomObject]@{ + PingResult = $connectivityResults + } + } + } + catch { + $_ | Trace-Exception + $sdnHealthTest.Result = 'FAIL' + } + + return $sdnHealthTest +} + +function Test-SdnHostAgentConnectionStateToApiService { + <# + .SYNOPSIS + Validate the health of the Network Controller Host Agent connection to the Network Controller API Service. + .DESCRIPTION + Looks for a TCP connection with remote port 6640 and fails the test when none is found or it is not in the Established state. + When a connection is found, the NCHostAgent service status is used to select the remediation guidance. + #> + + [CmdletBinding()] + param() + + Confirm-IsServer + $sdnHealthTest = New-SdnHealthTest + + try { + $tcpConnection = Get-NetTCPConnection -RemotePort 6640 -ErrorAction Ignore + if ($null -eq $tcpConnection -or $tcpConnection.State -ine 'Established') { + $sdnHealthTest.Result = 'FAIL' + } + + if ($tcpConnection) { + # NOTE(review): the check above reads the 'State' property; confirm that 'ConnectionState' exists on the objects + # returned by Get-NetTCPConnection, otherwise this comparison is always true + if ($tcpConnection.ConnectionState -ine 'Connected') { + $serviceState = Get-Service -Name NCHostAgent -ErrorAction Stop + if ($serviceState.Status -ine 'Running') { + # use 'WARNING' to match the result value used by the other health tests in this module + $sdnHealthTest.Result = 'WARNING' + $sdnHealthTest.Remediation += "Ensure the NCHostAgent service is running." + } + else { + $sdnHealthTest.Result = 'FAIL' + $sdnHealthTest.Remediation += "Ensure that Network Controller ApiService is healthy and operational. Investigate and fix TCP / TLS connectivity issues." 
+ } + } + } + } + catch { + $_ | Trace-Exception + $sdnHealthTest.Result = 'FAIL' + } + + return $sdnHealthTest +} + +################################### +###### NC HEALTH VALIDATIONS ###### +################################### + +function Test-SdnServiceFabricApplicationHealth { + <# + .SYNOPSIS + Validate the health of the Network Controller application within Service Fabric. + #> + + [CmdletBinding()] + param () + + $sdnHealthTest = New-SdnHealthTest + + try { + $applicationHealth = Get-SdnServiceFabricApplicationHealth -ErrorAction Stop + if ($applicationHealth.AggregatedHealthState -ine 'Ok') { + $sdnHealthTest.Result = 'FAIL' + $sdnHealthTest.Remediation += "Examine the Service Fabric Application Health for Network Controller to determine why the health is not OK." + } + } + catch { + $_ | Trace-Exception + $sdnHealthTest.Result = 'FAIL' + } + + return $sdnHealthTest +} + +function Test-SdnServiceFabricClusterHealth { + <# + .SYNOPSIS + Validate the health of the Network Controller cluster within Service Fabric. + #> + + [CmdletBinding()] + param () + + $sdnHealthTest = New-SdnHealthTest + + try { + $clusterHealth = Get-SdnServiceFabricClusterHealth -ErrorAction Stop + if ($clusterHealth.AggregatedHealthState -ine 'Ok') { + $sdnHealthTest.Result = 'FAIL' + $sdnHealthTest.Remediation += "Examine the Service Fabric Cluster Health for Network Controller to determine why the health is not OK." + } + } + catch { + $_ | Trace-Exception + $sdnHealthTest.Result = 'FAIL' + } + + return $sdnHealthTest +} + +function Test-SdnServiceFabricNodeStatus { + <# + .SYNOPSIS + Validate the health of the Network Controller nodes within Service Fabric. 
+ #> + + [CmdletBinding()] + param () + + $sdnHealthTest = New-SdnHealthTest + + try { + $ncNodes = Get-SdnServiceFabricNode -NodeName $env:COMPUTERNAME -ErrorAction Stop + if ($null -eq $ncNodes) { + $sdnHealthTest.Result = 'FAIL' + } + else { + if ($ncNodes.NodeStatus -ine 'Up') { + $sdnHealthTest.Result = 'FAIL' + $sdnHealthTest.Remediation = 'Examine the Service Fabric Nodes for Network Controller to determine why the node is not Up.' + } + } + } + catch { + $_ | Trace-Exception + $sdnHealthTest.Result = 'FAIL' + } + + return $sdnHealthTest +} + +function Test-SdnResourceConfigurationState { + <# + .SYNOPSIS + Validate the configurationState of the resources. + .PARAMETER Resource + The resource to validate. Forwarded to Get-SdnResource together with the other bound parameters. + .PARAMETER NcUri + The Network Controller URI. Must use the http or https scheme. + .PARAMETER NcRestCredential + Credential forwarded to Get-SdnResource when supplied. + .PARAMETER NcRestCertificate + Certificate forwarded to Get-SdnResource when supplied. + #> + + [CmdletBinding(DefaultParameterSetName = 'RestCredential')] + param ( + [Parameter(Mandatory = $true)] + [string]$Resource, + + [Parameter(Mandatory = $true)] + [ValidateScript({ + if ($_.Scheme -ne "http" -and $_.Scheme -ne "https") { + throw New-Object System.FormatException("Parameter is expected to be in http:// or https:// format.") + } + return $true + })] + [Uri]$NcUri, + + [Parameter(Mandatory = $false, ParameterSetName = 'RestCredential')] + [System.Management.Automation.PSCredential] + [System.Management.Automation.Credential()] + $NcRestCredential = [System.Management.Automation.PSCredential]::Empty, + + [Parameter(Mandatory = $true, ParameterSetName = 'RestCertificate')] + [X509Certificate]$NcRestCertificate ) - $details = Get-HealthData -Property 'HealthValidations' -Id $Name + $sdnHealthTest = New-SdnHealthTest + $array = @() + + try { + "Validating configuration state of {0}" -f $Resource | Trace-Output + + $sdnResources = Get-SdnResource @PSBoundParameters + foreach ($object in $sdnResources) { + + # if we have a resource that is not in a success state, we will skip validation + # as we do not expect configurationState to be accurate if provisioningState is not Success + if ($object.properties.provisioningState -ine 'Succeeded') { + continue + } + 
+ # examine the configuration state of the resources and display errors to the screen + $errorMessages = @() + + # default the trace level for every resource so a branch that does not assign it (Uninitialized with Redundant state) + # does not reuse the value left over from the previous resource or leave it unset + $traceLevel = 'Verbose' + switch ($object.properties.configurationState.Status) { + 'Warning' { + # if we already have a failure, we will not change the result to warning + if ($sdnHealthTest.Result -ne 'FAIL') { + $sdnHealthTest.Result = 'WARNING' + } + + $traceLevel = 'Warning' + } + + 'Failure' { + $sdnHealthTest.Result = 'FAIL' + $traceLevel = 'Error' + } + + 'InProgress' { + # if we already have a failure, we will not change the result to warning + if ($sdnHealthTest.Result -ne 'FAIL') { + $sdnHealthTest.Result = 'WARNING' + } + + $traceLevel = 'Warning' + } + + 'Uninitialized' { + # in scenarios where state is redundant, we will not fail the test + if ($object.properties.state -ieq 'Redundant') { + # do nothing + } + else { + # if we already have a failure, we will not change the result to warning + if ($sdnHealthTest.Result -ne 'FAIL') { + $sdnHealthTest.Result = 'WARNING' + } + + $traceLevel = 'Warning' + } + } + + default { + $traceLevel = 'Verbose' + } + } + + if ($object.properties.configurationState.detailedInfo) { + foreach ($detail in $object.properties.configurationState.detailedInfo) { + switch ($detail.code) { + 'Success' { + # do nothing + } + + default { + $errorMessages += $detail.message + try { + $errorDetails = Get-HealthData -Property 'ConfigurationStateErrorCodes' -Id $detail.code + $sdnHealthTest.Remediation += "[{0}] {1}" -f $object.resourceRef, $errorDetails.Action + } + catch { + "Unable to locate remediation actions for {0}" -f $detail.code | Trace-Output -Level:Warning + $remediationString = "[{0}] Examine the configurationState property to determine why configuration failed." 
-f $object.resourceRef + $sdnHealthTest.Remediation += $remediationString + } + } + } + } - $outputString = "[$Role] $Name" - $outputString += "`r`n`r`n" - $outputString += "--------------------------`r`n" - $outputString += "Description:`t$($details.Description)`r`n" - $outputString += "Impact:`t`t$($details.Impact)`r`n" + # print the overall configuration state to screen, with each of the messages that were captured + # as part of the detailedinfo property + if ($errorMessages) { + $msg = "{0} is reporting configurationState status {1}:`n`t- {2}" -f $object.resourceRef, $object.properties.configurationState.Status, ($errorMessages -join "`n`t- ") + } + else { + $msg = "{0} is reporting configurationState status {1}" -f $object.resourceRef, $object.properties.configurationState.Status + } - if (-NOT [string]::IsNullOrEmpty($Remediation)) { - $outputString += "Remediation:`r`n`t -`t$($Remediation -join "`r`n`t -`t")`r`n" - } + $msg | Trace-Output -Level $traceLevel.ToString() + } - if (-NOT [string]::IsNullOrEmpty($details.PublicDocUrl)) { - $outputString += "`r`n" - $outputString += "Additional information can be found at $($details.PublicDocUrl).`r`n" - } + $details = [PSCustomObject]@{ + resourceRef = $object.resourceRef + configurationState = $object.properties.configurationState + } - $outputString += "`r`n--------------------------`r`n" - $outputString += "`r`n" + $array += $details + } - $outputString | Write-Host -ForegroundColor Yellow + $sdnHealthTest.Properties = $array + } + catch { + $_ | Trace-Exception + $sdnHealthTest.Result = 'FAIL' + } + + return $sdnHealthTest } -function Debug-SdnFabricInfrastructure { +function Test-SdnResourceProvisioningState { <# .SYNOPSIS - Executes a series of fabric validation tests to validate the state and health of the underlying components within the SDN fabric. - .PARAMETER NetworkController - Specifies the name or IP address of the network controller node on which this cmdlet operates. 
The parameter is optional if running on network controller node. - .PARAMETER ComputerName - Type the NetBIOS name, an IP address, or a fully qualified domain name of one or more remote computers. - .PARAMETER Role - The specific SDN role(s) to perform tests and validations for. If ommitted, defaults to all roles. - .PARAMETER Credential - Specifies a user account that has permission to perform this action. The default is the current user. - .PARAMETER NcRestCertificate - Specifies the client certificate that is used for a secure web request to Network Controller REST API. - Enter a variable that contains a certificate or a command or expression that gets the certificate. - .PARAMETER NcRestCredential - Specifies a user account that has permission to perform this action against the Network Controller REST API. The default is the current user. - .EXAMPLE - PS> Debug-SdnFabricInfrastructure - .EXAMPLE - PS> Debug-SdnFabricInfrastructure -NetworkController 'NC01' -Credential (Get-Credential) -NcRestCredential (Get-Credential) + Validate that the provisioningState of the resources. 
#> - [CmdletBinding(DefaultParameterSetName = 'Role')] + [CmdletBinding(DefaultParameterSetName = 'RestCredential')] param ( - [Parameter(Mandatory = $false, ParameterSetName = 'Role')] - [Parameter(Mandatory = $false, ParameterSetName = 'ComputerName')] - [System.String]$NetworkController = $env:COMPUTERNAME, - - [Parameter(Mandatory = $false, ParameterSetName = 'Role')] - [ValidateSet('Gateway', 'NetworkController', 'Server', 'LoadBalancerMux')] - [String[]]$Role = ('Gateway','LoadBalancerMux','NetworkController','Server'), - - [Parameter(Mandatory = $true, ParameterSetName = 'ComputerName')] - [System.String[]]$ComputerName, + [Parameter(Mandatory = $true)] + [string]$Resource, - [Parameter(Mandatory = $false, ParameterSetName = 'Role')] - [Parameter(Mandatory = $false, ParameterSetName = 'ComputerName')] - [System.Management.Automation.PSCredential] - [System.Management.Automation.Credential()] - $Credential = [System.Management.Automation.PSCredential]::Empty, + [Parameter(Mandatory = $true)] + [ValidateScript({ + if ($_.Scheme -ne "http" -and $_.Scheme -ne "https") { + throw New-Object System.FormatException("Parameter is expected to be in http:// or https:// format.") + } + return $true + })] + [Uri]$NcUri, - [Parameter(Mandatory = $false, ParameterSetName = 'Role')] - [Parameter(Mandatory = $false, ParameterSetName = 'ComputerName')] + [Parameter(Mandatory = $false, ParameterSetName = 'RestCredential')] [System.Management.Automation.PSCredential] [System.Management.Automation.Credential()] $NcRestCredential = [System.Management.Automation.PSCredential]::Empty, - [Parameter(Mandatory = $false, ParameterSetName = 'Role')] - [Parameter(Mandatory = $false, ParameterSetName = 'ComputerName')] + [Parameter(Mandatory = $true, ParameterSetName = 'RestCertificate')] [X509Certificate]$NcRestCertificate ) - if ($Global:SdnDiagnostics.EnvironmentInfo.ClusterConfigType -ine 'ServiceFabric') { - throw New-Object System.NotSupportedException("This function is only 
supported on Service Fabric clusters.") - } - - $script:SdnDiagnostics_Health.Cache = $null - $aggregateHealthReport = @() - if (Test-ComputerNameIsLocal -ComputerName $NetworkController) { - Confirm-IsNetworkController - } - - if ($PSBoundParameters.ContainsKey('NcRestCertificate')) { - $restCredParam = @{ NcRestCertificate = $NcRestCertificate } - } - else { - $restCredParam = @{ NcRestCredential = $NcRestCredential } - } - - $environmentInfo = Get-SdnInfrastructureInfo -NetworkController $NetworkController -Credential $Credential @restCredParam - if($null -eq $environmentInfo){ - throw New-Object System.NullReferenceException("Unable to retrieve environment details") - } + $sdnHealthTest = New-SdnHealthTest + $array = @() try { - # if we opted to specify the ComputerName rather than Role, we need to determine which role - # the computer names are associated with - if ($PSCmdlet.ParameterSetName -ieq 'ComputerName') { - $Role = @() - $ComputerName | ForEach-Object { - $computerRole = $_ | Get-SdnRole -EnvironmentInfo $environmentInfo - if ($computerRole) { - $Role += $computerRole - } - } - } + "Validating provisioning state of {0}" -f $Resource | Trace-Output - $Role = $Role | Sort-Object -Unique - foreach ($object in $Role) { - "Processing tests for {0} role" -f $object.ToString() | Trace-Output -Level:Verbose - $config = Get-SdnModuleConfiguration -Role $object.ToString() - - $roleHealthReport = [SdnFabricHealthReport]@{ - Role = $object.ToString() - } + $sdnResources = Get-SdnResource @PSBoundParameters + foreach ($object in $sdnResources) { + # examine the provisioning state of the resources and display errors to the screen + $msg = "{0} is reporting provisioning state: {1}" -f $object.resourceRef, $object.properties.provisioningState - $sdnFabricDetails = [SdnFabricEnvObject]@{ - NcUrl = $environmentInfo.NcUrl - Role = $config - EnvironmentInfo = $environmentInfo - } + switch ($object.properties.provisioningState) { + 'Failed' { + $sdnHealthTest.Result = 
'FAIL' + $msg | Trace-Output -Level:Error - # check to see if we were provided a specific computer(s) to test against - # otherwise we will want to pick up the node name(s) from the environment info - if ($ComputerName) { - $sdnFabricDetails.ComputerName = $ComputerName - } - else { - # in scenarios where there are not mux(es) or gateway(s) then we need to gracefully handle this - # and move to the next role for processing - if ($null -ieq $environmentInfo[$object.ToString()]) { - "Unable to locate fabric nodes for {0}. Skipping health tests." -f $object.ToString() | Trace-Output -Level:Warning - continue + $sdnHealthTest.Remediation += "[$($object.resourceRef)] Examine the Network Controller logs to determine why provisioning is $($object.properties.provisioningState)." } - $sdnFabricDetails.ComputerName = $environmentInfo[$object.ToString()] - } + 'Updating' { + # if we already have a failure, we will not change the result to warning + if ($sdnHealthTest.Result -ne 'FAIL') { + $sdnHealthTest.Result = 'WARNING' + } - $restApiParams = @{ - SdnEnvironmentObject = $sdnFabricDetails - } - $restApiParams += $restCredParam + # since we do not know what operations happened prior to this, we will log a warning + # and ask the user to monitor the provisioningState + $msg | Trace-Output -Level:Warning + $sdnHealthTest.Remediation += "[$($object.resourceRef)] Is reporting $($object.properties.provisioningState). Monitor to ensure that provisioningState moves to Succeeded." 
+ } - $computerCredParams = @{ - SdnEnvironmentObject = $sdnFabricDetails - Credential = $Credential + default { + # this should cover scenario where provisioningState is 'Deleting' or Succeeded + $msg | Trace-Output -Level:Verbose + } } - $computerCredAndRestApiParams = @{ - SdnEnvironmentObject = $sdnFabricDetails - Credential = $Credential + $details = [PSCustomObject]@{ + resourceRef = $object.resourceRef + provisioningState = $object.properties.provisioningState } - $computerCredAndRestApiParams += $restCredParam - # before proceeding with tests, ensure that the computer objects we are testing against are running the latest version of SdnDiagnostics - Install-SdnDiagnostics -ComputerName $sdnFabricDetails.ComputerName -Credential $Credential + $array += $details + } - # perform the health validations for the appropriate roles that were specified directly - # or determined via which ComputerNames were defined - switch ($object) { - 'Gateway' { - $roleHealthReport.HealthValidation += @( - Test-ResourceProvisioningState @restApiParams - Test-ResourceConfigurationState @restApiParams - Test-ServiceState @computerCredParams - Test-ScheduledTaskEnabled @computerCredParams - ) - } + $sdnHealthTest.Properties = $array + return $sdnHealthTest + } + catch { + $_ | Trace-Exception + $sdnHealthTest.Result = 'FAIL' + } - 'LoadBalancerMux' { - $roleHealthReport.HealthValidation += @( - Test-ResourceProvisioningState @restApiParams - Test-ResourceConfigurationState @restApiParams - Test-ServiceState @computerCredParams - Test-ScheduledTaskEnabled @computerCredParams - Test-MuxBgpConnectionState @computerCredAndRestApiParams - Test-SlbManagerConnectionToMux @computerCredAndRestApiParams - ) - } + return $sdnHealthTest +} - 'NetworkController' { - $roleHealthReport.HealthValidation += @( - Test-NcUrlNameResolution @computerCredAndRestApiParams - Test-ServiceState @computerCredParams - Test-ServiceFabricPartitionDatabaseSize @computerCredParams - Test-ServiceFabricClusterHealth 
@computerCredParams - Test-ServiceFabricApplicationHealth @computerCredParams - Test-ServiceFabricNodeStatus @computerCredParams - Test-NetworkInterfaceAPIDuplicateMacAddress @restApiParams - Test-ScheduledTaskEnabled @computerCredParams - Test-NetworkControllerCertCredential @computerCredAndRestApiParams - ) - } +function Test-SdnConfigurationState { - 'Server' { - $roleHealthReport.HealthValidation += @( - Test-ResourceProvisioningState @restApiParams - Test-ResourceConfigurationState @restApiParams - Test-EncapOverhead @computerCredParams - Test-ProviderNetwork @computerCredParams - Test-ServiceState @computerCredParams - Test-ServerHostId @computerCredAndRestApiParams - Test-VfpDuplicatePort @computerCredParams - Test-VMNetAdapterDuplicateMacAddress @computerCredParams - Test-HostRootStoreNonRootCert @computerCredParams - Test-ScheduledTaskEnabled @computerCredParams - Test-NcHostAgentConnectionToApiService @computerCredAndRestApiParams - ) - } - } + [CmdletBinding()] + param ( + [Parameter(Mandatory = $false, ParameterSetName = 'RestCredential')] + [System.Management.Automation.PSCredential] + [System.Management.Automation.Credential()] + $NcRestCredential = [System.Management.Automation.PSCredential]::Empty, - # enumerate all the tests performed so we can determine if any completed with Warning or FAIL - # if any of the tests completed with Warning, we will set the aggregate result to Warning - # if any of the tests completed with FAIL, we will set the aggregate result to FAIL and then break out of the foreach loop - # we will skip tests with PASS, as that is the default value - foreach ($healthStatus in $roleHealthReport.HealthValidation) { - if ($healthStatus.Result -eq 'Warning') { - $roleHealthReport.Result = $healthStatus.Result - } - elseif ($healthStatus.Result -eq 'FAIL') { - $roleHealthReport.Result = $healthStatus.Result - break - } - } + [Parameter(Mandatory = $true, ParameterSetName = 'RestCertificate')] + [X509Certificate]$NcRestCertificate + ) - 
# add the individual role health report to the aggregate report - $aggregateHealthReport += $roleHealthReport - } + Write-Verbose "$($PSCmdlet.MyInvocation.MyCommand.Name) invoked" + try { + $isCurrentNodeClusterOwner = IsCurrentNodeClusterOwner + if ($false -eq $isCurrentNodeClusterOwner) { + Write-Verbose "This node is not the cluster owner. Skipping health tests." + return + } + + # servers + $items = Get-ItemProperty -Path HKLM:\SYSTEM\CurrentControlSet\Services\NcHostAgent\Parameters\ + $NcUri = "https://$($items.PeerCertificateCName)" + + $configStateHealths = @() + + # generate faults for servers + $servers = GetSdnResourceFromNc -ResourceType 'Servers' -NcUri $NcUri + $faultSet = GetFaultFromConfigurationState -resources $servers + ShowFaultSet -faultset $faultSet + $serverHealthTest = UpdateFaultSet -successFaults $faultSet[0] -FailureFaults $faultSet[1] + $serverHealthTest.Name = "servers" + $configStateHealths += $serverHealthTest + + # generate faults for vnics + $vnics = GetSdnResourceFromNc -Resource 'NetworkInterfaces' -NcUri $NcUri + $faultSet = GetFaultFromConfigurationState -resources $vnics + ShowFaultSet -faultset $faultSet + $vnicHealthTest = UpdateFaultSet -successFaults $faultSet[0] -FailureFaults $faultSet[1] + $vnicHealthTest.Name = "networkinterfaces" + $configStateHealths += $vnicHealthTest + + # generate faults for lnets + $vnics = GetSdnResourceFromNc -Resource 'LogicalNetworks' -NcUri $NcUri + $faultSet = GetFaultFromConfigurationState -resources $vnics + ShowFaultSet -faultset $faultSet + $vnicHealthTest = UpdateFaultSet -successFaults $faultSet[0] -FailureFaults $faultSet[1] + $vnicHealthTest.Name = "logicalnetworks" + $configStateHealths += $vnicHealthTest } catch { - $_ | Trace-Exception $_ | Write-Error } finally { - if ($aggregateHealthReport) { + Write-Verbose "$($PSCmdlet.MyInvocation.MyCommand.Name) exiting" + } +} - # enumerate all the roles that were tested so we can determine if any completed with Warning or FAIL - 
$aggregateHealthReport | ForEach-Object { - if ($_.Result -ine 'PASS') { - $role = $_.Role +################################### +##### MUX HEALTH VALIDATIONS ###### +################################### - # enumerate all the individual role tests performed so we can determine if any completed that are not PASS - $_.HealthValidation | ForEach-Object { - if ($_.Result -ine 'PASS') { - # add the remediation steps to an array list so we can pass it to the Write-HealthValidationInfo function - # otherwise if we pass it directly, it will be treated as a single string - $remediationList = [System.Collections.ArrayList]::new() - $_.Remediation | ForEach-Object { [void]$remediationList.Add($_)} - - Write-HealthValidationInfo -Role $([string]$role) -Name $_.Name -Remediation $remediationList - } - } - } - } +function Test-SdnMuxConnectionStateToRouter { + <# + SYNOPSIS + Validates the TCP connectivity for BGP endpoint to the routers. + #> - # save the aggregate health report to cache so we can use it for further analysis - $script:SdnDiagnostics_Health.Cache = $aggregateHealthReport + [CmdletBinding()] + param( + [Parameter(Mandatory = $true)] + [string[]]$RouterIPAddress + ) + + Confirm-IsLoadBalancerMux + $sdnHealthTest = New-SdnHealthTest + + try { + foreach ($router in $RouterIPAddress) { + $tcpConnection = Get-NetTCPConnection -RemotePort 179 -RemoteAddress $router -ErrorAction Ignore + if ($null -eq $tcpConnection -or $tcpConnection.State -ine 'Established') { + $sdnHealthTest.Result = 'FAIL' + $sdnHealthTest.Remediation += "Examine the TCP connectivity for router $router to determine why TCP connection is not established." + } } } - - if ($script:SdnDiagnostics_Health.Cache) { - "Results for fabric health have been saved to cache for further analysis. Use 'Get-SdnFabricInfrastructureResult' to examine the results." 
| Trace-Output - return $script:SdnDiagnostics_Health.Cache + catch { + $_ | Trace-Exception + $sdnHealthTest.Result = 'FAIL' } + + return $sdnHealthTest } -function Get-SdnFabricInfrastructureResult { +function Test-SdnMuxConnectionStateToSlbManager { <# - .SYNOPSIS - Returns the results that have been saved to cache as part of running Debug-SdnFabricInfrastructure. - .PARAMETER Role - The name of the SDN role that you want to return test results from within the cache. - .PARAMETER Name - The name of the test results you want to examine. - .EXAMPLE - PS> Get-SdnFabricInfrastructureResult - .EXAMPLE - PS> Get-SdnFabricInfrastructureResult -Role Server - .EXAMPLE - PS> Get-SdnFabricInfrastructureResult -Role Server -Name 'Test-ServiceState' + SYNOPSIS + Validates the TCP / TLS connectivity to the SlbManager service. #> [CmdletBinding()] - param ( - [Parameter(Mandatory = $false)] - [String]$Role, + param() - [Parameter(Mandatory = $false)] - [System.String]$Name - ) - - $cacheResults = $script:SdnDiagnostics_Health.Cache + Confirm-IsLoadBalancerMux + $sdnHealthTest = New-SdnHealthTest - if ($PSBoundParameters.ContainsKey('Role')) { - if ($cacheResults) { - $cacheResults = $cacheResults | Where-Object {$_.Role -eq $Role} + try { + $tcpConnection = Get-NetTCPConnection -LocalPort 8560 -ErrorAction Ignore + if ($null -eq $tcpConnection -or $tcpConnection.State -ine 'Established') { + $sdnHealthTest.Result = 'FAIL' + $sdnHealthTest.Remediation += "Move SlbManager service primary role to another node. Examine the TCP / TLS connectivity for the SlbManager service." 
} } - - if ($PSBoundParameters.ContainsKey('Name')) { - if ($cacheResults) { - $cacheResults = $cacheResults.HealthValidation | Where-Object {$_.Name -eq $Name} - } + catch { + $_ | Trace-Exception + $sdnHealthTest.Result = 'FAIL' } - return $cacheResults + return $sdnHealthTest } - diff --git a/src/modules/SdnDiag.LoadBalancerMux.psm1 b/src/modules/SdnDiag.LoadBalancerMux.psm1 index 51e87411..229b8f96 100644 --- a/src/modules/SdnDiag.LoadBalancerMux.psm1 +++ b/src/modules/SdnDiag.LoadBalancerMux.psm1 @@ -13,6 +13,48 @@ New-Variable -Name 'SdnDiagnostics_SLB' -Scope 'Script' -Force -Value @{ #### CLASSES & ENUMS ##### ########################## +class MuxConfig { + [ipaddress]$SourceIP4Address + [ipaddress]$SourceIP6Address + [int]$MaxFlowEntries + [int]$FlowIdleTimeout + [int]$HalfFlowIdleTimeout + [int]$FlowEntriesWatermark + [int]$CoeffecientForMovingAverage + [int]$BandwidthCalculationTimeInterval + [int]$AggregateBandwidthWatermark + [int]$ElephantBandwidthThreshold + [int]$AggregateBandwidthLimitForElephant + [int]$MaxDropProbability + [int]$MaxBandwidthUtilizationForDrop + [int]$InitialHashTableSize + [int]$MaxHashTableUsagePct + [int]$MinHashTableUsagePct + [int]$HashTableResizeFactor + [int]$HashPrimeNumber + [int]$FlowSamplingIntervalInSec + [string]$MUXFlags +} + +class MuxStatistics { + [string]$Type + [int]$TotalPackets + [int]$SynPackets + [int]$PacketsPerSecond + [int]$DroppedPackets + [int]$TotalBytes + [int]$FlowEntries + [int]$DroppedFlowEntries + [int]$TotalNumberOfHashTableBuckets + $FlowEntriesLimitUtilization + $FlowEntriesWatermarkUtilization + $AverageBandwidth + $BandwidthLimitUtilization + $BandwidthWatermarkUtilization + [int]$ElephantCount + [int]$FlowCacheMisses +} + ########################## #### ARG COMPLETERS ###### ########################## @@ -158,6 +200,67 @@ function Get-SdnMuxState { } } +function Get-SdnMuxConfig { + $muxConfig = [MuxConfig]::new() + $results = muxdrivercontrolconsole /GetMuxConfig + foreach ($i in $results) 
{ + if ([string]::IsNullOrEmpty($i)) { continue } + if ($i.contains(":")){ + $property = $i.Split(":")[0].Trim().Replace(" ", "") + $value = $i.Split(":")[1].Trim() + + if ($property -iin $muxConfig.PSObject.Properties.Name) { + $muxConfig.$property = $value + } + } + } + + return $muxConfig +} + +function Get-SdnMuxStats { + $array = @() + $results = muxdrivercontrolconsole /GetMuxStats + + foreach ($i in $results) { + if ([string]::IsNullOrEmpty($i)) { continue } + + if ($i.contains(":")){ + $property = $i.Split(":")[0].Trim().Replace(" ", "") + $value = $i.Split(":")[1].Trim() + + if ($property -ilike "MuxStatisticsfor*") { + if ($muxStatistics) { + $array += $muxStatistics + } + + switch ($property) { + "MuxStatisticsforIPv4Traffic" { + $muxStatistics = [MuxStatistics]@{ + Type = "IPv4" + } + } + "MuxStatisticsforIPv6Traffic" { + $muxStatistics = [MuxStatistics]@{ + Type = "IPv6" + } + } + } + } + + if ($property -iin $muxStatistics.PSObject.Properties.Name) { + $muxStatistics.$property = $value + } + } + } + + if ($muxStatistics) { + $array += $muxStatistics + } + + return $array +} + function Get-SdnMuxStatefulVip { <# .SYNOPSIS diff --git a/src/modules/SdnDiag.NetworkController.FC.Config.psd1 b/src/modules/SdnDiag.NetworkController.FC.Config.psd1 index fce04ef8..350569ed 100644 --- a/src/modules/SdnDiag.NetworkController.FC.Config.psd1 +++ b/src/modules/SdnDiag.NetworkController.FC.Config.psd1 @@ -23,46 +23,55 @@ SDNApiService = @{ Properties = @{ DisplayName = "SDNApiService" + OwnerGroupName = "ApiService" } } SDNControllerService = @{ Properties = @{ DisplayName = "SDNControllerService" + OwnerGroupName = "ControllerService" } } SDNFirewallService = @{ Properties = @{ DisplayName = "SDNFirewallService" + OwnerGroupName = "FirewallService" } } SDNFnmService = @{ Properties = @{ DisplayName = "SDNFnmService" + OwnerGroupName = "FnmService" } } SDNGatewayManager = @{ Properties = @{ DisplayName = "SDNGatewayManager" + OwnerGroupName = "GatewayManager" } } 
SDNHelperService = @{ Properties = @{ DisplayName = "SDNHelperService" + OwnerGroupName = "" } } SDNServiceInsertion = @{ Properties = @{ DisplayName = "SDNServiceInsertion" + OwnerGroupName = "ServiceInsertion" } } SDNSlbManagerService = @{ Properties = @{ DisplayName = "SDNSlbManagerService" + OwnerGroupName = "SlbManagerService" } } SDNVSwitchService = @{ Properties = @{ DisplayName = "SDNVSwitchService" + OwnerGroupName = "VSwitchService" } } } diff --git a/src/modules/SdnDiag.NetworkController.FC.psm1 b/src/modules/SdnDiag.NetworkController.FC.psm1 index 129a05fb..8f5f8048 100644 --- a/src/modules/SdnDiag.NetworkController.FC.psm1 +++ b/src/modules/SdnDiag.NetworkController.FC.psm1 @@ -122,10 +122,10 @@ function Get-SdnNetworkControllerFC { try { if (Test-ComputerNameIsLocal -ComputerName $NetworkController) { Confirm-IsNetworkController - $result = Get-NetworkControllerOnFailoverCluster 4>$null + $result = Get-NetworkControllerOnFailoverCluster } else { - $result = Invoke-PSRemoteCommand -ComputerName $NetworkController -ScriptBlock { Get-NetworkControllerOnFailoverCluster 4>$null } -Credential $Credential + $result = Invoke-PSRemoteCommand -ComputerName $NetworkController -ScriptBlock { Get-NetworkControllerOnFailoverCluster } -Credential $Credential } return $result diff --git a/src/modules/SdnDiag.NetworkController.SF.psm1 b/src/modules/SdnDiag.NetworkController.SF.psm1 index 613e5cea..9fb21f5c 100644 --- a/src/modules/SdnDiag.NetworkController.SF.psm1 +++ b/src/modules/SdnDiag.NetworkController.SF.psm1 @@ -816,17 +816,18 @@ function Invoke-CertRotateCommand { Start-Sleep -Seconds 300 } - "Invoking {0} to configure thumbprint {1}" -f $Command, $cert.Thumbprint | Trace-Output "Command:{0} Params: {1}" -f $Command, ($params | ConvertTo-Json) | Trace-Output -Level:Verbose - switch ($Command) { 'Set-NetworkController' { + "Invoking {0} to configure thumbprint {1}" -f $Command, $cert.Thumbprint | Trace-Output Set-NetworkController @params } 
'Set-NetworkControllerCluster' { + "Invoking {0} to configure thumbprint {1}" -f $Command, $cert.Thumbprint | Trace-Output Set-NetworkControllerCluster @params } 'Set-NetworkControllerNode' { + "Invoking {0} to configure thumbprint {1} for {2}" -f $Command, $cert.Thumbprint, $params.Name | Trace-Output Set-NetworkControllerNode @params } } @@ -2304,7 +2305,7 @@ function Get-SdnServiceFabricService { $sb = { param([string]$param1, [string]$param2) if (( Get-Service -Name 'FabricHostSvc').Status -ine 'Running' ) { - throw "Service Fabric Service is currently not running." + throw New-Object System.Exception("Service Fabric Service is currently not running.") } # The 3>$null 4>$null sends unwanted verbose and debug streams into the bit bucket diff --git a/src/modules/SdnDiag.Server.psm1 b/src/modules/SdnDiag.Server.psm1 index 42da7826..4465c586 100644 --- a/src/modules/SdnDiag.Server.psm1 +++ b/src/modules/SdnDiag.Server.psm1 @@ -718,7 +718,7 @@ function Get-ServerConfigState { $hnvDiag | ForEach-Object { try { $cmd = $_ - Invoke-Expression -Command $cmd 4> $null | Export-ObjectToFile -FilePath $OutputDirectory.FullName -Name $cmd -FileType txt -Format Table + Invoke-Expression -Command $cmd | Export-ObjectToFile -FilePath $OutputDirectory.FullName -Name $cmd -FileType txt -Format Table } catch { "Failed to execute {0}" -f $cmd | Trace-Output -Level:Error @@ -1483,32 +1483,26 @@ function Get-SdnNetAdapterEncapOverheadConfig { foreach ($physicalNicIfDesc in $switch.NetAdapterInterfaceDescriptions) { # get the encap overhead settings for each of the network interfaces within the vm switch team - $encapOverhead = Get-NetAdapterAdvancedProperty -InterfaceDescription $physicalNicIfDesc -RegistryKeyword "*Encapoverhead" -ErrorAction SilentlyContinue - if ($null -eq $encapoverhead) { - "Network interface {0} does not support EncapOverhead." 
-f $physicalNicIfDesc | Trace-Output -Level:Warning - } - else { + $encapOverhead = Get-NetAdapterAdvancedProperty -InterfaceDescription $physicalNicIfDesc -RegistryKeyword "*Encapoverhead" -ErrorAction Ignore + if ($encapoverhead) { $supportsEncapOverhead = $true [int]$encapOverheadValue = $encapoverhead.DisplayValue } # get the jumbo packet settings for each of the network interfaces within the vm switch team - $jumboPacket = Get-NetAdapterAdvancedProperty -InterfaceDescription $physicalNicIfDesc -RegistryKeyword "*JumboPacket" -ErrorAction SilentlyContinue - if ($null -eq $jumboPacket) { - "Network interface {0} does not support JumboPacket." -f $physicalNicIfDesc | Trace-Output -Level:Warning - } - else { + $jumboPacket = Get-NetAdapterAdvancedProperty -InterfaceDescription $physicalNicIfDesc -RegistryKeyword "*JumboPacket" -ErrorAction Ignore + if ($jumboPacket) { $supportsJumboPacket = $true [int]$jumboPacketValue = $jumboPacket.RegistryValue[0] } $object = [PSCustomObject]@{ - Switch = $switch.Name - NetworkInterface = $physicalNicIfDesc - EncapOverheadEnabled = $supportsEncapOverhead - EncapOverheadValue = $encapOverheadValue - JumboPacketEnabled = $supportsJumboPacket - JumboPacketValue = $jumboPacketValue + Switch = $switch.Name + NetAdapterInterfaceDescription = $physicalNicIfDesc + EncapOverheadEnabled = $supportsEncapOverhead + EncapOverheadValue = $encapOverheadValue + JumboPacketEnabled = $supportsJumboPacket + JumboPacketValue = $jumboPacketValue } # add each network interface to the interface array diff --git a/src/modules/SdnDiag.Utilities.psm1 b/src/modules/SdnDiag.Utilities.psm1 index 31a9606b..39526444 100644 --- a/src/modules/SdnDiag.Utilities.psm1 +++ b/src/modules/SdnDiag.Utilities.psm1 @@ -175,6 +175,30 @@ function Confirm-IsNetworkController { } } +function Confirm-IsLoadBalancerMux { + $config = Get-SdnModuleConfiguration -Role 'LoadBalancerMux' + $confirmFeatures = Confirm-RequiredFeaturesInstalled -Name $config.windowsFeature + if (-NOT 
($confirmFeatures)) { + throw New-Object System.NotSupportedException("The current machine is not a LoadBalancerMux. Run this on LoadBalancerMux.") + } +} + +function Confirm-IsServer { + $config = Get-SdnModuleConfiguration -Role 'Server' + $confirmFeatures = Confirm-RequiredFeaturesInstalled -Name $config.windowsFeature + if (-NOT ($confirmFeatures)) { + throw New-Object System.NotSupportedException("The current machine is not a Server. Run this on Server.") + } +} + +function Confirm-IsRasGateway { + $config = Get-SdnModuleConfiguration -Role 'Gateway' + $confirmFeatures = Confirm-RequiredFeaturesInstalled -Name $config.windowsFeature + if (-NOT ($confirmFeatures)) { + throw New-Object System.NotSupportedException("The current machine is not a Gateway. Run this on Gateway.") + } +} + function Confirm-ProvisioningStateSucceeded { <# .SYNOPSIS @@ -2611,6 +2635,8 @@ function Invoke-SdnCommand { Runs commands on local and remote computers. .PARAMETER ComputerName Type the NetBIOS name, an IP address, or a fully qualified domain name a remote computer. + .PARAMETER ArgumentList + Supplies the values of parameters for the scriptblock. The parameters in the script block are passed by position from the array value supplied to ArgumentList. This is known as array splatting. .PARAMETER Credential Specifies a user account that has permission to perform this action. The default is the current user. Type a user name, such as User01 or Domain01\User01, or enter a PSCredential object generated by the Get-Credential cmdlet. If you type a user name, you're prompted to enter the password. 
@@ -2624,6 +2650,9 @@ function Invoke-SdnCommand { [Parameter(Mandatory = $true)] [ScriptBlock]$ScriptBlock, + [Parameter(Mandatory = $false)] + [Object]$ArgumentList, + [Parameter(Mandatory = $false)] [System.Management.Automation.PSCredential] [System.Management.Automation.Credential()] @@ -2631,7 +2660,7 @@ function Invoke-SdnCommand { ) try { - Invoke-PSRemoteCommand -ComputerName $ComputerName -Credential $Credential -ScriptBlock $ScriptBlock + Invoke-PSRemoteCommand @PSBoundParameters } catch { $_ | Trace-Exception @@ -2651,3 +2680,34 @@ function Get-ProductNameFromRegistry { } } } + +function Get-NugetArtifactPath { + [CmdletBinding()] + param( + [Parameter(Mandatory = $false)] + [System.String]$Path = "$($env:SystemDrive)\NugetStore", + + [Parameter(Mandatory = $true)] + [ValidateNotNullOrEmpty()] + [System.String]$NugetName, + + [Parameter(Mandatory = $false)] + [System.String]$Version + ) + + $null = Import-Module PackageManagement -DisableNameChecking + + $params = @{ + Name = $NugetName + Destination = $Path + ProviderName = 'NuGet' + } + if ($Version) { + $params.Add('RequiredVersion', $Version) + } + + $package = Get-Package @params + if ($package) { + return [System.IO.Path]::GetDirectoryName($package.Source) + } +}