Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add init health module. #311

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions health/health.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
package health

import (
"crypto/x509"
"encoding/pem"
"fmt"
"log/syslog"
"os/exec"
"strconv"
"strings"
"time"
)

// ContainerHealthInfo is a point-in-time health snapshot of a single
// container, as assembled by GetHealthInfo.
type ContainerHealthInfo struct {
// ContainerID is the first field of the docker stats output line.
ContainerID string
// CPUUtilization is the value parsed from docker stats' {{.CPUPerc}} column
// (0.0 when the column cannot be parsed).
CPUUtilization float64
// MemoryUsage is the value parsed from docker stats' {{.MemPerc}} column
// (0.0 when the column cannot be parsed).
MemoryUsage float64
// DiskOccupation is the value returned by getDiskOccupation for this
// container; 0.0 when it could not be determined.
// NOTE(review): presumably a percentage like the two fields above — confirm.
DiskOccupation float64
// CertExpiration is the value returned by getCertExpiration; 0 when the
// certificate could not be read or parsed.
CertExpiration int64 // days until expiration
// Status is filled from the fourth docker stats column, which the format
// template sets to {{.Name}}.
// NOTE(review): that is the container name, not a run state — confirm the
// intended meaning of this field.
Status string
}

// GetHealthInfo gathers health information for the gNMI container(s).
//
// It runs `docker stats --no-stream` once, keeps only the output lines that
// mention "gnmi", and for each of them additionally queries disk usage and
// certificate expiry through getDiskOccupation and getCertExpiration.
//
// It returns one ContainerHealthInfo per matching container (a nil slice when
// nothing matches). An error is returned only when the docker stats command
// itself fails; failures of the per-container probes are reported by those
// helpers and surface as zero values in the corresponding fields.
func GetHealthInfo() ([]ContainerHealthInfo, error) {
	// exec.Command does not go through a shell: the format template must not
	// be wrapped in literal quotes (they would end up in the output) and the
	// output cannot be piped into grep. Filtering is done in Go below.
	output, err := exec.Command(
		"docker", "stats", "--no-stream",
		"--format", "{{.Container}},{{.CPUPerc}},{{.MemPerc}},{{.Name}}",
	).Output()
	if err != nil {
		return nil, fmt.Errorf("failed to retrieve container stats: %w", err)
	}

	var healthInfo []ContainerHealthInfo
	for _, line := range strings.Split(string(output), "\n") {
		line = strings.TrimSpace(line)
		// Equivalent of the intended `| grep gnmi`: keep matching lines only.
		if line == "" || !strings.Contains(line, "gnmi") {
			continue
		}

		parts := strings.Split(line, ",")
		if len(parts) < 4 {
			// Malformed line; skip it rather than index out of range.
			continue
		}

		containerID := strings.TrimSpace(parts[0])
		healthInfo = append(healthInfo, ContainerHealthInfo{
			ContainerID:    containerID,
			CPUUtilization: parsePercentage(strings.TrimSpace(parts[1])),
			MemoryUsage:    parsePercentage(strings.TrimSpace(parts[2])),
			DiskOccupation: getDiskOccupation(containerID),
			CertExpiration: getCertExpiration(containerID),
			Status:         strings.TrimSpace(parts[3]),
		})
	}

	return healthInfo, nil
}

// getDiskOccupation retrieves the usage percentage of the root filesystem
// inside the given container by running `df /` through `docker exec`.
//
// This is a best-effort probe: when the command fails or its output contains
// no usable percentage, the problem is printed and 0.0 is returned.
func getDiskOccupation(containerID string) float64 {
	output, err := exec.Command("docker", "exec", containerID, "df", "/").Output()
	if err != nil {
		fmt.Printf("failed to retrieve disk occupation for container %s: %v\n", containerID, err)
		return 0.0
	}

	percent, ok := dfUsePercent(string(output))
	if !ok {
		fmt.Printf("failed to parse disk occupation for container %s\n", containerID)
		return 0.0
	}
	return percent
}

// dfUsePercent extracts the first numeric "NN%" field from df output.
//
// df prints a header row followed by a data row, so the raw output cannot be
// parsed as a single number. Scanning whitespace-separated fields skips the
// "Use%" header cell (its prefix is not numeric) and also copes with df
// wrapping a long filesystem name onto its own line. The boolean result is
// false when no such field exists.
func dfUsePercent(dfOutput string) (float64, bool) {
	for _, field := range strings.Fields(dfOutput) {
		if !strings.HasSuffix(field, "%") {
			continue
		}
		percent, err := strconv.ParseFloat(strings.TrimSuffix(field, "%"), 64)
		if err == nil {
			return percent, true
		}
	}
	return 0, false
}

// getCertExpiration retrieves the number of whole days until the certificate
// inside the given container expires (negative once it has expired).
//
// This is a best-effort probe: when the certificate cannot be read or parsed,
// the problem is printed and 0 is returned.
func getCertExpiration(containerID string) int64 {
	// NOTE(review): "/path/to/cert.pem" looks like a placeholder — confirm the
	// real certificate location before relying on this value.
	output, err := exec.Command("docker", "exec", containerID, "cat", "/path/to/cert.pem").Output()
	if err != nil {
		fmt.Printf("failed to retrieve certificate for container %s: %v\n", containerID, err)
		return 0
	}

	days, err := certDaysRemaining(output, time.Now())
	if err != nil {
		fmt.Printf("failed to parse certificate for container %s: %v\n", containerID, err)
		return 0
	}
	return days
}

// certDaysRemaining parses the first CERTIFICATE block found in pemData and
// returns the whole days between now and its NotAfter time, truncated toward
// zero. PEM blocks of other types are skipped. An error is returned when no
// CERTIFICATE block is present or the block is not a valid X.509 certificate.
func certDaysRemaining(pemData []byte, now time.Time) (int64, error) {
	for rest := pemData; ; {
		var block *pem.Block
		block, rest = pem.Decode(rest)
		if block == nil {
			return 0, fmt.Errorf("no CERTIFICATE block found in PEM data")
		}
		if block.Type != "CERTIFICATE" {
			continue
		}

		cert, err := x509.ParseCertificate(block.Bytes)
		if err != nil {
			return 0, err
		}
		return int64(cert.NotAfter.Sub(now).Hours() / 24), nil
	}
}

// LogHealthProofs logs one container's health information to syslog under the
// "container_health" tag, at info severity.
//
// Logging is best-effort: when syslog cannot be reached or the write fails,
// the problem is printed and the function returns without logging.
func LogHealthProofs(container ContainerHealthInfo) {
	logwriter, err := syslog.New(syslog.LOG_NOTICE, "container_health")
	if err != nil {
		fmt.Printf("failed to open syslog for container %s: %v\n", container.ContainerID, err)
		return
	}
	// Each call opens its own syslog connection; close it so repeated health
	// checks do not leak connections.
	defer logwriter.Close()

	msg := fmt.Sprintf(
		"Health check for container %s: CPU=%.2f, Memory=%.2f, Disk=%.2f, CertExpiryDays=%d",
		container.ContainerID,
		container.CPUUtilization,
		container.MemoryUsage,
		container.DiskOccupation,
		container.CertExpiration,
	)
	if err := logwriter.Info(msg); err != nil {
		fmt.Printf("failed to write health log for container %s: %v\n", container.ContainerID, err)
	}
}

// parsePercentage converts a percentage string such as "12.50%" (the "%"
// sign is optional) into its numeric value.
//
// Whitespace around the value and between the number and the "%" sign is
// ignored, so a trailing newline from command output does not break parsing.
// Anything that still is not a valid float — including the empty string and
// placeholders such as "--" — yields 0.0.
func parsePercentage(value string) float64 {
	value = strings.TrimSpace(value)
	value = strings.TrimSpace(strings.TrimSuffix(value, "%"))

	parsedValue, err := strconv.ParseFloat(value, 64)
	if err != nil {
		return 0.0
	}
	return parsedValue
}
47 changes: 44 additions & 3 deletions sonic_data_client/non_db_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (

spb "github.com/sonic-net/sonic-gnmi/proto"
"github.com/Workiva/go-datastructures/queue"
"github.com/sonic-net/sonic-gnmi/health"
linuxproc "github.com/c9s/goprocinfo/linux"
log "github.com/golang/glog"
gnmipb "github.com/openconfig/gnmi/proto/gnmi"
Expand Down Expand Up @@ -39,6 +40,13 @@ type statsRing struct {
mu sync.RWMutex // Mutex for data protection
}

// healthInfoStash bundles the outcome of one container health collection:
// the collected metrics, the collection error, and the overall verdict.
type healthInfoStash struct {
// once can be used to run the collection at most once per stash value.
once sync.Once
// healthInfo holds the per-container metrics returned by health.GetHealthInfo.
healthInfo []health.ContainerHealthInfo
// err is the error returned by health.GetHealthInfo, if any.
err error
// isHealthy is the overall verdict; it is false when a container breaches
// one of the health thresholds.
isHealthy bool
}

// SonicVersionInfo is a data model to serialize '/etc/sonic/sonic_version.yml'
type SonicVersionInfo struct {
BuildVersion string `yaml:"build_version" json:"build_version"`
Expand Down Expand Up @@ -101,6 +109,10 @@ var (
path: []string{"OTHERS", "osversion", "build"},
getFunc: dataGetFunc(getBuildVersion),
},
{ // Container Health Status
path: []string{"OTHERS", "container-health-status", "gnmi"},
getFunc: dataGetFunc(getContainerHealthStatus),
},
}
)

Expand Down Expand Up @@ -137,7 +149,7 @@ func getCpuUtilPercents(cur, last *linuxproc.CPUStat) uint64 {
idleTicks := cur.Idle - last.Idle
totalTicks := curTotal - lastTotal
if totalTicks == 0 { // No change in CPU Utilization
return 0
return 0
}
return 100 * (totalTicks - idleTicks) / totalTicks
}
Expand Down Expand Up @@ -335,6 +347,36 @@ func getBuildVersion() ([]byte, error) {
return b, nil
}

// getContainerHealthStatus collects the current container health metrics,
// logs each container's values to syslog, and returns the metrics encoded as
// JSON.
//
// The metrics are gathered afresh on every call. A container counts as
// unhealthy when its CPU or memory value exceeds 80, its disk value exceeds
// 90, or its certificate expires within 30 days; the overall verdict is
// logged but is not part of the returned JSON.
//
// An error is returned when the metrics cannot be gathered or encoded.
func getContainerHealthStatus() ([]byte, error) {
	const (
		maxCPUPercent    = 80.0
		maxMemoryPercent = 80.0
		maxDiskPercent   = 90.0
		minCertDays      = 30
	)

	// The stash is local to this call, so guarding the collection with its
	// sync.Once would have no effect; collect directly instead.
	var stash healthInfoStash
	stash.healthInfo, stash.err = health.GetHealthInfo()
	if stash.err != nil {
		log.V(2).Infof("Failed to gather health metrics: %v", stash.err)
		// Propagate the failure rather than answering "null" with a nil error.
		return nil, stash.err
	}

	// Evaluate health info. Every container is logged, even after the first
	// unhealthy one has been found.
	stash.isHealthy = true
	for _, container := range stash.healthInfo {
		health.LogHealthProofs(container)
		if container.CPUUtilization > maxCPUPercent ||
			container.MemoryUsage > maxMemoryPercent ||
			container.DiskOccupation > maxDiskPercent ||
			container.CertExpiration <= minCertDays {
			stash.isHealthy = false
		}
	}
	log.V(2).Infof("getContainerHealthStatus, healthy %v", stash.isHealthy)

	b, err := json.Marshal(stash.healthInfo)
	if err != nil {
		log.V(2).Infof("%v", err)
		return b, err
	}
	log.V(4).Infof("getContainerHealthStatus, output %v", string(b))
	return b, nil
}

func WriteStatsToBuffer(stat *linuxproc.Stat) {
statsR.mu.Lock()
statsR.buff[statsR.writeIdx] = stat
Expand Down Expand Up @@ -588,7 +630,7 @@ func (c *NonDbClient) Close() error {
return nil
}

func (c *NonDbClient) Set(delete []*gnmipb.Path, replace []*gnmipb.Update, update []*gnmipb.Update) error {
func (c *NonDbClient) Set(delete []*gnmipb.Path, replace []*gnmipb.Update, update []*gnmipb.Update) error {
return nil
}
func (c *NonDbClient) Capabilities() []gnmipb.ModelData {
Expand All @@ -599,4 +641,3 @@ func (c *NonDbClient) SentOne(val *Value) {

// FailedSend is a no-op for NonDbClient: the method body is empty.
func (c *NonDbClient) FailedSend() {
}

Loading