From 4c1ea9e267d74d9ee5228c6bd3ed34e44289120b Mon Sep 17 00:00:00 2001
From: Zvi Rosenfeld
Date: Thu, 25 Jul 2024 17:14:10 +0300
Subject: [PATCH] Added a bdb label to all metrics

Every namedprocess_namegroup_* metric now carries a "bdb" label. For
groups whose name matches redis-<shard_uid>, the value is looked up
once per group via `ccs-cli hget redis:<shard_uid> bdb_uid` and cached;
all other groups report "None".
---
 collector/process_collector.go | 106 +++++++++++++++++------------------
 proc/grouper.go                |  38 ++++++++++++
 proc/grouper_test.go           |  32 +++++-----
 3 files changed, 107 insertions(+), 69 deletions(-)

diff --git a/collector/process_collector.go b/collector/process_collector.go
index 8f41ba4..9ebe0d9 100644
--- a/collector/process_collector.go
+++ b/collector/process_collector.go
@@ -13,79 +13,79 @@ var (
 	numprocsDesc = prometheus.NewDesc(
 		"namedprocess_namegroup_num_procs",
 		"number of processes in this group",
-		[]string{"groupname"},
+		[]string{"groupname", "bdb"},
 		nil)

 	cpuSecsDesc = prometheus.NewDesc(
 		"namedprocess_namegroup_cpu_seconds_total",
 		"Cpu user usage in seconds",
-		[]string{"groupname", "mode"},
+		[]string{"groupname", "mode", "bdb"},
 		nil)

 	readBytesDesc = prometheus.NewDesc(
 		"namedprocess_namegroup_read_bytes_total",
 		"number of bytes read by this group",
-		[]string{"groupname"},
+		[]string{"groupname", "bdb"},
 		nil)

 	writeBytesDesc = prometheus.NewDesc(
 		"namedprocess_namegroup_write_bytes_total",
 		"number of bytes written by this group",
-		[]string{"groupname"},
+		[]string{"groupname", "bdb"},
 		nil)

 	majorPageFaultsDesc = prometheus.NewDesc(
 		"namedprocess_namegroup_major_page_faults_total",
 		"Major page faults",
-		[]string{"groupname"},
+		[]string{"groupname", "bdb"},
 		nil)

 	minorPageFaultsDesc = prometheus.NewDesc(
 		"namedprocess_namegroup_minor_page_faults_total",
 		"Minor page faults",
-		[]string{"groupname"},
+		[]string{"groupname", "bdb"},
 		nil)

 	contextSwitchesDesc = prometheus.NewDesc(
 		"namedprocess_namegroup_context_switches_total",
 		"Context switches",
-		[]string{"groupname", "ctxswitchtype"},
+		[]string{"groupname", "ctxswitchtype", "bdb"},
 		nil)

 	membytesDesc = prometheus.NewDesc(
 		"namedprocess_namegroup_memory_bytes",
 		"number of bytes of memory in use",
-		[]string{"groupname", "memtype"},
+		[]string{"groupname", "memtype", "bdb"},
 		nil)

 	openFDsDesc = prometheus.NewDesc(
 		"namedprocess_namegroup_open_filedesc",
 		"number of open file descriptors for this group",
-		[]string{"groupname"},
+		[]string{"groupname", "bdb"},
 		nil)

 	worstFDRatioDesc = prometheus.NewDesc(
 		"namedprocess_namegroup_worst_fd_ratio",
 		"the worst (closest to 1) ratio between open fds and max fds among all procs in this group",
-		[]string{"groupname"},
+		[]string{"groupname", "bdb"},
 		nil)

 	startTimeDesc = prometheus.NewDesc(
 		"namedprocess_namegroup_oldest_start_time_seconds",
 		"start time in seconds since 1970/01/01 of oldest process in group",
-		[]string{"groupname"},
+		[]string{"groupname", "bdb"},
 		nil)

 	numThreadsDesc = prometheus.NewDesc(
 		"namedprocess_namegroup_num_threads",
 		"Number of threads",
-		[]string{"groupname"},
+		[]string{"groupname", "bdb"},
 		nil)

 	statesDesc = prometheus.NewDesc(
 		"namedprocess_namegroup_states",
 		"Number of processes in states Running, Sleeping, Waiting, Zombie, or Other",
-		[]string{"groupname", "state"},
+		[]string{"groupname", "state", "bdb"},
 		nil)

 	scrapeErrorsDesc = prometheus.NewDesc(
@@ -109,43 +109,43 @@ var (
 	threadWchanDesc = prometheus.NewDesc(
 		"namedprocess_namegroup_threads_wchan",
 		"Number of threads in this group waiting on each wchan",
-		[]string{"groupname", "wchan"},
+		[]string{"groupname", "wchan", "bdb"},
 		nil)

 	threadCountDesc = prometheus.NewDesc(
 		"namedprocess_namegroup_thread_count",
 		"Number of threads in this group with same threadname",
-		[]string{"groupname", "threadname"},
"threadname", "bdb"}, nil) threadCpuSecsDesc = prometheus.NewDesc( "namedprocess_namegroup_thread_cpu_seconds_total", "Cpu user/system usage in seconds", - []string{"groupname", "threadname", "mode"}, + []string{"groupname", "threadname", "mode", "bdb"}, nil) threadIoBytesDesc = prometheus.NewDesc( "namedprocess_namegroup_thread_io_bytes_total", "number of bytes read/written by these threads", - []string{"groupname", "threadname", "iomode"}, + []string{"groupname", "threadname", "iomode", "bdb"}, nil) threadMajorPageFaultsDesc = prometheus.NewDesc( "namedprocess_namegroup_thread_major_page_faults_total", "Major page faults for these threads", - []string{"groupname", "threadname"}, + []string{"groupname", "threadname", "bdb"}, nil) threadMinorPageFaultsDesc = prometheus.NewDesc( "namedprocess_namegroup_thread_minor_page_faults_total", "Minor page faults for these threads", - []string{"groupname", "threadname"}, + []string{"groupname", "threadname", "bdb"}, nil) threadContextSwitchesDesc = prometheus.NewDesc( "namedprocess_namegroup_thread_context_switches_total", "Context switches for these threads", - []string{"groupname", "threadname", "ctxswitchtype"}, + []string{"groupname", "threadname", "ctxswitchtype", "bdb"}, nil) ) @@ -262,47 +262,47 @@ func (p *NamedProcessCollector) scrape(ch chan<- prometheus.Metric) { } else { for gname, gcounts := range groups { ch <- prometheus.MustNewConstMetric(numprocsDesc, - prometheus.GaugeValue, float64(gcounts.Procs), gname) + prometheus.GaugeValue, float64(gcounts.Procs), gname, gcounts.BdbLabel) ch <- prometheus.MustNewConstMetric(membytesDesc, - prometheus.GaugeValue, float64(gcounts.Memory.ResidentBytes), gname, "resident") + prometheus.GaugeValue, float64(gcounts.Memory.ResidentBytes), gname, "resident", gcounts.BdbLabel) ch <- prometheus.MustNewConstMetric(membytesDesc, - prometheus.GaugeValue, float64(gcounts.Memory.VirtualBytes), gname, "virtual") + prometheus.GaugeValue, float64(gcounts.Memory.VirtualBytes), gname, "virtual", gcounts.BdbLabel) ch <- prometheus.MustNewConstMetric(membytesDesc, - prometheus.GaugeValue, float64(gcounts.Memory.VmSwapBytes), gname, "swapped") + prometheus.GaugeValue, float64(gcounts.Memory.VmSwapBytes), gname, "swapped", gcounts.BdbLabel) ch <- prometheus.MustNewConstMetric(startTimeDesc, - prometheus.GaugeValue, float64(gcounts.OldestStartTime.Unix()), gname) + prometheus.GaugeValue, float64(gcounts.OldestStartTime.Unix()), gname, gcounts.BdbLabel) ch <- prometheus.MustNewConstMetric(openFDsDesc, - prometheus.GaugeValue, float64(gcounts.OpenFDs), gname) + prometheus.GaugeValue, float64(gcounts.OpenFDs), gname, gcounts.BdbLabel) ch <- prometheus.MustNewConstMetric(worstFDRatioDesc, - prometheus.GaugeValue, float64(gcounts.WorstFDratio), gname) + prometheus.GaugeValue, float64(gcounts.WorstFDratio), gname, gcounts.BdbLabel) ch <- prometheus.MustNewConstMetric(cpuSecsDesc, - prometheus.CounterValue, gcounts.CPUUserTime, gname, "user") + prometheus.CounterValue, gcounts.CPUUserTime, gname, "user", gcounts.BdbLabel) ch <- prometheus.MustNewConstMetric(cpuSecsDesc, - prometheus.CounterValue, gcounts.CPUSystemTime, gname, "system") + prometheus.CounterValue, gcounts.CPUSystemTime, gname, "system", gcounts.BdbLabel) ch <- prometheus.MustNewConstMetric(readBytesDesc, - prometheus.CounterValue, float64(gcounts.ReadBytes), gname) + prometheus.CounterValue, float64(gcounts.ReadBytes), gname, gcounts.BdbLabel) ch <- prometheus.MustNewConstMetric(writeBytesDesc, - prometheus.CounterValue, float64(gcounts.WriteBytes), gname) + 
+				prometheus.CounterValue, float64(gcounts.WriteBytes), gname, gcounts.BdbLabel)
 			ch <- prometheus.MustNewConstMetric(majorPageFaultsDesc,
-				prometheus.CounterValue, float64(gcounts.MajorPageFaults), gname)
+				prometheus.CounterValue, float64(gcounts.MajorPageFaults), gname, gcounts.BdbLabel)
 			ch <- prometheus.MustNewConstMetric(minorPageFaultsDesc,
-				prometheus.CounterValue, float64(gcounts.MinorPageFaults), gname)
+				prometheus.CounterValue, float64(gcounts.MinorPageFaults), gname, gcounts.BdbLabel)
 			ch <- prometheus.MustNewConstMetric(contextSwitchesDesc,
-				prometheus.CounterValue, float64(gcounts.CtxSwitchVoluntary), gname, "voluntary")
+				prometheus.CounterValue, float64(gcounts.CtxSwitchVoluntary), gname, "voluntary", gcounts.BdbLabel)
 			ch <- prometheus.MustNewConstMetric(contextSwitchesDesc,
-				prometheus.CounterValue, float64(gcounts.CtxSwitchNonvoluntary), gname, "nonvoluntary")
+				prometheus.CounterValue, float64(gcounts.CtxSwitchNonvoluntary), gname, "nonvoluntary", gcounts.BdbLabel)
 			ch <- prometheus.MustNewConstMetric(numThreadsDesc,
-				prometheus.GaugeValue, float64(gcounts.NumThreads), gname)
+				prometheus.GaugeValue, float64(gcounts.NumThreads), gname, gcounts.BdbLabel)
 			ch <- prometheus.MustNewConstMetric(statesDesc,
-				prometheus.GaugeValue, float64(gcounts.States.Running), gname, "Running")
+				prometheus.GaugeValue, float64(gcounts.States.Running), gname, "Running", gcounts.BdbLabel)
 			ch <- prometheus.MustNewConstMetric(statesDesc,
-				prometheus.GaugeValue, float64(gcounts.States.Sleeping), gname, "Sleeping")
+				prometheus.GaugeValue, float64(gcounts.States.Sleeping), gname, "Sleeping", gcounts.BdbLabel)
 			ch <- prometheus.MustNewConstMetric(statesDesc,
-				prometheus.GaugeValue, float64(gcounts.States.Waiting), gname, "Waiting")
+				prometheus.GaugeValue, float64(gcounts.States.Waiting), gname, "Waiting", gcounts.BdbLabel)
 			ch <- prometheus.MustNewConstMetric(statesDesc,
-				prometheus.GaugeValue, float64(gcounts.States.Zombie), gname, "Zombie")
+				prometheus.GaugeValue, float64(gcounts.States.Zombie), gname, "Zombie", gcounts.BdbLabel)
 			ch <- prometheus.MustNewConstMetric(statesDesc,
-				prometheus.GaugeValue, float64(gcounts.States.Other), gname, "Other")
+				prometheus.GaugeValue, float64(gcounts.States.Other), gname, "Other", gcounts.BdbLabel)

 			for wchan, count := range gcounts.Wchans {
 				ch <- prometheus.MustNewConstMetric(threadWchanDesc,
-					prometheus.GaugeValue, float64(count), gname, wchan)
+					prometheus.GaugeValue, float64(count), gname, wchan, gcounts.BdbLabel)
 			}

 			if p.smaps {
 				ch <- prometheus.MustNewConstMetric(membytesDesc,
-					prometheus.GaugeValue, float64(gcounts.Memory.ProportionalBytes), gname, "proportionalResident")
+					prometheus.GaugeValue, float64(gcounts.Memory.ProportionalBytes), gname, "proportionalResident", gcounts.BdbLabel)
 				ch <- prometheus.MustNewConstMetric(membytesDesc,
-					prometheus.GaugeValue, float64(gcounts.Memory.ProportionalSwapBytes), gname, "proportionalSwapped")
+					prometheus.GaugeValue, float64(gcounts.Memory.ProportionalSwapBytes), gname, "proportionalSwapped", gcounts.BdbLabel)
 			}

 			if p.threads {
 				for _, thr := range gcounts.Threads {
 					ch <- prometheus.MustNewConstMetric(threadCountDesc,
 						prometheus.GaugeValue, float64(thr.NumThreads),
-						gname, thr.Name)
+						gname, thr.Name, gcounts.BdbLabel)
 					ch <- prometheus.MustNewConstMetric(threadCpuSecsDesc,
 						prometheus.CounterValue, float64(thr.CPUUserTime),
-						gname, thr.Name, "user")
+						gname, thr.Name, "user", gcounts.BdbLabel)
 					ch <- prometheus.MustNewConstMetric(threadCpuSecsDesc,
 						prometheus.CounterValue, float64(thr.CPUSystemTime),
-						gname, thr.Name, "system")
thr.Name, "system", gcounts.BdbLabel) ch <- prometheus.MustNewConstMetric(threadIoBytesDesc, prometheus.CounterValue, float64(thr.ReadBytes), - gname, thr.Name, "read") + gname, thr.Name, "read", gcounts.BdbLabel) ch <- prometheus.MustNewConstMetric(threadIoBytesDesc, prometheus.CounterValue, float64(thr.WriteBytes), - gname, thr.Name, "write") + gname, thr.Name, "write", gcounts.BdbLabel) ch <- prometheus.MustNewConstMetric(threadMajorPageFaultsDesc, prometheus.CounterValue, float64(thr.MajorPageFaults), - gname, thr.Name) + gname, thr.Name, gcounts.BdbLabel) ch <- prometheus.MustNewConstMetric(threadMinorPageFaultsDesc, prometheus.CounterValue, float64(thr.MinorPageFaults), - gname, thr.Name) + gname, thr.Name, gcounts.BdbLabel) ch <- prometheus.MustNewConstMetric(threadContextSwitchesDesc, prometheus.CounterValue, float64(thr.CtxSwitchVoluntary), - gname, thr.Name, "voluntary") + gname, thr.Name, "voluntary", gcounts.BdbLabel) ch <- prometheus.MustNewConstMetric(threadContextSwitchesDesc, prometheus.CounterValue, float64(thr.CtxSwitchNonvoluntary), - gname, thr.Name, "nonvoluntary") + gname, thr.Name, "nonvoluntary", gcounts.BdbLabel) } } } diff --git a/proc/grouper.go b/proc/grouper.go index bbf240c..b22d1d0 100644 --- a/proc/grouper.go +++ b/proc/grouper.go @@ -1,6 +1,11 @@ package proc import ( + "fmt" + "log" + "os/exec" + "regexp" + "strings" "time" seq "github.com/ncabatoff/go-seq/seq" @@ -18,6 +23,7 @@ type ( threadAccum map[string]map[string]Threads debug bool removeEmptyGroups bool + groupBdbLabel map[string]string } // GroupByName maps group name to group metrics. @@ -42,6 +48,7 @@ type ( WorstFDratio float64 NumThreads uint64 Threads []Threads + BdbLabel string } ) @@ -57,6 +64,7 @@ func NewGrouper(namer common.MatchNamer, trackChildren, trackThreads, recheck bo tracker: NewTracker(namer, trackChildren, recheck, recheckTimeLimit, debug), debug: debug, removeEmptyGroups: removeEmptyGroups, + groupBdbLabel: make(map[string]string), } return &g } @@ -140,15 +148,45 @@ func (g *Grouper) groups(tracked []Update) GroupByName { if g.removeEmptyGroups { delete(g.groupAccum, gname) delete(g.threadAccum, gname) + delete(g.groupBdbLabel, gname) } else { groups[gname] = Group{Counts: gcounts} } } } + for gname, group := range groups { + lableValue, labelValueExists := g.groupBdbLabel[gname] + if !labelValueExists { + lableValue = get_bdb_label(gname) + g.groupBdbLabel[gname] = lableValue + } + group.BdbLabel = lableValue + groups[gname] = group + } + return groups } +func get_bdb_label(groupname string) string { + re := regexp.MustCompile(`redis-(\d+)`) + matches := re.FindStringSubmatch(groupname) + + // Check if a match was found + if len(matches) < 2 { + return "None" + } + + shard_uid := matches[1] + out, err := exec.Command("ccs-cli", "hget", fmt.Sprintf("redis:%s", shard_uid), "bdb_uid").Output() + if err != nil { + log.Fatal(err) + return "Error" + } + + return strings.TrimSuffix(string(out[:]), "\n") +} + func (g *Grouper) threads(gname string, tracked []ThreadUpdate) []Threads { if len(tracked) == 0 { delete(g.threadAccum, gname) diff --git a/proc/grouper_test.go b/proc/grouper_test.go index 826d1e9..b10e034 100644 --- a/proc/grouper_test.go +++ b/proc/grouper_test.go @@ -52,9 +52,9 @@ func TestGrouperBasic(t *testing.T) { }, GroupByName{ "g1": Group{Counts{}, States{Other: 1}, msi{}, 1, Memory{7, 8, 0, 0, 0}, starttime, - 4, 0.01, 2, nil}, + 4, 0.01, 2, nil, "None"}, "g2": Group{Counts{}, States{Waiting: 1}, msi{}, 1, Memory{8, 9, 0, 0, 0}, starttime, - 40, 0.1, 3, nil}, + 40, 
+					40, 0.1, 3, nil, "None"},
 			},
 		},
 		{
@@ -66,9 +66,9 @@ func TestGrouperBasic(t *testing.T) {
 			},
 			GroupByName{
 				"g1": Group{Counts{1, 1, 1, 1, 1, 1, 0, 0}, States{Zombie: 1}, msi{}, 1,
-					Memory{6, 7, 0, 0, 0}, starttime, 100, 0.25, 4, nil},
+					Memory{6, 7, 0, 0, 0}, starttime, 100, 0.25, 4, nil, "None"},
 				"g2": Group{Counts{2, 2, 2, 2, 2, 2, 0, 0}, States{Running: 1}, msi{}, 1,
-					Memory{9, 8, 0, 0, 0}, starttime, 400, 1, 2, nil},
+					Memory{9, 8, 0, 0, 0}, starttime, 400, 1, 2, nil, "None"},
 			},
 		},
 	}
@@ -98,7 +98,7 @@ func TestGrouperProcJoin(t *testing.T) {
 				piinfo(p1, n1, Counts{1, 2, 3, 4, 5, 6, 0, 0}, Memory{3, 4, 0, 0, 0}, Filedesc{4, 400}, 2),
 			},
 			GroupByName{
-				"g1": Group{Counts{}, States{}, msi{}, 1, Memory{3, 4, 0, 0, 0}, starttime, 4, 0.01, 2, nil},
+				"g1": Group{Counts{}, States{}, msi{}, 1, Memory{3, 4, 0, 0, 0}, starttime, 4, 0.01, 2, nil, "None"},
 			},
 		}, {
 			// The counts for pid2 won't be factored into the total yet because we only add
@@ -112,7 +112,7 @@ func TestGrouperProcJoin(t *testing.T) {
 			},
 			GroupByName{
 				"g1": Group{Counts{2, 2, 2, 2, 2, 2, 0, 0}, States{Running: 1, Sleeping: 1}, msi{}, 2,
-					Memory{4, 6, 0, 0, 0}, starttime, 44, 0.1, 5, nil},
+					Memory{4, 6, 0, 0, 0}, starttime, 44, 0.1, 5, nil, "None"},
 			},
 		}, {
 			[]IDInfo{
@@ -123,7 +123,7 @@ func TestGrouperProcJoin(t *testing.T) {
 			},
 			GroupByName{
 				"g1": Group{Counts{4, 4, 4, 4, 4, 4, 0, 0}, States{Running: 2}, msi{}, 2,
-					Memory{3, 9, 0, 0, 0}, starttime, 44, 0.1, 5, nil},
+					Memory{3, 9, 0, 0, 0}, starttime, 44, 0.1, 5, nil, "None"},
 			},
 		},
 	}
@@ -154,19 +154,19 @@ func TestGrouperNonDecreasing(t *testing.T) {
 				piinfo(p2, n2, Counts{1, 1, 1, 1, 1, 1, 0, 0}, Memory{1, 2, 0, 0, 0}, Filedesc{40, 400}, 3),
 			},
 			GroupByName{
-				"g1": Group{Counts{}, States{}, msi{}, 2, Memory{4, 6, 0, 0, 0}, starttime, 44, 0.1, 5, nil},
+				"g1": Group{Counts{}, States{}, msi{}, 2, Memory{4, 6, 0, 0, 0}, starttime, 44, 0.1, 5, nil, "None"},
 			},
 		}, {
 			[]IDInfo{
 				piinfo(p1, n1, Counts{4, 5, 6, 7, 8, 9, 0, 0}, Memory{1, 5, 0, 0, 0}, Filedesc{4, 400}, 2),
 			},
 			GroupByName{
-				"g1": Group{Counts{1, 1, 1, 1, 1, 1, 0, 0}, States{}, msi{}, 1, Memory{1, 5, 0, 0, 0}, starttime, 4, 0.01, 2, nil},
+				"g1": Group{Counts{1, 1, 1, 1, 1, 1, 0, 0}, States{}, msi{}, 1, Memory{1, 5, 0, 0, 0}, starttime, 4, 0.01, 2, nil, "None"},
 			},
 		}, {
 			[]IDInfo{},
 			GroupByName{
-				"g1": Group{Counts{1, 1, 1, 1, 1, 1, 0, 0}, States{}, nil, 0, Memory{}, time.Time{}, 0, 0, 0, nil},
+				"g1": Group{Counts{1, 1, 1, 1, 1, 1, 0, 0}, States{}, nil, 0, Memory{}, time.Time{}, 0, 0, 0, nil, "None"},
 			},
 		},
 	}
@@ -197,15 +197,15 @@ func TestGrouperRemoveEmptyGroups(t *testing.T) {
 				piinfo(p2, n2, Counts{1, 1, 1, 1, 1, 1, 0, 0}, Memory{1, 2, 0, 0, 0}, Filedesc{40, 400}, 3),
 			},
 			GroupByName{
-				n1: Group{Counts{}, States{}, msi{}, 1, Memory{3, 4, 0, 0, 0}, starttime, 4, 0.01, 2, nil},
-				n2: Group{Counts{}, States{}, msi{}, 1, Memory{1, 2, 0, 0, 0}, starttime, 40, 0.1, 3, nil},
+				n1: Group{Counts{}, States{}, msi{}, 1, Memory{3, 4, 0, 0, 0}, starttime, 4, 0.01, 2, nil, "None"},
+				n2: Group{Counts{}, States{}, msi{}, 1, Memory{1, 2, 0, 0, 0}, starttime, 40, 0.1, 3, nil, "None"},
 			},
 		}, {
 			[]IDInfo{
 				piinfo(p1, n1, Counts{4, 5, 6, 7, 8, 9, 0, 0}, Memory{1, 5, 0, 0, 0}, Filedesc{4, 400}, 2),
 			},
 			GroupByName{
-				n1: Group{Counts{1, 1, 1, 1, 1, 1, 0, 0}, States{}, msi{}, 1, Memory{1, 5, 0, 0, 0}, starttime, 4, 0.01, 2, nil},
+				n1: Group{Counts{1, 1, 1, 1, 1, 1, 0, 0}, States{}, msi{}, 1, Memory{1, 5, 0, 0, 0}, starttime, 4, 0.01, 2, nil, "None"},
 			},
 		}, {
 			[]IDInfo{},
@@ -238,7 +238,7 @@ func TestGrouperThreads(t *testing.T) {
 				"g1": Group{Counts{}, States{}, msi{}, 1, Memory{}, tm, 1, 1, 2, []Threads{
 					Threads{"t1", 1, Counts{}},
 					Threads{"t2", 1, Counts{}},
-				}},
+				}, "None"},
 			},
 		}, {
 			piinfot(p, n, Counts{}, Memory{}, Filedesc{1, 1}, []Thread{
@@ -250,7 +250,7 @@
 				"g1": Group{Counts{}, States{}, msi{}, 1, Memory{}, tm, 1, 1, 3, []Threads{
 					Threads{"t1", 1, Counts{1, 1, 1, 1, 1, 1, 0, 0}},
 					Threads{"t2", 2, Counts{1, 1, 1, 1, 1, 1, 0, 0}},
-				}},
+				}, "None"},
 			},
 		}, {
 			piinfot(p, n, Counts{}, Memory{}, Filedesc{1, 1}, []Thread{
@@ -260,7 +260,7 @@
 			GroupByName{
 				"g1": Group{Counts{}, States{}, msi{}, 1, Memory{}, tm, 1, 1, 2, []Threads{
 					Threads{"t2", 2, Counts{4, 5, 6, 7, 8, 9, 0, 0}},
-				}},
+				}, "None"},
 			},
 		},
 	}
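
Not part of the patch above: a minimal test sketch for the no-match path of the new label helper, assuming it keeps the getBdbLabel name used in the rewritten grouper.go hunk and lives in the proc package. The test name and the sample group names are illustrative; only group names that do not match redis-<shard_uid> are covered, since the matching path shells out to ccs-cli.

package proc

import "testing"

// Sketch only: getBdbLabel should fall back to "None" for group names that
// do not contain a redis-<shard_uid> pattern; this path never invokes ccs-cli.
func TestGetBdbLabelNoMatch(t *testing.T) {
	for _, gname := range []string{"nginx", "redis-server", "redis-"} {
		if got := getBdbLabel(gname); got != "None" {
			t.Errorf("getBdbLabel(%q) = %q, want %q", gname, got, "None")
		}
	}
}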