From 6a0a00c93facf4dee7a765b3e1cf82ac6eb59a21 Mon Sep 17 00:00:00 2001
From: Shashank Reddy Boyapally
Date: Mon, 8 Jan 2024 16:15:02 -0500
Subject: [PATCH 1/5] added matcher class and functions

---
 .DS_Store          | Bin 0 -> 6148 bytes
 README.md          |   2 +-
 fmatch/__init__.py |   0
 fmatch/matcher.py  | 127 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 128 insertions(+), 1 deletion(-)
 create mode 100644 .DS_Store
 create mode 100644 fmatch/__init__.py
 create mode 100644 fmatch/matcher.py

diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..42ed45ebc9fc05e89fc760b351262b1fe79f07de
GIT binary patch
literal 6148
zcmeHKJ8l9o5S>X7T8V}-rLVvZtQDMq3xE;<1uRmOP;ZrUF!x>p
zSS$dniG3g5W|%t#gO_8Vmt!of9KU%|
aiGvE;K6eXa%l1JQali

literal 0
HcmV?d00001

diff --git a/README.md b/README.md
index 3b5e7f1..f3fdc10 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,2 @@
 # py-commons
-Common Python Library
+This common python library is dedicated to metadata matching and finding regressions.
diff --git a/fmatch/__init__.py b/fmatch/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/fmatch/matcher.py b/fmatch/matcher.py
new file mode 100644
index 0000000..9e9d7fa
--- /dev/null
+++ b/fmatch/matcher.py
@@ -0,0 +1,127 @@
+from elasticsearch7 import Elasticsearch
+from elasticsearch.exceptions import NotFoundError
+import pandas as pd
+
+import os
+import csv
+
+ES_URL=os.getenv("ES_SERVER")
+
+class Matcher:
+    def __init__(self, index="perf_scale_ci"):
+        self.index=index
+        self.es_url=ES_URL
+        self.es=Elasticsearch([self.es_url],http_auth=["username","password"])
+        self.data=None
+
+    def get_metadata_by_uuid(self,uuid,index=None):
+        if index==None:
+            index=self.index
+        query = {
+            "query": {
+                "match": {
+                    "uuid": uuid
+                }
+            }
+        }
+        try:
+            result = self.es.search(index=index, body=query)
+            hits = result.get('hits', {}).get('hits', [])
+            if hits:
+                return dict(hits[0]['_source'])
+            else:
+                return None
+        except NotFoundError:
+            print(f"UUID {uuid} not found in index {index}")
+            return None
+
+
+    def get_uuid_by_metadata(self,meta,index=None):
+        if index==None:
+            index=self.index
+        version=meta["ocpVersion"][:4]
+        query = {
+            "query": {
+                "bool": {
+                    "must": [
+                        {
+                            "query_string": {
+                                "query": ' AND '.join([
+                                    f'{field}: "{value}"' if isinstance(value, str) else f'{field}: {value}'
+                                    for field, value in meta.items() if field!="ocpVersion"
+                                ]) +
+                                f' AND ocpVersion: {version}* AND jobStatus: success'
+                            }
+                        }
+                    ]
+                }
+            },
+            "size": 10000
+        }
+        result = self.es.search(index=index, body=query)
+        hits = result.get('hits', {}).get('hits', [])
+        uuids=[hit['_source']['uuid'] for hit in hits]
+        return uuids
+
+    def match_kube_burner(self,uuids):
+        index = "ripsaw-kube-burner*"
+        ids = "\" OR uuid: \"".join(uuids)
+        query = {
+            "query": {
+                "query_string": {
+                    "query": (
+                        f'( uuid: \"{ids}\" )'
+                        f' AND metricName: "jobSummary"'
+                    )
+                }
+            },
+            "size":10000
+        }
+        result=self.es.search(index=index,body=query)
+        runs = [item['_source'] for item in result["hits"]["hits"]]
+        return runs
+
+    def filter_runs(self,pdata,data):
+        columns = ['uuid','jobConfig.jobIterations']
+        pdf = pd.json_normalize(pdata)
+        pdf.to_csv("check.csv")
+        pick_df = pd.DataFrame(pdf, columns=columns)
+        iterations = pick_df.iloc[0]['jobConfig.jobIterations']
+        df = pd.json_normalize(data)
+        ndf = pd.DataFrame(df, columns=columns)
+        ids_df = ndf.loc[df['jobConfig.jobIterations'] == iterations ]
+        return ids_df['uuid'].to_list()
+
+    def burner_results(self,uuid,uuids,index):
+        if len(uuids) > 1 :
+            if len(uuid) > 0 :
+                uuids.remove(uuid)
+        if len(uuids) < 1 :
+            return []
+        ids = "\" OR uuid: \"".join(uuids)
+        query = {
+            "query": {
+                "query_string": {
+                    "query": (
+                        f'( uuid: \"{ids}\" )'
+                        f' AND metricName: "podLatencyQuantilesMeasurement"'
+                        f' AND quantileName: "Ready"'
+                    )
+                }
+            },
+            "size":10000
+        }
+        #print(query)
+        result=self.es.search(index=index,body=query)
+        runs = [item['_source'] for item in result["hits"]["hits"]]
+        self.data=runs
+        return runs
+
+    def saveResults(self,csv_file_path="output.csv"):
+        odf = pd.json_normalize(self.data)
+        odf.to_csv("check.csv")
+        columns = ['uuid','timestamp', 'quantileName','metricName', 'P99']
+        odf = pd.DataFrame(odf, columns=columns)
+        odf = odf.sort_values(by=['timestamp'])
+        odf.to_csv(csv_file_path)
+

From ca7083241ab7df12f8c75331130c626a227e3012 Mon Sep 17 00:00:00 2001
From: Shashank Reddy Boyapally
Date: Wed, 10 Jan 2024 09:48:41 -0500
Subject: [PATCH 2/5] added readme for fmatch

---
 .DS_Store         | Bin 6148 -> 0 bytes
 .gitignore        |   3 +++
 README.md         |   2 +-
 fmatch/README.md  |   2 ++
 fmatch/matcher.py |   3 +--
 5 files changed, 7 insertions(+), 3 deletions(-)
 delete mode 100644 .DS_Store
 create mode 100644 fmatch/README.md

diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index 42ed45ebc9fc05e89fc760b351262b1fe79f07de..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeHKJ8l9o5S>X7T8V}-rLVvZtQDMq3xE;<1uRmOP;ZrUF!x>p
zSS$dniG3g5W|%t#gO_8Vmt!of9KU%|
aiGvE;K6eXa%l1JQali

diff --git a/.gitignore b/.gitignore
index 68bc17f..9b495d8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -158,3 +158,6 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+.DS_Store
+fmatch/main.py
diff --git a/README.md b/README.md
index f3fdc10..3b5e7f1 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,2 @@
 # py-commons
-This common python library is dedicated to metadata matching and finding regressions.
+Common Python Library
diff --git a/fmatch/README.md b/fmatch/README.md
new file mode 100644
index 0000000..c4d1e96
--- /dev/null
+++ b/fmatch/README.md
@@ -0,0 +1,2 @@
+# fmatch
+This common python library is dedicated to metadata matching and finding regressions.
\ No newline at end of file
diff --git a/fmatch/matcher.py b/fmatch/matcher.py
index 9e9d7fa..1d1a53a 100644
--- a/fmatch/matcher.py
+++ b/fmatch/matcher.py
@@ -84,7 +84,7 @@ def match_kube_burner(self,uuids):
     def filter_runs(self,pdata,data):
         columns = ['uuid','jobConfig.jobIterations']
         pdf = pd.json_normalize(pdata)
-        pdf.to_csv("check.csv")
+        #print(pdf)
         pick_df = pd.DataFrame(pdf, columns=columns)
         iterations = pick_df.iloc[0]['jobConfig.jobIterations']
         df = pd.json_normalize(data)
@@ -119,7 +119,6 @@ def burner_results(self,uuid,uuids,index):
 
     def saveResults(self,csv_file_path="output.csv"):
         odf = pd.json_normalize(self.data)
-        odf.to_csv("check.csv")
         columns = ['uuid','timestamp', 'quantileName','metricName', 'P99']
         odf = pd.DataFrame(odf, columns=columns)
         odf = odf.sort_values(by=['timestamp'])

From 557ae7ca32f8916b14af5da841204f898ca10c1d Mon Sep 17 00:00:00 2001
From: Shashank Reddy Boyapally
Date: Thu, 11 Jan 2024 10:41:46 -0500
Subject: [PATCH 3/5] added cpu_avg

---
 fmatch/matcher.py | 86 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 76 insertions(+), 10 deletions(-)

diff --git a/fmatch/matcher.py b/fmatch/matcher.py
index 1d1a53a..37d03a2 100644
--- a/fmatch/matcher.py
+++ b/fmatch/matcher.py
@@ -4,6 +4,7 @@
 
 import os
 import csv
+import json
 
 ES_URL=os.getenv("ES_SERVER")
 
@@ -57,7 +58,7 @@ def get_uuid_by_metadata(self,meta,index=None):
                 }
             },
             "size": 10000
-        } 
+        }
         result = self.es.search(index=index, body=query)
         hits = result.get('hits', {}).get('hits', [])
         uuids=[hit['_source']['uuid'] for hit in hits]
@@ -84,7 +85,6 @@ def match_kube_burner(self,uuids):
     def filter_runs(self,pdata,data):
         columns = ['uuid','jobConfig.jobIterations']
         pdf = pd.json_normalize(pdata)
-        #print(pdf)
         pick_df = pd.DataFrame(pdf, columns=columns)
         iterations = pick_df.iloc[0]['jobConfig.jobIterations']
         df = pd.json_normalize(data)
@@ -111,16 +111,82 @@ def burner_results(self,uuid,uuids,index):
             },
             "size":10000
         }
-        #print(query)
         result=self.es.search(index=index,body=query)
         runs = [item['_source'] for item in result["hits"]["hits"]]
-        self.data=runs
         return runs
-
-    def saveResults(self,csv_file_path="output.csv"):
-        odf = pd.json_normalize(self.data)
-        columns = ['uuid','timestamp', 'quantileName','metricName', 'P99']
-        odf = pd.DataFrame(odf, columns=columns)
+
+    def burner_cpu_results(self,uuids,namespace,index):
+        ids = "\" OR uuid: \"".join(uuids)
+        query = {
+            "aggs": {
+                "time": {
+                    "terms": {
+                        "field": "uuid.keyword",
+                        "size":10000
+                    },
+                    "aggs": {
+                        "time": {
+                            "avg": {
+                                "field": "timestamp"}
+                        }
+                    }
+                },
+                "uuid": {
+                    "terms": {
+                        "field": "uuid.keyword",
+                        "size":10000
+                    },
+                    "aggs": {
+                        "cpu": {
+                            "avg": {
+                                "field": "value"
+                            }
+                        }
+                    }
+                }
+            },
+            "query": {
+                "bool": {
+                    "must": [{
+                        "query_string": {
+                            "query": (
+                                f'( uuid: \"{ids}\" )'
+                                f' AND metricName: "containerCPU"'
+                                f' AND labels.namespace.keyword: {namespace}'
+                            )
+                        }
+                    }]
+                }
+            },
+            "size":10000
+        }
+        runs=self.es.search(index=index,body=query)
+        data=self.parse_burner_cpu_results(runs)
+        return data
+
+    def parse_burner_cpu_results(self,data: dict):
+        res = []
+        stamps = data['aggregations']['time']['buckets']
+        cpu = data['aggregations']['uuid']['buckets']
+        for stamp in stamps :
+            dat = {}
+            dat['uuid'] = stamp['key']
+            dat['timestamp'] = stamp['time']['value_as_string']
+            acpu = next(item for item in cpu if item["key"] == stamp['key'])
+            dat['cpu_avg'] = acpu['cpu']['value']
+            res.append(dat)
+        return res
+
+    def convert_to_df(self,data,columns=None):
+        odf = pd.json_normalize(data)
+        if columns!=None:
+            odf = pd.DataFrame(odf, columns=columns)
         odf = odf.sort_values(by=['timestamp'])
-        odf.to_csv(csv_file_path)
+        return odf
+
+
+    def save_results(self,df,csv_file_path="output.csv",columns=None):
+        if columns!=None:
+            df = pd.DataFrame(df, columns=columns)
+        df.to_csv(csv_file_path)
 

From a72da8d143ef35380442a5ef173777526f9ad6c1 Mon Sep 17 00:00:00 2001
From: Shashank Reddy Boyapally
Date: Thu, 11 Jan 2024 13:11:30 -0500
Subject: [PATCH 4/5] added test and requirements

---
 fmatch/requirements.txt | 11 ++++++++
 fmatch/test_fmatch.py   | 61 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)
 create mode 100644 fmatch/requirements.txt
 create mode 100644 fmatch/test_fmatch.py

diff --git a/fmatch/requirements.txt b/fmatch/requirements.txt
new file mode 100644
index 0000000..75bae53
--- /dev/null
+++ b/fmatch/requirements.txt
@@ -0,0 +1,11 @@
+certifi==2023.11.17
+elastic-transport==8.11.0
+elasticsearch==8.11.1
+elasticsearch7==7.13.0
+numpy==1.26.3
+pandas==2.1.4
+python-dateutil==2.8.2
+pytz==2023.3.post1
+six==1.16.0
+tzdata==2023.4
+urllib3==1.26.18
diff --git a/fmatch/test_fmatch.py b/fmatch/test_fmatch.py
new file mode 100644
index 0000000..f3260f0
--- /dev/null
+++ b/fmatch/test_fmatch.py
@@ -0,0 +1,61 @@
+from matcher import Matcher
+import pandas as pd
+import json
+
+
+
+match=Matcher(index='perf_scale_ci')
+
+df=pd.read_csv("merged.csv")
+ls=df["uuid"].to_list()
+
+for i in ls:
+    print(match.get_metadata_by_uuid(i)["networkType"])
+    #print(json.dumps(match.get_metadata_by_uuid(i),sort_keys=False, indent=4))
+
+meta={}
+meta['benchmark']="cluster-density-v2"
+#meta['masterNodesType'] = "m6a.4xlarge"
+meta['masterNodesType'] = "m6a.xlarge"
+meta['workerNodesType'] = "m6a.xlarge"
+meta['platform']="AWS"
+meta['masterNodesCount']=3
+meta['workerNodesCount']=24
+meta['jobStatus']="success"
+meta['ocpVersion']='4.15'
+meta['networkType']="OVNKubernetes"
+
+
+uuids=match.get_uuid_by_metadata(meta)
+if len(uuids)==0:
+    print("No UUID present for given metadata")
+    exit()
+#print(uuids)
+#print("5eb93cb1-5db1-41cd-997d-4a35741e3236" in uuids)
+runs=match.match_kube_burner(uuids)
+#print("ef1b328b-1843-43f4-8529-5f4b6ceaadaf" in uuids)
+#print(runs)
+ids=match.filter_runs(runs,runs)
+podl=match.burner_results("",ids,"ripsaw-kube-burner*")
+
+kapi_cpu=match.burner_cpu_results(ids,"openshift-kube-apiserver","ripsaw-kube-burner*")
+ovn_cpu=match.burner_cpu_results(ids,"openshift-ovn-kubernetes","ripsaw-kube-burner*")
+etcd_cpu=match.burner_cpu_results(ids,"openshift-etcd","ripsaw-kube-burner*")
+
+
+podl_df=match.convert_to_df(podl,columns=['uuid','timestamp', 'quantileName', 'P99'])
+kapi_cpu_df=match.convert_to_df(kapi_cpu)
+merge_df=pd.merge(kapi_cpu_df,podl_df,on="uuid")
+match.save_results(merge_df,"merged.csv",["uuid","timestamp_x","cpu_avg","P99"])
+match.save_results(kapi_cpu_df,"CPUavg24.csv")
+match.save_results(podl_df,"podlatency24.csv")
+# cdf = pd.json_normalize(cdf)
+# cdf = cdf.sort_values(by=['timestamp'])
+# cdf.to_csv("output2.csv")
+#match.saveResults(nrs)
+#match.saveResults(nrs2,"output2.csv")
+
+# #print(json.dumps(runs[0],sort_keys=False, indent=4))
+# for run in runs:
+#     print(json.dumps(run,indent=4))
+

From b9799e580270a468ae6b78db5102d1cdb0e404ab Mon Sep 17 00:00:00 2001
From: Shashank Reddy Boyapally
Date: Thu, 11 Jan 2024 14:23:08 -0500
Subject: [PATCH 5/5] cleaned up test_fmatch

---
 fmatch/test_fmatch.py | 31 ++++++------------------------
 1 file changed, 6 insertions(+), 25 deletions(-)

diff --git a/fmatch/test_fmatch.py b/fmatch/test_fmatch.py
index f3260f0..4697b03 100644
--- a/fmatch/test_fmatch.py
+++ b/fmatch/test_fmatch.py
@@ -2,20 +2,10 @@
 import pandas as pd
 import json
 
-
-
 match=Matcher(index='perf_scale_ci')
 
-df=pd.read_csv("merged.csv")
-ls=df["uuid"].to_list()
-
-for i in ls:
-    print(match.get_metadata_by_uuid(i)["networkType"])
-    #print(json.dumps(match.get_metadata_by_uuid(i),sort_keys=False, indent=4))
-
 meta={}
 meta['benchmark']="cluster-density-v2"
-#meta['masterNodesType'] = "m6a.4xlarge"
 meta['masterNodesType'] = "m6a.xlarge"
 meta['workerNodesType'] = "m6a.xlarge"
 meta['platform']="AWS"
@@ -25,16 +15,11 @@
 meta['ocpVersion']='4.15'
 meta['networkType']="OVNKubernetes"
 
-
 uuids=match.get_uuid_by_metadata(meta)
 if len(uuids)==0:
     print("No UUID present for given metadata")
     exit()
-#print(uuids)
-#print("5eb93cb1-5db1-41cd-997d-4a35741e3236" in uuids)
 runs=match.match_kube_burner(uuids)
-#print("ef1b328b-1843-43f4-8529-5f4b6ceaadaf" in uuids)
-#print(runs)
 ids=match.filter_runs(runs,runs)
 podl=match.burner_results("",ids,"ripsaw-kube-burner*")
 
@@ -42,20 +27,16 @@
 ovn_cpu=match.burner_cpu_results(ids,"openshift-ovn-kubernetes","ripsaw-kube-burner*")
 etcd_cpu=match.burner_cpu_results(ids,"openshift-etcd","ripsaw-kube-burner*")
 
-
 podl_df=match.convert_to_df(podl,columns=['uuid','timestamp', 'quantileName', 'P99'])
 kapi_cpu_df=match.convert_to_df(kapi_cpu)
 merge_df=pd.merge(kapi_cpu_df,podl_df,on="uuid")
 match.save_results(merge_df,"merged.csv",["uuid","timestamp_x","cpu_avg","P99"])
 match.save_results(kapi_cpu_df,"CPUavg24.csv")
 match.save_results(podl_df,"podlatency24.csv")
-# cdf = pd.json_normalize(cdf)
-# cdf = cdf.sort_values(by=['timestamp'])
-# cdf.to_csv("output2.csv")
-#match.saveResults(nrs)
-#match.saveResults(nrs2,"output2.csv")
-
-# #print(json.dumps(runs[0],sort_keys=False, indent=4))
-# for run in runs:
-#     print(json.dumps(run,indent=4))
+df=pd.read_csv("merged.csv")
+ls=df["uuid"].to_list()
+# Check merged csv data - Debug
+for i in ls:
+    # Debug - Ensure they are all using the same networkType
+    print(match.get_metadata_by_uuid(i)["networkType"])
 
\ No newline at end of file
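
Example usage (not part of the series): a minimal sketch of the Matcher flow these patches introduce, in the same order as fmatch/test_fmatch.py after patch 5. It assumes ES_SERVER is exported to a reachable Elasticsearch endpoint before the import (matcher.py reads it at import time), that the placeholder http_auth credentials in Matcher.__init__ have been replaced, and that the perf_scale_ci and ripsaw-kube-burner* indexes exist on that cluster; the metadata values below are illustrative and copied from the test script, and the file name is hypothetical.

# example_usage.py (hypothetical) -- sketch of the Matcher API from patches 1-3.
# Export ES_SERVER before running; matcher.py reads it at import time, and the
# hard-coded http_auth placeholder in Matcher.__init__ must be replaced first.
import pandas as pd

from fmatch.matcher import Matcher   # or `from matcher import Matcher` inside fmatch/

match = Matcher(index="perf_scale_ci")

# Metadata to match on; values are illustrative, taken from fmatch/test_fmatch.py.
meta = {
    "benchmark": "cluster-density-v2",
    "masterNodesType": "m6a.xlarge",
    "workerNodesType": "m6a.xlarge",
    "platform": "AWS",
    "masterNodesCount": 3,
    "workerNodesCount": 24,
    "jobStatus": "success",
    "ocpVersion": "4.15",
    "networkType": "OVNKubernetes",
}

uuids = match.get_uuid_by_metadata(meta)    # runs whose metadata matches
runs = match.match_kube_burner(uuids)       # their kube-burner jobSummary documents
ids = match.filter_runs(runs, runs)         # keep runs with the same jobIterations

# Passing "" as the first argument keeps every uuid (nothing is removed).
podl = match.burner_results("", ids, "ripsaw-kube-burner*")
kapi_cpu = match.burner_cpu_results(ids, "openshift-kube-apiserver", "ripsaw-kube-burner*")

podl_df = match.convert_to_df(podl, columns=["uuid", "timestamp", "quantileName", "P99"])
kapi_cpu_df = match.convert_to_df(kapi_cpu)

# Both frames carry a timestamp column, so pd.merge suffixes them _x/_y;
# that is why the merged CSV selects "timestamp_x".
merge_df = pd.merge(kapi_cpu_df, podl_df, on="uuid")
match.save_results(merge_df, "merged.csv", ["uuid", "timestamp_x", "cpu_avg", "P99"])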