Bugfix and ML retrain (#597)
* custom ref BM

* upd

* after corrections

* [skip actions] [auxiliary] 2024-08-17T09:35:51+03:00

* retrain

* testfix

* style

* md5sum of ml model

* model config check

* custom ref rollback

* Scores fix

* Move cicd dir to .ci as not sourcecode related

* Missed dir added
babenek authored Aug 19, 2024
1 parent 355a5d5 commit 34575e1
Showing 15 changed files with 1,580 additions and 924 deletions.
File renamed without changes.
275 changes: 138 additions & 137 deletions cicd/benchmark.txt → .ci/benchmark.txt

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
@@ -150,7 +150,7 @@ jobs:

- name: Verify benchmark scores of the PR
run: |
- diff --ignore-all-space --ignore-blank-lines temp/CredSweeper/cicd/benchmark.txt benchmark.${{ github.event.pull_request.head.sha }}.log
+ diff --unified=3 --ignore-all-space --ignore-blank-lines temp/CredSweeper/.ci/benchmark.txt benchmark.${{ github.event.pull_request.head.sha }}.log
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

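To reproduce this comparison outside CI, a rough Python equivalent of the whitespace-insensitive check could look like the sketch below. This is a minimal sketch only: the local report name is hypothetical, and the workflow step itself calls `diff` as shown above.

```python
import difflib
from pathlib import Path

def normalized_lines(path: Path) -> list[str]:
    """Drop blank lines and all whitespace, mirroring --ignore-all-space --ignore-blank-lines."""
    result = []
    for raw in path.read_text(encoding="utf-8").splitlines():
        squeezed = "".join(raw.split())
        if squeezed:
            result.append(squeezed)
    return result

expected = normalized_lines(Path(".ci/benchmark.txt"))
actual = normalized_lines(Path("benchmark.local.log"))  # hypothetical locally generated report
delta = list(difflib.unified_diff(expected, actual, "expected", "actual", n=3, lineterm=""))
if delta:
    print("\n".join(delta))
    raise SystemExit(1)
print("benchmark scores match .ci/benchmark.txt")
```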
3 changes: 2 additions & 1 deletion .github/workflows/check.yml
@@ -58,7 +58,8 @@ jobs:
- name: Check ml_model.onnx integrity
if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
run: |
- md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 62d92ab2f91a18e861d846a7b8a0c3a7
+ md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 88f37978fc0599ac8d1bf732ad40c077
+ md5sum --binary credsweeper/ml_model/model_config.json | grep 2b29c5e1aa199d14b788652bd542c7c0
# # # Python setup

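The same integrity check can be run locally in Python, for example as a quick pre-commit sanity check. A minimal sketch, assuming it is executed from the repository root and using the digests from the workflow step above:

```python
import hashlib
from pathlib import Path

# expected digests copied from the workflow step above
EXPECTED_MD5 = {
    "credsweeper/ml_model/ml_model.onnx": "88f37978fc0599ac8d1bf732ad40c077",
    "credsweeper/ml_model/model_config.json": "2b29c5e1aa199d14b788652bd542c7c0",
}

for rel_path, expected in EXPECTED_MD5.items():
    actual = hashlib.md5(Path(rel_path).read_bytes()).hexdigest()
    status = "OK" if actual == expected else "MISMATCH " + actual
    print(f"{rel_path}: {status}")
```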
2 changes: 1 addition & 1 deletion credsweeper/filters/value_string_type_check.py
@@ -41,7 +41,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
not_comment = not line_data.is_comment()

if line_data.is_source_file_with_quotes() and not_comment and not_quoted and not line_data.is_quoted \
- and '=' in line_data.separator:
+ and line_data.separator and '=' in line_data.separator:
# heterogeneous code e.g. YAML in Python uses colon sign instead equals
return True

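The added `line_data.separator and` guard matters because the separator may be `None`, and `'=' in None` raises `TypeError`; the truthiness check short-circuits before the membership test. A standalone illustration of the pattern (the function name is illustrative, not CredSweeper API):

```python
def has_equals_separator(separator) -> bool:
    # guard first: '=' in None would raise TypeError, '' is simply not a match
    return bool(separator) and "=" in separator

assert has_equals_separator("=")
assert not has_equals_separator(None)
assert not has_equals_separator("")
```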
Binary file modified credsweeper/ml_model/ml_model.onnx
Binary file not shown.
10 changes: 2 additions & 8 deletions credsweeper/ml_model/model_config.json
@@ -373,12 +373,12 @@
".cjs",
".cljc",
".cmd",
- ".cmm",
".cnf",
".coffee",
".conf",
".config",
".cpp",
+ ".creds",
".crt",
".cs",
".csp",
@@ -417,7 +417,6 @@
".json",
".jsp",
".jsx",
- ".jwt",
".kt",
".las",
".ldif",
@@ -428,7 +427,6 @@
".log",
".lua",
".m",
- ".map",
".markerb",
".md",
".mdx",
@@ -445,7 +443,6 @@
".patch",
".php",
".pl",
- ".pm",
".po",
".pod",
".postinst",
@@ -457,7 +454,6 @@
".purs",
".pxd",
".py",
- ".pyi",
".pyx",
".r",
".rb",
@@ -469,6 +465,7 @@
".rs",
".rsp",
".rst",
+ ".rules",
".sample",
".sbt",
".scala",
@@ -478,7 +475,6 @@
".sql",
".storyboard",
".strings",
- ".swift",
".t",
".td",
".tdf",
@@ -498,7 +494,6 @@
".vue",
".xaml",
".xib",
- ".xml",
".yaml",
".yml",
".zsh",
@@ -515,7 +510,6 @@
"Certificate",
"Credential",
"Github Old Token",
- "JSON Web Token",
"Key",
"Nonce",
"Password",
4 changes: 3 additions & 1 deletion experiment/main.py
@@ -59,7 +59,8 @@ def main(cred_data_location: str, jobs: int) -> str:
prepare_train_data(_cred_data_location, jobs)

# detected data means which data is passed to ML validator of credsweeper after filters with RuleName
- detected_data = read_detected_data(f"results/detected_data.{data_checksum(cred_data_location)}.json")
+ cred_data_location_path = pathlib.Path(cred_data_location) / "data"
+ detected_data = read_detected_data(f"results/detected_data.{data_checksum(cred_data_location_path)}.json")
print(f"CredSweeper detected {len(detected_data)} credentials without ML")
# all markup data
meta_data = read_metadata(f"{cred_data_location}/meta")
@@ -125,6 +126,7 @@ def main(cred_data_location: str, jobs: int) -> str:
x_test_line, x_test_variable, x_test_value, x_test_features = prepare_data(df_test)
y_test = get_y_labels(df_test)
print(f"Class-1 prop on test: {np.mean(y_test):.4f}")
+ del df_test

max_epochs = 100
# ^^^ the line is patched in GitHub action to speed-up test train
5 changes: 5 additions & 0 deletions experiment/src/data_loader.py
@@ -215,6 +215,11 @@ def read_text(path) -> list[str]:
+ line[markup["ValueStart"]:markup["ValueEnd"]] \
+ Style.RESET_ALL \
+ line[markup["ValueEnd"]:]
elif 0 <= markup["ValueStart"]:
line = line[:markup["ValueStart"]] \
+ Fore.LIGHTGREEN_EX \
+ line[markup["ValueStart"]:] \
+ Style.RESET_ALL
print(line)
break
read_text.cache_clear()
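The new `elif` branch covers markup rows where only `ValueStart` is known, so the highlight runs from that offset to the end of the line. A self-contained sketch of the same colouring logic, assuming `colorama` is installed (the function name is illustrative):

```python
from colorama import Fore, Style

def highlight_value(line: str, value_start: int, value_end: int = -1) -> str:
    """Wrap the marked-up value span in green; without ValueEnd, colour to the end of the line."""
    if 0 <= value_start <= value_end:
        return (line[:value_start] + Fore.LIGHTGREEN_EX + line[value_start:value_end]
                + Style.RESET_ALL + line[value_end:])
    if 0 <= value_start:
        return line[:value_start] + Fore.LIGHTGREEN_EX + line[value_start:] + Style.RESET_ALL
    return line

print(highlight_value("password = 'qwerty123'", 12, 21))  # known start and end
print(highlight_value("password = 'qwerty123'", 12))      # only the start offset is known
```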
13 changes: 9 additions & 4 deletions experiment/src/prepare_data.py
@@ -19,9 +19,9 @@ def execute_scanner(dataset_location: str, result_location_str, j):
sys.exit(error_code)


- def data_checksum(cred_data_location: str) -> str:
+ def data_checksum(dir_path: Path) -> str:
checksum = hashlib.md5(b'').digest()
- for root, dirs, files in os.walk(Path(cred_data_location) / "data"):
+ for root, dirs, files in os.walk(dir_path):
for file in files:
with open(os.path.join(root, file), "rb") as f:
cvs_checksum = hashlib.md5(f.read()).digest()
@@ -38,12 +38,17 @@ def prepare_train_data(cred_data_location: str, j: int):
new_rules = [x for x in rules if x.get("use_ml")]
Util.yaml_dump(new_rules, "results/train_config.yaml")

- detected_data_filename = f"results/detected_data.{data_checksum(cred_data_location)}.json"
+ meta_checksum = data_checksum(Path(cred_data_location) / "meta")
+ print(f"meta checksum {meta_checksum}")

+ data_dir_checksum = data_checksum(Path(cred_data_location) / "data")
+ print(f"data checksum {data_dir_checksum}")
+ detected_data_filename = f"results/detected_data.{data_dir_checksum}.json"

if not os.path.exists(detected_data_filename):
print(f"Get CredSweeper results from {cred_data_location}. May take some time")
execute_scanner(cred_data_location, detected_data_filename, j)
else:
print(f"Get cached result {data_checksum(cred_data_location)}")
print(f"Get cached result {data_dir_checksum}")

print("Train data prepared!")
6 changes: 3 additions & 3 deletions tests/__init__.py
@@ -11,14 +11,14 @@
SAMPLES_CRED_LINE_COUNT: int = 381

# credentials count after post-processing
- SAMPLES_POST_CRED_COUNT: int = 323
+ SAMPLES_POST_CRED_COUNT: int = 336

# with option --doc
SAMPLES_IN_DOC = 417

# archived credentials that are not found without --depth
- SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 24
- SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 17
+ SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 23
+ SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 16
SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1

# well known string with all latin letters