Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix and ML retrain #597

Merged
merged 13 commits into the base branch from the source branch
Aug 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
275 changes: 138 additions & 137 deletions cicd/benchmark.txt → .ci/benchmark.txt

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ jobs:

- name: Verify benchmark scores of the PR
run: |
diff --ignore-all-space --ignore-blank-lines temp/CredSweeper/cicd/benchmark.txt benchmark.${{ github.event.pull_request.head.sha }}.log
diff --unified=3 --ignore-all-space --ignore-blank-lines temp/CredSweeper/.ci/benchmark.txt benchmark.${{ github.event.pull_request.head.sha }}.log

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ jobs:
- name: Check ml_model.onnx integrity
if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
run: |
md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 62d92ab2f91a18e861d846a7b8a0c3a7
md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 88f37978fc0599ac8d1bf732ad40c077
md5sum --binary credsweeper/ml_model/model_config.json | grep 2b29c5e1aa199d14b788652bd542c7c0

# # # Python setup

Expand Down
2 changes: 1 addition & 1 deletion credsweeper/filters/value_string_type_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
not_comment = not line_data.is_comment()

if line_data.is_source_file_with_quotes() and not_comment and not_quoted and not line_data.is_quoted \
and '=' in line_data.separator:
and line_data.separator and '=' in line_data.separator:
# heterogeneous code e.g. YAML in Python uses colon sign instead equals
return True

Expand Down
Binary file modified credsweeper/ml_model/ml_model.onnx
Binary file not shown.
10 changes: 2 additions & 8 deletions credsweeper/ml_model/model_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -373,12 +373,12 @@
".cjs",
".cljc",
".cmd",
".cmm",
".cnf",
".coffee",
".conf",
".config",
".cpp",
".creds",
".crt",
".cs",
".csp",
Expand Down Expand Up @@ -417,7 +417,6 @@
".json",
".jsp",
".jsx",
".jwt",
".kt",
".las",
".ldif",
Expand All @@ -428,7 +427,6 @@
".log",
".lua",
".m",
".map",
".markerb",
".md",
".mdx",
Expand All @@ -445,7 +443,6 @@
".patch",
".php",
".pl",
".pm",
".po",
".pod",
".postinst",
Expand All @@ -457,7 +454,6 @@
".purs",
".pxd",
".py",
".pyi",
".pyx",
".r",
".rb",
Expand All @@ -469,6 +465,7 @@
".rs",
".rsp",
".rst",
".rules",
".sample",
".sbt",
".scala",
Expand All @@ -478,7 +475,6 @@
".sql",
".storyboard",
".strings",
".swift",
".t",
".td",
".tdf",
Expand All @@ -498,7 +494,6 @@
".vue",
".xaml",
".xib",
".xml",
".yaml",
".yml",
".zsh",
Expand All @@ -515,7 +510,6 @@
"Certificate",
"Credential",
"Github Old Token",
"JSON Web Token",
"Key",
"Nonce",
"Password",
Expand Down
4 changes: 3 additions & 1 deletion experiment/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ def main(cred_data_location: str, jobs: int) -> str:
prepare_train_data(_cred_data_location, jobs)

# detected data means which data is passed to ML validator of credsweeper after filters with RuleName
detected_data = read_detected_data(f"results/detected_data.{data_checksum(cred_data_location)}.json")
cred_data_location_path = pathlib.Path(cred_data_location) / "data"
detected_data = read_detected_data(f"results/detected_data.{data_checksum(cred_data_location_path)}.json")
print(f"CredSweeper detected {len(detected_data)} credentials without ML")
# all markup data
meta_data = read_metadata(f"{cred_data_location}/meta")
Expand Down Expand Up @@ -125,6 +126,7 @@ def main(cred_data_location: str, jobs: int) -> str:
x_test_line, x_test_variable, x_test_value, x_test_features = prepare_data(df_test)
y_test = get_y_labels(df_test)
print(f"Class-1 prop on test: {np.mean(y_test):.4f}")
del df_test

max_epochs = 100
# ^^^ the line is patched in GitHub action to speed-up test train
Expand Down
5 changes: 5 additions & 0 deletions experiment/src/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,11 @@ def read_text(path) -> list[str]:
+ line[markup["ValueStart"]:markup["ValueEnd"]] \
+ Style.RESET_ALL \
+ line[markup["ValueEnd"]:]
elif 0 <= markup["ValueStart"]:
line = line[:markup["ValueStart"]] \
+ Fore.LIGHTGREEN_EX \
+ line[:markup["ValueStart"]] \
+ Style.RESET_ALL
print(line)
break
read_text.cache_clear()
Expand Down
13 changes: 9 additions & 4 deletions experiment/src/prepare_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ def execute_scanner(dataset_location: str, result_location_str, j):
sys.exit(error_code)


def data_checksum(cred_data_location: str) -> str:
def data_checksum(dir_path: Path) -> str:
checksum = hashlib.md5(b'').digest()
for root, dirs, files in os.walk(Path(cred_data_location) / "data"):
for root, dirs, files in os.walk(dir_path):
for file in files:
with open(os.path.join(root, file), "rb") as f:
cvs_checksum = hashlib.md5(f.read()).digest()
Expand All @@ -38,12 +38,17 @@ def prepare_train_data(cred_data_location: str, j: int):
new_rules = [x for x in rules if x.get("use_ml")]
Util.yaml_dump(new_rules, "results/train_config.yaml")

detected_data_filename = f"results/detected_data.{data_checksum(cred_data_location)}.json"
meta_checksum = data_checksum(Path(cred_data_location) / "meta")
print(f"meta checksum {meta_checksum}")

data_dir_checksum = data_checksum(Path(cred_data_location) / "data")
print(f"data checksum {data_dir_checksum}")
detected_data_filename = f"results/detected_data.{data_dir_checksum}.json"

if not os.path.exists(detected_data_filename):
print(f"Get CredSweeper results from {cred_data_location}. May take some time")
execute_scanner(cred_data_location, detected_data_filename, j)
else:
print(f"Get cached result {data_checksum(cred_data_location)}")
print(f"Get cached result {data_dir_checksum}")

print("Train data prepared!")
6 changes: 3 additions & 3 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@
SAMPLES_CRED_LINE_COUNT: int = 381

# credentials count after post-processing
SAMPLES_POST_CRED_COUNT: int = 323
SAMPLES_POST_CRED_COUNT: int = 336

# with option --doc
SAMPLES_IN_DOC = 417

# archived credentials that are not found without --depth
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 24
SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 17
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 23
SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 16
SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1

# well known string with all latin letters
Expand Down
Loading
Loading