Skip to content

Commit

Permalink
uuid pattern
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek committed Jul 25, 2024
1 parent 898a252 commit 30044cf
Show file tree
Hide file tree
Showing 22 changed files with 271 additions and 95 deletions.
21 changes: 13 additions & 8 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: uuid

- name: Markup hashing
run: |
Expand Down Expand Up @@ -72,7 +73,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: uuid

- name: Markup hashing
run: |
Expand Down Expand Up @@ -169,7 +171,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: uuid

- name: Markup hashing
run: |
Expand Down Expand Up @@ -287,7 +290,7 @@ jobs:
exit_code=0
LOW_DELTA=10
THRESHOLD=250
# RELEASE
if [ ${RELEASE_TIME} -le ${HEAD_TIME} ]; then
d=$(( 1000 * ( ${HEAD_TIME} - ${RELEASE_TIME} ) / ${RELEASE_TIME} ))
Expand All @@ -311,7 +314,7 @@ jobs:
echo "Speed-up."
fi
fi
# BASE
if [ ${BASE_TIME} -le ${HEAD_TIME} ]; then
d=$(( 1000 * ( ${HEAD_TIME} - ${BASE_TIME} ) / ${BASE_TIME} ))
Expand All @@ -335,10 +338,11 @@ jobs:
echo "Speed-up."
fi
fi
exit ${exit_code}
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

experiment:
# the ml train test is placed here to use cached data set
needs: [ download_data ]
Expand All @@ -350,7 +354,8 @@ jobs:
- name: Checkout CredData
uses: actions/checkout@v4
with:
repository: Samsung/CredData
repository: babenek/CredData
ref: uuid

- name: Markup hashing
run: |
Expand Down Expand Up @@ -428,7 +433,7 @@ jobs:
exit 1
fi
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

run_doc_benchmark:
runs-on: ubuntu-latest
Expand Down
100 changes: 50 additions & 50 deletions cicd/benchmark.txt

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions credsweeper/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def __init__(self, config: Dict[str, Any]) -> None:
self.min_keyword_value_length: int = int(config["min_keyword_value_length"])
self.min_pattern_value_length: int = int(config["min_pattern_value_length"])

self.line_specific_key_offset: int = int(config["line_specific_key_offset"])

# Strip leading and trailing whitespace characters from the exclude patterns
self.exclude_lines = set(line.strip() for line in self.exclude_lines)
self.exclude_values = set(line.strip() for line in self.exclude_values)
Expand Down
2 changes: 1 addition & 1 deletion credsweeper/filters/group/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,6 @@ def get_keyword_base_filters(config: Config) -> List[Filter]:
def get_pattern_base_filters(config: Config) -> List[Filter]:
"""return base filters for pattern"""
return [ #
LineSpecificKeyCheck(), #
LineSpecificKeyCheck(config), #
ValuePatternCheck(config), #
]
12 changes: 9 additions & 3 deletions credsweeper/filters/line_specific_key_check.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re

from credsweeper.common.constants import ML_HUNK
from credsweeper.config import Config
from credsweeper.credentials import LineData
from credsweeper.file_handler.analysis_target import AnalysisTarget
Expand All @@ -10,8 +11,8 @@
class LineSpecificKeyCheck(Filter):
"""Check that values from the list below are not in the candidate line."""

NOT_ALLOWED = [r"example", r"enc\(", r"enc\[", r"true", r"false"]
NOT_ALLOWED_PATTERN = re.compile(Util.get_regex_combine_or(NOT_ALLOWED))
NOT_ALLOWED = [r"\bexample\b", r"\benc\(", r"\benc\[", r"\btrue\b", r"\bfalse\b"]
NOT_ALLOWED_PATTERN = re.compile(Util.get_regex_combine_or(NOT_ALLOWED), re.IGNORECASE)

def __init__(self, config: Config = None) -> None:
pass
Expand All @@ -29,8 +30,13 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
"""
if line_data.line is None:
return True
if 0 <= line_data.variable_start:
# variable may be defined too
sub_line_start = 0 if ML_HUNK >= line_data.variable_start else line_data.variable_start - ML_HUNK
else:
sub_line_start = 0 if ML_HUNK >= line_data.value_start else line_data.value_start - ML_HUNK

if self.NOT_ALLOWED_PATTERN.search(target.line_lower):
if self.NOT_ALLOWED_PATTERN.search(line_data.line, sub_line_start, line_data.value_end + ML_HUNK):
return True

return False
2 changes: 1 addition & 1 deletion credsweeper/filters/value_useless_word_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class ValueUselessWordCheck(Filter):
NOT_ALLOWED = [
"((\\{)?(0x)+([0-9a-f]|\\%){1}.*)", # Check whether it contains \{0x or 0x
"(\\-\\>.*)", # Check if contain ->
"(xxxx.*)", # Check if contain xxxxx
"(xxxxx.*)", # Check if contain xxxxx
"(\\$\\w+)", # Check whether it looks like a variable e.g. $word
"(\\s).*" # Check if contain \s
]
Expand Down
16 changes: 16 additions & 0 deletions credsweeper/rules/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,22 @@
target:
- code

- name: UUID
severity: info
confidence: strong
type: pattern
values:
- (?<![0-9A-Za-z_+-])(?P<value>[0-9A-Fa-f]{8}(-[0-9A-Fa-f]{4}){3}-[0-9A-Fa-f]{12})(?![=0-9A-Za-z_+-])
min_line_len: 36
required_substrings:
- "-"
required_regex: "[0-9A-Za-z_/+-]{15}"
filter_type:
- ValuePatternCheck
target:
- code
- doc

- name: AWS Client ID
severity: high
confidence: moderate
Expand Down
1 change: 1 addition & 0 deletions credsweeper/secret/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@
"check_for_literals": true,
"min_pattern_value_length": 12,
"min_keyword_value_length": 4,
"line_specific_key_offset": 16,
"line_data_output": [
"line",
"line_num",
Expand Down
8 changes: 8 additions & 0 deletions docs/source/credsweeper.filters.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,14 @@ credsweeper.filters.line\_specific\_key\_check module
:undoc-members:
:show-inheritance:

credsweeper.filters.separator\_unusual\_check module
----------------------------------------------------

.. automodule:: credsweeper.filters.separator_unusual_check
:members:
:undoc-members:
:show-inheritance:

credsweeper.filters.value\_allowlist\_check module
--------------------------------------------------

Expand Down
17 changes: 5 additions & 12 deletions experiment/main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,13 @@ now=$(date +%Y%m%d_%H%M%S)
RESULT_DIR=${CREDSWEEPER_DIR}/experiment/results
mkdir -vp ${RESULT_DIR}

${CREDSWEEPER_DIR}/.venv/bin/python main.py --data ~/w/CredData --jobs 32 | tee ${RESULT_DIR}/train.${now}.log
${CREDSWEEPER_DIR}/.venv/bin/python main.py --data ~/w/CredData --jobs $(nproc) | tee ${RESULT_DIR}/train.${now}.log
error_code=${PIPESTATUS}
if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi

cd ${CREDSWEEPER_DIR}
report_file=${RESULT_DIR}/${now}.json
${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --sort --path ~/q/DataCred/auxiliary/data/ --log error --job 32 --save-json ${report_file}
${CREDSWEEPER_DIR}/.venv/bin/python -m credsweeper --sort --path ~/q/DataCred/uuid/data/ --log info --job $(nproc) --save-json ${report_file}

cd ~/q/DataCred/auxiliary/
cd ~/q/DataCred/uuid/
.venv/bin/python -m benchmark --scanner credsweeper --load ${report_file} | tee ${report_file}.log

#last_tf_model=$(cat train.log | tail -n1)

#echo $last_tf_model

#pwd

#python -m tf2onnx.convert --saved-model results/$last_tf_model --output ../credsweeper/ml_model/ml_model.onnx --verbose

3 changes: 2 additions & 1 deletion experiment/src/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ def read_metadata(meta_dir: str) -> Dict[identifier, Dict]:
df.loc[df["GroundTruth"] == "Template", "GroundTruth"] = 'F'
for _, row in df.iterrows():
j += 1
if row["LineStart"] != row["LineEnd"] or any(x in row["Category"] for x in ["AWS Multi", "Google Multi"]):
if row["LineStart"] != row["LineEnd"] \
or all(x in ["AWS Multi", "Google Multi"] for x in row["Category"].split(':')):
# print(f"WARNING: skip not ml category {row['FilePath']},{line_start},{line_end}"
# f",{row['GroundTruth']},{row['Category']}")
continue
Expand Down
10 changes: 6 additions & 4 deletions experiment/src/prepare_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@ def execute_scanner(dataset_location: str, result_location_str, j):
"""Execute CredSweeper as a separate process to make sure no global states is shared with training script"""
dir_path = os.path.dirname(os.path.realpath(__file__)) + "/.."
command = f"{sys.executable} -m credsweeper --path {dataset_location}/data" \
f" --save-json {result_location_str} " \
f"--job {j} --sort --rules results/train_config.yaml --ml_threshold 0"
subprocess.check_call(command, shell=True, cwd=dir_path, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
f" --save-json {result_location_str} --log info" \
f" --job {j} --sort --rules results/train_config.yaml --ml_threshold 0"
error_code = subprocess.check_call(command, shell=True, cwd=dir_path)
if 0 != error_code:
sys.exit(error_code)


def prepare_train_data(cred_data_location: str, j: int):
Expand All @@ -20,7 +22,7 @@ def prepare_train_data(cred_data_location: str, j: int):
if not os.path.exists("train_config.yaml"):
# use pattern or keyword type
rules = Util.yaml_load("../credsweeper/rules/config.yaml")
new_rules = [x for x in rules if x.get("use_ml")]
new_rules = [x for x in rules if "code" in x.get("target") and x.get("type") in ["pattern", "keyword"]]
Util.yaml_dump(new_rules, "results/train_config.yaml")

if not os.path.exists("results/detected_data.json"):
Expand Down
10 changes: 5 additions & 5 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
from pathlib import Path

# total number of files in test samples
SAMPLES_FILES_COUNT: int = 130
SAMPLES_FILES_COUNT: int = 131

# the lowest value of ML threshold is used to display possible lowest values
NEGLIGIBLE_ML_THRESHOLD = 0.0001

# credentials count after scan
SAMPLES_CRED_COUNT: int = 431
SAMPLES_CRED_LINE_COUNT: int = 448
SAMPLES_CRED_COUNT: int = 432
SAMPLES_CRED_LINE_COUNT: int = 449

# credentials count after post-processing
SAMPLES_POST_CRED_COUNT: int = 389
SAMPLES_POST_CRED_COUNT: int = 390

# with option --doc
SAMPLES_IN_DOC = 410
SAMPLES_IN_DOC = 411

# archived credentials that are not found without --depth
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 25
Expand Down
10 changes: 5 additions & 5 deletions tests/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@
"sort_output": True,
"json_filename": "ml_threshold.json",
"ml_threshold": NEGLIGIBLE_ML_THRESHOLD
}, {
"__cred_count": SAMPLES_IN_DOC,
"sort_output": True,
"json_filename": "doc.json",
"doc": True
}, {
"__cred_count": SAMPLES_IN_DEEP_3,
"sort_output": True,
"json_filename": "depth_3.json",
"depth": 3
}, {
"__cred_count": SAMPLES_IN_DOC,
"sort_output": True,
"json_filename": "doc.json",
"doc": True
}]
27 changes: 27 additions & 0 deletions tests/data/depth_3.json
Original file line number Diff line number Diff line change
Expand Up @@ -12809,6 +12809,33 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "UUID",
"severity": "info",
"confidence": "strong",
"line_data_list": [
{
"line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
"line_num": 1,
"path": "tests/samples/uuid",
"info": "tests/samples/uuid|RAW",
"value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
"value_start": 0,
"value_end": 36,
"variable": null,
"variable_start": -2,
"variable_end": -2,
"entropy_validation": {
"iterator": "BASE36_CHARS",
"entropy": 3.2373263071270246,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
Expand Down
27 changes: 27 additions & 0 deletions tests/data/doc.json
Original file line number Diff line number Diff line change
Expand Up @@ -12929,6 +12929,33 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "UUID",
"severity": "info",
"confidence": "strong",
"line_data_list": [
{
"line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
"line_num": 1,
"path": "tests/samples/uuid",
"info": "tests/samples/uuid|RAW",
"value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
"value_start": 0,
"value_end": 36,
"variable": null,
"variable_start": -2,
"variable_end": -2,
"entropy_validation": {
"iterator": "BASE36_CHARS",
"entropy": 3.2373263071270246,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
Expand Down
27 changes: 27 additions & 0 deletions tests/data/ml_threshold.json
Original file line number Diff line number Diff line change
Expand Up @@ -11844,6 +11844,33 @@
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
"ml_probability": null,
"rule": "UUID",
"severity": "info",
"confidence": "strong",
"line_data_list": [
{
"line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp",
"line_num": 1,
"path": "tests/samples/uuid",
"info": "",
"value": "bace4d19-fa7e-beef-cafe-9129474bcd81",
"value_start": 0,
"value_end": 36,
"variable": null,
"variable_start": -2,
"variable_end": -2,
"entropy_validation": {
"iterator": "BASE36_CHARS",
"entropy": 3.2373263071270246,
"valid": true
}
}
]
},
{
"api_validation": "NOT_AVAILABLE",
"ml_validation": "NOT_AVAILABLE",
Expand Down
Loading

0 comments on commit 30044cf

Please sign in to comment.