Skip to content

Commit

Permalink
Merge pull request #29 from google/yanickf-dev
Browse files Browse the repository at this point in the history
Yanickf dev
  • Loading branch information
reyammer authored Feb 15, 2024
2 parents f68341e + f652e78 commit 5051f89
Show file tree
Hide file tree
Showing 71 changed files with 305 additions and 144 deletions.
38 changes: 30 additions & 8 deletions python/magika/cli/magika.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def main(
# check CLI arguments and options
if list_output_content_types:
if len(files_paths) > 0:
_l.error("You cannot pass any path when using the -l / --list option")
_l.error("You cannot pass any path when using the -l / --list option.")
sys.exit(1)
print_output_content_types_list()
sys.exit(0)
Expand All @@ -194,26 +194,48 @@ def main(
_l.error("You need to pass at least one path, or - to read from stdin.")
sys.exit(1)

read_from_stdin = False
for p in files_paths:
if str(p) == "-":
read_from_stdin = True
elif not p.exists():
_l.error(f'File or directory "{str(p)}" does not exist.')
sys.exit(1)
if read_from_stdin:
if len(files_paths) > 1:
_l.error('If you pass "-", you cannot pass anything else.')
sys.exit(1)
if recursive:
_l.error('If you pass "-", recursive scan is not meaningful.')
sys.exit(1)

if batch_size <= 0 or batch_size > 512:
_l.error("Batch size needs to be greater than 0 and less or equal than 512")
_l.error("Batch size needs to be greater than 0 and less or equal than 512.")
sys.exit(1)

if json_output and jsonl_output:
_l.error("You should use either --json or --jsonl, not both")
_l.error("You should use either --json or --jsonl, not both.")
sys.exit(1)

if int(mime_output) + int(label_output) + int(magic_compatibility_mode) > 1:
_l.error("You should use only one of --mime, --label, --compatibility-mode")
_l.error("You should use only one of --mime, --label, --compatibility-mode.")
sys.exit(1)

if recursive:
# recursively enumerate files within directories
expanded_paths = []
for p in files_paths:
if p.is_file():
expanded_paths.append(p)
elif p.is_dir():
expanded_paths.extend(sorted(p.rglob("*")))
if p.exists():
if p.is_file():
expanded_paths.append(p)
elif p.is_dir():
expanded_paths.extend(sorted(p.rglob("*")))
elif str(p) == "-":
# this is "read from stdin", that's OK
pass
else:
_l.error(f'File or directory "{str(p)}" does not exist.')
sys.exit(1)
# the resulting list may still include some directories; thus, we filter them out.
files_paths = list(filter(lambda x: not x.is_dir(), expanded_paths))

Expand Down
11 changes: 7 additions & 4 deletions python/magika/config/content_types_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -127,12 +127,13 @@
"appleplist": {
"name": "appleplist",
"extensions": [
"bplist",
"plist"
],
"mime_type": "application/x-plist",
"group": "application",
"magic": "Apple binary property list",
"description": "Android property list",
"description": "Apple property list",
"vt_type": "appleplist",
"datasets": [
"vt-ext"
Expand Down Expand Up @@ -2342,12 +2343,13 @@
"mp4": {
"name": "mp4",
"extensions": [
"mov",
"mp4"
],
"mime_type": "video/mp4",
"group": "video",
"magic": "ISO Media",
"description": "MP4 medial",
"description": "MP4 media file",
"vt_type": null,
"datasets": [
"vt-ext"
Expand Down Expand Up @@ -4490,7 +4492,8 @@
"xar": {
"name": "xar",
"extensions": [
"pkg"
"pkg",
"xar"
],
"mime_type": "application/x-xar",
"group": "archive",
Expand Down Expand Up @@ -4754,4 +4757,4 @@
"in_scope_for_output_content_type": true,
"in_scope_for_training": true
}
}
}
2 changes: 1 addition & 1 deletion python/magika/models/standard_v1/thresholds.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
"go": 0.95,
"gzip": 0.95,
"hlp": 0.95,
"html": 0.93,
"html": 0.75,
"ico": 0.95,
"ini": 0.55,
"internetshortcut": 0.95,
Expand Down
73 changes: 56 additions & 17 deletions python/tests/test_magika_python_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@
from magika.content_types import ContentType, ContentTypesManager
from magika.prediction_mode import PredictionMode
from tests import utils
from tests.utils_magika_python_client import (
run_magika_python_cli,
)
from tests.utils_magika_python_client import MagikaClientError, run_magika_python_cli


@pytest.mark.smoketest
Expand Down Expand Up @@ -230,6 +228,15 @@ def test_magika_cli_with_basic_test_files() -> None:
)


def test_magika_cli_with_mitra_test_files() -> None:
    """Run the CLI on the mitra test files and validate its output by extension."""
    mitra_paths = utils.get_mitra_test_files_paths()

    cli_stdout, cli_stderr = run_magika_python_cli(mitra_paths)

    # The checker receives the same list of paths that was handed to the CLI.
    utils.check_magika_cli_output_matches_expected_by_ext(
        mitra_paths,
        cli_stdout,
        cli_stderr,
    )


def test_magika_cli_with_basic_test_files_and_json_output() -> None:
test_files_paths = utils.get_basic_test_files_paths()

Expand Down Expand Up @@ -442,7 +449,7 @@ def test_magika_cli_with_basic_test_files_and_different_prediction_modes() -> No
)

# Test with invalid prediction mode
with pytest.raises(subprocess.CalledProcessError):
with pytest.raises(MagikaClientError):
_ = run_magika_python_cli(
test_files_paths[:n],
extra_cli_options=["--prediction-mode", "non-existing-mode"],
Expand Down Expand Up @@ -569,26 +576,43 @@ def test_magika_cli_with_bad_input() -> None:
test_file_path = utils.get_one_basic_test_file_path()

# Test without any argument or option
with pytest.raises(subprocess.CalledProcessError):
run_magika_python_cli([])
with pytest.raises(MagikaClientError) as e_info:
p = Path("/this/does/not/exist")
_ = run_magika_python_cli([])
assert e_info.value.stdout == ""
assert (
e_info.value.stderr
== "ERROR: You need to pass at least one path, or - to read from stdin.\n"
)

# Test with file that does not exist
stdout, stderr = run_magika_python_cli(
[Path("/this/does/not/exist")], label_output=True
with pytest.raises(MagikaClientError) as e_info:
p = Path("/this/does/not/exist")
_ = run_magika_python_cli([p], label_output=True)
assert e_info.value.stdout == ""
assert (
e_info.value.stderr == f'ERROR: File or directory "{str(p)}" does not exist.\n'
)
predicted_cts = utils.get_magika_cli_output_from_stdout_stderr(stdout, stderr)
assert len(predicted_cts) == 1
assert predicted_cts[0][1] == ContentType.FILE_DOES_NOT_EXIST

# Test with incompatible list of options
with pytest.raises(subprocess.CalledProcessError):
run_magika_python_cli([test_file_path], json_output=True, jsonl_output=True)
with pytest.raises(MagikaClientError) as e_info:
_ = run_magika_python_cli([test_file_path], json_output=True, jsonl_output=True)
assert e_info.value.stdout == ""
assert (
e_info.value.stderr
== "ERROR: You should use either --json or --jsonl, not both.\n"
)

# Test with an option that does not exist
with pytest.raises(subprocess.CalledProcessError):
run_magika_python_cli(
with pytest.raises(MagikaClientError) as e_info:
_ = run_magika_python_cli(
[test_file_path], extra_cli_options=["--non-existing-option"]
)
assert e_info.value.stdout == ""
error_lines = e_info.value.stderr.split("\n")
assert error_lines[0].startswith("Usage: magika [OPTIONS] [FILE]...")
assert error_lines[-2].startswith("Error: No such option:")
assert error_lines[-1] == ""


def test_magika_cli_with_reading_from_stdin() -> None:
Expand All @@ -614,6 +638,21 @@ def test_magika_cli_with_reading_from_stdin() -> None:
assert str(entry["path"]) == "-"
assert entry["output"]["ct_label"] in true_cts_names

# test with some bad input
cmd = f"cat {str(test_file_path)} | magika - {str(test_file_path)}"
p = subprocess.run(cmd, capture_output=True, text=True, check=False, shell=True)
assert p.returncode == 1
assert p.stdout == ""
assert p.stderr.find('ERROR: If you pass "-", you cannot pass anything else.') >= 0

cmd = f"cat {str(test_file_path)} | magika - -r"
p = subprocess.run(cmd, capture_output=True, text=True, check=False, shell=True)
assert p.returncode == 1
assert p.stdout == ""
assert (
p.stderr.find('ERROR: If you pass "-", recursive scan is not meaningful.') >= 0
)


def test_magika_cli_with_colors() -> None:
test_file_path = utils.get_one_basic_test_file_path()
Expand Down Expand Up @@ -708,5 +747,5 @@ def test_magika_cli_list_content_types() -> None:
assert header.find("Description") >= 0
assert stderr == ""

with pytest.raises(subprocess.CalledProcessError):
run_magika_python_cli([test_file_path], list_output_content_types=True)
with pytest.raises(MagikaClientError):
_ = run_magika_python_cli([test_file_path], list_output_content_types=True)
29 changes: 22 additions & 7 deletions python/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,24 @@ def get_basic_tests_files_dir() -> Path:
return tests_files_dir


def get_mitra_tests_files_dir() -> Path:
    """Return the "mitra" subdirectory of the tests data directory.

    Asserts that the resulting path is an existing directory.
    """
    mitra_dir = get_tests_data_dir() / "mitra"
    assert mitra_dir.is_dir()
    return mitra_dir


def get_basic_test_files_paths() -> List[Path]:
    """Return the sorted paths of the regular files directly inside the basic tests dir."""
    basic_dir = get_basic_tests_files_dir()
    # Only direct children are considered; anything that is not a file is dropped.
    return sorted(entry for entry in basic_dir.iterdir() if entry.is_file())


def get_mitra_test_files_paths() -> List[Path]:
    """Return the sorted paths of the regular files directly inside the mitra tests dir."""
    mitra_dir = get_mitra_tests_files_dir()
    # Only direct children are considered; anything that is not a file is dropped.
    return sorted(entry for entry in mitra_dir.iterdir() if entry.is_file())


def get_one_basic_test_file_path() -> Path:
    """Return the first entry of the list produced by get_basic_test_files_paths()."""
    basic_paths = get_basic_test_files_paths()
    return basic_paths[0]

Expand Down Expand Up @@ -118,14 +130,15 @@ def check_magika_cli_output_matches_expected_by_ext(
for file_path, output in predicted_cts:
remaining_samples_paths.remove(file_path)
file_ext = file_path.suffix.lstrip(".")
true_cts = ctm.get_cts_by_ext(file_ext)
if len(true_cts) == 0:
# We could not find the content type from the extension. In this
# case, we assume this is a test file path with the
# <dataset>/<content type>/<hash> pattern
if file_ext != "":
true_cts = ctm.get_cts_by_ext(file_ext)
else:
# The test file does not have any extension. In this case, we assume
# this is a test file path with the <dataset>/<content type>/<hash>
# pattern.
true_ct_name = file_path.parent.name
true_cts = [ctm.get_or_raise(true_ct_name)]
assert len(true_cts) > 0
assert len(true_cts) > 0, f'File extension: "{file_ext}"'

true_cts_names = [ct.name for ct in true_cts]

Expand All @@ -151,7 +164,9 @@ def check_magika_cli_output_matches_expected_by_ext(
f"{ctm.get_description(ct.name)} ({ctm.get_group(ct.name)})"
for ct in true_cts
]
assert output in expected_outputs
assert (
output in expected_outputs
), f'Output: "{output}", expected output: "{expected_outputs}"'

# Check that all input samples have been scanned
assert len(remaining_samples_paths) == 0
Expand Down
14 changes: 12 additions & 2 deletions python/tests/utils_magika_python_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@
from typing import List, Optional, Tuple


class MagikaClientError(Exception):
    """Error carrying the captured output of a failed magika CLI invocation.

    Attributes:
        stdout: Text the CLI wrote to standard output.
        stderr: Text the CLI wrote to standard error.
    """

    def __init__(self, stdout: str, stderr: str) -> None:
        # Forward the values to Exception so that `args` is populated even when
        # the caller uses keyword arguments; otherwise `args` would be empty.
        super().__init__(stdout, stderr)
        self.stdout = stdout
        self.stderr = stderr

    def __str__(self) -> str:
        # Surface the captured output in tracebacks and test failure reports
        # instead of an empty or tuple-shaped message.
        return f"magika CLI failed; stdout={self.stdout!r}, stderr={self.stderr!r}"


def run_magika_python_cli(
samples_paths: List[Path],
json_output: bool = False,
Expand Down Expand Up @@ -72,5 +78,9 @@ def run_magika_python_cli(
if extra_cli_options is not None:
cmd.extend(extra_cli_options)

p = subprocess.run(cmd, capture_output=True, text=True, check=True)
return p.stdout, p.stderr
p = subprocess.run(cmd, capture_output=True, text=True, check=False)

if p.returncode == 0:
return p.stdout, p.stderr
else:
raise MagikaClientError(stdout=p.stdout, stderr=p.stderr)
9 changes: 9 additions & 0 deletions tests_data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Tests Data

We use these files for regression testing.

These files were not used for training, and they should never be used for that purpose.

They are organized by directory:
- `basic/`: a number of simple files of various content types.
- `mitra/`: a selection of the files available at [https://github.com/corkami/mitra](https://github.com/corkami/mitra/tree/master/input).
Loading

0 comments on commit 5051f89

Please sign in to comment.