Merge pull request #29 from google/yanickf-dev

Yanickf dev
google · Feb 15, 2024 · 5051f89 · 5051f89
2 parents f68341e + f652e78
commit 5051f89
Show file tree

Hide file tree

Showing 71 changed files with 305 additions and 144 deletions.
diff --git a/python/magika/cli/magika.py b/python/magika/cli/magika.py
@@ -185,7 +185,7 @@ def main(
     # check CLI arguments and options
     if list_output_content_types:
         if len(files_paths) > 0:
-            _l.error("You cannot pass any path when using the -l / --list option")
+            _l.error("You cannot pass any path when using the -l / --list option.")
             sys.exit(1)
         print_output_content_types_list()
         sys.exit(0)
@@ -194,26 +194,48 @@ def main(
         _l.error("You need to pass at least one path, or - to read from stdin.")
         sys.exit(1)
 
+    read_from_stdin = False
+    for p in files_paths:
+        if str(p) == "-":
+            read_from_stdin = True
+        elif not p.exists():
+            _l.error(f'File or directory "{str(p)}" does not exist.')
+            sys.exit(1)
+    if read_from_stdin:
+        if len(files_paths) > 1:
+            _l.error('If you pass "-", you cannot pass anything else.')
+            sys.exit(1)
+        if recursive:
+            _l.error('If you pass "-", recursive scan is not meaningful.')
+            sys.exit(1)
+
     if batch_size <= 0 or batch_size > 512:
-        _l.error("Batch size needs to be greater than 0 and less or equal than 512")
+        _l.error("Batch size needs to be greater than 0 and less or equal than 512.")
         sys.exit(1)
 
     if json_output and jsonl_output:
-        _l.error("You should use either --json or --jsonl, not both")
+        _l.error("You should use either --json or --jsonl, not both.")
         sys.exit(1)
 
     if int(mime_output) + int(label_output) + int(magic_compatibility_mode) > 1:
-        _l.error("You should use only one of --mime, --label, --compatibility-mode")
+        _l.error("You should use only one of --mime, --label, --compatibility-mode.")
         sys.exit(1)
 
     if recursive:
         # recursively enumerate files within directories
         expanded_paths = []
         for p in files_paths:
-            if p.is_file():
-                expanded_paths.append(p)
-            elif p.is_dir():
-                expanded_paths.extend(sorted(p.rglob("*")))
+            if p.exists():
+                if p.is_file():
+                    expanded_paths.append(p)
+                elif p.is_dir():
+                    expanded_paths.extend(sorted(p.rglob("*")))
+            elif str(p) == "-":
+                # this is "read from stdin", that's OK
+                pass
+            else:
+                _l.error(f'File or directory "{str(p)}" does not exist.')
+                sys.exit(1)
         # the resulting list may still include some directories; thus, we filter them out.
         files_paths = list(filter(lambda x: not x.is_dir(), expanded_paths))
 

diff --git a/python/magika/config/content_types_config.json b/python/magika/config/content_types_config.json
@@ -127,12 +127,13 @@
     "appleplist": {
         "name": "appleplist",
         "extensions": [
+            "bplist",
             "plist"
         ],
         "mime_type": "application/x-plist",
         "group": "application",
         "magic": "Apple binary property list",
-        "description": "Android property list",
+        "description": "Apple property list",
         "vt_type": "appleplist",
         "datasets": [
             "vt-ext"
@@ -2342,12 +2343,13 @@
     "mp4": {
         "name": "mp4",
         "extensions": [
+            "mov",
             "mp4"
         ],
         "mime_type": "video/mp4",
         "group": "video",
         "magic": "ISO Media",
-        "description": "MP4 medial",
+        "description": "MP4 media file",
         "vt_type": null,
         "datasets": [
             "vt-ext"
@@ -4490,7 +4492,8 @@
     "xar": {
         "name": "xar",
         "extensions": [
-            "pkg"
+            "pkg",
+            "xar"
         ],
         "mime_type": "application/x-xar",
         "group": "archive",
@@ -4754,4 +4757,4 @@
         "in_scope_for_output_content_type": true,
         "in_scope_for_training": true
     }
-}
+}
diff --git a/python/magika/models/standard_v1/thresholds.json b/python/magika/models/standard_v1/thresholds.json
@@ -35,7 +35,7 @@
         "go": 0.95,
         "gzip": 0.95,
         "hlp": 0.95,
-        "html": 0.93,
+        "html": 0.75,
         "ico": 0.95,
         "ini": 0.55,
         "internetshortcut": 0.95,

diff --git a/python/tests/test_magika_python_cli.py b/python/tests/test_magika_python_cli.py
@@ -23,9 +23,7 @@
 from magika.content_types import ContentType, ContentTypesManager
 from magika.prediction_mode import PredictionMode
 from tests import utils
-from tests.utils_magika_python_client import (
-    run_magika_python_cli,
-)
+from tests.utils_magika_python_client import MagikaClientError, run_magika_python_cli
 
 
 @pytest.mark.smoketest
@@ -230,6 +228,15 @@ def test_magika_cli_with_basic_test_files() -> None:
         )
 
 
+def test_magika_cli_with_mitra_test_files() -> None:
+    test_files_paths = utils.get_mitra_test_files_paths()
+
+    stdout, stderr = run_magika_python_cli(test_files_paths)
+    utils.check_magika_cli_output_matches_expected_by_ext(
+        test_files_paths, stdout, stderr
+    )
+
+
 def test_magika_cli_with_basic_test_files_and_json_output() -> None:
     test_files_paths = utils.get_basic_test_files_paths()
 
@@ -442,7 +449,7 @@ def test_magika_cli_with_basic_test_files_and_different_prediction_modes() -> No
         )
 
         # Test with invalid prediction mode
-        with pytest.raises(subprocess.CalledProcessError):
+        with pytest.raises(MagikaClientError):
             _ = run_magika_python_cli(
                 test_files_paths[:n],
                 extra_cli_options=["--prediction-mode", "non-existing-mode"],
@@ -569,26 +576,43 @@ def test_magika_cli_with_bad_input() -> None:
     test_file_path = utils.get_one_basic_test_file_path()
 
     # Test without any argument or option
-    with pytest.raises(subprocess.CalledProcessError):
-        run_magika_python_cli([])
+    with pytest.raises(MagikaClientError) as e_info:
+        p = Path("/this/does/not/exist")
+        _ = run_magika_python_cli([])
+    assert e_info.value.stdout == ""
+    assert (
+        e_info.value.stderr
+        == "ERROR: You need to pass at least one path, or - to read from stdin.\n"
+    )
 
     # Test with file that does not exist
-    stdout, stderr = run_magika_python_cli(
-        [Path("/this/does/not/exist")], label_output=True
+    with pytest.raises(MagikaClientError) as e_info:
+        p = Path("/this/does/not/exist")
+        _ = run_magika_python_cli([p], label_output=True)
+    assert e_info.value.stdout == ""
+    assert (
+        e_info.value.stderr == f'ERROR: File or directory "{str(p)}" does not exist.\n'
     )
-    predicted_cts = utils.get_magika_cli_output_from_stdout_stderr(stdout, stderr)
-    assert len(predicted_cts) == 1
-    assert predicted_cts[0][1] == ContentType.FILE_DOES_NOT_EXIST
 
     # Test with incompatible list of options
-    with pytest.raises(subprocess.CalledProcessError):
-        run_magika_python_cli([test_file_path], json_output=True, jsonl_output=True)
+    with pytest.raises(MagikaClientError) as e_info:
+        _ = run_magika_python_cli([test_file_path], json_output=True, jsonl_output=True)
+    assert e_info.value.stdout == ""
+    assert (
+        e_info.value.stderr
+        == "ERROR: You should use either --json or --jsonl, not both.\n"
+    )
 
     # Test with an option does not exist
-    with pytest.raises(subprocess.CalledProcessError):
-        run_magika_python_cli(
+    with pytest.raises(MagikaClientError) as e_info:
+        _ = run_magika_python_cli(
             [test_file_path], extra_cli_options=["--non-existing-option"]
         )
+    assert e_info.value.stdout == ""
+    error_lines = e_info.value.stderr.split("\n")
+    assert error_lines[0].startswith("Usage: magika [OPTIONS] [FILE]...")
+    assert error_lines[-2].startswith("Error: No such option:")
+    assert error_lines[-1] == ""
 
 
 def test_magika_cli_with_reading_from_stdin() -> None:
@@ -614,6 +638,21 @@ def test_magika_cli_with_reading_from_stdin() -> None:
     assert str(entry["path"]) == "-"
     assert entry["output"]["ct_label"] in true_cts_names
 
+    # test with some bad input
+    cmd = f"cat {str(test_file_path)} | magika - {str(test_file_path)}"
+    p = subprocess.run(cmd, capture_output=True, text=True, check=False, shell=True)
+    assert p.returncode == 1
+    assert p.stdout == ""
+    assert p.stderr.find('ERROR: If you pass "-", you cannot pass anything else.') >= 0
+
+    cmd = f"cat {str(test_file_path)} | magika - -r"
+    p = subprocess.run(cmd, capture_output=True, text=True, check=False, shell=True)
+    assert p.returncode == 1
+    assert p.stdout == ""
+    assert (
+        p.stderr.find('ERROR: If you pass "-", recursive scan is not meaningful.') >= 0
+    )
+
 
 def test_magika_cli_with_colors() -> None:
     test_file_path = utils.get_one_basic_test_file_path()
@@ -708,5 +747,5 @@ def test_magika_cli_list_content_types() -> None:
     assert header.find("Description") >= 0
     assert stderr == ""
 
-    with pytest.raises(subprocess.CalledProcessError):
-        run_magika_python_cli([test_file_path], list_output_content_types=True)
+    with pytest.raises(MagikaClientError):
+        _ = run_magika_python_cli([test_file_path], list_output_content_types=True)
diff --git a/python/tests/utils.py b/python/tests/utils.py
@@ -43,12 +43,24 @@ def get_basic_tests_files_dir() -> Path:
     return tests_files_dir
 
 
+def get_mitra_tests_files_dir() -> Path:
+    tests_files_dir = get_tests_data_dir() / "mitra"
+    assert tests_files_dir.is_dir()
+    return tests_files_dir
+
+
 def get_basic_test_files_paths() -> List[Path]:
     tests_files_dir = get_basic_tests_files_dir()
     test_files_paths = sorted(filter(lambda p: p.is_file(), tests_files_dir.iterdir()))
     return test_files_paths
 
 
+def get_mitra_test_files_paths() -> List[Path]:
+    tests_files_dir = get_mitra_tests_files_dir()
+    test_files_paths = sorted(filter(lambda p: p.is_file(), tests_files_dir.iterdir()))
+    return test_files_paths
+
+
 def get_one_basic_test_file_path() -> Path:
     return get_basic_test_files_paths()[0]
 
@@ -118,14 +130,15 @@ def check_magika_cli_output_matches_expected_by_ext(
     for file_path, output in predicted_cts:
         remaining_samples_paths.remove(file_path)
         file_ext = file_path.suffix.lstrip(".")
-        true_cts = ctm.get_cts_by_ext(file_ext)
-        if len(true_cts) == 0:
-            # We could not find the content type from the extension.  In this
-            # case, we assume this is a test file path with the
-            # <dataset>/<content type>/<hash> pattern
+        if file_ext != "":
+            true_cts = ctm.get_cts_by_ext(file_ext)
+        else:
+            # The test file does not have any extension. In this case, we assume
+            # this is a test file path with the <dataset>/<content type>/<hash>
+            # pattern.
             true_ct_name = file_path.parent.name
             true_cts = [ctm.get_or_raise(true_ct_name)]
-        assert len(true_cts) > 0
+        assert len(true_cts) > 0, f'File extension: "{file_ext}"'
 
         true_cts_names = [ct.name for ct in true_cts]
 
@@ -151,7 +164,9 @@ def check_magika_cli_output_matches_expected_by_ext(
                     f"{ctm.get_description(ct.name)} ({ctm.get_group(ct.name)})"
                     for ct in true_cts
                 ]
-            assert output in expected_outputs
+            assert (
+                output in expected_outputs
+            ), f'Output: "{output}", expected output: "{expected_outputs}"'
 
     # Check that all input samples have been scanned
     assert len(remaining_samples_paths) == 0

diff --git a/python/tests/utils_magika_python_client.py b/python/tests/utils_magika_python_client.py
@@ -17,6 +17,12 @@
 from typing import List, Optional, Tuple
 
 
+class MagikaClientError(Exception):
+    def __init__(self, stdout: str, stderr: str):
+        self.stdout = stdout
+        self.stderr = stderr
+
+
 def run_magika_python_cli(
     samples_paths: List[Path],
     json_output: bool = False,
@@ -72,5 +78,9 @@ def run_magika_python_cli(
     if extra_cli_options is not None:
         cmd.extend(extra_cli_options)
 
-    p = subprocess.run(cmd, capture_output=True, text=True, check=True)
-    return p.stdout, p.stderr
+    p = subprocess.run(cmd, capture_output=True, text=True, check=False)
+
+    if p.returncode == 0:
+        return p.stdout, p.stderr
+    else:
+        raise MagikaClientError(stdout=p.stdout, stderr=p.stderr)
diff --git a/tests_data/README.md b/tests_data/README.md
@@ -0,0 +1,9 @@
+# Tests Data
+
+We use these files for regression testing.
+
+These files were not (and should not be) used for training purposes.
+
+They are organized by directory:
+- `basic/`: a number of simple files of various content types.
+- `mitra/`: a selection of the files available at [https://github.com/corkami/mitra](https://github.com/corkami/mitra/tree/master/input).