Skip to content

Commit

Permalink
🐛 [#4795] Invert validation for .msg files
Browse files Browse the repository at this point in the history
The SDK cannot reliably determine which content type belongs to a .msg
file, most notably on Linux and MacOS because the extension is not in
the mime type database. This manifests as a file being uploaded with empty
content-type.

To allow these files to go through, the serializer must allow empty
values for the 'type' field which contains the detected content type,
and the backend must perform additional processing to determine the file
type. We can do this by falling back to the generic case of 'binary
file' (application/octet-stream) content type, and let libmagic figure
out which extensions belong to the magic bytes, i.e. we look at the
magic bytes to figure out what kind of file was provided, and we check
the provided file extensions against the list of valid extensions for
the detected file type.

Backport-of: #4961
  • Loading branch information
robinmolen authored and sergei-maertens committed Dec 30, 2024
1 parent 2d4faed commit 75e9299
Show file tree
Hide file tree
Showing 7 changed files with 69 additions and 31 deletions.
2 changes: 1 addition & 1 deletion src/openforms/conf/locale/nl/LC_MESSAGES/django.po
Original file line number Diff line number Diff line change
Expand Up @@ -5092,7 +5092,7 @@ msgid ""
"extension."
msgstr ""
"Het bestandstype kon niet bepaald worden. Controleer of de bestandsnaam met "
"een extensie eindigt (bijvoorbeel '.pdf' of '.png')."
"een extensie eindigt (bijvoorbeeld '.pdf' of '.png')."

#: openforms/formio/components/vanilla.py:366
#, python-brace-format
Expand Down
41 changes: 26 additions & 15 deletions src/openforms/formio/api/validators.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
from pathlib import Path
from typing import Iterable

from django.core.files.uploadedfile import UploadedFile
Expand Down Expand Up @@ -56,29 +57,40 @@ def __init__(self, allowed_mime_types: Iterable[str] | None = None):

def __call__(self, value: UploadedFile) -> None:
head = value.read(2048)
ext = value.name.split(".")[-1]
mime_type = magic.from_buffer(head, mime=True)
ext = Path(value.name or "").suffix[1:]
detected_mime_type = magic.from_buffer(head, mime=True)
provided_mime_type = value.content_type or "application/octet-stream"

# gh #2520
# application/x-ole-storage on Arch with shared-mime-info 2.0+155+gf4e7cbc-1
if mime_type in ["application/CDFV2", "application/x-ole-storage"]:
if detected_mime_type in ["application/CDFV2", "application/x-ole-storage"]:
whole_file = head + value.read()
mime_type = magic.from_buffer(whole_file, mime=True)
detected_mime_type = magic.from_buffer(whole_file, mime=True)

if mime_type == "image/heif":
mime_type = "image/heic"
if detected_mime_type == "image/heif":
detected_mime_type = "image/heic"

if not (
self.any_allowed
or mimetype_allowed(mime_type, self._regular_mimes, self._wildcard_mimes)
or mimetype_allowed(
detected_mime_type, self._regular_mimes, self._wildcard_mimes
)
):
raise serializers.ValidationError(
_("The provided file is not a valid file type.")
)

if not ext:
raise serializers.ValidationError(
_(
"Could not determine the file type. Please make sure the file name "
"has an extension."
)
)

# Contents is allowed. Do extension or submitted content_type agree?
if value.content_type == "application/octet-stream":
m = magic.Magic(extension=True)
if provided_mime_type == "application/octet-stream":
m = magic.Magic(extension=True) # pyright: ignore[reportCallIssue]
extensions = m.from_buffer(head).split("/")
# magic db doesn't know any more specific extension(s), so accept the
# file
Expand All @@ -101,27 +113,26 @@ def __call__(self, value: UploadedFile) -> None:
# If the file does not strictly follow the conventions of CSV (e.g. non-standard delimiters),
# may not be considered as a valid CSV.
elif (
value.content_type == "text/csv"
and mime_type == "text/plain"
provided_mime_type == "text/csv"
and detected_mime_type == "text/plain"
and ext == "csv"
):
return
elif mime_type == "image/heic" and value.content_type in (
elif detected_mime_type == "image/heic" and provided_mime_type in (
"image/heic",
"image/heif",
):
return

# gh #4658
# Windows use application/x-zip-compressed as a mimetype for .zip files, which
# is deprecated but still we need to support it. Instead, the common case for
# zip files is application/zip or application/zip-compressed mimetype.
elif mime_type == "application/zip" and value.content_type in (
elif detected_mime_type == "application/zip" and provided_mime_type in (
"application/zip-compressed",
"application/x-zip-compressed",
):
return
elif mime_type != value.content_type:
elif provided_mime_type != detected_mime_type:
raise serializers.ValidationError(
_("The provided file is not a {file_type}.").format(
filename=value.name, file_type=f".{ext}"
Expand Down
9 changes: 1 addition & 8 deletions src/openforms/formio/components/vanilla.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,14 +339,7 @@ class FileSerializer(serializers.Serializer):
originalName = serializers.CharField(trim_whitespace=False)
size = serializers.IntegerField(min_value=0)
storage = serializers.ChoiceField(choices=["url"])
type = serializers.CharField(
error_messages={
"blank": _(
"Could not determine the file type. Please make sure the file name "
"has an extension."
),
}
)
type = serializers.CharField(required=True, allow_blank=True)
url = serializers.URLField()
data = FileDataSerializer() # type: ignore

Expand Down
Binary file added src/openforms/formio/tests/files/test.msg
Binary file not shown.
34 changes: 34 additions & 0 deletions src/openforms/formio/tests/test_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,23 @@ def test_mime_type_inferred_from_magic(self):
except ValidationError as e:
self.fail(f"Valid file failed validation: {e}")

def test_unknown_file_type(self):
file = SimpleUploadedFile(
"unknown-type",
b"test",
content_type="application/octet-stream", # see e2e test SingleFileTests.test_unknown_file_type
)
validator = validators.MimeTypeValidator(
allowed_mime_types=None
) # allows any mime type

with self.assertRaisesMessage(
ValidationError,
"Could not determine the file type. Please make sure the file name "
"has an extension.",
):
validator(file)

def test_star_wildcard_in_allowed_mimetypes(self):
validator = validators.MimeTypeValidator({"*"})

Expand Down Expand Up @@ -202,6 +219,23 @@ def test_allowed_mime_types_for_csv_files(self):

validator(sample)

def test_allowed_mime_types_for_msg_files(self):
valid_type = "application/vnd.ms-outlook"
msg_file = TEST_FILES / "test.msg"
validator = validators.MimeTypeValidator(allowed_mime_types=[valid_type])

# 4795
# The sdk cannot determine the content_type for .msg files correctly.
# Because .msg is a windows specific file, and linux and MacOS don't know it.
# So we simulate the scenario where content_type is unknown
sample = SimpleUploadedFile(
name="test.msg",
content=msg_file.read_bytes(),
content_type="", # replicate the behaviour of the frontend
)

validator(sample)

def test_validate_files_multiple_mime_types(self):
"""Assert that validation of files associated with multiple mime types works
Expand Down
4 changes: 2 additions & 2 deletions src/openforms/formio/tests/validation/test_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,10 +602,10 @@ def test_attach_upload_validates_unknown_file_type(self):
}

is_valid, errors = validate_formio_data(component, data, submission=submission)
error = extract_error(errors["foo"][0], "type")
error = extract_error(errors["foo"][0], "non_field_errors")

self.assertFalse(is_valid)
self.assertEqual(error.code, "blank")
self.assertEqual(error.code, "invalid")
self.assertEqual(
error,
_(
Expand Down
10 changes: 5 additions & 5 deletions src/openforms/tests/e2e/test_input_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -955,8 +955,8 @@ def test_unknown_file_type(self):

# The frontend validation will *not* create a TemporaryFileUpload,
# as the frontend will block the upload because of the invalid file type.
# However the user could do an handcrafted API call.
# For this reason, we manually create an invalid TemporaryFileUpload
# However the user could do a handcrafted API call.
# For this reason, we manually try to create an invalid TemporaryFileUpload
# and use it for the `api_value`:

with open(TEST_FILES / "unknown-type", "rb") as infile:
Expand All @@ -971,7 +971,7 @@ def test_unknown_file_type(self):
ui_files=[TEST_FILES / "unknown-type"],
expected_ui_error=(
"Het bestandstype kon niet bepaald worden. Controleer of de "
"bestandsnaam met een extensie eindigt (bijvoorbeel '.pdf' of "
"bestandsnaam met een extensie eindigt (bijvoorbeeld '.pdf' of "
"'.png')."
),
api_value=[
Expand All @@ -982,8 +982,8 @@ def test_unknown_file_type(self):
],
)

# Make sure the frontend did not create one:
self.assertEqual(TemporaryFileUpload.objects.count(), 1)
# Make sure that no temporary files were created
self.assertEqual(TemporaryFileUpload.objects.count(), 0)


class SingleAddressNLTests(ValidationsTestCase):
Expand Down

0 comments on commit 75e9299

Please sign in to comment.