From b37a302dbbd16a38e018559d8405009bb2131910 Mon Sep 17 00:00:00 2001 From: louis-paulvlx <90868690+louis-paulvlx@users.noreply.github.com> Date: Fri, 8 Nov 2024 15:30:09 +0100 Subject: [PATCH] 90-Bug-fix-file-encoding-box-integration (#96) Allow setting File Encoding in Box implementation --- src/koheesio/integrations/box.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/koheesio/integrations/box.py b/src/koheesio/integrations/box.py index 114fdc04..2b4c0e8f 100644 --- a/src/koheesio/integrations/box.py +++ b/src/koheesio/integrations/box.py @@ -362,7 +362,11 @@ class BoxReaderBase(Box, Reader, ABC): default_factory=dict, description="[Optional] Set of extra parameters that should be passed to the Spark reader.", ) - + + file_encoding: Optional[str] = Field( + default="utf-8", + description="[Optional] Set file encoding format. By default is utf-8." + ) class BoxCsvFileReader(BoxReaderBase): """ @@ -412,7 +416,7 @@ def execute(self) -> BoxReaderBase.Output: for f in self.file: self.log.debug(f"Reading contents of file with the ID '{f}' into Spark DataFrame") file = self.client.file(file_id=f) - data = file.content().decode("utf-8") + data = file.content().decode(self.file_encoding) data_buffer = StringIO(data) temp_df_pandas = pd.read_csv(data_buffer, header=0, dtype=str if not self.schema_ else None, **self.params) # type: ignore