fix long headers not supported by playwright and selenium & add crop_top parameter
dexplo · Oct 24, 2024 · e057912 · e057912
1 parent 2aec126
commit e057912
Show file tree

Hide file tree

Showing 8 changed files with 196 additions and 48 deletions.
diff --git a/dataframe_image/_pandas_accessor.py b/dataframe_image/_pandas_accessor.py
@@ -77,6 +77,7 @@ def prepare_converter(
     chrome_path=None,
     dpi=None,
     use_mathjax=False,
+    crop_top=True,
 ):
     if table_conversion in BROWSER_CONVERTER_DICT:
         converter = BROWSER_CONVERTER_DICT[table_conversion](
@@ -85,7 +86,7 @@ def prepare_converter(
             chrome_path=chrome_path,
             fontsize=fontsize,
             encode_base64=False,
-            limit_crop=False,
+            crop_top=crop_top,
             device_scale_factor=(1 if dpi is None else dpi / 100.0),
             use_mathjax=use_mathjax,
         ).run
@@ -162,6 +163,8 @@ def generate_html(
         html = styler2html(obj)
     else:
         html = obj.to_html(max_rows=max_rows, max_cols=max_cols, notebook=True)
+    # wrap html with a div and add id `dfi_table`
+    html = f'<div id="dfi_table">{html}</div>'
     return html
 
 
@@ -188,7 +191,22 @@ def export(
     chrome_path=None,
     dpi=None,
     use_mathjax=False,
+    crop_top=True,
 ):
+    """export a DataFrame as png to a file
+
+    Args:
+        obj: DataFrame or Styler object, required
+        filename: str or file-like, required
+        fontsize: int, optional, default 14
+        max_rows: int, optional, default None
+        max_cols: int, optional, default None
+        table_conversion: str, optional, default 'chrome'
+        chrome_path: str, optional, default None
+        dpi: int, optional, default None
+        use_mathjax: bool, optional, default False
+        crop_top: bool, optional, crop the top of the generated image, default True
+    """
     converter = prepare_converter(
         filename,
         fontsize,
@@ -198,6 +216,7 @@ def export(
         chrome_path,
         dpi,
         use_mathjax,
+        crop_top=crop_top,
     )
     html = generate_html(obj, filename, max_rows, max_cols)
 
@@ -219,7 +238,22 @@ async def export_async(
     chrome_path=None,
     dpi=None,
     use_mathjax=False,
+    crop_top=True,
 ):
+    """asynchronously export a DataFrame as png to a file
+
+    Args:
+        obj: DataFrame or Styler object, required
+        filename: str or file-like, required
+        fontsize: int, optional, default 14
+        max_rows: int, optional, default None
+        max_cols: int, optional, default None
+        table_conversion: str, optional, default 'chrome'
+        chrome_path: str, optional, default None
+        dpi: int, optional, default None
+        use_mathjax: bool, optional, default False
+        crop_top: bool, optional, crop the top of the generated image, default True
+    """
     converter = prepare_converter(
         filename,
         fontsize,
@@ -229,6 +263,7 @@ async def export_async(
         chrome_path,
         dpi,
         use_mathjax,
+        crop_top=crop_top,
     )
     html = generate_html(obj, filename, max_rows, max_cols)
     with disable_max_image_pixels():
@@ -298,6 +333,13 @@ async def export_async(
     If `table_conversion`=`chrome`, the dpi value is converted to a 
     "device scale factor" but should provide the same effect. When `None`,
     the "device scale factor" is 1.
+use_mathjax : bool, default False
+    Use MathJax to render LaTeX in the DataFrame. This only works with 
+    `table_conversion` set to 'playwright', 'matplotlib' or 'selenium'.
+crop_top : bool, default True
+    Crop the top of the generated image. This is useful when the image
+    has a lot of white space above the DataFrame. Set it to False if you
+    think the image is being cropped too much.
 """
 
 export_intro = """

diff --git a/dataframe_image/converter/browser/base.py b/dataframe_image/converter/browser/base.py
@@ -25,7 +25,7 @@ def __init__(
         chrome_path: str = None,
         fontsize: int = 18,
         encode_base64: bool = True,
-        limit_crop: bool = True,
+        crop_top: bool = True,
         device_scale_factor: int = 1,
         use_mathjax: bool = False,
     ):
@@ -39,7 +39,7 @@ def __init__(
             chrome_path (str): Path to the Chrome executable. Default is None.
             fontsize (int): Font size. Default is 18.
             encode_base64 (bool): Whether to encode the image in base64. Default is True.
-            limit_crop (bool): Whether to limit the crop. Default is True.
+            crop_top (bool): Whether to crop the blank space at the top of the image. Default is True.
             device_scale_factor (int): Device scale factor. Default is 1.
             use_mathjax (bool): Whether to use MathJax for rendering. Default is False.
         """
@@ -49,10 +49,38 @@ def __init__(
         self.chrome_path = chrome_path
         self.fontsize = fontsize
         self.encode_base64 = encode_base64
-        self.limit_crop = limit_crop
+        self.crop_top = crop_top
         self.device_scale_factor = device_scale_factor
         self.use_mathjax = use_mathjax
 
+    def build_valid_html(self, html: str) -> str:
+        """
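+        Build a complete, valid HTML page around the table HTML.
+
+        Args:
+            html (str): The HTML fragment to place inside the page body.
+
+        Returns:
+            str: The full page HTML, with the CSS in <head> and the fragment in <body>.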
+        """
+        # <style>...</style> must be in the head
+        css_str = self.get_css()
+        # <div>...</div> must be in the body
+        table_div = html
+
+        page = f"""
+        <!DOCTYPE html>
+        <html>
+        <head>
+        {css_str}
+        </head>
+        <body>
+        {table_div}
+        </body>
+        </html>
+        """
+        return page
+
     def get_css(self) -> str:
         """
         Get the CSS for the HTML.
@@ -140,18 +168,23 @@ def crop(self, im: Image) -> Image:
         Returns:
             Image: The cropped image.
         """
-        # remove black
-        imrgb = im.convert("RGB")
-        imageBox = imrgb.getbbox()
-        im = im.crop(imageBox)
-
         # remove alpha channel
-        imrgb = im.convert("RGB")
-        # invert image (so that white is 0)
-        invert_im = ImageOps.invert(imrgb)
-        imageBox = invert_im.getbbox()
-        cropped = im.crop(imageBox)
-        return cropped
+        imrgb = ImageOps.invert(im.convert("RGB"))
+        imageBox = imrgb.getbbox()
+        # check whether the top row of imageBox is mostly non-white (e.g. a top border line)
+        top_line_np = np.array(
+            imrgb.crop((imageBox[0], imageBox[1], imageBox[2], imageBox[1] + 1))
+        )
+        ## the image was inverted, so white is 0; this is the fraction of non-white values in the top row
+        top_line_white_percent = (top_line_np != 0).mean()
+        ## if the top row is mostly non-white (a top border) and crop_top is set, crop to imageBox;
+        ## otherwise (no top border, or a caption on top) keep everything from y=0 down
+        if top_line_white_percent > 0.5 and self.crop_top:
+            im = im.crop(imageBox)
+        else:
+            im = im.crop((imageBox[0], 0, imageBox[2], imageBox[3]))
+
+        return im
 
     def run(self, html: str) -> bytes:
         """

diff --git a/dataframe_image/converter/browser/chrome_converter.py b/dataframe_image/converter/browser/chrome_converter.py
@@ -86,7 +86,7 @@ def __init__(
         chrome_path: str = None,
         fontsize: int = 18,
         encode_base64: bool = True,
-        limit_crop: bool = True,
+        crop_top: bool = True,
         device_scale_factor: int = 1,
         use_mathjax: bool = False,
     ):
@@ -97,7 +97,7 @@ def __init__(
             chrome_path,
             fontsize,
             encode_base64,
-            limit_crop,
+            crop_top,
             device_scale_factor,
             use_mathjax,
         )
@@ -152,9 +152,9 @@ def screenshot(self, html, ss_width=1400, ss_height=900) -> Image:
                     return self.screenshot(html, ss_width, ss_height)
                 else:
                     logger.warning(
-                        f"""Unable to enlarge image with Chrome, it is a known bug with version 111 and 112
-                        You could try to install an individual Chrome dev version and set chrome_path to it
-                        or try 'df.dfi.export('df.png', table_conversion="selenium")'"""
+                        """Unable to enlarge image with Chrome, it is a known bug with version 111 and 112
+                        You could try to install an individual Chrome dev version and set `chrome_path` to it
+                        or try 'df.dfi.export('df.png', table_conversion="playwright")'"""
                     )
             return im
 

diff --git a/dataframe_image/converter/browser/html2image_converter.py b/dataframe_image/converter/browser/html2image_converter.py
@@ -40,7 +40,7 @@ def screenshot(
                 return self.screenshot(html, ss_width, ss_height)
             else:
                 logger.warning(
-                    f"""Unable to enlarge image with Chrome, it is a known bug with version 111 and 112
+                    """Unable to enlarge image with Chrome, it is a known bug with version 111 and 112
                     You could try to install an individual Chrome dev version and set chrome_path to it
                     or try 'df.dfi.export('df.png', table_conversion="selenium")'"""
                 )

diff --git a/dataframe_image/converter/browser/playwright_converter.py b/dataframe_image/converter/browser/playwright_converter.py
@@ -1,4 +1,7 @@
+import math
 from io import BytesIO
+from pathlib import Path
+from tempfile import TemporaryDirectory
 
 from PIL import Image
 
@@ -15,11 +18,14 @@ def screenshot(self, html):
             raise ImportError(
                 "Playwright is not installed. Install it with 'pip install playwright' and make sure you have a chromium browser installed."
             ) from ex
+
         with sync_playwright() as p:
             channels = ["chrome", "msedge", None]
             for c in channels:
                 try:
-                    browser = p.chromium.launch(channel=c, args=["--disable-web-security"])
+                    browser = p.chromium.launch(
+                        channel=c, args=["--disable-web-security"]
+                    )
                     break
                 except Error:
                     pass
@@ -29,9 +35,19 @@ def screenshot(self, html):
                     "Or install it by `playwright install chromium`"
                 )
 
-            context = browser.new_context(device_scale_factor=self.device_scale_factor, bypass_csp=True)
+            context = browser.new_context(
+                device_scale_factor=self.device_scale_factor, bypass_csp=True
+            )
             page = context.new_page()
-            page.set_content(self.get_css() + html)
+            page.set_content(self.build_valid_html(html))
+            # get height and width for #dfi_table
+            locator = page.locator("#dfi_table table")
+            bbox = locator.bounding_box()
+            width = bbox["width"]
+            height = bbox["height"]
+            page.set_viewport_size(
+                {"width": math.ceil(width) + 20, "height": math.ceil(height) + 20}
+            )
             if self.use_mathjax:
                 mj = page.locator("mjx-container math")
                 try:
@@ -42,19 +58,18 @@ def screenshot(self, html):
                     )
                     pass
                 page.wait_for_timeout(200)
-            screenshot_bytes = page.screenshot(full_page=True)
+            screenshot_bytes = locator.screenshot()
         im = Image.open(BytesIO(screenshot_bytes))
         return im
 
 
 class AsyncPlayWrightConverter(BrowserConverter):
-
     async def run(self, html: str) -> bytes:
         im = await self.screenshot(html)
         temp_img = self.crop(im)
         image_bytes = self.finalize_image(temp_img)
         return image_bytes
-    
+
     async def screenshot(self, html):
         try:
             from playwright.async_api import Error, async_playwright
@@ -84,7 +99,14 @@ async def screenshot(self, html):
                 device_scale_factor=self.device_scale_factor, bypass_csp=True
             )
             page = await context.new_page()
-            await page.set_content(self.get_css() + html)
+            await page.set_content(self.build_valid_html(html))
+            locator = await page.locator("#dfi_table table")
+            bbox = locator.bounding_box()
+            width = bbox["width"]
+            height = bbox["height"]
+            await page.set_viewport_size(
+                {"width": math.ceil(width) + 20, "height": math.ceil(height) + 20}
+            )
             if self.use_mathjax:
                 mj = page.locator("mjx-container math")
                 try:
@@ -96,6 +118,6 @@ async def screenshot(self, html):
                     )
                     pass
                 page.wait_for_timeout(200)
-            screenshot_bytes = await page.screenshot(full_page=True)
+            screenshot_bytes = await locator.screenshot()
         im = Image.open(BytesIO(screenshot_bytes))
-        return im
+        return im
diff --git a/dataframe_image/converter/browser/selenium_converter.py b/dataframe_image/converter/browser/selenium_converter.py
@@ -41,13 +41,14 @@ def screenshot(self, html: str) -> Image:
             f.write(self.get_css() + html)
 
         with selenium.webdriver.Firefox(options=options, service=service) as driver:
-            driver.get(f"file://{str(temp_html)}")  # selenium will do the rest
+            driver.get(temp_html.as_uri())  # selenium will do the rest
 
+            # get "#dfi_table table" width and height
             required_width = driver.execute_script(
-                "return document.body.parentNode.scrollWidth"
+                "return document.querySelector('#dfi_table table').scrollWidth"
             )
             required_height = driver.execute_script(
-                "return document.body.parentNode.scrollHeight"
+                "return document.querySelector('#dfi_table table').scrollHeight"
             )
             driver.set_window_size(required_width + 150, required_height + 90)
             driver.save_screenshot(str(temp_img))

diff --git a/dataframe_image/converter/matplotlib_table.py b/dataframe_image/converter/matplotlib_table.py
@@ -19,14 +19,14 @@ def __init__(
         self,
         fontsize=14,
         encode_base64=True,
-        limit_crop=True,
+        crop_top=True,
         for_document=True,
         savefig_dpi=None,
         format="png",
     ):
         self.original_fontsize = fontsize
         self.encode_base64 = encode_base64
-        self.limit_crop = limit_crop
+        self.crop_top = crop_top
         self.for_document = for_document
         self.figwidth = 1
         self.figheight = 1