Skip to content

Commit

Permalink
fix long headers not supported by playwright and selenium & add crop_…
Browse files Browse the repository at this point in the history
…top parameter
  • Loading branch information
John Lyu committed Oct 24, 2024
1 parent 2aec126 commit e057912
Show file tree
Hide file tree
Showing 8 changed files with 196 additions and 48 deletions.
44 changes: 43 additions & 1 deletion dataframe_image/_pandas_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def prepare_converter(
chrome_path=None,
dpi=None,
use_mathjax=False,
crop_top=True,
):
if table_conversion in BROWSER_CONVERTER_DICT:
converter = BROWSER_CONVERTER_DICT[table_conversion](
Expand All @@ -85,7 +86,7 @@ def prepare_converter(
chrome_path=chrome_path,
fontsize=fontsize,
encode_base64=False,
limit_crop=False,
crop_top=crop_top,
device_scale_factor=(1 if dpi is None else dpi / 100.0),
use_mathjax=use_mathjax,
).run
Expand Down Expand Up @@ -162,6 +163,8 @@ def generate_html(
html = styler2html(obj)
else:
html = obj.to_html(max_rows=max_rows, max_cols=max_cols, notebook=True)
# wrap html with a div and add id `dfi_table`
html = f'<div id="dfi_table">{html}</div>'
return html


Expand All @@ -188,7 +191,22 @@ def export(
chrome_path=None,
dpi=None,
use_mathjax=False,
crop_top=True,
):
"""export a DataFrame as png to a file
Args:
obj: DataFrame or Styler object, required
filename: str or file-like, required
fontsize: int, optional, default 14
max_rows: int, optional, default None
max_cols: int, optional, default None
table_conversion: str, optional, default 'chrome'
chrome_path: str, optional, default None
dpi: int, optional, default None
use_mathjax: bool, optional, default False
crop_top: bool, optional, crop the top of the generated image, default True
"""
converter = prepare_converter(
filename,
fontsize,
Expand All @@ -198,6 +216,7 @@ def export(
chrome_path,
dpi,
use_mathjax,
crop_top=crop_top,
)
html = generate_html(obj, filename, max_rows, max_cols)

Expand All @@ -219,7 +238,22 @@ async def export_async(
chrome_path=None,
dpi=None,
use_mathjax=False,
crop_top=True,
):
"""export a DataFrame as png to a file
Args:
obj: DataFrame or Styler object, required
filename: str or file-like, required
fontsize: int, optional, default 14
max_rows: int, optional, default None
max_cols: int, optional, default None
table_conversion: str, optional, default 'chrome'
chrome_path: str, optional, default None
dpi: int, optional, default None
use_mathjax: bool, optional, default False
crop_top: bool, optional, crop the top of the generated image, default True
"""
converter = prepare_converter(
filename,
fontsize,
Expand All @@ -229,6 +263,7 @@ async def export_async(
chrome_path,
dpi,
use_mathjax,
crop_top=crop_top,
)
html = generate_html(obj, filename, max_rows, max_cols)
with disable_max_image_pixels():
Expand Down Expand Up @@ -298,6 +333,13 @@ async def export_async(
If `table_conversion`=`chrome`, the dpi value is converted to a
"device scale factor" but should provide the same effect. When `None`,
the "device scale factor" is 1.
use_mathjax : bool, default False
Use MathJax to render LaTeX in the DataFrame. This only works with
`table_conversion` set to 'playwright', 'matplotlib' or 'selenium'.
crop_top : bool, default True
Crop the top of the generated image. This is useful when the DataFrame
has a lot of white space at the top of the image. You can set it
to False if you think the image is being cropped too much.
"""

export_intro = """
Expand Down
61 changes: 47 additions & 14 deletions dataframe_image/converter/browser/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def __init__(
chrome_path: str = None,
fontsize: int = 18,
encode_base64: bool = True,
limit_crop: bool = True,
crop_top: bool = True,
device_scale_factor: int = 1,
use_mathjax: bool = False,
):
Expand All @@ -39,7 +39,7 @@ def __init__(
chrome_path (str): Path to the Chrome executable. Default is None.
fontsize (int): Font size. Default is 18.
encode_base64 (bool): Whether to encode the image in base64. Default is True.
limit_crop (bool): Whether to limit the crop. Default is True.
crop_top (bool): Whether to crop blank space from the top of the image. Default is True.
device_scale_factor (int): Device scale factor. Default is 1.
use_mathjax (bool): Whether to use MathJax for rendering. Default is False.
"""
Expand All @@ -49,10 +49,38 @@ def __init__(
self.chrome_path = chrome_path
self.fontsize = fontsize
self.encode_base64 = encode_base64
self.limit_crop = limit_crop
self.crop_top = crop_top
self.device_scale_factor = device_scale_factor
self.use_mathjax = use_mathjax

def build_valid_html(self, html: str) -> str:
"""
Build a valid page HTML.
Args:
html (str): The HTML to build.
Returns:
str: The valid HTML string.
"""
# <style>...</style> must be in the head
css_str = self.get_css()
# <div>...</div> must be in the body
table_div = html

page = f"""
<!DOCTYPE html>
<html>
<head>
{css_str}
</head>
<body>
{table_div}
</body>
</html>
"""
return page

def get_css(self) -> str:
"""
Get the CSS for the HTML.
Expand Down Expand Up @@ -140,18 +168,23 @@ def crop(self, im: Image) -> Image:
Returns:
Image: The cropped image.
"""
# remove black
imrgb = im.convert("RGB")
imageBox = imrgb.getbbox()
im = im.crop(imageBox)

# remove alpha channel
imrgb = im.convert("RGB")
# invert image (so that white is 0)
invert_im = ImageOps.invert(imrgb)
imageBox = invert_im.getbbox()
cropped = im.crop(imageBox)
return cropped
imrgb = ImageOps.invert(im.convert("RGB"))
imageBox = imrgb.getbbox()
# check whether most pixels in the top row of imageBox are non-white
top_line_np = np.array(
imrgb.crop((imageBox[0], imageBox[1], imageBox[2], imageBox[1] + 1))
)
## convert top_line_np to a boolean array; the image is inverted, so non-white pixels are 1
top_line_white_percent = (top_line_np != 0).mean()
## some df has no top border, or top is caption, so we need to crop top from 0
## else we crop top from imageBox
if top_line_white_percent > 0.5 and self.crop_top:
im = im.crop(imageBox)
else:
im = im.crop((imageBox[0], 0, imageBox[2], imageBox[3]))

return im

def run(self, html: str) -> bytes:
"""
Expand Down
10 changes: 5 additions & 5 deletions dataframe_image/converter/browser/chrome_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def __init__(
chrome_path: str = None,
fontsize: int = 18,
encode_base64: bool = True,
limit_crop: bool = True,
crop_top: bool = True,
device_scale_factor: int = 1,
use_mathjax: bool = False,
):
Expand All @@ -97,7 +97,7 @@ def __init__(
chrome_path,
fontsize,
encode_base64,
limit_crop,
crop_top,
device_scale_factor,
use_mathjax,
)
Expand Down Expand Up @@ -152,9 +152,9 @@ def screenshot(self, html, ss_width=1400, ss_height=900) -> Image:
return self.screenshot(html, ss_width, ss_height)
else:
logger.warning(
f"""Unable to enlarge image with Chrome, it is a known bug with version 111 and 112
You could try to install an individual Chrome dev version and set chrome_path to it
or try 'df.dfi.export('df.png', table_conversion="selenium")'"""
"""Unable to enlarge image with Chrome, it is a known bug with version 111 and 112
You could try to install an individual Chrome dev version and set `chrome_path` to it
or try 'df.dfi.export('df.png', table_conversion="playwright")'"""
)
return im

Expand Down
2 changes: 1 addition & 1 deletion dataframe_image/converter/browser/html2image_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def screenshot(
return self.screenshot(html, ss_width, ss_height)
else:
logger.warning(
f"""Unable to enlarge image with Chrome, it is a known bug with version 111 and 112
"""Unable to enlarge image with Chrome, it is a known bug with version 111 and 112
You could try to install an individual Chrome dev version and set chrome_path to it
or try 'df.dfi.export('df.png', table_conversion="selenium")'"""
)
Expand Down
40 changes: 31 additions & 9 deletions dataframe_image/converter/browser/playwright_converter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import math
from io import BytesIO
from pathlib import Path
from tempfile import TemporaryDirectory

from PIL import Image

Expand All @@ -15,11 +18,14 @@ def screenshot(self, html):
raise ImportError(
"Playwright is not installed. Install it with 'pip install playwright' and make sure you have a chromium browser installed."
) from ex

with sync_playwright() as p:
channels = ["chrome", "msedge", None]
for c in channels:
try:
browser = p.chromium.launch(channel=c, args=["--disable-web-security"])
browser = p.chromium.launch(
channel=c, args=["--disable-web-security"]
)
break
except Error:
pass
Expand All @@ -29,9 +35,19 @@ def screenshot(self, html):
"Or install it by `playwright install chromium`"
)

context = browser.new_context(device_scale_factor=self.device_scale_factor, bypass_csp=True)
context = browser.new_context(
device_scale_factor=self.device_scale_factor, bypass_csp=True
)
page = context.new_page()
page.set_content(self.get_css() + html)
page.set_content(self.build_valid_html(html))
# get height and width for #dfi_table
locator = page.locator("#dfi_table table")
bbox = locator.bounding_box()
width = bbox["width"]
height = bbox["height"]
page.set_viewport_size(
{"width": math.ceil(width) + 20, "height": math.ceil(height) + 20}
)
if self.use_mathjax:
mj = page.locator("mjx-container math")
try:
Expand All @@ -42,19 +58,18 @@ def screenshot(self, html):
)
pass
page.wait_for_timeout(200)
screenshot_bytes = page.screenshot(full_page=True)
screenshot_bytes = locator.screenshot()
im = Image.open(BytesIO(screenshot_bytes))
return im


class AsyncPlayWrightConverter(BrowserConverter):

async def run(self, html: str) -> bytes:
im = await self.screenshot(html)
temp_img = self.crop(im)
image_bytes = self.finalize_image(temp_img)
return image_bytes

async def screenshot(self, html):
try:
from playwright.async_api import Error, async_playwright
Expand Down Expand Up @@ -84,7 +99,14 @@ async def screenshot(self, html):
device_scale_factor=self.device_scale_factor, bypass_csp=True
)
page = await context.new_page()
await page.set_content(self.get_css() + html)
await page.set_content(self.build_valid_html(html))
locator = await page.locator("#dfi_table table")
bbox = locator.bounding_box()
width = bbox["width"]
height = bbox["height"]
await page.set_viewport_size(
{"width": math.ceil(width) + 20, "height": math.ceil(height) + 20}
)
if self.use_mathjax:
mj = page.locator("mjx-container math")
try:
Expand All @@ -96,6 +118,6 @@ async def screenshot(self, html):
)
pass
page.wait_for_timeout(200)
screenshot_bytes = await page.screenshot(full_page=True)
screenshot_bytes = await locator.screenshot()
im = Image.open(BytesIO(screenshot_bytes))
return im
return im
7 changes: 4 additions & 3 deletions dataframe_image/converter/browser/selenium_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,14 @@ def screenshot(self, html: str) -> Image:
f.write(self.get_css() + html)

with selenium.webdriver.Firefox(options=options, service=service) as driver:
driver.get(f"file://{str(temp_html)}") # selenium will do the rest
driver.get(temp_html.as_uri()) # selenium will do the rest

# get "#dfi_table table" width and height
required_width = driver.execute_script(
"return document.body.parentNode.scrollWidth"
"return document.querySelector('#dfi_table table').scrollWidth"
)
required_height = driver.execute_script(
"return document.body.parentNode.scrollHeight"
"return document.querySelector('#dfi_table table').scrollHeight"
)
driver.set_window_size(required_width + 150, required_height + 90)
driver.save_screenshot(str(temp_img))
Expand Down
4 changes: 2 additions & 2 deletions dataframe_image/converter/matplotlib_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ def __init__(
self,
fontsize=14,
encode_base64=True,
limit_crop=True,
crop_top=True,
for_document=True,
savefig_dpi=None,
format="png",
):
self.original_fontsize = fontsize
self.encode_base64 = encode_base64
self.limit_crop = limit_crop
self.crop_top = crop_top
self.for_document = for_document
self.figwidth = 1
self.figheight = 1
Expand Down
Loading

0 comments on commit e057912

Please sign in to comment.