Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make conversion non-destructive to soup; improve div/article/section handling #184

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 84 additions & 27 deletions markdownify/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from bs4 import BeautifulSoup, NavigableString, Comment, Doctype
from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
from textwrap import fill
import re
import six
Expand Down Expand Up @@ -79,6 +79,7 @@ def should_remove_whitespace_inside(el):
if html_heading_re.match(el.name) is not None:
return True
return el.name in ('p', 'blockquote',
'article', 'div', 'section',
'ol', 'ul', 'li',
'table', 'thead', 'tbody', 'tfoot',
'tr', 'td', 'th')
Expand All @@ -89,6 +90,41 @@ def should_remove_whitespace_outside(el):
return should_remove_whitespace_inside(el) or (el and el.name == 'pre')


def _is_block_content_element(el):
"""
In a block context, returns:

- True for content elements (tags and non-whitespace text)
- False for non-content elements (whitespace text, comments, doctypes)
"""
if isinstance(el, Tag):
return True
elif isinstance(el, (Comment, Doctype)):
return False # (subclasses of NavigableString, must test first)
elif isinstance(el, NavigableString):
return el.strip() != ''
else:
return False


def _prev_block_content_sibling(el):
"""Returns the first previous sibling that is a content element, else None."""
while el is not None:
el = el.previous_sibling
if _is_block_content_element(el):
return el
return None


def _next_block_content_sibling(el):
"""Returns the first next sibling that is a content element, else None."""
while el is not None:
el = el.next_sibling
if _is_block_content_element(el):
return el
return None


class MarkdownConverter(object):
class DefaultOptions:
autolinks = True
Expand Down Expand Up @@ -143,29 +179,38 @@ def process_tag(self, node, convert_as_inline):
or node.name in ['td', 'th'] # table cells
)

# Remove whitespace-only textnodes just before, after or
# inside block-level elements.
# Collect child elements to process, ignoring whitespace-only text elements
# adjacent to the inner/outer boundaries of block elements.
should_remove_inside = should_remove_whitespace_inside(node)
for el in node.children:
# Only extract (remove) whitespace-only text node if any of the
# conditions is true:
# - el is the first element in its parent (block-level)
# - el is the last element in its parent (block-level)
# - el is adjacent to a block-level node
can_extract = (should_remove_inside and (not el.previous_sibling
or not el.next_sibling)
or should_remove_whitespace_outside(el.previous_sibling)
or should_remove_whitespace_outside(el.next_sibling))
if (isinstance(el, NavigableString)
and six.text_type(el).strip() == ''
and can_extract):
el.extract()

# Convert the children first
for el in node.children:
if isinstance(el, Comment) or isinstance(el, Doctype):
continue
def _can_ignore(el):
if isinstance(el, Tag):
# Tags are always processed.
return False
elif isinstance(el, (Comment, Doctype)):
# Comment and Doctype elements are always ignored.
# (subclasses of NavigableString, must test first)
return True
elif isinstance(el, NavigableString):
if six.text_type(el).strip() != '':
# Non-whitespace text nodes are always processed.
return False
elif should_remove_inside and (not el.previous_sibling or not el.next_sibling):
# Inside block elements (excluding <pre>), ignore adjacent whitespace elements.
return True
elif should_remove_whitespace_outside(el.previous_sibling) or should_remove_whitespace_outside(el.next_sibling):
# Outside block elements (including <pre>), ignore adjacent whitespace elements.
return True
else:
return False
else:
raise ValueError('Unexpected element type: %s' % type(el))

children_to_convert = [child for child in node.children if not _can_ignore(child)]

# Convert the children first
for el in children_to_convert:
if isinstance(el, NavigableString):
text += self.process_text(el)
else:
text_strip = text.rstrip('\n')
Expand Down Expand Up @@ -337,6 +382,16 @@ def convert_code(self, el, text, convert_as_inline):

convert_del = abstract_inline_conversion(lambda self: '~~')

def convert_div(self, el, text, convert_as_inline):
if convert_as_inline:
return ' ' + text.strip() + ' '
text = text.strip()
return '\n\n%s\n\n' % text if text else ''

convert_article = convert_div

convert_section = convert_div

convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])

convert_kbd = convert_code
Expand Down Expand Up @@ -415,7 +470,8 @@ def convert_list(self, el, text, convert_as_inline):

nested = False
before_paragraph = False
if el.next_sibling and el.next_sibling.name not in ['ul', 'ol']:
next_sibling = _next_block_content_sibling(el)
if next_sibling and next_sibling.name not in ['ul', 'ol']:
before_paragraph = True
while el:
if el.name == 'li':
Expand Down Expand Up @@ -539,22 +595,23 @@ def convert_th(self, el, text, convert_as_inline):

def convert_tr(self, el, text, convert_as_inline):
cells = el.find_all(['td', 'th'])
is_first_row = el.find_previous_sibling() is None
is_headrow = (
all([cell.name == 'th' for cell in cells])
or (el.parent.name == 'thead'
# avoid multiple tr in thead
and len(el.parent.find_all('tr')) == 1)
)
is_head_row_missing = (
(not el.previous_sibling and not el.parent.name == 'tbody')
or (not el.previous_sibling and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1)
(is_first_row and not el.parent.name == 'tbody')
or (is_first_row and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1)
)
overline = ''
underline = ''
if ((is_headrow
or (is_head_row_missing
and self.options['table_infer_header']))
and not el.previous_sibling):
and is_first_row):
# first row and:
# - is headline or
# - headline is missing and header inference is enabled
Expand All @@ -568,10 +625,10 @@ def convert_tr(self, el, text, convert_as_inline):
underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
elif ((is_head_row_missing
and not self.options['table_infer_header'])
or (not el.previous_sibling
or (is_first_row
and (el.parent.name == 'table'
or (el.parent.name == 'tbody'
and not el.parent.previous_sibling)))):
and not el.parent.find_previous_sibling())))):
# headline is missing and header inference is disabled or:
# first row, not headline, and:
# - the parent is table or
Expand Down
2 changes: 1 addition & 1 deletion tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def test_single_tag():


def test_soup():
assert md('<div><span>Hello</div></span>') == 'Hello'
assert md('<div><span>Hello</div></span>') == '\n\nHello\n\n'


def test_whitespace():
Expand Down
15 changes: 13 additions & 2 deletions tests/test_conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,19 @@ def test_del():
inline_tests('del', '~~')


def test_div():
assert md('Hello</div> World') == 'Hello World'
def test_div_section_article():
for tag in ['div', 'section', 'article']:
assert md(f'<{tag}>456</{tag}>') == '\n\n456\n\n'
assert md(f'123<{tag}>456</{tag}>789') == '123\n\n456\n\n789'
assert md(f'123<{tag}>\n 456 \n</{tag}>789') == '123\n\n456\n\n789'
assert md(f'123<{tag}><p>456</p></{tag}>789') == '123\n\n456\n\n789'
assert md(f'123<{tag}>\n<p>456</p>\n</{tag}>789') == '123\n\n456\n\n789'
assert md(f'123<{tag}><pre>4 5 6</pre></{tag}>789') == '123\n\n```\n4 5 6\n```\n\n789'
assert md(f'123<{tag}>\n<pre>4 5 6</pre>\n</{tag}>789') == '123\n\n```\n4 5 6\n```\n\n789'
assert md(f'123<{tag}>4\n5\n6</{tag}>789') == '123\n\n4\n5\n6\n\n789'
assert md(f'123<{tag}>\n4\n5\n6\n</{tag}>789') == '123\n\n4\n5\n6\n\n789'
assert md(f'123<{tag}>\n<p>\n4\n5\n6\n</p>\n</{tag}>789') == '123\n\n4\n5\n6\n\n789'
assert md(f'<{tag}><h1>title</h1>body</{{tag}}>', heading_style=ATX) == '\n\n# title\n\nbody\n\n'


def test_em():
Expand Down