Commit d4efd97
Update html.py
updated according to linter tests
AhmedTammaa authored Nov 8, 2024
1 parent 8dc8e46 commit d4efd97
Showing 1 changed file with 87 additions and 88 deletions.
libs/text-splitters/langchain_text_splitters/html.py: 87 additions & 88 deletions
@@ -9,7 +9,9 @@
from langchain_core.documents import Document

from langchain_text_splitters.character import RecursiveCharacterTextSplitter

from bs4 import BeautifulSoup
from bs4.element import Tag
from langchain.docstore.document import Document
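One thing to note in this hunk: Document is now bound twice, first from langchain_core.documents and then from langchain.docstore.document, and by ordinary Python semantics the later import wins. In recent langchain releases the docstore module appears to re-export the same langchain_core class, so this is likely behavior-preserving, just redundant. Illustrative snippet, not part of the diff:

    from langchain_core.documents import Document
    from langchain.docstore.document import Document  # re-binds the name

    # After both imports, Document refers to the docstore binding; in
    # current releases this appears to be the same class re-exported.
    doc = Document(page_content='example')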

class ElementType(TypedDict):
"""Element type as typed dict."""
@@ -91,104 +93,101 @@ def split_text(self, text: str) -> List[Document]:
        return self.split_text_from_file(StringIO(text))


Net of formatting, the changes in this hunk are: the function-level imports (from bs4 import BeautifulSoup, from langchain.docstore.document import Document, import bs4) are removed in favor of the module-level imports above, and the isinstance checks use Tag instead of bs4.Tag. The method as it lands in this commit:

    def split_text_from_file(self, file: Any) -> List[Document]:
        """Split HTML file using BeautifulSoup.

        Args:
            file: HTML file path or file-like object.

        Returns:
            List of Document objects with page_content and metadata.
        """
        # Read the HTML content from the file or file-like object
        if isinstance(file, str):
            with open(file, 'r', encoding='utf-8') as f:
                html_content = f.read()
        else:
            # Assuming file is a file-like object
            html_content = file.read()

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')

        # Extract the header tags and their corresponding metadata keys
        headers_to_split_on = [tag[0] for tag in self.headers_to_split_on]
        header_mapping = dict(self.headers_to_split_on)

        documents = []

        # Find the body of the document
        body = soup.body if soup.body else soup

        # Find all header tags in the order they appear
        all_headers = body.find_all(headers_to_split_on)

        # If there's content before the first header, collect it
        first_header = all_headers[0] if all_headers else None
        if first_header:
            pre_header_content = ''
            # find_all_previous() walks backwards through the parse order,
            # so each piece of text is prepended to restore document order.
            # Note that it also yields ancestor tags such as <body>, whose
            # get_text() spans the entire document.
            for elem in first_header.find_all_previous():
                if isinstance(elem, Tag):
                    text = elem.get_text(separator=' ', strip=True)
                    if text:
                        pre_header_content = text + ' ' + pre_header_content
            if pre_header_content.strip():
                documents.append(Document(
                    page_content=pre_header_content.strip(),
                    metadata={}  # No metadata since there's no header
                ))
        else:
            # If no headers are found, return the whole content
            full_text = body.get_text(separator=' ', strip=True)
            if full_text.strip():
                documents.append(Document(
                    page_content=full_text.strip(),
                    metadata={}
                ))
            return documents

        # Process each header and its associated content
        for header in all_headers:
            current_metadata = {}
            header_name = header.name
            header_text = header.get_text(separator=' ', strip=True)
            current_metadata[header_mapping[header_name]] = header_text

            # Collect sibling elements until the next configured header tag
            content_elements = []
            for sibling in header.find_next_siblings():
                if sibling.name in headers_to_split_on:
                    # Stop at the next header
                    break
                if isinstance(sibling, Tag):
                    content_elements.append(sibling)

            # Get the text content of the collected elements
            current_content = ''
            for elem in content_elements:
                text = elem.get_text(separator=' ', strip=True)
                if text:
                    current_content += text + ' '

            # Create a Document if there is content
            if current_content.strip():
                documents.append(Document(
                    page_content=current_content.strip(),
                    metadata=current_metadata.copy()
                ))
            else:
                # No content under this header; still create a Document
                # so the header itself is represented
                documents.append(Document(
                    page_content='',
                    metadata=current_metadata.copy()
                ))

        return documents
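For orientation, a minimal usage sketch. It assumes this method lives on HTMLHeaderTextSplitter, the class this file defines in the released langchain-text-splitters package; the sample HTML and variable names are illustrative only:

    from io import StringIO

    from langchain_text_splitters import HTMLHeaderTextSplitter

    html = '''
    <html><body>
      <h1>Introduction</h1>
      <p>Welcome to the guide.</p>
      <h2>Setup</h2>
      <p>Install the package first.</p>
    </body></html>
    '''

    splitter = HTMLHeaderTextSplitter(
        headers_to_split_on=[('h1', 'Header 1'), ('h2', 'Header 2')]
    )
    docs = splitter.split_text_from_file(StringIO(html))

    # With this implementation, each Document's metadata maps the configured
    # key ('Header 1', 'Header 2') to the text of the header that opens its
    # section; content before the first header gets empty metadata.
    for doc in docs:
        print(doc.metadata, '->', doc.page_content)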



class HTMLSectionSplitter:
