Skip to content

Commit

Permalink
MediaWiki docloader improvements + unit tests (langchain-ai#5879)
Browse files Browse the repository at this point in the history
Starting over from langchain-ai#5654 because I utterly borked the poetry.lock file.

Adds new parameters to the MWDumpLoader class:

* skip_redirects (bool) Tells the loader to skip articles that redirect
to other articles. False by default.
* stop_on_error (bool) Tells the parser to stop when a page causes a
parse error; set to False to skip such pages instead. True by default.
* namespaces (List[int]) Tells the parser which namespaces to parse.
Contains namespaces from -2 to 15 by default.

Default values are chosen to preserve backwards compatibility.

Sample dump XML and full unit test coverage (with extended tests that
pass!) also included!

---------

Co-authored-by: Harrison Chase <[email protected]>
Co-authored-by: Bagatur <[email protected]>
  • Loading branch information
3 people authored Jul 15, 2023
1 parent 4c81063 commit 96f3dff
Show file tree
Hide file tree
Showing 5 changed files with 447 additions and 29 deletions.
74 changes: 54 additions & 20 deletions langchain/document_loaders/mediawikidump.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
"""Load Data from a MediaWiki dump xml."""
from typing import List, Optional
import logging
from pathlib import Path
from typing import List, Optional, Sequence, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

logger = logging.getLogger(__name__)


class MWDumpLoader(BaseLoader):
"""
Expand All @@ -29,34 +33,64 @@ class MWDumpLoader(BaseLoader):
:type file_path: str
:param encoding: Charset encoding, defaults to "utf8"
:type encoding: str, optional
:param namespaces: The namespace of pages you want to parse.
See https://www.mediawiki.org/wiki/Help:Namespaces#Localisation
for a list of all common namespaces
:type namespaces: List[int], optional
:param skip_redirects: True to skip pages that redirect to other pages,
False to keep them. False by default
:type skip_redirects: bool, optional
:param stop_on_error: False to skip over pages that cause parsing errors,
True to stop. True by default
:type stop_on_error: bool, optional
"""

def __init__(self, file_path: str, encoding: Optional[str] = "utf8"):
"""Initialize with a file path.
Args:
file_path: XML local file path
encoding: Charset encoding, defaults to "utf8"
"""
self.file_path = file_path
def __init__(
self,
file_path: Union[str, Path],
encoding: Optional[str] = "utf8",
namespaces: Optional[Sequence[int]] = None,
skip_redirects: Optional[bool] = False,
stop_on_error: Optional[bool] = True,
):
self.file_path = file_path if isinstance(file_path, str) else str(file_path)
self.encoding = encoding
# Namespaces range from -2 to 15, inclusive.
self.namespaces = namespaces or list(range(-2, 16))
self.skip_redirects = skip_redirects
self.stop_on_error = stop_on_error

def load(self) -> List[Document]:
"""Load from a file path."""
import mwparserfromhell
import mwxml
try:
import mwparserfromhell
import mwxml
except ImportError as e:
raise ImportError(
"Unable to import 'mwparserfromhell' or 'mwxml'. Please install with"
" `pip install mwparserfromhell mwxml`."
) from e

dump = mwxml.Dump.from_file(open(self.file_path, encoding=self.encoding))

docs = []

for page in dump.pages:
for revision in page:
code = mwparserfromhell.parse(revision.text)
text = code.strip_code(
normalize=True, collapse=True, keep_template_params=False
)
metadata = {"source": page.title}
docs.append(Document(page_content=text, metadata=metadata))

if self.skip_redirects and page.redirect:
continue
if page.namespace not in self.namespaces:
continue
try:
for revision in page:
code = mwparserfromhell.parse(revision.text)
text = code.strip_code(
normalize=True, collapse=True, keep_template_params=False
)
metadata = {"source": page.title}
docs.append(Document(page_content=text, metadata=metadata))
except Exception as e:
logger.error("Parsing error: {}".format(e))
if self.stop_on_error:
raise e
else:
continue
return docs
138 changes: 130 additions & 8 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ pyspark = {version = "^3.4.0", optional = true}
clarifai = {version = ">=9.1.0", optional = true}
tigrisdb = {version = "^1.0.0b6", optional = true}
nebula3-python = {version = "^3.4.0", optional = true}
mwparserfromhell = {version = "^0.6.4", optional = true}
mwxml = {version = "^0.3.3", optional = true}
awadb = {version = "^0.3.3", optional = true}
azure-search-documents = {version = "11.4.0a20230509004", source = "azure-sdk-dev", optional = true}
esprima = {version = "^4.0.1", optional = true}
Expand Down Expand Up @@ -343,7 +345,8 @@ extended_testing = [
"tqdm",
"lxml",
"atlassian-python-api",
"beautifulsoup4",
"mwparserfromhell",
"mwxml",
"pandas",
"telethon",
"psychicapi",
Expand Down
Loading

0 comments on commit 96f3dff

Please sign in to comment.