MediaWiki docloader improvements + unit tests (langchain-ai#5879)

Starting over from langchain-ai#5654 because I utterly borked the poetry.lock file. Adds new parameters to the MWDumpLoader class: * skip_redirects (bool) Tells the loader to skip articles that redirect to other articles. False by default. * stop_on_error (bool) Tells the parser whether to stop when a page causes a parse error (True) or to log the error and skip that page (False). True by default. * namespaces (List[int]) Tells the parser which namespaces to parse. Contains namespaces from -2 to 15 by default. Default values are chosen to preserve backwards compatibility. Sample dump XML and full unit test coverage (with extended tests that pass!) also included! --------- Co-authored-by: Harrison Chase <[email protected]> Co-authored-by: Bagatur <[email protected]>
feixiangge · Jul 15, 2023 · 96f3dff · 96f3dff
1 parent 4c81063
commit 96f3dff
Show file tree

Hide file tree

Showing 5 changed files with 447 additions and 29 deletions.
diff --git a/langchain/document_loaders/mediawikidump.py b/langchain/document_loaders/mediawikidump.py
@@ -1,9 +1,13 @@
 """Load Data from a MediaWiki dump xml."""
-from typing import List, Optional
+import logging
+from pathlib import Path
+from typing import List, Optional, Sequence, Union
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 
+logger = logging.getLogger(__name__)
+
 
 class MWDumpLoader(BaseLoader):
     """
@@ -29,34 +33,64 @@ class MWDumpLoader(BaseLoader):
     :type file_path: str
     :param encoding: Charset encoding, defaults to "utf8"
     :type encoding: str, optional
+    :param namespaces: The namespaces of the pages you want to parse.
+        See https://www.mediawiki.org/wiki/Help:Namespaces#Localisation
+        for a list of all common namespaces.
+    :type namespaces: List[int], optional
+    :param skip_redirects: True to skip pages that redirect to other pages,
+        False to keep them. False by default
+    :type skip_redirects: bool, optional
+    :param stop_on_error: False to skip over pages that cause parsing errors,
+        True to stop. True by default
+    :type stop_on_error: bool, optional
     """
 
-    def __init__(self, file_path: str, encoding: Optional[str] = "utf8"):
-        """Initialize with a file path.
-
-        Args:
-            file_path: XML local file path
-            encoding: Charset encoding, defaults to "utf8"
-        """
-        self.file_path = file_path
+    def __init__(
+        self,
+        file_path: Union[str, Path],
+        encoding: Optional[str] = "utf8",
+        namespaces: Optional[Sequence[int]] = None,
+        skip_redirects: Optional[bool] = False,
+        stop_on_error: Optional[bool] = True,
+    ):
+        self.file_path = file_path if isinstance(file_path, str) else str(file_path)
         self.encoding = encoding
+        # Namespaces range from -2 to 15, inclusive.
+        self.namespaces = namespaces or list(range(-2, 16))
+        self.skip_redirects = skip_redirects
+        self.stop_on_error = stop_on_error
 
     def load(self) -> List[Document]:
         """Load from a file path."""
-        import mwparserfromhell
-        import mwxml
+        try:
+            import mwparserfromhell
+            import mwxml
+        except ImportError as e:
+            raise ImportError(
+                "Unable to import 'mwparserfromhell' or 'mwxml'. Please install with"
+                " `pip install mwparserfromhell mwxml`."
+            ) from e
 
         dump = mwxml.Dump.from_file(open(self.file_path, encoding=self.encoding))
 
         docs = []
-
         for page in dump.pages:
-            for revision in page:
-                code = mwparserfromhell.parse(revision.text)
-                text = code.strip_code(
-                    normalize=True, collapse=True, keep_template_params=False
-                )
-                metadata = {"source": page.title}
-                docs.append(Document(page_content=text, metadata=metadata))
-
+            if self.skip_redirects and page.redirect:
+                continue
+            if page.namespace not in self.namespaces:
+                continue
+            try:
+                for revision in page:
+                    code = mwparserfromhell.parse(revision.text)
+                    text = code.strip_code(
+                        normalize=True, collapse=True, keep_template_params=False
+                    )
+                    metadata = {"source": page.title}
+                    docs.append(Document(page_content=text, metadata=metadata))
+            except Exception as e:
+                logger.error("Parsing error: {}".format(e))
+                if self.stop_on_error:
+                    raise e
+                else:
+                    continue
         return docs
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -108,6 +108,8 @@ pyspark = {version = "^3.4.0", optional = true}
 clarifai = {version = ">=9.1.0", optional = true}
 tigrisdb = {version = "^1.0.0b6", optional = true}
 nebula3-python = {version = "^3.4.0", optional = true}
+mwparserfromhell = {version = "^0.6.4", optional = true}
+mwxml = {version = "^0.3.3", optional = true}
 awadb = {version = "^0.3.3", optional = true}
 azure-search-documents = {version = "11.4.0a20230509004", source = "azure-sdk-dev", optional = true}
 esprima = {version = "^4.0.1", optional = true}
@@ -343,7 +345,8 @@ extended_testing = [
  "tqdm",
  "lxml",
  "atlassian-python-api",
- "beautifulsoup4",
+ "mwparserfromhell",
+ "mwxml",
  "pandas",
  "telethon",
  "psychicapi",