docs: improve document loaders index (#25365)

Co-authored-by: Erick Friis <[email protected]>
langchain-ai · Aug 14, 2024 · 967b6f2 · 967b6f2 · shirly59 · Aug 14, 2024
1 parent 4a78be7
commit 967b6f2
Show file tree

Hide file tree

Showing 5 changed files with 151 additions and 10 deletions.
diff --git a/docs/docs/integrations/document_loaders/index.mdx b/docs/docs/integrations/document_loaders/index.mdx
@@ -7,6 +7,39 @@ sidebar_class_name: hidden
 
 import { CategoryTable, IndexTable } from "@theme/FeatureTables";
 
+DocumentLoaders load data into the standard LangChain Document format.
+
+Each DocumentLoader has its own specific parameters, but they can all be invoked in the same way with the `.load` method.
+An example use case is as follows:
+
+```python
+from langchain_community.document_loaders.csv_loader import CSVLoader
+
+loader = CSVLoader(
+    ...  # <-- Integration specific parameters here
+)
+data = loader.load()
+```
+
+## Common File Types
+
+The below document loaders allow you to load data from common data formats.
+
+<CategoryTable category="common_loaders" />
+
+## PDFs
+
+The below document loaders allow you to load PDF documents.
+
+<CategoryTable category="pdf_loaders" />
+
+## Webpages
+
+The below document loaders allow you to load webpages.
+
+<CategoryTable category="webpage_loaders" />
+
+
 ## All document loaders
 
 <IndexTable />
diff --git a/docs/docs/integrations/document_loaders/pypdfloader.ipynb b/docs/docs/integrations/document_loaders/pypdfloader.ipynb
@@ -6,7 +6,7 @@
    "source": [
     "# PyPDFLoader\n",
     "\n",
-    "This notebook provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html).\n",
+    "This notebook provides a quick overview for getting started with `PyPDF` [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all PyPDFLoader features and configurations, head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html).\n",
     "\n",
     "\n",
     "## Overview\n",
@@ -43,7 +43,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install -qU langchain_community"
+    "%pip install -qU langchain_community pypdf"
    ]
   },
   {
@@ -180,7 +180,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -194,9 +194,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.10.1"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
diff --git a/docs/docs/integrations/document_loaders/web_base.ipynb b/docs/docs/integrations/document_loaders/web_base.ipynb
@@ -44,7 +44,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install -qU langchain_community"
+    "%pip install -qU langchain_community beautifulsoup4"
    ]
   },
   {
@@ -330,7 +330,10 @@
    "cell_type": "markdown",
    "id": "672264ad",
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "source": [
     "## Using proxies\n",
@@ -343,7 +346,10 @@
    "execution_count": null,
    "id": "9caf0310",
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "outputs": [],
    "source": [
@@ -384,7 +390,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.10.1"
   }
  },
  "nbformat": 4,

diff --git a/docs/src/theme/FeatureTables.js b/docs/src/theme/FeatureTables.js
@@ -440,6 +440,108 @@ const FEATURE_TABLES = {
         columns: [],
         items: [],
     },
+    // Feature table for the `webpage_loaders` category (rendered by
+    // <CategoryTable category="webpage_loaders" /> under "## Webpages").
+    // Columns: loader name linked to `item.link`, a description taken from
+    // `item.source`, and a "Package"/"API" label taken from `item.api`.
+    // NOTE(review): every item carries an `apiLink`, but no column defined
+    // here reads it — confirm whether the Package/API cell should link to it.
+    // NOTE(review): `link` points at 'docs/integrations/loaders' while the
+    // docs in this change live under 'docs/integrations/document_loaders' —
+    // confirm the path is intended.
+    webpage_loaders: {
+        link: 'docs/integrations/loaders',
+        columns: [
+            {title: "Document Loader", formatter: (item) => <a href={
+                item.link
+            }>{item.name}</a>},
+            {title: "Description", formatter: (item) => item.source},
+            {title: "Package/API", formatter: (item) => item.api},
+        ],
+        items: [
+            {
+                name: "Web",
+                link: "web_base",
+                source: "Uses urllib and BeautifulSoup to load and parse HTML web pages",
+                api: "Package",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.web_base.WebBaseLoader.html"
+            },
+            {
+                name: "RecursiveURL",
+                link: "recursive_url",
+                source: "Recursively scrapes all child links from a root URL",
+                api: "Package",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.recursive_url_loader.RecursiveUrlLoader.html"
+            },
+            {
+                name: "Sitemap",
+                link: "sitemap",
+                source: "Scrapes all pages on a given sitemap",
+                api: "Package",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.sitemap.SitemapLoader.html"
+            },
+            {
+                name: "Firecrawl",
+                link: "firecrawl",
+                source: "API service that can be deployed locally, hosted version has free credits.",
+                api: "API",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.firecrawl.FireCrawlLoader.html"
+            }
+        ]
+    },
+    // Feature table for the `pdf_loaders` category (rendered by
+    // <CategoryTable category="pdf_loaders" /> under "## PDFs").
+    // Columns: loader name linked to `item.link`, a description taken from
+    // `item.source`, and a "Package"/"API" label taken from `item.api`.
+    // NOTE(review): every item carries an `apiLink`, but no column defined
+    // here reads it — confirm whether the Package/API cell should link to it.
+    // NOTE(review): `link` points at 'docs/integrations/loaders' while the
+    // docs in this change live under 'docs/integrations/document_loaders' —
+    // confirm the path is intended.
+    pdf_loaders: {
+        link: 'docs/integrations/loaders',
+        columns: [
+            {title: "Document Loader", formatter: (item) => <a href={
+                item.link
+            }>{item.name}</a>},
+            {title: "Description", formatter: (item) => item.source},
+            {title: "Package/API", formatter: (item) => item.api},
+        ],
+        items: [
+            {
+                name: "PyPDF",
+                link: "pypdfloader",
+                // NOTE(review): the Description formatter returns this string
+                // as-is, so the Markdown backticks may render literally — verify.
+                source: "Uses `pypdf` to load and parse PDFs",
+                api: "Package",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.PyPDFLoader.html"
+            },
+            {
+                name: "Unstructured",
+                link: "unstructured_file",
+                source: "Uses Unstructured's open source library to load PDFs",
+                api: "Package",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
+            },
+            {
+                name: "Amazon Textract",
+                link: "amazon_textract",
+                source: "Uses AWS API to load PDFs",
+                api: "API",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.pdf.AmazonTextractPDFLoader.html"
+            }
+        ]
+    },
+    // Feature table for the `common_loaders` category (rendered by
+    // <CategoryTable category="common_loaders" /> under "## Common File Types").
+    // Columns: loader name linked to `item.link`, and the data type it
+    // handles taken from `item.source`. Unlike the webpage/PDF tables, items
+    // here have no `api` field and there is no Package/API column.
+    // NOTE(review): every item carries an `apiLink`, but no column defined
+    // here reads it — confirm whether it is consumed elsewhere.
+    // NOTE(review): `link` points at 'docs/integrations/loaders' while the
+    // docs in this change live under 'docs/integrations/document_loaders' —
+    // confirm the path is intended.
+    common_loaders: {
+        link: 'docs/integrations/loaders',
+        columns: [
+            {title: "Document Loader", formatter: (item) => <a href={
+                item.link
+            }>{item.name}</a>},
+            {title: "Data Type", formatter: (item) => item.source},
+        ],
+        items: [
+            {
+                name: "CSVLoader",
+                link: "csv",
+                source: "CSV files",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.csv_loader.CSVLoader.html"
+            },
+            {
+                name: "DirectoryLoader",
+                link: "document_loader_directory",
+                source: "All files in a given directory",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.directory.DirectoryLoader.html"
+            },
+            {
+                name: "Unstructured",
+                link: "unstructured_file",
+                source: "All file types",
+                apiLink: "https://api.python.langchain.com/en/latest/document_loaders/langchain_unstructured.document_loaders.UnstructuredLoader.html"
+            },
+        ]
+    },
     vectorstores: {
         link: 'docs/integrations/vectorstores',
         columns: [

diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py
@@ -237,7 +237,7 @@ def __init__(
             import pypdf  # noqa:F401
         except ImportError:
             raise ImportError(
-                "pypdf package not found, please install it with " "`pip install pypdf`"
+                "pypdf package not found, please install it with `pip install pypdf`"
             )
         super().__init__(file_path, headers=headers)
         self.parser = PyPDFParser(