Commit: RSS Feed / OPML loader (langchain-ai#8694)

  - Description: Added a document loader for a list of RSS feeds or an OPML file.
    It iterates through the feeds and uses NewsURLLoader to load each article
    (a minimal usage sketch follows the change summary below).
  - Issue: N/A
  - Dependencies: feedparser, listparser
  - Tag maintainer: @rlancemartin, @eyurtsev
  - Twitter handle: @Ruze

---------

Co-authored-by: Bagatur <[email protected]>
ruze00 and baskaryan authored Aug 3, 2023
1 parent 53e4148 commit 8ef7e14
Showing 10 changed files with 699 additions and 20 deletions.
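
A minimal usage sketch of the loader introduced by this commit (it mirrors the docs notebook further down; the feed URL is illustrative, and feedparser plus the newspaper package used by NewsURLLoader must be installed):

from langchain.document_loaders import RSSFeedLoader

# Each article in each feed becomes its own Document.
urls = ["https://www.engadget.com/rss.xml"]
loader = RSSFeedLoader(urls=urls)
docs = loader.load()
print(len(docs))
print(docs[0].metadata)
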
13 changes: 13 additions & 0 deletions docs/extras/integrations/document_loaders/example_data/sample_rss_feeds.opml
@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>

<opml version="1.0">
  <head>
    <title>Sample RSS feed subscriptions</title>
  </head>
  <body>
    <outline text="Tech" title="Tech">
      <outline type="rss" text="Engadget" title="Engadget" xmlUrl="http://www.engadget.com/rss-full.xml" htmlUrl="http://www.engadget.com"/>
      <outline type="rss" text="Ars Technica - All content" title="Ars Technica - All content" xmlUrl="http://feeds.arstechnica.com/arstechnica/index/" htmlUrl="https://arstechnica.com"/>
    </outline>
  </body>
</opml>
170 changes: 170 additions & 0 deletions docs/extras/integrations/document_loaders/rss.ipynb
@@ -0,0 +1,170 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "2dfc4698",
"metadata": {},
"source": [
"# RSS Feeds\n",
"\n",
"This covers how to load HTML news articles from a list of RSS feed URLs into a document format that we can use downstream."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16c3699e",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import RSSFeedLoader"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "836fbac1",
"metadata": {},
"outputs": [],
"source": [
"urls = [\"https://www.engadget.com/rss.xml\"]"
]
},
{
"cell_type": "markdown",
"id": "33089aba-ff74-4d00-8f40-9449c29587cc",
"metadata": {},
"source": [
"Pass in urls to load them into Documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "00f46fda",
"metadata": {},
"outputs": [],
"source": [
"loader = RSSFeedLoader(urls=urls)\n",
"data = loader.load()\n",
"print(len(data))"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"data[0]"
],
"metadata": {
"collapsed": false
},
"id": "b447468cc42266d0"
},
{
"cell_type": "markdown",
"source": [
"You can pass arguments to the NewsURLLoader which it uses to load articles."
],
"metadata": {
"collapsed": false
},
"id": "c36d3b0d329faf2a"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"loader = RSSFeedLoader(urls=urls, nlp=True)\n",
"data = loader.load()\n",
"print(len(data))"
],
"metadata": {
"collapsed": false
},
"id": "5fdada62470d3019"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"data[0].metadata['keywords']"
],
"metadata": {
"collapsed": false
},
"id": "11d71963f7735c1d"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"data[0].metadata['summary']"
],
"metadata": {
"collapsed": false
},
"id": "9fb64ba0e8780966"
},
{
"cell_type": "markdown",
"source": [
"You can also use an OPML file such as a Feedly export. Pass in either a URL or the OPML contents."
],
"metadata": {
"collapsed": false
},
"id": "98ac26c488315bff"
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b6f07ae526a897c",
"metadata": {},
"outputs": [],
"source": [
"with open(\"example_data/sample_rss_feeds.opml\", \"r\") as f:\n",
" loader = RSSFeedLoader(opml=f.read())\n",
"data = loader.load()\n",
"print(len(data))"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"data[0]"
],
"metadata": {
"collapsed": false
},
"id": "b68a26b3"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
2 changes: 2 additions & 0 deletions libs/langchain/langchain/document_loaders/__init__.py
@@ -128,6 +128,7 @@
from langchain.document_loaders.reddit import RedditPostsLoader
from langchain.document_loaders.roam import RoamLoader
from langchain.document_loaders.rocksetdb import RocksetLoader
from langchain.document_loaders.rss import RSSFeedLoader
from langchain.document_loaders.rst import UnstructuredRSTLoader
from langchain.document_loaders.rtf import UnstructuredRTFLoader
from langchain.document_loaders.s3_directory import S3DirectoryLoader
@@ -280,6 +281,7 @@
"RedditPostsLoader",
"RoamLoader",
"RocksetLoader",
"RSSFeedLoader",
"S3DirectoryLoader",
"S3FileLoader",
"SRTLoader",
133 changes: 133 additions & 0 deletions libs/langchain/langchain/document_loaders/rss.py
@@ -0,0 +1,133 @@
"""Loader that uses unstructured to load HTML files."""
import logging
from typing import Any, Iterator, List, Optional, Sequence

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.news import NewsURLLoader

logger = logging.getLogger(__name__)


class RSSFeedLoader(BaseLoader):
    """Loader that uses newspaper to load news articles from RSS feeds.

    Args:
        urls: URLs for RSS feeds to load. Each article in the feed is loaded
            into its own document.
        opml: OPML file to load feed urls from. Only one of urls or opml should
            be provided. The value can be a URL string, or OPML markup contents
            as byte or string.
        continue_on_failure: If True, continue loading documents even if
            loading fails for a particular URL.
        show_progress_bar: If True, use tqdm to show a loading progress bar.
            Requires tqdm to be installed, ``pip install tqdm``.
        **newsloader_kwargs: Any additional named arguments to pass to
            NewsURLLoader.

    Example:
        .. code-block:: python

            from langchain.document_loaders import RSSFeedLoader

            loader = RSSFeedLoader(
                urls=["<url-1>", "<url-2>"],
            )
            docs = loader.load()

    The loader uses feedparser to parse RSS feeds. The feedparser library is not
    installed by default so you should install it if using this loader:
    https://pythonhosted.org/feedparser/

    If you use OPML, you should also install listparser:
    https://pythonhosted.org/listparser/

    Finally, newspaper is used to process each article:
    https://newspaper.readthedocs.io/en/latest/
    """  # noqa: E501

    def __init__(
        self,
        urls: Optional[Sequence[str]] = None,
        opml: Optional[str] = None,
        continue_on_failure: bool = True,
        show_progress_bar: bool = False,
        **newsloader_kwargs: Any,
    ) -> None:
        """Initialize with urls or OPML."""
        if (urls is None) == (
            opml is None
        ):  # This is True if both are None or neither is None
            raise ValueError(
                "Provide either the urls or the opml argument, but not both."
            )
        self.urls = urls
        self.opml = opml
        self.continue_on_failure = continue_on_failure
        self.show_progress_bar = show_progress_bar
        self.newsloader_kwargs = newsloader_kwargs

    def load(self) -> List[Document]:
        iter = self.lazy_load()
        if self.show_progress_bar:
            try:
                from tqdm import tqdm
            except ImportError as e:
                raise ImportError(
                    "Package tqdm must be installed if show_progress_bar=True. "
                    "Please install with 'pip install tqdm' or set "
                    "show_progress_bar=False."
                ) from e
            iter = tqdm(iter)
        return list(iter)

    @property
    def _get_urls(self) -> Sequence[str]:
        if self.urls:
            return self.urls
        try:
            import listparser
        except ImportError as e:
            raise ImportError(
                "Package listparser must be installed if the opml arg is used. "
                "Please install with 'pip install listparser' or use the "
                "urls arg instead."
            ) from e
        rss = listparser.parse(self.opml)
        return [feed.url for feed in rss.feeds]

    def lazy_load(self) -> Iterator[Document]:
        try:
            import feedparser  # noqa:F401
        except ImportError:
            raise ImportError(
                "feedparser package not found, please install it with "
                "`pip install feedparser`"
            )

        for url in self._get_urls:
            try:
                feed = feedparser.parse(url)
                if getattr(feed, "bozo", False):
                    raise ValueError(
                        f"Error fetching {url}, exception: {feed.bozo_exception}"
                    )
            except Exception as e:
                if self.continue_on_failure:
                    logger.error(f"Error fetching {url}, exception: {e}")
                    continue
                else:
                    raise e
            try:
                for entry in feed.entries:
                    loader = NewsURLLoader(
                        urls=[entry.link],
                        **self.newsloader_kwargs,
                    )
                    article = loader.load()[0]
                    article.metadata["feed"] = url
                    yield article
            except Exception as e:
                if self.continue_on_failure:
                    logger.error(
                        f"Error processing entry {entry.link}, exception: {e}"
                    )
                    continue
                else:
                    raise e
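
A hedged sketch of the OPML code path above, streaming articles with lazy_load(); the OPML markup and feed URL are illustrative, and listparser is required in addition to feedparser and newspaper:

from langchain.document_loaders import RSSFeedLoader

# Illustrative OPML markup; a URL string or a Feedly export works the same way.
opml = """<?xml version="1.0" encoding="UTF-8"?>
<opml version="1.0">
  <body>
    <outline type="rss" text="Engadget" xmlUrl="http://www.engadget.com/rss-full.xml"/>
  </body>
</opml>"""

loader = RSSFeedLoader(opml=opml)
for doc in loader.lazy_load():
    # One Document per article; the loader sets doc.metadata["feed"] to the feed URL.
    print(doc.metadata["feed"], doc.metadata.get("title"))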