RSS Feed / OPML loader (langchain-ai#8694)
- Description: adds a document loader for a list of RSS feeds or an OPML file. It iterates through the feed list and uses NewsURLLoader to load each article.
- Issue: N/A
- Dependencies: feedparser, listparser
- Tag maintainer: @rlancemartin, @eyurtsev
- Twitter handle: @Ruze

Co-authored-by: Bagatur <[email protected]>
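A minimal sketch of how the new loader is meant to be used, based on the description above and the docs notebook added in this commit (the feed URL is the illustrative one from that notebook; any RSS feed URL works):

from langchain.document_loaders import RSSFeedLoader

# Each article found in the feeds becomes its own Document
loader = RSSFeedLoader(urls=["https://www.engadget.com/rss.xml"])
docs = loader.load()
print(len(docs))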
Showing 10 changed files with 699 additions and 20 deletions.
docs/extras/integrations/document_loaders/example_data/sample_rss_feeds.opml (13 additions, 0 deletions)
@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>

<opml version="1.0">
  <head>
    <title>Sample RSS feed subscriptions</title>
  </head>
  <body>
    <outline text="Tech" title="Tech">
      <outline type="rss" text="Engadget" title="Engadget" xmlUrl="http://www.engadget.com/rss-full.xml" htmlUrl="http://www.engadget.com"/>
      <outline type="rss" text="Ars Technica - All content" title="Ars Technica - All content" xmlUrl="http://feeds.arstechnica.com/arstechnica/index/" htmlUrl="https://arstechnica.com"/>
    </outline>
  </body>
</opml>
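As a quick illustration of how the loader consumes a file like this sample, listparser pulls the feed URLs out of the OPML, exactly as the _get_urls code added later in this commit does. The snippet below is a sketch, not part of the change; the path is relative to the docs example_data directory:

import listparser

# Parse the sample OPML shipped with the docs
with open("example_data/sample_rss_feeds.opml") as f:
    rss = listparser.parse(f.read())

# RSSFeedLoader builds its URL list the same way: one entry per <outline type="rss">
print([feed.url for feed in rss.feeds])
# Expected: the two xmlUrl values above (Engadget and Ars Technica)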
@@ -0,0 +1,170 @@
{
  "cells": [
    {
      "cell_type": "markdown",
      "id": "2dfc4698",
      "metadata": {},
      "source": [
        "# RSS Feeds\n",
        "\n",
        "This covers how to load HTML news articles from a list of RSS feed URLs into a document format that we can use downstream."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "16c3699e",
      "metadata": {},
      "outputs": [],
      "source": [
        "from langchain.document_loaders import RSSFeedLoader"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "836fbac1",
      "metadata": {},
      "outputs": [],
      "source": [
        "urls = [\"https://www.engadget.com/rss.xml\"]"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "33089aba-ff74-4d00-8f40-9449c29587cc",
      "metadata": {},
      "source": [
        "Pass in the URLs to load them into Documents."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "00f46fda",
      "metadata": {},
      "outputs": [],
      "source": [
        "loader = RSSFeedLoader(urls=urls)\n",
        "data = loader.load()\n",
        "print(len(data))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "outputs": [],
      "source": [
        "data[0]"
      ],
      "metadata": {
        "collapsed": false
      },
      "id": "b447468cc42266d0"
    },
    {
      "cell_type": "markdown",
      "source": [
        "You can pass arguments to the NewsURLLoader, which it uses to load articles."
      ],
      "metadata": {
        "collapsed": false
      },
      "id": "c36d3b0d329faf2a"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "outputs": [],
      "source": [
        "loader = RSSFeedLoader(urls=urls, nlp=True)\n",
        "data = loader.load()\n",
        "print(len(data))"
      ],
      "metadata": {
        "collapsed": false
      },
      "id": "5fdada62470d3019"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "outputs": [],
      "source": [
        "data[0].metadata['keywords']"
      ],
      "metadata": {
        "collapsed": false
      },
      "id": "11d71963f7735c1d"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "outputs": [],
      "source": [
        "data[0].metadata['summary']"
      ],
      "metadata": {
        "collapsed": false
      },
      "id": "9fb64ba0e8780966"
    },
    {
      "cell_type": "markdown",
      "source": [
        "You can also use an OPML file such as a Feedly export. Pass in either a URL or the OPML contents."
      ],
      "metadata": {
        "collapsed": false
      },
      "id": "98ac26c488315bff"
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "8b6f07ae526a897c",
      "metadata": {},
      "outputs": [],
      "source": [
        "with open(\"example_data/sample_rss_feeds.opml\", \"r\") as f:\n",
        "    loader = RSSFeedLoader(opml=f.read())\n",
        "data = loader.load()\n",
        "print(len(data))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "outputs": [],
      "source": [
        "data[0]"
      ],
      "metadata": {
        "collapsed": false
      },
      "id": "b68a26b3"
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.6"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}
@@ -0,0 +1,133 @@
"""Loader that uses unstructured to load HTML files.""" | ||
import logging | ||
from typing import Any, Iterator, List, Optional, Sequence | ||
|
||
from langchain.docstore.document import Document | ||
from langchain.document_loaders.base import BaseLoader | ||
from langchain.document_loaders.news import NewsURLLoader | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class RSSFeedLoader(BaseLoader): | ||
"""Loader that uses newspaper to load news articles from RSS feeds. | ||
Args: | ||
urls: URLs for RSS feeds to load. Each articles in the feed is loaded into its own document. | ||
opml: OPML file to load feed urls from. Only one of urls or opml should be provided. The value | ||
can be a URL string, or OPML markup contents as byte or string. | ||
continue_on_failure: If True, continue loading documents even if | ||
loading fails for a particular URL. | ||
show_progress_bar: If True, use tqdm to show a loading progress bar. Requires | ||
tqdm to be installed, ``pip install tqdm``. | ||
**newsloader_kwargs: Any additional named arguments to pass to | ||
NewsURLLoader. | ||
Example: | ||
.. code-block:: python | ||
from langchain.document_loaders import RSSFeedLoader | ||
loader = RSSFeedLoader( | ||
urls=["<url-1>", "<url-2>"], | ||
) | ||
docs = loader.load() | ||
The loader uses feedparser to parse RSS feeds. The feedparser library is not installed by default so you should | ||
install it if using this loader: | ||
https://pythonhosted.org/feedparser/ | ||
If you use OPML, you should also install listparser: | ||
https://pythonhosted.org/listparser/ | ||
Finally, newspaper is used to process each article: | ||
https://newspaper.readthedocs.io/en/latest/ | ||
""" # noqa: E501 | ||
|
||
def __init__( | ||
self, | ||
urls: Optional[Sequence[str]] = None, | ||
opml: Optional[str] = None, | ||
continue_on_failure: bool = True, | ||
show_progress_bar: bool = False, | ||
**newsloader_kwargs: Any, | ||
) -> None: | ||
"""Initialize with urls or OPML.""" | ||
if (urls is None) == ( | ||
opml is None | ||
): # This is True if both are None or neither is None | ||
raise ValueError( | ||
"Provide either the urls or the opml argument, but not both." | ||
) | ||
self.urls = urls | ||
self.opml = opml | ||
self.continue_on_failure = continue_on_failure | ||
self.show_progress_bar = show_progress_bar | ||
self.newsloader_kwargs = newsloader_kwargs | ||
|
||
def load(self) -> List[Document]: | ||
iter = self.lazy_load() | ||
if self.show_progress_bar: | ||
try: | ||
from tqdm import tqdm | ||
except ImportError as e: | ||
raise ImportError( | ||
"Package tqdm must be installed if show_progress_bar=True. " | ||
"Please install with 'pip install tqdm' or set " | ||
"show_progress_bar=False." | ||
) from e | ||
iter = tqdm(iter) | ||
return list(iter) | ||
|
||
@property | ||
def _get_urls(self) -> Sequence[str]: | ||
if self.urls: | ||
return self.urls | ||
try: | ||
import listparser | ||
except ImportError as e: | ||
raise ImportError( | ||
"Package listparser must be installed if the opml arg is used. " | ||
"Please install with 'pip install listparser' or use the " | ||
"urls arg instead." | ||
) from e | ||
rss = listparser.parse(self.opml) | ||
return [feed.url for feed in rss.feeds] | ||
|
||
def lazy_load(self) -> Iterator[Document]: | ||
try: | ||
import feedparser # noqa:F401 | ||
except ImportError: | ||
raise ImportError( | ||
"feedparser package not found, please install it with " | ||
"`pip install feedparser`" | ||
) | ||
|
||
for url in self._get_urls: | ||
try: | ||
feed = feedparser.parse(url) | ||
if getattr(feed, "bozo", False): | ||
raise ValueError( | ||
f"Error fetching {url}, exception: {feed.bozo_exception}" | ||
) | ||
except Exception as e: | ||
if self.continue_on_failure: | ||
logger.error(f"Error fetching {url}, exception: {e}") | ||
continue | ||
else: | ||
raise e | ||
try: | ||
for entry in feed.entries: | ||
loader = NewsURLLoader( | ||
urls=[entry.link], | ||
**self.newsloader_kwargs, | ||
) | ||
article = loader.load()[0] | ||
article.metadata["feed"] = url | ||
yield article | ||
except Exception as e: | ||
if self.continue_on_failure: | ||
logger.error(f"Error processing entry {entry.link}, exception: {e}") | ||
continue | ||
else: | ||
raise e |
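To round this out, a small sketch of driving the loader above end to end with the sample OPML file from this commit. The path is relative to the docs example_data directory, and the actual output depends on the live feeds, so treat it as illustrative only:

from langchain.document_loaders import RSSFeedLoader

with open("example_data/sample_rss_feeds.opml") as f:
    loader = RSSFeedLoader(opml=f.read())

# lazy_load() streams one Document per article; load() collects them into a list
for doc in loader.lazy_load():
    print(doc.metadata["feed"])       # the feed URL tagged by the loader
    print(doc.page_content[:80])      # article text extracted via NewsURLLoader
    break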