Skip to content

Commit

Permalink
community: add Yuque document loader
Browse files Browse the repository at this point in the history
  • Loading branch information
Dounx committed Feb 22, 2024
1 parent 919b8a3 commit fcd54a0
Show file tree
Hide file tree
Showing 3 changed files with 171 additions and 0 deletions.
77 changes: 77 additions & 0 deletions docs/docs/integrations/document_loaders/yuque.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "66a7777e",
"metadata": {},
"source": [
"# Yuque\n",
"\n",
">[Yuque](https://www.yuque.com/) is a professional cloud-based knowledge base for team collaboration in documentation.\n",
"\n",
"This notebook covers how to load documents from `Yuque`.\n",
"\n",
"You can obtain a personal access token by clicking your avatar and opening the [Personal Settings](https://www.yuque.com/settings/tokens) page."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9ec8a3b3",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain_community.document_loaders import YuqueLoader"
]
},
{
"cell_type": "code",
"outputs": [],
"source": [
"loader = YuqueLoader(access_token=\"<your_personal_access_token>\")"
],
"metadata": {
"collapsed": false
},
"id": "2ea958f0327ed6e8"
},
{
"cell_type": "code",
"execution_count": null,
"id": "3470dadf",
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"docs = loader.load()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@
GoogleApiYoutubeLoader,
YoutubeLoader,
)
from langchain_community.document_loaders.yuque import YuqueLoader

# Legacy: only for backwards compatibility. Use PyPDFLoader instead
PagedPDFSplitter = PyPDFLoader
Expand Down Expand Up @@ -415,4 +416,5 @@
"XorbitsLoader",
"YoutubeAudioLoader",
"YoutubeLoader",
"YuqueLoader",
]
92 changes: 92 additions & 0 deletions libs/community/langchain_community/document_loaders/yuque.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import re
from typing import List, Dict, Iterator
import requests

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader


class YuqueLoader(BaseLoader):

Check failure on line 10 in libs/community/langchain_community/document_loaders/yuque.py

View workflow job for this annotation

GitHub Actions / cd libs/community / - / make lint #3.8

Ruff (I001)

langchain_community/document_loaders/yuque.py:1:1: I001 Import block is un-sorted or un-formatted
"""Load documents from `Yuque`."""

def __init__(self, access_token: str, api_url: str = "https://www.yuque.com"):
    """Initialize with Yuque access_token and api_url.

    Args:
        access_token: Personal access token - see
            https://www.yuque.com/settings/tokens.
        api_url: Base URL of the Yuque API. Trailing slashes are removed
            before the value is stored.
    """
    self.access_token = access_token
    # Endpoint URLs in this class are built as f"{self.api_url}/api/v2/...",
    # so a base URL ending in "/" would otherwise produce "//api/v2/...".
    self.api_url = api_url.rstrip("/")

@property
def headers(self) -> Dict[str, str]:
    """Build the HTTP headers used for Yuque API requests.

    Returns:
        A new dict with a JSON ``Content-Type`` and the access token in
        the ``X-Auth-Token`` header.
    """
    request_headers: Dict[str, str] = {"Content-Type": "application/json"}
    request_headers["X-Auth-Token"] = self.access_token
    return request_headers

def get_user_id(self) -> int:
    """Return the ``id`` reported by the ``/api/v2/user`` endpoint.

    Returns:
        The value found at ``data.id`` in the JSON response.
    """
    payload = self.http_get(url=f"{self.api_url}/api/v2/user")
    user = payload["data"]
    return user["id"]

def get_books(self, user_id: int) -> List[Dict]:
    """Fetch the repositories ("books") listed for a user.

    Args:
        user_id: Identifier interpolated into ``/api/v2/users/{user_id}/repos``.

    Returns:
        The ``data`` member of the JSON response.
    """
    endpoint = f"{self.api_url}/api/v2/users/{user_id}/repos"
    payload = self.http_get(url=endpoint)
    return payload["data"]

def get_document_ids(self, book_id: int) -> List[int]:
    """Collect the ids of the documents listed for a repository.

    Args:
        book_id: Identifier interpolated into ``/api/v2/repos/{book_id}/docs``.

    Returns:
        The ``id`` of each entry in the response's ``data`` list, in
        response order.
    """
    listing = self.http_get(url=f"{self.api_url}/api/v2/repos/{book_id}/docs")
    ids: List[int] = []
    for entry in listing["data"]:
        ids.append(entry["id"])
    return ids

def get_document(self, book_id: int, document_id: int) -> Dict:
    """Fetch a single document from a repository.

    Args:
        book_id: Repository identifier used in the request path.
        document_id: Document identifier used in the request path.

    Returns:
        The ``data`` member of the JSON response.
    """
    endpoint = f"{self.api_url}/api/v2/repos/{book_id}/docs/{document_id}"
    payload = self.http_get(url=endpoint)
    return payload["data"]

def parse_document(self, document: Dict) -> Document:
    """Convert a Yuque document payload into a ``Document``.

    Args:
        document: Mapping that must contain ``body``, ``title``,
            ``description``, ``created_at`` and ``updated_at``.

    Returns:
        A ``Document`` whose ``page_content`` is the cleaned body and whose
        metadata carries the four descriptive fields.

    Raises:
        KeyError: If any of the expected keys is missing.
    """
    metadata_keys = ("title", "description", "created_at", "updated_at")
    text = self.parse_document_body(document["body"])
    return Document(
        page_content=text,
        metadata={key: document[key] for key in metadata_keys},
    )

@staticmethod
def parse_document_body(body: str) -> str:
    """Remove anchor and line-break markup from a document body.

    Two kinds of tags are deleted: empty named anchors of the form
    ``<a name="..."></a>`` and ``<br>`` / ``<br/>`` / ``<br />`` tags.
    All other text is returned unchanged.

    Args:
        body: Raw document body.

    Returns:
        The body with the tags above removed.
    """
    # Match the attribute value with [^"]* rather than a greedy ".*":
    # a greedy match runs from the first anchor's opening quote to the last
    # anchor's closing quote on the line and deletes the text in between.
    result = re.sub(r'<a name="[^"]*"></a>', "", body)
    result = re.sub(r"<br\s*/?>", "", result)

    return result

def http_get(self, url: str, timeout: float = 30.0) -> Dict:
    """Send an authenticated GET request and decode the JSON response.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on an
            unresponsive server.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the response status indicates an error.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, headers=self.headers, timeout=timeout)
    response.raise_for_status()

    return response.json()

def get_documents(self) -> Iterator[Document]:
    """Lazily yield every document from every repository of the user.

    Resolves the user id, lists that user's repositories, then fetches and
    parses each document one at a time as the iterator is consumed.

    Yields:
        One parsed ``Document`` per document id, grouped by repository.
    """
    owner_id = self.get_user_id()

    for book in self.get_books(owner_id):
        repo_id = book["id"]
        for doc_id in self.get_document_ids(repo_id):
            raw = self.get_document(repo_id, doc_id)
            yield self.parse_document(raw)

def load(self) -> List[Document]:
    """Load documents from `Yuque`.

    Returns:
        Every document produced by ``get_documents``, collected into a list.
    """
    documents: List[Document] = []
    documents.extend(self.get_documents())
    return documents

0 comments on commit fcd54a0

Please sign in to comment.