Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: Support OneDrive URLs #596

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
15 changes: 15 additions & 0 deletions daras_ai_v2/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from loguru import logger
from requests import HTTPError
from starlette.status import HTTP_402_PAYMENT_REQUIRED
from starlette.status import HTTP_401_UNAUTHORIZED

from daras_ai_v2 import settings

Expand Down Expand Up @@ -103,6 +104,20 @@ def __init__(self, user: "AppUser", sr: "SavedRun"):
super().__init__(message, status_code=HTTP_402_PAYMENT_REQUIRED)


class OneDriveAuth(UserError):
    """Error raised when a workspace's OneDrive tokens are missing.

    The message is an HTML snippet containing a login link that points at
    ``auth_url``; the error carries an HTTP 401 status code.
    """

    def __init__(self, auth_url):
        # The template lines are kept flush-left so the rendered text
        # carries no extra leading whitespace.
        login_prompt = f"""
<p>
Onedrive access token or refresh token is missing for this workspace.
</p>

<p>
<a href="{auth_url}">Please login</a> to your Onedrive account to use Onedrive files.
</p>
"""
        super().__init__(login_prompt, status_code=HTTP_401_UNAUTHORIZED)


FFMPEG_ERR_MSG = (
"Unsupported File Format\n\n"
"We encountered an issue processing your file as it appears to be in a format not supported by our system or may be corrupted. "
Expand Down
98 changes: 98 additions & 0 deletions daras_ai_v2/onedrive_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import base64

import requests
from furl import furl

from bots.models import SavedRun
from daras_ai_v2.exceptions import UserError
from daras_ai_v2.exceptions import raise_for_status, OneDriveAuth
from routers.onedrive_api import (
generate_onedrive_auth_url,
get_access_token_from_refresh_token,
)


def is_onedrive_url(f: furl) -> bool:
    """Tell whether ``f`` is a OneDrive sharing link.

    Args:
        f: Parsed URL; only its ``host`` attribute is inspected.

    Returns:
        True for ``1drv.ms`` short links, False for every other host.

    Raises:
        UserError: For direct ``onedrive.live.com`` links, which are
            rejected with a hint to use a shareable link instead.
    """
    if f.host == "onedrive.live.com":
        raise UserError(
            "Direct onedrive.live.com links are not supported. Please provide a shareable OneDrive link (from Share > Copy Link) E.g. https://1drv.ms/xxx"
        )
    # Always return a real bool so the declared return type holds
    # (previously non-matching hosts fell through and returned None).
    return f.host == "1drv.ms"


# Maps standard base64 to the unpadded URL-safe alphabet:
# "/" -> "_", "+" -> "-", and "=" padding is dropped.
_url_encode_translation = str.maketrans("/+", "_-", "=")


def encode_onedrive_url(sharing_url: str) -> str:
    """Encode a sharing URL into the ``u!...`` token used by the shares API.

    The URL is base64-encoded, converted to the unpadded URL-safe alphabet
    and prefixed with ``u!``.
    """
    # https://learn.microsoft.com/en-us/onedrive/developer/rest-api/api/shares_get
    raw_bytes = sharing_url.encode()
    token = str(base64.b64encode(raw_bytes), "ascii")
    return "u!" + token.translate(_url_encode_translation)


def onedrive_download(mime_type: str, export_links: dict):
    """Download a OneDrive file through its pre-resolved download link.

    Args:
        mime_type: Key used to look up the download URL in ``export_links``.
        export_links: Mapping of mime type to download URL.

    Returns:
        A ``(content_bytes, mime_type)`` tuple.

    Raises:
        ValueError: If ``export_links`` has no usable URL for ``mime_type``.
    """
    url = export_links.get(mime_type)
    if url:
        response = requests.get(url)
        raise_for_status(response)
        return response.content, mime_type
    raise ValueError("Download URL not found in export_links. Cannot download file.")


def _onedrive_forbidden_message(f_url: str, user_name, auth_url: str) -> str:
    """Build the HTML shown when the connected account cannot open the file."""
    import html

    # f_url is user-supplied and the account name comes from an external
    # profile, so both are escaped before being embedded in markup.
    safe_url = html.escape(f_url, quote=True)
    safe_name = html.escape(str(user_name), quote=True)
    safe_auth_url = html.escape(auth_url, quote=True)
    return f"""
<p>
<a href="{safe_url}" target="_blank">This document </a> is not accessible by "{safe_name}".
Please share the document with this account (Share > Manage Access).
</p>
<p>
Alternatively, <a href="{safe_auth_url}" target="_blank">Login</a> with a OneDrive account that can access this file.
Note that you can only be logged in to one OneDrive account at a time.
</p>
"""


def onedrive_meta(f_url: str, sr: SavedRun, *, try_refresh: bool = True):
    """Fetch the Microsoft Graph driveItem metadata for a OneDrive sharing URL.

    Args:
        f_url: The OneDrive sharing URL.
        sr: Saved run whose workspace holds the OneDrive tokens.
        try_refresh: When True, a 401 response triggers one token refresh
            followed by a single retry.

    Returns:
        The decoded JSON metadata of the shared drive item.

    Raises:
        OneDriveAuth: If the workspace has no tokens, or refreshing/retrying
            fails with an HTTP error.
        UserError: If the link points to a folder, or Graph answers 403.
        requests.HTTPError: For any other HTTP failure.
    """
    # check if saved run workspace has onedrive_access_token and onedrive_refresh_token
    if not (
        sr.workspace
        and sr.workspace.onedrive_access_token
        and sr.workspace.onedrive_refresh_token
    ):
        raise OneDriveAuth(generate_onedrive_auth_url(sr.id))
    try:
        encoded_url = encode_onedrive_url(f_url)
        headers = {"Authorization": f"Bearer {sr.workspace.onedrive_access_token}"}
        r = requests.get(
            f"https://graph.microsoft.com/v1.0/shares/{encoded_url}/driveItem",
            headers=headers,
        )
        raise_for_status(r)
        metadata = r.json()

        if "folder" in metadata:
            raise UserError("Folders / OneNote are not supported yet.")

        return metadata

    except requests.HTTPError as e:
        if e.response.status_code == 401 and try_refresh:
            try:
                (
                    sr.workspace.onedrive_access_token,
                    sr.workspace.onedrive_refresh_token,
                ) = get_access_token_from_refresh_token(
                    sr.workspace.onedrive_refresh_token
                )
                # Persist BOTH tokens: the token endpoint hands back a new
                # refresh token too, and saving only the access token would
                # leave a stale refresh token in the database.
                sr.workspace.save(
                    update_fields=[
                        "onedrive_access_token",
                        "onedrive_refresh_token",
                    ]
                )
                # Retry once; try_refresh=False prevents an endless loop.
                return onedrive_meta(f_url, sr, try_refresh=False)
            except requests.HTTPError:
                raise OneDriveAuth(generate_onedrive_auth_url(sr.id))

        elif e.response.status_code == 403:
            raise UserError(
                message=_onedrive_forbidden_message(
                    f_url,
                    sr.workspace.onedrive_user_name,
                    generate_onedrive_auth_url(sr.id),
                )
            )

        else:
            raise
4 changes: 4 additions & 0 deletions daras_ai_v2/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,10 @@
SLACK_CLIENT_ID = config("SLACK_CLIENT_ID", "")
SLACK_CLIENT_SECRET = config("SLACK_CLIENT_SECRET", "")

ONEDRIVE_CLIENT_ID = config("ONEDRIVE_CLIENT_ID", "")
ONEDRIVE_CLIENT_SECRET = config("ONEDRIVE_CLIENT_SECRET", "")


TALK_JS_APP_ID = config("TALK_JS_APP_ID", "")
TALK_JS_SECRET_KEY = config("TALK_JS_SECRET_KEY", "")

Expand Down
28 changes: 24 additions & 4 deletions daras_ai_v2/vector_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from pydantic import BaseModel, Field

from app_users.models import AppUser
from celeryapp.tasks import get_running_saved_run
from daras_ai.image_input import (
get_mimetype_from_response,
safe_filename,
Expand Down Expand Up @@ -58,6 +59,11 @@
url_to_gdrive_file_id,
)
from daras_ai_v2.office_utils_pptx import pptx_to_text_pages
from daras_ai_v2.onedrive_downloader import (
is_onedrive_url,
onedrive_meta,
onedrive_download,
)
from daras_ai_v2.redis_cache import redis_lock
from daras_ai_v2.scraping_proxy import (
SCRAPING_PROXIES,
Expand Down Expand Up @@ -123,7 +129,9 @@ def references_as_prompt(references: list[SearchReference], sep="\n\n") -> str:


def get_top_k_references(
request: DocSearchRequest, is_user_url: bool = True, current_user: AppUser = None
request: DocSearchRequest,
is_user_url: bool = True,
current_user: AppUser | None = None,
) -> typing.Generator[str, None, list[SearchReference]]:
"""
Get the top k documents that ref the search query
Expand Down Expand Up @@ -376,6 +384,15 @@ def doc_url_to_file_metadata(f_url: str) -> FileMetadata:
mime_type = meta["mimeType"]
total_bytes = int(meta.get("size") or 0)
export_links = meta.get("exportLinks", None)

elif is_onedrive_url(f):
meta = onedrive_meta(f_url, get_running_saved_run())
name = meta["name"]
etag = meta.get("eTag") or meta.get("lastModifiedDateTime")
mime_type = meta["file"]["mimeType"]
total_bytes = int(meta.get("size") or 0)
export_links = {mime_type: meta["@microsoft.graph.downloadUrl"]}

else:
if is_user_uploaded_url(f_url):
kwargs = {}
Expand Down Expand Up @@ -835,14 +852,17 @@ def download_content_bytes(
f_url: str,
mime_type: str,
is_user_url: bool = True,
export_links: dict[str, str] = {},
export_links: dict[str, str] | None = None,
) -> tuple[bytes, str]:
if export_links is None:
export_links = {}
if is_yt_dlp_able_url(f_url):
return download_youtube_to_wav(f_url), "audio/wav"
f = furl(f_url)
if is_gdrive_url(f):
# download from google drive
return gdrive_download(f, mime_type, export_links)
elif is_onedrive_url(f):
return onedrive_download(mime_type, export_links)
try:
# download from url
if is_user_uploaded_url(f_url):
Expand Down Expand Up @@ -990,7 +1010,7 @@ def tabular_bytes_to_any_df(
df = pd.read_json(f, dtype=dtype)
case "application/xml":
df = pd.read_xml(f, dtype=dtype)
case _ if "excel" in mime_type or "spreadsheet" in mime_type:
case _ if "excel" in mime_type or "sheet" in mime_type:
df = pd.read_excel(f, dtype=dtype)
case _:
raise UnsupportedDocumentError(
Expand Down
133 changes: 133 additions & 0 deletions routers/onedrive_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import json

import requests
from fastapi import HTTPException
from fastapi.responses import RedirectResponse
from furl import furl
from starlette.requests import Request
from starlette.responses import HTMLResponse

from bots.models import SavedRun
from daras_ai_v2 import settings
from daras_ai_v2.exceptions import raise_for_status
from routers.custom_api_router import CustomAPIRouter

app = CustomAPIRouter()


@app.get("/__/onedrive/connect/")
def onedrive_connect_redirect(request: Request):
    """OAuth callback that stores OneDrive credentials on a run's workspace.

    Anonymous users are redirected to the login page. Otherwise the ``code``
    query parameter is exchanged for tokens, which are saved on the workspace
    of the saved run referenced by the ``state`` parameter together with the
    account's display name. The user is then redirected back to the run.

    Returns:
        A redirect response, or a 400 HTML response when ``code`` is missing.
    """
    import html

    from daras_ai_v2.base import SUBMIT_AFTER_LOGIN_Q

    if not request.user or request.user.is_anonymous:
        redirect_url = furl("/login", query_params={"next": request.url})
        return RedirectResponse(str(redirect_url))

    code = request.query_params.get("code")
    if not code:
        # These values come straight from the query string and are rendered
        # as HTML, so they must be escaped to avoid reflected XSS.
        error = html.escape(str(request.query_params.get("error")))
        error_description = html.escape(
            str(request.query_params.get("error_description"))
        )
        return HTMLResponse(
            f"Authorization code missing! {error} : {error_description}",
            status_code=400,
        )

    user_access_token, user_refresh_token = _get_access_token_from_code(code)
    user_display_name = _get_user_display_name(user_access_token)

    sr = load_sr_from_state(request)

    # NOTE(review): nothing here verifies that request.user may modify
    # sr.workspace, and sr.workspace is assumed to be non-null - confirm both.
    sr.workspace.onedrive_access_token = user_access_token
    sr.workspace.onedrive_refresh_token = user_refresh_token
    sr.workspace.onedrive_user_name = user_display_name
    sr.workspace.save(
        update_fields=[
            "onedrive_access_token",
            "onedrive_refresh_token",
            "onedrive_user_name",
        ]
    )

    redirect_url = sr.get_app_url({SUBMIT_AFTER_LOGIN_Q: "1"})
    return RedirectResponse(redirect_url.url)


def generate_onedrive_auth_url(sr_id: int) -> str:
    """Build the Microsoft OAuth URL to start interactive authorization.

    Args:
        sr_id: Id of the saved run; it is JSON-encoded into the ``state``
            parameter so the callback can locate the run again.

    Returns:
        str: Fully constructed OAuth authorization URL
    """
    scopes = [
        "Files.Read.All",
        "offline_access",
        "User.Read",
    ]
    return furl(
        "https://login.microsoftonline.com/common/oauth2/v2.0/authorize",
        query_params={
            "client_id": settings.ONEDRIVE_CLIENT_ID,
            "redirect_uri": onedrive_connect_redirect_url,
            "response_type": "code",
            # The authorize endpoint expects scopes as a space-separated
            # list; a comma-joined value is read as one unknown scope.
            "scope": " ".join(scopes),
            "state": json.dumps(dict(sr_id=sr_id)),
        },
    ).tostr()


def load_sr_from_state(request: Request) -> SavedRun:
    """Load the saved run referenced by the OAuth ``state`` query parameter.

    ``state`` is expected to be a JSON object with an ``sr_id`` key.

    Raises:
        HTTPException: 400 if ``state`` is not a JSON object, 404 if no
            saved run matches the id.
    """
    try:
        state = json.loads(request.query_params.get("state") or "{}")
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        raise HTTPException(status_code=400, detail="Invalid state parameter")
    if not isinstance(state, dict):
        raise HTTPException(status_code=400, detail="Invalid state parameter")

    try:
        return SavedRun.objects.get(id=state.get("sr_id"))
    except (SavedRun.DoesNotExist, ValueError, TypeError):
        # ValueError/TypeError cover ids that cannot be coerced to the
        # primary-key type; treat them like an unknown run.
        raise HTTPException(status_code=404, detail="Published Run not found")


def _get_user_display_name(code: str) -> str:
    """Return the ``displayName`` of the Microsoft Graph ``/me`` profile.

    Despite its name, ``code`` is sent as the Bearer credential, so callers
    must pass an access token.
    """
    response = requests.get(
        url="https://graph.microsoft.com/v1.0/me",
        headers={"Authorization": f"Bearer {code}"},
    )
    raise_for_status(response)
    profile = response.json()
    return profile["displayName"]


def _get_access_token_from_code(code: str) -> tuple[str, str]:
    """Exchange an OAuth authorization code for tokens.

    Returns:
        An ``(access_token, refresh_token)`` tuple taken from the token
        endpoint's JSON response.
    """
    form = {
        "client_id": settings.ONEDRIVE_CLIENT_ID,
        "client_secret": settings.ONEDRIVE_CLIENT_SECRET,
        "redirect_uri": onedrive_connect_redirect_url,
        "grant_type": "authorization_code",
        "code": code,
    }
    response = requests.post(
        url="https://login.microsoftonline.com/common/oauth2/v2.0/token",
        data=form,
        headers={"Content-Type": "application/x-www-form-urlencoded"},
    )
    raise_for_status(response)
    tokens = response.json()
    return tokens["access_token"], tokens["refresh_token"]


def get_access_token_from_refresh_token(refresh_token: str) -> tuple[str, str]:
    """Obtain a fresh token pair using a stored refresh token.

    Returns:
        An ``(access_token, refresh_token)`` tuple taken from the token
        endpoint's JSON response.
    """
    # https://learn.microsoft.com/en-us/onedrive/developer/rest-api/getting-started/graph-oauth?view=odsp-graph-online
    form = {
        "client_id": settings.ONEDRIVE_CLIENT_ID,
        "client_secret": settings.ONEDRIVE_CLIENT_SECRET,
        "redirect_uri": onedrive_connect_redirect_url,
        "grant_type": "refresh_token",
        "refresh_token": refresh_token,
    }
    response = requests.post(
        "https://login.microsoftonline.com/common/oauth2/v2.0/token",
        data=form,
        headers={"Content-Type": "application/x-www-form-urlencoded"},
    )
    raise_for_status(response)
    tokens = response.json()
    return tokens["access_token"], tokens["refresh_token"]


# Absolute URL of the OAuth callback route: APP_BASE_URL joined with the
# router path of onedrive_connect_redirect. It is sent as `redirect_uri`
# in the authorize and token requests in this module.
onedrive_connect_redirect_url = (
furl(settings.APP_BASE_URL) / app.url_path_for(onedrive_connect_redirect.__name__)
).tostr()
2 changes: 2 additions & 0 deletions server.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
bots_api,
twilio_api,
static_pages,
onedrive_api,
)

app = FastAPI(title="GOOEY.AI", docs_url=None, redoc_url="/docs")
Expand All @@ -58,6 +59,7 @@
app.include_router(broadcast_api.app)
app.include_router(account.app, include_in_schema=False)
app.include_router(facebook_api.app, include_in_schema=False)
app.include_router(onedrive_api.app, include_in_schema=False)
app.include_router(slack_api.router, include_in_schema=False)
app.include_router(url_shortener.app, include_in_schema=False)
app.include_router(paypal.router, include_in_schema=False)
Expand Down
6 changes: 6 additions & 0 deletions workspaces/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ class WorkspaceAdmin(SafeDeleteAdmin):
"domain_name",
"created_at",
"updated_at",
"onedrive_user_name",
"onedrive_access_token",
"onedrive_refresh_token",
] + list(SafeDeleteAdmin.list_display)
list_filter = (
[
Expand All @@ -82,6 +85,9 @@ class WorkspaceAdmin(SafeDeleteAdmin):
("total_payments", "total_charged", "total_usage_cost"),
("created_at", "updated_at"),
"open_in_stripe",
"onedrive_user_name",
"onedrive_access_token",
"onedrive_refresh_token",
]
search_fields = ["name", "created_by__display_name", "domain_name", "handle__name"]
readonly_fields = [
Expand Down
Loading