Skip to content

Commit

Permalink
Try to decode GitHub's bunkum. This is a partial fix for #733.
Browse files Browse the repository at this point in the history
Signed-off-by: Daira Emma Hopwood <[email protected]>
  • Loading branch information
daira committed Oct 28, 2023
1 parent 6db2ef8 commit 85f5193
Showing 1 changed file with 15 additions and 4 deletions.
19 changes: 15 additions & 4 deletions links_and_dests.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
import sys
from time import sleep
import ssl
from io import BytesIO
from io import BytesIO, StringIO
import json

try:
from bs4 import BeautifulSoup
Expand Down Expand Up @@ -52,13 +53,23 @@ def get_links_and_destinations_from_html(f):
dests = set()

soup = BeautifulSoup(f.read(), "html5lib")

# First try to find this: <script type="application/json" data-target="react-app.embeddedData">
# If it exists, its content is JSON whose payload.blob.richText field holds the real HTML
# content, which we parse recursively to get the links and destinations.
for script in soup.find_all('script'):
if script.get('data-target') == "react-app.embeddedData":
content = json.loads(script.string).get('payload', {}).get('blob', {}).get('richText')
if content is not None:
(links, dests) = get_links_and_destinations_from_html(StringIO(content))
break

for link in soup.find_all('a'):
if link.has_attr('href'):
url = link['href']
(internal if url.startswith('#') else links).add(url)
url = link['href']
(internal if url.startswith('#') else links).add(url)

if link.has_attr('name'):
dests.add(link['name'])
dests.add(link['name'])

for link in soup.find_all(id=True):
dests.add(link['id'])
Expand Down

0 comments on commit 85f5193

Please sign in to comment.