Skip to content

Commit

Permalink
Added support for all webperf.se categories as -i sources
Browse files Browse the repository at this point in the history
Use, for example:
python default.py -i help.webprf
to see all available categories.
  • Loading branch information
7h3Rabbit committed Jan 8, 2025
1 parent 26e4d4c commit 4d429d1
Showing 1 changed file with 56 additions and 14 deletions.
70 changes: 56 additions & 14 deletions engines/webperf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
import json
import re
from engines.utils import use_item
from tests.utils import get_http_content
Expand All @@ -24,22 +25,63 @@ def read_sites(input_url, input_skip, input_take):
list: The list of sites read from the specified category on https://webperf.se.
"""
sites = []

if 'offentlig-sektor' in input_url:
input_url = 'https://webperf.se/category/ovrig-offentlig-sektor/'
elif 'kommuner' in input_url:
input_url = 'https://webperf.se/category/kommuner/'
elif 'regioner' in input_url:
input_url = 'https://webperf.se/category/regioner/'
elif 'toplist' in input_url:
input_url = 'https://webperf.se/toplist/'
elif 'digitalt' in input_url:
input_url = 'https://webperf.se/category/digitalt-sverige/'
elif 'webbyraer' in input_url:
input_url = 'https://webperf.se/category/webbyraer/'
all_categories_url = 'https://webperf.se/sites/'
categories_fallback = {
'offentlig-sektor': '/category/ovrig-offentlig-sektor/',
'kommuner': '/category/kommuner/',
'regioner': '/category/regioner/',
'toplist': '/toplist/',
'digitalt': '/category/digitalt-sverige/',
'webbyraer': '/category/webbyraer/'
}

all_categories_content = get_http_content(all_categories_url)
if all_categories_content != '':
categories = {}
categories_regex = r"<th scope=\"col\">Kategori<\/th>.*?<tbody>(?P<categories>.*?)<\/tbody>"
categories_matches = re.finditer(
categories_regex, all_categories_content, re.MULTILINE | re.S)
for _, match in enumerate(categories_matches, start=1):
all_categories_subcontent = match.group('categories')
# <a href=\"(?P<url>\/category\/(?P<name>[^\"]+)/)\">
category_regex = r"<a href=\"(?P<url>\/category\/(?P<name>[^\"]+)/)\">"
category_matches = re.finditer(
category_regex, all_categories_subcontent, re.MULTILINE | re.S)
for _, match in enumerate(category_matches, start=1):
category_url = match.group('url')
category_name = match.group('name')
categories[category_name] = category_url
else:
categories = categories_fallback

found = False
for category_name, category_url in categories.items():
if category_name in input_url:
input_url = category_url
found = True

if not found:
for category_name, category_url in categories_fallback.items():
if category_name in input_url:
input_url = category_url
found = True

if found:
input_url = f'https://webperf.se{input_url}'
else:
raise NotImplementedError('input is incorrect')
print('Error: No valid webperf option')
print('')
print('Available webperf.se input values:')
for category_name, category_url in categories.items():
print(f'-i {category_name}.webprf')
return sites

sites.extend(get_category_sites(input_url, input_skip, input_take))
return sites

def get_category_sites(input_url, input_skip, input_take):
print(f'Retrieving sites from {input_url}')
sites = []
category_content = get_http_content(input_url)

category_regex = r"<a href=\"(?P<detail_url>\/site\/[^\"]+)\""
Expand Down

0 comments on commit 4d429d1

Please sign in to comment.