From d9cc16963ff66e71b431707c236770554dd3d510 Mon Sep 17 00:00:00 2001 From: Andrwe Lord Weber Date: Thu, 11 Jan 2024 13:27:33 +0100 Subject: [PATCH 01/11] improve output & add check for runtime logs * privoxy-blocklist: remove unnecessary newline * privoxy-blocklist: improve debug() function * tests: add check for errors in privoxy runtime log Signed-off-by: Andrwe Lord Weber --- privoxy-blocklist.sh | 86 +++++++++++++++++++---------------- tests/conftest.py | 26 ++++++----- tests/test_01_root_execute.py | 4 ++ 3 files changed, 64 insertions(+), 52 deletions(-) diff --git a/privoxy-blocklist.sh b/privoxy-blocklist.sh index 8fd2b7b..314ad28 100755 --- a/privoxy-blocklist.sh +++ b/privoxy-blocklist.sh @@ -69,7 +69,7 @@ function get_config_path() { function prepare() { if [ ${UID} -ne 0 ]; then - error -e "Root privileges needed. Exit.\n" + error "Root privileges needed. Exit." usage exit 1 fi @@ -82,8 +82,6 @@ function prepare() { fi done - OS="$(uname)" - if [ -z "${SCRIPTCONF:-}" ]; then get_config_path fi @@ -128,7 +126,7 @@ EOF fi if [[ ! -r "${SCRIPTCONF}" ]]; then - debug "Can't read ${SCRIPTCONF}. Permission denied." -1 + debug -1 "Can't read ${SCRIPTCONF}. Permission denied." 
fi # shellcheck disable=SC1090 @@ -157,8 +155,7 @@ EOF PRIVOXY_CONF="/etc/privoxy/config" ;; esac - PRIVOXY_CONF="/etc/privoxy/config" - info "\$PRIVOXY_CONF isn't set, falling back to '/etc/privoxy/config'" + info "\$PRIVOXY_CONF isn't set, falling back to '${PRIVOXY_CONF}'" fi if [[ -z "${PRIVOXY_USER:-}" ]]; then PRIVOXY_USER="privoxy" @@ -174,8 +171,14 @@ EOF } function debug() { - if [ "${DBG}" -ge "${2}" ]; then - echo -e "${1}" + local expected_level="${1}" + shift 1 + if [ "${DBG}" -ge "${expected_level}" ]; then + if [ "${expected_level}" -eq 0 ]; then + info "${@}" + else + printf '%s\n' "${@}" + fi fi } @@ -190,7 +193,7 @@ function info() { # shellcheck disable=SC2317 function main() { for url in "${URLS[@]}"; do - debug "Processing ${url} ...\n" 0 + debug 0 "Processing ${url} ..." file="${TMPDIR}/$(basename "${url}")" address_file="${TMPDIR}/$(basename "${url}").address" address_except_file="${TMPDIR}/$(basename "${url}").address_except" @@ -205,10 +208,10 @@ function main() { list="$(basename "${file%\.*}")" # download list - debug "Downloading ${url} ..." 0 + debug 0 "Downloading ${url} ..." wget -t 3 --no-check-certificate -O "${file}" "${url}" > "${TMPDIR}/wget-${url//\//#}.log" 2>&1 - debug "$(cat "${TMPDIR}/wget-${url//\//#}.log")" 2 - debug ".. downloading done." 0 + debug 2 "$(cat "${TMPDIR}/wget-${url//\//#}.log")" + debug 0 ".. downloading done." if ! grep -qE '^.*\[Adblock.*\].*$' "${file}"; then info "The list recieved from ${url} does not contain AdblockPlus list header. Try to process anyway." fi @@ -233,76 +236,76 @@ function main() { # convert AdblockPlus list to Privoxy list # blacklist of urls - debug "Creating actionfile for ${list} ..." 1 + debug 1 "Creating actionfile for ${list} ..." echo -e "{ +block{${list}} }" > "${actionfile}" sed '/\$.*/d;/#/d;s/\?/\\?/g;s/\*/.*/g;s/(/\\(/g;s/)/\\)/g;s/\[/\\[/g;s/\]/\\]/g;s/\^$//g;s/^||/\./g;s/^|/^/g;s/|$/\$/g;/|/d' "${domain_name_file}" >> "${actionfile}" - debug "... 
creating filterfile for ${list} ..." 1 + debug 1 "... creating filterfile for ${list} ..." echo "FILTER: ${list} Tag filter of ${list}" > "${filterfile}" # set filter for html elements sed '/^#/!d;s/^##//g;s/^#\(.*\)\[.*\]\[.*\]*/s@<([a-zA-Z0-9]+)\\s+.*id=.?\1.*>.*<\/\\1>@@g/g;s/^#\(.*\)/s@<([a-zA-Z0-9]+)\\s+.*id=.?\1.*>.*<\/\\1>@@g/g;s/^\.\(.*\)/s@<([a-zA-Z0-9]+)\\s+.*class=.?\1.*>.*<\/\\1>@@g/g;s/^a\[\(.*\)\]/s@.*<\/a>@@g/g;s/^\([a-zA-Z0-9]*\)\.\(.*\)\[.*\]\[.*\]*/s@<\1.*class=.?\2.*>.*<\/\1>@@g/g;s/^\([a-zA-Z0-9]*\)#\(.*\):.*[\:[^:]]*[^:]*/s@<\1.*id=.?\2.*>.*<\/\1>@@g/g;s/^\([a-zA-Z0-9]*\)#\(.*\)/s@<\1.*id=.?\2.*>.*<\/\1>@@g/g;s/^\[\([a-zA-Z]*\).=\(.*\)\]/s@\1^=\2>@@g/g;s/\^/[\/\&:\?=_]/g;s/\.\([a-zA-Z0-9]\)/\\.\1/g' "${file}" >> "${filterfile}" - debug "... filterfile created - adding filterfile to actionfile ..." 1 + debug 1 "... filterfile created - adding filterfile to actionfile ..." echo "{ +filter{${list}} }" >> "${actionfile}" echo "*" >> "${actionfile}" - debug "... filterfile added ..." 1 + debug 1 "... filterfile added ..." # create domain based whitelist # create domain based blacklist # domains=$(sed '/^#/d;/#/!d;s/,~/,\*/g;s/~/;:\*/g;s/^\([a-zA-Z]\)/;:\1/g' ${file}) - # [ -n "${domains}" ] && debug "... creating domainbased filterfiles ..." 1 - # debug "Found Domains: ${domains}." 2 + # [ -n "${domains}" ] && debug 1 "... creating domainbased filterfiles ..." + # debug 2 "Found Domains: ${domains}." # ifs=$IFS # IFS=";:" # for domain in ${domains} # do # dns=$(echo ${domain} | awk -F ',' '{print $1}' | awk -F '#' '{print $1}') - # debug "Modifying line: ${domain}" 2 - # debug " ... creating filterfile for ${dns} ..." 1 + # debug 2 "Modifying line: ${domain}" + # debug 1 " ... creating filterfile for ${dns} ..." # sed '' ${file} > ${file%\.*}-${dns%~}.script.filter - # debug " ... filterfile created ..." 1 - # debug " ... adding filterfile for ${dns} to actionfile ..." 1 + # debug 1 " ... filterfile created ..." + # debug 1 " ... 
adding filterfile for ${dns} to actionfile ..." # echo "{ +filter{${list}-${dns}} }" >> ${actionfile} # echo "${dns}" >> ${actionfile} - # debug " ... filterfile added ..." 1 + # debug 1 " ... filterfile added ..." # done # IFS=${ifs} - # debug "... all domainbased filterfiles created ..." 1 + # debug 1 "... all domainbased filterfiles created ..." - debug "... creating and adding whitlist for urls ..." 1 + debug 1 "... creating and adding whitlist for urls ..." # whitelist of urls echo "{ -block }" >> "${actionfile}" sed 's/^@@//g;/\$.*/d;/#/d;s/\./\\./g;s/\?/\\?/g;s/\*/.*/g;s/(/\\(/g;s/)/\\)/g;s/\[/\\[/g;s/\]/\\]/g;s/\^/[\/\&:\?=_]/g;s/^||/\./g;s/^|/^/g;s/|$/\$/g;/|/d' "${domain_name_except_file}" >> "${actionfile}" - debug "... created and added whitelist - creating and adding image handler ..." 1 + debug 1 "... created and added whitelist - creating and adding image handler ..." # whitelist of image urls echo "{ -block +handle-as-image }" >> "${actionfile}" sed '/^@@.*/!d;s/^@@//g;/\$.*image.*/!d;s/\$.*image.*//g;/#/d;s/\./\\./g;s/\?/\\?/g;s/\*/.*/g;s/(/\\(/g;s/)/\\)/g;s/\[/\\[/g;s/\]/\\]/g;s/\^/[\/\&:\?=_]/g;s/^||/\./g;s/^|/^/g;s/|$/\$/g;/|/d' "${file}" >> "${actionfile}" - debug "... created and added image handler ..." 1 - debug "... created actionfile for ${list}." 1 + debug 1 "... created and added image handler ..." + debug 1 "... created actionfile for ${list}." # install Privoxy actionsfile install -o "${PRIVOXY_USER}" -g "${PRIVOXY_GROUP}" "${VERBOSE[@]}" "${actionfile}" "${PRIVOXY_DIR}" if ! grep -q "$(basename "${actionfile}")" "${PRIVOXY_CONF}"; then - debug "\nModifying ${PRIVOXY_CONF} ..." 0 + debug 0 "Modifying ${PRIVOXY_CONF} ..." sed "s/^actionsfile user\.action/actionsfile $(basename "${actionfile}")\nactionsfile user.action/" "${PRIVOXY_CONF}" > "${TMPDIR}/config" - debug "... modification done.\n" 0 - debug "Installing new config ..." 0 + debug 0 "... modification done." + debug 0 "Installing new config ..." 
install -o "${PRIVOXY_USER}" -g "${PRIVOXY_GROUP}" "${VERBOSE[@]}" "${TMPDIR}/config" "${PRIVOXY_CONF}" - debug "... installation done\n" 0 + debug 0 "... installation done" fi # install Privoxy filterfile install -o "${PRIVOXY_USER}" -g "${PRIVOXY_GROUP}" "${VERBOSE[@]}" "${filterfile}" "${PRIVOXY_DIR}" if ! grep -q "$(basename "${filterfile}")" "${PRIVOXY_CONF}"; then - debug "\nModifying ${PRIVOXY_CONF} ..." 0 + debug 0 "Modifying ${PRIVOXY_CONF} ..." sed "s/^\(#*\)filterfile user\.filter/filterfile $(basename "${filterfile}")\n\1filterfile user.filter/" "${PRIVOXY_CONF}" > "${TMPDIR}/config" - debug "... modification done.\n" 0 - debug "Installing new config ..." 0 + debug 0 "... modification done." + debug 0 "Installing new config ..." install -o "${PRIVOXY_USER}" -g "${PRIVOXY_GROUP}" "${VERBOSE[@]}" "${TMPDIR}/config" "${PRIVOXY_CONF}" - debug "... installation done\n" 0 + debug 0 "... installation done" fi - debug "... ${url} installed successfully.\n" 0 + debug 0 "... ${url} installed successfully." done } @@ -319,9 +322,9 @@ function lock() { echo "An instance of ${TMPNAME} is already running. Exit" exit 1 fi - debug "Found dead lock file." 0 + debug 0 "Found dead lock file." rm -f "${PID_FILE}" - debug "File removed." 0 + debug 0 "File removed." 
fi # safe PID in lock-file @@ -346,6 +349,7 @@ function remove() { VERBOSE=() method="main" +OS="$(uname)" # loop for options while getopts ":c:hrqv:V" opt; do @@ -384,7 +388,9 @@ prepare trap 'rm -fr "${TMPDIR}";exit' INT TERM EXIT lock -debug "URL-List: ${URLS}\nPrivoxy-Configdir: ${PRIVOXY_DIR}\nTemporary directory: ${TMPDIR}" 2 +debug 2 "URL-List: ${URLS[*]}" +debug 2 "Privoxy-Configdir: ${PRIVOXY_DIR}" +debug 2 "Temporary directory: ${TMPDIR}" "${method}" # restore default exit command diff --git a/tests/conftest.py b/tests/conftest.py index e6d037f..232f0f2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,14 +4,14 @@ import os from pathlib import Path from re import search -from typing import Dict, Generator, Optional, cast +from typing import Generator, Optional import pytest import requests -from pytest import CollectReport, StashKey +from pytest import StashKey from pytestshellutils.shell import Daemon -phase_report_key = StashKey[Dict[str, CollectReport]]() +phase_report_key = StashKey[int]() def debug_enabled() -> bool: @@ -35,12 +35,12 @@ def pytest_runtest_makereport(item: pytest.Item): report = yield if item.parent: + # store test results for each phase ("setup", "call", "teardown") of each test + # within module-scope + if phase_report_key not in item.parent.stash: + item.parent.stash.setdefault(phase_report_key, 0) if report.failed: - # store test results for each phase ("setup", "call", "teardown") of each test - # within module-scope - item.parent.stash.setdefault( - phase_report_key, cast(Dict[str, CollectReport], {}) - )[f"{report.nodeid}_{report.when}"] = report + item.parent.stash[phase_report_key] += 1 return report @@ -95,12 +95,14 @@ def start_privoxy(request: pytest.FixtureRequest) -> Generator[bool, None, None] run.start() yield run.is_running() run_result = run.terminate() + logs = run_result.stdout + run_result.stderr # request.node is an "module" because we use the "module" scope node = request.node - if (phase_report_key 
in node.stash) and len(node.stash[phase_report_key]) > 0: - print( - f"\n\nprivoxy-results\n stdout:\n{run_result.stdout}\n stderr:\n{run_result.stderr}" - ) + if ( + (phase_report_key in node.stash) and node.stash[phase_report_key] > 0 + ) or " Error: " in logs: + print(f"\n\nprivoxy-logs\n{logs}") + assert " Error: " not in logs @pytest.fixture(scope="module") diff --git a/tests/test_01_root_execute.py b/tests/test_01_root_execute.py index fa722fe..ab5805f 100644 --- a/tests/test_01_root_execute.py +++ b/tests/test_01_root_execute.py @@ -150,6 +150,10 @@ def test_missing_deps(shell, privoxy_blocklist) -> None: assert "Please install the package providing" in ret_script.stderr +def test_privoxy_runtime_log() -> None: + """NOOP function to support checking privoxy logs during tear-down.""" + + # Heloer functions From f5398bb686c08885608054ab203fedde00d9411f Mon Sep 17 00:00:00 2001 From: Andrwe Lord Weber Date: Sun, 14 Jan 2024 23:09:47 +0100 Subject: [PATCH 02/11] improve readability of sed-commands Signed-off-by: Andrwe Lord Weber --- privoxy-blocklist.sh | 99 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 86 insertions(+), 13 deletions(-) diff --git a/privoxy-blocklist.sh b/privoxy-blocklist.sh index 314ad28..992e139 100755 --- a/privoxy-blocklist.sh +++ b/privoxy-blocklist.sh @@ -195,14 +195,15 @@ function main() { for url in "${URLS[@]}"; do debug 0 "Processing ${url} ..." 
file="${TMPDIR}/$(basename "${url}")" - address_file="${TMPDIR}/$(basename "${url}").address" - address_except_file="${TMPDIR}/$(basename "${url}").address_except" - url_file="${TMPDIR}/$(basename "${url}").url" - url_except_file="${TMPDIR}/$(basename "${url}").url_except" - domain_name_file="${TMPDIR}/$(basename "${url}").domain" - domain_name_except_file="${TMPDIR}/$(basename "${url}").domain_except" - regex_file="${TMPDIR}/$(basename "${url}").regex" - regex_except_file="${TMPDIR}/$(basename "${url}").regex_except" + address_file="${file}.address" + address_except_file="${file}.address_except" + url_file="${file}.url" + url_except_file="${file}.url_except" + domain_name_file="${file}.domain" + domain_name_except_file="${file}.domain_except" + regex_file="${file}.regex" + regex_except_file="${file}.regex_except" + html_file="${file}.html" actionfile=${file%\.*}.script.action filterfile=${file%\.*}.script.filter list="$(basename "${file%\.*}")" @@ -232,21 +233,93 @@ function main() { ## regex block grep '^/^' "${file}" > "${regex_file}" grep '^@@/^' "${file}" > "${regex_except_file}" + ## html element block + grep '^.*##..*' "${file}" > "${html_file}" set -e # convert AdblockPlus list to Privoxy list # blacklist of urls debug 1 "Creating actionfile for ${list} ..." 
- echo -e "{ +block{${list}} }" > "${actionfile}" - sed '/\$.*/d;/#/d;s/\?/\\?/g;s/\*/.*/g;s/(/\\(/g;s/)/\\)/g;s/\[/\\[/g;s/\]/\\]/g;s/\^$//g;s/^||/\./g;s/^|/^/g;s/|$/\$/g;/|/d' "${domain_name_file}" >> "${actionfile}" + echo "{ +block{${list}} }" > "${actionfile}" + sed ' + # skip domains with additional filter definition + /\$.*/d + # skip domains with HTML filter + /#/d + # replace characters to match Privoxy domain syntax + s/\?/\\?/g;s/\*/.*/g;s/(/\\(/g;s/)/\\)/g;s/\[/\\[/g;s/\]/\\]/g + # replace marking seperator of Adblock + s/\^$//g + # replace domain matcher + s/^||/\./g + ' "${domain_name_file}" >> "${actionfile}" + sed ' + # skip domains with additional filter definition + /\$.*/d + # skip domains with HTML filter + /#/d + # replace characters to match Privoxy domain syntax + s/\?/\\?/g;s/\*/.*/g;s/(/\\(/g;s/)/\\)/g;s/\[/\\[/g;s/\]/\\]/g + # replace marking seperator of Adblock + s/\^$//g + # handle exact domain matching + s/^|\([^|][^|]*\)|/^\1\$/g;s/|$/\$/g + ' "${address_file}" >> "${actionfile}" debug 1 "... creating filterfile for ${list} ..." echo "FILTER: ${list} Tag filter of ${list}" > "${filterfile}" - # set filter for html elements - sed '/^#/!d;s/^##//g;s/^#\(.*\)\[.*\]\[.*\]*/s@<([a-zA-Z0-9]+)\\s+.*id=.?\1.*>.*<\/\\1>@@g/g;s/^#\(.*\)/s@<([a-zA-Z0-9]+)\\s+.*id=.?\1.*>.*<\/\\1>@@g/g;s/^\.\(.*\)/s@<([a-zA-Z0-9]+)\\s+.*class=.?\1.*>.*<\/\\1>@@g/g;s/^a\[\(.*\)\]/s@.*<\/a>@@g/g;s/^\([a-zA-Z0-9]*\)\.\(.*\)\[.*\]\[.*\]*/s@<\1.*class=.?\2.*>.*<\/\1>@@g/g;s/^\([a-zA-Z0-9]*\)#\(.*\):.*[\:[^:]]*[^:]*/s@<\1.*id=.?\2.*>.*<\/\1>@@g/g;s/^\([a-zA-Z0-9]*\)#\(.*\)/s@<\1.*id=.?\2.*>.*<\/\1>@@g/g;s/^\[\([a-zA-Z]*\).=\(.*\)\]/s@\1^=\2>@@g/g;s/\^/[\/\&:\?=_]/g;s/\.\([a-zA-Z0-9]\)/\\.\1/g' "${file}" >> "${filterfile}" + debug 1 "... processing 'class'-matches ..." 
+ sed ' + # only process gloabl classes + /^##\..*/!d + # cleanup + s/^##//g + # convert classes independent of HTML tag + s/^\.\(.*\)/s@<([a-zA-Z0-9]+)\\s+.*class=.?\1.*>.*<\/\\1>@@g/g + # convert classes with defined HTML tag + s/^\([a-zA-Z0-9]*\)\.\(.*\)\[.*\]\[.*\]*/s@<\1.*class=.?\2.*>.*<\/\1>@@g/g + ' "${html_file}" >> "${filterfile}" + # FIXME: add class handling with domains + + debug 1 "... processing 'id'-matches ..." + sed ' + # only process gloabl classes + /^###.*/!d + # cleanup + s/^##//g + # convert id independent of HTML tag + s/^#\(.*\)/s@<.*id=.?\1.*>.*<\/@@g/g + # convert id with defined HTML tag and extended selectors + s/^\([a-zA-Z0-9][a-zA-Z0-9]*\)#\(.*\):.*[\:[^:]]*[^:]*/s@<\1.*id=.?\2.*>.*<\/\1>@@g/g + # convert id with defined HTML tag + s/^\([a-zA-Z0-9][a-zA-Z0-9]*\)#\(.*\)/s@<\1.*id=.?\2.*>.*<\/\1>@@g/g + ' "${html_file}" >> "${filterfile}" + # FIXME: add id handling with domains + + debug 1 "... processing 'attribute'-matches ..." + sed ' + # only process gloabl classes + /^##\[.*/!d + # cleanup + s/^##//g + # convert attribute based filters with exact match with exact match + s/^\[\([^=^]*\)"*=\(.*\)\]/s@\1=\2>@@g/g + # convert attribute based filter with contain match + s/^\[\([^=^]*\)"*\*="*\([^"]*\)"*\]/s@\1=".*\2.*">@@g/g + # convert attribute based filter with startwith match + s/^\[\([^=]*\)"*^="*\([^"]*\)"*\]/s@\1="\2.*">@@g/g + # convert attribute based filter with endswith match + s/^\[\([^=^]*\)"*\$="*\([^"]*\)"*\]/s@\1=".*\2">@@g/g + # convert attribute name-only matches + s/^\[\(.*\)"*\]/s@<.*\1.*\/>@@g\ns@<\([^ ]*\) .*\1.*>.*<\/\\1.*>@@g/g + # convert dots + s/\.\([a-zA-Z0-9]\)/\\.\1/g + ' "${html_file}" >> "${filterfile}" + # FIXME: add attribute handling with domains + debug 1 "... filterfile created - adding filterfile to actionfile ..." echo "{ +filter{${list}} }" >> "${actionfile}" - echo "*" >> "${actionfile}" + echo ".*" >> "${actionfile}" debug 1 "... filterfile added ..." 
# create domain based whitelist From ea42f8ab7d7cf31c3f5ed08dac1c5fdca605aaec Mon Sep 17 00:00:00 2001 From: Andrwe Lord Weber Date: Mon, 15 Jan 2024 23:44:00 +0100 Subject: [PATCH 03/11] implement content tests & move to setup.cfg * tests: implement logic to check for content removed by privoxy * tests: centralize test configuration in config.py * tests: move config for python linting tools to centralized setup.cfg Signed-off-by: Andrwe Lord Weber --- .flake8 | 2 - tests/config.py | 69 ++++++++++++++++++ tests/conftest.py | 10 +++ tests/requirements.txt | 1 + tests/response.html | 9 +++ tests/setup.cfg | 5 ++ tests/test_00_minimal.py | 2 +- tests/test_01_root_execute.py | 129 ++++++++++++++-------------------- 8 files changed, 147 insertions(+), 80 deletions(-) delete mode 100644 .flake8 create mode 100644 tests/config.py create mode 100644 tests/response.html create mode 100644 tests/setup.cfg diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 61d9081..0000000 --- a/.flake8 +++ /dev/null @@ -1,2 +0,0 @@ -[flake8] -max-line-length = 99 diff --git a/tests/config.py b/tests/config.py new file mode 100644 index 0000000..83f1bad --- /dev/null +++ b/tests/config.py @@ -0,0 +1,69 @@ +"""Configuration of test suite to configure tests.""" + +from conftest import check_in, check_not_in + +content_removed = [ + "ad_970x250", # class match: https://www.iphoneitalia.com/ + "MyAdsId3", # id match + "AdRight2", # class match with element having multiple classes +] +content_exists = [ + "ajlkl", # should exist, although one element is removed by privoxy +] + +# FIXME: see https://github.com/Andrwe/privoxy-blocklist/issues/35 +urls_allowed = ["duckduckgo.com/", "hs-exp.jp/ads/"] +urls_allowed = ["duckduckgo.com/"] + +# FIXME: implement regex-filter for domains, e.g. 
+# /^https?:\/\/s3\.*.*\.amazonaws\.com\/[a-f0-9]{45,}\/[a-f,0-9]{8,10}$/$script, +# third-party,xmlhttprequest,domain=~amazon.com +urls_blocked = [ + "andrwe.org/ads/", + "andrwe.jp/ads/", + "pubfeed.linkby.com", + f"s3.{'a'*6}.amazonaws.com/{'0123abcd'*6}/{'ab,12'*2}/", +] +urls_blocked = ["andrwe.org/ads/", "andrwe.jp/ads/", "pubfeed.linkby.com"] + +config_checks = { + "url_extended_config.conf": [ + ( + check_in, + "Processing https://raw.githubusercontent.com/easylist/easylist/master/" + "easylist/easylist_allowlist_general_hide.txt", + ), + ( + check_in, + "Processing https://easylist-downloads.adblockplus.org/easylistgermany.txt", + ), + ( + check_in, + "The list recieved from https://raw.githubusercontent.com/easylist/easylist/master" + "/easylist/easylist_allowlist_general_hide.txt does not contain AdblockPlus list " + "header. Try to process anyway.", + ), + ( + check_not_in, + "created and added image handler", + ), + ], + "debugging.conf": [ + ( + check_in, + "Processing https://easylist-downloads.adblockplus.org/easylistgermany.txt", + ), + ( + check_not_in, + "does not contain AdblockPlus list header.", + ), + ( + check_in, + "‘/tmp/privoxy-blocklist.sh/easylist.txt’ saved", + ), + ( + check_in, + "created and added image handler", + ), + ], +} diff --git a/tests/conftest.py b/tests/conftest.py index 232f0f2..be9238d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -26,6 +26,16 @@ def debug_enabled() -> bool: ) +def check_in(needle: str, haystack: str) -> bool: + """Check given haystack for given string.""" + return needle in haystack + + +def check_not_in(needle: str, haystack: str) -> bool: + """Check that given string is not in given text.""" + return needle not in haystack + + # based on # https://docs.pytest.org/en/latest/example/simple.html#making-test-result-information-available-in-fixtures @pytest.hookimpl(wrapper=True, tryfirst=True) diff --git a/tests/requirements.txt b/tests/requirements.txt index 295a104..d9def25 100644 --- 
a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,4 +1,5 @@ pytest pytest-durations +pytest-httpserver pytest-shell-utilities requests diff --git a/tests/response.html b/tests/response.html new file mode 100644 index 0000000..196a7b7 --- /dev/null +++ b/tests/response.html @@ -0,0 +1,9 @@ + + +
just-some-test-string-always-present
+
single class should be removed
+
multiple classes that should be removed
+
multiple classes that should exist
+
id should be removed
+ + diff --git a/tests/setup.cfg b/tests/setup.cfg new file mode 100644 index 0000000..035f93b --- /dev/null +++ b/tests/setup.cfg @@ -0,0 +1,5 @@ +[pycodestyle] +max-line-length = 99 + +[flake8] +max-line-length = 99 diff --git a/tests/test_00_minimal.py b/tests/test_00_minimal.py index e86afff..0402a8a 100644 --- a/tests/test_00_minimal.py +++ b/tests/test_00_minimal.py @@ -11,7 +11,6 @@ def test_permissions() -> None: ".ci_config/bandit.yml", ".ci_config/prospector.yaml", ".editorconfig", - ".flake8", ".github/release.yml", ".github/workflows/pytest.yml", ".github/workflows/release.yml", @@ -22,6 +21,7 @@ def test_permissions() -> None: "tests/Dockerfile_alpine", "tests/Dockerfile_ubuntu", "tests/requirements.txt", + "tests/setup.cfg", "tests/test_00_minimal.py", "tests/test_01_root_execute.py", ] diff --git a/tests/test_01_root_execute.py b/tests/test_01_root_execute.py index ab5805f..b0dbffa 100644 --- a/tests/test_01_root_execute.py +++ b/tests/test_01_root_execute.py @@ -4,29 +4,32 @@ from pathlib import Path from shutil import copyfile, copymode, which +import config import requests +from conftest import check_in, check_not_in +from urllib3.util import parse_url def test_config_generator(shell, privoxy_blocklist) -> None: """Test config generator with default path.""" - config = Path("/etc/privoxy-blocklist.conf") - if config.exists(): - config.unlink() + config_file = Path("/etc/privoxy-blocklist.conf") + if config_file.exists(): + config_file.unlink() ret = shell.run(privoxy_blocklist) assert ret.returncode == 2 assert "Creating default one and exiting" in ret.stdout - assert config.exists() + assert config_file.exists() def test_custom_config_generator(shell, tmp_path, privoxy_blocklist) -> None: """Test config generator with custom path.""" - config = Path(f"{tmp_path}/privoxy-blocklist") - if config.exists(): - config.unlink() - ret = shell.run(privoxy_blocklist, "-c", str(config)) + config_file = Path(f"{tmp_path}/privoxy-blocklist") + if 
config_file.exists(): + config_file.unlink() + ret = shell.run(privoxy_blocklist, "-c", str(config_file)) assert ret.returncode == 2 assert "Creating default one and exiting" in ret.stdout - assert config.exists() + assert config_file.exists() def test_version_option(shell, tmp_path, privoxy_blocklist) -> None: @@ -58,77 +61,59 @@ def test_next_run(shell, privoxy_blocklist) -> None: def test_request_success(start_privoxy, supported_schemes) -> None: """Test URLs not blocked by privoxy.""" - # FIXME: see https://github.com/Andrwe/privoxy-blocklist/issues/35 - urls = ["duckduckgo.com/", "hs-exp.jp/ads/"] - urls = ["duckduckgo.com/"] - run_requests(start_privoxy, supported_schemes, urls, [200, 301, 302]) + run_requests(start_privoxy, supported_schemes, config.urls_allowed, [200, 301, 302]) def test_request_block_url(start_privoxy, supported_schemes) -> None: """Test URLs blocked by privoxy due to easylist.""" - urls = [ - "andrwe.org/ads/", - "andrwe.jp/ads/", - "pubfeed.linkby.com", - f"s3.{'a'*6}.amazonaws.com/{'0123abcd'*6}/{'ab,12'*2}/", - ] - urls = ["andrwe.org/ads/", "andrwe.jp/ads/", "pubfeed.linkby.com"] - run_requests(start_privoxy, supported_schemes, urls, [403]) + run_requests(start_privoxy, supported_schemes, config.urls_blocked, [403]) +def test_removed_content(start_privoxy, httpserver) -> None: + """Test filters for removing content.""" + with Path(__file__).parent.joinpath("response.html").open( + "r", encoding="UTF-8" + ) as f_h: + response_html = f_h.read() + httpserver.expect_request("/").respond_with_data( + response_data=response_html, content_type="text/html" + ) + parsed_url = parse_url(httpserver.url_for("/")) + parsed_port = f":{parsed_url.port}" if parsed_url.port else "" + scheme_less_url = f"{parsed_url.host}{parsed_port}{parsed_url.request_uri}" + response = run_request( + start_privoxy, + scheme=parsed_url.scheme or "http", + url=scheme_less_url, + expected_code=[200], + ) + # expected response + assert 
check_in("just-some-test-string-always-present", response.text) + for needle in config.content_removed: + # check presence of needle without privoxy + assert check_in(needle, requests.get(httpserver.url_for("/"), timeout=10).text) + # check presence of needle with privoxy + assert check_not_in(needle, response.text) + for needle in config.content_exists: + # check presence of needle without privoxy + assert check_in(needle, requests.get(httpserver.url_for("/"), timeout=10).text) + # check presence of needle with privoxy + assert check_in(needle, response.text) + + +# must be second last test as it will generate unpredictable privoxy configurations def test_predefined_custom_config_generator(shell, privoxy_blocklist) -> None: """Run tests for all pre-defined configs.""" - checks = { - "url_extended_config.conf": [ - ( - check_in, - "Processing https://raw.githubusercontent.com/easylist/easylist/master/" - "easylist/easylist_allowlist_general_hide.txt", - ), - ( - check_in, - "Processing https://easylist-downloads.adblockplus.org/easylistgermany.txt", - ), - ( - check_in, - "The list recieved from https://raw.githubusercontent.com/easylist/easylist/master" - "/easylist/easylist_allowlist_general_hide.txt does not contain AdblockPlus list " - "header. 
Try to process anyway.", - ), - ( - check_not_in, - "created and added image handler", - ), - ], - "debugging.conf": [ - ( - check_in, - "Processing https://easylist-downloads.adblockplus.org/easylistgermany.txt", - ), - ( - check_not_in, - "does not contain AdblockPlus list header.", - ), - ( - check_in, - "‘/tmp/privoxy-blocklist.sh/easylist.txt’ saved", - ), - ( - check_in, - "created and added image handler", - ), - ], - } test_config_dir = Path(__file__).parent / "configs" - for config in test_config_dir.iterdir(): - if not config.is_file(): + for config_file in test_config_dir.iterdir(): + if not config_file.is_file(): continue - ret = shell.run(privoxy_blocklist, "-c", str(config)) + ret = shell.run(privoxy_blocklist, "-c", str(config_file)) assert ret.returncode == 0 assert check_not_in("Creating default one and exiting", ret.stdout) - for check in checks.get(config.name, []): + for check in config.config_checks.get(config_file.name, []): assert check[0](check[1], ret.stdout) - assert config.exists() + assert config_file.exists() # must be last test as it will uninstall dependencies and check error handling @@ -157,16 +142,6 @@ def test_privoxy_runtime_log() -> None: # Heloer functions -def check_in(needle: str, haystack: str) -> bool: - """Check given haystack for given string.""" - return needle in haystack - - -def check_not_in(needle: str, haystack: str) -> bool: - """Check that given string is not in given text.""" - return needle not in haystack - - def run_requests( start_privoxy, supported_schemes, urls: list[str], expected_code: list[int] ) -> None: From db8e04e85f30463cc0417f18ac40046e558a308b Mon Sep 17 00:00:00 2001 From: Andrwe Lord Weber Date: Mon, 15 Jan 2024 23:47:38 +0100 Subject: [PATCH 04/11] add vim & curl for debugging in Docker container Signed-off-by: Andrwe Lord Weber --- tests/Dockerfile_ubuntu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/Dockerfile_ubuntu b/tests/Dockerfile_ubuntu index d5366d5..08b58c0 100644 --- 
a/tests/Dockerfile_ubuntu +++ b/tests/Dockerfile_ubuntu @@ -5,10 +5,12 @@ COPY helper/install_deps.sh /install_deps.sh ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update \ && apt-get install --no-install-recommends -q --yes \ + curl \ build-essential \ python3-pip \ python3-dev \ sudo \ + vim \ && pip install --no-cache-dir -qr /requirements.txt \ && rm -f /requirements.txt \ && install -d -o root -g root /pytest_cache \ From c6c994afe4d5c04cb86c90a2ddd663c922a897a9 Mon Sep 17 00:00:00 2001 From: Andrwe Lord Weber Date: Mon, 15 Jan 2024 23:59:42 +0100 Subject: [PATCH 05/11] optimize id & class matching rules Signed-off-by: Andrwe Lord Weber --- .pre-commit-config.yaml | 6 ++ privoxy-blocklist.sh | 128 +++++++++++++++++++++++++++++----------- 2 files changed, 99 insertions(+), 35 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f6db1e5..7f039ef 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,6 +22,12 @@ repos: rev: v3.7.0.1 hooks: - id: shfmt + args: + - "--binary-next-line" + - "--case-indent" + - "--indent" + - "4" + - "--space-redirects" - repo: https://github.com/AleksaC/hadolint-py rev: v2.12.0.3 hooks: diff --git a/privoxy-blocklist.sh b/privoxy-blocklist.sh index 992e139..7c8be59 100755 --- a/privoxy-blocklist.sh +++ b/privoxy-blocklist.sh @@ -267,39 +267,89 @@ function main() { ' "${address_file}" >> "${actionfile}" debug 1 "... creating filterfile for ${list} ..." - echo "FILTER: ${list} Tag filter of ${list}" > "${filterfile}" + echo "FILTER: ${list}_class_global Tag filter of ${list}" > "${filterfile}" debug 1 "... processing 'class'-matches ..." 
- sed ' - # only process gloabl classes - /^##\..*/!d - # cleanup - s/^##//g - # convert classes independent of HTML tag - s/^\.\(.*\)/s@<([a-zA-Z0-9]+)\\s+.*class=.?\1.*>.*<\/\\1>@@g/g - # convert classes with defined HTML tag - s/^\([a-zA-Z0-9]*\)\.\(.*\)\[.*\]\[.*\]*/s@<\1.*class=.?\2.*>.*<\/\1>@@g/g - ' "${html_file}" >> "${filterfile}" + ( + lines=() + # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex + sed -e ' + # only process gloabl class matches + /^##\..*/!d + # remove all combinations with attribute matching + /^##\..*\[.*/d + # remove all matches with combinators + /^##\..*[>+~ ].*/d + # cleanup + s/^##\.//g + # prepare regex merging + s/$/|/ + ' "${html_file}" | while read -r line; do + # number of matches within one rule impacts runtime of each request to modify the content + if [ "${#lines[@]}" -lt 1000 ]; then + lines+=("$line") + continue + fi + # complexity of regex impacts runtime of each request to modify the content + # using removal of whole HTML tag as multiple matches with different classes in same element are not possible + # printf to inject both quoting characters " and ' + printf 's@<([a-zA-Z0-9]+)\\s+.*class=[%s][^%s]*(' "\"'" "\"'" + # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + # printf to inject both quoting characters " and ' + printf ')[^%s]*[%s].*>.*<\/\\1[^>]*>@@g\n' "\"'" "\"'" + lines=() + done + ) >> "${filterfile}" # FIXME: add class handling with domains + # FIXME: add class handling with combinators + # FIXME: add class with defined HTML tag ? + # FIXME: add class with cascading + echo "FILTER: ${list}_id_global Tag filter of ${list}" >> "${filterfile}" debug 1 "... processing 'id'-matches ..." 
- sed ' - # only process gloabl classes - /^###.*/!d - # cleanup - s/^##//g - # convert id independent of HTML tag - s/^#\(.*\)/s@<.*id=.?\1.*>.*<\/@@g/g - # convert id with defined HTML tag and extended selectors - s/^\([a-zA-Z0-9][a-zA-Z0-9]*\)#\(.*\):.*[\:[^:]]*[^:]*/s@<\1.*id=.?\2.*>.*<\/\1>@@g/g - # convert id with defined HTML tag - s/^\([a-zA-Z0-9][a-zA-Z0-9]*\)#\(.*\)/s@<\1.*id=.?\2.*>.*<\/\1>@@g/g - ' "${html_file}" >> "${filterfile}" + ( + lines=() + # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex + sed -e ' + # only process gloabl classes + /^###.*/!d + # remove all matches with combinators + /^###.*[>+~].*/d + # cleanup + s/^###//g + # prepare regex merging + s/$/|/ + ' "${html_file}" | while read -r line; do + # number of matches within one rule impacts runtime of each request to modify the content + if [ "${#lines[@]}" -lt 1000 ]; then + lines+=("$line") + continue + fi + # complexity of regex impacts runtime of each request to modify the content + # using removal of whole HTML tag as multiple matches with different classes in same element are not possible + # printf to inject both quoting characters " and ' + printf 's@<([a-zA-Z0-9]+)\\s+.*id=[%s](' "\"'" + # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + # printf to inject both quoting characters " and ' + printf ')[%s].*>.*<\/\\1[^>]*>@@g\n' "\"'" + lines=() + done + ) >> "${filterfile}" # FIXME: add id handling with domains + # FIXME: add id handling with combinators + # FIXME: add id with defined HTML tag: + # s/^\([a-zA-Z0-9][a-zA-Z0-9]*\)#\(.*\):.*[\:[^:]]*[^:]*/s@<\1.*id=.?\2.*>.*<\/\1>@@g/g + # s/^\([a-zA-Z0-9][a-zA-Z0-9]*\)#\(.*\)/s@<\1.*id=.?\2.*>.*<\/\1>@@g/g + # FIXME: add id with cascading + echo "FILTER: ${list}_attribute Tag filter of ${list}" >> "${filterfile}" debug 1 "... processing 'attribute'-matches ..." 
sed ' # only process gloabl classes /^##\[.*/!d + # remove all matches with combinators + /^##\[.*[>+~].*/d # cleanup s/^##//g # convert attribute based filters with exact match with exact match @@ -316,10 +366,18 @@ function main() { s/\.\([a-zA-Z0-9]\)/\\.\1/g ' "${html_file}" >> "${filterfile}" # FIXME: add attribute handling with domains + # FIXME: add attribute handling with combinators + # FIXME: add combination of classes and attributes: ##.OUTBRAIN[data-widget-id^="FMS_REELD_"] debug 1 "... filterfile created - adding filterfile to actionfile ..." - echo "{ +filter{${list}} }" >> "${actionfile}" - echo ".*" >> "${actionfile}" + ( + echo "{ +filter{${list}_class_global} }" + echo "/" + echo "{ +filter{${list}_id_global} }" + echo "/" + echo "{ +filter{${list}_attribute} }" + echo "*" + ) >> "${actionfile}" debug 1 "... filterfile added ..." # create domain based whitelist @@ -406,18 +464,18 @@ function lock() { # shellcheck disable=SC2317 function remove() { - read -rp "Do you really want to remove all build lists?(y/N) " choice - if [ "${choice}" != "y" ]; then - exit 0 + read -rp "Do you really want to remove all build lists?(y/N) " choice + if [ "${choice}" != "y" ]; then + exit 0 fi - if rm -rf "${PRIVOXY_DIR}/"*.script.{action,filter} \ - && sed '/^actionsfile .*\.script\.action$/d;/^filterfile .*\.script\.filter$/d' -i "${PRIVOXY_CONF}"; then - echo "Lists removed." - exit 0 + if rm -rf "${PRIVOXY_DIR}/"*.script.{action,filter} \ + && sed '/^actionsfile .*\.script\.action$/d;/^filterfile .*\.script\.filter$/d' -i "${PRIVOXY_CONF}"; then + echo "Lists removed." + exit 0 fi - error "An error occured while removing the lists." - error "Please have a look into ${PRIVOXY_DIR} whether there are .script.* files and search for *.script.* in ${PRIVOXY_CONF}." - exit 1 + error "An error occured while removing the lists." + error "Please have a look into ${PRIVOXY_DIR} whether there are .script.* files and search for *.script.* in ${PRIVOXY_CONF}." 
+ exit 1 } VERBOSE=() From 519998cb07f4a61dc4dc48c7df258a825c2798fd Mon Sep 17 00:00:00 2001 From: Andrwe Lord Weber Date: Wed, 17 Jan 2024 00:32:14 +0100 Subject: [PATCH 06/11] move testwebserver to conftest.py Signed-off-by: Andrwe Lord Weber --- .ci_config/prospector.yaml | 7 +++++++ tests/conftest.py | 33 +++++++++++++++++++++++++++++++++ tests/test_01_root_execute.py | 33 +++++++++++++++++---------------- 3 files changed, 57 insertions(+), 16 deletions(-) diff --git a/.ci_config/prospector.yaml b/.ci_config/prospector.yaml index ad9e5ef..8d31dcf 100644 --- a/.ci_config/prospector.yaml +++ b/.ci_config/prospector.yaml @@ -13,3 +13,10 @@ bandit: mypy: run: true + +pydocstyle: + disable: + # conflicts with D211 + - D203 + # conflicts with D211 + - D212 diff --git a/tests/conftest.py b/tests/conftest.py index be9238d..2325574 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,10 +10,30 @@ import requests from pytest import StashKey from pytestshellutils.shell import Daemon +from urllib3.util import Url, parse_url phase_report_key = StashKey[int]() +class UrlParsed: + """Class to parse and store URL.""" + + origin_url: str + parsed_url: Url + scheme: str + scheme_less_url: str + + def __init__(self, url: str): + """Initialize object by parsing given URL.""" + self.origin_url = url + self.parsed_url = parse_url(self.origin_url) + self.scheme = self.parsed_url.scheme or "http" + parsed_port = f":{self.parsed_url.port}" if self.parsed_url.port else "" + self.scheme_less_url = ( + f"{self.parsed_url.host}{parsed_port}{self.parsed_url.request_uri}" + ) + + def debug_enabled() -> bool: """Check if debugging is enabled.""" # RUNNER_DEBUG = set when "debug logging" activated @@ -55,6 +75,19 @@ def pytest_runtest_makereport(item: pytest.Item): return report +@pytest.fixture +def webserver(httpserver) -> UrlParsed: + """Start HTTP server and return parsed URL object.""" + with Path(__file__).parent.joinpath("response.html").open( + "r", encoding="UTF-8" + ) as 
f_h: + response_html = f_h.read() + httpserver.expect_request("/").respond_with_data( + response_data=response_html, content_type="text/html" + ) + return UrlParsed(httpserver.url_for("/")) + + @pytest.fixture(scope="module") def privoxy_blocklist() -> str: """Return the path to privoxy-blocklist.sh.""" diff --git a/tests/test_01_root_execute.py b/tests/test_01_root_execute.py index b0dbffa..16b1b51 100644 --- a/tests/test_01_root_execute.py +++ b/tests/test_01_root_execute.py @@ -7,7 +7,6 @@ import config import requests from conftest import check_in, check_not_in -from urllib3.util import parse_url def test_config_generator(shell, privoxy_blocklist) -> None: @@ -69,34 +68,36 @@ def test_request_block_url(start_privoxy, supported_schemes) -> None: run_requests(start_privoxy, supported_schemes, config.urls_blocked, [403]) -def test_removed_content(start_privoxy, httpserver) -> None: +def test_content_removed(start_privoxy, webserver) -> None: """Test filters for removing content.""" - with Path(__file__).parent.joinpath("response.html").open( - "r", encoding="UTF-8" - ) as f_h: - response_html = f_h.read() - httpserver.expect_request("/").respond_with_data( - response_data=response_html, content_type="text/html" - ) - parsed_url = parse_url(httpserver.url_for("/")) - parsed_port = f":{parsed_url.port}" if parsed_url.port else "" - scheme_less_url = f"{parsed_url.host}{parsed_port}{parsed_url.request_uri}" response = run_request( start_privoxy, - scheme=parsed_url.scheme or "http", - url=scheme_less_url, + scheme=webserver.scheme, + url=webserver.scheme_less_url, expected_code=[200], ) # expected response assert check_in("just-some-test-string-always-present", response.text) for needle in config.content_removed: # check presence of needle without privoxy - assert check_in(needle, requests.get(httpserver.url_for("/"), timeout=10).text) + assert check_in(needle, requests.get(webserver.origin_url, timeout=10).text) # check presence of needle with privoxy assert 
check_not_in(needle, response.text) + + +def test_content_exists(start_privoxy, webserver) -> None: + """Test filters for removing content.""" + response = run_request( + start_privoxy, + scheme=webserver.scheme, + url=webserver.scheme_less_url, + expected_code=[200], + ) + # expected response + assert check_in("just-some-test-string-always-present", response.text) for needle in config.content_exists: # check presence of needle without privoxy - assert check_in(needle, requests.get(httpserver.url_for("/"), timeout=10).text) + assert check_in(needle, requests.get(webserver.origin_url, timeout=10).text) # check presence of needle with privoxy assert check_in(needle, response.text) From bd759be6cd06579181dcf7872c9936a6f5b740fc Mon Sep 17 00:00:00 2001 From: Andrwe Lord Weber Date: Wed, 17 Jan 2024 02:16:35 +0100 Subject: [PATCH 07/11] add attribute-name-only handler & fix left-over handling Signed-off-by: Andrwe Lord Weber --- privoxy-blocklist.sh | 155 ++++++++++++++++++++++++++++++++----------- tests/config.py | 1 + tests/response.html | 1 + 3 files changed, 117 insertions(+), 40 deletions(-) diff --git a/privoxy-blocklist.sh b/privoxy-blocklist.sh index 7c8be59..b42579a 100755 --- a/privoxy-blocklist.sh +++ b/privoxy-blocklist.sh @@ -204,6 +204,7 @@ function main() { regex_file="${file}.regex" regex_except_file="${file}.regex_except" html_file="${file}.html" + html_except_file="${file}.html_except" actionfile=${file%\.*}.script.action filterfile=${file%\.*}.script.filter list="$(basename "${file%\.*}")" @@ -234,7 +235,8 @@ function main() { grep '^/^' "${file}" > "${regex_file}" grep '^@@/^' "${file}" > "${regex_except_file}" ## html element block - grep '^.*##..*' "${file}" > "${html_file}" + grep -E '^.*##.+' "${file}" > "${html_file}" + grep -E '^.*#@#.+' "${file}" > "${html_except_file}" set -e # convert AdblockPlus list to Privoxy list @@ -267,9 +269,11 @@ function main() { ' "${address_file}" >> "${actionfile}" debug 1 "... 
creating filterfile for ${list} ..." + debug 1 "... processing global 'class'-matches ..." echo "FILTER: ${list}_class_global Tag filter of ${list}" > "${filterfile}" - debug 1 "... processing 'class'-matches ..." ( + # allow handling of left-over lines from last while-loop-run + shopt -s lastpipe lines=() # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex sed -e ' @@ -299,22 +303,38 @@ function main() { printf ')[^%s]*[%s].*>.*<\/\\1[^>]*>@@g\n' "\"'" "\"'" lines=() done + # process last chunk with less than 1000 entries + if [ "${#lines[@]}" -gt 0 ]; then + printf 's@<([a-zA-Z0-9]+)\\s+.*class=[%s][^%s]*(' "\"'" "\"'" + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + printf ')[^%s]*[%s].*>.*<\/\\1[^>]*>@@g\n' "\"'" "\"'" + fi + shopt -u lastpipe ) >> "${filterfile}" + + debug 1 "... registering ${list}_class_global in actionfile ..." + ( + echo "{ +filter{${list}_class_global} }" + echo "/" + ) >> "${actionfile}" + debug 1 "... registered ..." # FIXME: add class handling with domains # FIXME: add class handling with combinators # FIXME: add class with defined HTML tag ? # FIXME: add class with cascading + debug 1 "... processing global 'id'-matches ..." echo "FILTER: ${list}_id_global Tag filter of ${list}" >> "${filterfile}" - debug 1 "... processing 'id'-matches ..." 
( + # allow handling of left-over lines from last while-loop-run + shopt -s lastpipe lines=() # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex sed -e ' - # only process gloabl classes + # only process gloabl id-only matches /^###.*/!d # remove all matches with combinators - /^###.*[>+~].*/d + /^###.*[>+~ ].*/d # cleanup s/^###//g # prepare regex merging @@ -335,50 +355,105 @@ function main() { printf ')[%s].*>.*<\/\\1[^>]*>@@g\n' "\"'" lines=() done + # process last chunk with less than 1000 entries + if [ "${#lines[@]}" -gt 0 ]; then + printf 's@<([a-zA-Z0-9]+)\\s+.*id=[%s](' "\"'" + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + printf ')[%s].*>.*<\/\\1[^>]*>@@g\n' "\"'" + fi + shopt -u lastpipe ) >> "${filterfile}" + + debug 1 "... registering ${list}_id_global in actionfile ..." + ( + echo "{ +filter{${list}_id_global} }" + echo "/" + ) >> "${actionfile}" + debug 1 "... registered ..." # FIXME: add id handling with domains # FIXME: add id handling with combinators - # FIXME: add id with defined HTML tag: - # s/^\([a-zA-Z0-9][a-zA-Z0-9]*\)#\(.*\):.*[\:[^:]]*[^:]*/s@<\1.*id=.?\2.*>.*<\/\1>@@g/g - # s/^\([a-zA-Z0-9][a-zA-Z0-9]*\)#\(.*\)/s@<\1.*id=.?\2.*>.*<\/\1>@@g/g # FIXME: add id with cascading - echo "FILTER: ${list}_attribute Tag filter of ${list}" >> "${filterfile}" - debug 1 "... processing 'attribute'-matches ..." 
- sed ' - # only process gloabl classes - /^##\[.*/!d - # remove all matches with combinators - /^##\[.*[>+~].*/d - # cleanup - s/^##//g - # convert attribute based filters with exact match with exact match - s/^\[\([^=^]*\)"*=\(.*\)\]/s@\1=\2>@@g/g - # convert attribute based filter with contain match - s/^\[\([^=^]*\)"*\*="*\([^"]*\)"*\]/s@\1=".*\2.*">@@g/g - # convert attribute based filter with startwith match - s/^\[\([^=]*\)"*^="*\([^"]*\)"*\]/s@\1="\2.*">@@g/g - # convert attribute based filter with endswith match - s/^\[\([^=^]*\)"*\$="*\([^"]*\)"*\]/s@\1=".*\2">@@g/g - # convert attribute name-only matches - s/^\[\(.*\)"*\]/s@<.*\1.*\/>@@g\ns@<\([^ ]*\) .*\1.*>.*<\/\\1.*>@@g/g - # convert dots - s/\.\([a-zA-Z0-9]\)/\\.\1/g - ' "${html_file}" >> "${filterfile}" - # FIXME: add attribute handling with domains - # FIXME: add attribute handling with combinators - # FIXME: add combination of classes and attributes: ##.OUTBRAIN[data-widget-id^="FMS_REELD_"] + debug 1 "... processing 'attribute'-matches with name only and no HTML tag ..." 
+ echo "FILTER: ${list}_attribute_global_name_only Tag filter of ${list}" >> "${filterfile}" + ( + # allow handling of left-over lines from last while-loop-run + shopt -s lastpipe + lines=() + # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex + sed -e ' + # only process gloabl classes + /^##\[[^=][^=]*$/!d + # remove all matches with combinators + /^##.*[>+~ ].*/d + # cleanup + s/^##//g + # convert attribute name-only matches + s/^\[\([^=][^=]*\)\]/\1/g + # convert dots + s/\.\([^\.]\)/\\.\1/g + s/$/|/ + ' "${html_file}" | sort -u | while read -r line; do + # number of matches within one rule impacts runtime of each request to modify the content + if [ "${#lines[@]}" -lt 1000 ]; then + lines+=("$line") + continue + fi + # complexity of regex impacts runtime of each request to modify the content + # using removal of whole HTML tag as multiple matches with different classes in same element are not possible + # printf to inject both quoting characters " and ' + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + # printf to inject both quoting characters " and ' + printf ').*>.*<\/\\1[^>]*>@@g\n' + lines=() + done + # process last chunk with less than 1000 entries + if [ "${#lines[@]}" -gt 0 ]; then + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + printf ').*>.*<\/\\1[^>]*>@@g\n' + fi + shopt -u lastpipe + ) >> "${filterfile}" - debug 1 "... filterfile created - adding filterfile to actionfile ..." + debug 1 "... registering ${list}_attribute_global_name_only in actionfile ..." ( - echo "{ +filter{${list}_class_global} }" - echo "/" - echo "{ +filter{${list}_id_global} }" + echo "{ +filter{${list}_attribute_global_name_only} }" echo "/" - echo "{ +filter{${list}_attribute} }" - echo "*" ) >> "${actionfile}" - debug 1 "... 
filterfile added ..." + debug 1 "... registered ..." + + #debug 1 "... processing 'attribute'-matches ..." + #sed ' + ## only process gloabl classes + #/^##\[.*/!d + ## remove all matches with combinators + #/^##\[.*[>+~].*/d + ## cleanup + #s/^##//g + ## convert attribute based filters with exact match with + #s/^\[\([^=^]*\)"*=\(.*\)\]/s@\1=\2>@@g/g + ## convert attribute based filter with contain match + #s/^\[\([^=^]*\)"*\*="*\([^"]*\)"*\]/s@\1=".*\2.*">@@g/g + ## convert attribute based filter with startwith match + #s/^\[\([^=]*\)"*^="*\([^"]*\)"*\]/s@\1="\2.*">@@g/g + ## convert attribute based filter with endswith match + #s/^\[\([^=^]*\)"*\$="*\([^"]*\)"*\]/s@\1=".*\2">@@g/g + ## convert dots + #s/\.\([a-zA-Z0-9]\)/\\.\1/g + #' "${html_file}" >> "${filterfile}" + + #debug 1 "... registering ${list}_attribute in actionfile ..." + #( + # echo "{ +filter{${list}_attribute} }" + # echo "*" + #) >> "${actionfile}" + #debug 1 "... registered ..." + # FIXME: add attribute handling with domains + # FIXME: add attribute handling with combinators + # FIXME: add combination of classes and attributes: ##.OUTBRAIN[data-widget-id^="FMS_REELD_"] # create domain based whitelist diff --git a/tests/config.py b/tests/config.py index 83f1bad..5f3bb12 100644 --- a/tests/config.py +++ b/tests/config.py @@ -6,6 +6,7 @@ "ad_970x250", # class match: https://www.iphoneitalia.com/ "MyAdsId3", # id match "AdRight2", # class match with element having multiple classes + "data-ad-manager-id", # attribute match ] content_exists = [ "ajlkl", # should exist, although one element is removed by privoxy diff --git a/tests/response.html b/tests/response.html index 196a7b7..28ceb21 100644 --- a/tests/response.html +++ b/tests/response.html @@ -5,5 +5,6 @@
multiple classes that should be removed
multiple classes that should exist
id should be removed
+
name-only attibute should be removed
From 25f7dfb987b5d0e4a067d6c0f589b048460b0a57 Mon Sep 17 00:00:00 2001 From: Andrwe Lord Weber Date: Sat, 27 Jan 2024 15:13:46 +0100 Subject: [PATCH 08/11] add attribute matches for exact, startswith & endswith Signed-off-by: Andrwe Lord Weber --- privoxy-blocklist.sh | 194 ++++++++++++++++++++++++++++++++++++------- tests/config.py | 5 ++ tests/response.html | 6 ++ 3 files changed, 175 insertions(+), 30 deletions(-) diff --git a/privoxy-blocklist.sh b/privoxy-blocklist.sh index b42579a..872be2d 100755 --- a/privoxy-blocklist.sh +++ b/privoxy-blocklist.sh @@ -374,11 +374,11 @@ function main() { # FIXME: add id handling with combinators # FIXME: add id with cascading - debug 1 "... processing 'attribute'-matches with name only and no HTML tag ..." - echo "FILTER: ${list}_attribute_global_name_only Tag filter of ${list}" >> "${filterfile}" + debug 1 "... processing 'attribute'-matches with no HTML tag ..." ( - # allow handling of left-over lines from last while-loop-run shopt -s lastpipe + # allow handling of left-over lines from last while-loop-run + echo "FILTER: ${list}_attribute_global_name_only Tag filter of ${list}" lines=() # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex sed -e ' @@ -415,42 +415,176 @@ function main() { printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' printf ').*>.*<\/\\1[^>]*>@@g\n' fi + + echo "FILTER: ${list}_attribute_exact Tag filter of ${list}" + lines=() + # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex + sed -e ' + # only process gloabl classes + /^##\[[^=^*][^=^*]*=.*$/!d + # remove all matches with combinators + /^##.*[>+~ ].*/d + # cleanup + s/^##//g + # convert attribute name-only matches + s/^\[\([^=][^=]*\)=\(.*\)\]/\1=\2/g + # convert dots + s/\.\([^\.]\)/\\.\1/g + s/$/|/ + ' "${html_file}" | sort -u | while read -r line; do + # number of matches within one rule impacts runtime of each request to modify the content + if [ 
"${#lines[@]}" -lt 1000 ]; then + lines+=("$line") + continue + fi + # complexity of regex impacts runtime of each request to modify the content + # using removal of whole HTML tag as multiple matches with different classes in same element are not possible + # printf to inject both quoting characters " and ' + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + # printf to inject both quoting characters " and ' + printf ').*>.*<\/\\1[^>]*>@@g\n' + lines=() + done + # process last chunk with less than 1000 entries + if [ "${#lines[@]}" -gt 0 ]; then + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + printf ').*>.*<\/\\1[^>]*>@@g\n' + fi + + echo "FILTER: ${list}_attribute_contain Tag filter of ${list}" + lines=() + # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex + sed -e ' + # only process gloabl classes + /^##\[[^*][^*]*\*=.*$/!d + # remove all matches with combinators + /^##.*[>+~ ].*/d + # cleanup + s/^##//g + # convert dots + s/\.\([^\.]\)/\\.\1/g + # convert attribute based filter with contain match + s/^\[\([^*][^*]*\)\*=\(["'"'"']*\)\([^"][^"]*\)"*\(["'"'"']*\)\]/\1=\2.*\3.*\4/g + s/$/|/ + ' "${html_file}" | sort -u | while read -r line; do + # number of matches within one rule impacts runtime of each request to modify the content + if [ "${#lines[@]}" -lt 1000 ]; then + lines+=("$line") + continue + fi + # complexity of regex impacts runtime of each request to modify the content + # using removal of whole HTML tag as multiple matches with different classes in same element are not possible + # printf to inject both quoting characters " and ' + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing + printf '%s\n' "${lines[@]}" | sed 
'$ s/|//' | tr -d '\n' + # printf to inject both quoting characters " and ' + printf ').*>.*<\/\\1[^>]*>@@g\n' + lines=() + done + # process last chunk with less than 1000 entries + if [ "${#lines[@]}" -gt 0 ]; then + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + printf ').*>.*<\/\\1[^>]*>@@g\n' + fi + + echo "FILTER: ${list}_attribute_startswith Tag filter of ${list}" + lines=() + # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex + sed -e ' + # only process gloabl classes + /^##\[[^=^][^=^]*\^=.*$/!d + # remove all matches with combinators + /^##.*[>+~ ].*/d + # cleanup + s/^##//g + # convert dots + s/\.\([^\.]\)/\\.\1/g + # convert attribute based filter with startwith match + s/^\[\([^^][^^]*\)^=\(["'"'"']*\)\(.*[^"'"'"']\)\(["'"'"']*\)\]/\1=\2\3.*\4/g + s/$/|/ + ' "${html_file}" | sort -u | while read -r line; do + # number of matches within one rule impacts runtime of each request to modify the content + if [ "${#lines[@]}" -lt 1000 ]; then + lines+=("$line") + continue + fi + # complexity of regex impacts runtime of each request to modify the content + # using removal of whole HTML tag as multiple matches with different classes in same element are not possible + # printf to inject both quoting characters " and ' + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + # printf to inject both quoting characters " and ' + printf ').*>.*<\/\\1[^>]*>@@g\n' + lines=() + done + # process last chunk with less than 1000 entries + if [ "${#lines[@]}" -gt 0 ]; then + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + printf ').*>.*<\/\\1[^>]*>@@g\n' + fi + + echo "FILTER: ${list}_attribute_endswith Tag filter of ${list}" + lines=() + # using while-loop as privoxy cannot handle more than 
2000 or-connected strings within one regex + sed -e ' + # only process gloabl classes + /^##\[[^$][^=$]*\$=.*$/!d + # remove all matches with combinators + /^##.*[>+~ ].*/d + # cleanup + s/^##//g + # convert dots + s/\.\([^\.]\)/\\.\1/g + # convert attribute based filter with endswith match + s/^\[\([^\$][^\$]*\)\$=\(["'"'"']*\)\(.*[^"'"'"']\)\(["'"'"']*\)\]/\1=\2.*\3\4/g + s/$/|/ + ' "${html_file}" | sort -u | while read -r line; do + # number of matches within one rule impacts runtime of each request to modify the content + if [ "${#lines[@]}" -lt 1000 ]; then + lines+=("$line") + continue + fi + # complexity of regex impacts runtime of each request to modify the content + # using removal of whole HTML tag as multiple matches with different classes in same element are not possible + # printf to inject both quoting characters " and ' + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + # printf to inject both quoting characters " and ' + printf ').*>.*<\/\\1[^>]*>@@g\n' + lines=() + done + # process last chunk with less than 1000 entries + if [ "${#lines[@]}" -gt 0 ]; then + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + printf ').*>.*<\/\\1[^>]*>@@g\n' + fi shopt -u lastpipe ) >> "${filterfile}" - debug 1 "... registering ${list}_attribute_global_name_only in actionfile ..." + debug 1 "... registering ${list}_attribute filters in actionfile ..." ( echo "{ +filter{${list}_attribute_global_name_only} }" echo "/" + echo "{ +filter{${list}_attribute_exact} }" + echo "/" + echo "{ +filter{${list}_attribute_contain} }" + echo "/" + echo "{ +filter{${list}_attribute_startswith} }" + echo "/" + echo "{ +filter{${list}_attribute_endswith} }" + echo "/" ) >> "${actionfile}" debug 1 "... registered ..." - #debug 1 "... processing 'attribute'-matches ..." 
- #sed ' - ## only process gloabl classes - #/^##\[.*/!d - ## remove all matches with combinators - #/^##\[.*[>+~].*/d - ## cleanup - #s/^##//g - ## convert attribute based filters with exact match with - #s/^\[\([^=^]*\)"*=\(.*\)\]/s@\1=\2>@@g/g - ## convert attribute based filter with contain match - #s/^\[\([^=^]*\)"*\*="*\([^"]*\)"*\]/s@\1=".*\2.*">@@g/g - ## convert attribute based filter with startwith match - #s/^\[\([^=]*\)"*^="*\([^"]*\)"*\]/s@\1="\2.*">@@g/g - ## convert attribute based filter with endswith match - #s/^\[\([^=^]*\)"*\$="*\([^"]*\)"*\]/s@\1=".*\2">@@g/g - ## convert dots - #s/\.\([a-zA-Z0-9]\)/\\.\1/g - #' "${html_file}" >> "${filterfile}" - - #debug 1 "... registering ${list}_attribute in actionfile ..." - #( - # echo "{ +filter{${list}_attribute} }" - # echo "*" - #) >> "${actionfile}" - #debug 1 "... registered ..." # FIXME: add attribute handling with domains # FIXME: add attribute handling with combinators # FIXME: add combination of classes and attributes: ##.OUTBRAIN[data-widget-id^="FMS_REELD_"] diff --git a/tests/config.py b/tests/config.py index 5f3bb12..df40ece 100644 --- a/tests/config.py +++ b/tests/config.py @@ -7,9 +7,14 @@ "MyAdsId3", # id match "AdRight2", # class match with element having multiple classes "data-ad-manager-id", # attribute match + 'data-role="tile-ads-module"', # attribute exact match + 'onclick="content.ad/"', # attribute contain match + 'class="adDisplay-module_foobar"', # attribute startswith match + "onclick=\"location.href='http://www.reimageplus.com/foobar'", # attribute startswith match ] content_exists = [ "ajlkl", # should exist, although one element is removed by privoxy + '"adDisplay-modul"', # should exist ] # FIXME: see https://github.com/Andrwe/privoxy-blocklist/issues/35 diff --git a/tests/response.html b/tests/response.html index 28ceb21..8882c68 100644 --- a/tests/response.html +++ b/tests/response.html @@ -6,5 +6,11 @@
multiple classes that should exist
id should be removed
name-only attibute should be removed
+
exact match attibute should be removed
+
1. contain match attribute should be removed
+
2. contain match attribute should be removed
+
1. startswith match attribute should be removed
+
startswith match attribute should be exist
+
2. startswith match attribute should be removed
From 321cfdd1c1715c91da441a0b0d99a408d36bed16 Mon Sep 17 00:00:00 2001 From: Andrwe Lord Weber Date: Sun, 28 Jan 2024 22:42:42 +0100 Subject: [PATCH 09/11] implement flag for content filter activation Signed-off-by: Andrwe Lord Weber --- privoxy-blocklist.sh | 713 +++++++++++++++++++--------------- tests/conftest.py | 20 + tests/test_01_root_execute.py | 16 +- 3 files changed, 426 insertions(+), 323 deletions(-) diff --git a/privoxy-blocklist.sh b/privoxy-blocklist.sh index 872be2d..51d251c 100755 --- a/privoxy-blocklist.sh +++ b/privoxy-blocklist.sh @@ -30,6 +30,18 @@ set -euo pipefail # dependencies DEPENDS=('privoxy' 'sed' 'grep' 'bash' 'wget') +# types of content filters +# used in conftest.py, thus keep structure +FILTERTYPES=( + "attribute_global_name" + "attribute_global_exact" + "attribute_global_contain" + "attribute_global_startswith" + "attribute_global_endswith" + "class_global" + "id_global" +) + ###################################################################### # # No changes needed after this line. @@ -41,13 +53,16 @@ function usage() { echo "${TMPNAME:-This} is a script to convert AdBlockPlus-lists into Privoxy-lists and install them." echo " " echo "Options:" - echo " -h: Show this help." - echo " -c: Path to script configuration file. (default = ${SCRIPTCONF} - OS specific)" - echo " -q: Don't give any output." - echo " -v 1: Enable verbosity 1. Show a little bit more output." - echo " -v 2: Enable verbosity 2. Show a lot more output." - echo " -v 3: Enable verbosity 3. Show all possible output and don't delete temporary files.(For debugging only!!)" - echo " -r: Remove all lists build by this script." + echo " -h: Show this help." + echo " -c: Path to script configuration file. (default = ${SCRIPTCONF} - OS specific)" + echo " -f filter: only activate given content filter, can be used multiple times. (default: empty, content-filter disabled)" + echo " Supported values: ${FILTERTYPES[*]}" + echo " -q: Don't give any output." 
+ echo " -v 1: Enable verbosity 1. Show a little bit more output." + echo " -v 2: Enable verbosity 2. Show a lot more output." + echo " -v 3: Enable verbosity 3. Show all possible output and don't delete temporary files.(For debugging only!!)" + echo " -V: Show version." + echo " -r: Remove all lists build by this script." } function get_config_path() { @@ -98,7 +113,15 @@ function prepare() { # array of URL for AdblockPlus lists # for more sources just add it within the round brackets -URLS=("https://easylist-downloads.adblockplus.org/easylistgermany.txt" "https://easylist-downloads.adblockplus.org/easylist.txt") +URLS=( + "https://easylist-downloads.adblockplus.org/easylistgermany.txt" + "https://easylist-downloads.adblockplus.org/easylist.txt" +) + +# array of content filters to convert +# for supported values check: $0 -h +# empty by default to deactivate as content filters slowdown privoxy a lot +FILTERS=() # config for privoxy initscript providing PRIVOXY_CONF, PRIVOXY_USER and PRIVOXY_GROUP INIT_CONF="/etc/conf.d/privoxy" @@ -134,6 +157,9 @@ EOF if [ -n "${OPT_DBG:-}" ]; then DBG="${OPT_DBG}" fi + if [ -n "${OPT_FILTERS[*]}" ]; then + FILTERS=("${OPT_FILTERS[@]}") + fi # load privoxy config # shellcheck disable=SC1090 if [[ -r "${INIT_CONF:-no-init-conf}" ]]; then @@ -190,6 +216,11 @@ function info() { printf '\e[1;33m%s\e[0m\n' "$@" } +# shellcheck disable=SC2317 # function is called in case of FILTERS not empty +function filter_active() { + grep -qxF "$1" <(printf '%s\n' "${FILTERS[@]}") +} + # shellcheck disable=SC2317 function main() { for url in "${URLS[@]}"; do @@ -268,326 +299,354 @@ function main() { s/^|\([^|][^|]*\)|/^\1\$/g;s/|$/\$/g ' "${address_file}" >> "${actionfile}" - debug 1 "... creating filterfile for ${list} ..." - debug 1 "... processing global 'class'-matches ..." 
- echo "FILTER: ${list}_class_global Tag filter of ${list}" > "${filterfile}" - ( - # allow handling of left-over lines from last while-loop-run - shopt -s lastpipe - lines=() - # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex - sed -e ' - # only process gloabl class matches - /^##\..*/!d - # remove all combinations with attribute matching - /^##\..*\[.*/d - # remove all matches with combinators - /^##\..*[>+~ ].*/d - # cleanup - s/^##\.//g - # prepare regex merging - s/$/|/ - ' "${html_file}" | while read -r line; do - # number of matches within one rule impacts runtime of each request to modify the content - if [ "${#lines[@]}" -lt 1000 ]; then - lines+=("$line") - continue - fi - # complexity of regex impacts runtime of each request to modify the content - # using removal of whole HTML tag as multiple matches with different classes in same element are not possible - # printf to inject both quoting characters " and ' - printf 's@<([a-zA-Z0-9]+)\\s+.*class=[%s][^%s]*(' "\"'" "\"'" - # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing - printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' - # printf to inject both quoting characters " and ' - printf ')[^%s]*[%s].*>.*<\/\\1[^>]*>@@g\n' "\"'" "\"'" - lines=() - done - # process last chunk with less than 1000 entries - if [ "${#lines[@]}" -gt 0 ]; then - printf 's@<([a-zA-Z0-9]+)\\s+.*class=[%s][^%s]*(' "\"'" "\"'" - printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' - printf ')[^%s]*[%s].*>.*<\/\\1[^>]*>@@g\n' "\"'" "\"'" + echo > "${filterfile}" + if [ -n "${FILTERS[*]}" ]; then + debug 1 "... creating filterfile for ${list} ..." + if filter_active "class_global"; then + debug 1 "... processing global 'class'-matches ..." 
+ ( + # allow handling of left-over lines from last while-loop-run + shopt -s lastpipe + echo "FILTER: ${list}_class_global Tag filter of ${list}" + lines=() + # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex + sed -e ' + # only process gloabl class matches + /^##\..*/!d + # remove all combinations with attribute matching + /^##\..*\[.*/d + # remove all matches with combinators + /^##\..*[>+~ ].*/d + # cleanup + s/^##\.//g + # prepare regex merging + s/$/|/ + ' "${html_file}" | while read -r line; do + # number of matches within one rule impacts runtime of each request to modify the content + if [ "${#lines[@]}" -lt 1000 ]; then + lines+=("$line") + continue + fi + # complexity of regex impacts runtime of each request to modify the content + # using removal of whole HTML tag as multiple matches with different classes in same element are not possible + # printf to inject both quoting characters " and ' + printf 's@<([a-zA-Z0-9]+)\\s+.*class=[%s][^%s]*(' "\"'" "\"'" + # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + # printf to inject both quoting characters " and ' + printf ')[^%s]*[%s].*>.*<\/\\1[^>]*>@@g\n' "\"'" "\"'" + lines=() + done + # process last chunk with less than 1000 entries + if [ "${#lines[@]}" -gt 0 ]; then + printf 's@<([a-zA-Z0-9]+)\\s+.*class=[%s][^%s]*(' "\"'" "\"'" + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + printf ')[^%s]*[%s].*>.*<\/\\1[^>]*>@@g\n' "\"'" "\"'" + fi + shopt -u lastpipe + ) >> "${filterfile}" + + debug 1 "... registering ${list}_class_global in actionfile ..." + ( + echo "{ +filter{${list}_class_global} }" + echo "/" + ) >> "${actionfile}" + debug 1 "... registered ..." + # FIXME: add class handling with domains + # FIXME: add class handling with combinators + # FIXME: add class with defined HTML tag ? 
+ # FIXME: add class with cascading fi - shopt -u lastpipe - ) >> "${filterfile}" - - debug 1 "... registering ${list}_class_global in actionfile ..." - ( - echo "{ +filter{${list}_class_global} }" - echo "/" - ) >> "${actionfile}" - debug 1 "... registered ..." - # FIXME: add class handling with domains - # FIXME: add class handling with combinators - # FIXME: add class with defined HTML tag ? - # FIXME: add class with cascading - - debug 1 "... processing global 'id'-matches ..." - echo "FILTER: ${list}_id_global Tag filter of ${list}" >> "${filterfile}" - ( - # allow handling of left-over lines from last while-loop-run - shopt -s lastpipe - lines=() - # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex - sed -e ' - # only process gloabl id-only matches - /^###.*/!d - # remove all matches with combinators - /^###.*[>+~ ].*/d - # cleanup - s/^###//g - # prepare regex merging - s/$/|/ - ' "${html_file}" | while read -r line; do - # number of matches within one rule impacts runtime of each request to modify the content - if [ "${#lines[@]}" -lt 1000 ]; then - lines+=("$line") - continue - fi - # complexity of regex impacts runtime of each request to modify the content - # using removal of whole HTML tag as multiple matches with different classes in same element are not possible - # printf to inject both quoting characters " and ' - printf 's@<([a-zA-Z0-9]+)\\s+.*id=[%s](' "\"'" - # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing - printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' - # printf to inject both quoting characters " and ' - printf ')[%s].*>.*<\/\\1[^>]*>@@g\n' "\"'" - lines=() - done - # process last chunk with less than 1000 entries - if [ "${#lines[@]}" -gt 0 ]; then - printf 's@<([a-zA-Z0-9]+)\\s+.*id=[%s](' "\"'" - printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' - printf ')[%s].*>.*<\/\\1[^>]*>@@g\n' "\"'" + + if filter_active 
"id_global"; then + debug 1 "... processing global 'id'-matches ..." + echo "FILTER: ${list}_id_global Tag filter of ${list}" >> "${filterfile}" + ( + # allow handling of left-over lines from last while-loop-run + shopt -s lastpipe + lines=() + # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex + sed -e ' + # only process gloabl id-only matches + /^###.*/!d + # remove all matches with combinators + /^###.*[>+~ ].*/d + # cleanup + s/^###//g + # prepare regex merging + s/$/|/ + ' "${html_file}" | while read -r line; do + # number of matches within one rule impacts runtime of each request to modify the content + if [ "${#lines[@]}" -lt 1000 ]; then + lines+=("$line") + continue + fi + # complexity of regex impacts runtime of each request to modify the content + # using removal of whole HTML tag as multiple matches with different classes in same element are not possible + # printf to inject both quoting characters " and ' + printf 's@<([a-zA-Z0-9]+)\\s+.*id=[%s](' "\"'" + # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + # printf to inject both quoting characters " and ' + printf ')[%s].*>.*<\/\\1[^>]*>@@g\n' "\"'" + lines=() + done + # process last chunk with less than 1000 entries + if [ "${#lines[@]}" -gt 0 ]; then + printf 's@<([a-zA-Z0-9]+)\\s+.*id=[%s](' "\"'" + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + printf ')[%s].*>.*<\/\\1[^>]*>@@g\n' "\"'" + fi + shopt -u lastpipe + ) >> "${filterfile}" + + debug 1 "... registering ${list}_id_global in actionfile ..." + ( + echo "{ +filter{${list}_id_global} }" + echo "/" + ) >> "${actionfile}" + debug 1 "... registered ..." + # FIXME: add id handling with domains + # FIXME: add id handling with combinators + # FIXME: add id with cascading fi - shopt -u lastpipe - ) >> "${filterfile}" - - debug 1 "... 
registering ${list}_id_global in actionfile ..." - ( - echo "{ +filter{${list}_id_global} }" - echo "/" - ) >> "${actionfile}" - debug 1 "... registered ..." - # FIXME: add id handling with domains - # FIXME: add id handling with combinators - # FIXME: add id with cascading - - debug 1 "... processing 'attribute'-matches with no HTML tag ..." - ( - shopt -s lastpipe - # allow handling of left-over lines from last while-loop-run - echo "FILTER: ${list}_attribute_global_name_only Tag filter of ${list}" - lines=() - # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex - sed -e ' - # only process gloabl classes - /^##\[[^=][^=]*$/!d - # remove all matches with combinators - /^##.*[>+~ ].*/d - # cleanup - s/^##//g - # convert attribute name-only matches - s/^\[\([^=][^=]*\)\]/\1/g - # convert dots - s/\.\([^\.]\)/\\.\1/g - s/$/|/ - ' "${html_file}" | sort -u | while read -r line; do - # number of matches within one rule impacts runtime of each request to modify the content - if [ "${#lines[@]}" -lt 1000 ]; then - lines+=("$line") - continue + + debug 1 "... processing 'attribute'-matches with no HTML tag ..." 
+ ( + shopt -s lastpipe + + if filter_active "attribute_global_name"; then + # allow handling of left-over lines from last while-loop-run + echo "FILTER: ${list}_attribute_global_name Tag filter of ${list}" + lines=() + # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex + sed -e ' + # only process gloabl classes + /^##\[[^=][^=]*$/!d + # remove all matches with combinators + /^##.*[>+~ ].*/d + # cleanup + s/^##//g + # convert attribute name-only matches + s/^\[\([^=][^=]*\)\]/\1/g + # convert dots + s/\.\([^\.]\)/\\.\1/g + s/$/|/ + ' "${html_file}" | sort -u | while read -r line; do + # number of matches within one rule impacts runtime of each request to modify the content + if [ "${#lines[@]}" -lt 1000 ]; then + lines+=("$line") + continue + fi + # complexity of regex impacts runtime of each request to modify the content + # using removal of whole HTML tag as multiple matches with different classes in same element are not possible + # printf to inject both quoting characters " and ' + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + # printf to inject both quoting characters " and ' + printf ').*>.*<\/\\1[^>]*>@@g\n' + lines=() + done + # process last chunk with less than 1000 entries + if [ "${#lines[@]}" -gt 0 ]; then + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + printf ').*>.*<\/\\1[^>]*>@@g\n' + fi fi - # complexity of regex impacts runtime of each request to modify the content - # using removal of whole HTML tag as multiple matches with different classes in same element are not possible - # printf to inject both quoting characters " and ' - printf 's@<([a-zA-Z0-9]+)\\s+.*(' - # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing - printf '%s\n' "${lines[@]}" | 
sed '$ s/|//' | tr -d '\n' - # printf to inject both quoting characters " and ' - printf ').*>.*<\/\\1[^>]*>@@g\n' - lines=() - done - # process last chunk with less than 1000 entries - if [ "${#lines[@]}" -gt 0 ]; then - printf 's@<([a-zA-Z0-9]+)\\s+.*(' - printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' - printf ').*>.*<\/\\1[^>]*>@@g\n' - fi - echo "FILTER: ${list}_attribute_exact Tag filter of ${list}" - lines=() - # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex - sed -e ' - # only process gloabl classes - /^##\[[^=^*][^=^*]*=.*$/!d - # remove all matches with combinators - /^##.*[>+~ ].*/d - # cleanup - s/^##//g - # convert attribute name-only matches - s/^\[\([^=][^=]*\)=\(.*\)\]/\1=\2/g - # convert dots - s/\.\([^\.]\)/\\.\1/g - s/$/|/ - ' "${html_file}" | sort -u | while read -r line; do - # number of matches within one rule impacts runtime of each request to modify the content - if [ "${#lines[@]}" -lt 1000 ]; then - lines+=("$line") - continue + if filter_active "attribute_global_exact"; then + echo "FILTER: ${list}_attribute_global_exact Tag filter of ${list}" + lines=() + # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex + sed -e ' + # only process gloabl classes + /^##\[[^=^*][^=^*]*=.*$/!d + # remove all matches with combinators + /^##.*[>+~ ].*/d + # cleanup + s/^##//g + # convert attribute name-only matches + s/^\[\([^=][^=]*\)=\(.*\)\]/\1=\2/g + # convert dots + s/\.\([^\.]\)/\\.\1/g + s/$/|/ + ' "${html_file}" | sort -u | while read -r line; do + # number of matches within one rule impacts runtime of each request to modify the content + if [ "${#lines[@]}" -lt 1000 ]; then + lines+=("$line") + continue + fi + # complexity of regex impacts runtime of each request to modify the content + # using removal of whole HTML tag as multiple matches with different classes in same element are not possible + # printf to inject both quoting characters " and ' 
+ printf 's@<([a-zA-Z0-9]+)\\s+.*(' + # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + # printf to inject both quoting characters " and ' + printf ').*>.*<\/\\1[^>]*>@@g\n' + lines=() + done + # process last chunk with less than 1000 entries + if [ "${#lines[@]}" -gt 0 ]; then + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + printf ').*>.*<\/\\1[^>]*>@@g\n' + fi fi - # complexity of regex impacts runtime of each request to modify the content - # using removal of whole HTML tag as multiple matches with different classes in same element are not possible - # printf to inject both quoting characters " and ' - printf 's@<([a-zA-Z0-9]+)\\s+.*(' - # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing - printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' - # printf to inject both quoting characters " and ' - printf ').*>.*<\/\\1[^>]*>@@g\n' - lines=() - done - # process last chunk with less than 1000 entries - if [ "${#lines[@]}" -gt 0 ]; then - printf 's@<([a-zA-Z0-9]+)\\s+.*(' - printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' - printf ').*>.*<\/\\1[^>]*>@@g\n' - fi - echo "FILTER: ${list}_attribute_contain Tag filter of ${list}" - lines=() - # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex - sed -e ' - # only process gloabl classes - /^##\[[^*][^*]*\*=.*$/!d - # remove all matches with combinators - /^##.*[>+~ ].*/d - # cleanup - s/^##//g - # convert dots - s/\.\([^\.]\)/\\.\1/g - # convert attribute based filter with contain match - s/^\[\([^*][^*]*\)\*=\(["'"'"']*\)\([^"][^"]*\)"*\(["'"'"']*\)\]/\1=\2.*\3.*\4/g - s/$/|/ - ' "${html_file}" | sort -u | while read -r line; do - # number of matches within one rule impacts runtime of each request to modify the content - if [ "${#lines[@]}" -lt 1000 
]; then - lines+=("$line") - continue + if filter_active "attribute_global_contain"; then + echo "FILTER: ${list}_attribute_global_contain Tag filter of ${list}" + lines=() + # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex + sed -e ' + # only process gloabl classes + /^##\[[^*][^*]*\*=.*$/!d + # remove all matches with combinators + /^##.*[>+~ ].*/d + # cleanup + s/^##//g + # convert dots + s/\.\([^\.]\)/\\.\1/g + # convert attribute based filter with contain match + s/^\[\([^*][^*]*\)\*=\(["'"'"']*\)\([^"][^"]*\)"*\(["'"'"']*\)\]/\1=\2.*\3.*\4/g + s/$/|/ + ' "${html_file}" | sort -u | while read -r line; do + # number of matches within one rule impacts runtime of each request to modify the content + if [ "${#lines[@]}" -lt 1000 ]; then + lines+=("$line") + continue + fi + # complexity of regex impacts runtime of each request to modify the content + # using removal of whole HTML tag as multiple matches with different classes in same element are not possible + # printf to inject both quoting characters " and ' + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + # printf to inject both quoting characters " and ' + printf ').*>.*<\/\\1[^>]*>@@g\n' + lines=() + done + # process last chunk with less than 1000 entries + if [ "${#lines[@]}" -gt 0 ]; then + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + printf ').*>.*<\/\\1[^>]*>@@g\n' + fi fi - # complexity of regex impacts runtime of each request to modify the content - # using removal of whole HTML tag as multiple matches with different classes in same element are not possible - # printf to inject both quoting characters " and ' - printf 's@<([a-zA-Z0-9]+)\\s+.*(' - # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing - 
printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' - # printf to inject both quoting characters " and ' - printf ').*>.*<\/\\1[^>]*>@@g\n' - lines=() - done - # process last chunk with less than 1000 entries - if [ "${#lines[@]}" -gt 0 ]; then - printf 's@<([a-zA-Z0-9]+)\\s+.*(' - printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' - printf ').*>.*<\/\\1[^>]*>@@g\n' - fi - echo "FILTER: ${list}_attribute_startswith Tag filter of ${list}" - lines=() - # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex - sed -e ' - # only process gloabl classes - /^##\[[^=^][^=^]*\^=.*$/!d - # remove all matches with combinators - /^##.*[>+~ ].*/d - # cleanup - s/^##//g - # convert dots - s/\.\([^\.]\)/\\.\1/g - # convert attribute based filter with startwith match - s/^\[\([^^][^^]*\)^=\(["'"'"']*\)\(.*[^"'"'"']\)\(["'"'"']*\)\]/\1=\2\3.*\4/g - s/$/|/ - ' "${html_file}" | sort -u | while read -r line; do - # number of matches within one rule impacts runtime of each request to modify the content - if [ "${#lines[@]}" -lt 1000 ]; then - lines+=("$line") - continue + if filter_active "attribute_global_startswith"; then + echo "FILTER: ${list}_attribute_global_startswith Tag filter of ${list}" + lines=() + # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex + sed -e ' + # only process gloabl classes + /^##\[[^=^][^=^]*\^=.*$/!d + # remove all matches with combinators + /^##.*[>+~ ].*/d + # cleanup + s/^##//g + # convert dots + s/\.\([^\.]\)/\\.\1/g + # convert attribute based filter with startwith match + s/^\[\([^^][^^]*\)^=\(["'"'"']*\)\(.*[^"'"'"']\)\(["'"'"']*\)\]/\1=\2\3.*\4/g + s/$/|/ + ' "${html_file}" | sort -u | while read -r line; do + # number of matches within one rule impacts runtime of each request to modify the content + if [ "${#lines[@]}" -lt 1000 ]; then + lines+=("$line") + continue + fi + # complexity of regex impacts runtime of each request to modify the content + # 
using removal of whole HTML tag as multiple matches with different classes in same element are not possible + # printf to inject both quoting characters " and ' + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + # printf to inject both quoting characters " and ' + printf ').*>.*<\/\\1[^>]*>@@g\n' + lines=() + done + # process last chunk with less than 1000 entries + if [ "${#lines[@]}" -gt 0 ]; then + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + printf ').*>.*<\/\\1[^>]*>@@g\n' + fi fi - # complexity of regex impacts runtime of each request to modify the content - # using removal of whole HTML tag as multiple matches with different classes in same element are not possible - # printf to inject both quoting characters " and ' - printf 's@<([a-zA-Z0-9]+)\\s+.*(' - # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing - printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' - # printf to inject both quoting characters " and ' - printf ').*>.*<\/\\1[^>]*>@@g\n' - lines=() - done - # process last chunk with less than 1000 entries - if [ "${#lines[@]}" -gt 0 ]; then - printf 's@<([a-zA-Z0-9]+)\\s+.*(' - printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' - printf ').*>.*<\/\\1[^>]*>@@g\n' - fi - echo "FILTER: ${list}_attribute_endswith Tag filter of ${list}" - lines=() - # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex - sed -e ' - # only process gloabl classes - /^##\[[^$][^=$]*\$=.*$/!d - # remove all matches with combinators - /^##.*[>+~ ].*/d - # cleanup - s/^##//g - # convert dots - s/\.\([^\.]\)/\\.\1/g - # convert attribute based filter with endswith match - s/^\[\([^\$][^\$]*\)\$=\(["'"'"']*\)\(.*[^"'"'"']\)\(["'"'"']*\)\]/\1=\2.*\3\4/g - s/$/|/ - ' 
"${html_file}" | sort -u | while read -r line; do - # number of matches within one rule impacts runtime of each request to modify the content - if [ "${#lines[@]}" -lt 1000 ]; then - lines+=("$line") - continue + if filter_active "attribute_global_endswith"; then + echo "FILTER: ${list}_attribute_global_endswith Tag filter of ${list}" + lines=() + # using while-loop as privoxy cannot handle more than 2000 or-connected strings within one regex + sed -e ' + # only process gloabl classes + /^##\[[^$][^=$]*\$=.*$/!d + # remove all matches with combinators + /^##.*[>+~ ].*/d + # cleanup + s/^##//g + # convert dots + s/\.\([^\.]\)/\\.\1/g + # convert attribute based filter with endswith match + s/^\[\([^\$][^\$]*\)\$=\(["'"'"']*\)\(.*[^"'"'"']\)\(["'"'"']*\)\]/\1=\2.*\3\4/g + s/$/|/ + ' "${html_file}" | sort -u | while read -r line; do + # number of matches within one rule impacts runtime of each request to modify the content + if [ "${#lines[@]}" -lt 1000 ]; then + lines+=("$line") + continue + fi + # complexity of regex impacts runtime of each request to modify the content + # using removal of whole HTML tag as multiple matches with different classes in same element are not possible + # printf to inject both quoting characters " and ' + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + # printf to inject both quoting characters " and ' + printf ').*>.*<\/\\1[^>]*>@@g\n' + lines=() + done + # process last chunk with less than 1000 entries + if [ "${#lines[@]}" -gt 0 ]; then + printf 's@<([a-zA-Z0-9]+)\\s+.*(' + printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' + printf ').*>.*<\/\\1[^>]*>@@g\n' + fi fi - # complexity of regex impacts runtime of each request to modify the content - # using removal of whole HTML tag as multiple matches with different classes in same element are not possible - # printf to inject 
both quoting characters " and ' - printf 's@<([a-zA-Z0-9]+)\\s+.*(' - # using tr to merge lines because sed-based approachs takes up to 6 MB RAM and >10 seconds during testing - printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' - # printf to inject both quoting characters " and ' - printf ').*>.*<\/\\1[^>]*>@@g\n' - lines=() - done - # process last chunk with less than 1000 entries - if [ "${#lines[@]}" -gt 0 ]; then - printf 's@<([a-zA-Z0-9]+)\\s+.*(' - printf '%s\n' "${lines[@]}" | sed '$ s/|//' | tr -d '\n' - printf ').*>.*<\/\\1[^>]*>@@g\n' - fi - shopt -u lastpipe - ) >> "${filterfile}" - - debug 1 "... registering ${list}_attribute filters in actionfile ..." - ( - echo "{ +filter{${list}_attribute_global_name_only} }" - echo "/" - echo "{ +filter{${list}_attribute_exact} }" - echo "/" - echo "{ +filter{${list}_attribute_contain} }" - echo "/" - echo "{ +filter{${list}_attribute_startswith} }" - echo "/" - echo "{ +filter{${list}_attribute_endswith} }" - echo "/" - ) >> "${actionfile}" - debug 1 "... registered ..." - - # FIXME: add attribute handling with domains - # FIXME: add attribute handling with combinators - # FIXME: add combination of classes and attributes: ##.OUTBRAIN[data-widget-id^="FMS_REELD_"] + shopt -u lastpipe + ) >> "${filterfile}" + + debug 1 "... registering ${list}_attribute filters in actionfile ..." 
+ ( + if filter_active "attribute_global_name"; then + echo "{ +filter{${list}_attribute_global_name} }" + echo "/" + fi + if filter_active "attribute_global_exact"; then + echo "{ +filter{${list}_attribute_global_exact} }" + echo "/" + fi + if filter_active "attribute_global_contain"; then + echo "{ +filter{${list}_attribute_global_contain} }" + echo "/" + fi + if filter_active "attribute_global_startswith"; then + echo "{ +filter{${list}_attribute_global_startswith} }" + echo "/" + fi + if filter_active "attribute_global_endswith"; then + echo "{ +filter{${list}_attribute_global_endswith} }" + echo "/" + fi + ) >> "${actionfile}" + debug 1 "... registered ..." + + # FIXME: add attribute handling with domains + # FIXME: add attribute handling with combinators + # FIXME: add combination of classes and attributes: ##.OUTBRAIN[data-widget-id^="FMS_REELD_"] + fi # create domain based whitelist @@ -690,16 +749,16 @@ function remove() { VERBOSE=() method="main" OS="$(uname)" +OPT_FILTERS=() # loop for options -while getopts ":c:hrqv:V" opt; do +while getopts ":c:f:hrqv:V" opt; do case "${opt}" in "c") SCRIPTCONF="${OPTARG}" ;; - "v") - OPT_DBG="${OPTARG}" - VERBOSE=("-v") + "f") + OPT_FILTERS+=("${OPTARG,,}") ;; "q") OPT_DBG=-1 @@ -707,6 +766,10 @@ while getopts ":c:hrqv:V" opt; do "r") method="remove" ;; + "v") + OPT_DBG="${OPTARG}" + VERBOSE=("-v") + ;; "V") #
is replaced by release process echo "Version:
" @@ -723,6 +786,13 @@ while getopts ":c:hrqv:V" opt; do esac done +if [ -n "${OPT_FILTERS[*]}" ]; then + if unknown="$(grep -vxFf <(printf '%s\n' "${FILTERTYPES[@]}") <(printf '%s\n' "${OPT_FILTERS[@]}"))"; then + error "Unknown filters: ${unknown}" + exit 1 + fi +fi + prepare trap 'rm -fr "${TMPDIR}";exit' INT TERM EXIT @@ -731,6 +801,7 @@ lock debug 2 "URL-List: ${URLS[*]}" debug 2 "Privoxy-Configdir: ${PRIVOXY_DIR}" debug 2 "Temporary directory: ${TMPDIR}" +debug 2 "Content filters: ${OPT_FILTERS[*]:-disabled}" "${method}" # restore default exit command diff --git a/tests/conftest.py b/tests/conftest.py index 2325574..901636b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -88,6 +88,26 @@ def webserver(httpserver) -> UrlParsed: return UrlParsed(httpserver.url_for("/")) +@pytest.fixture(scope="module") +def filtertypes() -> list[str]: + """Return filtertypes supported by privoxy-blocklist.""" + filter_types = [] + with Path(__file__).parent.parent.joinpath("privoxy-blocklist.sh").open( + "r", encoding="UTF-8" + ) as f_h: + found_line = False + for line in f_h.readlines(): + if not found_line and not line.startswith("FILTERTYPES"): + continue + if line.startswith("FILTERTYPES"): + found_line = True + continue + if line.endswith(")\n"): + break + filter_types.append(line.strip().strip('"')) + return filter_types + + @pytest.fixture(scope="module") def privoxy_blocklist() -> str: """Return the path to privoxy-blocklist.sh.""" diff --git a/tests/test_01_root_execute.py b/tests/test_01_root_execute.py index 16b1b51..0d59fd7 100644 --- a/tests/test_01_root_execute.py +++ b/tests/test_01_root_execute.py @@ -48,9 +48,21 @@ def test_version_option(shell, tmp_path, privoxy_blocklist) -> None: assert ret.stdout == "Version: 0.0.1\n" -def test_next_run(shell, privoxy_blocklist) -> None: +def test_filter_check(shell, privoxy_blocklist) -> None: + """Test filtertype check.""" + cmd = [privoxy_blocklist, "-f", "bla"] + ret_script = shell.run(*cmd) + assert 
ret_script.returncode == 1 + assert "" == ret_script.stdout + assert "Unknown filters: bla" in ret_script.stderr.strip() + + +def test_next_run(shell, privoxy_blocklist, filtertypes) -> None: """Test followup runs.""" - ret_script = shell.run(privoxy_blocklist) + cmd = [privoxy_blocklist] + for filtertype in filtertypes: + cmd.extend(["-f", filtertype]) + ret_script = shell.run(*cmd) assert ret_script.returncode == 0 ret_privo = shell.run( "/usr/sbin/privoxy", "--no-daemon", "--config-test", "/etc/privoxy/config" From 4ed83f9f5cc1c8838cf8fa9ba3ee25d85e4909f6 Mon Sep 17 00:00:00 2001 From: Andrwe Lord Weber Date: Sun, 28 Jan 2024 23:04:24 +0100 Subject: [PATCH 10/11] update implementation status in README Signed-off-by: Andrwe Lord Weber --- README.md | 21 ++++++++++++++++----- privoxy-blocklist.sh | 3 ++- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index c9bf04f..b6861ec 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,6 @@ The following table shows features of AdBlock Plus filters and there status with | Feature | Type | Status | Test | | ------- | ---- | ------ | ---- | -| `#$#` | CSS selector - Snippet filter | :question: | :question: | | `:-abp-contains()` | extended CSS selector | :question: | :question: | | `:-abp-has()` | extended CSS selector | :question: | :question: | | `:-abp-properties()` | extended CSS selector | :question: | :question: | @@ -33,10 +32,22 @@ The following table shows features of AdBlock Plus filters and there status with | `\|…\|` | block exact domain matching including scheme | :question: | :question: | | `!…` | comments | :white_check_mark: | | | `csp=` | filter options | :question: | :question: | -| `##…[…]` | CSS attribute selector | :question: | :question: | -| `##` | CSS selector - Element hiding | :white_check_mark: | | -| `#?#` | CSS selector - Element hiding emulation | :question: | :question: | -| `#@#` | CSS selector - Element hiding exception | :question: | :question: | +| 
`##.class` | global CSS attribute selector with matching for class | :white_check_mark: | :white_check_mark: | +| `###id` | global CSS attribute selector with matching for id | :white_check_mark: | :white_check_mark: | +| `##[attribute]` | global CSS attribute selector with matching for attribute-name | :white_check_mark: | :white_check_mark: | +| `##[attribute=value]` | global CSS attribute selector with matching for attribute-value pair | :white_check_mark: | :white_check_mark: | +| `##[attribute^=value]` | global CSS attribute selector with matching for attribute with value starting with | :white_check_mark: | :white_check_mark: | +| `##[attribute$=value]` | global CSS attribute selector with matching for attribute with value ending with | :white_check_mark: | :white_check_mark: | +| `##[attribute*=value]` | global CSS attribute selector with matching for attribute with value containing | :white_check_mark: | :white_check_mark: | +| `##html-tag[attribute]` | global CSS attribute selector for html-tag with matching for attribute-name | :construction: | :construction: | +| `##html-tag[attribute=value]` | global CSS attribute selector for html-tag with matching for attribute-value pair | :construction: | :construction: | +| `##html-tag[attribute^=value]` | global CSS attribute selector for html-tag with matching for attribute with value starting with | :construction: | :construction: | +| `##html-tag[attribute$=value]` | global CSS attribute selector for html-tag with matching for attribute with value ending with | :construction: | :construction: | +| `##html-tag[attribute*=value]` | global CSS attribute selector for html-tag with matching for attribute with value containing | :construction: | :construction: | +| `[…]#$#` | domain based CSS selector - Snippet filter | :question: | :question: | +| `[…]##` | domain based CSS selector - Element hiding | :white_check_mark: | | +| `[…]#?#` | domain based CSS selector - Element hiding emulation | :question: | :question: 
| +| `[…]#@#` | domain based CSS selector - Element hiding exception | :question: | :question: | | `document` | filter options | :question: | :question: | | `~domain=` | filter options | :question: | :question: | | `domain=` | filter options | :question: | :question: | diff --git a/privoxy-blocklist.sh b/privoxy-blocklist.sh index 51d251c..6842fba 100755 --- a/privoxy-blocklist.sh +++ b/privoxy-blocklist.sh @@ -160,6 +160,8 @@ EOF if [ -n "${OPT_FILTERS[*]}" ]; then FILTERS=("${OPT_FILTERS[@]}") fi + debug 2 "Content filters: ${OPT_FILTERS[*]:-disabled}" + # load privoxy config # shellcheck disable=SC1090 if [[ -r "${INIT_CONF:-no-init-conf}" ]]; then @@ -801,7 +803,6 @@ lock debug 2 "URL-List: ${URLS[*]}" debug 2 "Privoxy-Configdir: ${PRIVOXY_DIR}" debug 2 "Temporary directory: ${TMPDIR}" -debug 2 "Content filters: ${OPT_FILTERS[*]:-disabled}" "${method}" # restore default exit command From 9f874fcf0ba74b26ee16962643452560fe05f497 Mon Sep 17 00:00:00 2001 From: Andrwe Lord Weber Date: Sun, 28 Jan 2024 23:16:56 +0100 Subject: [PATCH 11/11] update test configurations Signed-off-by: Andrwe Lord Weber --- tests/configs/debugging.conf | 5 +++++ tests/configs/url_extended_config.conf | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/tests/configs/debugging.conf b/tests/configs/debugging.conf index 05f13ef..ac9b183 100644 --- a/tests/configs/debugging.conf +++ b/tests/configs/debugging.conf @@ -7,6 +7,11 @@ URLS=( "https://easylist-downloads.adblockplus.org/easylist.txt" ) +# array of content filters to convert +# for supported values check: $0 -h +# empty by default to deactivate as content filters slow down privoxy a lot +FILTERS=() + # config for privoxy initscript providing PRIVOXY_CONF, PRIVOXY_USER and PRIVOXY_GROUP #INIT_CONF="/etc/conf.d/privoxy" diff --git a/tests/configs/url_extended_config.conf b/tests/configs/url_extended_config.conf index 236b1f8..65f9c9b 100644 --- a/tests/configs/url_extended_config.conf +++ b/tests/configs/url_extended_config.conf
@@ -8,6 +8,11 @@ URLS=( "https://raw.githubusercontent.com/easylist/easylist/master/easylist/easylist_allowlist_general_hide.txt" ) +# array of content filters to convert +# for supported values check: $0 -h +# empty by default to deactivate as content filters slow down privoxy a lot +FILTERS=() + # config for privoxy initscript providing PRIVOXY_CONF, PRIVOXY_USER and PRIVOXY_GROUP #INIT_CONF="/etc/conf.d/privoxy"