diff --git a/import/allowlist.txt b/import/allowlist.txt new file mode 100644 index 00000000..5150f236 --- /dev/null +++ b/import/allowlist.txt @@ -0,0 +1 @@ +wordpress.com \ No newline at end of file diff --git a/sources/_imported/README.md b/import/importlist.txt similarity index 83% rename from sources/_imported/README.md rename to import/importlist.txt index c2cf2d6b..ac5a4607 100644 --- a/sources/_imported/README.md +++ b/import/importlist.txt @@ -1,6 +1,6 @@ peertube_isolation fediverse.txt|https://peertube_isolation.frama.io/list/peertube_isolation.txt Adfilt TabloidRemover.txt|https://raw.githubusercontent.com/DandelionSprout/adfilt/master/Sensitive%20lists/TabloidRemover.txt -Adfilt TabloidRemover-MastodonCategoryForImports fediverse.txthttps://raw.githubusercontent.com/DandelionSprout/adfilt/master/Sensitive%20lists/TabloidRemover-MastodonCategoryForImports.csv +Adfilt TabloidRemover-MastodonCategoryForImports fediverse.txt|https://raw.githubusercontent.com/DandelionSprout/adfilt/master/Sensitive%20lists/TabloidRemover-MastodonCategoryForImports.csv Attack Vectors fake-local-journals-list.txt|https://raw.githubusercontent.com/MassMove/AttackVectors/master/LocalJournals/fake-local-journals-list.txt Windscribe clickbait.txt|https://assets.windscribe.com/custom_blocklists/clickbait.txt StevenBlack fakenews.txt|https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-only/hosts diff --git a/scripts/cleanup.sh b/scripts/cleanup.sh deleted file mode 100755 index b847091a..00000000 --- a/scripts/cleanup.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -# Use this script to cleanup sources and normalize them to a list of domains while keeping comments. -# e.g. -# ./scripts/cleanup.sh - -# Cleanup sources -## Special cleanup for imported sources of other formats (AdBlock, hosts, etc.) 
-find ./sources/_imported -type f -name "*.txt" -exec sed -ri 's/^[^#[:alnum:]]/#&/; s/^0\.0\.0\.0[[:space:]]*//i' {} \; -## Normalizes URLs into domains: lowercases, remove leading spaces, protocol (`x://`) `www.` subdomains, everything after `/`, only one space before `#`. Keeps comments intact -find ./sources -type f -name "*.txt" -exec sed -ri 'h; s/[^#]*//1; x; s/#.*//; s/.*/\L&/; s/^[[:space:]]*//i; s/^.*:\/\///i; s/^[.*]*//i; s/^www\.//i; s/\/[^[:space:]]*//i; s/[[:space:]].*$/ /i; G; s/(.*)\n/\1/' {} \; -find ./sources -type f -name "*.txt" -exec sed -ri 's/^www\.//i' {} \; # Removing "www." twice because unmaintained imported lists are weird. -## Remove duplicate domains from each source file (keeps repeated comments and empty lines for organization) -find ./sources -type f -name "*.txt" -exec bash -c ' - awk "(\$0 ~ /^[[:space:]]*#/ || NF == 0 || !seen[\$0]++)" "$0" > "$0_temp.txt"; - mv "$0_temp.txt" "$0"; -' {} \; \ No newline at end of file diff --git a/scripts/download.sh b/scripts/download.sh index d2dead05..1c324e5a 100755 --- a/scripts/download.sh +++ b/scripts/download.sh @@ -1,10 +1,10 @@ #!/usr/bin/env bash -# Use this script to download external blocklists from a non-txt list formatted as "file.txt|url" to the sources/_imported/ folder. +# Use this script to download external blocklists from a list formatted as "file.txt|url" to the "/import/original/" folder. # e.g. 
-# ./scripts/download.sh < list.md +# ./scripts/download.sh < list.txt IFS='|' while read FILE URL; do - wget -O ./sources/_imported/"$FILE" -- "$URL" + wget -O ./import/original/"$FILE" -- "$URL" done \ No newline at end of file diff --git a/scripts/import.sh b/scripts/import.sh index a14a3e38..c95dec4a 100755 --- a/scripts/import.sh +++ b/scripts/import.sh @@ -5,7 +5,28 @@ # ./scripts/import.sh # Download external blocklists -./scripts/download.sh < ./sources/_imported/README.md +./scripts/download.sh < ./import/importlist.txt -# Cleanup sources -./scripts/cleanup.sh \ No newline at end of file +# Copy to modified +cp -a ./import/original/. ./import/modified/ + +# Cleanup imported sources (normalization and dedup steps below are duplicated in update.sh) +## Special cleanup for imported sources of other formats (AdBlock, hosts, etc.) +find ./import/modified -type f -name "*.txt" -exec sed -ri 's/^[^#[:alnum:]]/#&/; s/^0\.0\.0\.0[[:space:]]*//i' {} \; +## Normalizes URLs into domains: lowercases, remove leading spaces, protocol (`x://`) `www.` subdomains, everything after `/`, only one space before `#`. Keeps comments intact +find ./import/modified -type f -name "*.txt" -exec sed -ri 'h; s/[^#]*//1; x; s/#.*//; s/.*/\L&/; s/^[[:space:]]*//i; s/^.*:\/\///i; s/^[.*]*//i; s/^www\.//i; s/\/[^[:space:]]*//i; s/[[:space:]].*$/ /i; G; s/(.*)\n/\1/' {} \; +find ./import/modified -type f -name "*.txt" -exec sed -ri 's/^www\.//i' {} \; # Removing "www." twice because unmaintained imported lists are weird. 
+## Remove duplicate domains from each source file (keeps repeated comments and empty lines for organization) +find ./import/modified -type f -name "*.txt" -exec bash -c ' + awk "(\$0 ~ /^[[:space:]]*#/ || NF == 0 || !seen[\$0]++)" "$0" > "$0_temp.txt"; + mv "$0_temp.txt" "$0"; +' {} \; + +# Remove entries from the allowlist +find ./import/modified -type f -name "*.txt" -exec bash -c ' + grep -vxFf "./import/allowlist.txt" "$0" > "$0_temp.txt"; + mv "$0_temp.txt" "$0"; +' {} \; + +# Copy to sources +cp -a ./import/modified/. ./sources/_imported/ \ No newline at end of file diff --git a/scripts/update.sh b/scripts/update.sh index 08933fbc..6145477e 100755 --- a/scripts/update.sh +++ b/scripts/update.sh @@ -4,8 +4,15 @@ # e.g. # ./scripts/update.sh -# Cleanup sources -./scripts/cleanup.sh +# Cleanup sources (subset of the cleanup code in import.sh, minus the import-only format cleanup) +## Normalizes URLs into domains: lowercases, remove leading spaces, protocol (`x://`) `www.` subdomains, everything after `/`, only one space before `#`. Keeps comments intact +find ./sources -type f -name "*.txt" -exec sed -ri 'h; s/[^#]*//1; x; s/#.*//; s/.*/\L&/; s/^[[:space:]]*//i; s/^.*:\/\///i; s/^[.*]*//i; s/^www\.//i; s/\/[^[:space:]]*//i; s/[[:space:]].*$/ /i; G; s/(.*)\n/\1/' {} \; +find ./sources -type f -name "*.txt" -exec sed -ri 's/^www\.//i' {} \; # Removing "www." twice because unmaintained imported lists are weird. +## Remove duplicate domains from each source file (keeps repeated comments and empty lines for organization) +find ./sources -type f -name "*.txt" -exec bash -c ' + awk "(\$0 ~ /^[[:space:]]*#/ || NF == 0 || !seen[\$0]++)" "$0" > "$0_temp.txt"; + mv "$0_temp.txt" "$0"; +' {} \; # Combine all sources into a domains list find ./sources -type f -iname "*.txt" -exec cat {} \; > domains.txt