
Commit

changed import + added allowlist
NotaInutilis committed Nov 23, 2023
1 parent 5cf3b86 commit ccde310
Showing 6 changed files with 38 additions and 26 deletions.
1 change: 1 addition & 0 deletions import/allowlist.txt
@@ -0,0 +1 @@
+wordpress.com
2 changes: 1 addition & 1 deletion sources/_imported/README.md → import/importlist.txt
@@ -1,6 +1,6 @@
 peertube_isolation fediverse.txt|https://peertube_isolation.frama.io/list/peertube_isolation.txt
 Adfilt TabloidRemover.txt|https://raw.githubusercontent.com/DandelionSprout/adfilt/master/Sensitive%20lists/TabloidRemover.txt
-Adfilt TabloidRemover-MastodonCategoryForImports fediverse.txthttps://raw.githubusercontent.com/DandelionSprout/adfilt/master/Sensitive%20lists/TabloidRemover-MastodonCategoryForImports.csv
+Adfilt TabloidRemover-MastodonCategoryForImports fediverse.txt|https://raw.githubusercontent.com/DandelionSprout/adfilt/master/Sensitive%20lists/TabloidRemover-MastodonCategoryForImports.csv
 Attack Vectors fake-local-journals-list.txt|https://raw.githubusercontent.com/MassMove/AttackVectors/master/LocalJournals/fake-local-journals-list.txt
 Windscribe clickbait.txt|https://assets.windscribe.com/custom_blocklists/clickbait.txt
 StevenBlack fakenews.txt|https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-only/hosts
17 changes: 0 additions & 17 deletions scripts/cleanup.sh

This file was deleted.

6 changes: 3 additions & 3 deletions scripts/download.sh
@@ -1,10 +1,10 @@
 #!/usr/bin/env bash
 
-# Use this script to download external blocklists from a non-txt list formatted as "file.txt|url" to the sources/_imported/ folder.
+# Use this script to download external blocklists from a list formatted as "file.txt|url" to the "/import/original/" folder.
 # e.g.
-# ./scripts/download.sh < list.md
+# ./scripts/download.sh < list.txt
 
 IFS='|'
 while read FILE URL; do
-wget -O ./sources/_imported/"$FILE" -- "$URL"
+wget -O ./import/original/"$FILE" -- "$URL"
 done
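
For reference, a minimal sketch of how one importlist entry drives this loop; the entry below is hypothetical, not taken from the repository's list:

# Hypothetical importlist.txt entry (format "file.txt|url"):
#   example fakenews.txt|https://example.org/hosts.txt
# With IFS='|', `read FILE URL` splits the line at the first pipe, so:
#   FILE="example fakenews.txt"   URL="https://example.org/hosts.txt"
# and the loop body becomes:
wget -O "./import/original/example fakenews.txt" -- "https://example.org/hosts.txt"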
27 changes: 24 additions & 3 deletions scripts/import.sh
@@ -5,7 +5,28 @@
 # ./scripts/import.sh
 
 # Download external blocklists
-./scripts/download.sh < ./sources/_imported/README.md
+./scripts/download.sh < ./import/importlist.txt
 
-# Cleanup sources
-./scripts/cleanup.sh
+# Copy to modified
+cp -a ./import/original/. ./import/modified/
+
+# Clean up imported sources (same code as in update.sh)
+## Special cleanup for imported sources in other formats (AdBlock, hosts, etc.)
+find ./import/modified -type f -name "*.txt" -exec sed -ri 's/^[^#[:alnum:]]/#&/; s/^0\.0\.0\.0[[:space:]]*//i' {} \;
+## Normalize URLs into domains: lowercase; strip leading spaces, the protocol (`x://`), `www.` subdomains, and everything after `/`; leave only one space before `#`. Comments are kept intact.
+find ./import/modified -type f -name "*.txt" -exec sed -ri 'h; s/[^#]*//1; x; s/#.*//; s/.*/\L&/; s/^[[:space:]]*//i; s/^.*:\/\///i; s/^[.*]*//i; s/^www\.//i; s/\/[^[:space:]]*//i; s/[[:space:]].*$/ /i; G; s/(.*)\n/\1/' {} \;
+find ./import/modified -type f -name "*.txt" -exec sed -ri 's/^www\.//i' {} \; # Removing "www." twice because unmaintained imported lists are weird.
+## Remove duplicate domains from each source file (keeps repeated comments and empty lines for organization)
+find ./import/modified -type f -name "*.txt" -exec bash -c '
+awk "(\$0 ~ /^[[:space:]]*#/ || NF == 0 || !seen[\$0]++)" "$0" > "$0_temp.txt";
+mv "$0_temp.txt" "$0";
+' {} \;
+
+# Remove entries listed in the allowlist
+find ./import/modified -type f -name "*.txt" -exec bash -c '
+grep -vxFf "./import/allowlist.txt" "$0" > "$0_temp.txt";
+mv "$0_temp.txt" "$0";
+' {} \;
+
+# Copy to sources
+cp -a ./import/modified/. ./sources/_imported/
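
To make the cleanup passes above concrete, here is a sketch of each on invented sample lines; the sed and grep programs are copied verbatim from the script, only the inputs are hypothetical:

# AdBlock-style rules are commented out and hosts-file prefixes stripped:
printf '||ads.example.net^\n0.0.0.0 tracker.example.org\n' | sed -r 's/^[^#[:alnum:]]/#&/; s/^0\.0\.0\.0[[:space:]]*//i'
# -> "#||ads.example.net^" and "tracker.example.org"

# URLs are normalized into bare domains while the comment survives:
printf 'HTTPS://WWW.Example.COM/some/path # kept comment\n' | sed -r 'h; s/[^#]*//1; x; s/#.*//; s/.*/\L&/; s/^[[:space:]]*//i; s/^.*:\/\///i; s/^[.*]*//i; s/^www\.//i; s/\/[^[:space:]]*//i; s/[[:space:]].*$/ /i; G; s/(.*)\n/\1/'
# -> "example.com # kept comment"

# Lines exactly matching an allowlist entry are dropped:
printf 'example.com\nwordpress.com\n' | grep -vxFf ./import/allowlist.txt
# -> "example.com" (wordpress.com matches the entry added in this commit)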
11 changes: 9 additions & 2 deletions scripts/update.sh
@@ -4,8 +4,15 @@
 # e.g.
 # ./scripts/update.sh
 
-# Cleanup sources
-./scripts/cleanup.sh
+# Clean up sources (same code as in import.sh)
+## Normalize URLs into domains: lowercase; strip leading spaces, the protocol (`x://`), `www.` subdomains, and everything after `/`; leave only one space before `#`. Comments are kept intact.
+find ./sources -type f -name "*.txt" -exec sed -ri 'h; s/[^#]*//1; x; s/#.*//; s/.*/\L&/; s/^[[:space:]]*//i; s/^.*:\/\///i; s/^[.*]*//i; s/^www\.//i; s/\/[^[:space:]]*//i; s/[[:space:]].*$/ /i; G; s/(.*)\n/\1/' {} \;
+find ./sources -type f -name "*.txt" -exec sed -ri 's/^www\.//i' {} \; # Removing "www." twice because unmaintained imported lists are weird.
+## Remove duplicate domains from each source file (keeps repeated comments and empty lines for organization)
+find ./sources -type f -name "*.txt" -exec bash -c '
+awk "(\$0 ~ /^[[:space:]]*#/ || NF == 0 || !seen[\$0]++)" "$0" > "$0_temp.txt";
+mv "$0_temp.txt" "$0";
+' {} \;
 
 # Combine all sources into a domains list
 find ./sources -type f -iname "*.txt" -exec cat {} \; > domains.txt
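
A quick sketch of the shared dedup pass on invented input, showing that comments and blank lines survive while repeated domains are dropped; run directly, the awk program needs none of the escaping that the bash -c wrapper above requires:

printf '# news\nexample.com\nexample.com\n\n# news again\nexample.com\n' | awk '($0 ~ /^[[:space:]]*#/ || NF == 0 || !seen[$0]++)'
# -> "# news", one "example.com", the blank line, and "# news again"; the duplicate example.com lines are removed.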
