
Commit

changed import + added allowlist
NotaInutilis committed Nov 23, 2023
1 parent 5cf3b86 commit ccde310
Showing 6 changed files with 38 additions and 26 deletions.
1 change: 1 addition & 0 deletions import/allowlist.txt
@@ -0,0 +1 @@
+wordpress.com
2 changes: 1 addition & 1 deletion sources/_imported/README.md → import/importlist.txt
@@ -1,6 +1,6 @@
 peertube_isolation fediverse.txt|https://peertube_isolation.frama.io/list/peertube_isolation.txt
 Adfilt TabloidRemover.txt|https://raw.githubusercontent.com/DandelionSprout/adfilt/master/Sensitive%20lists/TabloidRemover.txt
-Adfilt TabloidRemover-MastodonCategoryForImports fediverse.txthttps://raw.githubusercontent.com/DandelionSprout/adfilt/master/Sensitive%20lists/TabloidRemover-MastodonCategoryForImports.csv
+Adfilt TabloidRemover-MastodonCategoryForImports fediverse.txt|https://raw.githubusercontent.com/DandelionSprout/adfilt/master/Sensitive%20lists/TabloidRemover-MastodonCategoryForImports.csv
 Attack Vectors fake-local-journals-list.txt|https://raw.githubusercontent.com/MassMove/AttackVectors/master/LocalJournals/fake-local-journals-list.txt
 Windscribe clickbait.txt|https://assets.windscribe.com/custom_blocklists/clickbait.txt
 StevenBlack fakenews.txt|https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-only/hosts
17 changes: 0 additions & 17 deletions scripts/cleanup.sh

This file was deleted.

6 changes: 3 additions & 3 deletions scripts/download.sh
@@ -1,10 +1,10 @@
 #!/usr/bin/env bash
 
-# Use this script to download external blocklists from a non-txt list formatted as "file.txt|url" to the sources/_imported/ folder.
+# Use this script to download external blocklists from a list formatted as "file.txt|url" to the "/import/original/" folder.
 # e.g.
-# ./scripts/download.sh < list.md
+# ./scripts/download.sh < list.txt
 
 IFS='|'
 while read FILE URL; do
-wget -O ./sources/_imported/"$FILE" -- "$URL"
+wget -O ./import/original/"$FILE" -- "$URL"
 done
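
For reference, a minimal sketch of how one importlist entry drives this loop; the entry below is hypothetical, not taken from the repository's list:

# Hypothetical importlist.txt entry (format "file.txt|url"):
#   example fakenews.txt|https://example.org/hosts.txt
# With IFS='|', `read FILE URL` splits the line at the first pipe, so:
#   FILE="example fakenews.txt"   URL="https://example.org/hosts.txt"
# and the loop body becomes:
wget -O "./import/original/example fakenews.txt" -- "https://example.org/hosts.txt"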
27 changes: 24 additions & 3 deletions scripts/import.sh
@@ -5,7 +5,28 @@
 # ./scripts/import.sh
 
 # Download external blocklists
-./scripts/download.sh < ./sources/_imported/README.md
+./scripts/download.sh < ./import/importlist.txt
 
-# Cleanup sources
-./scripts/cleanup.sh
+# Copy to modified
+cp -a ./import/original/. ./import/modified/
+
+# Clean up imported sources (same code as in update.sh)
+## Special cleanup for imported sources in other formats (AdBlock, hosts, etc.)
+find ./import/modified -type f -name "*.txt" -exec sed -ri 's/^[^#[:alnum:]]/#&/; s/^0\.0\.0\.0[[:space:]]*//i' {} \;
+## Normalize URLs into domains: lowercase; strip leading spaces, the protocol (`x://`), `www.` subdomains, and everything after `/`; leave only one space before `#`. Comments are kept intact.
+find ./import/modified -type f -name "*.txt" -exec sed -ri 'h; s/[^#]*//1; x; s/#.*//; s/.*/\L&/; s/^[[:space:]]*//i; s/^.*:\/\///i; s/^[.*]*//i; s/^www\.//i; s/\/[^[:space:]]*//i; s/[[:space:]].*$/ /i; G; s/(.*)\n/\1/' {} \;
+find ./import/modified -type f -name "*.txt" -exec sed -ri 's/^www\.//i' {} \; # Removing "www." twice because unmaintained imported lists are weird.
+## Remove duplicate domains from each source file (keeps repeated comments and empty lines for organization)
+find ./import/modified -type f -name "*.txt" -exec bash -c '
+awk "(\$0 ~ /^[[:space:]]*#/ || NF == 0 || !seen[\$0]++)" "$0" > "$0_temp.txt";
+mv "$0_temp.txt" "$0";
+' {} \;
+
+# Remove entries listed in the allowlist
+find ./import/modified -type f -name "*.txt" -exec bash -c '
+grep -vxFf "./import/allowlist.txt" "$0" > "$0_temp.txt";
+mv "$0_temp.txt" "$0";
+' {} \;
+
+# Copy to sources
+cp -a ./import/modified/. ./sources/_imported/
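
To make the cleanup passes above concrete, here is a sketch of each on invented sample lines; the sed and grep programs are copied verbatim from the script, only the inputs are hypothetical:

# AdBlock-style rules are commented out and hosts-file prefixes stripped:
printf '||ads.example.net^\n0.0.0.0 tracker.example.org\n' | sed -r 's/^[^#[:alnum:]]/#&/; s/^0\.0\.0\.0[[:space:]]*//i'
# -> "#||ads.example.net^" and "tracker.example.org"

# URLs are normalized into bare domains while the comment survives:
printf 'HTTPS://WWW.Example.COM/some/path # kept comment\n' | sed -r 'h; s/[^#]*//1; x; s/#.*//; s/.*/\L&/; s/^[[:space:]]*//i; s/^.*:\/\///i; s/^[.*]*//i; s/^www\.//i; s/\/[^[:space:]]*//i; s/[[:space:]].*$/ /i; G; s/(.*)\n/\1/'
# -> "example.com # kept comment"

# Lines exactly matching an allowlist entry are dropped:
printf 'example.com\nwordpress.com\n' | grep -vxFf ./import/allowlist.txt
# -> "example.com" (wordpress.com matches the entry added in this commit)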
11 changes: 9 additions & 2 deletions scripts/update.sh
@@ -4,8 +4,15 @@
 # e.g.
 # ./scripts/update.sh
 
-# Cleanup sources
-./scripts/cleanup.sh
+# Clean up sources (same code as in import.sh)
+## Normalize URLs into domains: lowercase; strip leading spaces, the protocol (`x://`), `www.` subdomains, and everything after `/`; leave only one space before `#`. Comments are kept intact.
+find ./sources -type f -name "*.txt" -exec sed -ri 'h; s/[^#]*//1; x; s/#.*//; s/.*/\L&/; s/^[[:space:]]*//i; s/^.*:\/\///i; s/^[.*]*//i; s/^www\.//i; s/\/[^[:space:]]*//i; s/[[:space:]].*$/ /i; G; s/(.*)\n/\1/' {} \;
+find ./sources -type f -name "*.txt" -exec sed -ri 's/^www\.//i' {} \; # Removing "www." twice because unmaintained imported lists are weird.
+## Remove duplicate domains from each source file (keeps repeated comments and empty lines for organization)
+find ./sources -type f -name "*.txt" -exec bash -c '
+awk "(\$0 ~ /^[[:space:]]*#/ || NF == 0 || !seen[\$0]++)" "$0" > "$0_temp.txt";
+mv "$0_temp.txt" "$0";
+' {} \;
 
 # Combine all sources into a domains list
 find ./sources -type f -iname "*.txt" -exec cat {} \; > domains.txt
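
A quick sketch of the shared dedup pass on invented input, showing that comments and blank lines survive while repeated domains are dropped; run directly, the awk program needs none of the escaping that the bash -c wrapper above requires:

printf '# news\nexample.com\nexample.com\n\n# news again\nexample.com\n' | awk '($0 ~ /^[[:space:]]*#/ || NF == 0 || !seen[$0]++)'
# -> "# news", one "example.com", the blank line, and "# news again"; the duplicate example.com lines are removed.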
