Skip to content

Commit

Permalink
preparing import
Browse files Browse the repository at this point in the history
  • Loading branch information
NotaInutilis committed Nov 22, 2023
1 parent 3e85caa commit 50f9053
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 19 deletions.
16 changes: 16 additions & 0 deletions scripts/cleanup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env bash

# Use this script to clean up sources and normalize them to a list of domains while keeping comments.
# e.g.
# ./scripts/cleanup.sh
#
# Must be run from the repository root: every path below is relative to ./sources.
# Relies on GNU sed extensions (-r, -i without suffix, \L).

# Special cleanup for imported sources of other formats (AdBlock, hosts, etc.):
#   1. comment out every line that starts with something other than `#` or an alphanumeric character;
#   2. strip a leading `0.0.0.0` (hosts file format) and the whitespace after it.
readonly IMPORTED_SED='s/^[^#[:alnum:]]/#&/; s/^0\.0\.0\.0[[:space:]]*//i'

# Normalizes URLs into domains while keeping the trailing comment intact.
# The line is split into a "domain part" (before the first `#`) and a "comment part"
# (from the first `#` on); only the domain part is rewritten, then both are glued back.
NORMALIZE_SED=(
  -e 'h'                         # hold space <- copy of the whole line
  -e 's/[^#]*//1'                # pattern space <- comment part only
  -e 'x'                         # swap: pattern space = whole line, hold space = comment part
  -e 's/#.*//'                   # pattern space <- domain part only
  -e 's/.*/\L&/'                 # lowercase
  -e 's/^[[:space:]]*//i'        # remove leading spaces
  -e 's/^.*:\/\///i'             # remove protocol (`x://`)
  -e 's/^[.*]*//i'               # remove leading dots and asterisks
  -e 's/^www\.//i'               # remove `www.` subdomain
  -e 's/\/[^[:space:]]*//i'      # remove everything after `/`
  -e 's/[[:space:]].*$/ /i'      # keep only one space before the comment
  -e 'G'                         # append "\n" + comment part
  -e 's/(.*)\n/\1/'              # join domain part and comment part on one line
)
readonly NORMALIZE_SED

# Removes duplicate lines from one source file, in place.
# Comment lines and empty lines are always kept (they are used for organization).
# Arguments: $1 - path of the file to deduplicate
# Returns:   0 on success; non-zero if awk or mv failed, in which case the
#            original file is left untouched.
dedupe_file() {
  local file=$1
  # The suffix deliberately does not end in `.txt` so that a concurrent
  # `find -name "*.txt"` never picks the temporary file up as a source.
  local tmp="${file}.tmp"

  # Only replace the source once awk has fully succeeded; otherwise a failure
  # (disk full, unreadable file, ...) would overwrite it with truncated output.
  if awk '($0 ~ /^[[:space:]]*#/ || NF == 0 || !seen[$0]++)' < "$file" > "$tmp"; then
    mv -- "$tmp" "$file"
  else
    rm -f -- "$tmp"
    return 1
  fi
}

# Runs the three cleanup passes over ./sources.
# Returns: 0 if every pass succeeded, 1 otherwise.
cleanup_sources() {
  local status=0
  local file

  # `{} +` batches files into as few sed invocations as possible; GNU sed -i
  # still edits each file separately.
  find ./sources/_imported -type f -name "*.txt" \
    -exec sed -r -i "$IMPORTED_SED" {} + || status=1

  find ./sources -type f -name "*.txt" \
    -exec sed -r -i "${NORMALIZE_SED[@]}" {} + || status=1

  # NUL-delimited so that file names containing spaces or newlines are safe.
  while IFS= read -r -d '' file; do
    dedupe_file "$file" || status=1
  done < <(find ./sources -type f -name "*.txt" -print0)

  return "$status"
}

cleanup_sources
12 changes: 12 additions & 0 deletions scripts/import.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env bash

# Use this script to import external blocklists from a list read on standard input.
# Each line is formatted as `file.txt url`. A file name that contains spaces must be
# wrapped in double quotes: `"my file.txt" url`.
# e.g.
# ./scripts/import.sh < list.md
#
# Blank lines are ignored; lines that match neither format are reported and skipped.
# Exits non-zero if any download or the final cleanup failed.

# Entry with a double-quoted file name (may contain spaces), then the URL.
quoted_entry='^[[:space:]]*"([^"]+)"[[:space:]]*([^[:space:]]+)[[:space:]]*$'
# Entry with a bare, single-word file name, then the URL.
bare_entry='^[[:space:]]*([^[:space:]]+)[[:space:]]+([^[:space:]]+)[[:space:]]*$'

status=0

# `read -r` keeps backslashes literal; the `|| [[ -n ... ]]` clause still
# processes a last line that has no trailing newline.
while IFS= read -r line || [[ -n "$line" ]]; do
  if [[ "$line" =~ $quoted_entry || "$line" =~ $bare_entry ]]; then
    file=${BASH_REMATCH[1]}
    url=${BASH_REMATCH[2]}
  else
    [[ "$line" =~ ^[[:space:]]*$ ]] \
      || printf 'import.sh: skipping malformed line: %s\n' "$line" >&2
    continue
  fi

  # stdin is redirected so wget can never consume the list being read by the loop.
  if ! wget -O "$file" -- "$url" < /dev/null; then
    printf 'import.sh: failed to download %s to %s\n' "$url" "$file" >&2
    status=1
  fi
done

# Cleanup sources
./scripts/cleanup.sh || status=1

exit "$status"
10 changes: 1 addition & 9 deletions scripts/update.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,7 @@
# ./scripts/update.sh

# Cleanup sources
## Special cleanup for imported sources of other formats (AdBlock, hosts, etc.)
find ./sources/_imported -type f -name "*.txt" -exec sed -ri 's/^[^#[:alnum:]]/#&/; s/^0\.0\.0\.0[[:space:]]*//i' {} \;
## Normalizes URLs into domains: lowercases, remove leading spaces, protocol (`x://`) `www.` subdomains, everything after `/`, only one space before `#`. Keeps comments intact
find ./sources -type f -name "*.txt" -exec sed -ri 'h; s/[^#]*//1; x; s/#.*//; s/.*/\L&/; s/^[[:space:]]*//i; s/^.*:\/\///i; s/^[.*]*//i; s/^www\.//i; s/\/[^[:space:]]*//i; s/[[:space:]].*$/ /i; G; s/(.*)\n/\1/' {} \;
## Remove duplicate domains from each source file (keeps repeated comments and empty lines for organization)
find ./sources -type f -name "*.txt" -exec bash -c '
awk "(\$0 ~ /^[[:space:]]*#/ || NF == 0 || !seen[\$0]++)" "$0" > "$0_temp.txt";
mv "$0_temp.txt" "$0";
' {} \;
./scripts/cleanup.sh

# Combine all sources into a domains list
find ./sources -type f -iname "*.txt" -exec cat {} \; > domains.txt
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# What the #MeToo movement is against

# Famous rapists
# Known and protected sexual abusers
damien-abad.fr
# Roman Polanski supporters
# Roman Polanski and his supporters
roman-polanski.com
roman-polanski.net
romanpolanski.online.fr
Expand Down
18 changes: 10 additions & 8 deletions sources/_imported/README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
https://peertube_isolation.frama.io/
https://github.com/DandelionSprout/adfilt/blob/master/Sensitive%20lists/TabloidRemover.txt
https://github.com/DandelionSprout/adfilt/blob/master/Sensitive%20lists/TabloidRemover-MastodonCategoryForImports.csv
https://github.com/MassMove/AttackVectors/blob/master/LocalJournals/fake-local-journals-list.txt
https://assets.windscribe.com/custom_blocklists/clickbait.txt
https://github.com/StevenBlack/hosts/blob/master/alternates/fakenews-only/hosts
https://github.com/antifa-n/pihole
https://github.com/AFNIL-AntiFakeNewsInternationalList/AFNIL/blob/master/hosts
"peertube_isolation fediverse.txt" https://peertube_isolation.frama.io/list/peertube_isolation.txt
"Adfilt TabloidRemover.txt" https://raw.githubusercontent.com/DandelionSprout/adfilt/master/Sensitive%20lists/TabloidRemover.txt
"Adfilt TabloidRemover-MastodonCategoryForImports fediverse.txt"https://raw.githubusercontent.com/DandelionSprout/adfilt/master/Sensitive%20lists/TabloidRemover-MastodonCategoryForImports.csv
"Attack Vectors fake-local-journals-list.txt" https://raw.githubusercontent.com/MassMove/AttackVectors/master/LocalJournals/fake-local-journals-list.txt
"Windscribe clickbait.txt" https://assets.windscribe.com/custom_blocklists/clickbait.txt
"StevenBlack fakenews.txt" https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-only/hosts
"antifa-n blocklist.txt" https://raw.githubusercontent.com/antifa-n/pihole/master/blocklist.txt
"antifa-n blocklist-alttech.txt" https://raw.githubusercontent.com/antifa-n/pihole/master/blocklist-alttech.txt
"antifa-n blocklist-pop.txt" https://raw.githubusercontent.com/antifa-n/pihole/master/blocklist-pop.txt
"AFNIL.txt" https://raw.githubusercontent.com/AFNIL-AntiFakeNewsInternationalList/AFNIL/master/hosts

0 comments on commit 50f9053

Please sign in to comment.