Skip to content

Commit

Permalink
scripts and organization update
Browse files Browse the repository at this point in the history
  • Loading branch information
NotaInutilis committed Feb 9, 2024
1 parent fcf3913 commit a5704e8
Show file tree
Hide file tree
Showing 93 changed files with 47 additions and 33 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/import.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
# Workflow that imports external blocklists.
# NOTE(review): this is a rendered diff without +/- markers and with YAML indentation stripped;
# nesting must be restored before this text can be used as a workflow file.
name: Import blocklists

# Triggers: pushes touching .txt files under sources/imports/, a twice-daily schedule,
# and manual runs (workflow_dispatch).
on:
push:
paths:
- 'sources/imports/**.txt'
schedule:
# Cron fields: minute 42 of hours 3 and 15, every day (GitHub evaluates schedules in UTC).
- cron: '42 3,15 * * *'
workflow_dispatch:

jobs:
# NOTE(review): `build:` followed by `import:` looks like the old and new job id from the diff
# (a rename), not two separate keys — confirm against the repository.
build:
import:
name: Import blocklists

runs-on: ubuntu-latest

Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/update.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@ name: Update blocklists

# Triggers for the "Update blocklists" workflow: pushes to main and manual runs.
# NOTE(review): this is a rendered diff without +/- markers and with YAML indentation stripped.
on:
push:
branches:
- main
# NOTE(review): GitHub Actions rejects `paths` and `paths-ignore` on the same event. Here the
# `paths` entries are presumably the removed lines and `paths-ignore` their replacement — confirm.
paths:
- 'sources/**.txt'
- 'filters/**.txt'
paths-ignore:
- 'sources/imports/**'
workflow_dispatch:

jobs:
# NOTE(review): `build:` followed by `update:` looks like the old and new job id from the diff
# (a rename), not two separate keys — confirm against the repository.
build:
update:
name: Update blocklists

runs-on: ubuntu-latest

Expand Down
4 changes: 2 additions & 2 deletions scripts/download.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#!/usr/bin/env bash

# NOTE(review): this is a rendered diff without +/- markers. Where two near-identical lines follow
# each other, the first (using ./import/) appears to be the pre-commit version and the second
# (using ./sources/imports/) the post-commit version — confirm against the repository.

# Use this script to download external blocklists from a list formatted as "file.txt|url" to the "/import/original/" folder.
# Use this script to download external blocklists from a list formatted as "file.txt|url" to the "./sources/imports/original/" folder.
# e.g.
# ./scripts/download.sh < list.txt

# Split each stdin line at "|" so that `read` fills FILE (text before the separator) and URL (the rest).
# The assignment is not scoped to `read`, so it stays in effect for the remainder of the script.
IFS='|'
# NOTE(review): `read` is used without -r, so backslashes in the input are treated as escape characters.
while read FILE URL; do
# -f: exit non-zero on HTTP error responses instead of saving the error page; -L: follow redirects;
# -o: write the response to the given file. Paths are relative to the current working directory.
curl -fL "$URL" -o ./import/original/"$FILE"
curl -fL "$URL" -o ./sources/imports/original/"$FILE"
done
21 changes: 11 additions & 10 deletions scripts/import.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,29 @@
# ./scripts/import.sh

# NOTE(review): rendered diff without +/- markers; in each pair below the ./import/ line appears to be
# the pre-commit version and the ./sources/imports/ line its replacement — confirm.
# All paths are relative, so the script depends on the directory it is run from.

# Download external blocklists
# The import list is fed to download.sh on stdin.
./scripts/download.sh < ./import/importlist.txt
./scripts/download.sh < ./sources/imports/importlist.txt

# Copy to modified
# `cp -a SRC/. DST/` copies the contents of SRC (not the directory itself) and preserves file attributes.
# The cleanup that follows edits the "modified" copy in place, leaving "original" as downloaded.
cp -a ./import/original/. ./import/modified/
cp -a ./sources/imports/original/. ./sources/imports/modified/

# NOTE(review): rendered diff without +/- markers; in each pair below the ./import/ line appears to be
# the pre-commit version and the ./sources/imports/ line its replacement — confirm.

# Cleanup imported sources (Same code in update.sh)
## Special cleanup for imported sources of other formats (match, hosts, AdBlock, etc.)
## In-place sed (-i) with extended regex (-r), run once per .txt file:
##   s/^\*\:\/\///            strip a leading "*://"
##   s/^\*\.//                strip a leading "*."
##   s/^0\.0\.0\.0[[:space:]]*//  strip a leading "0.0.0.0" and the whitespace after it
##   s/^[^#[:alnum:]]/#&/     prefix "#" to any line whose first character is neither "#" nor
##                            alphanumeric, turning it into a comment
find ./import/modified -type f -name "*.txt" -exec sed -ri 's/^\*\:\/\///i; s/^\*\.//i; s/^0\.0\.0\.0[[:space:]]*//i; s/^[^#[:alnum:]]/#&/' {} \;
find ./sources/imports/modified -type f -name "*.txt" -exec sed -ri 's/^\*\:\/\///i; s/^\*\.//i; s/^0\.0\.0\.0[[:space:]]*//i; s/^[^#[:alnum:]]/#&/' {} \;
## Normalizes URLs into domains: lowercases, remove leading spaces, protocol (`x://`) `www.` subdomains, everything after `/`, only one space before `#`. Keeps comments intact
## How the sed program keeps comments intact:
##   h; s/[^#]*//1; x   save the line in the hold space, reduce the pattern space to the part from
##                      the first "#" onward, then swap so the comment is held and the full line is edited
##   s/#.*//            drop the comment from the part being normalized
##   s/.*/\L&/ ...      lowercase, then strip leading whitespace, everything up to "://", leading
##                      "." or "*" characters, a leading "www.", and the path starting at the first "/"
##   s/[[:space:]].*$/ /  collapse everything from the first whitespace to a single space
##   G; s/(.*)\n/\1/    append the held comment and remove the newline that G inserted
find ./import/modified -type f -name "*.txt" -exec sed -ri 'h; s/[^#]*//1; x; s/#.*//; s/.*/\L&/; s/^[[:space:]]*//i; s/^.*:\/\///i; s/^[.*]*//i; s/^www\.//i; s/\/[^[:space:]]*//i; s/[[:space:]].*$/ /i; G; s/(.*)\n/\1/' {} \;
find ./import/modified -type f -name "*.txt" -exec sed -ri 's/^www\.//i' {} \; # Removing "www." twice because unmaintained imported lists are weird.
find ./sources/imports/modified -type f -name "*.txt" -exec sed -ri 'h; s/[^#]*//1; x; s/#.*//; s/.*/\L&/; s/^[[:space:]]*//i; s/^.*:\/\///i; s/^[.*]*//i; s/^www\.//i; s/\/[^[:space:]]*//i; s/[[:space:]].*$/ /i; G; s/(.*)\n/\1/' {} \;
## Removing "www." twice because unmaintained imported lists are weird.
find ./sources/imports/modified -type f -name "*.txt" -exec sed -ri 's/^www\.//i' {} \;
## Remove duplicate domains from each source file (keeps repeated comments and empty lines for organization)
## find passes the file path as the first argument after the -c string, so inside the inline script
## it is available as "$0". awk prints a line if it is a comment, if it is empty (NF == 0), or if it
## has not been seen before; the result goes to a sibling "<file>_temp.txt" that then replaces the file.
find ./import/modified -type f -name "*.txt" -exec bash -c '
find ./sources/imports/modified -type f -name "*.txt" -exec bash -c '
awk "(\$0 ~ /^[[:space:]]*#/ || NF == 0 || !seen[\$0]++)" "$0" > "$0_temp.txt";
mv "$0_temp.txt" "$0";
' {} \;

# NOTE(review): rendered diff without +/- markers; in each pair below the ./import/ line appears to be
# the pre-commit version and the ./sources/imports/ line its replacement — confirm.

# Remove entries from the allowlist
# grep flags: -f reads the patterns from the allowlist file, -F treats them as fixed strings,
# -x requires a pattern to match the whole line, -v keeps only the lines that do NOT match.
# "$0" is the file path supplied by find; output goes to "<file>_temp.txt", which then replaces the file.
find ./import/modified -type f -name "*.txt" -exec bash -c '
grep -vxFf "./import/allowlist.txt" "$0" > "$0_temp.txt";
find ./sources/imports/modified -type f -name "*.txt" -exec bash -c '
grep -vxFf "./sources/imports/allowlist.txt" "$0" > "$0_temp.txt";
mv "$0_temp.txt" "$0";
' {} \;

# Copy to sources
# `cp -a SRC/. DST/` copies the contents of the "modified" directory, preserving attributes.
cp -a ./import/modified/. ./sources/_imported/
# Copy to sources/domains
cp -a ./sources/imports/modified/. ./sources/domains/_imported/
36 changes: 21 additions & 15 deletions scripts/update.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,34 @@
# e.g.
# ./scripts/update.sh

# NOTE(review): rendered diff without +/- markers; in each pair below the line using ./sources (or a
# root-level output file) appears to be the pre-commit version and the line using ./sources/domains
# (or an output under ./sources/) its replacement — confirm against the repository.

# Cleanup sources (same code in import.sh)
# Cleanup sources/domains (same code in import.sh)
## Normalizes URLs into domains: lowercases, remove leading spaces, protocol (`x://`) `www.` subdomains, everything after `/`, only one space before `#`. Keeps comments intact
## The sed program saves each line in the hold space, keeps only its "#..." comment there, normalizes
## the remaining part (lowercase; strip leading whitespace, "scheme://", leading "."/"*", "www.", and
## the path from the first "/"; collapse trailing whitespace to one space), then re-appends the comment.
find ./sources -type f -name "*.txt" -exec sed -ri 'h; s/[^#]*//1; x; s/#.*//; s/.*/\L&/; s/^[[:space:]]*//i; s/^.*:\/\///i; s/^[.*]*//i; s/^www\.//i; s/\/[^[:space:]]*//i; s/[[:space:]].*$/ /i; G; s/(.*)\n/\1/' {} \;
find ./sources/domains -type f -name "*.txt" -exec sed -ri 'h; s/[^#]*//1; x; s/#.*//; s/.*/\L&/; s/^[[:space:]]*//i; s/^.*:\/\///i; s/^[.*]*//i; s/^www\.//i; s/\/[^[:space:]]*//i; s/[[:space:]].*$/ /i; G; s/(.*)\n/\1/' {} \;
## Remove duplicate domains from each source file (keeps repeated comments and empty lines for organization)
## Inside the inline script "$0" is the file path supplied by find. awk prints a line if it is a
## comment, if it is empty (NF == 0), or if it has not been seen before in that file.
find ./sources -type f -name "*.txt" -exec bash -c '
find ./sources/domains -type f -name "*.txt" -exec bash -c '
awk "(\$0 ~ /^[[:space:]]*#/ || NF == 0 || !seen[\$0]++)" "$0" > "$0_temp.txt";
mv "$0_temp.txt" "$0";
' {} \;

# Combine all sources into a domains list
find ./sources -type f -iname "*.txt" -exec cat {} \; > domains.txt
# Combine all sources/domains into a domains list
# Concatenates every .txt file found (case-insensitive name match) into a single file.
find ./sources/domains -type f -iname "*.txt" -exec cat {} \; > ./sources/domains.txt
## Fediverse domains list
## Same concatenation, restricted to files whose name contains "fediverse".
find ./sources -type f -iname "*fediverse*.txt" -exec cat {} \; > fediverse_domains.txt
find ./sources/domains -type f -iname "*fediverse*.txt" -exec cat {} \; > ./sources/fediverse_domains.txt

# Cleanup the domains list
## Remove comments, inline comments, spaces and empty lines
## sed steps: delete lines starting with "#", cut everything from the first "#", remove all spaces,
## delete lines that are now empty.
sed -i '/^#/d; s/#.*//; s/ //g; /^ *$/d' domains.txt fediverse_domains.txt
## Both combined lists are written under ./sources/ and are sorted from there afterwards, so both
## must be cleaned there. Steps: delete comment lines, cut inline comments, remove all spaces,
## delete lines that are now empty.
sed -i '/^#/d; s/#.*//; s/ //g; /^ *$/d' ./sources/domains.txt ./sources/fediverse_domains.txt
# NOTE(review): rendered diff without +/- markers; the first four sort/mv lines (root-level files)
# appear to be the pre-commit versions and the four that follow (./sources/ files) their
# replacements — confirm against the repository.

## Sort and remove duplicates
## `sort -u` cannot safely write to its own input through a redirect, hence the temporary file
## (created in the current working directory) that is then moved over the original.
sort -u domains.txt > domains_temp.txt
mv domains_temp.txt domains.txt
sort -u fediverse_domains.txt > fediverse_domains_temp.txt
mv fediverse_domains_temp.txt fediverse_domains.txt
sort -u ./sources/domains.txt > domains_temp.txt
mv domains_temp.txt ./sources/domains.txt
sort -u ./sources/fediverse_domains.txt > fediverse_domains_temp.txt
mv fediverse_domains_temp.txt ./sources/fediverse_domains.txt

# Generate blocklists from the domains list

## Domains
## Copies the cleaned list from ./sources/ to a root-level domains.txt.
cp ./sources/domains.txt domains.txt

## For DNS filtering
### Hosts
### NOTE(review): the generator receives no arguments; presumably it reads the domains list from a
### fixed path — verify in scripts/domains_to_hosts.py. Its stdout becomes hosts.txt.
python scripts/domains_to_hosts.py > hosts.txt
Expand All @@ -38,23 +42,25 @@ python scripts/domains_to_dnsmasq.py > dnsmasq.txt
# NOTE(review): rendered diff without +/- markers; each `cp ./headers/...` line appears to be the
# pre-commit version of the `cp ./sources/headers/...` line that follows it — confirm.
# Every output below is built the same way: the generator's stdout goes to a temporary file, the
# header file is copied to the final name, the generated body is appended, and the temporary file
# is removed.

## For browser extensions
### Adblock
python scripts/domains_to_adblock.py > adblock_temp.txt
cp ./headers/adblock.txt adblock.txt
cp ./sources/headers/adblock.txt adblock.txt
cat adblock_temp.txt >> adblock.txt
rm adblock_temp.txt
### uBlacklist
python scripts/domains_to_ublacklist.py > ublacklist_temp.txt
cp ./headers/adblock.txt ublacklist.txt # Currently using the same adblock header until uBlacklist implements its own header. https://github.com/iorate/ublacklist/issues/351
cp ./sources/headers/ublacklist.txt ublacklist.txt
cat ublacklist_temp.txt >> ublacklist.txt
rm ublacklist_temp.txt

## Generate Fediverse blocklists
### Domains
### Copies the Fediverse list from ./sources/ to a root-level fediverse_domains.txt.
cp ./sources/fediverse_domains.txt fediverse_domains.txt
### Mastodon
python scripts/fediverse_domains_to_mastodon.py > mastodon_temp.txt
cp ./headers/mastodon.csv mastodon.csv
cp ./sources/headers/mastodon.csv mastodon.csv
cat mastodon_temp.txt >> mastodon.csv
rm mastodon_temp.txt
### FediBlockHole
python scripts/fediverse_domains_to_fediblockhole.py > fediblockhole_temp.txt
cp ./headers/fediblockhole.csv fediblockhole.csv
cp ./sources/headers/fediblockhole.csv fediblockhole.csv
cat fediblockhole_temp.txt >> fediblockhole.csv
rm fediblockhole_temp.txt
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion headers/adblock.txt → sources/headers/adblock.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
! Title: Super SEO Spam Suppressor
! Description: A domains blocklist of sites abusing SEO tactics to spam web searches with advertisement, empty content (monetized with ads) and malware (looking like ads).
! Description: A domains blocklist of sites abusing SEO tactics to spam web searches with advertisement, empty content (monetized with ads), malware (looking like ads) and generative AI garbage.
! Expires: 1 days
! Homepage: https://github.com/NotaInutilis/Super-SEO-Spam-Suppressor
File renamed without changes.
File renamed without changes.
3 changes: 3 additions & 0 deletions sources/headers/ublacklist.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Title: Super SEO Spam Suppressor
# Description: A domains blocklist of sites abusing SEO tactics to spam web searches with advertisement, empty content (monetized with ads), malware (looking like ads) and generative AI garbage.
# Homepage: https://github.com/NotaInutilis/Super-SEO-Spam-Suppressor
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

0 comments on commit a5704e8

Please sign in to comment.