From 893bbcdcc31cf5fcf5f02f982d289872bd56ee5d Mon Sep 17 00:00:00 2001 From: J <91372088+jarelllama@users.noreply.github.com> Date: Sat, 8 Feb 2025 18:01:14 +0800 Subject: [PATCH] fix cybersquatting and regex sources and improve mawk implementation --- scripts/build_nsfw.sh | 2 +- scripts/retrieve_domains.sh | 117 +++++++++++++++++++----------------- scripts/test_functions.sh | 4 +- scripts/tools.sh | 10 +-- scripts/update_readme.sh | 13 ---- scripts/validate_domains.sh | 14 ++--- 6 files changed, 74 insertions(+), 86 deletions(-) diff --git a/scripts/build_nsfw.sh b/scripts/build_nsfw.sh index fe4faa08e..1066d8aac 100644 --- a/scripts/build_nsfw.sh +++ b/scripts/build_nsfw.sh @@ -49,7 +49,7 @@ readonly -a WHITELIST=( # raw file, format the file and remove dead domains. build() { # Format raw file to Domains format - mawk '/[|]/ {gsub(/[|^]/, ""); print}' "$BLOCKLIST" > raw.tmp + mawk '/[|]/ { gsub(/[|^]/, ""); print }' "$BLOCKLIST" > raw.tmp # Remove already processed domains comm -23 toplist.tmp raw.tmp > temp diff --git a/scripts/retrieve_domains.sh b/scripts/retrieve_domains.sh index 5e7ff9818..478b0ed1d 100644 --- a/scripts/retrieve_domains.sh +++ b/scripts/retrieve_domains.sh @@ -74,14 +74,14 @@ main() { # blacklist and whitelist. 
check_review_file() { # Add blacklisted entries to blacklist and remove them from the review file - mawk -F ',' '$4 == "y" && $5 != "y" {print $2}' "$REVIEW_CONFIG" \ + mawk -F ',' '$4 == "y" && $5 != "y" { print $2 }' "$REVIEW_CONFIG" \ | tee >(sort -u - "$BLACKLIST" -o "$BLACKLIST") \ | xargs -I {} sed -i "/,{},/d" "$REVIEW_CONFIG" # Add whitelisted entries to whitelist after formatting to regex and remove # them from the review file - mawk -F ',' '$5 == "y" && $4 != "y" {print $2}' "$REVIEW_CONFIG" \ - | tee >(mawk '{gsub(/\./, "\."); print "^" $0 "$"}' \ + mawk -F ',' '$5 == "y" && $4 != "y" { print $2 }' "$REVIEW_CONFIG" \ + | tee >(mawk '{ gsub(/\./, "\."); print "^" $0 "$" }' \ | sort -u - "$WHITELIST" -o "$WHITELIST") \ | xargs -I {} sed -i "/,{},/d" "$REVIEW_CONFIG" } @@ -162,14 +162,15 @@ filter() { if [[ "$3" == '--preserve' ]]; then # Save entries for console output - mawk -v tag="$tag" '{print $0 " (" tag ")"}' <<< "$entries" \ + mawk -v tag="$tag" '{ print $0 " (" tag ")" }' <<< "$entries" \ >> entries_for_review.tmp # Save entries into review config file mawk -v source="$source_name" -v reason="$tag" \ - '{print source "," $0 "," reason ",,"}' <<< "$entries" \ + '{ print source "," $0 "," reason ",," }' <<< "$entries" \ >> "$REVIEW_CONFIG" - # Remove duplicates + + # Remove duplicates from review config file mawk '!seen[$0]++' "$REVIEW_CONFIG" > temp mv temp "$REVIEW_CONFIG" @@ -546,7 +547,7 @@ search_google() { # Get domains from each page page_domains="$(jq -r '.items[].link' <<< "$page_results" \ - | mawk -F '/' '{print $3}')" + | mawk -F '/' '{ print $3 }')" printf "%s\n" "$page_domains" >> "$source_results" # Stop search term if no more pages are required @@ -557,12 +558,12 @@ search_google() { } source_cybersquatting() { - # Last checked: 27/01/25 + # Last checked: 08/02/25 source_name='Cybersquatting' [[ "$USE_EXISTING_RESULTS" == true ]] && return - local tlds row count runs results + local tlds # Install dnstwist command -v dnstwist > 
/dev/null || pip install -q dnstwist @@ -579,31 +580,31 @@ source_cybersquatting() { # Get TLDs from the NRD feed for dnstwist. # This is not needed for URLCrazy as that already checks for # alternate TLDs. - tlds="$(mawk -F '.' '{print $NF}' nrd.tmp | sort | uniq -c \ - | sort -nr | mawk '{print $2}')" + tlds="$(mawk -F '.' '!seen[$NF]++ { print $NF }' nrd.tmp)" # Loop through phishing targets - mawk -F ',' '$4 == "y" {print $1}' "$PHISHING_TARGETS" \ + mawk -F ',' '$4 == "y" { print $1 }' "$PHISHING_TARGETS" \ | while read -r target; do - # Get info of the target domain - row="$(mawk -v target="$target" '$0 ~ target' "$PHISHING_TARGETS")" - count="$(mawk -F ',' '{print $2}' <<< "$row")" - runs="$(mawk -F ',' '{print $3}' <<< "$row")" - - # Run dnstwist - results="$(dnstwist "${target}.com" -f list)" - - # Append TLDs to dnstwist results + # Run dnstwist and append possible TLDs. # Note the dnstwist --tld argument only replaces the TLDs of the - # original domain. - while read -r tld; do - printf "%s\n" "$results" | sed "s/\.com/.${tld}/" >> results.tmp - done <<< "$tlds" + # original domain, not the resulting domains. + dnstwist "${target}.com" -f list | mawk -v tlds="$tlds" ' + BEGIN { + n = split(tlds, tldArray, "\n") + } + { + for (i = 1; i <= n; i++) { + modified = $0 + gsub(/\.com/, "." 
tldArray[i], modified) + print modified + } + } + ' >> results.tmp # Run URLCrazy (bash does not work) ./urlcrazy-master/urlcrazy -r "${target}.com" -f CSV \ - | mawk -F ',' '!/"Original"/ {print $2}' \ + | mawk -F ',' '$1 !~ /Original/ { print $2 }' \ | grep -oE "$DOMAIN_REGEX" >> results.tmp sort -u results.tmp -o results.tmp @@ -616,11 +617,14 @@ source_cybersquatting() { cat results.tmp >> source_results.tmp # Update counts for the target - mawk -F ',' -v target="$target" \ - -v count="$(( count + $(wc -l < results.tmp) ))" \ - -v runs="$(( runs + 1 ))" \ - 'BEGIN {OFS=","} $1==target {$2=count; $3=runs} 1' \ - "$PHISHING_TARGETS" > temp + mawk -F ',' -v target="$target" -v count="$(wc -l < results.tmp)" ' + BEGIN {OFS = ","} + $1 == target { + $2 += count # column 2: cybersquatting count ($6 belongs to the Regex source) + $3 += 1 # column 3: cybersquatting runs ($7 belongs to the Regex source) + } + { print } + ' "$PHISHING_TARGETS" > temp mv temp "$PHISHING_TARGETS" # Reset results file for the next target domain @@ -668,42 +672,43 @@ source_dga_detector() { } source_regex() { - # Last checked: 06/02/25 + # Last checked: 08/02/25 source_name='Regex' exclude_from_light=true [[ "$USE_EXISTING_RESULTS" == true ]] && return - local row count runs pattern results + local pattern # Loop through phishing targets - mawk -F ',' '$8 == "y" {print $1}' "$PHISHING_TARGETS" \ + mawk -F ',' '$8 == "y" { print $1 }' "$PHISHING_TARGETS" \ | while read -r target; do - # Get info of the target domain - row="$(mawk -v target="$target" '$0 ~ target' "$PHISHING_TARGETS")" - count="$(mawk -F ',' '{print $6}' <<< "$row")" - runs="$(mawk -F ',' '{print $7}' <<< "$row")" - pattern="$(mawk -F ',' '{printf $5}' <<< "$row")" - # Get regex of target - local escaped_domain="${target//[.]/\\.}" - local regex="${pattern//&/${escaped_domain}}" - - # Get matches in NRD feed + pattern="$(mawk -F ',' -v target="$target" ' + $1 == target { + print $5 + } + ' "$PHISHING_TARGETS")" + local escaped_target="${target//[.]/\\.}" + local regex="${pattern//&/${escaped_target}}" + + # Get matches in NRD feed and update counts # awk 
is used here instead of mawk for compatibility with the regex # expressions. - results="$(awk "/${regex}/" nrd.tmp | sort -u)" - - # Collate results - printf "%s\n" "$results" >> source_results.tmp - - # Update counts for the target - mawk -F ',' -v target="$target" \ - -v count="$(( count + $(wc -w <<< "$results") ))" \ - -v runs="$(( runs + 1 ))" \ - 'BEGIN {OFS=","} $1==target {$6=count; $7=runs} 1' \ - "$PHISHING_TARGETS" > temp + mawk -F ',' -v target="$target" -v results="$( + awk "/${regex}/" nrd.tmp \ + | sort -u \ + | tee -a source_results.tmp \ + | wc -l + )" ' + BEGIN {OFS = ","} + $1 == target { + $6 += results + $7 += 1 + } + { print } + ' "$PHISHING_TARGETS" > temp mv temp "$PHISHING_TARGETS" done } @@ -955,7 +960,7 @@ source_viriback_tracker() { [[ "$USE_EXISTING_RESULTS" == true ]] && return curl -sS "$source_url" | mawk -v year="$(date +"%Y")" \ - -F ',' '$4 ~ year {print $2}' \ + -F ',' '$4 ~ year { print $2 }' \ | grep -Po "^https?://\K${DOMAIN_REGEX}" > source_results.tmp } diff --git a/scripts/test_functions.sh b/scripts/test_functions.sh index 95a9f8a37..9f753be67 100644 --- a/scripts/test_functions.sh +++ b/scripts/test_functions.sh @@ -187,7 +187,7 @@ TEST_DEAD_CHECK() { # Remove placeholder lines local file for file in "$RAW" "$RAW_LIGHT" "$DEAD_DOMAINS" "$DOMAIN_LOG"; do - mawk '!/placeholder/' "$file" > temp || true + mawk '!/^placeholder/' "$file" > temp || true mv temp "$file" done @@ -223,7 +223,7 @@ TEST_PARKED_CHECK() { # Remove placeholder lines local file for file in "$RAW" "$RAW_LIGHT" "$PARKED_DOMAINS" "$DOMAIN_LOG"; do - mawk '!/placeholder/' "$file" > temp || true + mawk '!/^placeholder/' "$file" > temp || true mv temp "$file" done diff --git a/scripts/tools.sh b/scripts/tools.sh index 0dd378620..0c9db88e4 100644 --- a/scripts/tools.sh +++ b/scripts/tools.sh @@ -18,17 +18,17 @@ format_file() { case "$file" in 'data/dead_domains.txt'|'data/parked_domains.txt') # Remove duplicates, whitespaces, and convert to lowercase - mawk 
'!seen[$0]++ {gsub(/ /, ""); print tolower($0)}' "$file" \ + mawk '!seen[$0]++ { gsub(/ /, ""); print tolower($0) }' "$file" \ > temp ;; 'config/parked_terms.txt') # Convert to lowercase, sort, and remove duplicates - mawk '{print tolower($0)}' "$file" | sort -u -o temp + mawk '{ print tolower($0) }' "$file" | sort -u -o temp ;; *.txt|*.tmp) # Remove whitespaces, convert to lowercase, sort, and remove # duplicates - mawk '{gsub(/ /, ""); print tolower($0)}' "$file" \ + mawk '{ gsub(/ /, ""); print tolower($0) }' "$file" \ | sort -u -o temp ;; *) @@ -85,7 +85,7 @@ log_domains() { printf "%s\n" "$domains" \ | mawk -v event="$2" -v source="$3" -v time="$timestamp" \ - '{print time "," event "," $0 "," source}' >> config/domain_log.csv + '{ print time "," event "," $0 "," source }' >> config/domain_log.csv } # Function 'prune_lines' prunes lines in the given file to keep its number of @@ -120,7 +120,7 @@ download_toplist() { # curl -L required curl -sSL 'https://tranco-list.eu/top-1m.csv.zip' -o temp - unzip -p temp | mawk -F ',' '{print $2}' > toplist.tmp + unzip -p temp | mawk -F ',' '{ print $2 }' > toplist.tmp [[ ! -s toplist.tmp ]] && error 'Error downloading toplist.' diff --git a/scripts/update_readme.sh b/scripts/update_readme.sh index a36b4e922..744978e76 100644 --- a/scripts/update_readme.sh +++ b/scripts/update_readme.sh @@ -198,19 +198,6 @@ sum() { | mawk '{ sum += $1 } END { print sum }' } -sum_domains() { - mawk -F -v time="$1" ',' '/today,dead_count/ { - sum += $3 - } - END { - if (sum == "") { - print 0 - } else { - print sum - } - }' -} - # Function 'sum_excluded' is an echo wrapper that returns the percentage of # excluded domains out of the raw count retrieved by the given source. # Input: diff --git a/scripts/validate_domains.sh b/scripts/validate_domains.sh index 7a3a6a785..5d7fa555f 100644 --- a/scripts/validate_domains.sh +++ b/scripts/validate_domains.sh @@ -27,14 +27,14 @@ main() { # whitelist/blacklist. 
check_review_file() { # Add blacklisted entries to blacklist and remove them from the review file - mawk -F ',' '$4 == "y" && $5 != "y" {print $2}' "$REVIEW_CONFIG" \ + mawk -F ',' '$4 == "y" && $5 != "y" { print $2 }' "$REVIEW_CONFIG" \ | tee >(sort -u - "$BLACKLIST" -o "$BLACKLIST") \ | xargs -I {} sed -i "/,{},/d" "$REVIEW_CONFIG" # Add whitelisted entries to whitelist after formatting to regex and remove # them from the review file - mawk -F ',' '$5 == "y" && $4 != "y" {print $2}' "$REVIEW_CONFIG" \ - | tee >(mawk '{gsub(/\./, "\."); print "^" $0 "$"}' \ + mawk -F ',' '$5 == "y" && $4 != "y" { print $2 }' "$REVIEW_CONFIG" \ + | tee >(mawk '{ gsub(/\./, "\."); print "^" $0 "$" }' \ | sort -u - "$WHITELIST" -o "$WHITELIST") \ | xargs -I {} sed -i "/,{},/d" "$REVIEW_CONFIG" } @@ -56,7 +56,7 @@ filter() { if [[ "$3" == '--preserve' ]]; then # Save entries into review config file mawk -v reason="$tag" \ - '{print "raw," $0 "," reason ",,"}' <<< "$entries" \ + '{ print "raw," $0 "," reason ",," }' <<< "$entries" \ >> "$REVIEW_CONFIG" # Remove duplicates mawk '!seen[$0]++' "$REVIEW_CONFIG" > temp @@ -68,7 +68,7 @@ filter() { fi # Record entries into filter log for console output - mawk -v tag="$tag" '{print $0 " (" tag ")"}' <<< "$entries" \ + mawk -v tag="$tag" '{ print $0 " (" tag ")" }' <<< "$entries" \ >> filter_log.tmp # Call shell wrapper to log entries into domain log @@ -149,10 +149,6 @@ validate() { printf "\n\e[1mProblematic domains (%s):\e[0m\n" "$(wc -l < filter_log.tmp)" sed 's/(toplist)/& - \o033[31mmanual verification required\o033[0m/' filter_log.tmp - # Do not notify for subdomains (the notifications got annoying) - mawk '!/subdomain/' filter_log.tmp > temp - mv temp filter_log.tmp - [[ ! -s filter_log.tmp ]] && return # Call shell wrapper to send telegram notification