Skip to content

Commit

Permalink
fix cybersquatting and regex sources and improve mawk implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
jarelllama authored Feb 8, 2025
1 parent 4cd7d4c commit 893bbcd
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 86 deletions.
2 changes: 1 addition & 1 deletion scripts/build_nsfw.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ readonly -a WHITELIST=(
# raw file, format the file and remove dead domains.
build() {
# Format raw file to Domains format
mawk '/[|]/ {gsub(/[|^]/, ""); print}' "$BLOCKLIST" > raw.tmp
mawk '/[|]/ { gsub(/[|^]/, ""); print }' "$BLOCKLIST" > raw.tmp

# Remove already processed domains
comm -23 toplist.tmp raw.tmp > temp
Expand Down
117 changes: 61 additions & 56 deletions scripts/retrieve_domains.sh
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,14 @@ main() {
# blacklist and whitelist.
check_review_file() {
# Add blacklisted entries to blacklist and remove them from the review file
mawk -F ',' '$4 == "y" && $5 != "y" {print $2}' "$REVIEW_CONFIG" \
mawk -F ',' '$4 == "y" && $5 != "y" { print $2 }' "$REVIEW_CONFIG" \
| tee >(sort -u - "$BLACKLIST" -o "$BLACKLIST") \
| xargs -I {} sed -i "/,{},/d" "$REVIEW_CONFIG"

# Add whitelisted entries to whitelist after formatting to regex and remove
# them from the review file
mawk -F ',' '$5 == "y" && $4 != "y" {print $2}' "$REVIEW_CONFIG" \
| tee >(mawk '{gsub(/\./, "\."); print "^" $0 "$"}' \
mawk -F ',' '$5 == "y" && $4 != "y" { print $2 }' "$REVIEW_CONFIG" \
| tee >(mawk '{ gsub(/\./, "\."); print "^" $0 "$" }' \
| sort -u - "$WHITELIST" -o "$WHITELIST") \
| xargs -I {} sed -i "/,{},/d" "$REVIEW_CONFIG"
}
Expand Down Expand Up @@ -162,14 +162,15 @@ filter() {

if [[ "$3" == '--preserve' ]]; then
# Save entries for console output
mawk -v tag="$tag" '{print $0 " (" tag ")"}' <<< "$entries" \
mawk -v tag="$tag" '{ print $0 " (" tag ")" }' <<< "$entries" \
>> entries_for_review.tmp

# Save entries into review config file
mawk -v source="$source_name" -v reason="$tag" \
'{print source "," $0 "," reason ",,"}' <<< "$entries" \
'{ print source "," $0 "," reason ",," }' <<< "$entries" \
>> "$REVIEW_CONFIG"
# Remove duplicates

# Remove duplicates from review config file
mawk '!seen[$0]++' "$REVIEW_CONFIG" > temp
mv temp "$REVIEW_CONFIG"

Expand Down Expand Up @@ -546,7 +547,7 @@ search_google() {

# Get domains from each page
page_domains="$(jq -r '.items[].link' <<< "$page_results" \
| mawk -F '/' '{print $3}')"
| mawk -F '/' '{ print $3 }')"
printf "%s\n" "$page_domains" >> "$source_results"

# Stop search term if no more pages are required
Expand All @@ -557,12 +558,12 @@ search_google() {
}

source_cybersquatting() {
# Last checked: 27/01/25
# Last checked: 08/02/25
source_name='Cybersquatting'

[[ "$USE_EXISTING_RESULTS" == true ]] && return

local tlds row count runs results
local tlds

# Install dnstwist
command -v dnstwist > /dev/null || pip install -q dnstwist
Expand All @@ -579,31 +580,31 @@ source_cybersquatting() {
# Get TLDs from the NRD feed for dnstwist.
# This is not needed for URLCrazy as that already checks for
# alternate TLDs.
tlds="$(mawk -F '.' '{print $NF}' nrd.tmp | sort | uniq -c \
| sort -nr | mawk '{print $2}')"
tlds="$(mawk -F '.' '!seen[$NF]++ { print $NF }' nrd.tmp)"

# Loop through phishing targets
mawk -F ',' '$4 == "y" {print $1}' "$PHISHING_TARGETS" \
mawk -F ',' '$4 == "y" { print $1 }' "$PHISHING_TARGETS" \
| while read -r target; do

# Get info of the target domain
row="$(mawk -v target="$target" '$0 ~ target' "$PHISHING_TARGETS")"
count="$(mawk -F ',' '{print $2}' <<< "$row")"
runs="$(mawk -F ',' '{print $3}' <<< "$row")"

# Run dnstwist
results="$(dnstwist "${target}.com" -f list)"

# Append TLDs to dnstwist results
# Run dnstwist and append possible TLDs.
# Note the dnstwist --tld argument only replaces the TLDs of the
# original domain.
while read -r tld; do
printf "%s\n" "$results" | sed "s/\.com/.${tld}/" >> results.tmp
done <<< "$tlds"
# original domain, not the resulting domains.
dnstwist "${target}.com" -f list | mawk -v tlds="$tlds" '
BEGIN {
n = split(tlds, tldArray, "\n")
}
{
for (i = 1; i <= n; i++) {
modified = $0
gsub(/\.com/, "." tldArray[i], modified)
print modified
}
}
' >> results.tmp

# Run URLCrazy (bash does not work)
./urlcrazy-master/urlcrazy -r "${target}.com" -f CSV \
| mawk -F ',' '!/"Original"/ {print $2}' \
| mawk -F ',' '$1 !~ /Original/ { print $2 }' \
| grep -oE "$DOMAIN_REGEX" >> results.tmp

sort -u results.tmp -o results.tmp
Expand All @@ -616,11 +617,14 @@ source_cybersquatting() {
cat results.tmp >> source_results.tmp

# Update counts for the target
mawk -F ',' -v target="$target" \
-v count="$(( count + $(wc -l < results.tmp) ))" \
-v runs="$(( runs + 1 ))" \
'BEGIN {OFS=","} $1==target {$2=count; $3=runs} 1' \
"$PHISHING_TARGETS" > temp
mawk -F ',' -v target="$target" -v count="$(wc -l < results.tmp)" '
BEGIN {OFS = ","}
$1 == target {
$6 += count
$7 += 1
}
{ print }
' "$PHISHING_TARGETS" > temp
mv temp "$PHISHING_TARGETS"

# Reset results file for the next target domain
Expand Down Expand Up @@ -668,42 +672,43 @@ source_dga_detector() {
}

source_regex() {
# Last checked: 06/02/25
# Last checked: 08/02/25
source_name='Regex'
exclude_from_light=true

[[ "$USE_EXISTING_RESULTS" == true ]] && return

local row count runs pattern results
local pattern

# Loop through phishing targets
mawk -F ',' '$8 == "y" {print $1}' "$PHISHING_TARGETS" \
mawk -F ',' '$8 == "y" { print $1 }' "$PHISHING_TARGETS" \
| while read -r target; do

# Get info of the target domain
row="$(mawk -v target="$target" '$0 ~ target' "$PHISHING_TARGETS")"
count="$(mawk -F ',' '{print $6}' <<< "$row")"
runs="$(mawk -F ',' '{print $7}' <<< "$row")"
pattern="$(mawk -F ',' '{printf $5}' <<< "$row")"

# Get regex of target
local escaped_domain="${target//[.]/\\.}"
local regex="${pattern//&/${escaped_domain}}"

# Get matches in NRD feed
pattern="$(mawk -F ',' -v target="$target" '
$1 == target {
print $5
}
' "$PHISHING_TARGETS")"
local escaped_target="${target//[.]/\\.}"
local regex="${pattern//&/${escaped_target}}"

# Get matches in NRD feed and update counts
# awk is used here instead of mawk for compatibility with the regex
# expressions.
results="$(awk "/${regex}/" nrd.tmp | sort -u)"

# Collate results
printf "%s\n" "$results" >> source_results.tmp

# Update counts for the target
mawk -F ',' -v target="$target" \
-v count="$(( count + $(wc -w <<< "$results") ))" \
-v runs="$(( runs + 1 ))" \
'BEGIN {OFS=","} $1==target {$6=count; $7=runs} 1' \
"$PHISHING_TARGETS" > temp
mawk -F ',' -v target="$target" -v results="$(
awk "/${regex}/" nrd.tmp \
| sort -u \
| tee -a source_results.tmp \
| wc -l
)" '
BEGIN {OFS = ","}
$1 == target {
$6 += results
$7 += 1
}
{ print }
' "$PHISHING_TARGETS" > temp
mv temp "$PHISHING_TARGETS"
done
}
Expand Down Expand Up @@ -955,7 +960,7 @@ source_viriback_tracker() {
[[ "$USE_EXISTING_RESULTS" == true ]] && return

curl -sS "$source_url" | mawk -v year="$(date +"%Y")" \
-F ',' '$4 ~ year {print $2}' \
-F ',' '$4 ~ year { print $2 }' \
| grep -Po "^https?://\K${DOMAIN_REGEX}" > source_results.tmp
}

Expand Down
4 changes: 2 additions & 2 deletions scripts/test_functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ TEST_DEAD_CHECK() {
# Remove placeholder lines
local file
for file in "$RAW" "$RAW_LIGHT" "$DEAD_DOMAINS" "$DOMAIN_LOG"; do
mawk '!/placeholder/' "$file" > temp || true
mawk '!/^placeholder/' "$file" > temp || true
mv temp "$file"
done

Expand Down Expand Up @@ -223,7 +223,7 @@ TEST_PARKED_CHECK() {
# Remove placeholder lines
local file
for file in "$RAW" "$RAW_LIGHT" "$PARKED_DOMAINS" "$DOMAIN_LOG"; do
mawk '!/placeholder/' "$file" > temp || true
mawk '!/^placeholder/' "$file" > temp || true
mv temp "$file"
done

Expand Down
10 changes: 5 additions & 5 deletions scripts/tools.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,17 @@ format_file() {
case "$file" in
'data/dead_domains.txt'|'data/parked_domains.txt')
# Remove duplicates, whitespaces, and convert to lowercase
mawk '!seen[$0]++ {gsub(/ /, ""); print tolower($0)}' "$file" \
mawk '!seen[$0]++ { gsub(/ /, ""); print tolower($0) }' "$file" \
> temp
;;
'config/parked_terms.txt')
# Convert to lowercase, sort, and remove duplicates
mawk '{print tolower($0)}' "$file" | sort -u -o temp
mawk '{ print tolower($0) }' "$file" | sort -u -o temp
;;
*.txt|*.tmp)
# Remove whitespaces, convert to lowercase, sort, and remove
# duplicates
mawk '{gsub(/ /, ""); print tolower($0)}' "$file" \
mawk '{ gsub(/ /, ""); print tolower($0) }' "$file" \
| sort -u -o temp
;;
*)
Expand Down Expand Up @@ -85,7 +85,7 @@ log_domains() {

printf "%s\n" "$domains" \
| mawk -v event="$2" -v source="$3" -v time="$timestamp" \
'{print time "," event "," $0 "," source}' >> config/domain_log.csv
'{ print time "," event "," $0 "," source }' >> config/domain_log.csv
}

# Function 'prune_lines' prunes lines in the given file to keep its number of
Expand Down Expand Up @@ -120,7 +120,7 @@ download_toplist() {

# curl -L required
curl -sSL 'https://tranco-list.eu/top-1m.csv.zip' -o temp
unzip -p temp | mawk -F ',' '{print $2}' > toplist.tmp
unzip -p temp | mawk -F ',' '{ print $2 }' > toplist.tmp

[[ ! -s toplist.tmp ]] && error 'Error downloading toplist.'

Expand Down
13 changes: 0 additions & 13 deletions scripts/update_readme.sh
Original file line number Diff line number Diff line change
Expand Up @@ -198,19 +198,6 @@ sum() {
| mawk '{ sum += $1 } END { print sum }'
}

sum_domains() {
mawk -F -v time="$1" ',' '/today,dead_count/ {
sum += $3
}
END {
if (sum == "") {
print 0
} else {
print sum
}
}'
}

# Function 'sum_excluded' is an echo wrapper that returns the percentage of
# excluded domains out of the raw count retrieved by the given source.
# Input:
Expand Down
14 changes: 5 additions & 9 deletions scripts/validate_domains.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ main() {
# whitelist/blacklist.
check_review_file() {
# Add blacklisted entries to blacklist and remove them from the review file
mawk -F ',' '$4 == "y" && $5 != "y" {print $2}' "$REVIEW_CONFIG" \
mawk -F ',' '$4 == "y" && $5 != "y" { print $2 }' "$REVIEW_CONFIG" \
| tee >(sort -u - "$BLACKLIST" -o "$BLACKLIST") \
| xargs -I {} sed -i "/,{},/d" "$REVIEW_CONFIG"

# Add whitelisted entries to whitelist after formatting to regex and remove
# them from the review file
mawk -F ',' '$5 == "y" && $4 != "y" {print $2}' "$REVIEW_CONFIG" \
| tee >(mawk '{gsub(/\./, "\."); print "^" $0 "$"}' \
mawk -F ',' '$5 == "y" && $4 != "y" { print $2 }' "$REVIEW_CONFIG" \
| tee >(mawk '{ gsub(/\./, "\."); print "^" $0 "$" }' \
| sort -u - "$WHITELIST" -o "$WHITELIST") \
| xargs -I {} sed -i "/,{},/d" "$REVIEW_CONFIG"
}
Expand All @@ -56,7 +56,7 @@ filter() {
if [[ "$3" == '--preserve' ]]; then
# Save entries into review config file
mawk -v reason="$tag" \
'{print "raw," $0 "," reason ",,"}' <<< "$entries" \
'{ print "raw," $0 "," reason ",," }' <<< "$entries" \
>> "$REVIEW_CONFIG"
# Remove duplicates
mawk '!seen[$0]++' "$REVIEW_CONFIG" > temp
Expand All @@ -68,7 +68,7 @@ filter() {
fi

# Record entries into filter log for console output
mawk -v tag="$tag" '{print $0 " (" tag ")"}' <<< "$entries" \
mawk -v tag="$tag" '{ print $0 " (" tag ")" }' <<< "$entries" \
>> filter_log.tmp

# Call shell wrapper to log entries into domain log
Expand Down Expand Up @@ -149,10 +149,6 @@ validate() {
printf "\n\e[1mProblematic domains (%s):\e[0m\n" "$(wc -l < filter_log.tmp)"
sed 's/(toplist)/& - \o033[31mmanual verification required\o033[0m/' filter_log.tmp

# Do not notify for subdomains (the notifications got annoying)
mawk '!/subdomain/' filter_log.tmp > temp
mv temp filter_log.tmp

[[ ! -s filter_log.tmp ]] && return

# Call shell wrapper to send telegram notification
Expand Down

0 comments on commit 893bbcd

Please sign in to comment.