Skip to content

Commit

Permalink
[Domains] Add version of public suffix list shipped with release packages
Browse files Browse the repository at this point in the history

- log for processed public suffix list
  - MD5 and SHA-512
  - number of bytes, lines and rules
  - commit date and git hash, cf.
    publicsuffix/list#1808
  • Loading branch information
sebastian-nagel committed Nov 12, 2024
1 parent a3ddc52 commit c834545
Showing 1 changed file with 67 additions and 28 deletions.
95 changes: 67 additions & 28 deletions src/main/java/crawlercommons/domains/EffectiveTldFinder.java
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
/**
* Copyright 2016 Crawler-Commons
*
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
*
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
Expand All @@ -22,16 +22,24 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.math.BigInteger;
import java.net.IDN;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.Locale;
import java.nio.charset.StandardCharsets;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

import org.apache.commons.io.input.BoundedInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -43,7 +51,7 @@
* class implements the
* <a href="https://publicsuffix.org/list/">publicsuffix.org ruleset</a> and
* uses a copy of the public suffix list.
*
*
* For more information, see
* <ul>
* <li><a href="https://www.publicsuffix.org/">publicsuffix.org</a></li>
Expand All @@ -54,7 +62,7 @@
* Service</a>: for historic reasons the class name stems from the term
* &quot;effective top-level domain&quot; (eTLD)</li>
* </ul>
*
*
* EffectiveTldFinder loads the public suffix list as file
* "effective_tld_names.dat" from the Java classpath. Make sure your classpath
* does not contain any other file with the same name, e.g. an outdated list
Expand All @@ -75,9 +83,9 @@
* >https://raw.githubusercontent.com/publicsuffix/list/master/
* public_suffix_list.dat</a></li>
* </ul>
*
*
* <h2>ICANN vs. Private Domains</h2>
*
*
* The <a href="https://publicsuffix.org/list/">public suffix list (see section
* &quot;divisions&quot;)</a> is subdivided into &quot;ICANN&quot; and
* &quot;PRIVATE&quot; domains. To restrict the EffectiveTldFinder to
Expand All @@ -87,7 +95,7 @@
* {@link EffectiveTldFinder#getEffectiveTLD(String, boolean)}. This will
* exclude the eTLDs from the PRIVATE domain section of the public suffix list
* while a domain or eTLD is matched.
*
*
*/
public class EffectiveTldFinder {
private static final Logger LOGGER = LoggerFactory.getLogger(EffectiveTldFinder.class);
Expand All @@ -98,13 +106,14 @@ public class EffectiveTldFinder {
public static final String EXCEPTION = "!";
public static final String WILD_CARD = "*.";
public static final char DOT = '.';
/**
 * Matches a comment line carrying version metadata of the public suffix
 * list, i.e. a line starting with <code>//</code> followed by
 * <code>COMMIT:</code> or <code>VERSION:</code> and a value. Group 1 holds
 * the keyword (<code>COMMIT</code> or <code>VERSION</code>), group 2 the
 * first whitespace-delimited token after the colon.
 *
 * Declared final: the compiled pattern is a shared constant and must not
 * be reassigned at runtime.
 */
public static final Pattern VERSION_PATTERN = Pattern.compile("^//\\s*(COMMIT|VERSION):\\s*(\\S+)");

/**
* Max. length in ASCII characters of a dot-separated segment in host names
* (applies to domain names as well), cf.
* https://tools.ietf.org/html/rfc1034#section-3.1 and
* https://en.wikipedia.org/wiki/Hostname#Restrictions_on_valid_hostnames
*
*
* Note: We only have to validate domain names and not the host names passed
* as input. For domain names a verification of the segment length also
* implies that the entire domain name stays within the limit of 253
Expand Down Expand Up @@ -134,7 +143,7 @@ private EffectiveTldFinder() {

/**
* Get singleton instance of EffectiveTldFinder with default configuration.
*
*
* @return singleton instance of EffectiveTldFinder
*/
public static EffectiveTldFinder getInstance() {
Expand All @@ -146,7 +155,7 @@ public static EffectiveTldFinder getInstance() {

/**
* (Re)initialize EffectiveTldFinder with custom public suffix list.
*
*
* @param effectiveTldDataStream
* content of public suffix list as input stream
* @return true if (re)initialization was successful
Expand All @@ -156,27 +165,57 @@ public boolean initialize(InputStream effectiveTldDataStream) {
domainTrie = new SuffixTrie<>();
boolean inPrivateDomainSection = false;
try {
BufferedReader input = new BufferedReader(new InputStreamReader(effectiveTldDataStream, StandardCharsets.UTF_8));
int linesRead = 0, rulesRead = 0;
BoundedInputStream isCounting = BoundedInputStream.builder().setInputStream(effectiveTldDataStream).get();
InputStream is = isCounting;
List<MessageDigest> digests = new ArrayList<>();
try {
MessageDigest md5 = MessageDigest.getInstance("MD5");
is = new DigestInputStream(is, md5);
digests.add(md5);
MessageDigest sha512 = MessageDigest.getInstance("SHA-512");
is = new DigestInputStream(is, sha512);
digests.add(sha512);
} catch (NoSuchAlgorithmException e) {
LOGGER.warn("Failed to initialize digesting input streams", e);
}
BufferedReader input = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
String line = null;
while (null != (line = input.readLine())) {
linesRead++;
if (line.trim().isEmpty()) {
continue;
} else if (line.startsWith(COMMENT)) {
if (line.contains("===BEGIN PRIVATE DOMAINS===")) {
inPrivateDomainSection = true;
} else if (line.contains("===END PRIVATE DOMAINS===")) {
inPrivateDomainSection = false;
} else {
Matcher m = VERSION_PATTERN.matcher(line);
if (m.matches()) {
LOGGER.info("Public suffix list {}: {}", m.group(1), m.group(2));
}
}
continue;
} else {
EffectiveTLD entry = new EffectiveTLD(line, inPrivateDomainSection);
for (String var : entry.getNameVariants()) {
domains.put(var, entry);
domainTrie.put(var, entry);
}
}
rulesRead++;
EffectiveTLD entry = new EffectiveTLD(line, inPrivateDomainSection);
for (String var : entry.getNameVariants()) {
domains.put(var, entry);
domainTrie.put(var, entry);
}
}
configured = true;

is.close();
long bytesRead = isCounting.getCount();
LOGGER.info("Successfully read public suffix list: {} bytes, {} lines, {} rules", bytesRead, linesRead, rulesRead);
for (MessageDigest digest : digests) {
byte[] d = digest.digest();
BigInteger bi = new BigInteger(1, d);
String hexDigest = String.format(Locale.ROOT, "%0" + (d.length << 1) + "X", bi);
LOGGER.info("Digest of public suffix list: {} = {}", digest.getAlgorithm(), hexDigest);
}
} catch (IOException e) {
LOGGER.error("EffectiveTldFinder configuration failed: ", e);
configured = false;
Expand All @@ -194,7 +233,7 @@ public static Map<String, EffectiveTLD> getEffectiveTLDs() {
/**
* Get EffectiveTLD for host name using the singleton instance of
* EffectiveTldFinder.
*
*
* @param hostname
* the hostname for which to find the {@link EffectiveTLD}
* @return the {@link EffectiveTLD}
Expand All @@ -206,7 +245,7 @@ public static EffectiveTLD getEffectiveTLD(String hostname) {
/**
* Get EffectiveTLD for host name using the singleton instance of
* EffectiveTldFinder.
*
*
* @param hostname
* the hostname for which to find the {@link EffectiveTLD}
* @param excludePrivate
Expand All @@ -225,7 +264,7 @@ public static EffectiveTLD getEffectiveTLD(String hostname, boolean excludePriva
/**
* Find EffectiveTLD and offset in host name using the singleton instance of
* EffectiveTldFinder.
*
*
* @param hostname
* the hostname for which to find the {@link EffectiveTLD}
* @param excludePrivate
Expand Down Expand Up @@ -274,7 +313,7 @@ private static SuffixTrie.LookupResult<EffectiveTLD> findEffectiveTLD(String hos
/**
* This method uses the effective TLD to determine which component of a FQDN
* is the NIC-assigned domain name (aka "Paid Level Domain").
*
*
* @param hostname
* a string for which to obtain a NIC-assigned domain name
* @return the NIC-assigned domain name or as fall-back the hostname if no
Expand All @@ -287,7 +326,7 @@ public static String getAssignedDomain(String hostname) {
/**
* This method uses the effective TLD to determine which component of a FQDN
* is the NIC-assigned domain name (aka "Paid Level Domain").
*
*
* @param hostname
* a string for which to obtain a NIC-assigned domain name
* @param strict
Expand All @@ -303,7 +342,7 @@ public static String getAssignedDomain(String hostname, boolean strict) {
/**
* This method uses the effective TLD to determine which component of a FQDN
* is the NIC-assigned domain name.
*
*
* @param hostname
* a string for which to obtain a NIC-assigned domain name
* @param strict
Expand Down Expand Up @@ -463,7 +502,7 @@ public static class EffectiveTLD {
/**
* Parse one non-empty, non-comment line in the public suffix list and
* hold the public suffix and its properties in the created object.
*
*
* @param line
* non-empty, non-comment line in the public suffix list
* @param isPrivateDomain
Expand Down Expand Up @@ -496,7 +535,7 @@ public EffectiveTLD(String line, boolean isPrivateDomain) throws IllegalArgument
* Normalize a domain name: convert characters to lowercase and
* encode dot-separated segments containing non-ASCII characters. Cf.
* {@link #asciiConvert(String)} and {@link IDN#toASCII(String)}
*
*
* @param name
* domain name
* @return normalized domain name containing only ASCII characters
Expand All @@ -516,7 +555,7 @@ private String normalizeName(String name) throws IllegalArgumentException {
* Generate name variants caused by Internationalized Domain Names:
* every IDN part of an eTLD can be replaced by its punycoded ASCII
* variant. For two-part IDN eTLDs this will generate 4 variants.
*
*
* @return set of variant names
*/
public Set<String> getNameVariants() {
Expand Down Expand Up @@ -558,7 +597,7 @@ public Set<String> getNameVariants() {
/**
* Converts a single domain name segment (separated by dots) to ASCII if
* it contains non-ASCII characters, cf. {@link IDN#toASCII(String)}.
*
*
* @param str
* domain name segment
* @return ASCII "Punycode" representation of the domain name segment
Expand Down

0 comments on commit c834545

Please sign in to comment.