Skip to content

Commit

Permalink
- Set all applicable HTTP-headers when retrying a connection.
Browse files Browse the repository at this point in the history
- Code polishing.
  • Loading branch information
LSmyrnaios committed Feb 8, 2024
1 parent 2e6bf9a commit bf56665
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 37 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,9 @@ public static HashSet<String> extractInternalLinksFromHtml(String pageHtml, Stri
// TODO - Somehow I need to detect if a link has the parameter "?isAllowed=n" or "&isAllowed=n".
// In that case, the whole page should be discarded as not having any docUrls!

// Even though this seems to be the case, I am not entirely sure that this param is not used in any other type of url inside the page, apart from docUrl.
// TODO - It would be best for this check to happen upon trying to connect with a url and throw a special exception which will indicate restricted use for the whole page.

// TODO - Also, each individual link entering the program that contains the above parameter should be discarded. Such rules should be added to some regex.

// Check the text appearing next-to or as the link, inside the html.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.FileDeleteStrategy;
import org.apache.commons.lang3.StringUtils;

import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -89,6 +88,31 @@ public class ConnSupportUtils

public static final ConcurrentHashMap<String, DomainConnectionData> domainsWithConnectionData = new ConcurrentHashMap<>();

// The value sent as the "User-Agent" request-header by "setHttpHeaders()".
// This should not be "final": another program, using this software as a library, should be able to set its own "UserAgent".
public static String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0";
// The value sent as the "Accept-Language" request-header by "setHttpHeaders()", unless the domain is listed in "HttpConnUtils.domainsWithUnsupportedAcceptLanguageParameter".
public static String acceptLanguage = "en-US,en;q=0.5";


/**
 * Sets the full set of HTTP request-headers on the given connection.
 * It is meant to be called on every newly opened connection (including the ones opened for a retry), before the request is sent,
 * so that all attempts towards the same url carry the same headers.
 *
 * The headers are set in a fixed order: "User-Agent", "Accept", "Accept-Encoding", optionally "Accept-Language", and then the rest.
 * The "Accept-Language" header is skipped when the given domain is contained in "HttpConnUtils.domainsWithUnsupportedAcceptLanguageParameter".
 *
 * NOTE(review): "Host" and "Connection" are presumably among the headers which "HttpURLConnection" restricts by default,
 * in which case they are silently ignored unless "sun.net.http.allowRestrictedHeaders" is enabled. TODO - Confirm.
 *
 * @param conn the connection to set the request-headers on.
 * @param domainStr the domain of the url; it is used for the "Accept-Language" check and as the value of the "Host" header.
 */
public static void setHttpHeaders(HttpURLConnection conn, String domainStr)
{
	// The identification and content-negotiation headers go first.
	conn.setRequestProperty("User-Agent", userAgent);
	conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8");
	conn.setRequestProperty("Accept-Encoding", "gzip, deflate, br");
	//conn.setRequestProperty("TE", "trailers"); // TODO - Investigate the "transfer-encoding" header.

	// Some domains are known to not support the "Accept-Language" header, so it is omitted for them.
	final boolean acceptLanguageUnsupported = HttpConnUtils.domainsWithUnsupportedAcceptLanguageParameter.contains(domainStr);
	if ( !acceptLanguageUnsupported ) {
		conn.setRequestProperty("Accept-Language", acceptLanguage);
	}

	// The remaining headers are always sent; the table keeps them in the order they must be set.
	final String[][] remainingHeaders = {
		{"DNT", "1"},
		{"Connection", "keep-alive"},
		{"Sec-Fetch-Dest", "document"},
		{"Sec-Fetch-Mode", "navigate"},
		{"Sec-Fetch-Site", "cross-site"},
		{"Upgrade-Insecure-Requests", "1"},
		{"Pragma", "no-cache"},
		{"Cache-Control", "no-cache"},
		{"Host", domainStr}
	};
	for ( String[] header : remainingHeaders ) {
		conn.setRequestProperty(header[0], header[1]);
	}
}


public static void setKnownMimeTypes()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,6 @@ public class HttpConnUtils

public static AtomicInteger numOfDomainsBlockedDueToSSLException = new AtomicInteger(0);

public static String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0"; // This should not be "final", another program, using this software as a library, should be able to set its own "UserAgent".
public static String acceptLanguage = "en-US,en;q=0.5";

public static final int maxConnGETWaitingTime = 15_000; // Max time (in ms) to wait for a connection, using "HTTP GET".
public static final int maxConnHEADWaitingTime = 10_000; // Max time (in ms) to wait for a connection, using "HTTP HEAD".

Expand All @@ -75,7 +72,7 @@ public class HttpConnUtils

public static AtomicInteger timesDidOfflineSlashRedirect = new AtomicInteger(0);

public static ThreadLocal<Boolean> isSpecialUrl = new ThreadLocal<Boolean>(); // Every Thread has its own variable.
public static ThreadLocal<Boolean> isSpecialUrl = new ThreadLocal<Boolean>(); // Every Thread has its own variable. This variable is used only in non-failure cases.

public static final String docFileNotRetrievedMessage = DocFileNotRetrievedException.class.getSimpleName() + " was thrown before the docFile could be stored. "; // Get the class-name programmatically, in order to easily spot the error if the exception-name changes.

Expand Down Expand Up @@ -113,12 +110,7 @@ public static boolean connectAndCheckMimeType(String urlId, String sourceUrl, St

conn = handleConnection(urlId, sourceUrl, pageUrl, resourceURL, domainStr, calledForPageUrl, calledForPossibleDocOrDatasetUrl);

// Check if we are able to find the mime type, if not then try "Content-Disposition".
String mimeType = conn.getContentType();
String contentDisposition = null;

String finalUrlStr = conn.getURL().toString();

if ( !finalUrlStr.contains(domainStr) ) // Get the new domain after possible change from redirects.
if ( (domainStr = UrlUtils.getDomainStr(finalUrlStr, null)) == null )
throw new RuntimeException("Unable to obtain the domain!"); // The cause it's already logged inside "getDomainStr()".
Expand All @@ -127,10 +119,10 @@ public static boolean connectAndCheckMimeType(String urlId, String sourceUrl, St
String firstHtmlLine = null;
BufferedReader bufferedReader = null;

///////////////////////////
//mimeType = null; // DEBUG!
///////////////////////////
// Check if we are able to find the mime type, if not then try "Content-Disposition".
String contentDisposition = null;

String mimeType = conn.getContentType();
if ( mimeType == null ) {
contentDisposition = conn.getHeaderField("Content-Disposition");
if ( contentDisposition == null ) {
Expand Down Expand Up @@ -326,25 +318,7 @@ public static HttpURLConnection openHttpConnection(String resourceURL, String do

URL url = new URL(resourceURL);
conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty("User-Agent", userAgent);
conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8");

conn.setRequestProperty("Accept-Encoding", "gzip, deflate, br");
//conn.setRequestProperty("TE", "trailers"); // TODO - Investigate the "transfer-encoding" header.

if ( !domainsWithUnsupportedAcceptLanguageParameter.contains(domainStr) )
conn.setRequestProperty("Accept-Language", acceptLanguage);

conn.setRequestProperty("DNT", "1");
conn.setRequestProperty("Connection", "keep-alive");
conn.setRequestProperty("Sec-Fetch-Dest", "document");
conn.setRequestProperty("Sec-Fetch-Mode", "navigate");
conn.setRequestProperty("Sec-Fetch-Site", "cross-site");
conn.setRequestProperty("Upgrade-Insecure-Requests", "1");
conn.setRequestProperty("Pragma", "no-cache");
conn.setRequestProperty("Cache-Control", "no-cache");
conn.setRequestProperty("Host", domainStr);

ConnSupportUtils.setHttpHeaders(conn, domainStr);
conn.setInstanceFollowRedirects(false); // We manage redirects on our own, in order to control redirectsNum, avoid redirecting to unwantedUrls and handling errors.

boolean useHttpGetMethod = false;
Expand Down Expand Up @@ -378,7 +352,7 @@ public static HttpURLConnection openHttpConnection(String resourceURL, String do
domainsWithUnsupportedAcceptLanguageParameter.add(domainStr); // Take note that this domain does not support it..

conn = (HttpURLConnection) url.openConnection();
conn.setRequestProperty("User-Agent", userAgent);
ConnSupportUtils.setHttpHeaders(conn, domainStr);
conn.setInstanceFollowRedirects(false);

if ( useHttpGetMethod ) {
Expand Down Expand Up @@ -409,10 +383,8 @@ else if ( ((responseCode == 405) || (responseCode == 501)) && conn.getRequestMet
// If we accept connection's retrying, using "GET", move on reconnecting.
// No call of "conn.disconnect()" here, as we will connect to the same server.
conn = (HttpURLConnection) url.openConnection();
ConnSupportUtils.setHttpHeaders(conn, domainStr);
conn.setRequestMethod("GET"); // To reach here, it means that the HEAD method is unsupported.
conn.setRequestProperty("User-Agent", userAgent);
if ( !domainsWithUnsupportedAcceptLanguageParameter.contains(domainStr) )
conn.setRequestProperty("Accept-Language", acceptLanguage);
conn.setConnectTimeout(maxConnGETWaitingTime);
conn.setReadTimeout(maxConnGETWaitingTime);
conn.setInstanceFollowRedirects(false);
Expand All @@ -433,8 +405,8 @@ else if ( ((responseCode == 405) || (responseCode == 501)) && conn.getRequestMet
domainsWithUnsupportedAcceptLanguageParameter.add(domainStr); // Take note that this domain does not support it..

conn = (HttpURLConnection) url.openConnection();
ConnSupportUtils.setHttpHeaders(conn, domainStr);
conn.setRequestMethod("GET"); // To reach here, it means that the HEAD method is unsupported.
conn.setRequestProperty("User-Agent", userAgent);
conn.setConnectTimeout(maxConnGETWaitingTime);
conn.setReadTimeout(maxConnGETWaitingTime);
conn.setInstanceFollowRedirects(false);
Expand Down

0 comments on commit bf56665

Please sign in to comment.