From abad66ecc12680420032d2f98d8c9708b2470e10 Mon Sep 17 00:00:00 2001
From: LSmyrnaios <lsmyrnaios@gmail.com>
Date: Wed, 6 Mar 2024 17:20:04 +0200
Subject: [PATCH] Optimize performance and memory usage by setting the
 buffer size of "BufferedInputStream", "BufferedOutputStream" and
 "BufferedReader" to the "contentSize" of the resource, whenever that
 size is available AND it is lower than the upper threshold (currently
 5MB).

---
 .../util/file/FileUtils.java                  | 14 ++++++----
 .../util/http/ConnSupportUtils.java           | 28 +++++++++++++++----
 .../test/TestNonStandardInputOutput.java      |  2 +-
 3 files changed, 31 insertions(+), 13 deletions(-)
diff --git a/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java b/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java
index 39f9ebe..1eb7f57 100644
--- a/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java
+++ b/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java
@@ -364,11 +364,12 @@ else if ( docFileNameType.equals(DocFileNameType.originalName) )
 		}
 		long bytesCount = 0;
 		FileOutputStream fileOutputStream = docFileData.getFileOutputStream();
+		int bufferSize = (((contentSize != -2) && contentSize < fiveMb) ? contentSize : fiveMb);
 
-		try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, fiveMb);
-			  BufferedOutputStream outStream = new BufferedOutputStream(((fileOutputStream != null) ? fileOutputStream : new FileOutputStream(docFile)), fiveMb) )
+		try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, bufferSize);
+			  BufferedOutputStream outStream = new BufferedOutputStream(((fileOutputStream != null) ? fileOutputStream : new FileOutputStream(docFile)), bufferSize) )
 		{
-			int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize);
+			int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize);	// It handles the "-2" case.
 			int readByte = -1;
 			long startTime = System.nanoTime();
 			while ( (readByte = inStream.read()) != -1 )
@@ -469,11 +470,12 @@ public static synchronized DocFileData storeDocFileWithNumberName(HttpURLConnect
 			throw new RuntimeException("MD5 HASH ALGO MISSING");
 		}
 		long bytesCount = 0;
+		int bufferSize = (((contentSize != -2) && contentSize < fiveMb) ? contentSize : fiveMb);
 
-		try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, fiveMb);
-			BufferedOutputStream outStream = new BufferedOutputStream(new FileOutputStream(docFile), fiveMb))
+		try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, bufferSize);
+			BufferedOutputStream outStream = new BufferedOutputStream(new FileOutputStream(docFile), bufferSize))
 		{
-			int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize);
+			int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize);	// It handles the "-2" case.
 			int readByte = -1;
 			long startTime = System.nanoTime();
 			while ( (readByte = inStream.read()) != -1 )
diff --git a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java
index e3b1a4b..6d15de6 100644
--- a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java
+++ b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java
@@ -349,6 +349,7 @@ public static DocFileData downloadAndStoreDocFile(HttpURLConnection conn, String
 			int contentSize = 0;
 			if ( (contentSize = getContentSize(conn, true, false)) == -1 )	// "Unacceptable size"-code..
 				throw new DocFileNotRetrievedException("The HTTP-reported size of this file was unacceptable!");
+			// It may be "-2", in case it was not retrieved..
 
 			// Write the downloaded bytes to the docFile and return the docFileName.
 			DocFileData docFileData =  null;
@@ -830,12 +831,14 @@ public static List<String> blockSharedSiteSessionDomains(String targetUrl, Strin
 
 	public static String getHtmlString(HttpURLConnection conn, BufferedReader bufferedReader, boolean isForError)
 	{
-		if ( getContentSize(conn, false, isForError) == -1 ) {	// "Unacceptable size"-code..
+		int contentSize = 0;
+		if ( (contentSize = getContentSize(conn, false, isForError)) == -1 ) {	// "Unacceptable size"-code..
 			if ( !isForError )	// It's expected to have ZERO-length most times, and thus the extraction cannot continue. Do not show a message. It's rare that we get an error-message anyway.
 				logger.warn("Aborting HTML-extraction for pageUrl: " + conn.getURL().toString());
 			ConnSupportUtils.closeBufferedReader(bufferedReader);	// This page's content-type was auto-detected, and the process fails before re-requesting the conn-inputStream, then make sure we close the last one.
 			return null;
 		}
+		// It may be "-2" in case the "contentSize" was not available.
 
 		StringBuilder htmlStrB = htmlStrBuilder.get();
 		if ( htmlStrB == null ) {
@@ -843,14 +846,16 @@ public static String getHtmlString(HttpURLConnection conn, BufferedReader buffer
 			htmlStrBuilder.set(htmlStrB);	// Save it for future use by this thread.
 		}
 
+		int bufferSize = 0;
 		InputStream inputStream = null;
 		if ( bufferedReader == null ) {
 			inputStream = checkEncodingAndGetInputStream(conn, isForError);
 			if ( inputStream == null )	// The error is already logged inside.
 				return null;
+			bufferSize = (((contentSize != -2) && contentSize < FileUtils.fiveMb) ? contentSize : FileUtils.fiveMb);
 		}
 
-		try (BufferedReader br = ((bufferedReader != null) ? bufferedReader : new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), FileUtils.fiveMb)) )	// Try-with-resources
+		try (BufferedReader br = ((bufferedReader != null) ? bufferedReader : new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), bufferSize)) )	// Try-with-resources
 		{
 			String inputLine;
 			while ( (inputLine = br.readLine()) != null )
@@ -963,18 +968,22 @@ public static ArrayList<Object> detectContentTypeFromResponseBody(String finalUr
 	 */
 	public static DetectedContentType extractContentTypeFromResponseBody(HttpURLConnection conn)
 	{
-		if ( getContentSize(conn, false, false) == -1 ) {	// "Unacceptable size"-code..
-			logger.warn("Aborting HTML-extraction for pageUrl: " + conn.getURL().toString());
+		int contentSize = 0;
+		if ( (contentSize = getContentSize(conn, false, false)) == -1) {	// "Unacceptable size"-code..
+			logger.warn("Aborting content-extraction for pageUrl: " + conn.getURL().toString());
 			return null;
 		}
+		// It may be "-2" in case the "contentSize" was not available.
 
 		InputStream inputStream = checkEncodingAndGetInputStream(conn, false);
 		if ( inputStream == null )
 			return null;
 
+		int bufferSize = (((contentSize != -2) && contentSize < FileUtils.fiveMb) ? contentSize : FileUtils.fiveMb);
+
 		BufferedReader br = null;
 		try {
-			br = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), FileUtils.fiveMb);
+			br = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), bufferSize);
 			String inputLine;
 
 			// Skip empty lines in the beginning of the HTML-code
@@ -1172,8 +1181,15 @@ public static InputStream getInputStreamFromInputDataUrl()
 			if ( inputStream == null )
 				throw new RuntimeException("Could not acquire the InputStream!");
 
+			// Check if we should abort the download based on its content-size.
+			int contentSize = 0;
+			if ( (contentSize = getContentSize(conn, true, false)) == -1 )	// "Unacceptable size"-code..
+				throw new DocFileNotRetrievedException("The HTTP-reported size of this file was unacceptable!");
+			// It may be "-2", in case it was not retrieved..
+			int bufferSize = (((contentSize != -2) && contentSize < FileUtils.fiveMb) ? contentSize : FileUtils.fiveMb);
+
 			// Wrap it with a buffer, for increased efficiency.
-			inputStream = new BufferedInputStream(inputStream, FileUtils.fiveMb);
+			inputStream = new BufferedInputStream(inputStream, bufferSize);
 
 		} catch (Exception e) {
 			String errorMessage = "Unexpected error when retrieving the input-stream from the inputDataUrl:\n" + e.getMessage();
diff --git a/src/test/java/eu/openaire/publications_retriever/test/TestNonStandardInputOutput.java b/src/test/java/eu/openaire/publications_retriever/test/TestNonStandardInputOutput.java
index 48ec755..9dbb383 100644
--- a/src/test/java/eu/openaire/publications_retriever/test/TestNonStandardInputOutput.java
+++ b/src/test/java/eu/openaire/publications_retriever/test/TestNonStandardInputOutput.java
@@ -85,7 +85,7 @@ public void testCustomInputOutputWithOriginalDocFileNames()
 		args[3] = "-docFileNameType";
 		args[4] = "originalName";
 		args[5] = "-docFilesStorage";
-		args[6] = "/storage/runs/run1/docFiles";
+		args[6] = "docFiles";
 		main(args);
 	}