Optimize performance and memory-usage by setting the buffer-size of "BufferedInputStream", "BufferedOutputStream" and "BufferedReader" to the "contentSize" of the resource, whenever it is available AND lower than the upper threshold (currently 5MB).
LSmyrnaios · Mar 6, 2024 · abad66e · abad66e
1 parent 183fb4d
commit abad66e
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 13 deletions.
diff --git a/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java b/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java
@@ -364,11 +364,12 @@ else if ( docFileNameType.equals(DocFileNameType.originalName) )
 		}
 		long bytesCount = 0;
 		FileOutputStream fileOutputStream = docFileData.getFileOutputStream();
+		int bufferSize = (((contentSize != -2) && contentSize < fiveMb) ? contentSize : fiveMb);
 
-		try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, fiveMb);
-			  BufferedOutputStream outStream = new BufferedOutputStream(((fileOutputStream != null) ? fileOutputStream : new FileOutputStream(docFile)), fiveMb) )
+		try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, bufferSize);
+			  BufferedOutputStream outStream = new BufferedOutputStream(((fileOutputStream != null) ? fileOutputStream : new FileOutputStream(docFile)), bufferSize) )
 		{
-			int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize);
+			int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize);	// It handles the "-2" case.
 			int readByte = -1;
 			long startTime = System.nanoTime();
 			while ( (readByte = inStream.read()) != -1 )
@@ -469,11 +470,12 @@ public static synchronized DocFileData storeDocFileWithNumberName(HttpURLConnect
 			throw new RuntimeException("MD5 HASH ALGO MISSING");
 		}
 		long bytesCount = 0;
+		int bufferSize = (((contentSize != -2) && contentSize < fiveMb) ? contentSize : fiveMb);
 
-		try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, fiveMb);
-			BufferedOutputStream outStream = new BufferedOutputStream(new FileOutputStream(docFile), fiveMb))
+		try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, bufferSize);
+			BufferedOutputStream outStream = new BufferedOutputStream(new FileOutputStream(docFile), bufferSize))
 		{
-			int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize);
+			int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize);	// It handles the "-2" case.
 			int readByte = -1;
 			long startTime = System.nanoTime();
 			while ( (readByte = inStream.read()) != -1 )

diff --git a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java
@@ -349,6 +349,7 @@ public static DocFileData downloadAndStoreDocFile(HttpURLConnection conn, String
 			int contentSize = 0;
 			if ( (contentSize = getContentSize(conn, true, false)) == -1 )	// "Unacceptable size"-code..
 				throw new DocFileNotRetrievedException("The HTTP-reported size of this file was unacceptable!");
+			// It may be "-2", in case it was not retrieved..
 
 			// Write the downloaded bytes to the docFile and return the docFileName.
 			DocFileData docFileData =  null;
@@ -830,27 +831,31 @@ public static List<String> blockSharedSiteSessionDomains(String targetUrl, Strin
 
 	public static String getHtmlString(HttpURLConnection conn, BufferedReader bufferedReader, boolean isForError)
 	{
-		if ( getContentSize(conn, false, isForError) == -1 ) {	// "Unacceptable size"-code..
+		int contentSize = 0;
+		if ( (contentSize = getContentSize(conn, false, isForError)) == -1 ) {	// "Unacceptable size"-code..
 			if ( !isForError )	// It's expected to have ZERO-length most times, and thus the extraction cannot continue. Do not show a message. It's rare that we get an error-message anyway.
 				logger.warn("Aborting HTML-extraction for pageUrl: " + conn.getURL().toString());
 			ConnSupportUtils.closeBufferedReader(bufferedReader);	// This page's content-type was auto-detected, and the process fails before re-requesting the conn-inputStream, then make sure we close the last one.
 			return null;
 		}
+		// It may be "-2" in case the "contentSize" was not available.
 
 		StringBuilder htmlStrB = htmlStrBuilder.get();
 		if ( htmlStrB == null ) {
 			htmlStrB = new StringBuilder(100000);	// Initialize and pre-allocate the StringBuilder.
 			htmlStrBuilder.set(htmlStrB);	// Save it for future use by this thread.
 		}
 
+		int bufferSize = 0;
 		InputStream inputStream = null;
 		if ( bufferedReader == null ) {
 			inputStream = checkEncodingAndGetInputStream(conn, isForError);
 			if ( inputStream == null )	// The error is already logged inside.
 				return null;
+			bufferSize = (((contentSize != -2) && contentSize < FileUtils.fiveMb) ? contentSize : FileUtils.fiveMb);
 		}
 
-		try (BufferedReader br = ((bufferedReader != null) ? bufferedReader : new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), FileUtils.fiveMb)) )	// Try-with-resources
+		try (BufferedReader br = ((bufferedReader != null) ? bufferedReader : new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), bufferSize)) )	// Try-with-resources
 		{
 			String inputLine;
 			while ( (inputLine = br.readLine()) != null )
@@ -963,18 +968,22 @@ public static ArrayList<Object> detectContentTypeFromResponseBody(String finalUr
 	 */
 	public static DetectedContentType extractContentTypeFromResponseBody(HttpURLConnection conn)
 	{
-		if ( getContentSize(conn, false, false) == -1 ) {	// "Unacceptable size"-code..
-			logger.warn("Aborting HTML-extraction for pageUrl: " + conn.getURL().toString());
+		int contentSize = 0;
+		if ( (contentSize = getContentSize(conn, false, false)) == -1) {	// "Unacceptable size"-code..
+			logger.warn("Aborting content-extraction for pageUrl: " + conn.getURL().toString());
 			return null;
 		}
+		// It may be "-2" in case the "contentSize" was not available.
 
 		InputStream inputStream = checkEncodingAndGetInputStream(conn, false);
 		if ( inputStream == null )
 			return null;
 
+		int bufferSize = (((contentSize != -2) && contentSize < FileUtils.fiveMb) ? contentSize : FileUtils.fiveMb);
+
 		BufferedReader br = null;
 		try {
-			br = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), FileUtils.fiveMb);
+			br = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), bufferSize);
 			String inputLine;
 
 			// Skip empty lines in the beginning of the HTML-code
@@ -1172,8 +1181,15 @@ public static InputStream getInputStreamFromInputDataUrl()
 			if ( inputStream == null )
 				throw new RuntimeException("Could not acquire the InputStream!");
 
+			// Check if we should abort the download based on its content-size.
+			int contentSize = 0;
+			if ( (contentSize = getContentSize(conn, true, false)) == -1 )	// "Unacceptable size"-code..
+				throw new DocFileNotRetrievedException("The HTTP-reported size of this file was unacceptable!");
+			// It may be "-2", in case it was not retrieved..
+			int bufferSize = (((contentSize != -2) && contentSize < FileUtils.fiveMb) ? contentSize : FileUtils.fiveMb);
+
 			// Wrap it with a buffer, for increased efficiency.
-			inputStream = new BufferedInputStream(inputStream, FileUtils.fiveMb);
+			inputStream = new BufferedInputStream(inputStream, bufferSize);
 
 		} catch (Exception e) {
 			String errorMessage = "Unexpected error when retrieving the input-stream from the inputDataUrl:\n" + e.getMessage();

diff --git a/src/test/java/eu/openaire/publications_retriever/test/TestNonStandardInputOutput.java b/src/test/java/eu/openaire/publications_retriever/test/TestNonStandardInputOutput.java
@@ -85,7 +85,7 @@ public void testCustomInputOutputWithOriginalDocFileNames()
 		args[3] = "-docFileNameType";
 		args[4] = "originalName";
 		args[5] = "-docFilesStorage";
-		args[6] = "/storage/runs/run1/docFiles";
+		args[6] = "docFiles";
 		main(args);
 	}