diff --git a/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java b/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java index 39f9ebe..1eb7f57 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java +++ b/src/main/java/eu/openaire/publications_retriever/util/file/FileUtils.java @@ -364,11 +364,12 @@ else if ( docFileNameType.equals(DocFileNameType.originalName) ) } long bytesCount = 0; FileOutputStream fileOutputStream = docFileData.getFileOutputStream(); + int bufferSize = (((contentSize != -2) && contentSize < fiveMb) ? contentSize : fiveMb); - try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, fiveMb); - BufferedOutputStream outStream = new BufferedOutputStream(((fileOutputStream != null) ? fileOutputStream : new FileOutputStream(docFile)), fiveMb) ) + try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, bufferSize); + BufferedOutputStream outStream = new BufferedOutputStream(((fileOutputStream != null) ? fileOutputStream : new FileOutputStream(docFile)), bufferSize) ) { - int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize); + int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize); // It handles the "-2" case. int readByte = -1; long startTime = System.nanoTime(); while ( (readByte = inStream.read()) != -1 ) @@ -469,11 +470,12 @@ public static synchronized DocFileData storeDocFileWithNumberName(HttpURLConnect throw new RuntimeException("MD5 HASH ALGO MISSING"); } long bytesCount = 0; + int bufferSize = (((contentSize != -2) && contentSize < fiveMb) ? contentSize : fiveMb); - try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, fiveMb); - BufferedOutputStream outStream = new BufferedOutputStream(new FileOutputStream(docFile), fiveMb)) + try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, bufferSize); + BufferedOutputStream outStream = new BufferedOutputStream(new FileOutputStream(docFile), bufferSize)) { - int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize); + int maxStoringWaitingTime = getMaxStoringWaitingTime(contentSize); // It handles the "-2" case. int readByte = -1; long startTime = System.nanoTime(); while ( (readByte = inStream.read()) != -1 ) diff --git a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java index e3b1a4b..6d15de6 100644 --- a/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java +++ b/src/main/java/eu/openaire/publications_retriever/util/http/ConnSupportUtils.java @@ -349,6 +349,7 @@ public static DocFileData downloadAndStoreDocFile(HttpURLConnection conn, String int contentSize = 0; if ( (contentSize = getContentSize(conn, true, false)) == -1 ) // "Unacceptable size"-code.. throw new DocFileNotRetrievedException("The HTTP-reported size of this file was unacceptable!"); + // It may be "-2", in case it was not retrieved.. // Write the downloaded bytes to the docFile and return the docFileName. DocFileData docFileData = null; @@ -830,12 +831,14 @@ public static List blockSharedSiteSessionDomains(String targetUrl, Strin public static String getHtmlString(HttpURLConnection conn, BufferedReader bufferedReader, boolean isForError) { - if ( getContentSize(conn, false, isForError) == -1 ) { // "Unacceptable size"-code.. + int contentSize = 0; + if ( (contentSize = getContentSize(conn, false, isForError)) == -1 ) { // "Unacceptable size"-code.. if ( !isForError ) // It's expected to have ZERO-length most times, and thus the extraction cannot continue. Do not show a message. It's rare that we get an error-message anyway. logger.warn("Aborting HTML-extraction for pageUrl: " + conn.getURL().toString()); ConnSupportUtils.closeBufferedReader(bufferedReader); // This page's content-type was auto-detected, and the process fails before re-requesting the conn-inputStream, then make sure we close the last one. return null; } + // It may be "-2" in case the "contentSize" was not available. StringBuilder htmlStrB = htmlStrBuilder.get(); if ( htmlStrB == null ) { @@ -843,14 +846,16 @@ public static String getHtmlString(HttpURLConnection conn, BufferedReader buffer htmlStrBuilder.set(htmlStrB); // Save it for future use by this thread. } + int bufferSize = 0; InputStream inputStream = null; if ( bufferedReader == null ) { inputStream = checkEncodingAndGetInputStream(conn, isForError); if ( inputStream == null ) // The error is already logged inside. return null; + bufferSize = (((contentSize != -2) && contentSize < FileUtils.fiveMb) ? contentSize : FileUtils.fiveMb); } - try (BufferedReader br = ((bufferedReader != null) ? bufferedReader : new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), FileUtils.fiveMb)) ) // Try-with-resources + try (BufferedReader br = ((bufferedReader != null) ? bufferedReader : new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), bufferSize)) ) // Try-with-resources { String inputLine; while ( (inputLine = br.readLine()) != null ) @@ -963,18 +968,22 @@ public static ArrayList detectContentTypeFromResponseBody(String finalUr */ public static DetectedContentType extractContentTypeFromResponseBody(HttpURLConnection conn) { - if ( getContentSize(conn, false, false) == -1 ) { // "Unacceptable size"-code.. - logger.warn("Aborting HTML-extraction for pageUrl: " + conn.getURL().toString()); + int contentSize = 0; + if ( (contentSize = getContentSize(conn, false, false)) == -1) { // "Unacceptable size"-code.. + logger.warn("Aborting content-extraction for pageUrl: " + conn.getURL().toString()); return null; } + // It may be "-2" in case the "contentSize" was not available. InputStream inputStream = checkEncodingAndGetInputStream(conn, false); if ( inputStream == null ) return null; + int bufferSize = (((contentSize != -2) && contentSize < FileUtils.fiveMb) ? contentSize : FileUtils.fiveMb); + BufferedReader br = null; try { - br = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), FileUtils.fiveMb); + br = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), bufferSize); String inputLine; // Skip empty lines in the beginning of the HTML-code @@ -1172,8 +1181,15 @@ public static InputStream getInputStreamFromInputDataUrl() if ( inputStream == null ) throw new RuntimeException("Could not acquire the InputStream!"); + // Check if we should abort the download based on its content-size. + int contentSize = 0; + if ( (contentSize = getContentSize(conn, true, false)) == -1 ) // "Unacceptable size"-code.. + throw new DocFileNotRetrievedException("The HTTP-reported size of this file was unacceptable!"); + // It may be "-2", in case it was not retrieved.. + int bufferSize = (((contentSize != -2) && contentSize < FileUtils.fiveMb) ? contentSize : FileUtils.fiveMb); + // Wrap it with a buffer, for increased efficiency. - inputStream = new BufferedInputStream(inputStream, FileUtils.fiveMb); + inputStream = new BufferedInputStream(inputStream, bufferSize); } catch (Exception e) { String errorMessage = "Unexpected error when retrieving the input-stream from the inputDataUrl:\n" + e.getMessage(); diff --git a/src/test/java/eu/openaire/publications_retriever/test/TestNonStandardInputOutput.java b/src/test/java/eu/openaire/publications_retriever/test/TestNonStandardInputOutput.java index 48ec755..9dbb383 100644 --- a/src/test/java/eu/openaire/publications_retriever/test/TestNonStandardInputOutput.java +++ b/src/test/java/eu/openaire/publications_retriever/test/TestNonStandardInputOutput.java @@ -85,7 +85,7 @@ public void testCustomInputOutputWithOriginalDocFileNames() args[3] = "-docFileNameType"; args[4] = "originalName"; args[5] = "-docFilesStorage"; - args[6] = "/storage/runs/run1/docFiles"; + args[6] = "docFiles"; main(args); }