Skip to content

Commit b5b62d0

Browse files
authored
Merge pull request #1206 from digital-preservation/add-zip-file-fallback
Add ZIP file fallback.
2 parents fba0b3a + 816a811 commit b5b62d0

File tree

7 files changed

+206
-30
lines changed

7 files changed

+206
-30
lines changed

droid-container/pom.xml

+5-1
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,11 @@
136136
<dependency>
137137
<groupId>commons-io</groupId>
138138
<artifactId>commons-io</artifactId>
139-
<scope>test</scope>
139+
<scope>provided</scope>
140+
</dependency>
141+
<dependency>
142+
<groupId>org.apache.commons</groupId>
143+
<artifactId>commons-compress</artifactId>
140144
</dependency>
141145
<dependency>
142146
<groupId>com.github.tomakehurst</groupId>

droid-container/src/main/java/uk/gov/nationalarchives/droid/container/AbstractIdentifierEngine.java

+3-3
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
*
4444
* @author rbrennan
4545
*/
46-
public abstract class AbstractIdentifierEngine implements IdentifierEngine {
46+
public abstract class AbstractIdentifierEngine implements IdentifierEngine<InputStream> {
4747

4848
private IdentificationRequestFactory<InputStream> requestFactory;
4949

@@ -62,14 +62,14 @@ protected ByteReader newByteReader(InputStream in) throws IOException {
6262
/**
* Stores the given factory in this engine's {@code requestFactory} field.
*
6363
* @param requestFactory the requestFactory to set
6464
*/
65-
public void setRequestFactory(IdentificationRequestFactory requestFactory) {
65+
public void setRequestFactory(IdentificationRequestFactory<InputStream> requestFactory) {
6666
this.requestFactory = requestFactory;
6767
}
6868

6969
/**
* Returns the factory currently held in this engine's {@code requestFactory} field.
*
7070
* @return the requestFactory
7171
*/
72-
protected IdentificationRequestFactory getRequestFactory() {
72+
protected IdentificationRequestFactory<InputStream> getRequestFactory() {
7373
return requestFactory;
7474
}
7575
}

droid-container/src/main/java/uk/gov/nationalarchives/droid/container/IdentifierEngine.java

+5-4
Original file line numberDiff line numberDiff line change
@@ -39,22 +39,23 @@
3939
/**
4040
* Contract for engines that process an identification request against a collection of container signature matches.
4141
* @author rbrennan
42+
* @param <T> The type of the byte source for the engine.
4243
*/
43-
public interface IdentifierEngine {
44+
public interface IdentifierEngine<T> {
4445

4546
/**
4647
* Process the identification request.
4748
*
4849
* @param request The identification request
4950
* @param matches the Container signature match collection
50-
*
51+
*
5152
* @throws IOException if a problem occurred with processing
5253
*/
53-
void process(IdentificationRequest request, ContainerSignatureMatchCollection matches) throws IOException;
54+
void process(IdentificationRequest<T> request, ContainerSignatureMatchCollection matches) throws IOException;
5455

5556
/**
5657
* Sets the identification request factory to use to obtain new readers for internal byte streams.
5758
* @param requestFactory The IdentificationRequestFactory to set.
5859
*/
59-
void setRequestFactory(IdentificationRequestFactory requestFactory);
60+
void setRequestFactory(IdentificationRequestFactory<T> requestFactory);
6061
}

droid-container/src/main/java/uk/gov/nationalarchives/droid/container/ole2/Ole2IdentifierEngine.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ public class Ole2IdentifierEngine extends AbstractIdentifierEngine {
6767

6868
//CHECKSTYLE:OFF - cyclomatic complexity too high.
6969
@Override
70-
public void process(IdentificationRequest request, ContainerSignatureMatchCollection matches) throws IOException {
70+
public void process(IdentificationRequest<InputStream> request, ContainerSignatureMatchCollection matches) throws IOException {
7171
//CHECKSTYLE:ON
7272
InputStream in = null;
7373
POIFSFileSystem reader = null;

droid-container/src/main/java/uk/gov/nationalarchives/droid/container/zip/ZipIdentifierEngine.java

+47-20
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@
3333

3434
import net.java.truevfs.comp.zip.ZipEntry;
3535
import net.java.truevfs.comp.zip.ZipFile;
36+
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
37+
import org.slf4j.Logger;
38+
import org.slf4j.LoggerFactory;
3639
import uk.gov.nationalarchives.droid.container.AbstractIdentifierEngine;
3740
import uk.gov.nationalarchives.droid.container.ContainerSignatureMatch;
3841
import uk.gov.nationalarchives.droid.container.ContainerSignatureMatchCollection;
@@ -43,44 +46,68 @@
4346
import java.io.IOException;
4447
import java.io.InputStream;
4548
import java.util.List;
49+
import java.util.zip.ZipException;
4650

4751
/**
4852
*
4953
* @author rbrennan
5054
*/
5155
public class ZipIdentifierEngine extends AbstractIdentifierEngine {
5256

57+
private static final Logger LOG = LoggerFactory.getLogger(ZipIdentifierEngine.class);
58+
5359
/**
 * Matches the container signatures in {@code matches} against the entries of a ZIP archive.
 * <p>
 * The archive is opened with the truevfs {@code ZipFile} over the request's window reader.
 * For every entry name returned by {@code matches.getAllFileEntries()} that exists in the
 * archive, the entry's stream is handed to {@code matchEntry}. If a {@link ZipException}
 * is thrown, a warning is logged and the work is retried through {@code processFallback}.
 *
 * @param request the identification request whose window reader supplies the archive bytes
 * @param matches the container signature match collection to evaluate
 * @throws IOException if reading the archive fails with anything other than a
 *         {@link ZipException}, or if the fallback itself fails
 */
@Override
54-
public void process(IdentificationRequest request, ContainerSignatureMatchCollection matches) throws IOException {
55-
ZipFile zipFile = new ZipFile(new ByteseekWindowWrapper(request.getWindowReader()), ZipFile.DEFAULT_CHARSET, true, false);
60+
public void process(IdentificationRequest<InputStream> request, ContainerSignatureMatchCollection matches) throws IOException {
5661

57-
try {
62+
try (ZipFile zipFile = new ZipFile(new ByteseekWindowWrapper(request.getWindowReader()), ZipFile.DEFAULT_CHARSET, true, false)) {
5863
// For each entry:
5964
for (String entryName : matches.getAllFileEntries()) {
6065
final ZipEntry entry = zipFile.entry(entryName);
6166
if (entry != null) {
6267
// Get a stream for the entry and a byte reader over the stream:
6368
InputStream stream = zipFile.getInputStream(entry.getName());
64-
ByteReader reader = null;
65-
try {
66-
reader = newByteReader(stream);
67-
// For each signature to match:
68-
List<ContainerSignatureMatch> matchList = matches.getContainerSignatureMatches();
69-
for (ContainerSignatureMatch match : matchList) {
70-
match.matchBinaryContent(entryName, reader);
71-
}
72-
} finally {
73-
if (reader != null) {
74-
reader.close();
75-
}
76-
if (stream != null) {
77-
stream.close();
78-
}
79-
}
69+
matchEntry(matches, entryName, stream);
70+
}
71+
}
72+
} catch (ZipException ze) {
// NOTE(review): this catch covers the whole try block, so a ZipException raised while
// reading an entry (not only while opening the archive) also triggers the fallback;
// entries matched before the failure are presumably matched again - confirm this is intended.
// Only the exception message is logged; the stack trace is not.
73+
LOG.warn("Initial zip file parsing failed. Will try again with commons-compress {}", ze.getMessage());
74+
processFallback(request, matches);
75+
}
76+
}
77+
78+
private void processFallback(IdentificationRequest<InputStream> request, ContainerSignatureMatchCollection matches) throws IOException {
79+
try (var zipFile = org.apache.commons.compress.archivers.zip.ZipFile.builder()
80+
.setIgnoreLocalFileHeader(true)
81+
.setSeekableByteChannel(new ByteseekWindowWrapper(request.getWindowReader()))
82+
.get()) {
83+
// For each entry:
84+
for (String entryName : matches.getAllFileEntries()) {
85+
final ZipArchiveEntry entry = zipFile.getEntry(entryName);
86+
if (entry != null) {
87+
// Get a stream for the entry and a byte reader over the stream:
88+
InputStream stream = zipFile.getInputStream(entry);
89+
matchEntry(matches, entryName, stream);
8090
}
8191
}
92+
}
93+
}
94+
95+
private void matchEntry(ContainerSignatureMatchCollection matches, String entryName, InputStream stream) throws IOException {
96+
ByteReader reader = null;
97+
try {
98+
reader = newByteReader(stream);
99+
// For each signature to match:
100+
List<ContainerSignatureMatch> matchList = matches.getContainerSignatureMatches();
101+
for (ContainerSignatureMatch match : matchList) {
102+
match.matchBinaryContent(entryName, reader);
103+
}
82104
} finally {
83-
zipFile.close();
105+
if (reader != null) {
106+
reader.close();
107+
}
108+
if (stream != null) {
109+
stream.close();
110+
}
84111
}
85112
}
86113
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
/*
2+
* Copyright (c) 2016, The National Archives <[email protected]>
3+
* All rights reserved.
4+
*
5+
* Redistribution and use in source and binary forms, with or without
6+
* modification, are permitted provided that the following
7+
* conditions are met:
8+
*
9+
* * Redistributions of source code must retain the above copyright
10+
* notice, this list of conditions and the following disclaimer.
11+
*
12+
* * Redistributions in binary form must reproduce the above copyright
13+
* notice, this list of conditions and the following disclaimer in the
14+
* documentation and/or other materials provided with the distribution.
15+
*
16+
* * Neither the name of the The National Archives nor the
17+
* names of its contributors may be used to endorse or promote products
18+
* derived from this software without specific prior written permission.
19+
*
20+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23+
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
24+
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25+
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26+
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27+
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28+
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29+
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30+
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31+
*/
32+
package uk.gov.nationalarchives.droid.container.zip;
33+
34+
import net.java.truevfs.comp.zip.ZipEntry;
35+
import net.java.truevfs.comp.zip.ZipOutputStream;
36+
import org.junit.Test;
37+
import uk.gov.nationalarchives.droid.container.*;
38+
import uk.gov.nationalarchives.droid.core.interfaces.IdentificationRequest;
39+
import uk.gov.nationalarchives.droid.core.interfaces.RequestIdentifier;
40+
import uk.gov.nationalarchives.droid.core.interfaces.archive.AbstractArchiveRequestFactory;
41+
import uk.gov.nationalarchives.droid.core.interfaces.resource.RequestMetaData;
42+
import uk.gov.nationalarchives.droid.core.interfaces.resource.ZipEntryIdentificationRequest;
43+
import uk.gov.nationalarchives.droid.core.signature.droid6.ByteSequence;
44+
import uk.gov.nationalarchives.droid.core.signature.droid6.InternalSignature;
45+
import uk.gov.nationalarchives.droid.core.signature.droid6.InternalSignatureCollection;
46+
47+
import java.io.ByteArrayInputStream;
48+
import java.io.ByteArrayOutputStream;
49+
import java.io.IOException;
50+
import java.io.InputStream;
51+
import java.nio.file.Files;
52+
import java.util.List;
53+
54+
import static org.junit.Assert.*;
55+
56+
public class ZipFailureFallbackTest {
57+
58+
@Test
59+
public void testFallbackWithCorruptedZipFile() throws IOException {
60+
String fileName = "file.txt";
61+
byte[] zipBytes = createZipBytes(fileName);
62+
63+
//14 bytes from the end is the file count. Setting it to zero causes truevfs to fail but commons-compress succeeds.
64+
zipBytes[zipBytes.length - 14] = 0x00;
65+
66+
List<ContainerSignatureMatch> containerSignatureMatches = getContainerSignatureMatches(zipBytes, fileName);
67+
68+
assertEquals(containerSignatureMatches.size(), 1);
69+
assertEquals(containerSignatureMatches.getFirst().getSignature().getId(), 1);
70+
assertTrue(containerSignatureMatches.getFirst().isMatch());
71+
}
72+
73+
@Test
74+
public void testValidZipFileMatches() throws IOException {
75+
String fileName = "file.txt";
76+
byte[] zipBytes = createZipBytes(fileName);
77+
78+
List<ContainerSignatureMatch> containerSignatureMatches = getContainerSignatureMatches(zipBytes, fileName);
79+
80+
assertEquals(containerSignatureMatches.size(), 1);
81+
assertEquals(containerSignatureMatches.getFirst().getSignature().getId(), 1);
82+
assertTrue(containerSignatureMatches.getFirst().isMatch());
83+
}
84+
85+
@Test
86+
public void testNoMatchesIfFileNotFound() throws IOException {
87+
String fileName = "file.txt";
88+
String missingFileName = "missingFile.txt";
89+
byte[] zipBytes = createZipBytes(fileName);
90+
91+
List<ContainerSignatureMatch> containerSignatureMatches = getContainerSignatureMatches(zipBytes, missingFileName);
92+
93+
assertEquals(containerSignatureMatches.size(), 1);
94+
assertFalse(containerSignatureMatches.getFirst().isMatch());
95+
}
96+
97+
private static List<ContainerSignatureMatch> getContainerSignatureMatches(byte[] zipBytes, String filePath) throws IOException {
98+
ByteArrayInputStream inputStream = new ByteArrayInputStream(zipBytes);
99+
100+
RequestMetaData metaData = new RequestMetaData(1L, 1L, "");
101+
IdentificationRequest<InputStream> request = new ZipEntryIdentificationRequest(metaData, null, Files.createTempDirectory("test"));
102+
request.open(inputStream);
103+
104+
ContainerSignature containerSignature = new ContainerSignature();
105+
containerSignature.setId(1);
106+
ContainerFile containerFile = new ContainerFile();
107+
containerFile.setPath(filePath);
108+
109+
InternalSignatureCollection internalSignatureCollection = new InternalSignatureCollection();
110+
InternalSignature internalSignature = new InternalSignature();
111+
ByteSequence byteSequence = new ByteSequence();
112+
byteSequence.setSequence("");
113+
internalSignature.addByteSequence(byteSequence);
114+
internalSignatureCollection.setInternalSignatures(List.of(internalSignature));
115+
containerFile.setBinarySignatures(internalSignatureCollection);
116+
containerSignature.setFiles(List.of(containerFile));
117+
118+
ContainerSignatureMatchCollection matchCollection = new ContainerSignatureMatchCollection(List.of(containerSignature), List.of(filePath), 1024);
119+
AbstractArchiveRequestFactory<InputStream> requestFactory = new AbstractArchiveRequestFactory<>() {
120+
121+
@Override
122+
public IdentificationRequest<InputStream> newRequest(RequestMetaData metaData, RequestIdentifier identifier) {
123+
return request;
124+
}
125+
};
126+
ZipIdentifierEngine zipIdentifierEngine = new ZipIdentifierEngine();
127+
zipIdentifierEngine.setRequestFactory(requestFactory);
128+
zipIdentifierEngine.process(request, matchCollection);
129+
return matchCollection.getContainerSignatureMatches();
130+
}
131+
132+
private static byte[] createZipBytes(String filePath) throws IOException {
133+
ByteArrayOutputStream baos = new ByteArrayOutputStream();
134+
try (ZipOutputStream zos = new ZipOutputStream(baos)) {
135+
ZipEntry entry1 = new ZipEntry(filePath);
136+
zos.putNextEntry(entry1);
137+
zos.write("Test".getBytes());
138+
zos.closeEntry();
139+
}
140+
141+
return baos.toByteArray();
142+
}
143+
}

droid-parent/pom.xml

+2-1
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@
104104
<slf4j.version>2.0.16</slf4j.version>
105105
<log4j2.version>2.24.3</log4j2.version>
106106
<checkstyle.version>10.21.2</checkstyle.version>
107+
<commons.version>2.18.0</commons.version>
107108
</properties>
108109

109110
<build>
@@ -664,7 +665,7 @@ Copyright &copy; ${project.inceptionYear}-{currentYear} <a href="${project.organ
664665
<dependency>
665666
<groupId>commons-io</groupId>
666667
<artifactId>commons-io</artifactId>
667-
<version>2.18.0</version>
668+
<version>${commons.version}</version>
668669
</dependency>
669670
<dependency>
670671
<groupId>commons-lang</groupId>

0 commit comments

Comments
 (0)