diff --git a/jvector-examples/pom.xml b/jvector-examples/pom.xml
index ce824aeff..d7cc2e41e 100644
--- a/jvector-examples/pom.xml
+++ b/jvector-examples/pom.xml
@@ -11,6 +11,9 @@
jvector-examples
JVector Examples
+
+ 2.21.10
+
@@ -42,12 +45,17 @@
software.amazon.awssdk
s3-transfer-manager
- 2.21.2
+ ${awssdk.version}
software.amazon.awssdk
aws-crt-client
- 2.21.2
+ ${awssdk.version}
+
+
+ software.amazon.awssdk
+ s3
+ ${awssdk.version}
com.kohlschutter.junixsocket
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java
index 9ad3c23fa..9324a1afd 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java
@@ -5,6 +5,10 @@
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.s3.S3AsyncClient;
import software.amazon.awssdk.services.s3.S3AsyncClientBuilder;
+import software.amazon.awssdk.services.s3.S3Client;
+import software.amazon.awssdk.services.s3.model.ListObjectsV2Request;
+import software.amazon.awssdk.services.s3.model.ListObjectsV2Response;
+import software.amazon.awssdk.services.s3.model.S3Object;
import software.amazon.awssdk.transfer.s3.S3TransferManager;
import software.amazon.awssdk.transfer.s3.model.CompletedFileDownload;
import software.amazon.awssdk.transfer.s3.model.DownloadFileRequest;
@@ -19,31 +23,39 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
public class DownloadHelper {
+ private static String bucketName = "astra-vector";
- public static void maybeDownloadFvecs() {
- // TODO how to detect and recover from incomplete downloads?
- String[] keys = {
- "wikipedia_squad/100k/ada_002_100000_base_vectors.fvec",
- "wikipedia_squad/100k/ada_002_100000_query_vectors_10000.fvec",
- "wikipedia_squad/100k/ada_002_100000_indices_query_10000.ivec"
- };
-
- String bucketName = "astra-vector";
-
+ private static S3AsyncClientBuilder getS3AsyncClientBuilder() {
S3AsyncClientBuilder s3ClientBuilder = S3AsyncClient.builder()
- .region(Region.of("us-east-1"))
+ .region(Region.US_EAST_1)
.httpClient(AwsCrtAsyncHttpClient.builder()
- .maxConcurrency(1)
- .build())
+ .maxConcurrency(1)
+ .build())
.credentialsProvider(AnonymousCredentialsProvider.create());
+ return s3ClientBuilder;
+ }
+
+ public static void maybeDownloadFvecs(List files) {
+ List keys;
+ if (null == files || files.isEmpty()) {
+ keys = Arrays.asList(new String[] {
+ "wikipedia_squad/100k/ada_002_100000_base_vectors.fvec",
+ "wikipedia_squad/100k/ada_002_100000_query_vectors_10000.fvec",
+ "wikipedia_squad/100k/ada_002_100000_indices_query_10000.ivec",
+ });
+ } else {
+ keys = files;
+ }
+ // TODO how to detect and recover from incomplete downloads?
// get directory from paths in keys
- List dirs = Arrays.stream(keys).map(key -> key.substring(0, key.lastIndexOf("/"))).distinct().collect(Collectors.toList());
+ List dirs = keys.stream().map(key -> key.substring(0, key.lastIndexOf("/"))).distinct().collect(Collectors.toList());
for (String dir : dirs) {
try {
dir = "fvec/" + dir;
@@ -53,7 +65,7 @@ public static void maybeDownloadFvecs() {
}
}
- try (S3AsyncClient s3Client = s3ClientBuilder.build()) {
+ try (S3AsyncClient s3Client = getS3AsyncClientBuilder().build()) {
S3TransferManager tm = S3TransferManager.builder().s3Client(s3Client).build();
for (String key : keys) {
Path path = Paths.get("fvec", key);
@@ -69,11 +81,20 @@ public static void maybeDownloadFvecs() {
.destination(Paths.get(path.toString()))
.build();
- FileDownload downloadFile = tm.downloadFile(downloadFileRequest);
-
- CompletedFileDownload downloadResult = downloadFile.completionFuture().join();
- System.out.println("Downloaded file of length " + downloadResult.response().contentLength());
-
+ // 3 retries
+ for (int i = 0; i < 3; i++) {
+ FileDownload downloadFile = tm.downloadFile(downloadFileRequest);
+ CompletedFileDownload downloadResult = downloadFile.completionFuture().join();
+ long downloadedSize = Files.size(path);
+
+ // Check if downloaded file size matches the expected size
+ if (downloadedSize == downloadResult.response().contentLength()) {
+ System.out.println("Downloaded file of length " + downloadResult.response().contentLength());
+ break; // Successfully downloaded
+ } else {
+ System.out.println("Incomplete download. Retrying...");
+ }
+ }
}
tm.close();
} catch (Exception e) {
@@ -82,14 +103,19 @@ public static void maybeDownloadFvecs() {
}
}
+ public static void maybeDownloadFvecs() {
+ maybeDownloadFvecs(null);
+ }
+
public static void maybeDownloadHdf5(String datasetName) {
- var fullPath = Path.of(Hdf5Loader.HDF5_DIR).resolve(datasetName);
+ Path path = Path.of(Hdf5Loader.HDF5_DIR);
+ var fullPath = path.resolve(datasetName);
if (Files.exists(fullPath)) {
return;
}
- // Download from http://ann-benchmarks.com/datasetName
- var url = "http://ann-benchmarks.com/" + datasetName;
+ // Download from https://ann-benchmarks.com/datasetName
+ var url = "https://ann-benchmarks.com/" + datasetName;
System.out.println("Downloading: " + url);
HttpURLConnection connection = null;
@@ -111,11 +137,22 @@ public static void maybeDownloadHdf5(String datasetName) {
}
try (InputStream in = connection.getInputStream()) {
- Files.createDirectories(Path.of(Hdf5Loader.HDF5_DIR));
+ Files.createDirectories(path);
Files.copy(in, fullPath, StandardCopyOption.REPLACE_EXISTING);
} catch (IOException e) {
System.out.println("Error downloading data: " + e.getMessage());
System.exit(1);
}
}
+
+ public static List s3FileListing() {
+ S3Client s3 = S3Client.builder().region(Region.US_EAST_1).credentialsProvider(AnonymousCredentialsProvider.create()).build();
+ ListObjectsV2Request req = ListObjectsV2Request.builder().bucket(bucketName).build();
+ ListObjectsV2Response res = s3.listObjectsV2(req);
+ List filenames = res.contents().stream().map(S3Object::key).collect(Collectors.toList());
+ /*for (String filename : filenames) {
+ System.out.println(filename);
+ }*/
+ return filenames;
+ }
}