stellar · chowbao · Aug 1, 2024 · Aug 1, 2024 · Aug 1, 2024 · Aug 3, 2024
diff --git a/services/galexie/scripts/README.md b/services/galexie/scripts/README.md
@@ -0,0 +1,15 @@
+## Galexie: Backfill Examples
+
+The files in this directory are examples of different ways to use Galexie to backfill Stellar network data into a Google Cloud Storage (GCS) bucket.
+
+## Notes and Tips
+
+* An unoptimized full history backfill with pubnet data using Galexie took roughly 4.5 days
+* Total costs ~= $1100 USD
+  * Compute Costs ~= $500 USD
+  * GCS Class A Operations (writes) Costs ~= $600 USD
+* Pubnet full history size is ~= 3 TB (as of 2024-07-31)
+* Earlier ledgers are processed faster than ledgers closer to the current time. Ledgers closer to the current time contain more data because of additional features added over the years, as well as larger adoption and usage of the Stellar network in general.
+* There is a noticeable inflection point in runtime around ledger 30000000 (30 million). From that point on, it is recommended to use smaller ledger ranges for the backfilling process.
+* There are extra flags that can be enabled in the captive-core.cfg to output extra information such as `ENABLE_SOROBAN_DIAGNOSTIC_EVENTS`. Please see more captive-core options [here](https://github.com/stellar/go/blob/f692f1246b01fb09af2c232630d4ad31025de747/ingest/ledgerbackend/toml.go#L74-L109)
+* Large ledger ranges (e.g., a 2500000-ledger range versus a 100000-ledger range) may slow down processing speed (this assumption has not been confirmed and may not affect your use case)
diff --git a/services/galexie/scripts/batch_config.yml b/services/galexie/scripts/batch_config.yml
@@ -0,0 +1,52 @@
+# This yaml file serves as an example job configuration file for GCP batch.
+# https://cloud.google.com/batch
+
+job:
+  taskGroups:
+  - taskSpec:
+      computeResource:
+        cpuMilli: 3000  # milli-CPUs; NOTE(review): e2-standard-2 below has 2 vCPUs - confirm this request fits
+        memoryMib: 2000
+      maxRetryCount: 1
+      container:
+        imageUri: "stellar/stellar-galexie:1.0.0"
+        entrypoint: "galexie"
+        commands: ["append", "--config-file", "/mnt/galexie-config-pubnet-batch/config-pubnet.toml", "--start", "${START}", "--end", "${END}"]
+      tasks:
+      # It is possible to use the GCP batch index instead of manually naming each task
+      - name: "galexie-1"
+        environments:
+          START: "2"
+          END: "2499999"
+      - name: "galexie-2"
+        environments:
+          START: "2500000"
+          END: "4999999"
+
+      # ... (tasks for the ledger ranges in between are omitted from this example) ...
+
+      - name: "galexie-3"
+        environments:
+          START: "30000000"
+          END: "31249999"
+      - name: "galexie-4"
+        environments:
+          START: "31250000"
+          END: "32499999"
+
+      # ... (tasks for the remaining ledger ranges are omitted from this example) ...
+
+    requireHostsFile: true
+    requireTaskHostsFile: true
+  allocationPolicy:
+    instances:
+    - policy:
+        machineType: "e2-standard-2"
+        disks:
+        - newDisk:
+            type: "pd-standard"
+            sizeGb: 10
+          mountPoint: "/mnt/shared"
+        - existingDisk:
+            disk: "<disk name>"
+          mountPoint: "/mnt/galexie-config-pubnet-batch"
diff --git a/services/galexie/scripts/generate_compute_instances.py b/services/galexie/scripts/generate_compute_instances.py
@@ -0,0 +1,58 @@
+#!/usr/bin/python3
+"""
+This Python script serves as an example of how you could create a series of commands
+to create GCP compute instances that run galexie for backfill purposes.
+
+This script may need slight modifications depending on the GCP project
+you plan to create compute instances in.
+"""
+
+command = """gcloud compute instances create-with-container {instance_name} \
+--project={gcp_project} \
+--zone={zone} \
+--machine-type=e2-standard-2 \
+--network-interface=network-tier=PREMIUM,stack-type=IPV4_ONLY,subnet=default \
+--maintenance-policy=MIGRATE \
+--provisioning-model=STANDARD \
+--service-account={service_account} \
+--scopes=https://www.googleapis.com/auth/cloud-platform \
+--image=projects/cos-cloud/global/images/cos-stable-113-18244-85-29 \
+--boot-disk-size=10GB \
+--boot-disk-type=pd-balanced \
+--boot-disk-device-name=galexie-pubnet-custom-config \
+--container-image=stellar/stellar-galexie:1.0.0 \
+--container-restart-policy=always \
+--container-privileged \
+--container-command=galexie \
+--container-arg=append \
+--container-arg=--config-file \
+--container-arg=/mnt/galexie-config-pubnet/config-pubnet.toml \
+--container-arg=--start \
+--container-arg={start} \
+--container-arg=--end \
+--container-arg={end} \
+--container-mount-disk=mode=rw,mount-path=/mnt/galexie-config-pubnet,name=galexie-config-pubnet-batch-{batch_num},partition=0 \
+--disk=boot=no,device-name=galexie-config-pubnet-batch-{batch_num},mode=rw,name=galexie-config-pubnet-batch-{batch_num},scope=regional \
+--no-shielded-secure-boot \
+--shielded-vtpm \
+--shielded-integrity-monitoring \
+--labels=goog-ec-src=vm_add-gcloud,container-vm=cos-stable-113-18244-85-29"""
+
+gcp_project = ""
+zone = ""
+service_account = ""
+
+commands = []
+batch_size = 2500000
+start = 0
+last_ledger = 52124262
+
+for i in range(1, 22):
+    instance_name = f"galexie-pubnet-custom-config-{i}"
+    end = start + batch_size - 1
+    if i == 21:
+        end = last_ledger
+    commands.append(command.format(instance_name=instance_name, start=start, end=end, batch_num=i))
+    start = end + 1
+
+print(";\n\n".join(commands))