From f4fdcfb40d1c9e771946f8c65af59c7d61b2bacc Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Mon, 25 Sep 2023 13:32:49 +1300 Subject: [PATCH 1/5] Collect resources from core + staging buckets This sets out the pattern for reading S3 inventories and turning them into resource collections. The JSON output will ultimately be used by nextstrain.org to both provide a listing of available resources and to be queried by versioned dataset requests (in order to go from a requested date to the corresponding S3 version IDs of the relevant objects). Eventually this flat JSON file may be replaced with a database, but for now this is a simple way to introduce the functionality. The collected resources JSON for core + staging is a ~3.2Mb JSON file (gzipped). When naively loaded into node it increases the total size of the allocated heap (V8) by ~60Mb (presumably this would be reduced by mapping certain string constants to variables). Currently only working for S3 buckets nextstrain-data and nextstrain-staging. Narratives are not yet considered, in part because they are not stored on S3. `node resourceIndexer/main.js --help` for how to run. AWS credentials with permission to read s3://nextstrain-inventories will need to be set in the usual way. 
--- .gitignore | 3 + package-lock.json | 494 ++++++++++++++++++++++++++++++- package.json | 4 +- resourceIndexer/constants.js | 35 +++ resourceIndexer/coreStagingS3.js | 296 ++++++++++++++++++ resourceIndexer/errors.js | 1 + resourceIndexer/inventory.js | 259 ++++++++++++++++ resourceIndexer/logger.js | 10 + resourceIndexer/main.js | 120 ++++++++ test/inventory_parsing.test.js | 57 ++++ 10 files changed, 1266 insertions(+), 13 deletions(-) create mode 100644 resourceIndexer/constants.js create mode 100644 resourceIndexer/coreStagingS3.js create mode 100644 resourceIndexer/errors.js create mode 100644 resourceIndexer/inventory.js create mode 100644 resourceIndexer/logger.js create mode 100644 resourceIndexer/main.js create mode 100644 test/inventory_parsing.test.js diff --git a/.gitignore b/.gitignore index 8d0b1a69a..564a660f2 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,9 @@ # local key-value storage (e.g. when Redis isn't configured) /data/kv.db +# data caches etc for development purposes +/devData/ + # Generated by scripts/collect-datasets.js /data/datasets_influenza.json /data/datasets_staging.json diff --git a/package-lock.json b/package-lock.json index 0d7be3deb..e5cd2691d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -34,9 +34,11 @@ "jszip": "^3.10.1", "keyv": "^4.5.4", "lodash.partition": "^4.6.0", + "luxon": "^3.4.4", "make-fetch-happen": "^10.0.0", "marked": "^0.7.0", "mime": "^2.5.2", + "neat-csv": "^7.0.0", "negotiator": "^0.6.2", "node-fetch": "^2.6.0", "passport": "^0.4.0", @@ -45,6 +47,7 @@ "proxy-agent": "^6.3.1", "raw-body": "^2.4.2", "session-file-store": "^1.3.1", + "winston": "^3.11.0", "yaml-front-matter": "^4.0.0" }, "devDependencies": { @@ -60,7 +63,6 @@ "http-proxy-middleware": "^1.3.1", "jest": "^27.5.1", "jest-extended": "^1.1.0", - "luxon": "^3.0.4", "nodemon": "^2.0.22", "request": "^2.88.2", "start-server-and-test": "^1.11.4" @@ -4563,6 +4565,24 @@ "integrity": 
"sha512-0hYQ8SB4Db5zvZB4axdMHGwEaQjkZzFjQiN9LVYvIFB2nSUHW9tYpxWriPrWDASIxiaXax83REcLxuSdnGPZtw==", "dev": true }, + "node_modules/@colors/colors": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/@colors/colors/-/colors-1.6.0.tgz", + "integrity": "sha512-Ir+AOibqzrIsL6ajt3Rz3LskB7OiMVHqltZmspbW/TJuTVuyOMirVqAkjfY6JISiLHgyNqicAC8AyHHGzNd/dA==", + "engines": { + "node": ">=0.1.90" + } + }, + "node_modules/@dabh/diagnostics": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/@dabh/diagnostics/-/diagnostics-2.0.3.tgz", + "integrity": "sha512-hrlQOIi7hAfzsMqlGSFyVucrx38O+j6wiGOf//H2ecvIEqYN4ADBSS2iLMh5UFyDunCNniUIPk/q3riFv45xRA==", + "dependencies": { + "colorspace": "1.1.x", + "enabled": "2.0.x", + "kuler": "^2.0.0" + } + }, "node_modules/@eslint/eslintrc": { "version": "1.3.2", "resolved": "https://registry.npmjs.org/@eslint/eslintrc/-/eslintrc-1.3.2.tgz", @@ -8330,6 +8350,11 @@ "integrity": "sha512-Hl219/BT5fLAaz6NDkSuhzasy49dwQS/DSdu4MdggFB8zcXv7vflBI3xp7FEmkmdDkBUI2bPUNeMttp2knYdxw==", "dev": true }, + "node_modules/@types/triple-beam": { + "version": "1.3.5", + "resolved": "https://registry.npmjs.org/@types/triple-beam/-/triple-beam-1.3.5.tgz", + "integrity": "sha512-6WaYesThRMCl19iryMYP7/x2OVgCtbIVflDGFpWnb9irXI3UjYE4AzmYuiUKY1AJstGijoY+MgUszMgRxIYTYw==" + }, "node_modules/@types/ws": { "version": "8.5.3", "resolved": "https://registry.npmjs.org/@types/ws/-/ws-8.5.3.tgz", @@ -8836,6 +8861,11 @@ "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.2.tgz", "integrity": "sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==" }, + "node_modules/async": { + "version": "3.2.5", + "resolved": "https://registry.npmjs.org/async/-/async-3.2.5.tgz", + "integrity": "sha512-baNZyqaaLhyLVKm/DlvdW051MSgO6b8eVfIezl9E5PqWxFgzLm/wQntEW4zOytVburDEr0JlALEpdOFwvErLsg==" + }, "node_modules/async-limiter": { "version": "1.0.1", "resolved": 
"https://registry.npmjs.org/async-limiter/-/async-limiter-1.0.1.tgz", @@ -10250,6 +10280,15 @@ "integrity": "sha512-iBPtljfCNcTKNAto0KEtDfZ3qzjJvqE3aTGZsbhjSBlorqpXJlaWWtPO35D+ZImoC3KWejX64o+yPGxhWSTzfg==", "dev": true }, + "node_modules/color": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/color/-/color-3.2.1.tgz", + "integrity": "sha512-aBl7dZI9ENN6fUGC7mWpMTPNHmWUSNan9tuWN6ahh5ZLNk9baLJOnSMlrQkHcrfFgz2/RigjUVAjdx36VcemKA==", + "dependencies": { + "color-convert": "^1.9.3", + "color-string": "^1.6.0" + } + }, "node_modules/color-convert": { "version": "1.9.3", "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz", @@ -10263,6 +10302,15 @@ "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz", "integrity": "sha1-p9BVi9icQveV3UIyj3QIMcpTvCU=" }, + "node_modules/color-string": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/color-string/-/color-string-1.9.1.tgz", + "integrity": "sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==", + "dependencies": { + "color-name": "^1.0.0", + "simple-swizzle": "^0.2.2" + } + }, "node_modules/color-support": { "version": "1.1.3", "resolved": "https://registry.npmjs.org/color-support/-/color-support-1.1.3.tgz", @@ -10271,6 +10319,15 @@ "color-support": "bin.js" } }, + "node_modules/colorspace": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/colorspace/-/colorspace-1.1.4.tgz", + "integrity": "sha512-BgvKJiuVu1igBUF2kEjRCZXol6wiiGbY5ipL/oVPwm0BL9sIpMIzM8IK7vwuxIIzOXMV3Ey5w+vxhm0rR/TN8w==", + "dependencies": { + "color": "^3.1.3", + "text-hex": "1.0.x" + } + }, "node_modules/combined-stream": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", @@ -10476,6 +10533,28 @@ "integrity": "sha512-b0tGHbfegbhPJpxpiBPU2sCkigAqtM9O121le6bbOlgyV+NyGyCmVfJ6QW9eRjz8CpNfWEOYBIMIGRYkLwsIYg==", "dev": true }, + "node_modules/csv-parser": { + 
"version": "3.0.0", + "resolved": "https://registry.npmjs.org/csv-parser/-/csv-parser-3.0.0.tgz", + "integrity": "sha512-s6OYSXAK3IdKqYO33y09jhypG/bSDHPuyCme/IdEHfWpLf/jKcpitVFyOC6UemgGk8v7Q5u2XE0vvwmanxhGlQ==", + "dependencies": { + "minimist": "^1.2.0" + }, + "bin": { + "csv-parser": "bin/csv-parser" + }, + "engines": { + "node": ">= 10" + } + }, + "node_modules/csv-parser/node_modules/minimist": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", + "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/dashdash": { "version": "1.14.1", "resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz", @@ -10802,6 +10881,11 @@ "url": "https://github.com/sindresorhus/emittery?sponsor=1" } }, + "node_modules/enabled": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/enabled/-/enabled-2.0.0.tgz", + "integrity": "sha512-AKrN98kuwOzMIdAizXGI86UFBoo26CL21UM763y1h/GMSJ4/OHU9k2YlsmBpyScFo/wbLzWQJBMCW4+IO3/+OQ==" + }, "node_modules/encodeurl": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz", @@ -11858,6 +11942,11 @@ "bser": "2.1.1" } }, + "node_modules/fecha": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/fecha/-/fecha-4.2.3.tgz", + "integrity": "sha512-OP2IUU6HeYKJi3i0z4A19kHMQoLVs4Hc+DPqqxI2h/DPZHTm/vjsfC6P0b4jCMy14XizLBqvndQ+UilD7707Jw==" + }, "node_modules/file-entry-cache": { "version": "6.0.1", "resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-6.0.1.tgz", @@ -12006,6 +12095,11 @@ "integrity": "sha512-5nqDSxl8nn5BSNxyR3n4I6eDmbolI6WT+QqR547RwxQapgjQBmtktdP+HTBb/a/zLsbzERTONyUB5pefh5TtjQ==", "dev": true }, + "node_modules/fn.name": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/fn.name/-/fn.name-1.1.0.tgz", + "integrity": 
"sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw==" + }, "node_modules/follow-redirects": { "version": "1.5.10", "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.5.10.tgz", @@ -12976,7 +13070,6 @@ "version": "2.0.1", "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz", "integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==", - "dev": true, "engines": { "node": ">=8" }, @@ -15746,6 +15839,11 @@ "node": ">=6" } }, + "node_modules/kuler": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/kuler/-/kuler-2.0.0.tgz", + "integrity": "sha512-Xq9nH7KlWZmXAtodXDDRE7vs6DU1gTU8zYDHDiWLSip45Egwq3plLHzPn27NgvzL2r1LMPC1vdqh98sQxtqj4A==" + }, "node_modules/lazy-ass": { "version": "1.6.0", "resolved": "https://registry.npmjs.org/lazy-ass/-/lazy-ass-1.6.0.tgz", @@ -15866,6 +15964,27 @@ "resolved": "https://registry.npmjs.org/lodash.partition/-/lodash.partition-4.6.0.tgz", "integrity": "sha1-o45GtzRp4EILDaEhLmbUFL42S6Q=" }, + "node_modules/logform": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/logform/-/logform-2.6.0.tgz", + "integrity": "sha512-1ulHeNPp6k/LD8H91o7VYFBng5i1BDE7HoKxVbZiGFidS1Rj65qcywLxX+pVfAPoQJEjRdvKcusKwOupHCVOVQ==", + "dependencies": { + "@colors/colors": "1.6.0", + "@types/triple-beam": "^1.3.2", + "fecha": "^4.2.0", + "ms": "^2.1.1", + "safe-stable-stringify": "^2.3.1", + "triple-beam": "^1.3.0" + }, + "engines": { + "node": ">= 12.0.0" + } + }, + "node_modules/logform/node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==" + }, "node_modules/lowercase-keys": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-1.0.1.tgz", @@ -15883,10 +16002,9 @@ } }, "node_modules/luxon": { - 
"version": "3.0.4", - "resolved": "https://registry.npmjs.org/luxon/-/luxon-3.0.4.tgz", - "integrity": "sha512-aV48rGUwP/Vydn8HT+5cdr26YYQiUZ42NM6ToMoaGKwYfWbfLeRkEu1wXWMHBZT6+KyLfcbbtVcoQFCbbPjKlw==", - "dev": true, + "version": "3.4.4", + "resolved": "https://registry.npmjs.org/luxon/-/luxon-3.4.4.tgz", + "integrity": "sha512-zobTr7akeGHnv7eBOXcRgMeCP6+uyYsczwmeRCauvpvaAltgNyTbLH/+VaEAPUeWBT+1GuNmz4wC/6jtQzbbVA==", "engines": { "node": ">=12" } @@ -16552,6 +16670,32 @@ "integrity": "sha1-Sr6/7tdUHywnrPspvbvRXI1bpPc=", "dev": true }, + "node_modules/neat-csv": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/neat-csv/-/neat-csv-7.0.0.tgz", + "integrity": "sha512-ZmiKZNkdqb6hrBU3lDHm52vWXs6CuFPfw6ZoJZNnY7IIpfA1fxM0UPPi+iQpqQo82qcLbsZPwLkQ1cdrMDtwwA==", + "dependencies": { + "csv-parser": "^3.0.0", + "get-stream": "^6.0.1" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/neat-csv/node_modules/get-stream": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-6.0.1.tgz", + "integrity": "sha512-ts6Wi+2j3jQjqi70w5AlN8DFnkSwC+MqmxEzdEALB2qXZYV3X/b1CTfgPLGJNMeAWxdPfU8FO1ms3NUfaHCPYg==", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/negotiator": { "version": "0.6.2", "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.6.2.tgz", @@ -17083,6 +17227,14 @@ "wrappy": "1" } }, + "node_modules/one-time": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/one-time/-/one-time-1.0.0.tgz", + "integrity": "sha512-5DXOiRKwuSEcQ/l0kGCF6Q3jcADFv5tSmRaJck/OqkVFcOzutB134KRSfF0xDrL39MNnqxbHBbUUcjZIhTgb2g==", + "dependencies": { + "fn.name": "1.x.x" + } + }, "node_modules/onetime": { "version": "5.1.2", "resolved": "https://registry.npmjs.org/onetime/-/onetime-5.1.2.tgz", @@ -18199,6 +18351,14 @@ "resolved": 
"https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" }, + "node_modules/safe-stable-stringify": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/safe-stable-stringify/-/safe-stable-stringify-2.4.3.tgz", + "integrity": "sha512-e2bDA2WJT0wxseVd4lsDP4+3ONX6HpMXQa1ZhFQ7SU+GjvORCmShbCMltrtIDfkYhVHrOcPtj+KhmDBdPdZD1g==", + "engines": { + "node": ">=10" + } + }, "node_modules/safer-buffer": { "version": "2.1.2", "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", @@ -18396,6 +18556,19 @@ "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz", "integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==" }, + "node_modules/simple-swizzle": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz", + "integrity": "sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==", + "dependencies": { + "is-arrayish": "^0.3.1" + } + }, + "node_modules/simple-swizzle/node_modules/is-arrayish": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz", + "integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==" + }, "node_modules/simple-update-notifier": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/simple-update-notifier/-/simple-update-notifier-1.1.0.tgz", @@ -18639,6 +18812,14 @@ "node": ">= 8" } }, + "node_modules/stack-trace": { + "version": "0.0.10", + "resolved": "https://registry.npmjs.org/stack-trace/-/stack-trace-0.0.10.tgz", + "integrity": "sha512-KGzahc7puUKkzyMt+IqAep+TVNbKP+k2Lmwhub39m1AsTSkaDutx56aDCo+HLDzf/D26BIHTJWNiTG1KAJiQCg==", + "engines": { + "node": "*" + } + }, "node_modules/stack-utils": { "version": "2.0.2", "resolved": 
"https://registry.npmjs.org/stack-utils/-/stack-utils-2.0.2.tgz", @@ -19115,6 +19296,11 @@ "url": "https://github.com/sponsors/isaacs" } }, + "node_modules/text-hex": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz", + "integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==" + }, "node_modules/text-table": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz", @@ -19234,6 +19420,14 @@ "node": "*" } }, + "node_modules/triple-beam": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/triple-beam/-/triple-beam-1.4.1.tgz", + "integrity": "sha512-aZbgViZrg1QNcG+LULa7nhZpJTZSLm/mXnHXnbAbjmN5aSa0y7V+wvv6+4WaBtpISJzThKy+PIPxc1Nq1EJ9mg==", + "engines": { + "node": ">= 14.0.0" + } + }, "node_modules/tslib": { "version": "1.10.0", "resolved": "https://registry.npmjs.org/tslib/-/tslib-1.10.0.tgz", @@ -19731,6 +19925,66 @@ "node": ">= 0.10.0" } }, + "node_modules/winston": { + "version": "3.11.0", + "resolved": "https://registry.npmjs.org/winston/-/winston-3.11.0.tgz", + "integrity": "sha512-L3yR6/MzZAOl0DsysUXHVjOwv8mKZ71TrA/41EIduGpOOV5LQVodqN+QdQ6BS6PJ/RdIshZhq84P/fStEZkk7g==", + "dependencies": { + "@colors/colors": "^1.6.0", + "@dabh/diagnostics": "^2.0.2", + "async": "^3.2.3", + "is-stream": "^2.0.0", + "logform": "^2.4.0", + "one-time": "^1.0.0", + "readable-stream": "^3.4.0", + "safe-stable-stringify": "^2.3.1", + "stack-trace": "0.0.x", + "triple-beam": "^1.3.0", + "winston-transport": "^4.5.0" + }, + "engines": { + "node": ">= 12.0.0" + } + }, + "node_modules/winston-transport": { + "version": "4.6.0", + "resolved": "https://registry.npmjs.org/winston-transport/-/winston-transport-4.6.0.tgz", + "integrity": "sha512-wbBA9PbPAHxKiygo7ub7BYRiKxms0tpfU2ljtWzb3SjRjv5yl6Ozuy/TkXf00HTAt+Uylo3gSkNwzc4ME0wiIg==", + "dependencies": { + "logform": "^2.3.2", + "readable-stream": "^3.6.0", + "triple-beam": "^1.3.0" + }, + 
"engines": { + "node": ">= 12.0.0" + } + }, + "node_modules/winston-transport/node_modules/readable-stream": { + "version": "3.6.2", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", + "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==", + "dependencies": { + "inherits": "^2.0.3", + "string_decoder": "^1.1.1", + "util-deprecate": "^1.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/winston/node_modules/readable-stream": { + "version": "3.6.2", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", + "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==", + "dependencies": { + "inherits": "^2.0.3", + "string_decoder": "^1.1.1", + "util-deprecate": "^1.0.1" + }, + "engines": { + "node": ">= 6" + } + }, "node_modules/word-wrap": { "version": "1.2.3", "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.3.tgz", @@ -23780,6 +24034,21 @@ "integrity": "sha512-0hYQ8SB4Db5zvZB4axdMHGwEaQjkZzFjQiN9LVYvIFB2nSUHW9tYpxWriPrWDASIxiaXax83REcLxuSdnGPZtw==", "dev": true }, + "@colors/colors": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/@colors/colors/-/colors-1.6.0.tgz", + "integrity": "sha512-Ir+AOibqzrIsL6ajt3Rz3LskB7OiMVHqltZmspbW/TJuTVuyOMirVqAkjfY6JISiLHgyNqicAC8AyHHGzNd/dA==" + }, + "@dabh/diagnostics": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/@dabh/diagnostics/-/diagnostics-2.0.3.tgz", + "integrity": "sha512-hrlQOIi7hAfzsMqlGSFyVucrx38O+j6wiGOf//H2ecvIEqYN4ADBSS2iLMh5UFyDunCNniUIPk/q3riFv45xRA==", + "requires": { + "colorspace": "1.1.x", + "enabled": "2.0.x", + "kuler": "^2.0.0" + } + }, "@eslint/eslintrc": { "version": "1.3.2", "resolved": "https://registry.npmjs.org/@eslint/eslintrc/-/eslintrc-1.3.2.tgz", @@ -26886,6 +27155,11 @@ "integrity": 
"sha512-Hl219/BT5fLAaz6NDkSuhzasy49dwQS/DSdu4MdggFB8zcXv7vflBI3xp7FEmkmdDkBUI2bPUNeMttp2knYdxw==", "dev": true }, + "@types/triple-beam": { + "version": "1.3.5", + "resolved": "https://registry.npmjs.org/@types/triple-beam/-/triple-beam-1.3.5.tgz", + "integrity": "sha512-6WaYesThRMCl19iryMYP7/x2OVgCtbIVflDGFpWnb9irXI3UjYE4AzmYuiUKY1AJstGijoY+MgUszMgRxIYTYw==" + }, "@types/ws": { "version": "8.5.3", "resolved": "https://registry.npmjs.org/@types/ws/-/ws-8.5.3.tgz", @@ -27273,6 +27547,11 @@ } } }, + "async": { + "version": "3.2.5", + "resolved": "https://registry.npmjs.org/async/-/async-3.2.5.tgz", + "integrity": "sha512-baNZyqaaLhyLVKm/DlvdW051MSgO6b8eVfIezl9E5PqWxFgzLm/wQntEW4zOytVburDEr0JlALEpdOFwvErLsg==" + }, "async-limiter": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/async-limiter/-/async-limiter-1.0.1.tgz", @@ -28389,6 +28668,15 @@ "integrity": "sha512-iBPtljfCNcTKNAto0KEtDfZ3qzjJvqE3aTGZsbhjSBlorqpXJlaWWtPO35D+ZImoC3KWejX64o+yPGxhWSTzfg==", "dev": true }, + "color": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/color/-/color-3.2.1.tgz", + "integrity": "sha512-aBl7dZI9ENN6fUGC7mWpMTPNHmWUSNan9tuWN6ahh5ZLNk9baLJOnSMlrQkHcrfFgz2/RigjUVAjdx36VcemKA==", + "requires": { + "color-convert": "^1.9.3", + "color-string": "^1.6.0" + } + }, "color-convert": { "version": "1.9.3", "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.3.tgz", @@ -28402,11 +28690,29 @@ "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.3.tgz", "integrity": "sha1-p9BVi9icQveV3UIyj3QIMcpTvCU=" }, + "color-string": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/color-string/-/color-string-1.9.1.tgz", + "integrity": "sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==", + "requires": { + "color-name": "^1.0.0", + "simple-swizzle": "^0.2.2" + } + }, "color-support": { "version": "1.1.3", "resolved": 
"https://registry.npmjs.org/color-support/-/color-support-1.1.3.tgz", "integrity": "sha512-qiBjkpbMLO/HL68y+lh4q0/O1MZFj2RX6X/KmMa3+gJD3z+WwI1ZzDHysvqHGS3mP6mznPckpXmw1nI9cJjyRg==" }, + "colorspace": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/colorspace/-/colorspace-1.1.4.tgz", + "integrity": "sha512-BgvKJiuVu1igBUF2kEjRCZXol6wiiGbY5ipL/oVPwm0BL9sIpMIzM8IK7vwuxIIzOXMV3Ey5w+vxhm0rR/TN8w==", + "requires": { + "color": "^3.1.3", + "text-hex": "1.0.x" + } + }, "combined-stream": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", @@ -28582,6 +28888,21 @@ } } }, + "csv-parser": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/csv-parser/-/csv-parser-3.0.0.tgz", + "integrity": "sha512-s6OYSXAK3IdKqYO33y09jhypG/bSDHPuyCme/IdEHfWpLf/jKcpitVFyOC6UemgGk8v7Q5u2XE0vvwmanxhGlQ==", + "requires": { + "minimist": "^1.2.0" + }, + "dependencies": { + "minimist": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", + "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==" + } + } + }, "dashdash": { "version": "1.14.1", "resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz", @@ -28835,6 +29156,11 @@ "integrity": "sha512-uDfvUjVrfGJJhymx/kz6prltenw1u7WrCg1oa94zYY8xxVpLLUu045LAT0dhDZdXG58/EpPL/5kA180fQ/qudg==", "dev": true }, + "enabled": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/enabled/-/enabled-2.0.0.tgz", + "integrity": "sha512-AKrN98kuwOzMIdAizXGI86UFBoo26CL21UM763y1h/GMSJ4/OHU9k2YlsmBpyScFo/wbLzWQJBMCW4+IO3/+OQ==" + }, "encodeurl": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-1.0.2.tgz", @@ -29622,6 +29948,11 @@ "bser": "2.1.1" } }, + "fecha": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/fecha/-/fecha-4.2.3.tgz", + "integrity": 
"sha512-OP2IUU6HeYKJi3i0z4A19kHMQoLVs4Hc+DPqqxI2h/DPZHTm/vjsfC6P0b4jCMy14XizLBqvndQ+UilD7707Jw==" + }, "file-entry-cache": { "version": "6.0.1", "resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-6.0.1.tgz", @@ -29737,6 +30068,11 @@ "integrity": "sha512-5nqDSxl8nn5BSNxyR3n4I6eDmbolI6WT+QqR547RwxQapgjQBmtktdP+HTBb/a/zLsbzERTONyUB5pefh5TtjQ==", "dev": true }, + "fn.name": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/fn.name/-/fn.name-1.1.0.tgz", + "integrity": "sha512-GRnmB5gPyJpAhTQdSZTSp9uaPSvl09KoYcMQtsB9rQoOmzs9dH6ffeccH+Z+cv6P68Hu5bC6JjRh4Ah/mHSNRw==" + }, "follow-redirects": { "version": "1.5.10", "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.5.10.tgz", @@ -30498,8 +30834,7 @@ "is-stream": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz", - "integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==", - "dev": true + "integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==" }, "is-typedarray": { "version": "1.0.0", @@ -32599,6 +32934,11 @@ "integrity": "sha512-eTIzlVOSUR+JxdDFepEYcBMtZ9Qqdef+rnzWdRZuMbOywu5tO2w2N7rqjoANZ5k9vywhL6Br1VRjUIgTQx4E8w==", "dev": true }, + "kuler": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/kuler/-/kuler-2.0.0.tgz", + "integrity": "sha512-Xq9nH7KlWZmXAtodXDDRE7vs6DU1gTU8zYDHDiWLSip45Egwq3plLHzPn27NgvzL2r1LMPC1vdqh98sQxtqj4A==" + }, "lazy-ass": { "version": "1.6.0", "resolved": "https://registry.npmjs.org/lazy-ass/-/lazy-ass-1.6.0.tgz", @@ -32704,6 +33044,26 @@ "resolved": "https://registry.npmjs.org/lodash.partition/-/lodash.partition-4.6.0.tgz", "integrity": "sha1-o45GtzRp4EILDaEhLmbUFL42S6Q=" }, + "logform": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/logform/-/logform-2.6.0.tgz", + "integrity": 
"sha512-1ulHeNPp6k/LD8H91o7VYFBng5i1BDE7HoKxVbZiGFidS1Rj65qcywLxX+pVfAPoQJEjRdvKcusKwOupHCVOVQ==", + "requires": { + "@colors/colors": "1.6.0", + "@types/triple-beam": "^1.3.2", + "fecha": "^4.2.0", + "ms": "^2.1.1", + "safe-stable-stringify": "^2.3.1", + "triple-beam": "^1.3.0" + }, + "dependencies": { + "ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==" + } + } + }, "lowercase-keys": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-1.0.1.tgz", @@ -32715,10 +33075,9 @@ "integrity": "sha512-jumlc0BIUrS3qJGgIkWZsyfAM7NCWiBcCDhnd+3NNM5KbBmLTgHVfWBcg6W+rLUsIpzpERPsvwUP7CckAQSOoA==" }, "luxon": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/luxon/-/luxon-3.0.4.tgz", - "integrity": "sha512-aV48rGUwP/Vydn8HT+5cdr26YYQiUZ42NM6ToMoaGKwYfWbfLeRkEu1wXWMHBZT6+KyLfcbbtVcoQFCbbPjKlw==", - "dev": true + "version": "3.4.4", + "resolved": "https://registry.npmjs.org/luxon/-/luxon-3.4.4.tgz", + "integrity": "sha512-zobTr7akeGHnv7eBOXcRgMeCP6+uyYsczwmeRCauvpvaAltgNyTbLH/+VaEAPUeWBT+1GuNmz4wC/6jtQzbbVA==" }, "make-dir": { "version": "3.1.0", @@ -33232,6 +33591,22 @@ "integrity": "sha1-Sr6/7tdUHywnrPspvbvRXI1bpPc=", "dev": true }, + "neat-csv": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/neat-csv/-/neat-csv-7.0.0.tgz", + "integrity": "sha512-ZmiKZNkdqb6hrBU3lDHm52vWXs6CuFPfw6ZoJZNnY7IIpfA1fxM0UPPi+iQpqQo82qcLbsZPwLkQ1cdrMDtwwA==", + "requires": { + "csv-parser": "^3.0.0", + "get-stream": "^6.0.1" + }, + "dependencies": { + "get-stream": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-6.0.1.tgz", + "integrity": "sha512-ts6Wi+2j3jQjqi70w5AlN8DFnkSwC+MqmxEzdEALB2qXZYV3X/b1CTfgPLGJNMeAWxdPfU8FO1ms3NUfaHCPYg==" + } + } + }, "negotiator": { "version": "0.6.2", "resolved": 
"https://registry.npmjs.org/negotiator/-/negotiator-0.6.2.tgz", @@ -33646,6 +34021,14 @@ "wrappy": "1" } }, + "one-time": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/one-time/-/one-time-1.0.0.tgz", + "integrity": "sha512-5DXOiRKwuSEcQ/l0kGCF6Q3jcADFv5tSmRaJck/OqkVFcOzutB134KRSfF0xDrL39MNnqxbHBbUUcjZIhTgb2g==", + "requires": { + "fn.name": "1.x.x" + } + }, "onetime": { "version": "5.1.2", "resolved": "https://registry.npmjs.org/onetime/-/onetime-5.1.2.tgz", @@ -34491,6 +34874,11 @@ "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" }, + "safe-stable-stringify": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/safe-stable-stringify/-/safe-stable-stringify-2.4.3.tgz", + "integrity": "sha512-e2bDA2WJT0wxseVd4lsDP4+3ONX6HpMXQa1ZhFQ7SU+GjvORCmShbCMltrtIDfkYhVHrOcPtj+KhmDBdPdZD1g==" + }, "safer-buffer": { "version": "2.1.2", "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", @@ -34657,6 +35045,21 @@ "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.7.tgz", "integrity": "sha512-wnD2ZE+l+SPC/uoS0vXeE9L1+0wuaMqKlfz9AMUo38JsyLSBWSFcHR1Rri62LZc12vLr1gb3jl7iwQhgwpAbGQ==" }, + "simple-swizzle": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz", + "integrity": "sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==", + "requires": { + "is-arrayish": "^0.3.1" + }, + "dependencies": { + "is-arrayish": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz", + "integrity": "sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==" + } + } + }, "simple-update-notifier": { "version": "1.1.0", "resolved": 
"https://registry.npmjs.org/simple-update-notifier/-/simple-update-notifier-1.1.0.tgz", @@ -34845,6 +35248,11 @@ "minipass": "^3.1.1" } }, + "stack-trace": { + "version": "0.0.10", + "resolved": "https://registry.npmjs.org/stack-trace/-/stack-trace-0.0.10.tgz", + "integrity": "sha512-KGzahc7puUKkzyMt+IqAep+TVNbKP+k2Lmwhub39m1AsTSkaDutx56aDCo+HLDzf/D26BIHTJWNiTG1KAJiQCg==" + }, "stack-utils": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/stack-utils/-/stack-utils-2.0.2.tgz", @@ -35196,6 +35604,11 @@ } } }, + "text-hex": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz", + "integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg==" + }, "text-table": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz", @@ -35290,6 +35703,11 @@ "optional": true, "peer": true }, + "triple-beam": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/triple-beam/-/triple-beam-1.4.1.tgz", + "integrity": "sha512-aZbgViZrg1QNcG+LULa7nhZpJTZSLm/mXnHXnbAbjmN5aSa0y7V+wvv6+4WaBtpISJzThKy+PIPxc1Nq1EJ9mg==" + }, "tslib": { "version": "1.10.0", "resolved": "https://registry.npmjs.org/tslib/-/tslib-1.10.0.tgz", @@ -35684,6 +36102,58 @@ "optional": true, "peer": true }, + "winston": { + "version": "3.11.0", + "resolved": "https://registry.npmjs.org/winston/-/winston-3.11.0.tgz", + "integrity": "sha512-L3yR6/MzZAOl0DsysUXHVjOwv8mKZ71TrA/41EIduGpOOV5LQVodqN+QdQ6BS6PJ/RdIshZhq84P/fStEZkk7g==", + "requires": { + "@colors/colors": "^1.6.0", + "@dabh/diagnostics": "^2.0.2", + "async": "^3.2.3", + "is-stream": "^2.0.0", + "logform": "^2.4.0", + "one-time": "^1.0.0", + "readable-stream": "^3.4.0", + "safe-stable-stringify": "^2.3.1", + "stack-trace": "0.0.x", + "triple-beam": "^1.3.0", + "winston-transport": "^4.5.0" + }, + "dependencies": { + "readable-stream": { + "version": "3.6.2", + "resolved": 
"https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", + "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==", + "requires": { + "inherits": "^2.0.3", + "string_decoder": "^1.1.1", + "util-deprecate": "^1.0.1" + } + } + } + }, + "winston-transport": { + "version": "4.6.0", + "resolved": "https://registry.npmjs.org/winston-transport/-/winston-transport-4.6.0.tgz", + "integrity": "sha512-wbBA9PbPAHxKiygo7ub7BYRiKxms0tpfU2ljtWzb3SjRjv5yl6Ozuy/TkXf00HTAt+Uylo3gSkNwzc4ME0wiIg==", + "requires": { + "logform": "^2.3.2", + "readable-stream": "^3.6.0", + "triple-beam": "^1.3.0" + }, + "dependencies": { + "readable-stream": { + "version": "3.6.2", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", + "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==", + "requires": { + "inherits": "^2.0.3", + "string_decoder": "^1.1.1", + "util-deprecate": "^1.0.1" + } + } + } + }, "word-wrap": { "version": "1.2.3", "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.3.tgz", diff --git a/package.json b/package.json index 2d52a9522..c4c35d8fe 100644 --- a/package.json +++ b/package.json @@ -50,9 +50,11 @@ "jszip": "^3.10.1", "keyv": "^4.5.4", "lodash.partition": "^4.6.0", + "luxon": "^3.4.4", "make-fetch-happen": "^10.0.0", "marked": "^0.7.0", "mime": "^2.5.2", + "neat-csv": "^7.0.0", "negotiator": "^0.6.2", "node-fetch": "^2.6.0", "passport": "^0.4.0", @@ -61,6 +63,7 @@ "proxy-agent": "^6.3.1", "raw-body": "^2.4.2", "session-file-store": "^1.3.1", + "winston": "^3.11.0", "yaml-front-matter": "^4.0.0" }, "devDependencies": { @@ -76,7 +79,6 @@ "http-proxy-middleware": "^1.3.1", "jest": "^27.5.1", "jest-extended": "^1.1.0", - "luxon": "^3.0.4", "nodemon": "^2.0.22", "request": "^2.88.2", "start-server-and-test": "^1.11.4" diff --git a/resourceIndexer/constants.js b/resourceIndexer/constants.js new file mode 
100644 index 000000000..9ec0404e7 --- /dev/null +++ b/resourceIndexer/constants.js @@ -0,0 +1,35 @@ +export const DATESTAMP_REGEX = /_\d{4}-\d{2}-\d{2}/; + +export const INVALID_AUSPICE_PATTERNS = [/_seq\.json$/, /_sequences\.json$/, /_entropy\.json$/, /_titers\.json$/]; + +/** + * These patterns can be used to classify files which are potentially valid + * auspice files. They should be checked in order, with the first match winning. + * + * Each entry is a tuple of [subresource type, regex match pattern]. + * + * The subresource type (string) is the same as that used internally in the + * server code (used when instantiating (sub-)classes of `Subresource`) + */ +export const VALID_AUSPICE_PATTERNS = [ + ["root-sequence", /_root-sequence\.json$/], + ["tip-frequencies", /_tip-frequencies\.json$/], + ["measurements", /_measurements\.json$/], + ["meta", /_meta\.json$/], + ["tree", /_tree\.json$/], + ["main", /\.json$/], +] + +export const SIDECAR_TYPES = new Set( + VALID_AUSPICE_PATTERNS + .map(([subresourceType, ]) => subresourceType) + .filter((subresourceType) => !['main', 'meta', 'tree'].includes(subresourceType)) +) + +/** + * Following values taken to match the server's `sourceNameToClass`. + */ +export const SOURCE = { + CORE: "core", + STAGING: "staging", +} diff --git a/resourceIndexer/coreStagingS3.js b/resourceIndexer/coreStagingS3.js new file mode 100644 index 000000000..93542b145 --- /dev/null +++ b/resourceIndexer/coreStagingS3.js @@ -0,0 +1,296 @@ +import { SOURCE, VALID_AUSPICE_PATTERNS, INVALID_AUSPICE_PATTERNS, + DATESTAMP_REGEX, SIDECAR_TYPES } from './constants.js'; +import { collectInventory } from './inventory.js'; + +/** + * The inventory of buckets (especially the core bucket) is in some ways a + * historical record of work over the years, but this isn't really what we want + * to display to users. As some examples: + * - Files which don't match a resource to list should be excluded + * - Datestamped files (i.e. 
_YYYY-MM-DD in the filename) are excluded + * (we use S3 versioning instead) + * + * If the s3 object is to be excluded we return false here. + * + * In the case where the object represents a (part of) a resource we want to + * expose, then we categorise it here by adding the following properties: + * - source (STAGING or CORE) + * - resourceType (dataset, narrative or intermediate) + * - id (the ID by which objects will be grouped together. + * For datasets this is the nextstrain.org URL path, without any temporal signifier) + * - subresourceType (currently only for resourceType=dataset) + */ +function categoriseCoreObjects(item, staging) { + const key = item.key; + item.source = staging ? SOURCE.STAGING : SOURCE.CORE; + item.baseUrl = `https://${item.bucket}.s3.amazonaws.com/${encodeURI(key)}` + if (key.startsWith('search_') + || key.startsWith('manifest_') + || key.startsWith('datasets_') + ) return false; + + // On the core bucket, directory-like hierarchies are used for intermediate + // files. These intermediate files may include files which auspice can + // display, but nextstrain.org cannot map URLs to directory-like hierarchies. + // There are other resourceTypes here we may consider in the future -- e.g. + // model output JSONs + if (key.includes("/")) { + if (staging===true) return false; + if (key.startsWith('files/')) { + if ( + key.includes('/archive/') + || key.includes('/test/') + || key.includes('/workflows/') + || key.includes('/branch/') + || key.includes('/trial/') + || key.includes('/test-data/') + || key.includes('jen_test/') + || key.match(/\/nextclade-full-run-[\d-]+--UTC\//) + || key.match(/\/\d{4}-\d{2}-\d{2}_results.json/) // forecasts-ncov + || key.endsWith('.png') // forecasts-ncov + ) { + return false; + } + item.resourceType = 'intermediate'; + /* The ID is used for grouping. For a nextstrain.org dataset this would be + combined with the source to form a nextstrain URL, however that's not + applicable here. 
Instead we use the filepath information without the + leading 'files/' and without the (trailing) filename so that different + files in the same directory structure get grouped together. For instance, + files/ncov/open/x.json -> ncov/open */ + item.resourcePath = key.split('/').slice(1, -1).join('/') + return item; + } + return false; + } + + // Some filenames have a double underscore (presumably by mistake) + if (key.includes('__')) return false; + + // We don't have narratives on the core/staging buckets, so all that's left is + // to check if the key looks like a valid auspice file + const auspiceFileInfo = auspiceFile(key); + if (!auspiceFileInfo) return false + item.resourceType = 'dataset'; + item.subresourceType = auspiceFileInfo.subresourceType; + + /** + * Currently the resourcePath is based completely off the key name, + * paralleling how the nextstrain.org URLs of datasets are mapped to resource + * paths and then to S3 keys. In the future we may change this in order to + * group together files with different s3 key names but which we want to + * associate with the same nextstrain.org URL. For example, we may wish to + * combine the auspice datasets behind `ncov/gisaid/africa` and + * `ncov/gisaid/africa/all-time`. + */ + item.resourcePath = auspiceFileInfo.urlPath; + + return item; +} + +/** + * Returns false if the filename doesn't appear to be an auspice dataset/sidecar file + * Otherwise returns an object with properties subresourceType and urlPath + */ +function auspiceFile(filename) { + if (filename.match(DATESTAMP_REGEX)) return false; + for (const pattern of INVALID_AUSPICE_PATTERNS) { + if (filename.match(pattern)) return false; + } + for (const [type, pattern] of VALID_AUSPICE_PATTERNS) { + if (filename.match(pattern)) { + return { + subresourceType: type, + urlPath: filename.replace(pattern, '').replace(/_/g, '/'), + } + } + } + return false; +} + + +/** + * Given a list of items (i.e.
files) which appear to be valid components of a resource + * we want to group them into versioned resources. As an example, we may have + * - date: A, files: X_tree.json, X_meta.json + * - date: B, files: X_meta.json (invalid) + * - date: C, files: X.json + * - date: D, files: X.json, X_root-sequence.json + * - date: E, files: X_root-sequence.json (invalid) + * - date: F, files: X.json, X.json, X_root-sequence.json (valid, pick the newest X.json) + * and we want to produce a structure like: + * [ + * {date: F, versions: [{main: versionId, root-sequence: versionId}]}, + * {date: D, versions: [{main: versionId, root-sequence: versionId}]}, + * {date: C, versions: [{main: versionId}]}, + * {date: A, versions: [{v1-meta: versionId, v1-tree: versionId}]} + * ] + * + * The maximum temporal resolution is per-day, in other words if a resource was uploaded + * multiple times in a single day then only the last one is used. This matches our + * (implicit) expectation when we started using datestamped datasets during the ncov pandemic. + * It also covers the (somewhat common, I think) case where datasets were re-uploaded after + * an error / omission was noticed. + * + * The returned object may contain `versions:[]` (empty array) if no valid versions are found. + */ +function createVersionedResources(resourceType, id, items) { + const groupedByDate = items.reduce((acc, o) => { + const date = o.date; + if (acc.hasOwnProperty(date)) { // eslint-disable-line no-prototype-builtins + acc[date].push(o) + } else { + acc[date] = [o] + } + return acc; + }, {}); + + // Associate each of the files behind this dataset to its version ID + const versions = Object.entries(groupedByDate) + // sort the groups by the day (first entry: most recent) + .sort(([dateA, ], [dateB, ]) => dateA < dateB ? 1 : dateA > dateB ? -1 : 0) + // (re-)sort the objects within each day (first entry: most recent).
+ .map(([date, objects]) => [date, objects.sort((a, b) => b.timestamp - a.timestamp)]) + // convert the objects for each day into resource objects (or false) + .map(([date, objects]) => { + if (resourceType==='dataset') { + return validDataset(id, date, objects); + } else if (resourceType==='intermediate') { + return validIntermediate(id, date, objects); + } else { + throw new Error(`Unknown resourceType '${resourceType}' to create versioned resource from`) + } + }) + // remove days without a resource object (some days might have files but no valid dataset) + .filter((version) => !!version); + + const resource = {versions}; + return resource; +} + + +/** + * Given a set of files from the same _day_ (S3 keys) return the subset such + * that, taken together, they represent a dataset. Often a dataset will be + * uploaded multiple times in a single day (often to fix minor mistakes) and we + * only want to surface the last-updated dataset on the day. Note that each + * individual object provided here is a valid dataset-related file in its own right, + * but taken together the objects may not represent a valid dataset, or only a + * subset may represent a valid dataset. + * + * We take the first (i.e. most recent) occurrence of valid files. In theory we + * could have a situation where we take a sidecar file that wasn't intended to + * be grouped with the auspice json, but I think that's worth the + * simplifications it allows here. + */ +function validDataset(id, date, objects) { + /** + * The `subresources` object represents the maximal possible collection of + * subresources for this dataset. The keys are the subresource types, and the + * values are false (subresource doesn't exist) or the relevant s3 object. 
 */ + const subresources = Object.fromEntries( + VALID_AUSPICE_PATTERNS.map(([subresourceType, ]) => [subresourceType, false]) + ); + + const _firstItem = (type) => objects.find((o) => o.subresourceType===type); + + /** + * Take a v2 dataset over a v1 dataset _even if_ the v1 dataset was uploaded + * more recently. (This is not hypothetical - it is the case for /zika as of + * 2023-11-01.) This is almost certainly an unintentional situation, and the + * behaviour of the nextstrain.org server is to look for a v2 dataset and use + * that, regardless of whether a v1 dataset exists. + */ + const types = new Set(objects.map((o) => o.subresourceType)); + if (types.has('main')) { + subresources.main = _firstItem('main'); + } else if (types.has('meta') && types.has('tree')) { + subresources.meta = _firstItem('meta'); + subresources.tree = _firstItem('tree'); + } else { + /* It isn't unexpected to encounter days with auspice-like files but no + valid dataset. Looking at the core bucket in early 2024 identified ~70 such + occurrences. It seems like this is (mostly?) due to delete markers being + added for the main dataset (which causes the indexer to remove the + then-latest object) but not sidecars, and thus we observe a day where only + sidecars seem to exist. */ + return false; + } + + ([...types]).filter((subresourceType) => SIDECAR_TYPES.has(subresourceType)) + .forEach((subresourceType) => { + subresources[subresourceType] = _firstItem(subresourceType); + }) + + return { + date, + fileUrls: Object.fromEntries( + Object.entries(subresources).map(([subresourceType, s3object]) => { + if (!s3object.versionId) { // (bucket unversioned) + return [subresourceType, s3object.baseUrl] + } + return [subresourceType, `${s3object.baseUrl}?versionId=${encodeURIComponent(s3object.versionId)}`] + }) + ) + }; +} + +/** + * For a set of intermediate files (on a given day), return the subset to be + * represented by the resource.
We don't perform any filename-based pruning at + * the moment, so the files for the resource are everything on the bucket which + * was assigned the same ID - this includes the same "file" under different + * compression schemes (etc), as that results in a different filename (key). + * If multiple files exist on the same day the first (most recent) is taken. + */ +function validIntermediate(id, date, objects) { + const seenKeys = new Set(); + return { + date, + fileUrls: Object.fromEntries( + objects + .filter((o) => { + if (seenKeys.has(o.key)) return false; + seenKeys.add(o.key) + return true; + }) + .map((s3object) => { + const filename = s3object.key.split('/').pop(); + const url = s3object.versionId ? + `${s3object.baseUrl}?versionId=${encodeURIComponent(s3object.versionId)}` : + s3object.baseUrl; + return [filename, url] + }) + ) + }; +} + + +export const coreS3Data = { + name: 'core', + async collect({local}) { + return await collectInventory({ + name: this.name, + local, + inventoryBucket: "nextstrain-inventories", + inventoryPrefix: "nextstrain-data/config-v1/" + }) + }, + categorise: (item) => categoriseCoreObjects(item, false), + createResource: createVersionedResources +}; + +export const stagingS3Data = { + name: 'staging', + async collect({local}) { + return await collectInventory({ + name: this.name, + local, + inventoryBucket: "nextstrain-inventories", + inventoryPrefix: "nextstrain-staging/config-v1/" + }) + }, + categorise: (item) => categoriseCoreObjects(item, true), + createResource: createVersionedResources +}; diff --git a/resourceIndexer/errors.js b/resourceIndexer/errors.js new file mode 100644 index 000000000..7d5bff2cf --- /dev/null +++ b/resourceIndexer/errors.js @@ -0,0 +1 @@ +export class ResourceIndexerError extends Error {} diff --git a/resourceIndexer/inventory.js b/resourceIndexer/inventory.js new file mode 100644 index 000000000..731631c2a --- /dev/null +++ b/resourceIndexer/inventory.js @@ -0,0 +1,259 @@ +import * as fs from 
'node:fs/promises'; +import neatCsv from 'neat-csv'; +import zlib from 'zlib'; +import { promisify } from 'util'; +import AWS from 'aws-sdk'; +import {logger} from './logger.js'; +import { DateTime } from 'luxon'; +import escapeStringRegexp from 'escape-string-regexp'; +import { ResourceIndexerError } from './errors.js'; +const gunzip = promisify(zlib.gunzip) + +/** + * Fetches and reads the latest inventory from the provided bucket/prefix: + * - finds the most recent manifest.json via comparison of timestamps in keys + * - uses this manifest.json to get the schema + key of the actual inventory + * - gets the actual inventory & returns the data as an object[] with keys from the schema + * + * Note that we only read a maximum of 999 keys from the provided bucket+prefix. A typical inventory + * update adds ~4 keys, so this should allow for ~8 months of inventories. The bucket where inventories + * are stored should use lifecycles to expire objects. + * + * Returns an object with properties: + * - inventory: object[] list of entries in the inventory, using the schema to define keys + * - versionsExist: boolean are key versions present within the bucket? 
+ */ +const fetchInventoryRemote = async ({bucket, prefix, name}) => { + const S3 = new AWS.S3(); + const _prefix = escapeStringRegexp(prefix.replace(/\/*$/, "/")); + const manifestKeyPattern = new RegExp(`^${_prefix}\\d{4}-\\d{2}-\\d{2}T\\d{2}-\\d{2}Z/manifest\\.json$`); + const manifestKey = await new Promise((resolve, reject) => { + S3.listObjectsV2({Bucket: bucket, Prefix: prefix, MaxKeys: 999}, (err, data) => { + if (err) return reject(err); + const orderedKeys = data.Contents + .map((object) => object.Key) + .filter((key) => key.match(manifestKeyPattern)) + .sort() // keys are identical except for a YYYY-MM-DDTHH-MMZ timestamp within the key itself + .reverse(); // now sorted most recent object first + if (orderedKeys.length===0) reject("No valid inventory manifest.json found") + resolve(orderedKeys[0]) + }); + }); + logger.info(`inventory for ${name} - manifest key: ${manifestKey}`) + + const {schema, inventoryKey, versionsExist} = await S3.getObject({Bucket: bucket, Key: manifestKey}) + .promise() + .then((response) => _parseManifest(JSON.parse(response.Body.toString('utf-8')))); + + logger.info(`inventory for ${name} - parsed manifest JSON`) + + const inventory = await S3.getObject({Bucket: bucket, Key: inventoryKey}) + .promise() + .then((response) => gunzip(response.Body)) + .then((data) => neatCsv(data, schema)); + + logger.info(`inventory for ${name} - fetched ${inventory.length} rows`) + return {inventory, versionsExist}; +} + +/** + * Parse an on-disk inventory. This expects the following files to be present: + * - `./devData/${name}.manifest.json` + * - `./devData/${name}.inventory.csv.gz` + * + * Returns an object with properties: + * - inventory: object[] list of entries in the inventory, using the schema to define keys + * - versionsExist: boolean are key versions present within the bucket? 
+ */ +const fetchInventoryLocal = async ({name}) => { + const manifestPath = `./devData/${name}.manifest.json`; + const inventoryPath = `./devData/${name}.inventory.csv.gz`; + logger.info(`inventory for ${name} -- reading S3 inventories from ${manifestPath} and ${inventoryPath}`); + const manifest = JSON.parse(await fs.readFile(manifestPath)); + const {schema, versionsExist} = _parseManifest(manifest); + const decompress = inventoryPath.toLowerCase().endsWith('.gz') ? gunzip : (x) => x; + const inventory = await neatCsv(await decompress(await fs.readFile(inventoryPath)), schema); + logger.info(`inventory for ${name} - read ${inventory.length} rows from the local file`) + return {inventory, versionsExist}; +} + + +/** + * Returns a list of objects in the requested S3 inventory, which itself + * represents a list of objects + versions within a specific bucket+prefix. + * Objects which should be "deleted" are removed from the returned objects (see + * `removeDeletedObjects` for more) + * + * Function is exported as it is used in tests. + */ +export const parseInventory = async ({objects, versionsExist}) => { + // Ensure all objects are chronological + objects = objects.map((item) => { + item.timestamp = DateTime.fromISO(item.LastModifiedDate) + return item; + }).sort((a, b) => b.timestamp - a.timestamp); + + objects = versionsExist ? _checkVersionedObjects(objects) : _checkNonVersionedObjects(objects); + objects = _removeDeletedObjects(objects); + + /* rename / prune / add properties as I find the default S3 properties / + values awkward to work with */ + return objects.map((item) => { + return { + timestamp: item.timestamp, + date: item.LastModifiedDate.split("T")[0], + key: item.Key, + bucket: item.Bucket, + versionId: item.VersionId, // will be undefined if bucket is not versioned + latest: versionsExist ? 
item.IsLatest==='true' : true, + } + }); +} + + +/** + * Fetch and parse the latest inventory in the inventoryBucket / inventoryPrefix + * _or_ source a local inventory file (useful for dev purposes to avoid constant + * downloads from S3) + * @returns S3Object[] + */ +export const collectInventory = async ({name, local, inventoryBucket, inventoryPrefix}) => { + let objects, versionsExist; + try { + const fetchInventory = local ? fetchInventoryLocal : fetchInventoryRemote; + ({ inventory: objects, versionsExist} = await fetchInventory( + {bucket: inventoryBucket, prefix: inventoryPrefix, name} + )); + } catch (e) { + logger.error(`There was an error while fetching the S3 inventory for ${name}. This is fatal.`) + throw e; + } + return await parseInventory({objects, versionsExist}) +} + +/** + * For a versioned bucket, ensure that version ID is present on every object + * by filtering out those without a valid-looking version ID. For instance, + * s3://nextstrain-data/WNV_NA_tree.json from 2018-05-09 has an empty-string version ID. + * These may represent objects from before versioning was enabled. + * @param {S3Item[]} Objects chronologically sorted, latest first + */ +function _checkVersionedObjects(objects) { + const keysSeen = new Set(); + + return objects.filter((item) => { + if (!item.VersionId) { + logger.verbose(`Object ${item.Bucket}/${item.Key} is ignored as it is missing a versionId in a bucket we consider to be versioned.`); + return false; + } + if (!item.hasOwnProperty('IsLatest')) { // eslint-disable-line no-prototype-builtins + throw new ResourceIndexerError(`Object ${item.Bucket}/${item.Key} is unexpectedly missing the IsLatest property.`); + } + return true; + }) + .map((item) => { + if (item.IsLatest === 'true') { + if (keysSeen.has(item.Key)) { + throw new ResourceIndexerError(` + There appears to be something amiss for S3 objects ${item.Bucket}/${item.Key}.
+ Specifically, the version ${item.VersionId} is considered by S3 to be the latest, + however it is not the most recent after sorting on LastModified. + This may result in an invalid index and so this is a fatal error. + `.replace(/\s+/g, ' ')) + } + keysSeen.add(item.Key); + } else { + if (!keysSeen.has(item.Key)) { + throw new ResourceIndexerError(` + There appears to be something amiss for S3 objects ${item.Bucket}/${item.Key}. + Specifically, the most recent object (via sorting on LastModified, version ID: + ${item.VersionId}) is not classified by S3 as the latest. + This may result in an invalid index and so this is a fatal error. + `.replace(/\s+/g, ' ')) + } + } + return item; + }) +} + +/** + * For a non-versioned object, check that the VersionId is _not_ present and that keys are never duplicated. + * Adds the property 'IsLatest' = 'true' for every object + */ +function _checkNonVersionedObjects(objects) { + const keys = new Set(); + objects.forEach((item) => { + if (item.hasOwnProperty('VersionId')) { // eslint-disable-line no-prototype-builtins + logger.verbose(`Object ${item.Bucket}/${item.Key} has a versionId ('${item.VersionId}') but the bucket is not versioned! The item will be ignored.`); + return false; + } + if (keys.has(item.Key)) { + throw new ResourceIndexerError(` + The S3 Object for ${item.Bucket}/${item.Key} (unexpectedly) appears multiple times in an un-versioned bucket. + This may result in a corrupted index and so is a fatal error. + `.replace(/\s+/g, ' ')) + } + keys.add(item.Key); + }) + return objects; + +} + +/** + * When encountering a delete marker, we remove the delete marker itself and the + * most recent (but older in time) object with a matching s3 key. Back-to-back + * delete markers behave like a single delete marker. + * + * Non-versioned buckets don't have delete markers, and it's safe to run this + * function for them.
+ * + * @param {S3Item[]} Objects chronologically sorted, most recent first + * @returns {S3Item[]} + */ +function _removeDeletedObjects(objects) { + // For a given (s3) key, was the previously encountered object (i.e. more + // recent in time) a delete marker? + const behindDeleteMarker = {} + + return objects.filter((item) => { + const key = item.Key; + if (item.IsDeleteMarker === "true") { + behindDeleteMarker[key] = true; + return false; + } + if (behindDeleteMarker[key]) { + behindDeleteMarker[key] = false; + return false + } + return true; + }) +} + +/** + * Parses a S3 inventory manifest JSON file + * @param {object} manifest + * @returns {object} object.schema = string[] + * object.inventoryKey = string +*/ +function _parseManifest(manifest) { + if (manifest.files.length>1) { + throw new ResourceIndexerError(` + The manifest file for the S3 inventory for bucket ${manifest.sourceBucket} + includes more than one inventory file. This situation was not encountered + during development, but this is presumably caused by the inventory size + exceeding some threshold and being chunked into multiple files. Please check + this is indeed the case and, if so, amend the code to parse and join each file. + `.replace(/\s+/g, ' ')) + } + const schema = manifest.fileSchema.split(",").map((f) => f.trim()); + return { + schema, + inventoryKey: manifest.files[0].key, + // If a schema uses 'VersionId' then versions may exist in the inventory, so + // we want to check for them. It may be possible to produce manifests of + // versioned buckets but not include this in the manifest, and if so we'll + // treat it as if it were an unversioned bucket. 
+ versionsExist: schema.includes('VersionId'), + } +} + diff --git a/resourceIndexer/logger.js b/resourceIndexer/logger.js new file mode 100644 index 000000000..1f3077133 --- /dev/null +++ b/resourceIndexer/logger.js @@ -0,0 +1,10 @@ +import { createLogger, transports } from 'winston'; + +const logger = createLogger({ + level: 'info', + transports: [new transports.Console()], +}); + +export { + logger, +} \ No newline at end of file diff --git a/resourceIndexer/main.js b/resourceIndexer/main.js new file mode 100644 index 000000000..c7c2d7f95 --- /dev/null +++ b/resourceIndexer/main.js @@ -0,0 +1,120 @@ + +import { ArgumentParser } from 'argparse'; +import fs from 'fs'; +import { coreS3Data, stagingS3Data } from "./coreStagingS3.js"; +import {logger} from './logger.js'; +import zlib from 'zlib'; +import { promisify } from 'util'; +import { ResourceIndexerError } from './errors.js'; + +const gzip = promisify(zlib.gzip) + +/** + * We define a number of collections which each represent some listing of + * nextstrain resources. The actual details are deferred to the provided + * collection objects - e.g. they may represent a GitHub repo listing, an S3 + * inventory. Each of these collections provides functions which allow items + * (files) across collections to be collected into a master list of resources + * using three identifiers: source, resourceType and resourcePath. The intention + * is for source to parallel the information in the corresponding Source + * (sub-)class and resourcePath to parallel the information in the Resource + * (sub-)class. + * + * Currently only sources {core, staging} and resource types {dataset, + * intermediate} are part of the index. 
+ * + * As an example, the core WNV/NA (nextstrain.org/WNV/NA) dataset is indexed + * like so: + * + * core → dataset → WNV/NA → versions -> [ + * {date: "2021-04-08", fileUrls: {main: ...}}, + * {date: "2019-08-30", fileUrls: {meta: ..., tree: ...}} + * ] + * + */ +const COLLECTIONS = [ + coreS3Data, + stagingS3Data, +]; + +function parseArgs() { + const argparser = new ArgumentParser({ + description: ` + Fetch file lists from a number of provided collections (e.g. S3 inventories) and collect them into + resources. Resources are organised in a hierarchical fashion via source → resourceType → resourcePath. + Each resource contains a list of available versions, where applicable. + The output JSON is intended for consumption by the nextstrain.org server. + `, + }); + argparser.addArgument("--local", {action: 'storeTrue', + help: 'Access a local copy of S3 inventories within ./devData/. See docstring of fetchInventoryLocal() for expected filenames.'}) + argparser.addArgument("--collections", {metavar: "", type: "string", nargs: '+', choices: COLLECTIONS.map((c) => c.name), + help: "Only fetch data from a subset of collections. 
Source names are those defined in COLLECTIONS"}); + argparser.addArgument("--resourceTypes", {metavar: "", type: "string", nargs: '+', choices: ['dataset', 'intermediate'], + help: "Only index data matching specified resource types"}); + argparser.addArgument("--output", {metavar: "", required: true}) + argparser.addArgument("--indent", {action: 'storeTrue', help: 'Indent the output JSON'}) + argparser.addArgument("--gzip", {action: 'storeTrue', help: 'GZip the output JSON'}) + argparser.addArgument("--verbose", {action: 'storeTrue', help: 'Verbose logging'}) + + return argparser.parseArgs(); +} + + +main(parseArgs()) + .catch((err) => { + logger.error(err.message); + if (!(err instanceof ResourceIndexerError)) { + console.trace(err); + } + process.exitCode = 2; + }) + + +async function main(args) { + + if (args.verbose) { + logger.transports.forEach((t) => t.level = 'verbose'); + } + + const resources = {}; + const restrictResourceTypes = args.resourceTypes ? new Set(args.resourceTypes) : false; + + for (const collection of COLLECTIONS) { + if (args.collections && !args.collections.includes(collection.name)) { + continue + } + + const groupedObjects = (await collection.collect({local: args.local})) + .map(collection.categorise) + .filter((item) => !!item) + .filter((item) => restrictResourceTypes ? 
restrictResourceTypes.has(item.resourceType) : true) + // Collect together all items ("files") based on their assigned resourceType & resourcePath + .reduce((store, item) => { + const {resourceType, resourcePath, source} = item; + if (!store[source]) store[source]={} + if (!store[source][resourceType]) store[source][resourceType]={} + if (!store[source][resourceType][resourcePath]) store[source][resourceType][resourcePath]=[] + store[source][resourceType][resourcePath].push(item); + return store; + }, {}); + + for (const source of Object.keys(groupedObjects)) { + for (const resourceType of Object.keys(groupedObjects[source])) { + for (const [resourcePath, items] of Object.entries(groupedObjects[source][resourceType])) { + const resource = collection.createResource(resourceType, resourcePath, items); + if (resource.versions.length===0) continue; + if (!resources[source]) resources[source]={} + if (!resources[source][resourceType]) resources[source][resourceType]={} + resources[source][resourceType][resourcePath] = resource; + } + } + } + } + + let output = JSON.stringify(resources, null, args.indent ? 2 : null); + if (args.gzip) { + output = await gzip(output) + } + fs.writeFileSync(args.output, output); +} \ No newline at end of file diff --git a/test/inventory_parsing.test.js b/test/inventory_parsing.test.js new file mode 100644 index 000000000..5d6605271 --- /dev/null +++ b/test/inventory_parsing.test.js @@ -0,0 +1,57 @@ +/** + * Tests our handling of delete markers during construction of the resource index + */ + +import { parseInventory } from '../resourceIndexer/inventory.js'; + +/** + * Following is all the versions for s3://nextstrain-data/zika_meta.json as of + * January 2024, taken directly from the inventory. The order is unchanged, but + * in this case all items except the first are chronological. 
+ */ +const zika_meta = [ + {Bucket: "nextstrain-data", Key: "zika_meta.json", VersionId: "",IsLatest: "false", IsDeleteMarker: "false", LastModifiedDate: "2018-07-18T19:17:48.000Z", ETag: "a62ca3db012aab995aba66ac48421020"}, + {Bucket: "nextstrain-data", Key: "zika_meta.json", VersionId: "yw1HMWq1CxxJBxLhshhd83QmfWiooRP0",IsLatest: "true", IsDeleteMarker: "false", LastModifiedDate: "2023-05-20T01:48:18.000Z", ETag: "711e4fc15f0c16c58e9a0d1262895553"}, + {Bucket: "nextstrain-data", Key: "zika_meta.json", VersionId: "K.kzHAimpImiTUdyBX.EBpRTIedPDCK2",IsLatest: "false", IsDeleteMarker: "true", LastModifiedDate: "2019-11-08T21:07:14.000Z", ETag: ""}, + {Bucket: "nextstrain-data", Key: "zika_meta.json", VersionId: "M7wNtFHS.iPQvEVNRfbvUjk8j1iV3G8H",IsLatest: "false", IsDeleteMarker: "false", LastModifiedDate: "2019-09-04T19:55:09.000Z", ETag: "0412951533dcdadfa3981f57e6b39579"}, + {Bucket: "nextstrain-data", Key: "zika_meta.json", VersionId: "TPkaMuRION0i5IWhCMwzwm9R21UVjhSO",IsLatest: "false", IsDeleteMarker: "false", LastModifiedDate: "2019-05-26T20:52:25.000Z", ETag: "df39017298c469f123898e7963d7f973"}, + {Bucket: "nextstrain-data", Key: "zika_meta.json", VersionId: "wWs5NO6cVRJKXXNm9nYlOKDgJkDU7iuk",IsLatest: "false", IsDeleteMarker: "false", LastModifiedDate: "2019-05-26T20:22:13.000Z", ETag: "621e662cd518bfbe8c1fa2ae98e8e84e"}, + {Bucket: "nextstrain-data", Key: "zika_meta.json", VersionId: "dXQ7RKthkKu4puoY5vMAFsOJBqg1zF42",IsLatest: "false", IsDeleteMarker: "false", LastModifiedDate: "2018-12-05T23:29:28.000Z", ETag: "333b61eba337428c31cf7903454c36fa"}, + {Bucket: "nextstrain-data", Key: "zika_meta.json", VersionId: "tDx7r51clfnWOpDNAa.1aZsrF0BGSGbE",IsLatest: "false", IsDeleteMarker: "false", LastModifiedDate: "2018-11-21T00:06:08.000Z", ETag: "572f84a28deffcfe1b0a81cf8afb05ab"}, + {Bucket: "nextstrain-data", Key: "zika_meta.json", VersionId: "9Djm_Ug3pAZXP9m5h2FGAxQeQuAtOSHg",IsLatest: "false", IsDeleteMarker: "false", LastModifiedDate: 
"2018-10-29T16:04:49.000Z", ETag: "d9673eed4000c4156058141d6b95ccf9"}, + {Bucket: "nextstrain-data", Key: "zika_meta.json", VersionId: "YaEy2NzJM_QmLONaIv1Iui2BvDLCoEK9",IsLatest: "false", IsDeleteMarker: "false", LastModifiedDate: "2018-08-04T16:35:59.000Z", ETag: "84864d5c92d456d869130ec8df1c73ea"}, +] + +test('parsing of nextstrain-data/zika_meta.json inventory', async () => { + // el[0] is dropped -- no version ID in a versioned bucket + // el[2] is dropped -- delete marker + // el[3] is dropped -- deleted by the delete marker + const should_be = [zika_meta[1], ...zika_meta.slice(4)] + const parsed = await parseInventory({objects: zika_meta, versionsExist: true}) + expect(parsed.map((el) => el.versionId)) + .toEqual(should_be.map((el) => el.VersionId)) + + const shuffled = await parseInventory({objects: shuffle(zika_meta), versionsExist: true}) + expect(shuffled.map((el) => el.versionId)) + .toEqual(should_be.map((el) => el.VersionId)) +}) + +test('back-to-back delete markers', async () => { + const test_data = zika_meta.slice() + test_data.splice(7, 0, {Bucket: "nextstrain-data", Key: "zika_meta.json", VersionId: "introduced-marker-1",IsLatest: "false", IsDeleteMarker: "true", LastModifiedDate: "2018-12-02T12:00:00.000Z", ETag: "123"}); + test_data.splice(8, 0, {Bucket: "nextstrain-data", Key: "zika_meta.json", VersionId: "introduced-marker-2",IsLatest: "false", IsDeleteMarker: "true", LastModifiedDate: "2018-12-01T12:00:00.000Z", ETag: "456"}); + const should_be = [zika_meta[1], ...zika_meta.slice(4,7), ...zika_meta.slice(8)] + const parsed = await parseInventory({objects: test_data, versionsExist: true}) + expect(parsed.map((el) => el.versionId)) + .toEqual(should_be.map((el) => el.VersionId)) + + const shuffled = await parseInventory({objects: shuffle(test_data), versionsExist: true}) + expect(shuffled.map((el) => el.versionId)) + .toEqual(should_be.map((el) => el.VersionId)) +}) + + +function shuffle(arr) { + // following is not truly random, but good 
enough + return arr.slice().sort(() => 0.5-Math.random()); +} From 65629292b8a3613a5fc2078bc1c4f4bd3bc5bc5c Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Tue, 3 Oct 2023 10:58:02 +1300 Subject: [PATCH 2/5] Parse resources index Parses a pre-computed index JSON and stores the data in-memory (on the nextstrain.org server). ResourceVersions is a class which dataset/narrative requests can use to get the the (versioned) file URLs for available subresources which were present for a given YYYY-MM-DD. A subsequent commit will allow the usage of "@YYYY-DD-MM" descriptors in URLs, and eventually these data will be used to display all available resources. A subsequent commit in this PR will add documentation regarding the RESOURCE_INDEX variable, but it's design was influenced by . Briefly, to use a local index set `RESOURCE_INDEX="./path/to/json"` and to disable the index use `RESOURCE_INDEX="false"`. --- env/production/config.json | 3 +- env/testing/config.json | 3 +- src/app.js | 11 +++ src/config.js | 12 ++++ src/resourceIndex.js | 134 +++++++++++++++++++++++++++++++++++++ 5 files changed, 161 insertions(+), 2 deletions(-) create mode 100644 src/resourceIndex.js diff --git a/env/production/config.json b/env/production/config.json index c5e6a168d..7c7bc8ff6 100644 --- a/env/production/config.json +++ b/env/production/config.json @@ -109,5 +109,6 @@ "OIDC_USERNAME_CLAIM": "cognito:username", "OIDC_GROUPS_CLAIM": "cognito:groups", "SESSION_COOKIE_DOMAIN": "nextstrain.org", - "GROUPS_DATA_FILE": "groups.json" + "GROUPS_DATA_FILE": "groups.json", + "RESOURCE_INDEX": "s3://nextstrain-inventories/resources.json.gz" } diff --git a/env/testing/config.json b/env/testing/config.json index 68af5aafe..d2e8f8c94 100644 --- a/env/testing/config.json +++ b/env/testing/config.json @@ -107,5 +107,6 @@ "COGNITO_USER_POOL_ID": "us-east-1_zqpCrjM7I", "OIDC_USERNAME_CLAIM": "cognito:username", "OIDC_GROUPS_CLAIM": "cognito:groups", - "GROUPS_DATA_FILE": "groups.json" + 
"GROUPS_DATA_FILE": "groups.json", + "RESOURCE_INDEX": "s3://nextstrain-inventories/resources.json.gz" } diff --git a/src/app.js b/src/app.js index 4507496cb..b436d4b06 100644 --- a/src/app.js +++ b/src/app.js @@ -1,6 +1,8 @@ import * as redirects from './redirects.js'; import * as routing from "./routing/index.js"; import { setupApp } from "./routing/setup.js"; +import { RESOURCE_INDEX } from './config.js'; +import { updateResourceVersions } from './resourceIndex.js'; const { auspice, @@ -182,5 +184,14 @@ await staticSite.setup(app); */ errors.setup(app); +/** + * Update the resources we know about. If the index is located on S3 then we + * check the eTag to know whether to update, and thus most calls to + * `updateResourceVersions` simply involve a HEAD request. + */ +if (RESOURCE_INDEX) { + updateResourceVersions(); + setInterval(updateResourceVersions, 60*60*1000) +} export default app; diff --git a/src/config.js b/src/config.js index 7fe400583..f62da2698 100644 --- a/src/config.js +++ b/src/config.js @@ -450,3 +450,15 @@ export const GROUPS_DATA_FILE = fromEnvOrConfig("GROUPS_DATA_FILE", undefined, { * @default nextstrain-groups */ export const GROUPS_BUCKET = fromEnvOrConfig("GROUPS_BUCKET", "nextstrain-groups"); + + +/** + * Location of the JSON file to be used as the resource collection index. Can be + * a S3 address or a local filepath, if S3 then the file must be gzipped. + * + * If sourced from the config file, relative paths are resolved relative to the + * repo root directory ("nextstrain.org"). + * + * Falsey values result in the resource collection functionality not being used. 
+ */ +export const RESOURCE_INDEX = fromEnvOrConfig("RESOURCE_INDEX", null); diff --git a/src/resourceIndex.js b/src/resourceIndex.js new file mode 100644 index 000000000..52aa56c10 --- /dev/null +++ b/src/resourceIndex.js @@ -0,0 +1,134 @@ +import fs from 'fs'; +import * as utils from './utils/index.js'; +import zlib from 'zlib'; +import { promisify } from 'util'; +import { NotFound, BadRequest } from './httpErrors.js'; +import { RESOURCE_INDEX } from './config.js'; +import { signedUrl } from './s3.js'; +import { fetch } from './fetch.js'; + +const gunzip = promisify(zlib.gunzip) + +let resources = {}; +let eTag; + +class ResourceVersions { + constructor(sourceId, resourceType, resourcePath) { + this.data = {}; + if (!resources[sourceId]) { + utils.verbose(`Attempted to access resources for source ID ${sourceId} but this is not present in the index`); + return; + } + if (!resources[sourceId][resourceType]) { + utils.verbose(`Attempted to access resources for ${sourceId} / ${resourceType} but there are none in the index`); + return; + } + this.data = resources[sourceId][resourceType][resourcePath] || {}; + } + + /** + * Given a (URL-provided) versionDescriptor return the YYYY-MM-DD date which + * corresponds to the desired resource version, if applicable. This can then + * be used to access specific information about the version via the + * `subresourceUrls` method. + * + * @throws {BadRequest} if the versionDescriptor is not YYYY-MM-DD. + * @throws {NotFound} if there are versions present in the index, but the + * versionDescriptor identifies a version which predated those present. + * @throws {NotFound} if there is no data about the resource in the index + * (i.e. 
no versions) + * @returns {(string|null)} the YYYY-MM-DD version ID or null if the + * versionDescriptor is more recent than the most recent version in the index + */ + versionDateFromDescriptor(versionDescriptor) { + if (!versionDescriptor.match(/^\d{4}-\d{2}-\d{2}$/)) { + throw new BadRequest(`Requested version must be in YYYY-MM-DD format (version descriptor requested: "${versionDescriptor}")`) + } + + const dates = (this.data?.versions || []) + .map((v) => v.date) + .sort(); + + if (!dates.length) { + throw new NotFound(`Attempted to lookup a version for ${versionDescriptor} however this resource has no versions in the index`) + } + if (versionDescriptor < dates[0]) { + throw new NotFound(`Version descriptor ${versionDescriptor} predates available versions`) + } + if (versionDescriptor > dates[dates.length - 1]) { + return null; + } + + let id; + for (const date of dates) { + if (date <= versionDescriptor) { + id = date; + } + } + return id; + } + + /** + * Given a YYYY-MM-DD string, return the available subresources and their corresponding URLs + */ + subresourceUrls(date) { + for (const version of this.data?.versions || []) { + if (version.date===date) { + return version.fileUrls; + } + } + throw new Error(`Attempted to access a specific date which doesn't exist in the resource index: ${date}`) + } + +} + + +/** + * Updates provided data in-place by fetching the file from S3 or from disk. + * + * If the index is on S3, we first make a HEAD request to obtain the eTag and + * compare this to the previous update to avoid unnecessary updates. 
+ * + * @returns {undefined} + */ +async function updateResourceVersions() { + + if (RESOURCE_INDEX.startsWith("s3://")) { + const parts = RESOURCE_INDEX.match(/^s3:\/\/(.+?)\/(.+)$/); + const [Bucket, Key] = [parts[1], parts[2]]; + utils.verbose(`Updating available resources index from s3://${Bucket}/${Key}`); + try { + const newETag = (await fetch(await signedUrl({bucket:Bucket, key:Key, method: 'HEAD'}), {method: 'HEAD'})) + .headers.get('etag'); // value is enclosed in double quotes, but that doesn't matter for our purposes + if (newETag && newETag === eTag) { + utils.verbose("Skipping available resource update as eTag hasn't changed"); + return; + } + const res = await fetch(await signedUrl({bucket:Bucket, key:Key, method: 'GET'}), {method: 'GET'}) + if (res.status !== 200) { + throw new Error(`Non-200 response code "${res.status}".`); + } + const newResources = JSON.parse(await gunzip(await res.buffer())); + [resources, eTag] = [newResources, newETag]; + } catch (err) { + utils.warn(`Resource updating failed: ${err.message}`) + } + return; + } + + // We now assume it's a local file path (the docs state S3 or local file). Any + // attempt to use a non-local (e.g. HTTP) address will result in the fatal error + // "Error: ENOENT: no such file or directory" on server start. 
+ utils.verbose(`Updating available resources index from local file ${RESOURCE_INDEX}`); + let fileContents = fs.readFileSync(RESOURCE_INDEX) + if (RESOURCE_INDEX.endsWith('.gz')) { + fileContents = await gunzip(fileContents) + } + resources = JSON.parse(fileContents); +} + +export { + ResourceVersions, + + updateResourceVersions, +} \ No newline at end of file From 7e539a92b5026231f9ecf5d2c1e9570fae8b2073 Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Tue, 3 Oct 2023 14:17:37 +1300 Subject: [PATCH 3/5] Allow versioned resource access (core only) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nextstrain URLs are extended to allow @ syntax for core datasets. Currently the must be in YYYY-MM-DD format. The returned version is the one which was the latest on the requested day. If the requested version predates any datasets we return 404. We attempt to extract a version descriptor for every request, however for non-core sources (and core narratives)¹ the presence of such a descriptor (i.e. if "@" is in the URL²) will result in a 400 (Bad Request) error. For further examples please see `test/date_descriptor.test.js`. Note that the @YYYY-MM-DD URLs enabled by this commit look similar to some existing URLs where the datestamp is in the dataset name (e.g. "ncov/gisaid/global/6m/2024-01-02") however conceptually these are quite different. ¹ Fetch URLs (e.g. /fetch/...) are excluded ² There are exceptions, such as community URLs allowing @ in the pathname. 
--- package.json | 2 +- src/endpoints/sources.js | 39 ++-- src/routing/core.js | 1 + src/sources/community.js | 8 +- src/sources/core.js | 39 +++- src/sources/fetch.js | 28 ++- src/sources/groups.js | 8 +- src/sources/models.js | 80 ++++++-- test/date_descriptor.test.js | 330 ++++++++++++++++++++++++++++++++ test/date_descriptor_index.json | 66 +++++++ 10 files changed, 560 insertions(+), 41 deletions(-) create mode 100644 test/date_descriptor.test.js create mode 100644 test/date_descriptor_index.json diff --git a/package.json b/package.json index c4c35d8fe..d335ff8dc 100644 --- a/package.json +++ b/package.json @@ -19,7 +19,7 @@ "start": "npm run server", "//test": "echo 'NODE_OPTIONS is required until this issue is resolved: https://github.com/facebook/jest/issues/9430 '", "test": "NODE_OPTIONS='--experimental-vm-modules' jest --roots ./test/", - "test:ci": "start-server-and-test 'node server.js --verbose >./test/server.log 2>&1' http://localhost:5000 test", + "test:ci": "start-server-and-test 'RESOURCE_INDEX=test/date_descriptor_index.json node server.js --verbose >./test/server.log 2>&1' http://localhost:5000 test", "dev": "./develop.sh" }, "dependencies": { diff --git a/src/endpoints/sources.js b/src/endpoints/sources.js index c458d4332..67d8855df 100644 --- a/src/endpoints/sources.js +++ b/src/endpoints/sources.js @@ -36,7 +36,7 @@ export const setSource = (sourceExtractor) => (req, res, next) => { * @returns {expressMiddleware} */ export const setDataset = (pathExtractor) => (req, res, next) => { - req.context.dataset = req.context.source.dataset(pathParts(pathExtractor(req))); + req.context.dataset = req.context.source.dataset(...pathParts(pathExtractor(req))); next(); }; @@ -195,7 +195,7 @@ export const optionsNarrative = options.forAuthzObject(req => req.context.narrat * @returns {expressMiddleware} */ export const setNarrative = (pathExtractor) => (req, res, next) => { - req.context.narrative = req.context.source.narrative(pathParts(pathExtractor(req))); 
+ req.context.narrative = req.context.source.narrative(...pathParts(pathExtractor(req))); next(); }; @@ -250,24 +250,41 @@ export const putNarrative = contentTypesConsumed([ /** - * Split a dataset or narrative `path` into an array of parts. + * Split a dataset or narrative `path` into an array of parts and a version + * descriptor. * * If `path` is a tangletree path (i.e. refers to two datasets), returns only * the parts for the first dataset. * + * We always attempt to extract a version descriptor from the provided path, + * returning false if one is not present. + * * @param {String} path - * @returns {String[]} + * @returns {[String[], (String|false)]} [0]: array of path parts, [1]: version + * descriptor */ function pathParts(path = "") { - const normalizedPath = path - .split(":")[0] // Use only the first dataset in a tangletree (dual dataset) path. - .replace(/^\/+/, "") // Ignore leading slashes - .replace(/\/+$/, "") // …and trailing slashes. - ; + // Use only the first dataset in a tangletree (dual dataset) path. + let normalizedPath = path.split(":")[0] + + /* The part of the path starting with "@" is the version descriptor - this + will later be mapped to the appropriate fetch URL (e.g. S3 version ID) via the + resource index. The version descriptor is greedy and may itself include '@' + characters. Note that '@' characters may be present in the URL path but not in + the `path` argument here. */ + + let rest; + [normalizedPath, ...rest] = normalizedPath.split("@"); + const versionDescriptor = rest.join("@") || false; + + // Ignore leading & trailing slashes (after version descriptor removal) + normalizedPath = normalizedPath + .replace(/\/+$/, "") + .replace(/^\/+/, ""); - if (!normalizedPath) return []; + const nameParts = normalizedPath ? 
normalizedPath.split("/") : []; - return normalizedPath.split("/"); + return [nameParts, versionDescriptor] } diff --git a/src/routing/core.js b/src/routing/core.js index 069863f84..72c5f61cc 100644 --- a/src/routing/core.js +++ b/src/routing/core.js @@ -44,6 +44,7 @@ const coreBuildRoutes = coreBuildPaths.map(path => [ path, `${path}/*`, `${path}:*`, // Tangletrees at top-level, e.g. /a:/a/b + `${path}@*`, // version (date) descriptors for a top-level build ]); diff --git a/src/sources/community.js b/src/sources/community.js index 00a83ca07..f7cc5b5be 100644 --- a/src/sources/community.js +++ b/src/sources/community.js @@ -60,11 +60,11 @@ export class CommunitySource extends Source { return `${this.repoName}@${branch}`; } - dataset(pathParts) { - return new CommunityDataset(this, pathParts); + dataset(pathParts, versionDescriptor) { + return new CommunityDataset(this, pathParts, versionDescriptor); } - narrative(pathParts) { - return new CommunityNarrative(this, pathParts); + narrative(pathParts, versionDescriptor) { + return new CommunityNarrative(this, pathParts, versionDescriptor); } async availableDatasets() { diff --git a/src/sources/core.js b/src/sources/core.js index 656d88ada..85603360f 100644 --- a/src/sources/core.js +++ b/src/sources/core.js @@ -4,6 +4,7 @@ import { fetch } from '../fetch.js'; import { NotFound } from '../httpErrors.js'; import * as utils from '../utils/index.js'; import { Source, Dataset } from './models.js'; +import { ResourceVersions } from "../resourceIndex.js"; const authorization = process.env.GITHUB_TOKEN ? `token ${process.env.GITHUB_TOKEN}` @@ -25,8 +26,8 @@ export class CoreSource extends Source { return url.toString(); } - dataset(pathParts) { - return new CoreDataset(this, pathParts); + dataset(pathParts, versionDescriptor) { + return new CoreDataset(this, pathParts, versionDescriptor); } // The computation of these globals should move here. 
@@ -93,6 +94,7 @@ export class CoreStagingSource extends CoreSource { } class CoreDataset extends Dataset { + /* NOTE: This class is also used for staging datasets */ resolve() { /* XXX TODO: Reimplement this in terms of methods on the source, not by * breaking encapsulation by using a process-wide global. @@ -120,10 +122,41 @@ class CoreDataset extends Dataset { const nextDefaultPart = global.availableDatasets.defaults[sourceName][prefix]; if (nextDefaultPart) { - const dataset = new this.constructor(this.source, [...prefixParts, nextDefaultPart]); + const dataset = new this.constructor(this.source, [...prefixParts, nextDefaultPart], this.versionDescriptor); return dataset.resolve(); } return this; } + + /** + * Parse a URL-provided version descriptor (currently only in YYYY-MM-DD + * format) and return the appropriate version date (YYYY-MM-DD string) and the + * associated versionUrls (an object linking available file types to their + * access URLs). THe returned versionDate is null if no appropriate version is + * available. + * + * We only want to do this for core datasets, not staging. + * + * @param {(string|false)} versionDescriptor from the URL + * @throws {BadRequest || NotFound} + * @returns {([string, Object]|[null, undefined])} [0]: versionDate [1]: + * versionUrls + */ + versionInfo(versionDescriptor) { + + if (this.source.name!=='core') { + return super.versionInfo(versionDescriptor); + } + + if (!versionDescriptor) { + return [null, undefined]; + } + + const versions = new ResourceVersions(this.source.name, 'dataset', this.pathParts.join("/")); + const versionDate = versions.versionDateFromDescriptor(this.versionDescriptor); + const versionUrls = versionDate ? 
versions.subresourceUrls(versionDate) : undefined + return [versionDate, versionUrls]; + } + } diff --git a/src/sources/fetch.js b/src/sources/fetch.js index bb6cc2087..4d1bf108e 100644 --- a/src/sources/fetch.js +++ b/src/sources/fetch.js @@ -14,11 +14,12 @@ export class UrlDefinedSource extends Source { async baseUrl() { return `https://${this.authority}`; } - dataset(pathParts) { - return new UrlDefinedDataset(this, pathParts); + + dataset(pathParts, versionDescriptor) { + return new UrlDefinedDataset(this, pathParts, versionDescriptor); } - narrative(pathParts) { - return new UrlDefinedNarrative(this, pathParts); + narrative(pathParts, versionDescriptor) { + return new UrlDefinedNarrative(this, pathParts, versionDescriptor); } // available datasets & narratives are unknown when the dataset is specified by the URL @@ -50,8 +51,17 @@ class UrlDefinedDataset extends Dataset { // Override check for underscores (_), as we want to allow arbitrary // external URLs. } + versionInfo() { + /* Version descriptors make no sense for fetch sources, but our routing approach always extracts them */ + return [null, undefined]; + } get baseName() { - return this.baseParts.join("/"); + /** + * The baseName includes anything that looks like a version descriptor. + * Minor bug: If the requested URL ends with a '@' then it'll be dropped here. + */ + const version = this.versionDescriptor ? `@${this.versionDescriptor}` : "" + return this.baseParts.join("/") + version; } subresource(type) { return new UrlDefinedDatasetSubresource(this, type); @@ -89,8 +99,14 @@ class UrlDefinedNarrative extends Narrative { // Override check for underscores (_), as we want to allow arbitrary // external URLs. } + versionInfo() { + /* Version descriptors make no sense for fetch sources, but our routing approach always extracts them */ + return [null, undefined]; + } get baseName() { - return this.baseParts.join("/"); + /* see comment in UrlDefinedDataset */ + const version = this.versionDescriptor ? 
`@${this.versionDescriptor}` : "" + return this.baseParts.join("/") + version; } subresource(type) { return new UrlDefinedNarrativeSubresource(this, type); diff --git a/src/sources/groups.js b/src/sources/groups.js index 79cc7bb86..99cb7f1ee 100644 --- a/src/sources/groups.js +++ b/src/sources/groups.js @@ -56,11 +56,11 @@ export class GroupSource extends Source { return `${this.group.name}/`; } - dataset(pathParts) { - return new GroupDataset(this, pathParts); + dataset(pathParts, versionDescriptor) { + return new GroupDataset(this, pathParts, versionDescriptor); } - narrative(pathParts) { - return new GroupNarrative(this, pathParts); + narrative(pathParts, versionDescriptor) { + return new GroupNarrative(this, pathParts, versionDescriptor); } async urlFor(path, method = 'GET', headers = {}) { diff --git a/src/sources/models.js b/src/sources/models.js index cae673e5e..676f4995d 100644 --- a/src/sources/models.js +++ b/src/sources/models.js @@ -1,7 +1,7 @@ import authzTags from '../authz/tags.js'; import { fetch } from '../fetch.js'; -import { BadRequest } from '../httpErrors.js'; +import { NotFound, BadRequest, InternalServerError } from '../httpErrors.js'; /* The model classes here are the base classes for the classes defined in * ./core.js, ./community.js, ./groups.js, etc. 
@@ -97,11 +97,12 @@ export class Source { const url = new URL(path, await this.baseUrl()); return url.toString(); } - dataset(pathParts) { - return new Dataset(this, pathParts); + + dataset(pathParts, versionDescriptor) { + return new Dataset(this, pathParts, versionDescriptor); } - narrative(pathParts) { - return new Narrative(this, pathParts); + narrative(pathParts, versionDescriptor) { + return new Narrative(this, pathParts, versionDescriptor); } // eslint-disable-next-line no-unused-vars @@ -139,9 +140,11 @@ export class Source { } export class Resource { - constructor(source, pathParts) { + constructor(source, pathParts, versionDescriptor) { this.source = source; this.pathParts = pathParts; + this.versionDescriptor = versionDescriptor; + [this.versionDate, this.versionUrls] = this.versionInfo(versionDescriptor); // Require baseParts, otherwise we have no actual dataset/narrative path. // This inspects baseParts because some of the pathParts (above) may not @@ -178,6 +181,22 @@ export class Resource { get baseName() { return this.baseParts.join("_"); } + versionInfo(versionDescriptor) { + /** + * Interrogates the resource index to find the appropriate version of the + * resource and associated subresource URLs by comparing to + * this.versionDescriptor. + * This method should be overridden by subclasses when they are used to + * handle URLs which extract version descriptors. 
+ * @param {(string|false)} versionDescriptor from the URL string + * @throws {BadRequest} + * @returns {([string, Object]|[null, undefined])} [0]: versionDate [1]: versionUrls + */ + if (versionDescriptor) { + throw new BadRequest(`This resource cannot handle versioned dataset requests (version descriptor requested: "${this.versionDescriptor}")`) + } + return [null, undefined]; + } async exists() { throw new Error("exists() must be implemented by Resource subclasses"); } @@ -213,6 +232,25 @@ export class Subresource { throw new Error("validTypes() must be implemented by Subresource subclasses"); } async url(method = 'GET', headers = {}) { + /** + * Check if the resource has versionUrls, and if it does then we use that + * URL rather than constructing a URL from the basename. If we have + * versionUrls but not for this (subresource) type then we know this + * subresource doesn't exist for this version of the resource. + * + * Note that the URL is not signed, and changes will be required to support + * this as needed. 
+ */ + if (this.resource.versionUrls) { + if (!['HEAD', 'GET'].includes(method)) { + throw new InternalServerError(`Only the GET and HEAD methods are available for previous resource versions`); + } + if (this.resource.versionUrls[this.type]) { + return this.resource.versionUrls[this.type]; + } + throw new NotFound(`This version of the resource does not have a subresource for ${this.type}`); + } + return await this.resource.source.urlFor(this.baseName, method, headers); } get baseName() { @@ -234,9 +272,18 @@ export class Dataset extends Resource { async exists() { const method = "HEAD"; - const _exists = async (type) => - (await fetch(await this.subresource(type).url(method), {method, cache: "no-store"})).status === 200; - + const _exists = async (type) => { + let url; + try { + url = await this.subresource(type).url(method); + } catch (err) { + if (err instanceof NotFound) { + return false; + } + throw err; + } + return (await fetch(url, {method, cache: "no-store"})).status === 200; + } const all = async (...promises) => (await Promise.all(promises)).every(x => x); @@ -304,9 +351,18 @@ export class Narrative extends Resource { async exists() { const method = "HEAD"; - const _exists = async () => - (await fetch(await this.subresource("md").url(method), {method, cache: "no-store"})).status === 200; - + const _exists = async () => { + let url; + try { + url = await this.subresource("md").url(method); + } catch (err) { + if (err instanceof NotFound) { + return false; + } + throw err; + } + return (await fetch(url, {method, cache: "no-store"})).status === 200; + } return (await _exists()) || false; } diff --git a/test/date_descriptor.test.js b/test/date_descriptor.test.js new file mode 100644 index 000000000..f1b9bcd6a --- /dev/null +++ b/test/date_descriptor.test.js @@ -0,0 +1,330 @@ +/** + * + * Testing of requests made using URLs with date descriptors (i.e. @YYYY-MM-DD). 
+ * It would be better if we also ran the server from the test process and could + * spy on the requests being made to check the correct versionId is being used + * in the requests to AWS. + * + */ + +/** + * Example dev use: + * run the server in a separate process via: + * RESOURCE_INDEX="./devData/index.json" node server.js --verbose + * Then run this test file via: + * NODE_OPTIONS='--experimental-vm-modules' npx jest test/date_descriptor.test.js + */ + +/* eslint no-prototype-builtins: 0 */ + +import fetch from 'node-fetch'; +import {jest} from '@jest/globals'; + +// As some of the APIs we're calling will themselves call GitHub APIs, +// or will request v1 JSONs which are slow, we set a lenient timeout +jest.setTimeout(15000); + +const BAD_REQUEST = 400; +const NOT_FOUND = 404; + +const coreDataZika = [ + /** + * 2023-02-21 maps to a valid dataset, uploaded that same day. Main dataset + * version ID: xSaqFeCujRdPmjuYx_MEO8gETcvX9xfC. The JSON upload date is + * 2023-02-16. This version does have a root-sequence sidecar. + */ + {prefix: 'zika@2023-02-21', valid: true, jsonUpdatedDate: '2023-02-16'}, + {prefix: 'zika@2023-02-21', sidecar: 'root-sequence', valid: true}, + /** + * 2022-11-01 should access the datestamp object 2022-10-05 which is + * zika.json?versionId=cQV2g9_MA5eIWOvzsgwjTjJnMsaaB0hu, which has a "updated" + * date in the json of 2022-09-30. Objects from this day do not have any + * sidecars. 
+ */ + {prefix: 'zika@2022-11-01', valid: true, jsonUpdatedDate: '2022-09-30'}, + {prefix: 'zika@2022-11-01', sidecar: 'root-sequence', valid: false, errorMessage: 'This version of the resource does not have a subresource for root-sequence'}, + /** + * The earliest dataset in the current index is 2022-08-01, so anything before should 404 + */ + {prefix: 'zika@2022-01-01', valid: false, errorMessage: 'Version descriptor 2022-01-01 predates available versions'}, +] + +const coreDataWNV = [ + /* most recent dataset is datestamped 2019-10-30 and is a v1 meta+tree dataset. No sidecars. + Note that this can take around 10 seconds as both v1 files are fetched then converted server-side + the RESTful API doesn't handle v1 datasets (and probably never will). */ + {prefix: 'WNV/NA@2020-11-17', REST: false, valid: true, jsonUpdatedDate: "30 Oct 2019"}, + {prefix: 'WNV/NA@2020-11-17', REST: false, valid: false, sidecar: 'root-sequence', errorMessage: 'This version of the resource does not have a subresource for root-sequence'}, + // Following should map to the v2 dataset with timestamp 2020-11-18. version id: 7lRiCIu.cP5RTZkm7m7kPSvo6GbD846H. No sidecars + {prefix: 'WNV/NA@2021-01-01', valid: true, jsonUpdatedDate: "2020-11-10"}, + {prefix: 'WNV/NA@2021-01-01', valid: false, sidecar: 'root-sequence', errorMessage: 'This version of the resource does not have a subresource for root-sequence'}, + // Following is _more recent_ than the most recent dataset in the index so we _do not_ make a versioned request, we simply access the latest file + // As of 2024-01-03 we don't have a sidecar file (so it'll 404) we might in the future... 
+ {prefix: 'WNV/NA@2024-01-03', valid: true}, +] + +const nonCoreData = [ + {prefix: 'staging/zika@2023-02-21', valid: false, errorCode: BAD_REQUEST, errorMessage: notHandledError('2023-02-21')}, + {prefix: 'groups/blab/beta-cov@2023-02-21', valid: false, errorCode: BAD_REQUEST, errorMessage: notHandledError('2023-02-21')}, + /* Community URLs can't have version descriptors, but they can have 'repo@commit' syntax */ + {prefix: 'community/blab/ebola-narrative-ms/subsampled/3', valid: true}, + {prefix: 'community/blab/ebola-narrative-ms/subsampled/3@2023-01-01', valid: false, errorCode: BAD_REQUEST, errorMessage: notHandledError('2023-01-01')}, + {prefix: 'community/blab/ebola-narrative-ms@cdacef9/subsampled/3', valid: true}, + {prefix: 'community/blab/ebola-narrative-ms@cdacef9/subsampled/3@2023-01-01', valid: false, errorCode: BAD_REQUEST, errorMessage: notHandledError('2023-01-01')}, + {prefix: 'fetch/data.nextstrain.org/zika.json', valid: true}, + /** + * '@' characters are just fine within a fetch dataset, it's the one exception to the rule. + * Note 1: this is a 404, because the fetched file doesn't exist, but it's not a BAD_REQUEST + * Note 2: we skip the RESTful call for text/html because it returns 200 - this is a bug which predates the + * version descriptors as it's also present for, e.g. 'fetch/data.nextstrain.org/zika.something.json'. + * Loading the page will still end up with a 404-like error page, so the bug is very minor. + */ + {prefix: 'fetch/data.nextstrain.org/zika@something.json', valid: false, REST_HTML: false, errorCode: NOT_FOUND, errorMessage: "Not Found"}, +] + +const malformedDateDescriptorData = [ + // Note that datasets like "zika@" (i.e. 
an empty string date descriptor) are actually considered ok + {prefix: 'zika@6m', valid: false, errorCode: BAD_REQUEST, errorMessage: 'Requested version must be in YYYY-MM-DD format (version descriptor requested: "6m")'}, + {prefix: 'zika@2023-02-01@huh', valid: false, errorCode: BAD_REQUEST, errorMessage: 'Requested version must be in YYYY-MM-DD format (version descriptor requested: "2023-02-01@huh")'}, +] + + +const datasets = [ + ...coreDataZika, + ...coreDataWNV, + ...nonCoreData, + ...malformedDateDescriptorData, +] + +/** Currently _all_ narratives with a version descriptor should return BadRequest */ +const narratives = [ + {prefix: 'narratives/inrb-ebola-example-sit-rep', valid: true}, + {prefix: 'narratives/inrb-ebola-example-sit-rep@2023-01-01', valid: false, errorCode: BAD_REQUEST, errorMessage: notHandledError('2023-01-01')}, + {prefix: 'groups/blab/narratives/test/fixture@2023-01-01', valid: false, errorCode: BAD_REQUEST, errorMessage: notHandledError('2023-01-01')}, + /* Note that community URLs result in a request to `https://api.github.com/repos/${this.owner}/${this.repoName}` from the Source class constructor, + which means that testing will make many requests to this. Fortunately, most will be a simple cache revalidation. 
*/ + {prefix: 'community/narratives/blab/ebola-narrative-ms/2019-09-13-sit-rep-ENGLISH@2023-01-01', valid: false, errorCode: BAD_REQUEST, errorMessage: notHandledError('2023-01-01')}, + /* Community URLs with a 'repo@commit' structure should continue to work */ + {prefix: 'community/narratives/blab/ebola-narrative-ms@cdacef9/2019-09-13-sit-rep-ENGLISH', REST: false, valid: true}, // FIXME FIXME XXX + /* See the note above in nonCoreData regarding fetch URLs */ + {prefix: 'fetch/narratives/data.nextstrain.org/does/not_exist@so_@this/should/404', valid: false, REST_HTML: false, errorCode: NOT_FOUND, errorMessage: "Not Found"}, +] + + +describe("Request valid main datasets", () => { + + for (const d of datasets.filter((el) => !el.hasOwnProperty('sidecar') && el.valid===true)) { + + if (d.charon!==false) { + it(`Charon API using ${d.prefix}`, async () => { + const url = `${BASE_URL}/charon/getDataset?prefix=${d.prefix}`; + const res = await fetch(url, {redirect: 'manual'}); + + expect(res.status).toEqual(200); + + const dataset = await res.json(); + + /* What we actually want to test is that the server fetched the + AWS URL with the correct versionId, but since we aren't observing + that we do the next best thing */ + if (d.jsonUpdatedDate) { + expect(dataset.meta.updated).toEqual(d.jsonUpdatedDate); + } + }); + } + + /* some APIs are not implemented in REST (e.g. 
v1 JSONs) */ + if (d.REST===false) continue; + + it(`REST API using ${d.prefix}`, async () => { + const res = await fetchType(`${BASE_URL}/${d.prefix}`, 'json'); + expect(res.status).toEqual(200); + const dataset = await res.json(); + if (d.jsonUpdatedDate) { + expect(dataset.meta.updated).toEqual(d.jsonUpdatedDate); + } + }) + + it(`REST API for ${d.prefix} (HTML content)`, async () => { + /* should send the auspice entrypoint */ + const res = await fetchType(`${BASE_URL}/${d.prefix}`, 'html'); + expect(res.status).toEqual(200); + expect(res.headers.get('Content-Type')).toMatch('text/html') + }) + } +}) + + +describe("Invalid main datasets", () => { + + for (const d of datasets.filter((el) => !el.hasOwnProperty('sidecar') && el.valid!==true)) { + + if (d.charon!==false) { + it(`Charon API using ${d.prefix}`, async () => { + const res = await fetch(`${BASE_URL}/charon/getDataset?prefix=${d.prefix}`, {redirect: 'manual'}); + expect(res.status).toEqual(d.errorCode || NOT_FOUND); + const errors = await res.json(); + expect(errors.error).toEqual(d.errorMessage) + }) + } + + if (d.REST===false) continue; + + it(`REST API using ${d.prefix} (json content)`, async () => { + const res = await fetchType(`${BASE_URL}/${d.prefix}`, 'json'); + expect(res.status).toEqual(d.errorCode || NOT_FOUND); + const errors = await res.json(); + expect(errors.error).toEqual(d.errorMessage) + }) + + if (d.REST_HTML===false) continue; + + it(`REST API for ${d.prefix} (HTML content)`, async () => { + /* should send a gatsby page, but the error code will be the same */ + const res = await fetchType(`${BASE_URL}/${d.prefix}`, 'html'); + expect(res.status).toEqual(d.errorCode || NOT_FOUND); + expect(res.headers.get('Content-Type')).toMatch('text/html') + }) + } +}) + + +describe("Valid sidecars", () => { + + for (const d of datasets.filter((el) => el.hasOwnProperty('sidecar') && el.valid===true)) { + + if (d.charon!==false) { + it(`Charon API using ${d.prefix} with sidecar ${d.sidecar}`, async () 
=> { + const url = `${BASE_URL}/charon/getDataset?prefix=${d.prefix}&type=${d.sidecar}`; + const res = await fetch(url, {redirect: 'manual'}); + expect(res.status).toEqual(200); + /* there's no way to check the sidecar is the one we expect unless we spied on the server + requests, or fetched directly from AWS using the version ID and diffed the output */ + }); + } + + if (d.REST===false) continue; + + it(`REST API using ${d.prefix} asking for sidecar ${d.sidecar}`, async () => { + const res = await fetchType(`${BASE_URL}/${d.prefix}`, d.sidecar); + expect(res.status).toEqual(200); + }) + } +}) + + +describe("Invalid sidecars", () => { + + for (const d of datasets.filter((el) => el.hasOwnProperty('sidecar') && el.valid!==true)) { + + if (d.charon!==false) { + it(`Charon API using ${d.prefix} with sidecar ${d.sidecar}`, async () => { + const url = `${BASE_URL}/charon/getDataset?prefix=${d.prefix}&type=${d.sidecar}`; + const res = await fetch(url, {redirect: 'manual'}); + expect(res.status).toEqual(d.errorCode || NOT_FOUND); + const errors = await res.json(); + expect(errors.error).toEqual(d.errorMessage) + }); + } + + if (d.REST===false) continue; + + it(`REST API using ${d.prefix} asking for sidecar ${d.sidecar}`, async () => { + const res = await fetchType(`${BASE_URL}/${d.prefix}`, d.sidecar); + expect(res.status).toEqual(d.errorCode || NOT_FOUND); + const errors = await res.json(); + expect(errors.error).toEqual(d.errorMessage) + }) + + } +}) + +describe("Invalid narratives", () => { + + for (const d of narratives.filter((n) => !n.valid)) { + + if (d.charon!==false) { + it(`Charon getNarrative API using ${d.prefix}`, async () => { + const url = `${BASE_URL}/charon/getNarrative?prefix=${d.prefix}&type=md`; + const res = await fetch(url, {redirect: 'manual'}); + expect(res.status).toEqual(d.errorCode || NOT_FOUND); + const errors = await res.json(); + expect(errors.error).toEqual(d.errorMessage) + }); + } + + if (d.REST===false) continue; + + it(`REST API for 
${d.prefix} (markdown content)`, async () => { + const res = await fetchType(`${BASE_URL}/${d.prefix}`, 'narrative'); + expect(res.status).toEqual(d.errorCode || NOT_FOUND); + const errors = await res.json(); + expect(errors.error).toEqual(d.errorMessage) + }) + + if (d.REST_HTML===false) continue; + + it(`REST API for ${d.prefix} (HTML content)`, async () => { + /* should get a gatsby page, but the error code should be the same as for markdown requests */ + const res = await fetchType(`${BASE_URL}/${d.prefix}`, 'html'); + expect(res.status).toEqual(d.errorCode || NOT_FOUND); + expect(res.headers.get('Content-Type')).toMatch('text/html') + }) + } +}) + +describe("Valid narratives", () => { + + for (const d of narratives.filter((n) => n.valid)) { + + if (d.charon!==false) { + it(`Charon getNarrative API using ${d.prefix}`, async () => { + const url = `${BASE_URL}/charon/getNarrative?prefix=${d.prefix}&type=md`; + const res = await fetch(url, {redirect: 'manual'}); + expect(res.status).toEqual(200) + }); + } + + if (d.REST===false) continue; + + it(`REST API for ${d.prefix} (markdown content)`, async () => { + const res = await fetchType(`${BASE_URL}/${d.prefix}`, 'narrative'); + expect(res.status).toEqual(200) + }) + + it(`REST API for ${d.prefix} (HTML content)`, async () => { + /* should send the auspice entrypoint */ + const res = await fetchType(`${BASE_URL}/${d.prefix}`, 'html'); + expect(res.status).toEqual(200); + expect(res.headers.get('Content-Type')).toMatch('text/html') + }) + } +}) + +/** + * Map the sidecar names we use day-to-day to their content types + * used within the server + */ +const ACCEPT_TYPES = { + json: 'application/json', + 'root-sequence': 'application/vnd.nextstrain.dataset.root-sequence+json', + 'tip-frequencies': "application/vnd.nextstrain.dataset.tip-frequencies+json", + measurements: "application/vnd.nextstrain.dataset.measurements+json", + narrative: "text/markdown", + html: "text/html", +} + +async function fetchType(url, 
fileType) { + // The Content-Type indicates to the server the format of the request data. + // The Accept header specifies the desired response format. + const acceptType = ACCEPT_TYPES[fileType]; + return await fetch(url, { + method: 'GET', + headers: {Accept: acceptType}, + redirect: 'manual' + }); +} + +function notHandledError(d) { + return `This resource cannot handle versioned dataset requests (version descriptor requested: "${d}")` +} diff --git a/test/date_descriptor_index.json b/test/date_descriptor_index.json new file mode 100644 index 000000000..d9428fc5a --- /dev/null +++ b/test/date_descriptor_index.json @@ -0,0 +1,66 @@ +{ + "core": { + "dataset": { + "zika": { + "versions": [ + { + "date": "2023-05-20", + "fileUrls": { + "root-sequence": "https://nextstrain-data.s3.amazonaws.com/zika_root-sequence.json?versionId=_SzKCiKRw7hZhoQogzoTNM8cXP8CI3ML", + "main": "https://nextstrain-data.s3.amazonaws.com/zika.json?versionId=Wa5c62lKoEHaPLBw9e4ZypwLjmR26JYt" + } + }, + { + "date": "2023-02-21", + "fileUrls": { + "root-sequence": "https://nextstrain-data.s3.amazonaws.com/zika_root-sequence.json?versionId=xSaqFeCujRdPmjuYx_MEO8gETcvX9xfC", + "main": "https://nextstrain-data.s3.amazonaws.com/zika.json?versionId=PTcu5h8Q_NH1a55x9cO_Ynt82FuPQHqu" + } + }, + { + "date": "2022-10-05", + "fileUrls": { + "main": "https://nextstrain-data.s3.amazonaws.com/zika.json?versionId=cQV2g9_MA5eIWOvzsgwjTjJnMsaaB0hu" + } + }, + { + "date": "2022-08-01", + "fileUrls": { + "main": "https://nextstrain-data.s3.amazonaws.com/zika.json?versionId=alPSrW5IF27Aw7S3BiQ35.LNmjnneoXv" + } + } + ] + }, + "WNV/NA": { + "versions": [ + { + "date": "2021-04-08", + "fileUrls": { + "main": "https://nextstrain-data.s3.amazonaws.com/WNV_NA.json?versionId=HTsBopAsMJuAmgVx0soPJp_H8kbsPs5J" + } + }, + { + "date": "2020-11-18", + "fileUrls": { + "main": "https://nextstrain-data.s3.amazonaws.com/WNV_NA.json?versionId=7lRiCIu.cP5RTZkm7m7kPSvo6GbD846H" + } + }, + { + "date": "2019-10-30", + "fileUrls": { 
+ "meta": "https://nextstrain-data.s3.amazonaws.com/WNV_NA_meta.json?versionId=7azn45EHKK_rNpjG4bUdrxEBZvFY83eT", + "tree": "https://nextstrain-data.s3.amazonaws.com/WNV_NA_tree.json?versionId=5le2HoAybE5V739je1FbfHdkyHWV0hVO" + } + }, + { + "date": "2019-08-30", + "fileUrls": { + "meta": "https://nextstrain-data.s3.amazonaws.com/WNV_NA_meta.json?versionId=S39COgoSfnyS2H8AiPw4ZGgzQuSObh6E", + "tree": "https://nextstrain-data.s3.amazonaws.com/WNV_NA_tree.json?versionId=MnxN.X6Svw.mWucLhCbqNLqOztY8ypM6" + } + } + ] + } + } + } +} \ No newline at end of file From 855d3b228511851dde83dcc753588c098eb2c563 Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Wed, 1 Nov 2023 20:36:11 +1300 Subject: [PATCH 4/5] [docs] resource collection docs Includes documentation of the AWS changes which are not under terraform control, as well as an overview of the general concepts. --- docs/index.rst | 1 + docs/resource-collection.rst | 119 +++++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 docs/resource-collection.rst diff --git a/docs/index.rst b/docs/index.rst index 93a6fe004..526d7f6b8 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -31,4 +31,5 @@ nextstrain.org routing infrastructure terraform + resource-collection glossary diff --git a/docs/resource-collection.rst b/docs/resource-collection.rst new file mode 100644 index 000000000..ff7072605 --- /dev/null +++ b/docs/resource-collection.rst @@ -0,0 +1,119 @@ +=================== +Resource Collection +=================== + +In order for nextstrain.org to handle URLs with `@YYYY-MM-DD` identifiers the +server needs to be aware of which files exist, including past versions. +In the future this data will also be used to list and display all available +resources (and their versions) to the user. + +The index location is set by the env/config variable ``RESOURCE_INDEX``. The +``RESOURCE_INDEX`` must be either a "s3://" address or a local file path. 
If the +file is located on S3 it must be gzipped. The server loads this index at start +time and refreshes it hourly. The nextstrain.org testing & production configs +currently set this to ``s3://nextstrain-inventories/resources.json.gz``. + +Resource collections can be ignored by the server by setting the env variable +``RESOURCE_INDEX="false"`` or by omitting it from your configuration JSON. + + +Local index generation +====================== + +The resource index may be regenerated locally; see ``node +./resourceIndexer/main.js --help`` for details. By default, this will access S3 +manifest files in order to create the index (see necessary AWS permissions +below), however you can point the indexer to local files if desired. To use +local files, download a suitable manifest and inventory from +``s3://nextstrain-inventories`` and name them like so: + +* For ``nextstrain-data``: ``./devData/core.manifest.json`` and ``core.inventory.csv.gz`` +* For ``nextstrain-staging``: ``./devData/staging.manifest.json`` and ``staging.inventory.csv.gz`` + +then run the indexer with the ``--local`` flag. + +To use a locally produced index, set the env variable ``RESOURCE_INDEX`` to +point to the (JSON) file when you run the server. + + +Automated index generation +========================== + +*This section will be updated once the +index creation is automated.* + +AWS settings necessary for resource collection +============================================== + +The index creation, storage and retrieval require certain AWS settings which +are documented here as most of them are not under terraform control. We use `S3 +inventories +`__ +to list all the documents in certain buckets (or bucket prefixes) which are +generated daily by AWS. The index creation script will download these +inventories and use them to create an index JSON which it uploads to S3. The +nextstrain.org server will access this JSON from S3. 
+ +S3 inventories +-------------- + +We currently produce inventories for the core (s3://nextstrain-data) and +staging (s3://nextstrain-staging) buckets which are generated daily and +published to s3://nextstrain-inventories. The +s3://nextstrain-inventories bucket is a private bucket with versioning enabled. The inventory +configuration can be found in the AWS console for +`core `__ +and +`staging `__. +The config specifies that additional metadata fields for last modified +and ETag are to be included in the inventory. The inventories for core & +staging are published to +s3://nextstrain-inventories/nextstrain-data/config-v1 and +s3://nextstrain-inventories/nextstrain-staging/config-v1, respectively. +The cost of these is minimal (less than $1/bucket/year). + +A lifecycle rule on the s3://nextstrain-inventories bucket (`console link +`__) +marks inventory-related files as deleted 30 days after they are created, and +permanently deletes them 7 days later. + +Index creation (Inventory access and index upload) +-------------------------------------------------- + +**Automated index generation** + +*This section will be updated once the +index creation is automated.* + +**Local index generation for development purposes** + +For local index generation (e.g. during development) you will need IAM +credentials which can list and get objects from s3://nextstrain-inventories; if +you want finer scale access for local index creation, you can restrict access to +certain prefixes in that bucket - for instance ``nextstrain-data/config-v1`` and +``nextstrain-staging/config-v1`` correspond to core and staging buckets, +respectively. + +To upload the index you will need write access for +s3://nextstrain-inventories/resources.json.gz. Note that if your aims are +limited to local development purposes this is not necessary (see `Local development`_). 
+ + +Index backups +------------- + +The ``nextstrain-inventories`` bucket is versioning enabled so past versions of +``s3://nextstrain-inventories/resources.json.gz`` are available. + +A lifecycle rule on the s3://nextstrain-inventories bucket (`console link +`__) +permanently deletes past versions of this file 30 days after it became +noncurrent (i.e. it was replaced with a new upload of the index). + + +Index access by the server +-------------------------- + +IAM users ``nextstrain.org`` and ``nextstrain.org-testing``, which are under +terraform control, have read access to +s3://nextstrain-inventories/resources.json.gz via their associated policies. From c0ee2397a93db5f2735dabc7f13817aa26f82729 Mon Sep 17 00:00:00 2001 From: James Hadfield Date: Fri, 3 Nov 2023 16:38:26 +1300 Subject: [PATCH 5/5] [Github action] Rebuild resource collection index See added documentation for corresponding AWS details. Given that the resources all come from public-facing buckets (core & staging) it seems ok to run this from a public repo, but we may want to revisit this once we start consuming private data. The index is only generated for datasets (not intermediate files) and only for the core bucket (nextstrain-data) as that's all that's currently handled by the server, so it saves us a little s3 storage, transfer overhead and server memory footprint. Future work listing/visualising all available data will use this and so this filtering is only temporary. 
--- .github/workflows/index-resources.yml | 39 +++++++++++++++++++++++++++ docs/resource-collection.rst | 17 +++++++++--- 2 files changed, 52 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/index-resources.yml diff --git a/.github/workflows/index-resources.yml b/.github/workflows/index-resources.yml new file mode 100644 index 000000000..c0fe9445b --- /dev/null +++ b/.github/workflows/index-resources.yml @@ -0,0 +1,39 @@ +name: index resources + +on: + # Run at ~4am UTC time which is (± an hour) 4am UK, 5am Switzerland, midnight + # US east coast, 9pm US west coast so that for most users (and most + # developers) the index regenerates overnight + schedule: + - cron: '0 4 * * *' + + # Manually triggered using GitHub's UI + workflow_dispatch: + +jobs: + rebuild-index: + runs-on: ubuntu-latest + permissions: + id-token: write # needed to interact with GitHub's OIDC Token endpoint + contents: read + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-node@v3 + with: + node-version: '16' + - run: npm ci + - uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: us-east-1 + role-to-assume: arn:aws:iam::827581582529:role/GitHubActionsRoleResourceIndexer + - name: Rebuild the index + run: | + node resourceIndexer/main.js \ + --gzip --output resources.json.gz \ + --resourceTypes dataset --collections core + - name: Upload the new index, overwriting the existing index + run: | + aws s3 cp resources.json.gz s3://nextstrain-inventories/resources.json.gz diff --git a/docs/resource-collection.rst b/docs/resource-collection.rst index ff7072605..a2b8beea6 100644 --- a/docs/resource-collection.rst +++ b/docs/resource-collection.rst @@ -39,8 +39,12 @@ point to the (JSON) file when you run the server. 
Automated index generation ========================== -*This section will be updated once the -index creation is automated.* +The resource collection index is rebuilt every night via a GitHub action running +from this repo. + +*This approach should be revisited when (if) we start indexing private data, +especially for the potential of the GitHub action logging sensitive information +which will be publicly visible.* AWS settings necessary for resource collection ============================================== @@ -82,8 +86,13 @@ Index creation (Inventory access and index upload) **Automated index generation** -*This section will be updated once the -index creation is automated.* +The GitHub action assumes necessary AWS permissions via the IAM role +`GitHubActionsRoleResourceIndexer +`__ +which is obtained using OIDC. This role uses permissions from the IAM policy +`NextstrainResourceIndexer +`__ +to list & read the S3 inventories, as well as upload the new index. **Local index generation for development purposes**