From 40f75f83a8049f686689db94d964a2cdec384bcb Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 3 May 2023 16:03:26 -0400 Subject: [PATCH 01/10] WIP: Start POST canonicalization spec Based on Alex Osborne's excellent start in the WARC specifications repository: http://iipc.github.io/warc-specifications/guidelines/cdx-non-get-requests/ Todo: - Come up with better/more accurate name, since it doesn't only affect POST requests - Write AMF section --- cdxj/0.1.0/index.html | 1 + index.html | 3 +- post-canonicalization/latest/index.json | 69 ++++++ post-canonicalization/latest/index.md | 270 ++++++++++++++++++++++++ 4 files changed, 342 insertions(+), 1 deletion(-) create mode 100644 post-canonicalization/latest/index.json create mode 100644 post-canonicalization/latest/index.md diff --git a/cdxj/0.1.0/index.html b/cdxj/0.1.0/index.html index e59f95c..f718f16 100644 --- a/cdxj/0.1.0/index.html +++ b/cdxj/0.1.0/index.html @@ -11,6 +11,7 @@ latestVersion: "https://specs.webrecorder.net/cdxj/latest/", shortName: "cdxj", includePermalinks: true, + group: "CDX", editors: [ { name: "Ilya Kreymer", diff --git a/index.html b/index.html index ef2c68f..9ef2801 100644 --- a/index.html +++ b/index.html @@ -58,7 +58,8 @@ * [Web Archive Collection Zipped (WACZ)](wacz/latest/): a packaging standard for web archives on the web * [WACZ Signing and Verification](wacz-auth/latest/): the mechanics for signing and verifying WACZ files for proof of authenticity * [Crawl Index JSON (CDXJ)](cdxj/latest/): an extensible format for WARC index files -* [IPFS Custom File Chunking for WARC and WACZ](wacz-ipfs/latest): a specialized content-aware chunking strategy for adding composite web archive files to IPFS +* [IPFS Custom File Chunking for WARC and WACZ](wacz-ipfs/latest): a specializesd content-aware chunking strategy for adding composite web archive files to IPFS +* [POST Canonicalization](post-canonicalization/latest/): a format for canonicalizing non-GET HTTP requests as GET requests for 
indexing and replay
diff --git a/post-canonicalization/latest/index.json b/post-canonicalization/latest/index.json new file mode 100644 index 0000000..00d9276 --- /dev/null +++ b/post-canonicalization/latest/index.json @@ -0,0 +1,69 @@ +{ + "specStatus": "DRAFT", + "respec_js": "../../assets/js/respec-webrecorder.js", + "publishDate": "2023-06-27", + "license": "cc-by", + "thisVersion": "https://specs.webrecorder.net/post-canonicalization/latest/", + "latestVersion": "https://specs.webrecorder.net/post-canonicalization/latest/", + "shortName": "post-canonicalization", + "group": "CDX", + "includePermalinks": true, + "authors": [], + "editors": [ + { + "name": "Alex Osborne", + "url": "https://github.com/ato" + }, + { + "name": "Tessa Walsh", + "url": "https://bitarchivist.net" + }, + { + "name": "Ilya Kreymer", + "url": "https://github.com/ikreymer" + } + ], + "group": { + "name": "WACZ Editors", + "url": "https://webrecorder.net" + }, + "otherLinks": [ + { + "key": "Repository", + "data": [ + { + "value": "Github", + "href": "https://github.com/webrecorder/specs" + }, + { + "value": "Issues", + "href": "https://github.com/webrecorder/specs/issues" + }, + { + "value": "Commits", + "href": "https://github.com/webrecorder/specs/commits" + } + ] + } + ], + "maxTocLevel": 3, + "logos": [ + { + "src": "../../assets/images/webrecorder.svg", + "alt": "Webrecorder Logo", + "height": 100 + } + ], + "localBiblio": { + "PYWB-CDXJ": { + "title": "pywb Indexing: CDXJ Format", + "publisher": "Webrecorder", + "href": "https://pywb.readthedocs.io/en/latest/manual/indexing.html#cdxj-index" + } + }, + "lint": { + "privsec-section": false, + "no-http-props": false, + "no-headingless-sections": false + } +} diff --git a/post-canonicalization/latest/index.md b/post-canonicalization/latest/index.md new file mode 100644 index 0000000..47c965f --- /dev/null +++ b/post-canonicalization/latest/index.md @@ -0,0 +1,270 @@ +# POST Canonicalization + +## Abstract + +Originally CDX files were only used to index web 
archives containing GET requests. As browser-based capture methods can record non-GET requests such as those generated by JavaScript, a way for CDX/CDXJ index records to differentiate based on request method and request body is needed. This document describes the mechanism used for encoding the request method and body in the CDX/CDXJ key by appending additional query parameters, as originally implemented by pywb. + +## Conformance + +As well as sections marked as non-normative, all authoring guidelines, diagrams, examples, and notes in this specification are non-normative. Everything else in this specification is normative. + +The key words MAY and MUST in this document are to be interpreted as described in BCP 14 [RFC2119][1] [RFC8174][2] when, and only when, they appear in all capitals, as shown here. + +## Terminology + +- CDX +- CDXJ +- WACZ +- WARC + +## Introduction + +### Web Archive Formats (WARC and WACZ) + +Web archiving data is often stored in specialized formats, which include a full record of the HTTP network traffic as well as additional metadata. The archived data is often accessed via random-access, loading the appropriate chunks of data based on URLs requested by end users. + +This specification is designed to describe how to store two key file formats used for web archives: + +1. WARC — A widely accepted [ISO standard][3] used by many institutions around the world for storing web archive data. +2. WACZ — A new format [developed by Webrecorder][4] for packaging WARCs with other web archive data which supports random-access reads. + +Both formats are 'composite' formats, containing smaller amounts of data interspersed with metadata. In the case of WARC, the format consists of concatenated records which are appended one after the other, eg. `cat A.warc B.warc > C.warc`. The WARCs may or may not be gzipped, in which case the result is a multi-member gzip. + +WACZ files use the ZIP format which contains a specialized file and directory layout. 
ZIP is also a composite format, containing the raw (sometimes compressed) data as well as header data which contains the location files and directories within the ZIP file. + +## Web Archive Index Formats (CDX and CDXJ) + +Web archive search and retrieval is frequently intermediated by index files of WARC data, in the CDX or CDXJ formats. WACZ files contain CDXJ indices, which may or may not be gzipped, within the ZIP file that comprises the WACZ. + +### CDX + +CDX is a web archive index format developed as part of the Internet Archive's Wayback Machine, where CDX may have been an acronym for Crawl (or Capture) inDeX. A CDX file consists of plain text, with the first line being a legend and each line afterwards describing a web document. More information about how the format works can be found in the [CDX specification][5]. + +CDX was the precursor to the CDXJ index format. + +### Crawl Index JSON (CDXJ) + +Crawl Index JSON or [CDXJ](4) provides a standardized way of representing an index to one or more WARC files. It allows applications to quickly locate a given page in a set of archived web content, as well as metadata associated with that page. Each CDXJ entry can be looked up by URL, and contains a JSON payload that can be used for representing information about that URL. It is used in the [WACZ specification][4]. + +A CDXJ file is a sorted, line oriented plain-text file (optionally GZIP compressed) where each line represents information about a single capture in a web archive collection. + +Each line MUST have three components that are separated by single spaces (0x20): + +1. a Searchable URL +2. an Integer Timestamp +3. a JSON Block + +The Searchable URL is a normalized form of the archived URL that allows a CDXJ file to be sorted and efficiently scanned using a binary search algorithm. The Searchable URL is sometimes referred to as Sort-friendly URI Reordering Transform (SURT). 
+ +The JSON Block contains a serialized [JSON][7] object with newlines escaped so that it fits completely on one line. The object MUST contain the following properties: + +* url: The URL that was archived +* digest: A cryptographic hash for the HTTP response payload +* mime: The media type for the response payload +* filename: the WARC file where the WARC record is located +* offset: the byte offset for the WARC record +* length: the length in bytes of the WARC record +* status: the HTTP status code for the HTTP response + +## Indexing non-GET HTTP requests + +### Motivation + +POST-canonicalization provides a standardized way of representing a non-GET HTTP request as a GET request for indexing and playback in web archives. The original HTTP request type as well as the encoded request body are appended to the original URL and included in CDX/CDXJ indices as the Searchable URL. This allows web archive playback engines to then reconstruct the original non-GET requests for use in playback with their original HTTP method and request body. + +### Encoding the request method + +If the request method is not `GET` it MUST be appended as the value of query parameter `__wb_method`. + +If the URL does not have a query string a `?` MUST be added: + + http://example.org/ => http://example.org/?__wb_method=POST + +If the URL already has a query string the `__wb_method` parameter MUST be added at the end after a `&` separator: + + http://example.org/?page=1 => http://example.org/?page=1&__wb_method=POST + +Even if the query string already ends in `&` another separator MUST still be added: + + http://example.org/?foo& => http://example.org/?foo&&__wb_method=POST + +### Encoding the request body + +Encoding the request body depends on the content-type. 
+ +| Content-Type | Primary Encoding | Fallback Encoding | +|-----------------------------------|------------------|-------------------| +| application/json | JSON | | +| application/x-amf | AMF | | +| application/x-www-form-urlencoded | urlencoded form | binary | +| multipart/* | multipart form | binary | +| text/plain | JSON | binary | +| * | binary | | + +#### AMF request body encoding + +[TODO: To be written] + +#### Binary request body encoding + +The request body is encoded as Base64 ([RFC 4648][7]) and appended to the query string as the `__wb_post_data` parameter. + +> **Example** +> +> Original request: +> +> POST /chat HTTP/1.0 +> Host: example.org +> Content-Length: 5 +> +> hello +> +> Encoded URL: +> +> http://example.org/chat?__wb_method=POST&__wb_post_data=aGVsbG8= + +#### Encoding a urlencoded form request body + +Decode the body to a string using UTF-8, percent decode the string, **percent plus encode** it and then append the result to the output. + +If a UTF-8 decoding error occurs then the binary encoding method MUST be used instead. + +> **Example** +> +> Original request: +> +> POST / HTTP/1.0 +> Host: example.org +> Content-Type: application/x-www-form-urlencoded +> Content-Length: 13 +> +> say=Hi&to=Mom +> +> Encoded URL: +> +> http://example.org/?__wb_method=POST&__wb_post_data=say%3DHi%26to%3DMom + +#### Encoding a multipart form request body + +The body MUST be decoded as form data per [RFC 2388][9] and then percent plus encoded. If the body is not a valid multipart/form-data message then the binary encoding method MUST be used instead. 
+ +> **Example** +> +> Original request: +> +> POST / HTTP/1.1 +> Host: example.org +> Content-Type: multipart/form-data; boundary=AaB03x +> Content-Length: Content-Length: 437 +> +> --AaB03x +> Content-Disposition: form-data; name="submit-value" +> +> Example +> --AaB03x +> Content-Disposition: form-data; name="files" +> Content-Type: multipart/mixed; boundary=BbC04y +> +> --BbC04y +> Content-Disposition: file; filename="file1.txt" +> Content-Type: text/plain +> +> Content of file1.txt. +> +> --BbC04y +> Content-Disposition: file; filename="file2.html" +> Content-Type: text/html +> +> Content of file2.html. +> +> --BbC04y-- +> --AaB03x-- +> +> +> Encoded URL: +> +> http://example.org/?__wb_method=POST&__wb_post_data=--AaB03x%0AContent-Disposition%3A%20form-data%3B%20name%3D%22submit-name%22%0A%0AExample%0A--AaB03x%0AContent-Disposition%3A%20form-data%3B%20name%3D%22files%22%0AContent-Type%3A%20multipart%2Fmixed%3B%20boundary%3DBbC04y%0A%0A--BbC04y%0AContent-Disposition%3A%20file%3B%20filename%3D%22file1.txt%22%0AContent-Type%3A%20text%2Fplain%0A%0AContent%20of%20file1.txt.%0A%0A--BbC04y%0AContent-Disposition%3A%20file%3B%20filename%3D%22file2.html%22%0AContent-Type%3A%20text%2Fhtml%0A%0A%3C%21DOCTYPE%20html%3E%3Ctitle%3EContent%20of%20file2.html.%3C%2Ftitle%3E%0A%0A--BbC04y--%0A--AaB03x--%0A + +#### Encoding a JSON request body + +The request MUST be parsed as JSON ([RFC 8259][10]) and then apply the following algorithm with an empty string as the initial value of *name*. + +To **encode a JSON *value***, given a *name* and an initially-empty map *nameCounts* of strings to integers: + +1. If *value* is a JSON object: + 1. Recursively encode each member of the object passing member's name as *name* and the member's value as *value*. +2. If *value* is a JSON array: + 1. Recursively encode each element of the array passing the current value of *name* as + *name* and the value of the element as *value*. +3. Otherwise: + 1. Define the string *encodedValue* as: + 1. 
If *value* is JSON true then the string "true". + 2. If *value* is JSON false then the string "false". + 3. If *value* is JSON null then the string "null". + 4. If *value* is a JSON string then the result of **percent plus encoding** the string. + 5. If *value* is a JSON number then the number as a string consistent with the output of JavaScript's toString() method for the number. + 2. If *nameCounts* contains the integer *count* for *name*: + 1. Increment *count* by 1. + 2. Store *count* as the new count for *name* in *nameCounts*. + 3. Append the string "&*name*.*count*_=*encodedValue*" to the output. + 3. Otherwise, if *nameCounts* does not contain *name*: + 1. Store the integer 1 in *nameCounts* for *name*. + 2. Append the string "&*name*=*encodedValue*" to the output. + +> **Example** +> +> Original request: +> +> POST /events HTTP/1.0 +> Host: example.org +> Content-Type: application/json +> +> { +> "type": "event", +> "id": 44.0, +> "float": 35.7, +> "values": [true, false, null], +> "source": { +> "type": "component", +> "id": "a+b&c= d", +> "values": [3, 4] +> } +> } +> +> Encoded URL: +> +> http://example.org/events?__wb_method=POST&type=event&id=44&float=35.7&values=true +> &values.2_=false&values.3_=null&type.2_=component&id.2_=a%2Bb%26c%3D+d +> &values.4_=3&values.5_=4 + +## Appendix + +### Percent plus encoding + +To **percent plus encode a string**, first encode it as UTF-8 and then **percent plus encode** the resulting byte sequence. + +To **percent plus encode a byte sequence**, for each byte in the input sequence: + +1. If the byte falls within the following ASCII character ranges, append it to the output as is. + +`'0'-'9', 'a'-'z', 'A'-'Z', '-', '.', '_', '~'` + +2. If the byte is the ASCII space character (' '), append the ASCII plus character ('+') to the output. + +3. Otherwise, append ASCII percent character ('%') to the output, followed by the value of the byte formatted as two uppercase hexadecimal digits. 
+ +> **Compatibility Note** +> +> Prior to Python 3.7 the character "~" was percent encoded. + + +[1]: https://www.rfc-editor.org/rfc/rfc2119 +[2]: https://www.rfc-editor.org/rfc/rfc8174 +[3]: https://iipc.github.io/warc-specifications/ +[4]: https://specs.webrecorder.net/wacz/latest/ +[5]: https://iipc.github.io/warc-specifications/specifications/cdx-format/cdx-2015/ +[6]: https://specs.webrecorder.net/cdxj/0.1.0/ +[7]: https://www.rfc-editor.org/rfc/rfc8259 +[8]: https://tools.ietf.org/html/rfc4648 +[9]: https://datatracker.ietf.org/doc/html/rfc2388 +[10]: https://datatracker.ietf.org/doc/html/rfc8259 From 6bf3dfafe2de8f3c42e5e7daecaac88f5436ab55 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 4 Apr 2024 15:12:15 -0400 Subject: [PATCH 02/10] Add AMF section and more detailed compatibility note --- post-canonicalization/latest/index.md | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/post-canonicalization/latest/index.md b/post-canonicalization/latest/index.md index 47c965f..eb654f3 100644 --- a/post-canonicalization/latest/index.md +++ b/post-canonicalization/latest/index.md @@ -101,13 +101,15 @@ Encoding the request body depends on the content-type. | text/plain | JSON | binary | | * | binary | | -#### AMF request body encoding +#### AMF (Action Message Format) request body encoding -[TODO: To be written] +AMF request body encoding is considered experimental and is only supported in pywb. It is possible this feature will be deprecated in the future. + +The current ([pywb implementation of AMF request body encoding][7]) and ([associated tests][8]) are available in the pywb repository. #### Binary request body encoding -The request body is encoded as Base64 ([RFC 4648][7]) and appended to the query string as the `__wb_post_data` parameter. +The request body is encoded as Base64 ([RFC 4648][9]) and appended to the query string as the `__wb_post_data` parameter. 
> **Example** > @@ -146,7 +148,7 @@ If a UTF-8 decoding error occurs then the binary encoding method MUST be used in #### Encoding a multipart form request body -The body MUST be decoded as form data per [RFC 2388][9] and then percent plus encoded. If the body is not a valid multipart/form-data message then the binary encoding method MUST be used instead. +The body MUST be decoded as form data per [RFC 2388][10] and then percent plus encoded. If the body is not a valid multipart/form-data message then the binary encoding method MUST be used instead. > **Example** > @@ -187,7 +189,7 @@ The body MUST be decoded as form data per [RFC 2388][9] and then percent plus en #### Encoding a JSON request body -The request MUST be parsed as JSON ([RFC 8259][10]) and then apply the following algorithm with an empty string as the initial value of *name*. +The request MUST be parsed as JSON ([RFC 8259][11]) and then apply the following algorithm with an empty string as the initial value of *name*. To **encode a JSON *value***, given a *name* and an initially-empty map *nameCounts* of strings to integers: @@ -211,6 +213,8 @@ To **encode a JSON *value***, given a *name* and an initially-empty map *nameCou 1. Store the integer 1 in *nameCounts* for *name*. 2. Append the string "&*name*=*encodedValue*" to the output. +The resulting query string will contain encoded key/value pairs of each leaf node of the JSON body. + > **Example** > > Original request: @@ -256,6 +260,8 @@ To **percent plus encode a byte sequence**, for each byte in the input sequence: > **Compatibility Note** > > Prior to Python 3.7 the character "~" was percent encoded. +> +> Older versions of ([pywb][12]) and ([warcio.js][]13) had slight discrepencies in the query strings they output for the same request data. For instance, pywb wrote Pythonic values for some values (`True`, `False`, `None`) rather than native JSON values (`true`, `false`, `null`), and warcio handled nested JSON differently than pywb. 
As of the publication of this specification, all current versions of Webrecorder software should behave identically. [1]: https://www.rfc-editor.org/rfc/rfc2119 @@ -264,7 +270,10 @@ To **percent plus encode a byte sequence**, for each byte in the input sequence: [4]: https://specs.webrecorder.net/wacz/latest/ [5]: https://iipc.github.io/warc-specifications/specifications/cdx-format/cdx-2015/ [6]: https://specs.webrecorder.net/cdxj/0.1.0/ -[7]: https://www.rfc-editor.org/rfc/rfc8259 -[8]: https://tools.ietf.org/html/rfc4648 -[9]: https://datatracker.ietf.org/doc/html/rfc2388 -[10]: https://datatracker.ietf.org/doc/html/rfc8259 +[7]: https://github.com/webrecorder/pywb/blob/main/pywb/warcserver/amf.py +[8]: https://github.com/webrecorder/pywb/blob/main/pywb/warcserver/test/test_amf.py +[9]: https://tools.ietf.org/html/rfc4648 +[10]: https://datatracker.ietf.org/doc/html/rfc2388 +[11]: https://www.rfc-editor.org/rfc/rfc8259 +[12]: https://github.com/webrecorder/pywb +[13]: https://github.com/webrecorder/warcio.js From 9d644ceed5de00b275e11d7168e5b3e754be9ae5 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 4 Apr 2024 15:13:31 -0400 Subject: [PATCH 03/10] Update publication date --- post-canonicalization/latest/index.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/post-canonicalization/latest/index.json b/post-canonicalization/latest/index.json index 00d9276..38c962f 100644 --- a/post-canonicalization/latest/index.json +++ b/post-canonicalization/latest/index.json @@ -1,7 +1,7 @@ { "specStatus": "DRAFT", "respec_js": "../../assets/js/respec-webrecorder.js", - "publishDate": "2023-06-27", + "publishDate": "2024-04-08", "license": "cc-by", "thisVersion": "https://specs.webrecorder.net/post-canonicalization/latest/", "latestVersion": "https://specs.webrecorder.net/post-canonicalization/latest/", From b96091a8d4aeeb8ded36e8a53652c00a4acc9375 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 4 Apr 2024 15:14:14 -0400 Subject: [PATCH 04/10] 
Fix typo on index page --- index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index.html b/index.html index 9ef2801..c864b53 100644 --- a/index.html +++ b/index.html @@ -58,7 +58,7 @@ * [Web Archive Collection Zipped (WACZ)](wacz/latest/): a packaging standard for web archives on the web * [WACZ Signing and Verification](wacz-auth/latest/): the mechanics for signing and verifying WACZ files for proof of authenticity * [Crawl Index JSON (CDXJ)](cdxj/latest/): an extensible format for WARC index files -* [IPFS Custom File Chunking for WARC and WACZ](wacz-ipfs/latest): a specializesd content-aware chunking strategy for adding composite web archive files to IPFS +* [IPFS Custom File Chunking for WARC and WACZ](wacz-ipfs/latest): a specialized content-aware chunking strategy for adding composite web archive files to IPFS * [POST Canonicalization](post-canonicalization/latest/): a format for canonicalizing non-GET HTTP requests as GET requests for indexing and replay
From 7f7cd3799041c0b51652cccab04a2dd5e92f549b Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 4 Apr 2024 15:16:07 -0400 Subject: [PATCH 05/10] Rename to Request Body Canonicalization --- index.html | 2 +- .../latest/index.json | 6 +++--- .../latest/index.md | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) rename {post-canonicalization => request-body-canonicalization}/latest/index.json (86%) rename {post-canonicalization => request-body-canonicalization}/latest/index.md (99%) diff --git a/index.html b/index.html index c864b53..455611b 100644 --- a/index.html +++ b/index.html @@ -59,7 +59,7 @@ * [WACZ Signing and Verification](wacz-auth/latest/): the mechanics for signing and verifying WACZ files for proof of authenticity * [Crawl Index JSON (CDXJ)](cdxj/latest/): an extensible format for WARC index files * [IPFS Custom File Chunking for WARC and WACZ](wacz-ipfs/latest): a specialized content-aware chunking strategy for adding composite web archive files to IPFS -* [POST Canonicalization](post-canonicalization/latest/): a format for canonicalizing non-GET HTTP requests as GET requests for indexing and replay +* [Request Body Canonicalization](request-body-canonicalization/latest/): a format for canonicalizing non-GET HTTP requests as GET requests for indexing and replay
diff --git a/post-canonicalization/latest/index.json b/request-body-canonicalization/latest/index.json similarity index 86% rename from post-canonicalization/latest/index.json rename to request-body-canonicalization/latest/index.json index 38c962f..5c6ce09 100644 --- a/post-canonicalization/latest/index.json +++ b/request-body-canonicalization/latest/index.json @@ -3,9 +3,9 @@ "respec_js": "../../assets/js/respec-webrecorder.js", "publishDate": "2024-04-08", "license": "cc-by", - "thisVersion": "https://specs.webrecorder.net/post-canonicalization/latest/", - "latestVersion": "https://specs.webrecorder.net/post-canonicalization/latest/", - "shortName": "post-canonicalization", + "thisVersion": "https://specs.webrecorder.net/request-body-canonicalization/latest/", + "latestVersion": "https://specs.webrecorder.net/request-body-canonicalization/latest/", + "shortName": "request-body-canonicalization", "group": "CDX", "includePermalinks": true, "authors": [], diff --git a/post-canonicalization/latest/index.md b/request-body-canonicalization/latest/index.md similarity index 99% rename from post-canonicalization/latest/index.md rename to request-body-canonicalization/latest/index.md index eb654f3..78a0fcd 100644 --- a/post-canonicalization/latest/index.md +++ b/request-body-canonicalization/latest/index.md @@ -1,4 +1,4 @@ -# POST Canonicalization +# Request Body Canonicalization ## Abstract From b2e635aa7f6e18fec5216db8756f6dc311bf0ee0 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 4 Apr 2024 15:21:39 -0400 Subject: [PATCH 06/10] Improve grammar of draft spec --- request-body-canonicalization/latest/index.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/request-body-canonicalization/latest/index.md b/request-body-canonicalization/latest/index.md index 78a0fcd..554a009 100644 --- a/request-body-canonicalization/latest/index.md +++ b/request-body-canonicalization/latest/index.md @@ -2,7 +2,7 @@ ## Abstract -Originally CDX files were 
only used to index web archives containing GET requests. As browser-based capture methods can record non-GET requests such as those generated by JavaScript, a way for CDX/CDXJ index records to differentiate based on request method and request body is needed. This document describes the mechanism used for encoding the request method and body in the CDX/CDXJ key by appending additional query parameters, as originally implemented by pywb. +Originally, CDX files were only used to index web archives containing GET requests. As browser-based capture methods can record non-GET requests such as those generated by JavaScript, a way for CDX/CDXJ index records to differentiate based on request method and request body is needed. This document describes the mechanism used for encoding the request method and body in the CDX/CDXJ key by appending additional query parameters, as originally implemented by pywb. ## Conformance @@ -23,18 +23,18 @@ The key words MAY and MUST in this document are to be interpreted as described i Web archiving data is often stored in specialized formats, which include a full record of the HTTP network traffic as well as additional metadata. The archived data is often accessed via random-access, loading the appropriate chunks of data based on URLs requested by end users. -This specification is designed to describe how to store two key file formats used for web archives: +Web archiving data is often stored in two key file formats: 1. WARC — A widely accepted [ISO standard][3] used by many institutions around the world for storing web archive data. 2. WACZ — A new format [developed by Webrecorder][4] for packaging WARCs with other web archive data which supports random-access reads. Both formats are 'composite' formats, containing smaller amounts of data interspersed with metadata. In the case of WARC, the format consists of concatenated records which are appended one after the other, eg. `cat A.warc B.warc > C.warc`. 
The WARCs may or may not be gzipped, in which case the result is a multi-member gzip. -WACZ files use the ZIP format which contains a specialized file and directory layout. ZIP is also a composite format, containing the raw (sometimes compressed) data as well as header data which contains the location files and directories within the ZIP file. +WACZ files use the ZIP format, which contains a specialized file and directory layout. ZIP is also a composite format, containing the raw (sometimes compressed) data as well as header data which contains the location files and directories within the ZIP file. ## Web Archive Index Formats (CDX and CDXJ) -Web archive search and retrieval is frequently intermediated by index files of WARC data, in the CDX or CDXJ formats. WACZ files contain CDXJ indices, which may or may not be gzipped, within the ZIP file that comprises the WACZ. +Web archive search and retrieval is frequently intermediated by index files of WARC data in the CDX or CDXJ formats. WACZ files contain CDXJ indices, which may or may not be gzipped, within the ZIP file that comprises the WACZ. ### CDX @@ -70,7 +70,7 @@ The JSON Block contains a serialized [JSON][7] object with newlines escaped so t ### Motivation -POST-canonicalization provides a standardized way of representing a non-GET HTTP request as a GET request for indexing and playback in web archives. The original HTTP request type as well as the encoded request body are appended to the original URL and included in CDX/CDXJ indices as the Searchable URL. This allows web archive playback engines to then reconstruct the original non-GET requests for use in playback with their original HTTP method and request body. +Request body canonicalization provides a standardized way of representing a non-GET HTTP request as a GET request for indexing and playback in web archives. 
The original HTTP request type as well as the encoded request body are appended to the original URL and included in CDX/CDXJ indices as the Searchable URL. This allows web archive playback engines to then reconstruct the original non-GET requests for use in playback with their original HTTP method and request body. ### Encoding the request method From 3578f52529c57ab9a65cde79ccab032bf79053e7 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 4 Apr 2024 15:22:18 -0400 Subject: [PATCH 07/10] warcio -> warcio.js --- request-body-canonicalization/latest/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/request-body-canonicalization/latest/index.md b/request-body-canonicalization/latest/index.md index 554a009..b63fec3 100644 --- a/request-body-canonicalization/latest/index.md +++ b/request-body-canonicalization/latest/index.md @@ -261,7 +261,7 @@ To **percent plus encode a byte sequence**, for each byte in the input sequence: > > Prior to Python 3.7 the character "~" was percent encoded. > -> Older versions of ([pywb][12]) and ([warcio.js][]13) had slight discrepencies in the query strings they output for the same request data. For instance, pywb wrote Pythonic values for some values (`True`, `False`, `None`) rather than native JSON values (`true`, `false`, `null`), and warcio handled nested JSON differently than pywb. As of the publication of this specification, all current versions of Webrecorder software should behave identically. +> Older versions of ([pywb][12]) and ([warcio.js][]13) had slight discrepencies in the query strings they output for the same request data. For instance, pywb wrote Pythonic values for some values (`True`, `False`, `None`) rather than native JSON values (`true`, `false`, `null`), and warcio.js handled nested JSON differently than pywb. As of the publication of this specification, all current versions of Webrecorder software should behave identically. 
[1]: https://www.rfc-editor.org/rfc/rfc2119 From b836eb6c79a6cedf3db99b25fddc84348c5d21db Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 4 Apr 2024 15:26:09 -0400 Subject: [PATCH 08/10] Fix typo with reference --- request-body-canonicalization/latest/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/request-body-canonicalization/latest/index.md b/request-body-canonicalization/latest/index.md index b63fec3..219d6ce 100644 --- a/request-body-canonicalization/latest/index.md +++ b/request-body-canonicalization/latest/index.md @@ -261,7 +261,7 @@ To **percent plus encode a byte sequence**, for each byte in the input sequence: > > Prior to Python 3.7 the character "~" was percent encoded. > -> Older versions of ([pywb][12]) and ([warcio.js][]13) had slight discrepencies in the query strings they output for the same request data. For instance, pywb wrote Pythonic values for some values (`True`, `False`, `None`) rather than native JSON values (`true`, `false`, `null`), and warcio.js handled nested JSON differently than pywb. As of the publication of this specification, all current versions of Webrecorder software should behave identically. +> Older versions of ([pywb][12]) and ([warcio.js][13]) had slight discrepancies in the query strings they output for the same request data. For instance, pywb wrote Pythonic values for some values (`True`, `False`, `None`) rather than native JSON values (`true`, `false`, `null`), and warcio.js handled nested JSON differently than pywb. As of the publication of this specification, all current versions of Webrecorder software should behave identically. 
[1]: https://www.rfc-editor.org/rfc/rfc2119 From 2eeb0d2195ee888269dab6ce4d790eb94ea0e3db Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Mon, 8 Apr 2024 11:38:31 -0400 Subject: [PATCH 09/10] Apply suggestions from code review Co-authored-by: Henry Wilkinson --- request-body-canonicalization/latest/index.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/request-body-canonicalization/latest/index.md b/request-body-canonicalization/latest/index.md index 219d6ce..1e1399d 100644 --- a/request-body-canonicalization/latest/index.md +++ b/request-body-canonicalization/latest/index.md @@ -26,15 +26,15 @@ Web archiving data is often stored in specialized formats, which include a full Web archiving data is often stored in two key file formats: 1. WARC — A widely accepted [ISO standard][3] used by many institutions around the world for storing web archive data. -2. WACZ — A new format [developed by Webrecorder][4] for packaging WARCs with other web archive data which supports random-access reads. +2. WACZ — A new format [developed by Webrecorder][4] for packaging WARCs with other web archive data, enabling efficient random-access reads. -Both formats are 'composite' formats, containing smaller amounts of data interspersed with metadata. In the case of WARC, the format consists of concatenated records which are appended one after the other, eg. `cat A.warc B.warc > C.warc`. The WARCs may or may not be gzipped, in which case the result is a multi-member gzip. +Both formats are 'composite' formats, containing smaller amounts of data interspersed with metadata. In the case of WARC, the format consists of concatenated records which are appended one after the other, e.g. `cat A.warc B.warc > C.warc`. The WARCs MAY be gzipped, in which case the result is a multi-member gzip. WACZ files use the ZIP format, which contains a specialized file and directory layout.
ZIP is also a composite format, containing the raw (sometimes compressed) data as well as header data which contains the location files and directories within the ZIP file. ## Web Archive Index Formats (CDX and CDXJ) -Web archive search and retrieval is frequently intermediated by index files of WARC data in the CDX or CDXJ formats. WACZ files contain CDXJ indices, which may or may not be gzipped, within the ZIP file that comprises the WACZ. +Web archive search and retrieval is frequently intermediated by index files of WARC data in the CDX or CDXJ formats. WACZ files contain CDXJ indices, which MAY be gzipped, within the ZIP file that comprises the WACZ. ### CDX From e0899118013de3981c931efc1eecea81424c1a78 Mon Sep 17 00:00:00 2001 From: Henry Wilkinson Date: Mon, 8 Apr 2024 12:22:12 -0400 Subject: [PATCH 10/10] =?UTF-8?q?"single=20capture"=20=E2=86=92=20"single?= =?UTF-8?q?=20captured=20URL"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Tessa Walsh --- request-body-canonicalization/latest/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/request-body-canonicalization/latest/index.md b/request-body-canonicalization/latest/index.md index 1e1399d..d2a78bb 100644 --- a/request-body-canonicalization/latest/index.md +++ b/request-body-canonicalization/latest/index.md @@ -46,7 +46,7 @@ CDX was the precursor to the CDXJ index format. Crawl Index JSON or [CDXJ](4) provides a standardized way of representing an index to one or more WARC files. It allows applications to quickly locate a given page in a set of archived web content, as well as metadata associated with that page. Each CDXJ entry can be looked up by URL, and contains a JSON payload that can be used for representing information about that URL. It is used in the [WACZ specification][4]. 
-A CDXJ file is a sorted, line oriented plain-text file (optionally GZIP compressed) where each line represents information about a single capture in a web archive collection. +A CDXJ file is a sorted, line-oriented plain-text file (optionally GZIP compressed) where each line represents information about a single captured URL in a web archive collection. Each line MUST have three components that are separated by single spaces (0x20):