# Further Improve Execution Speed with Explicit Bounds Checks #6094
For posterity, here is the suuuuuper-hacky WASI polyfill I used to get SpiderMonkey running the Sightglass benchmark in its JS shell:

```js
const ITERS = 100;

////////////////////////////////////////////////////////////////////////////////

class TextEncoder {
  constructor(enc) {
    if (enc != "utf-8") {
      throw new Error("FITZGEN: unsupported encoding: " + enc);
    }
  }
  // Nonstandard signature: `n` caps how many bytes we write.
  encode(s, n) {
    let buf = new Uint8Array(s.length);
    for (let i = 0; i < Math.min(s.length, n); i++) {
      buf[i] = s.charCodeAt(i); // lol
    }
    return buf;
  }
}

class TextDecoder {
  constructor(enc) {
    if (enc != "utf-8") {
      throw new Error("FITZGEN: unsupported encoding: " + enc);
    }
  }
  decode(buf) {
    let buf8 = new Uint8Array(buf);
    let s = "";
    for (let i = 0; i < buf8.length; i++) {
      s += String.fromCharCode(buf8[i]); // lol
    }
    return s;
  }
}

////////////////////////////////////////////////////////////////////////////////

// `wasmHasTier2CompilationCompleted` and `sleep` are SpiderMonkey JS shell
// builtins: spin until Ion (tier 2) has finished compiling the module.
async function ionCompile(wasm) {
  let module = await WebAssembly.compile(wasm);
  while (!wasmHasTier2CompilationCompleted(module)) {
    sleep(1);
  }
  return module;
}

function unimplemented(name) {
  return function (...args) {
    throw new Error(name + " is unimplemented! args = " + args);
  };
}

class ProcExitError extends Error {
  constructor(code) {
    // `super` must be called before touching `this` in a derived class.
    super("Program exited with code " + code);
    this.code = code;
  }
  toString() {
    return "ProcExitError: " + this.message;
  }
}

function trace(obj) {
  let proxy = {};
  for (let key of Object.keys(obj)) {
    if (typeof obj[key] != "function") {
      proxy[key] = obj[key];
      continue;
    }
    proxy[key] = function (...args) {
      print("TRACE: " + key + "(" + args + ")");
      let ret = obj[key](...args);
      print("TRACE: -> " + ret);
      return ret;
    };
  }
  return proxy;
}

////////////////////////////////////////////////////////////////////////////////

async function main() {
  let smWasm = os.file.readFile("/home/nick/sightglass/benchmarks/spidermonkey/benchmark.wasm", "binary");
  let smModule = await ionCompile(smWasm);
  for (let i = 0; i < ITERS; i++) {
    let args = ["js"];
    let mem = null;
    let start = null;
    let unread = true;
    let smInstance = await WebAssembly.instantiate(smModule, {
      bench: {
        start: function () {
          start = monotonicNow();
        },
        end: function () {
          let end = monotonicNow();
          print("ITER: " + (end - start));
        },
      },
      wasi_snapshot_preview1: {
        random_get: function (a, b) {
          // Not actually random, but good enough for benchmarking.
          while (b > 0) {
            (new DataView(mem.buffer)).setInt8(a, 1);
            b -= 1;
            a += 1;
          }
          return 0;
        },
        args_get: function (a, b) {
          for (let arg of args) {
            (new DataView(mem.buffer)).setInt32(a, b, true);
            a += 4;
            for (let c in arg) {
              (new DataView(mem.buffer)).setInt8(b, arg.charCodeAt(c));
              b += 1;
            }
            (new DataView(mem.buffer)).setInt8(b, 0);
            b += 1;
          }
          return 0;
        },
        args_sizes_get: function (a, b) {
          let len = 0;
          for (let arg of args) {
            len += arg.length + 1;
          }
          (new DataView(mem.buffer)).setInt32(a, args.length, true);
          (new DataView(mem.buffer)).setInt32(b, len, true);
          return 0;
        },
        clock_res_get: function () { return 1; },
        clock_time_get: function (a, b, c) {
          const now = Math.round(performance.now() * 1000000);
          (new DataView(mem.buffer)).setBigInt64(c, BigInt(now), true);
          return 0;
        },
        fd_filestat_get: function () { throw new Error('fd_filestat_get'); },
        fd_read: function (fd, iovecs_ptr, iovecs_len, out_ptr) {
          let mem8 = new Uint8Array(mem.buffer);
          switch (fd) {
            case 4: {
              let data = os.file.readFile("/home/nick/sightglass/benchmarks/spidermonkey/default.input.md");
              let k = 0;
              for (let i = 0; i < iovecs_len && k < data.length && unread; i++) {
                let ptr = (new DataView(mem.buffer)).getUint32(iovecs_ptr + i * 8, true);
                let len = (new DataView(mem.buffer)).getUint32(iovecs_ptr + i * 8 + 4, true);
                for (let j = 0; j < len && k < data.length; j++) {
                  mem8[ptr + j] = data.charCodeAt(k++);
                }
              }
              unread = false;
              (new DataView(mem.buffer)).setUint32(out_ptr, k, true);
              return 0;
            }
            default:
              return 8; // errno 8 = `badf`
          }
        },
        fd_seek: function (fd, offset, whence, out_ptr) {
          switch (fd) {
            case 4: {
              let len = os.file.readFile("/home/nick/sightglass/benchmarks/spidermonkey/default.input.md").length;
              (new DataView(mem.buffer)).setBigUint64(out_ptr, BigInt(len), true);
              return 0;
            }
            default:
              return 8; // errno 8 = `badf`
          }
        },
        fd_write: function (a, b, c, d) {
          let s = '';
          let total = 0;
          while (c > 0) {
            let base = (new DataView(mem.buffer)).getInt32(b, true);
            let len = (new DataView(mem.buffer)).getInt32(b + 4, true);
            b += 8;
            c -= 1;
            while (len > 0) {
              let byte = new Uint8Array(mem.buffer)[base];
              s += String.fromCharCode(byte);
              len -= 1;
              base += 1;
              total += 1;
            }
          }
          // print("fd_write(" + a + "): " + s.trimEnd());
          (new DataView(mem.buffer)).setInt32(d, total, true);
          return 0;
        },
        fd_fdstat_set_flags: function () { throw new Error('fd_fdstat_set_flags'); },
        path_filestat_get: function () { throw new Error('path_filestat_get'); },
        path_open: function (fd, dirflags, path_ptr, path_len, oflags, fs_rights_base, fs_rights_inheriting, fdflags, out_ptr) {
          let buf = new Uint8Array(path_len);
          let mem8 = new Uint8Array(mem.buffer);
          for (let i = 0; i < path_len; i++) {
            buf[i] = mem8[path_ptr + i];
          }
          let path = (new TextDecoder('utf-8')).decode(buf);
          switch (path) {
            case "default.input.md":
              (new DataView(mem.buffer)).setInt32(out_ptr, 4, true);
              return 0;
            default:
              print('denying path_open(' + path + ')');
              return 2; // errno 2 = `acces`
          }
        },
        path_remove_directory: function () { throw new Error('path_remove_directory'); },
        path_unlink_file: function () { throw new Error('path_unlink_file'); },
        sched_yield: function () { throw new Error('sched_yield'); },
        environ_get: function () { return 0; },
        environ_sizes_get: function (a, b) {
          (new DataView(mem.buffer)).setInt32(a, 0, true);
          (new DataView(mem.buffer)).setInt32(b, 0, true);
          return 0;
        },
        fd_close: function (fd) {
          return 0;
        },
        fd_fdstat_get: function (fd, out_ptr) {
          (new DataView(mem.buffer)).setInt32(out_ptr, 0);
          // type
          switch (fd) {
            case 3:
              (new DataView(mem.buffer)).setInt32(out_ptr + 4, 3, true);
              break;
            case 4:
              (new DataView(mem.buffer)).setInt32(out_ptr + 4, 4, true);
              break;
            default:
              (new DataView(mem.buffer)).setInt32(out_ptr + 4, 0, true);
              break;
          }
          (new DataView(mem.buffer)).setInt32(out_ptr + 8, 0, true);
          (new DataView(mem.buffer)).setInt32(out_ptr + 12, 0, true);
          (new DataView(mem.buffer)).setInt32(out_ptr + 16, 0, true);
          (new DataView(mem.buffer)).setInt32(out_ptr + 20, 0, true);
          return 0;
        },
        fd_prestat_get: function (a, b) {
          // case for preopened "."
          if (a == 3) {
            // discriminant for directory
            (new DataView(mem.buffer)).setInt32(b, 0, true);
            // one entry in directory
            (new DataView(mem.buffer)).setInt32(b + 4, 1, true);
            return 0;
          }
          return 8; // errno 8 = `badf`
        },
        fd_prestat_dir_name: function (fd, ptr, len) {
          let buf = (new TextEncoder('utf-8')).encode(".", 1);
          let mem8 = new Uint8Array(mem.buffer);
          for (let i = 0; i < Math.min(buf.length, len); i++) {
            mem8[ptr + i] = buf[i];
          }
          // NUL-terminate relative to `ptr`, not at an absolute offset.
          mem8[ptr + Math.min(buf.length, len)] = 0;
          return 0;
        },
        proc_exit: function (code) { throw new ProcExitError(code); },
      },
    });
    mem = smInstance.exports.memory;
    try {
      smInstance.exports._start();
    } catch (e) {
      if ((e instanceof ProcExitError) && e.code == 0) {
        continue;
      }
      throw e;
    }
  }
  print("Okay!");
}

main().catch(function (e) {
  print("====== ERROR!!! ======");
  print(e);
  print(e.stack);
});
```
I talked with @cfallin about these optimizations a little bit yesterday, and he pointed out that the proposed `select_spectre_guard`-elimination pass is not actually safe in the presence of speculative execution.

(Thinking out loud a bit here:) It is unclear to me whether a local (within the same basic block) version of the `select_spectre_guard`-elimination pass would fare any better.

All that said, I think the basic constraint-elimination pass would still be beneficial here, but we wouldn't get as much as I originally hoped for in the OP.
Thanks for summarizing all this @fitzgen; sorry for not getting to it sooner! I don't think a "remove dominated checks within a basic block" optimization is safe, either, for the same reasons; I want to try to give a little intuition into how the out-of-order execution works to show why and help guide any further discussions.

The main idea behind OoO is restricted dataflow execution: "restricted" because it operates like a sliding window over the program's execution trace, and "dataflow execution" because within that window, instructions are decoded into a true dependency graph and can execute whenever their inputs are ready. You can think of the CPU as having a frontend that predicts some trace (at each branch, it picks a direction; at indirects, it predicts a target; at returns, it predicts based on an internal predictor callstack); that output is a linear stream of instructions. This stream goes through a "rename" stage that computes dependencies and does a sort of SSA-like thing, assigning new physical registers to instructions' written dests; and an "allocate" stage that puts the instructions into a "reorder buffer" (ROB) as well as the scheduling data structures (think "ready queues" and such). Then there is a "retirement stage" that crawls the reorder buffer from a tail pointer, and at any cycle can take some number of the oldest instructions in flight and "commit" or "retire" them.

Speculation is a behavior that arises out of the cooperation of the frontend and backend. The frontend is predicting some path; the OoO engine then treats this linear stream as truth; but if a branch resolves differently than it was predicted, or an instruction traps, we need to flush and resteer. In the simplest designs, this happens when the instruction reaches retirement (is the oldest instruction): we need a consistent architectural machine state to restart from, and the most natural way to get that is to let retirement put things back in order. More complex approaches use periodic checkpoints, but the effect is the same -- the speculation recovery is delayed a bit, and younger instructions (those "beyond" the misspeculation or trap) were still in the dataflow window for however long and could have executed. So if we have a program something like:
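In CLIF-flavored pseudocode, that shape might look like the following (an illustrative sketch; the value names, opcodes, and trap block are assumptions for exposition, not the original example):

```
v0 = icmp uge v_index, v_bound   ; 1: bounds-check condition
brif v0, block_trap, block_ok    ; 2: branch to the trap on OOB
;; in block_ok:
v1 = iadd v_base, v_index        ; 3: compute the native address
v2 = load.i64 v1                 ; 4: the guarded load
v3 = load.i64 v2                 ; 5: a dependent load (the classic leak gadget)
```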
and let's say the branch is mispredicted as falling through (predicted not-OOB, but actually is). Then the reorder buffer will contain those five instructions in order, and execution obeys only true dataflow dependencies. Even if we can resolve the branch relatively early, misspeculation recovery is delayed somewhat (possibly until retirement in simple designs; and imagine there are 100 instructions prior to these in the window, so it could be a while). If it weren't for the delayed recovery, the mispredicted branch would squash the two loads before they ran; as it is, nothing in the dataflow graph stops them from executing with the out-of-bounds address during that window. Now say we have the same sequence, but with the address routed through a conditional move that forces it to null when the index is out of bounds:
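Sketched the same way (again illustrative, with assumed names):

```
v0 = icmp uge v_index, v_bound          ; bounds-check condition
v1 = iadd v_base, v_index               ; native address
v2 = iconst.i64 0
v3 = select_spectre_guard v0, v2, v1    ; cmove: null address if OOB
v4 = load.i64 v3                        ; guarded load
v5 = load.i64 v4                        ; dependent load
```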
we get these instructions into the window, and the loads now have a true dataflow dependency on the bounds-check condition through the cmove: the guarded address is not ready until the cmove executes, and if the index is out of bounds the cmove produces a null address rather than the out-of-bounds one, so the dependent loads can never speculatively read attacker-chosen memory.

The basic intuition behind the Spectre mitigations, and guidance from all the CPU vendors, is that processors can speculate control flow and execute instructions in arbitrary order (and this is fundamental to the 10x perf per clock cycle we've gotten in hardware in the last few decades), but they won't speculate dataflow. (Value speculation is a thing in the uarch research literature but AFAIK no one has ever shipped it; and now that Spectre is a thing, likely no one will?) This is why the cmove thing works. Fences also work, because a fence instruction can't enter the OoO window until every older instruction retires (the window is empty). But of course that window flush is quite expensive.

Given that, I think we need to obey the invariant that some cmove based on an OOB condition exists in the computation of every Wasm address we load. It's possible we could do a multi-stage thing: if we access
Ah, the tidbit of intuition I forgot to state explicitly as well, in case it helps: we can think of faulting loads/stores as a kind of "branch" as well, given that we use the same recovery mechanism as for mispredicted branches. So instructions beyond one that must fault may still be executed speculatively, because we either haven't executed the to-fault instruction yet, or we have but our recovery mechanism hasn't yet let us flush the window and redirect to the trap handler. So the "make the load fault" mitigation protects that load, because it won't resolve to a value that lets its dependent instructions run; but it doesn't protect younger instructions, because they can run before or after (of course with architectural effects only committed once earlier instructions resolve successfully -- so never, if an earlier one faults).

Other tidbit of intuition that might help: it's more accurate to think of an OoO CPU as executing every instruction speculatively, rather than thinking of speculation as a thing that happens when the branch predictor meets a branch. The latter is certainly a form of speculation, which we need to produce the linear stream of instructions into the backend. But strictly speaking, every execution of a node in the dataflow graph in the backend, when that node is not the oldest, is speculation that some condition (a trap; a misspeculated memory ordering or aliasing (st-to-ld-fwd) condition; etc.) will not cause a flush. Effects of this speculative early execution are kept in the ROB (or equivalent structures alongside, such as the store buffer) until retirement.
If we add a knob for tuning the size of the null guard page, and both the static offset and the access size fit within that null guard page, then I think we can whittle each of these access sequences down to a single check. Starting from the original input (what we generate today), the steps are sketched below:

1. Pull offsets out of the spectre guard, because they are smaller than the null guard page (or just modify code generation to emit this form directly).
2. Basic constraint-elimination pass makes the remaining bounds-check comparisons identical.
3. "GVN" (via e-graphs) the now-identical `select_spectre_guard`s.
Now we only have a single bounds check and a single `select_spectre_guard` for the whole sequence of accesses.

@cfallin, how does that sound?
I think this looks workable, yep. It's worth defining here precisely what the "replace
Could cranelift-wasm track enough information to fully do this transformation itself, instead of implementing a generic optimization in Cranelift?

I think that because wasm only has reducible control flow, the dominator tree is implied by the control-flow instructions in the input wasm. So I think we can keep track of which linear-memory accesses dominate the current instruction while making a single pass over the input program.

If so, then we could maintain a scoped hash map keyed on the CLIF value of each access's dynamic index. These keys would map to a pair of the maximum offset checked so far and the value produced by that check's `select_spectre_guard`.

That's pretty simple compared to doing it generically in Cranelift, right?
I think it is possible, yes. The idea of fusing this optimization with an existing pass where we have domination "for free" is definitely tempting. One hardship I expect we would encounter is that this runs before the phi-elimination pass, and therefore we would likely leave some optimizations on the table, especially around Wasm locals and joining control flow.
Perhaps? I don't think the passes I outlined above are too complicated; it's mostly just a matter of writing them such that they can be fused with each other and with the egraphs pass (similar to how the alias analysis is written). As long as that is done, then it seems fairly straightforward to me: two passes that each do a single shape of optimization. In contrast, in
I've been digging into Cranelift's and Wasmtime's code quality and Wasm execution throughput when "dynamic memories" with explicit bounds checks are used to implement Wasm linear memories. Here I'm summarizing the progress made thus far and laying out what I believe the next steps to continue this effort should be.
Static memories require large reservations of virtual memory, to the extent that they can become a bottleneck for how many Wasm instances can exist concurrently in a system. Dynamic memories, on the other hand, require very little or even zero virtual memory reservation (depending on configuration) beyond the actually resident pages backing the Wasm linear memory itself. Additionally, when dynamic memories are configured without any guard regions, they incur zero virtual memory-related syscalls on instantiation and teardown (modulo any copy-on-write heap image initialization). These syscalls have been observed to be bottlenecks in concurrent systems due to VMA locks taken in the kernel.
Furthermore, these issues will only compound in the future. Right now, each tenant in a multi-tenant system is a single core Wasm module with a single Wasm linear memory. With the component model, a tenant will instead be a Wasm component that is made up of many core Wasm modules, each with their own linear memories. If the average Wasm component contains five linear memories, then the multi-tenant system can only support one-fifth as many concurrent tenants under the component model, and instantiating and tearing down each tenant's service will take five times as many syscalls as it does today.
So there is a lot of pressure towards implementing Wasm linear memories with dynamic memories. The downside is that dynamic memories currently incur roughly 55% overhead to Wasm execution throughput compared to static memories. That is, a Wasm computation that takes 1 second in a static memory will take about 1.55 seconds in a dynamic memory. This is because dynamic memory accesses must be explicitly bounds checked, rather than caught after the fact via page faults the way out-of-bounds static memory accesses are. The goal of the effort described here is to minimize this overhead.
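For reference, the explicit bounds-check sequence that a dynamic-memory access compiles down to has roughly this shape in CLIF-flavored pseudocode (a sketch with assumed value names, not exact Cranelift output):

```
v2 = uextend.i64 v1                             ; 32-bit Wasm index -> i64
v3 = uadd_overflow_trap v2, v_offset, heap_oob  ; index + static offset
v4 = icmp ugt v3, v_bound                       ; past the heap bound?
v5 = iadd v_base, v3                            ; native address
v6 = select_spectre_guard v4, v_zero, v5        ; null address if OOB
v7 = load.i32 v6                                ; faults (traps) when OOB
```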
I don't intend to continue this work in the immediate future, so I wanted to make sure everything was summarized and documented for posterity, and written down while it is still fresh in my mind. Additionally, I have proposals for new optimization passes we should implement in the future, and this issue can serve as a forum for discussing them.
## Progress Made Thus Far
Before I began this effort, running the `spidermonkey.wasm` benchmark in Sightglass with static memories was 3.06x faster than doing the same with dynamic memories. Now we are down to static memories being only 1.55x faster.

This improvement comes from a number of sources, including better code generation for the `uadd_overflow_trap` and `select_spectre_guard` instructions, both of which are emitted in our bounds-checking sequence and are hot in `spidermonkey.wasm`.

Additionally, I created the `wasmtime explore` subcommand to dig into native code disassemblies and associate each native instruction with the Wasm bytecode it implements (similar to the Godbolt Compiler Explorer). This new tool has already been incredibly helpful for analyzing what room for improvement still exists, and should help future optimization efforts in Cranelift and Wasmtime.

## Recent Sightglass Benchmark Results
Here are some recent Sightglass benchmark results comparing static memories (`static.so`) versus dynamic memories with a small (64KiB) guard region (`dyn-with-guard.so`) versus dynamic memories without a guard region (`dyn-without-guard.so`):

- `static.so` vs `dyn-with-guard.so`
- `static.so` vs `dyn-without-guard.so`
## Comparing Wasmtime and SpiderMonkey
To get a sense of whether our 1.55x slowdown when enabling explicit bounds checks was typical of other engines, I also benchmarked SpiderMonkey (native) running Sightglass's `spidermonkey.wasm` benchmark with virtual memory guard pages vs. with explicit bounds checks.

Because SpiderMonkey's slowdown is almost identical to Wasmtime's, we can infer that there aren't any critical bounds-checking optimizations that SpiderMonkey performs but Wasmtime/Cranelift does not. This is reassuring: it means we can basically ignore other engines (or at least SpiderMonkey) for now when trying to further improve codegen with explicit bounds checks enabled, and focus only on ourselves.
## Proposed Future Work
What fruit remains to be harvested? I've been digging into `spidermonkey.wasm`'s hot functions (as reported by `perf` on Linux) and staring at disassemblies to get an idea of how much room for improvement we still have. In short: a lot!

I found many sequences of back-to-back memory accesses in hot functions, often with the same dynamic index but with their static offsets roughly sorted from largest to smallest.
This is great because it is the next-easiest pattern to optimize for (after sequences of identical accesses, which we can already optimize well).
Even when the accesses weren't roughly sorted from largest to smallest static offset, the offsets were always small enough to be covered by a small guard region. This allows us to bounds check `index > bound` instead of `index + offset > bound`, which lets us GVN these comparisons even when their static offsets aren't in order. After that, all we are left with are redundant `select_spectre_guard`s, which I will describe how we can eliminate in a moment.

For example, I found a back-to-back sequence of nine stores that all have the same dynamic index operand but different static offset immediates, all in the same basic block (sketched below).
Only the first store in this sequence needs to be bounds checked! All the following stores' bounds checks are implied by the first check, but we emit the bounds check for every single one of them.
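The shape is roughly the following, in CLIF-flavored pseudocode (an illustrative sketch with assumed value names and offsets; the real sequence is nine stores from `spidermonkey.wasm`'s disassembly):

```
;; Store #1, offset 32 -- this check is the only one actually needed:
v2 = iadd_imm v1, 32
v3 = icmp ugt v2, v_bound
v4 = iadd v_base, v2
v5 = select_spectre_guard v3, v_zero, v4
store v_val1, v5

;; Store #2, offset 24 -- implied by store #1's check, but emitted anyway:
v6 = iadd_imm v1, 24
v7 = icmp ugt v6, v_bound
v8 = iadd v_base, v6
v9 = select_spectre_guard v7, v_zero, v8
store v_val2, v9

;; ...and so on for the remaining seven stores.
```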
Based on this investigation, I suggest we implement two new optimization passes: a `select_spectre_guard`-elimination pass, and a basic constraint-elimination pass.

### `select_spectre_guard`-Elimination Pass

We should develop an optimization pass to remove redundant `select_spectre_guard` instructions.

This optimization pass would walk the dominator tree and maintain the set of `select_spectre_guard` conditions in scope that we've already tested and whose results have flowed into an instruction that will raise a trap if the `select_spectre_guard` failed. That is, maintain the set of `x`s for all sequences like `load(select_spectre_guard(x, 0, _))` that dominate the current location in our traversal.

Then, whenever we see a new `select_spectre_guard(x, 0, y)`, we can rewrite it to simply `y`, since the dominating `load(select_spectre_guard(x, 0, _))` would have raised a trap already if `x` was truthy; so control flow reaching this location means that `x` must not be truthy, and therefore `select_spectre_guard(x, _, y)` will always evaluate to `y`.

This does not break the speculative sandboxing function of the optimized-away `select_spectre_guard`s, because there will always still be at least one dominating `select_spectre_guard` for the same condition.

This optimization pass should integrate with the e-graphs framework in the same way that our alias analysis optimization pass does. It should expose a "push"-based interface, where blocks and instructions are fed into the analysis by an external traversal, rather than having the pass walk the IR itself. This will allow us to fuse the pass with existing IR traversals.

Creating this new optimization pass is not super easy, so if we wanted to get an idea of what kind of speedups it would give, we can implement the much easier #6055, which would give us the non-Spectre equivalent of this optimization pass. The amount that implementing #6055 speeds up benchmarks in an environment configured with Spectre mitigations disabled is roughly what we could expect from this new optimization pass with Spectre mitigations enabled.
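For concreteness, the rewrite has roughly this before/after shape (a CLIF-flavored sketch; names and exact instructions are assumed for illustration):

```
;; Dominating guarded access:
v4 = select_spectre_guard v3, v_zero, v_addr1  ; v3 = OOB condition
v5 = load.i64 v4                               ; traps if v3 was truthy

;; ...later, dominated by the access above:
v8 = select_spectre_guard v3, v_zero, v_addr2  ; same condition v3
v9 = load.i64 v8

;; Control reached the second guard, so the first load did not trap and
;; v3 must be false; the second guard rewrites to its in-bounds input:
;;   v9 = load.i64 v_addr2
```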
### A Potential Variation

This analysis is slightly subtle in that it requires chaining a few things together (loads/stores that consume a `select_spectre_guard` that then has a condition `x`). In general, the fewer links in the chain, the simpler the analysis's implementation will be and the more confidence we can have in its correctness. To that end, we could consider having `spectre_load(oob_condition, addr)` and `spectre_store(oob_condition, addr, value)` instructions that are morally equivalent to `load(select_spectre_guard(oob_condition, 0, addr))` and similar for stores, but which do not get legalized in the mid-end and instead go all the way to our backends. By fusing the instructions, we remove one link in the proof chain that the analysis has to create, making it simpler and more likely to be correct.

The downside is that we don't only use `select_spectre_guard` with plain `load` and `store` CLIF instructions; there are `{u,s}{load,store}{8,16,32}` instructions, which would all either need Spectre-fused variants or have to continue using the old `select_spectre_guard` (and presumably not reap the benefits of this simpler analysis), SIMD load instructions, atomic access instructions, etc. I've filed #6056 to help consolidate some of these instructions, which would slightly alleviate this downside, but it would not be fully addressed. Because of that, I'm thinking that this approach with Spectre-fused instructions is probably not worth the effort.

### Basic Constraint-Elimination Pass
We should develop a very basic constraint-elimination optimization pass targeted at bounds checks for 32-bit Wasm memories on 64-bit hosts.
This would not be a fully general constraint-elimination pass, like what LLVM has recently added. Instead, it would focus only on comparisons of roughly the form `index + constant > heap_bound`.

Consider an example with two such checks, using constants `X` and `Y`. When `X > Y`, then the first check (`v3`) implies the second (`v6`), and we can rewrite the example so that the second comparison reuses the first.

The catch is that this is only correct if `index + X` does not overflow. Otherwise, it is possible that `index + X` is "logically out of bounds" but, because it overflowed and wrapped around, ended up back in bounds again. In this scenario, `Y` might not be large enough for `index + Y` to overflow, and `index + Y` could remain out of bounds.

We can avoid this pitfall by limiting the pass's scope to optimizing bounds checks for only 32-bit memories on 64-bit hosts. For these 32-bit bounds checks, the constant `X` fits in an `i32` -- despite being represented as an `i64` -- and the index is a 32-bit value zero-extended to 64 bits, so we are guaranteed that `index + X` will not overflow. As long as the proposed optimization pass checks this constraint on `X` when matching that code pattern, it will be sound.

This optimization pass would precede the `select_spectre_guard`-elimination pass described previously, since it deduplicates the bounds checks that feed into `select_spectre_guard`s, which that pass would then be able to completely remove.

For example, consider input CLIF for two back-to-back loads with offsets `X` and `Y`. After running this proposed constraint-elimination pass, we learn that `v2` implies `v7` because `X > Y`, and so we can rewrite uses of `v7` with `v2`. Finally, after running the previously described `select_spectre_guard`-elimination pass, we can completely remove the second `select_spectre_guard` bounds check, since it is identical to and dominated by the first. This final code is the optimal implementation of the original input; the sketch below walks through these steps.
This optimization pass should also integrate with the e-graphs framework, in the same way that the previously described optimization pass would and the way that our alias analysis optimization pass does.
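Putting the two passes together on two back-to-back loads, in CLIF-flavored pseudocode (an illustrative sketch with assumed value names and constants `X = 16`, `Y = 8`; the comparison results below play the roles of `v2` and `v7` from the text):

```
;; First load, offset X = 16:
v2 = iadd_imm v1, 16                    ; index + X; cannot overflow in i64
v3 = icmp ugt v2, v_bound               ; first bounds check
v4 = iadd v_base, v2
v5 = select_spectre_guard v3, v_zero, v4
v6 = load.i64 v5

;; Second load, offset Y = 8:
v7 = iadd_imm v1, 8                     ; index + Y
v8 = icmp ugt v7, v_bound               ; second bounds check
v9 = iadd v_base, v7
v10 = select_spectre_guard v8, v_zero, v9
v11 = load.i64 v10

;; Constraint elimination: X > Y and no overflow, so the first check
;; implies the second; replace uses of v8 with v3.
;; Guard elimination: `select_spectre_guard v3, v_zero, v9` is now
;; dominated by the guarded load v6, so the second load becomes simply:
;;   v11 = load.i64 v9
;; Only one compare and one select_spectre_guard remain.
```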
To get an idea of the speedup this optimization can give dynamic memories without any guard pages, we can look at the speedup that #6055 gives dynamic memories with guard pages when Spectre mitigations are disabled. This is at least a 1.08x speedup, as that is what we've seen from deduplicating bounds-checking comparisons (but not the `select_spectre_guard`s) in #6031 -- but probably closer to at least 1.15x when paired with removing no-longer-necessary `select_spectre_guard`s, if I were to take a wild guess.

### Optimizing Memory Accesses with Distinct Dynamic Indices
The examples we've considered thus far have a series of memory accesses that use the same dynamic index but different static offsets; for example, a run of loads that all use the same `local.get 4` as their dynamic index. We haven't considered the case where the memory accesses have different dynamic indices.
There are two primary reasons for this.

First, accesses with different dynamic indices are much harder to optimize. Determining any kind of relationship between the different indices is hard. Sometimes the second access's index is loaded by the first access (e.g. `load(load(...))`), which will pretty much defeat any static analysis we throw at it. Even if the indices are each of the form `base + a` and `base + b`, and we know that `a > b`, we have to consider whether overflow occurs in `base + a` (see the discussion about overflow in the basic constraint-elimination pass section above). Developing a pass to optimize these cases will be difficult, complicated, and prone to unsoundness.

Second, most of the sequences of back-to-back memory accesses I found in `spidermonkey.wasm`'s hot functions had the same dynamic index. This is great news, since those are also the cases we can optimize relatively easily.

To conclude, I don't think it is worth the effort to try to optimize memory access sequences where different dynamic indices are accessed.