diff --git a/analyze.json b/analyze.json index 3e29ae5..4bc9da1 100644 --- a/analyze.json +++ b/analyze.json @@ -1,5 +1,5 @@ { - "lastUpdate": 1710160575440, + "lastUpdate": 1710360181784, "repoUrl": "https://github.com/luau-lang/luau", "entries": { "luau-analyze": [ @@ -11416,6 +11416,72 @@ "extra": "luau-analyze" } ] + }, + { + "commit": { + "author": { + "email": "arseny.kapoulkine@gmail.com", + "name": "Arseny Kapoulkine", + "username": "zeux" + }, + "committer": { + "email": "noreply@github.com", + "name": "GitHub", + "username": "web-flow" + }, + "distinct": true, + "id": "9aa82c6fb90e1dcd6e7f60626255d597ef0fdea1", + "message": "CodeGen: Improve lowering of NUM_TO_VEC on A64 for constants (#1194)\n\nWhen the input is a constant, we use a fairly inefficient sequence of\r\nfmov+fcvt+dup or, when the double isn't encodable in fmov,\r\nadr+ldr+fcvt+dup.\r\n\r\nInstead, we can use the same lowering as X64 when the input is a\r\nconstant, and load the vector from memory. However, if the constant is\r\nencodable via fmov, we can use a vector fmov instead (which is just one\r\ninstruction and doesn't need constant space).\r\n\r\nFortunately the bit encoding of fmov for 32-bit floating point numbers\r\nmatches that of 64-bit: the decoding algorithm is a little different\r\nbecause it expands into a larger exponent, but the values are\r\ncompatible, so if a double can be encoded into a scalar fmov with a\r\ngiven abcdefgh pattern, the same pattern should encode the same float;\r\ndue to the very limited number of mantissa and exponent bits, all values\r\nthat are encodable are also exact in both 32-bit and 64-bit floats.\r\n\r\nThis strategy is ~same as what gcc uses. For complex vectors, we\r\npreviously used 4 instructions and 8 bytes of constant storage, and now\r\nwe use 2 instructions and 16 bytes of constant storage, so the memory\r\nfootprint is the same; for simple vectors we just need 1 instruction (4\r\nbytes).\r\n\r\nclang lowers vector constants a little differently, opting to synthesize\r\na 64-bit integer using 4 instructions (mov/movk) and then move it to the\r\nvector register - this requires 5 instructions and 20 bytes, vs ours/gcc\r\n2 instructions and 8+16=24 bytes. I tried a simpler version of this that\r\nwould be more compact - synthesize a 32-bit integer constant with\r\nmov+movk, and move it to vector register via dup.4s - but this was a\r\nlittle slower on M2, so for now we prefer the slightly larger version as\r\nit's not a regression vs current implementation.\r\n\r\nOn the vector approximation benchmark we get:\r\n\r\n- Before this PR (flag=false): ~7.85 ns/op\r\n- After this PR (flag=true): ~7.74 ns/op\r\n- After this PR, with 0.125 instead of 0.123 in the benchmark code (to\r\nuse fmov): ~7.52 ns/op\r\n- Not part of this PR, but the mov/dup strategy described above: ~8.00\r\nns/op", + "timestamp": "2024-03-13T12:56:11-07:00", + "tree_id": "b46afdd603a2f3bd60b9cac918c2ddc0faf0d668", + "url": "https://github.com/luau-lang/luau/commit/9aa82c6fb90e1dcd6e7f60626255d597ef0fdea1" + }, + "date": 1710360181780, + "tool": "benchmarkluau", + "benches": [ + { + "name": "map-nonstrict", + "value": 4.78128, + "unit": "4ms", + "range": "±0%", + "extra": "luau-analyze" + }, + { + "name": "map-strict", + "value": 5.84051, + "unit": "5ms", + "range": "±0%", + "extra": "luau-analyze" + }, + { + "name": "map-dcr", + "value": 51.0637, + "unit": "ms", + "range": "±0%", + "extra": "luau-analyze" + }, + { + "name": "regex-nonstrict", + "value": 7.7506, + "unit": "7ms", + "range": "±0%", + "extra": "luau-analyze" + }, + { + "name": "regex-strict", + "value": 9.96327, + "unit": "9ms", + "range": "±0%", + "extra": "luau-analyze" + }, + { + "name": "regex-dcr", + "value": 115.89, + "unit": "ms", + "range": "±0%", + "extra": "luau-analyze" + } + ] } ] } diff --git a/bench-codegen.json b/bench-codegen.json index 47a9523..fb6cb5f 100644 --- a/bench-codegen.json +++ b/bench-codegen.json @@ -1,5 +1,5 @@ { - "lastUpdate": 1710160575122, + "lastUpdate": 1710360181462, "repoUrl": "https://github.com/luau-lang/luau", "entries": { "callgrind codegen": [ @@ -29668,6 +29668,254 @@ "extra": "luau-codegen" } ] + }, + { + "commit": { + "author": { + "email": "arseny.kapoulkine@gmail.com", + "name": "Arseny Kapoulkine", + "username": "zeux" + }, + "committer": { + "email": "noreply@github.com", + "name": "GitHub", + "username": "web-flow" + }, + "distinct": true, + "id": "9aa82c6fb90e1dcd6e7f60626255d597ef0fdea1", + "message": "CodeGen: Improve lowering of NUM_TO_VEC on A64 for constants (#1194)\n\nWhen the input is a constant, we use a fairly inefficient sequence of\r\nfmov+fcvt+dup or, when the double isn't encodable in fmov,\r\nadr+ldr+fcvt+dup.\r\n\r\nInstead, we can use the same lowering as X64 when the input is a\r\nconstant, and load the vector from memory. However, if the constant is\r\nencodable via fmov, we can use a vector fmov instead (which is just one\r\ninstruction and doesn't need constant space).\r\n\r\nFortunately the bit encoding of fmov for 32-bit floating point numbers\r\nmatches that of 64-bit: the decoding algorithm is a little different\r\nbecause it expands into a larger exponent, but the values are\r\ncompatible, so if a double can be encoded into a scalar fmov with a\r\ngiven abcdefgh pattern, the same pattern should encode the same float;\r\ndue to the very limited number of mantissa and exponent bits, all values\r\nthat are encodable are also exact in both 32-bit and 64-bit floats.\r\n\r\nThis strategy is ~same as what gcc uses. For complex vectors, we\r\npreviously used 4 instructions and 8 bytes of constant storage, and now\r\nwe use 2 instructions and 16 bytes of constant storage, so the memory\r\nfootprint is the same; for simple vectors we just need 1 instruction (4\r\nbytes).\r\n\r\nclang lowers vector constants a little differently, opting to synthesize\r\na 64-bit integer using 4 instructions (mov/movk) and then move it to the\r\nvector register - this requires 5 instructions and 20 bytes, vs ours/gcc\r\n2 instructions and 8+16=24 bytes. I tried a simpler version of this that\r\nwould be more compact - synthesize a 32-bit integer constant with\r\nmov+movk, and move it to vector register via dup.4s - but this was a\r\nlittle slower on M2, so for now we prefer the slightly larger version as\r\nit's not a regression vs current implementation.\r\n\r\nOn the vector approximation benchmark we get:\r\n\r\n- Before this PR (flag=false): ~7.85 ns/op\r\n- After this PR (flag=true): ~7.74 ns/op\r\n- After this PR, with 0.125 instead of 0.123 in the benchmark code (to\r\nuse fmov): ~7.52 ns/op\r\n- Not part of this PR, but the mov/dup strategy described above: ~8.00\r\nns/op", + "timestamp": "2024-03-13T12:56:11-07:00", + "tree_id": "b46afdd603a2f3bd60b9cac918c2ddc0faf0d668", + "url": "https://github.com/luau-lang/luau/commit/9aa82c6fb90e1dcd6e7f60626255d597ef0fdea1" + }, + "date": 1710360181456, + "tool": "benchmarkluau", + "benches": [ + { + "name": "base64", + "value": 13.385, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "chess", + "value": 52.018, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "life", + "value": 23.356, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "matrixmult", + "value": 9.336, + "unit": "9ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "mesh-normal-scalar", + "value": 13, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "pcmmix", + "value": 1.38, + "unit": "1ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "qsort", + "value": 41.508, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "sha256", + "value": 4.525, + "unit": "4ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "ack", + "value": 40.021, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "binary-trees", + "value": 20.853, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "fannkuchen-redux", + "value": 3.878, + "unit": "3ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "fixpoint-fact", + "value": 49.032, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "heapsort", + "value": 7.701, + "unit": "7ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "mandel", + "value": 40.471, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "n-body", + "value": 9.707, + "unit": "9ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "qt", + "value": 24.955, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "queen", + "value": 0.805, + "unit": "0ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "scimark", + "value": 24.643, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "spectral-norm", + "value": 2.444, + "unit": "2ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "sieve", + "value": 82.952, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "3d-cube", + "value": 3.736, + "unit": "3ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "3d-morph", + "value": 3.744, + "unit": "3ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "3d-raytrace", + "value": 3.304, + "unit": "3ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "controlflow-recursive", + "value": 3.463, + "unit": "3ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "crypto-aes", + "value": 7.228, + "unit": "7ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "fannkuch", + "value": 6.068, + "unit": "6ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "math-cordic", + "value": 3.768, + "unit": "3ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "math-partial-sums", + "value": 1.872, + "unit": "1ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "n-body-oop", + "value": 13.714, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "tictactoe", + "value": 62.961, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "trig", + "value": 6.618, + "unit": "6ms", + "range": "±0.000%", + "extra": "luau-codegen" + }, + { + "name": "voxelgen", + "value": 27.559, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-codegen" + } + ] } ] } diff --git a/bench-gcc.json b/bench-gcc.json index ffae9eb..05e53d8 100644 --- a/bench-gcc.json +++ b/bench-gcc.json @@ -1,5 +1,5 @@ { - "lastUpdate": 1710160575278, + "lastUpdate": 1710360181622, "repoUrl": "https://github.com/luau-lang/luau", "entries": { "callgrind gcc": [ @@ -45037,6 +45037,254 @@ "extra": "luau-gcc" } ] + }, + { + "commit": { + "author": { + "email": "arseny.kapoulkine@gmail.com", + "name": "Arseny Kapoulkine", + "username": "zeux" + }, + "committer": { + "email": "noreply@github.com", + "name": "GitHub", + "username": "web-flow" + }, + "distinct": true, + "id": "9aa82c6fb90e1dcd6e7f60626255d597ef0fdea1", + "message": "CodeGen: Improve lowering of NUM_TO_VEC on A64 for constants (#1194)\n\nWhen the input is a constant, we use a fairly inefficient sequence of\r\nfmov+fcvt+dup or, when the double isn't encodable in fmov,\r\nadr+ldr+fcvt+dup.\r\n\r\nInstead, we can use the same lowering as X64 when the input is a\r\nconstant, and load the vector from memory. However, if the constant is\r\nencodable via fmov, we can use a vector fmov instead (which is just one\r\ninstruction and doesn't need constant space).\r\n\r\nFortunately the bit encoding of fmov for 32-bit floating point numbers\r\nmatches that of 64-bit: the decoding algorithm is a little different\r\nbecause it expands into a larger exponent, but the values are\r\ncompatible, so if a double can be encoded into a scalar fmov with a\r\ngiven abcdefgh pattern, the same pattern should encode the same float;\r\ndue to the very limited number of mantissa and exponent bits, all values\r\nthat are encodable are also exact in both 32-bit and 64-bit floats.\r\n\r\nThis strategy is ~same as what gcc uses. For complex vectors, we\r\npreviously used 4 instructions and 8 bytes of constant storage, and now\r\nwe use 2 instructions and 16 bytes of constant storage, so the memory\r\nfootprint is the same; for simple vectors we just need 1 instruction (4\r\nbytes).\r\n\r\nclang lowers vector constants a little differently, opting to synthesize\r\na 64-bit integer using 4 instructions (mov/movk) and then move it to the\r\nvector register - this requires 5 instructions and 20 bytes, vs ours/gcc\r\n2 instructions and 8+16=24 bytes. I tried a simpler version of this that\r\nwould be more compact - synthesize a 32-bit integer constant with\r\nmov+movk, and move it to vector register via dup.4s - but this was a\r\nlittle slower on M2, so for now we prefer the slightly larger version as\r\nit's not a regression vs current implementation.\r\n\r\nOn the vector approximation benchmark we get:\r\n\r\n- Before this PR (flag=false): ~7.85 ns/op\r\n- After this PR (flag=true): ~7.74 ns/op\r\n- After this PR, with 0.125 instead of 0.123 in the benchmark code (to\r\nuse fmov): ~7.52 ns/op\r\n- Not part of this PR, but the mov/dup strategy described above: ~8.00\r\nns/op", + "timestamp": "2024-03-13T12:56:11-07:00", + "tree_id": "b46afdd603a2f3bd60b9cac918c2ddc0faf0d668", + "url": "https://github.com/luau-lang/luau/commit/9aa82c6fb90e1dcd6e7f60626255d597ef0fdea1" + }, + "date": 1710360181614, + "tool": "benchmarkluau", + "benches": [ + { + "name": "base64", + "value": 25.282, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "chess", + "value": 82.298, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "life", + "value": 89.766, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "matrixmult", + "value": 24.643, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "mesh-normal-scalar", + "value": 32.351, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "pcmmix", + "value": 10.52, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "qsort", + "value": 77.324, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "sha256", + "value": 26.047, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "ack", + "value": 65.366, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "binary-trees", + "value": 29.982, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "fannkuchen-redux", + "value": 12.877, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "fixpoint-fact", + "value": 60.356, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "heapsort", + "value": 22.218, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "mandel", + "value": 64.004, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "n-body", + "value": 37.759, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "qt", + "value": 60.758, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "queen", + "value": 1.968, + "unit": "1ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "scimark", + "value": 89.708, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "spectral-norm", + "value": 12.429, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "sieve", + "value": 105.215, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "3d-cube", + "value": 8.671, + "unit": "8ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "3d-morph", + "value": 10.288, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "3d-raytrace", + "value": 10.329, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "controlflow-recursive", + "value": 5.542, + "unit": "5ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "crypto-aes", + "value": 14.714, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "fannkuch", + "value": 26.444, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "math-cordic", + "value": 16.56, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "math-partial-sums", + "value": 5.028, + "unit": "5ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "n-body-oop", + "value": 56.24, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "tictactoe", + "value": 133.878, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "trig", + "value": 25.972, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + }, + { + "name": "voxelgen", + "value": 54.094, + "unit": "ms", + "range": "±0.000%", + "extra": "luau-gcc" + } + ] } ] } diff --git a/bench.json b/bench.json index d019c71..c53cf3a 100644 --- a/bench.json +++ b/bench.json @@ -1,5 +1,5 @@ { - "lastUpdate": 1710160574948, + "lastUpdate": 1710360181302, "repoUrl": "https://github.com/luau-lang/luau", "entries": { "callgrind clang": [ @@ -45037,6 +45037,254 @@ "extra": "luau" } ] + }, + { + "commit": { + "author": { + "email": "arseny.kapoulkine@gmail.com", + "name": "Arseny Kapoulkine", + "username": "zeux" + }, + "committer": { + "email": "noreply@github.com", + "name": "GitHub", + "username": "web-flow" + }, + "distinct": true, + "id": "9aa82c6fb90e1dcd6e7f60626255d597ef0fdea1", + "message": "CodeGen: Improve lowering of NUM_TO_VEC on A64 for constants (#1194)\n\nWhen the input is a constant, we use a fairly inefficient sequence of\r\nfmov+fcvt+dup or, when the double isn't encodable in fmov,\r\nadr+ldr+fcvt+dup.\r\n\r\nInstead, we can use the same lowering as X64 when the input is a\r\nconstant, and load the vector from memory. However, if the constant is\r\nencodable via fmov, we can use a vector fmov instead (which is just one\r\ninstruction and doesn't need constant space).\r\n\r\nFortunately the bit encoding of fmov for 32-bit floating point numbers\r\nmatches that of 64-bit: the decoding algorithm is a little different\r\nbecause it expands into a larger exponent, but the values are\r\ncompatible, so if a double can be encoded into a scalar fmov with a\r\ngiven abcdefgh pattern, the same pattern should encode the same float;\r\ndue to the very limited number of mantissa and exponent bits, all values\r\nthat are encodable are also exact in both 32-bit and 64-bit floats.\r\n\r\nThis strategy is ~same as what gcc uses. For complex vectors, we\r\npreviously used 4 instructions and 8 bytes of constant storage, and now\r\nwe use 2 instructions and 16 bytes of constant storage, so the memory\r\nfootprint is the same; for simple vectors we just need 1 instruction (4\r\nbytes).\r\n\r\nclang lowers vector constants a little differently, opting to synthesize\r\na 64-bit integer using 4 instructions (mov/movk) and then move it to the\r\nvector register - this requires 5 instructions and 20 bytes, vs ours/gcc\r\n2 instructions and 8+16=24 bytes. I tried a simpler version of this that\r\nwould be more compact - synthesize a 32-bit integer constant with\r\nmov+movk, and move it to vector register via dup.4s - but this was a\r\nlittle slower on M2, so for now we prefer the slightly larger version as\r\nit's not a regression vs current implementation.\r\n\r\nOn the vector approximation benchmark we get:\r\n\r\n- Before this PR (flag=false): ~7.85 ns/op\r\n- After this PR (flag=true): ~7.74 ns/op\r\n- After this PR, with 0.125 instead of 0.123 in the benchmark code (to\r\nuse fmov): ~7.52 ns/op\r\n- Not part of this PR, but the mov/dup strategy described above: ~8.00\r\nns/op", + "timestamp": "2024-03-13T12:56:11-07:00", + "tree_id": "b46afdd603a2f3bd60b9cac918c2ddc0faf0d668", + "url": "https://github.com/luau-lang/luau/commit/9aa82c6fb90e1dcd6e7f60626255d597ef0fdea1" + }, + "date": 1710360181294, + "tool": "benchmarkluau", + "benches": [ + { + "name": "base64", + "value": 23.603, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "chess", + "value": 80.432, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "life", + "value": 83.424, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "matrixmult", + "value": 23.433, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "mesh-normal-scalar", + "value": 30.546, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "pcmmix", + "value": 9.52, + "unit": "9ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "qsort", + "value": 75.358, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "sha256", + "value": 24.297, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "ack", + "value": 65.92, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "binary-trees", + "value": 28.875, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "fannkuchen-redux", + "value": 11.807, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "fixpoint-fact", + "value": 62.002, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "heapsort", + "value": 20.659, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "mandel", + "value": 61.474, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "n-body", + "value": 36.475, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "qt", + "value": 56.361, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "queen", + "value": 1.887, + "unit": "1ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "scimark", + "value": 84.626, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "spectral-norm", + "value": 11.345, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "sieve", + "value": 100.509, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "3d-cube", + "value": 7.987, + "unit": "7ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "3d-morph", + "value": 9.619, + "unit": "9ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "3d-raytrace", + "value": 9.736, + "unit": "9ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "controlflow-recursive", + "value": 5.623, + "unit": "5ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "crypto-aes", + "value": 13.498, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "fannkuch", + "value": 24.127, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "math-cordic", + "value": 15.263, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "math-partial-sums", + "value": 4.698, + "unit": "4ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "n-body-oop", + "value": 54.689, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "tictactoe", + "value": 122.892, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "trig", + "value": 23.916, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + }, + { + "name": "voxelgen", + "value": 49.603, + "unit": "ms", + "range": "±0.000%", + "extra": "luau" + } + ] } ] } diff --git a/compile.json b/compile.json index 0cba8b8..fd3a450 100644 --- a/compile.json +++ b/compile.json @@ -1,5 +1,5 @@ { - "lastUpdate": 1710160575580, + "lastUpdate": 1710360181934, "repoUrl": "https://github.com/luau-lang/luau", "entries": { "luau-compile": [ @@ -14618,6 +14618,86 @@ "extra": "luau-compile" } ] + }, + { + "commit": { + "author": { + "email": "arseny.kapoulkine@gmail.com", + "name": "Arseny Kapoulkine", + "username": "zeux" + }, + "committer": { + "email": "noreply@github.com", + "name": "GitHub", + "username": "web-flow" + }, + "distinct": true, + "id": "9aa82c6fb90e1dcd6e7f60626255d597ef0fdea1", + "message": "CodeGen: Improve lowering of NUM_TO_VEC on A64 for constants (#1194)\n\nWhen the input is a constant, we use a fairly inefficient sequence of\r\nfmov+fcvt+dup or, when the double isn't encodable in fmov,\r\nadr+ldr+fcvt+dup.\r\n\r\nInstead, we can use the same lowering as X64 when the input is a\r\nconstant, and load the vector from memory. However, if the constant is\r\nencodable via fmov, we can use a vector fmov instead (which is just one\r\ninstruction and doesn't need constant space).\r\n\r\nFortunately the bit encoding of fmov for 32-bit floating point numbers\r\nmatches that of 64-bit: the decoding algorithm is a little different\r\nbecause it expands into a larger exponent, but the values are\r\ncompatible, so if a double can be encoded into a scalar fmov with a\r\ngiven abcdefgh pattern, the same pattern should encode the same float;\r\ndue to the very limited number of mantissa and exponent bits, all values\r\nthat are encodable are also exact in both 32-bit and 64-bit floats.\r\n\r\nThis strategy is ~same as what gcc uses. For complex vectors, we\r\npreviously used 4 instructions and 8 bytes of constant storage, and now\r\nwe use 2 instructions and 16 bytes of constant storage, so the memory\r\nfootprint is the same; for simple vectors we just need 1 instruction (4\r\nbytes).\r\n\r\nclang lowers vector constants a little differently, opting to synthesize\r\na 64-bit integer using 4 instructions (mov/movk) and then move it to the\r\nvector register - this requires 5 instructions and 20 bytes, vs ours/gcc\r\n2 instructions and 8+16=24 bytes. I tried a simpler version of this that\r\nwould be more compact - synthesize a 32-bit integer constant with\r\nmov+movk, and move it to vector register via dup.4s - but this was a\r\nlittle slower on M2, so for now we prefer the slightly larger version as\r\nit's not a regression vs current implementation.\r\n\r\nOn the vector approximation benchmark we get:\r\n\r\n- Before this PR (flag=false): ~7.85 ns/op\r\n- After this PR (flag=true): ~7.74 ns/op\r\n- After this PR, with 0.125 instead of 0.123 in the benchmark code (to\r\nuse fmov): ~7.52 ns/op\r\n- Not part of this PR, but the mov/dup strategy described above: ~8.00\r\nns/op", + "timestamp": "2024-03-13T12:56:11-07:00", + "tree_id": "b46afdd603a2f3bd60b9cac918c2ddc0faf0d668", + "url": "https://github.com/luau-lang/luau/commit/9aa82c6fb90e1dcd6e7f60626255d597ef0fdea1" + }, + "date": 1710360181930, + "tool": "benchmarkluau", + "benches": [ + { + "name": "map-O0", + "value": 0.680319, + "unit": "0ms", + "range": "±0%", + "extra": "luau-compile" + }, + { + "name": "map-O1", + "value": 0.746034, + "unit": "0ms", + "range": "±0%", + "extra": "luau-compile" + }, + { + "name": "map-O2", + "value": 0.818594, + "unit": "0ms", + "range": "±0%", + "extra": "luau-compile" + }, + { + "name": "map-O2-codegen", + "value": 3.27791, + "unit": "3ms", + "range": "±0%", + "extra": "luau-compile" + }, + { + "name": "regex-O0", + "value": 1.59274, + "unit": "1ms", + "range": "±0%", + "extra": "luau-compile" + }, + { + "name": "regex-O1", + "value": 1.76917, + "unit": "1ms", + "range": "±0%", + "extra": "luau-compile" + }, + { + "name": "regex-O2", + "value": 2.01794, + "unit": "2ms", + "range": "±0%", + "extra": "luau-compile" + }, + { + "name": "regex-O2-codegen", + "value": 10.0452, + "unit": "ms", + "range": "±0%", + "extra": "luau-compile" + } + ] } ] }