diff --git a/dev/.documenter-siteinfo.json b/dev/.documenter-siteinfo.json index d3bdb004..01ff5f93 100644 --- a/dev/.documenter-siteinfo.json +++ b/dev/.documenter-siteinfo.json @@ -1 +1 @@ -{"documenter":{"julia_version":"1.10.7","generation_timestamp":"2025-01-01T23:14:20","documenter_version":"1.8.0"}} \ No newline at end of file +{"documenter":{"julia_version":"1.10.7","generation_timestamp":"2025-01-06T00:32:50","documenter_version":"1.8.0"}} \ No newline at end of file diff --git a/dev/api/index.html b/dev/api/index.html index d373e85c..c7235b39 100644 --- a/dev/api/index.html +++ b/dev/api/index.html @@ -3,4 +3,4 @@ function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash}); -

AMDGPU API Reference

Indexing

AMDGPU.Device.gridItemDimFunction
gridItemDim()::ROCDim3

Returns the size of the grid in workitems. This behaviour is different from CUDA where gridDim gives the size of the grid in blocks.

source

Use these functions for compatibility with CUDA.jl.

Synchronization

AMDGPU.Device.sync_workgroup_countFunction
sync_workgroup_count(predicate::Cint)::Cint

Identical to sync_workgroup, with the additional feature that it evaluates the predicate for all workitems in the workgroup and returns the number of workitems for which predicate evaluates to non-zero.

source
AMDGPU.Device.sync_workgroup_andFunction
sync_workgroup_and(predicate::Cint)::Cint

Identical to sync_workgroup, with the additional feature that it evaluates the predicate for all workitems in the workgroup and returns non-zero if and only if predicate evaluates to non-zero for all of them.

source
AMDGPU.Device.sync_workgroup_orFunction
sync_workgroup_or(predicate::Cint)::Cint

Identical to sync_workgroup, with the additional feature that it evaluates the predicate for all workitems in the workgroup and returns non-zero if and only if predicate evaluates to non-zero for any of them.

source
+

AMDGPU API Reference

Indexing

AMDGPU.Device.gridItemDimFunction
gridItemDim()::ROCDim3

Returns the size of the grid in workitems. This behaviour is different from CUDA where gridDim gives the size of the grid in blocks.

source

Use these functions for compatibility with CUDA.jl.

Synchronization

AMDGPU.Device.sync_workgroup_countFunction
sync_workgroup_count(predicate::Cint)::Cint

Identical to sync_workgroup, with the additional feature that it evaluates the predicate for all workitems in the workgroup and returns the number of workitems for which predicate evaluates to non-zero.

source
AMDGPU.Device.sync_workgroup_andFunction
sync_workgroup_and(predicate::Cint)::Cint

Identical to sync_workgroup, with the additional feature that it evaluates the predicate for all workitems in the workgroup and returns non-zero if and only if predicate evaluates to non-zero for all of them.

source
AMDGPU.Device.sync_workgroup_orFunction
sync_workgroup_or(predicate::Cint)::Cint

Identical to sync_workgroup, with the additional feature that it evaluates the predicate for all workitems in the workgroup and returns non-zero if and only if predicate evaluates to non-zero for any of them.

source
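A minimal sketch of counting, within one workgroup, how many workitems satisfy a predicate with sync_workgroup_count (the kernel and array names are illustrative):

using AMDGPU

function count_even!(out)
    i = workitemIdx().x                                   # 1-based workitem index
    n = AMDGPU.Device.sync_workgroup_count(Cint(i % 2 == 0))
    i == 1 && (out[1] = n)                                # one workitem records the count
    return
end

out = ROCArray(zeros(Int32, 1))
@roc groupsize=32 count_even!(out)
AMDGPU.synchronize()
@assert Array(out)[1] == 16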
diff --git a/dev/caching_allocator/index.html b/dev/caching_allocator/index.html deleted file mode 100644 index 271bbdd3..00000000 --- a/dev/caching_allocator/index.html +++ /dev/null @@ -1,16 +0,0 @@ - -Caching Memory Allocator · AMDGPU.jl

Caching Memory Allocator

Julia uses Garbage-Collection (GC) for automatic memory management. However, it does not know about other memory spaces, so it sees no difference between a 1 KiB GPU allocation and a 1 GiB one and may not free either in time.

This leads to situations where all of the GPU memory is used, even though your algorithm only requires a fraction of it.

The current mechanism for dealing with OOM (Out-Of-Memory) errors during allocation is to manually trigger GC and retry the allocation, doing this in several rounds, each more aggressive than the previous.

However, manually triggering GC is very expensive, since it requires scanning all Julia objects, not just ROCArrays, so only a fraction of the GC time is spent actually freeing GPU memory:

On the image above, the red region is a call to GC and the green region is where the actual GPU memory is being freed.


To help with memory management, we can use a caching memory allocator. It is useful in scenarios where we execute the same function multiple times and have the same memory allocation pattern. One such example is training DL models, where given the model and its parameters we compute the loss, the gradients w.r.t. the loss, and perform an in-place parameter update. In this case, every iteration performs the same operations and memory allocations, and with a caching allocator we can efficiently re-use them without returning the memory back to the OS.

Example

We have a for-loop where each iteration requires 2 GiB of VRAM. We create a caching allocator with the name :loop and pass it a function to execute. The first iteration will allocate, but subsequent ones won't.

using AMDGPU
-
-function main()
-    n = 1024^2 * 256
-    for i in 1:1000
-        AMDGPU.with_caching_allocator(:loop, n) do n
-            sin.(AMDGPU.rand(Float32, n)) # 2 GiB allocation
-            return
-        end
-    end
-end

The reason for marking a region of code in which to re-use the memory, instead of extending it to the whole program, is that we cannot rely on GC to tell us when the memory is no longer used (it is too slow for that), so we mark such a region manually.

You can free all memory held by an allocator by invalidating it using its name with AMDGPU.invalidate_caching_allocator!. If you want some region of code within AMDGPU.with_caching_allocator to execute without relying on the cache, use AMDGPU.with_no_caching, as sketched below.
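A short sketch combining both (it assumes the :loop allocator and n from the example above):

AMDGPU.with_caching_allocator(:loop, n) do n
    y = AMDGPU.rand(Float32, n)      # may be served from the cache
    AMDGPU.with_no_caching() do
        AMDGPU.zeros(Float32, 16)    # bypasses the caching allocator entirely
    end
    return
end
AMDGPU.invalidate_caching_allocator!(:loop) # release all memory held by `:loop`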

                           Without Caching Allocator   With Caching Allocator
VRAM Usage                 (see plot)                  (see plot)
Execution time (seconds)   12.865149                   0.020943

API

AMDGPU.with_caching_allocatorFunction
with_caching_allocator(f, alloc_name::Symbol, args...)

Execute function f with arguments args... using the caching allocator given by its name alloc_name.

All GPU memory allocations will attempt to hit this cache before doing actual allocation (in case of cache miss). After executing f, all "busy" memory within the allocator is marked as free, so it can be re-used with the next call.

Returns

Result of the f function.

source
AMDGPU.with_no_cachingFunction
with_no_caching(f)

Execute function f, but avoid hitting any caching allocator. This is useful to call from within with_caching_allocator, so that the memory is independent of it.

Returns

Result of the f function.

source
diff --git a/dev/devices/index.html b/dev/devices/index.html index 040d0705..db62bca7 100644 --- a/dev/devices/index.html +++ b/dev/devices/index.html @@ -3,8 +3,8 @@ function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash}); -

Devices

In AMDGPU, all GPU devices are auto-detected by the runtime, if they're supported.

AMDGPU maintains a global default device. The default device is relevant for all kernel and GPUArray operations. If one is not specified via @roc or an equivalent interface, then the default device is used for those operations, which affects compilation and kernel launch.

The device bound to the current Julia task is accessible via the AMDGPU.device method. The list of available devices can be queried with the AMDGPU.devices method.

If you have a HIPDevice object, you can also switch the device with AMDGPU.device!. This will switch it only within the task it is called from.

xd1 = AMDGPU.ones(Float32, 16) # On `AMDGPU.device()` device.
+

Devices

In AMDGPU, all GPU devices are auto-detected by the runtime, if they're supported.

AMDGPU maintains a global default device. The default device is relevant for all kernel and GPUArray operations. If one is not specified via @roc or an equivalent interface, then the default device is used for those operations, which affects compilation and kernel launch.

The device bound to the current Julia task is accessible via the AMDGPU.device method. The list of available devices can be queried with the AMDGPU.devices method.

If you have a HIPDevice object, you can also switch the device with AMDGPU.device!. This will switch it only within the task it is called from.

xd1 = AMDGPU.ones(Float32, 16) # On `AMDGPU.device()` device.
 
 AMDGPU.device!(AMDGPU.devices()[2]) # Switch to second device.
-xd2 = AMDGPU.ones(Float32, 16) # On second device.

Additionally, devices have an associated numeric ID. This value is bounded between 1 and length(AMDGPU.devices()), and device 1 is the default device when AMDGPU is first loaded. The ID of the device associated with the current task can be queried with AMDGPU.device_id and changed with AMDGPU.device_id!.

AMDGPU.deviceFunction
device()::HIPDevice

Get currently active device. This device is used when launching kernels via @roc.

source
device(A::ROCArray) -> HIPDevice

Return the device associated with the array A.

source
AMDGPU.device!Function
device!(device::HIPDevice)

Switch the current device being used. This switches only for the task from which it is called.

source
AMDGPU.device_idFunction
device_id() -> Int
-device_id(device::HIPDevice) -> Int

Returns the numerical device ID for device or for the current AMDGPU.device().

source
AMDGPU.device_id!Function
device_id!(idx::Integer)

Sets the current device to AMDGPU.devices()[idx]. See device_id for details on the numbering semantics.

source

Device Properties

AMDGPU.HIP.propertiesFunction
properties(dev::HIPDevice)::hipDeviceProp_t

Get all properties for the device. See HIP documentation for hipDeviceProp_t for the meaning of each field.

source
+xd2 = AMDGPU.ones(Float32, 16) # On second device.

Additionally, devices have an associated numeric ID. This value is bounded between 1 and length(AMDGPU.devices()), and device 1 is the default device when AMDGPU is first loaded. The ID of the device associated with the current task can be queried with AMDGPU.device_id and changed with AMDGPU.device_id!.
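A short sketch (it assumes at least two visible devices):

AMDGPU.device_id()           # ID of the device bound to the current task
AMDGPU.device_id!(2)         # same effect as AMDGPU.device!(AMDGPU.devices()[2])
x = AMDGPU.ones(Float32, 16) # allocated on device 2
AMDGPU.device_id!(1)         # switch back to the default device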

AMDGPU.deviceFunction
device()::HIPDevice

Get currently active device. This device is used when launching kernels via @roc.

source
device(A::ROCArray) -> HIPDevice

Return the device associated with the array A.

source
AMDGPU.device!Function
device!(device::HIPDevice)

Switch the current device being used. This switches only for the task from which it is called.

source
AMDGPU.device_idFunction
device_id() -> Int
+device_id(device::HIPDevice) -> Int

Returns the numerical device ID for device or for the current AMDGPU.device().

source
AMDGPU.device_id!Function
device_id!(idx::Integer)

Sets the current device to AMDGPU.devices()[idx]. See device_id for details on the numbering semantics.

source

Device Properties

AMDGPU.HIP.propertiesFunction
properties(dev::HIPDevice)::hipDeviceProp_t

Get all properties for the device. See HIP documentation for hipDeviceProp_t for the meaning of each field.

source
diff --git a/dev/exceptions/index.html b/dev/exceptions/index.html index 51cb8a6d..76e09e51 100644 --- a/dev/exceptions/index.html +++ b/dev/exceptions/index.html @@ -3,7 +3,7 @@ function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash}); -

Kernel Exceptions

Just like regular CPU-executed Julia functions, GPU kernels can throw exceptions!

For example, the following kernel will throw an out-of-bounds exception:

julia> function ker!(x)
+

Kernel Exceptions

Just like regular CPU-executed Julia functions, GPU kernels can throw exceptions!

For example, the following kernel will throw an out-of-bounds exception:

julia> function ker!(x)
            x[0] = 1
            return
        end
@@ -25,4 +25,4 @@
  [4] synchronize()
    @ AMDGPU ~/.julia/dev/AMDGPU/src/highlevel.jl:154
  [5] top-level scope
-   @ REPL[5]:1

Kernel-thrown exceptions are thrown during the host synchronization AMDGPU.synchronize or on the next kernel launch.

A kernel that hits an exception will write its information into a pre-allocated host buffer. Once complete, the wavefront throwing the exception will lock the buffer to prevent other wavefronts from overwriting the exception and stop itself, but other wavefronts will continue executing.

+ @ REPL[5]:1

Kernel-thrown exceptions are thrown during the host synchronization AMDGPU.synchronize or on the next kernel launch.

A kernel that hits an exception will write its information into a pre-allocated host buffer. Once complete, the wavefront throwing the exception will lock the buffer to prevent other wavefronts from overwriting the exception and stop itself, but other wavefronts will continue executing.
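Since kernel exceptions surface at synchronization points, a common pattern is to synchronize explicitly at a known place and handle the error there (a sketch reusing ker! and x from the example above):

@roc ker!(x)
try
    AMDGPU.synchronize()
catch err
    @error "GPU kernel failed" exception=(err, catch_backtrace())
end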

diff --git a/dev/hostcall/index.html b/dev/hostcall/index.html index 71968f87..cf90d1fc 100644 --- a/dev/hostcall/index.html +++ b/dev/hostcall/index.html @@ -3,7 +3,7 @@ function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash}); -

Hostcall

Hostcalls provide a means for GPU-CPU communications within running kernels.

AMDGPU.jl provides its own implementation of hostcalls, relying on HSA signals. Currently, hostcalls are used for device-side allocations, printing and exception reporting.

Some of the hostcalls (global hostcalls) are launched automatically if their usage is detected during compilation (e.g. device-side allocations, exception reporting).

Hostcalls require careful usage, since each of them spawns its own Task that must keep running while the kernel executes; there should be no blocking operations on the host during this time.

For example, use non-blocking synchronization with AMDGPU.synchronize(; blocking=false) (which is also the default) instead of blocking synchronization.

To stop hostcalls after synchronization, provide the stop_hostcalls=true keyword argument, otherwise performance might degrade because of constant polling of HSA signals in a loop.

Example

hc = Device.HostCallHolder(Float32, Tuple{Float32}) do x
+

Hostcall

Hostcalls provide a means for GPU-CPU communications within running kernels.

AMDGPU.jl provides its own implementation of hostcalls, relying on HSA signals. Currently, hostcalls are used for device-side allocations, printing and exception reporting.

Some of the hostcalls (global hostcalls) are launched automatically if their usage is detected during compilation (e.g. device-side allocations, exception reporting).

Hostcalls require careful usage, since each of them spawns its own Task that must keep running while the kernel executes; there should be no blocking operations on the host during this time.

For example, use non-blocking synchronization with AMDGPU.synchronize(; blocking=false) (which is also the default) instead of blocking synchronization.

To stop hostcalls after synchronization, provide the stop_hostcalls=true keyword argument, otherwise performance might degrade because of constant polling of HSA signals in a loop.

Example

hc = Device.HostCallHolder(Float32, Tuple{Float32}) do x
     return x + 42f0
 end
 
@@ -17,4 +17,4 @@
 AMDGPU.synchronize(; stop_hostcalls=true) # Stop hostcall.
 AMDGPU.Device.free!(hc) # Free hostcall buffers.
 
-@assert Array(y)[1] ≈ 42f0

In this example, HostCallHolder is used to create and launch HostCall. HostCallHolder contains the HostCall structure itself that is passed to kernel, a task that is spawned on creation and some additional info for controlling the lifetime of the task.

The first argument is the function we want to execute when we call the hostcall. In this case we add 42f0 to the input argument x and return the result.

The second and third arguments are the return type Float32 and the tuple of input argument types Tuple{Float32}.

hostcall! is used to execute the function on the host, wait on the result, and obtain the return values. At the moment, it is performed once per workgroup.

Continuous Host-Call

By default, hostcalls can be used only once. After executing the function on the host, the task finishes and exits.

However, if you need your hostcall to live indefinitely, pass continuous=true keyword argument to HostCallHolder(...; continuous=true).

To then stop the hostcall, call Device.non_continuous!(hc) or Device.finish!(hc) on the HostCallHolder.

The difference between them is that non_continuous! will allow calling hostcall one more time before exiting, while finish! will exit immediately.

finish! can be used on any HostCallHolder to force-exit the running hostcall task.

Free hostcall buffers

For custom hostcalls it is important to call AMDGPU.Device.free! once the kernel has finished, to free the buffers that the hostcall used in the process.

+@assert Array(y)[1] ≈ 42f0

In this example, HostCallHolder is used to create and launch HostCall. HostCallHolder contains the HostCall structure itself that is passed to kernel, a task that is spawned on creation and some additional info for controlling the lifetime of the task.

The first argument is the function we want to execute when we call the hostcall. In this case we add 42f0 to the input argument x and return the result.

The second and third arguments are the return type Float32 and the tuple of input argument types Tuple{Float32}.

hostcall! is used to execute the function on the host, wait on the result, and obtain the return values. At the moment, it is performed once per workgroup.

Continuous Host-Call

By default, hostcalls can be used only once. After executing the function on the host, the task finishes and exits.

However, if you need your hostcall to live indefinitely, pass continuous=true keyword argument to HostCallHolder(...; continuous=true).

To then stop the hostcall, call Device.non_continuous!(hc) or Device.finish!(hc) on the HostCallHolder.

The difference between them is that non_continuous! will allow calling hostcall one more time before exiting, while finish! will exit immediately.

finish! can be used on any HostCallHolder to force-exit the running hostcall task.
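A minimal sketch of a long-lived hostcall (the handler and types are illustrative; kernel launches and device-side hostcall! calls are omitted):

hc = Device.HostCallHolder(Float32, Tuple{Float32}; continuous=true) do x
    return x * 2f0
end

# ... launch kernels that call `hostcall!(hc, ...)` as many times as needed ...

Device.finish!(hc)      # force the hostcall task to exit
AMDGPU.Device.free!(hc) # free hostcall buffers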

Free hostcall buffers

For custom hostcalls it is important to call AMDGPU.Device.free! once the kernel has finished, to free the buffers that the hostcall used in the process.

diff --git a/dev/index.html b/dev/index.html index 0d2d6b8b..5bd514ed 100644 --- a/dev/index.html +++ b/dev/index.html @@ -3,7 +3,7 @@ function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash}); -

Programming AMD GPUs with Julia

Julia support for programming AMD GPUs is currently provided by the AMDGPU.jl package. This package contains everything necessary to program for AMD GPUs in Julia, including:

  • An interface for compiling and running kernels written in Julia through LLVM's AMDGPU backend.
  • An interface for working with the HIP runtime API, necessary for launching compiled kernels and controlling the GPU.
  • An array type implementing the GPUArrays.jl interface, providing high-level array operations.

Installation

Simply add the AMDGPU.jl package to your Julia environment:

using Pkg
+

Programming AMD GPUs with Julia

Julia support for programming AMD GPUs is currently provided by the AMDGPU.jl package. This package contains everything necessary to program for AMD GPUs in Julia, including:

  • An interface for compiling and running kernels written in Julia through LLVM's AMDGPU backend.
  • An interface for working with the HIP runtime API, necessary for launching compiled kernels and controlling the GPU.
  • An array type implementing the GPUArrays.jl interface, providing high-level array operations.

Installation

Simply add the AMDGPU.jl package to your Julia environment:

using Pkg
 Pkg.add("AMDGPU")

To ensure that everything works, you can run the test suite:

using AMDGPU
 using Pkg
 Pkg.test("AMDGPU")

Requirements

  • Julia 1.9 or higher (Navi 3 requires Julia 1.10+).
  • 64-bit Linux or Windows.
  • Minimal supported ROCm version is 5.3.
  • Required software:
    • Linux: ROCm
    • Windows: ROCm and AMD Software: Adrenalin Edition

On Windows, AMD Software: Adrenalin Edition contains the HIP library itself, while ROCm provides support for other functionality.

Windows OS missing functionality

Windows does not yet support Hostcall, which means that some of the functionality does not work, like:

  • device printing;
  • dynamic memory allocation (from kernels).

These hostcalls are sometimes launched when AMDGPU detects that a kernel might throw an exception, specifically during conversions, like: Int32(1f0).

To avoid this, use the 'unsafe' conversion option: unsafe_trunc(Int32, 1f0).

ROCm system libraries

AMDGPU.jl looks into standard directories and uses Libdl.find_library to find ROCm libraries.

Standard path:

  • Linux: /opt/rocm
  • Windows: C:/Program Files/AMD/ROCm/<rocm-version>

If you have a non-standard path for ROCm, set the ROCM_PATH=<path> environment variable before launching Julia. For example, if ROCm is installed in your Linux system root (e.g. on Fedora), set ROCM_PATH=/usr/lib64/rocm/gfx11 or ROCM_PATH=/usr/lib64/rocm/gfx1103, depending on your GPU's architecture. You can query the architecture using the amdgpu-arch command. The AMDGPU.versioninfo() function prints the paths of any libraries found.

Depending on your GPU model and the functionality you want to use, you may have to force the GPU architecture by setting the HSA_OVERRIDE_GFX_VERSION variable to a compatible version.
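For example (the path and the gfx version are illustrative and depend on your system and GPU):

ROCM_PATH=/usr/lib64/rocm/gfx11 HSA_OVERRIDE_GFX_VERSION=11.0.0 julia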

Extra Setup Details

List of additional steps that may need to be taken to ensure everything is working:

  • Make sure your user is in the group that owns /dev/kfd (other than root).

    For example, it might be the render group:

    crw-rw---- 1 root render 234, 0 Aug 5 11:43 kfd

    In this case, you can add yourself to it:

    sudo usermod -aG render username

  • ROCm libraries should be in the standard library locations, or in your LD_LIBRARY_PATH.

  • If you get an error message along the lines of GLIB_CXX_... not found, it's possible that the C++ runtime used to build the ROCm stack and the one used by Julia are different. If you built the ROCm stack yourself this is very likely the case since Julia normally ships with its own C++ runtime.

    For more information, check out this GitHub issue. A quick fix is to use the LD_PRELOAD environment variable to make Julia use the system C++ runtime library, for example:

    LD_PRELOAD=/usr/lib/libstdc++.so julia

    Alternatively, you can build Julia from source as described here. To quickly debug this issue start Julia and try to load a ROCm library:

    using Libdl; Libdl.dlopen("/opt/rocm/hsa/lib/libhsa-runtime64.so.1")

Once all of this is setup properly, you should be able to do using AMDGPU successfully.

See the Quick Start documentation for an introduction to using AMDGPU.jl.

Preferences

AMDGPU.jl supports setting preferences. Template of LocalPreferences.toml with all options:

[AMDGPU]
@@ -17,4 +17,4 @@
 # Default is "none", which does not apply any limitation.
 hard_memory_limit = "none"
 # Notice a space between the value and percentage sign.
-# hard_memory_limit = "80 %"
+# hard_memory_limit = "80 %"
diff --git a/dev/kernel_programming/index.html b/dev/kernel_programming/index.html index 62eaba6d..122a6284 100644 --- a/dev/kernel_programming/index.html +++ b/dev/kernel_programming/index.html @@ -3,10 +3,10 @@ function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash}); -

Kernel Programming

Launch Configuration

While an almost arbitrarily large number of workitems can be executed per kernel launch, the hardware can only support executing a limited number of wavefronts at one time.

To alleviate this, the compiler calculates the "occupancy" of each compiled kernel (which is the number of wavefronts that can be simultaneously executing on the GPU), and passes this information to the hardware; the hardware then launches a limited number of wavefronts at once, based on the kernel's "occupancy" values.

The rest of the wavefronts are not launched until hardware resources become available, which means that a kernel with better occupancy will see more of its wavefronts executing simultaneously (which often leads to better performance). Suffice it to say, it's important to know the occupancy of kernels if you want the best performance.

Like CUDA.jl, AMDGPU.jl has the ability to calculate kernel occupancy, with the launch_configuration function:

kernel = @roc launch=false mykernel(args...)
+

Kernel Programming

Launch Configuration

While an almost arbitrarily large number of workitems can be executed per kernel launch, the hardware can only support executing a limited number of wavefronts at one time.

To alleviate this, the compiler calculates the "occupancy" of each compiled kernel (which is the number of wavefronts that can be simultaneously executing on the GPU), and passes this information to the hardware; the hardware then launches a limited number of wavefronts at once, based on the kernel's "occupancy" values.

The rest of the wavefronts are not launched until hardware resources become available, which means that a kernel with better occupancy will see more of its wavefronts executing simultaneously (which often leads to better performance). Suffice it to say, it's important to know the occupancy of kernels if you want the best performance.

Like CUDA.jl, AMDGPU.jl has the ability to calculate kernel occupancy, with the launch_configuration function:

kernel = @roc launch=false mykernel(args...)
 occupancy = AMDGPU.launch_configuration(kernel)
 @show occupancy.gridsize
-@show occupancy.groupsize

Specifically, launch_configuration calculates the occupancy of mykernel(args...), and then calculates an optimal groupsize based on the occupancy. This value can then be used to select the groupsize for the kernel:

@roc groupsize=occupancy.groupsize mykernel(args...)
AMDGPU.@rocMacro
@roc [kwargs...] func(args...)

High-level interface for launching kernels on the GPU. Upon the first call the kernel will be compiled; subsequent calls will re-use the compiled object.

Several keyword arguments are supported:

  • launch::Bool = true: whether to launch the kernel. If false, then returns a compiled kernel which can be launched by calling it and passing arguments.
  • Arguments that influence kernel compilation, see AMDGPU.Compiler.hipfunction.
  • Arguments that influence kernel launch, see AMDGPU.Runtime.HIPKernel.
source
AMDGPU.Runtime.HIPKernelType
(ker::HIPKernel)(args::Vararg{Any, N}; kwargs...)

Launch compiled HIPKernel by passing arguments to it.

The following kwargs are supported:

  • gridsize::ROCDim = 1: Size of the grid.
  • groupsize::ROCDim = 1: Size of the workgroup.
  • shmem::Integer = 0: Amount of dynamically-allocated shared memory in bytes.
  • stream::HIP.HIPStream = AMDGPU.stream(): Stream on which to launch the kernel.
source
AMDGPU.Compiler.hipfunctionFunction
hipfunction(f::F, tt::TT = Tuple{}; kwargs...)

Compile Julia function f to a HIP kernel given a tuple of argument types tt that it accepts.

The following kwargs are supported:

  • name::Union{String, Nothing} = nothing: A unique name to give a compiled kernel.
  • unsafe_fp_atomics::Bool = true: Whether to use 'unsafe' floating-point atomics. AMD GPU devices support fast atomic read-modify-write (RMW) operations on floating-point values. On single- or double-precision floating-point values this may generate a hardware RMW instruction that is faster than emulating the atomic operation using an atomic compare-and-swap (CAS) loop.
source

Atomics

AMDGPU.jl relies on Atomix.jl for atomics.

Example of a kernel that computes atomic max:

using AMDGPU
+@show occupancy.groupsize

Specifically, launch_configuration calculates the occupancy of mykernel(args...), and then calculates an optimal groupsize based on the occupancy. This value can then be used to select the groupsize for the kernel:

@roc groupsize=occupancy.groupsize mykernel(args...)
AMDGPU.@rocMacro
@roc [kwargs...] func(args...)

High-level interface for launching kernels on the GPU. Upon the first call the kernel will be compiled; subsequent calls will re-use the compiled object.

Several keyword arguments are supported:

  • launch::Bool = true: whether to launch the kernel. If false, then returns a compiled kernel which can be launched by calling it and passing arguments.
  • Arguments that influence kernel compilation, see AMDGPU.Compiler.hipfunction.
  • Arguments that influence kernel launch, see AMDGPU.Runtime.HIPKernel.
source
AMDGPU.Runtime.HIPKernelType
(ker::HIPKernel)(args::Vararg{Any, N}; kwargs...)

Launch compiled HIPKernel by passing arguments to it.

The following kwargs are supported:

  • gridsize::ROCDim = 1: Size of the grid.
  • groupsize::ROCDim = 1: Size of the workgroup.
  • shmem::Integer = 0: Amount of dynamically-allocated shared memory in bytes.
  • stream::HIP.HIPStream = AMDGPU.stream(): Stream on which to launch the kernel.
source
AMDGPU.Compiler.hipfunctionFunction
hipfunction(f::F, tt::TT = Tuple{}; kwargs...)

Compile Julia function f to a HIP kernel given a tuple of argument types tt that it accepts.

The following kwargs are supported:

  • name::Union{String, Nothing} = nothing: A unique name to give a compiled kernel.
  • unsafe_fp_atomics::Bool = true: Whether to use 'unsafe' floating-point atomics. AMD GPU devices support fast atomic read-modify-write (RMW) operations on floating-point values. On single- or double-precision floating-point values this may generate a hardware RMW instruction that is faster than emulating the atomic operation using an atomic compare-and-swap (CAS) loop.
source

Atomics

AMDGPU.jl relies on Atomix.jl for atomics.

Example of a kernel that computes atomic max:

using AMDGPU
 
 function ker_atomic_max!(target, source, indices)
     i = workitemIdx().x + (workgroupIdx().x - 0x1) * workgroupDim().x
@@ -20,7 +20,7 @@
 source = ROCArray(rand(UInt32, n))
 indices = ROCArray(rand(1:bins, n))
 target = ROCArray(zeros(UInt32, bins))
-@roc groupsize=256 gridsize=4 ker_atomic_max!(target, source, indices)

Device Intrinsics

Wavefront-Level Primitives

AMDGPU.Device.activelaneFunction
activelane()::Cuint

Get id of the current lane within a wavefront/warp.

julia> function ker!(x)
+@roc groupsize=256 gridsize=4 ker_atomic_max!(target, source, indices)

Device Intrinsics

Wavefront-Level Primitives

AMDGPU.Device.activelaneFunction
activelane()::Cuint

Get id of the current lane within a wavefront/warp.

julia> function ker!(x)
            i = AMDGPU.Device.activelane()
            x[i + 1] = i
            return
@@ -33,7 +33,7 @@
 
 julia> Array(x)
 1×8 Matrix{Int32}:
- 0  1  2  3  4  5  6  7
source
AMDGPU.Device.ballotFunction
ballot(predicate::Bool)::UInt64

Return a value whose Nth bit is set if and only if predicate evaluates to true for the Nth lane and the lane is active.

julia> function ker!(x)
+ 0  1  2  3  4  5  6  7
source
AMDGPU.Device.ballotFunction
ballot(predicate::Bool)::UInt64

Return a value whose Nth bit is set if and only if predicate evaluates to true for the Nth lane and the lane is active.

julia> function ker!(x)
            x[1] = AMDGPU.Device.ballot(true)
            return
        end
@@ -45,7 +45,7 @@
 
 julia> x
 1-element ROCArray{UInt64, 1, AMDGPU.Runtime.Mem.HIPBuffer}:
- 0x00000000ffffffff
source
AMDGPU.Device.ballot_syncFunction
ballot_sync(mask::UInt64, predicate::Bool)::UInt64

Evaluate predicate for all non-exited threads in mask and return an integer whose Nth bit is set if and only if predicate is true for the Nth thread of the wavefront and the Nth thread is active.

julia> function ker!(x)
+ 0x00000000ffffffff
source
AMDGPU.Device.ballot_syncFunction
ballot_sync(mask::UInt64, predicate::Bool)::UInt64

Evaluate predicate for all non-exited threads in mask and return an integer whose Nth bit is set if and only if predicate is true for the Nth thread of the wavefront and the Nth thread is active.

julia> function ker!(x)
            i = AMDGPU.Device.activelane()
            if i % 2 == 0
                mask = 0x0000000055555555 # Only even threads.
@@ -60,7 +60,7 @@
 julia> @roc groupsize=32 ker!(x);
 
 julia> bitstring(Array(x)[1])
-"0000000000000000000000000000000001010101010101010101010101010101"
source
AMDGPU.Device.bpermuteFunction
bpermute(addr::Integer, val::Cint)::Cint

Read data stored in val from the lane VGPR (vector general purpose register) given by addr.

The permute instruction moves data between lanes but still uses the notion of byte addressing, as do other LDS instructions. Hence, the value in the addr VGPR should be desired_lane_id * 4, since VGPR values are 4 bytes wide.

Example below shifts all values in the wavefront by 1 to the "left".

julia> function ker!(x)
+"0000000000000000000000000000000001010101010101010101010101010101"
source
AMDGPU.Device.bpermuteFunction
bpermute(addr::Integer, val::Cint)::Cint

Read data stored in val from the lane VGPR (vector general purpose register) given by addr.

The permute instruction moves data between lanes but still uses the notion of byte addressing, as do other LDS instructions. Hence, the value in the addr VGPR should be desired_lane_id * 4, since VGPR values are 4 bytes wide.

Example below shifts all values in the wavefront by 1 to the "left".

julia> function ker!(x)
            i::Cint = AMDGPU.Device.activelane()
            # `addr` points to the next immediate lane.
            addr = ((i + 1) % 8) * 4 # VGPRs are 4 bytes wide
@@ -76,7 +76,7 @@
 
 julia> x
 1×8 ROCArray{Int32, 2, AMDGPU.Runtime.Mem.HIPBuffer}:
- 1  2  3  4  5  6  7  0
source
AMDGPU.Device.permuteFunction
permute(addr::Integer, val::Cint)::Cint

Put data stored in val to the lane VGPR (vector general purpose register) given by addr.

Example below shifts all values in the wavefront by 1 to the "right".

julia> function ker!(x)
+ 1  2  3  4  5  6  7  0
source
AMDGPU.Device.permuteFunction
permute(addr::Integer, val::Cint)::Cint

Put data stored in val to the lane VGPR (vector general purpose register) given by addr.

Example below shifts all values in the wavefront by 1 to the "right".

julia> function ker!(x)
            i::Cint = AMDGPU.Device.activelane()
            # `addr` points to the next immediate lane.
            addr = ((i + 1) % 8) * 4 # VGPRs are 4 bytes wide
@@ -92,7 +92,7 @@
 
 julia> x
 1×8 ROCArray{Int32, 2, AMDGPU.Runtime.Mem.HIPBuffer}:
- 7  0  1  2  3  4  5  6
source
AMDGPU.Device.shflFunction
shfl(val, lane, width = wavefrontsize())

Read data stored in val from a lane (this is a more high-level op than bpermute).

If lane is outside the range [0:width - 1], the value returned corresponds to the value held by the lane modulo width (within the same subsection).

julia> function ker!(x)
+ 7  0  1  2  3  4  5  6
source
AMDGPU.Device.shflFunction
shfl(val, lane, width = wavefrontsize())

Read data stored in val from a lane (this is a more high-level op than bpermute).

If lane is outside the range [0:width - 1], the value returned corresponds to the value held by the lane modulo width (within the same subsection).

julia> function ker!(x)
            i::UInt32 = AMDGPU.Device.activelane()
            x[i + 1] = AMDGPU.Device.shfl(i, i + 1)
            return
@@ -118,7 +118,7 @@
 
 julia> Int.(x)
 1×8 ROCArray{Int64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:
- 1  2  3  0  5  6  7  4
source
AMDGPU.Device.shfl_syncFunction
shfl_sync(mask::UInt64, val, lane, width = wavefrontsize())

Synchronize threads according to a mask and read data stored in val from a lane ID.

source
AMDGPU.Device.shfl_upFunction
shfl_up(val, δ, width = wavefrontsize())

Same as shfl, but instead of specifying lane ID, accepts δ that is subtracted from the current lane ID. I.e. read from a lane with lower ID relative to the caller.

julia> function ker!(x)
+ 1  2  3  0  5  6  7  4
source
AMDGPU.Device.shfl_syncFunction
shfl_sync(mask::UInt64, val, lane, width = wavefrontsize())

Synchronize threads according to a mask and read data stored in val from a lane ID.

source
AMDGPU.Device.shfl_upFunction
shfl_up(val, δ, width = wavefrontsize())

Same as shfl, but instead of specifying lane ID, accepts δ that is subtracted from the current lane ID. I.e. read from a lane with lower ID relative to the caller.

julia> function ker!(x)
            i = AMDGPU.Device.activelane()
            x[i + 1] = AMDGPU.Device.shfl_up(i, 1)
            return
@@ -131,7 +131,7 @@
 
 julia> x
 1×8 ROCArray{Int64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:
- 0  0  1  2  3  4  5  6
source
AMDGPU.Device.shfl_up_syncFunction
shfl_up_sync(mask::UInt64, val, δ, width = wavefrontsize())

Synchronize threads according to a mask and read data stored in val from a lane with lower ID relative to the caller.

source
AMDGPU.Device.shfl_downFunction
shfl_down(val, δ, width = wavefrontsize())

Same as shfl, but instead of specifying lane ID, accepts δ that is added to the current lane ID. I.e. read from a lane with higher ID relative to the caller.

julia> function ker!(x)
+ 0  0  1  2  3  4  5  6
source
AMDGPU.Device.shfl_up_syncFunction
shfl_up_sync(mask::UInt64, val, δ, width = wavefrontsize())

Synchronize threads according to a mask and read data stored in val from a lane with lower ID relative to the caller.

source
AMDGPU.Device.shfl_downFunction
shfl_down(val, δ, width = wavefrontsize())

Same as shfl, but instead of specifying lane ID, accepts δ that is added to the current lane ID. I.e. read from a lane with higher ID relative to the caller.

julia> function ker!(x)
            i = AMDGPU.Device.activelane()
            x[i + 1] = AMDGPU.Device.shfl_down(i, 1, 8)
            return
@@ -144,7 +144,7 @@
 
 julia> x
 1×8 ROCArray{Int64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:
- 1  2  3  4  5  6  7  7
source
AMDGPU.Device.shfl_down_syncFunction
shfl_down_sync(mask::UInt64, val, δ, width = wavefrontsize())

Synchronize threads according to a mask and read data stored in val from a lane with higher ID relative to the caller.

source
AMDGPU.Device.shfl_xorFunction
shfl_xor(val, lane_mask, width = wavefrontsize())

Same as shfl, but instead of specifying lane ID, performs bitwise XOR of the caller's lane ID with the lane_mask.

julia> function ker!(x)
+ 1  2  3  4  5  6  7  7
source
AMDGPU.Device.shfl_down_syncFunction
shfl_down_sync(mask::UInt64, val, δ, width = wavefrontsize())

Synchronize threads according to a mask and read data stored in val from a lane with higher ID relative to the caller.

source
AMDGPU.Device.shfl_xorFunction
shfl_xor(val, lane_mask, width = wavefrontsize())

Same as shfl, but instead of specifying lane ID, performs bitwise XOR of the caller's lane ID with the lane_mask.

julia> function ker!(x)
            i = AMDGPU.Device.activelane()
            x[i + 1] = AMDGPU.Device.shfl_xor(i, 1)
            return
@@ -157,7 +157,7 @@
 
 julia> x
 1×8 ROCArray{Int64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:
- 1  0  3  2  5  4  7  6
source
AMDGPU.Device.shfl_xor_syncFunction
shfl_xor_sync(mask::UInt64, val, lane_mask, width = wavefrontsize())

Synchronize threads according to a mask and read data stored in val from a lane according to a bitwise XOR of the caller's lane ID with the lane_mask.

source
AMDGPU.Device.any_syncFunction
any_sync(mask::UInt64, predicate::Bool)::Bool

Evaluate predicate for all non-exited threads in mask and return non-zero if and only if predicate evaluates to non-zero for any of them.

julia> function ker!(x)
+ 1  0  3  2  5  4  7  6
source
AMDGPU.Device.shfl_xor_syncFunction
shfl_xor_sync(mask::UInt64, val, lane_mask, width = wavefrontsize())

Synchronize threads according to a mask and read data stored in val from a lane according to a bitwise XOR of the caller's lane ID with the lane_mask.

source
AMDGPU.Device.any_syncFunction
any_sync(mask::UInt64, predicate::Bool)::Bool

Evaluate predicate for all non-exited threads in mask and return non-zero if and only if predicate evaluates to non-zero for any of them.

julia> function ker!(x)
            i = AMDGPU.Device.activelane()
            if i % 2 == 0
                mask = 0x0000000055555555 # Only even threads.
@@ -173,7 +173,7 @@
 
 julia> x
 1-element ROCArray{Bool, 1, AMDGPU.Runtime.Mem.HIPBuffer}:
- 1
source
AMDGPU.Device.all_syncFunction
all_sync(mask::UInt64, predicate::Bool)::Bool

Evaluate predicate for all non-exited threads in mask and return non-zero if and only if predicate evaluates to non-zero for all of them.

julia> function ker!(x)
+ 1
source
AMDGPU.Device.all_syncFunction
all_sync(mask::UInt64, predicate::Bool)::Bool

Evaluate predicate for all non-exited threads in mask and return non-zero if and only if predicate evaluates to non-zero for all of them.

julia> function ker!(x)
            i = AMDGPU.Device.activelane()
            if i % 2 == 0
                mask = 0x0000000055555555 # Only even threads.
@@ -189,4 +189,4 @@
 
 julia> x
 1-element ROCArray{Bool, 1, AMDGPU.Runtime.Mem.HIPBuffer}:
- 1
source
+ 1
source
diff --git a/dev/logging/index.html b/dev/logging/index.html index 31e93f06..f2f766e1 100644 --- a/dev/logging/index.html +++ b/dev/logging/index.html @@ -3,10 +3,10 @@ function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash}); -

Runtime and Compiler Logging

AMDGPU.jl has a built-in logging system integrated into various runtime and compiler operations, which is provided by TimespanLogging.jl. Operations such as compilation and linking, signal and buffer allocation/freeing, kernel launch, etc. are instrumented with logging statements, allowing the user to record the start and end of operations.

While disabled by default, logging can be enabled by running AMDGPU.Runtime.enable_logging!(), after which Julia must be restarted for the change to take effect.

Once logging is globally enabled, AMDGPU.Runtime.start_logging() causes new log events to be saved, while AMDGPU.Runtime.stop_logging() causes new log events to be discarded. Log events can be collected with AMDGPU.Runtime.fetch_logs!(). A more convenient option is AMDGPU.Runtime.log_and_fetch!(f), which can be used to easily log operations within a region of code:

logs = AMDGPU.Runtime.log_and_fetch!() do
+

Runtime and Compiler Logging

AMDGPU.jl has a built-in logging system integrated into various runtime and compiler operations, which is provided by TimespanLogging.jl. Operations such as compilation and linking, signal and buffer allocation/freeing, kernel launch, etc. are instrumented with logging statements, allowing the user to record the start and end of operations.

While disabled by default, logging can be enabled by running AMDGPU.Runtime.enable_logging!(), after which Julia must be restarted for the change to take effect.

Once logging is globally enabled, AMDGPU.Runtime.start_logging() causes new log events to be saved, while AMDGPU.Runtime.stop_logging() causes new log events to be discarded. Log events can be collected with AMDGPU.Runtime.fetch_logs!(). A more convenient option is AMDGPU.Runtime.log_and_fetch!(f), which can be used to easily log operations within a region of code:

logs = AMDGPU.Runtime.log_and_fetch!() do
     A = AMDGPU.ones(3, 4)
     B = copy(A)
     fill!(B, 1f0)
     C = Array(B)
 end
-@show logs[1]
+@show logs[1]
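Equivalently, logging can be toggled manually around a region of interest (a sketch using the functions described above):

AMDGPU.Runtime.start_logging()
A = AMDGPU.ones(3, 4)
B = A .+ 1f0
AMDGPU.Runtime.stop_logging()
logs = AMDGPU.Runtime.fetch_logs!()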
diff --git a/dev/memory/index.html b/dev/memory/index.html index bfd2665f..de43f08b 100644 --- a/dev/memory/index.html +++ b/dev/memory/index.html @@ -3,7 +3,7 @@ function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash}); -

Memory Allocation and Intrinsics

Memory Varieties

GPUs contain various kinds of memory, just like CPUs:

  • Global: Globally accessible by all CUs on a GPU, and possibly accessible from outside of the GPU (by the CPU host, by other GPUs, by PCIe devices, etc.). Slowest form of memory.
  • Constant: Same as global memory, but signals to the hardware that it can use special instructions to access and cache this memory. Can be changed between kernel invocations.
  • Region: Also known as Global Data Store (GDS), all wavefronts on a CU can access the same memory region from the same address. Faster than Global/Constant. Automatically allocated by the compiler/runtime, not user accessible.
  • Local: Also known as Local Data Store (LDS), all wavefronts in the same workgroup can access the same memory region from the same address. Faster than GDS.
  • Private: Uses the hardware scratch space, and is private to each SIMD lane in a wavefront. Fastest form of traditional memory.

Local Memory

Local memory may be allocated within a kernel by calling either:

  • @ROCStaticLocalArray(T, dims) - if dims is passed as a constant value, known at compile-time. E.g. @ROCStaticLocalArray(Float32, 8).

  • @ROCDynamicLocalArray(T, dims) - otherwise. E.g. @ROCDynamicLocalArray(Float32, length(X)).

Local memory is zero-initialized by default. If this is unnecessary and undesired for performance reasons, disable it by passing false as the last argument: @ROCStaticLocalArray(Float32, 8, false) or @ROCDynamicLocalArray(Float32, length(X), false).

Local memory does not need to be freed, as it is automatically freed by the hardware.

If @ROCDynamicLocalArray is used, then local memory is dynamically allocated at kernel execution time. The shmem option to @roc must be set appropriately to ensure that enough local memory is allocated by the hardware.

It is allocated in addition to the local memory that is statically allocated by the kernel.

function kernel(C, A)
+

Memory Allocation and Intrinsics

Memory Varieties

GPUs contain various kinds of memory, just like CPUs:

  • Global: Globally accessible by all CUs on a GPU, and possibly accessible from outside of the GPU (by the CPU host, by other GPUs, by PCIe devices, etc.). Slowest form of memory.
  • Constant: Same as global memory, but signals to the hardware that it can use special instructions to access and cache this memory. Can be changed between kernel invocations.
  • Region: Also known as Global Data Store (GDS), all wavefronts on a CU can access the same memory region from the same address. Faster than Global/Constant. Automatically allocated by the compiler/runtime, not user accessible.
  • Local: Also known as Local Data Store (LDS), all wavefronts in the same workgroup can access the same memory region from the same address. Faster than GDS.
  • Private: Uses the hardware scratch space, and is private to each SIMD lane in a wavefront. Fastest form of traditional memory.

Local Memory

Local memory may be allocated within a kernel by calling either:

  • @ROCStaticLocalArray(T, dims) - if dims is passed as a constant value, known at compile-time. E.g. @ROCStaticLocalArray(Float32, 8).

  • @ROCDynamicLocalArray(T, dims) - otherwise. E.g. @ROCDynamicLocalArray(Float32, length(X)).

Local memory is zero-initialized by default. If this is unnecessary and undesired for performance reasons, disable it by passing false as the last argument: @ROCStaticLocalArray(Float32, 8, false) or @ROCDynamicLocalArray(Float32, length(X), false).

Local memory does not need to be freed, as it is automatically freed by the hardware.

If @ROCDynamicLocalArray is used, then local memory is dynamically allocated at kernel execution time. The shmem option to @roc must be set appropriately to ensure that enough local memory is allocated by the hardware.

It is allocated in addition to the local memory that is statically allocated by the kernel.
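For example, a launch of the kernel shown below might size shmem from the element type and length of the dynamically-allocated array (the groupsize here is illustrative):

@roc groupsize=256 shmem=sizeof(Float64) * length(C) kernel(C, A)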

function kernel(C, A)
     # Allocate local memory dynamically
     Ctmp = @ROCDynamicLocalArray(Float64, length(C))
     # Or, allocate local memory statically if the size is known ahead-of-time
@@ -62,4 +62,4 @@
 xd * xd
 
 # Freeing is a no-op for `xd`, since `xd` does not own the underlying memory.
-AMDGPU.unsafe_free!(xd) # No-op.

Notice the mandatory ; lock=false keyword; it is needed to differentiate between host and device pointers.

+AMDGPU.unsafe_free!(xd) # No-op.

Notice the mandatory ; lock=false keyword; it is needed to differentiate between host and device pointers.

diff --git a/dev/objects.inv b/dev/objects.inv index 128fbf7d..089472fe 100644 Binary files a/dev/objects.inv and b/dev/objects.inv differ diff --git a/dev/printing/index.html b/dev/printing/index.html index cf08c1d8..001faab3 100644 --- a/dev/printing/index.html +++ b/dev/printing/index.html @@ -3,7 +3,7 @@ function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash}); -

Printing

Writing GPU kernels can be a difficult endeavor, owing to the fact that the LLVM GPU backends turn serial code into parallel code automatically. Recognizing this, every good GPU programming interface allows the user's GPU kernels to print output to a buffer, which will be passed to the host for display. With the ability to interpolate variables, this functionality serves as the "printf of GPUs". Quite literally, the primary tool for this is @rocprintf. Here's a simple example of printing the current workgroup index:

kernel(x) = @rocprintf "Workgroup index: %d\n" workgroupIdx().x

The above kernel would print out the string "Workgroup index: 1\n" when run with a single workgroup (where "\n" means a newline).

Any number of variables may be passed to @rocprintf, as long as those variables have a printf-compatible implementation in Printf.@printf. Calls to @rocprintf are blocking, and will not return control to the kernel until the string has been formatted and sent to the OS runtime for printing (the same as for calls to Printf.@printf).

While @rocprintf prints once per workgroup by default, it's possible to print once per lane, once per wavefront, or once per grid by specifying an execution mode as the first argument:

# Once per lane
+

Printing

Writing GPU kernels can be a difficult endeavor, owing to the fact that the LLVM GPU backends turn serial code into parallel code automatically. Recognizing this, every good GPU programming interface allows the user's GPU kernels to print output to a buffer, which will be passed to the host for display. With the ability to interpolate variables, this functionality serves as the "printf of GPUs". Quite literally, the primary tool for this is @rocprintf. Here's a simple example of printing the current workgroup index:

kernel(x) = @rocprintf "Workgroup index: %d\n" workgroupIdx().x

The above kernel would print out the string "Workgroup index: 1\n" when run with a single workgroup (where "\n" means a newline).

Any number of variables may be passed to @rocprintf, as long as those variables have a printf-compatible implementation in Printf.@printf. Calls to @rocprintf are blocking, and will not return control to the kernel until the string has been formatted and sent to the OS runtime for printing (the same as for calls to Printf.@printf).

While @rocprintf prints once per workgroup by default, it's possible to print once per lane, once per wavefront, or once per grid by specifying an execution mode as the first argument:

# Once per lane
 kernel(x) = @rocprintf :lane "My index is: %d\n" workitemIdx().x
 
 # Once per wavefront
@@ -38,4 +38,4 @@
 My index is 1
 
 # :grid
-My index is 1

Differences to @cuprintf

Similar to CUDA's @cuprintf, @rocprintf is a printf-compatible macro which takes a format string and arguments, and commands the host CPU to display it as formatted text. However, in contrast to @cuprintf, we use AMDGPU's hostcall and Julia's Printf stdlib to implement this. This means that anything that Printf can print, so can @rocprintf (assuming such an object can be represented on the GPU). The macro is also handled as a regular hostcall, which means that argument types are checked at compile time (although currently, any errors while printing will be detected on the host, and will terminate the kernel).

+My index is 1

Differences to @cuprintf

Similar to CUDA's @cuprintf, @rocprintf is a printf-compatible macro which takes a format string and arguments, and commands the host CPU to display it as formatted text. However, in contrast to @cuprintf, we use AMDGPU's hostcall and Julia's Printf stdlib to implement this. This means that anything that Printf can print, so can @rocprintf (assuming such an object can be represented on the GPU). The macro is also handled as a regular hostcall, which means that argument types are checked at compile time (although currently, any errors while printing will be detected on the host, and will terminate the kernel).

diff --git a/dev/profiling/index.html b/dev/profiling/index.html index a95af96e..aec107bf 100644 --- a/dev/profiling/index.html +++ b/dev/profiling/index.html @@ -3,7 +3,7 @@ function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash}); -

rocprof

rocprofv2 allows profiling both HSA & HIP API calls (rocprof being deprecated).

Let's profile a simple copying kernel saved in the profile.jl file:

using AMDGPU
+

rocprof

rocprofv2 allows profiling both HSA & HIP API calls (rocprof being deprecated).

Let's profile a simple copying kernel saved in the profile.jl file:

using AMDGPU
 
 function mycopy!(dst, src)
     i = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
@@ -34,4 +34,4 @@
         @roc groupsize=groupsize gridsize=gridsize mycopy!(dst, src)
     end
     AMDGPU.synchronize()
-    ...

Running profiling again and visualizing the results, we now see that kernel launches are adjacent to each other and that the average wall duration is lower.

(Profiler trace screenshots: zoomed-out and zoomed-in views.)

Debugging

Use HIP_LAUNCH_BLOCKING=1 to synchronize immediately after launching GPU kernels. This makes it possible to pinpoint the exact kernel that caused an exception.

+ ...

Running profiling again and visualizing the results, we now see that kernel launches are adjacent to each other and that the average wall duration is lower.

(Profiler trace screenshots: zoomed-out and zoomed-in views.)

Debugging

Use HIP_LAUNCH_BLOCKING=1 to synchronize immediately after launching GPU kernels. This makes it possible to pinpoint the exact kernel that caused an exception.
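For example (a shell invocation; profile.jl is the script from the profiling example above):

HIP_LAUNCH_BLOCKING=1 julia profile.jl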

diff --git a/dev/quickstart/index.html b/dev/quickstart/index.html index 57cf12c1..d63a2342 100644 --- a/dev/quickstart/index.html +++ b/dev/quickstart/index.html @@ -3,7 +3,7 @@ function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash}); -

Quick Start

Simple example

As a simple example, let's add two vectors both on CPU and GPU and make sure that the results are the same:

First, we do this on CPU:

julia> n = 1024;
+

Quick Start

Simple example

As a simple example, let's add two vectors both on CPU and GPU and make sure that the results are the same:

First, we do this on CPU:

julia> n = 1024;
 
 julia> a = fill(1.0, n);
 
@@ -28,4 +28,4 @@
 julia> @roc groupsize=groupsize gridsize=gridsize vadd!(c_d, a_d, b_d);
 
 julia> Array(c_d) ≈ c
-true

The easiest way to launch a GPU kernel is with the @roc macro, specifying groupsize and gridsize to cover the full array, and calling it like a regular function.

Keep in mind that kernel launches are asynchronous, meaning that you need to synchronize before you can use the result (e.g. with AMDGPU.synchronize). However, GPU <-> CPU transfers synchronize implicitly.

The grid is the domain over which the entire kernel executes. The grid will be split into multiple workgroups by hardware automatically, and the kernel does not complete until all workgroups complete.

Like OpenCL, AMDGPU has the concept of "workitems", "workgroups", and the "grid". A workitem is a single thread of execution, capable of performing arithmetic operations. Workitems are grouped into "wavefronts" ("warps" in CUDA), which share the same compute unit and execute the same instructions simultaneously. The workgroup is a logical unit of compute supported by hardware which comprises multiple wavefronts, shares resources (specifically local memory), and can be efficiently synchronized. A workgroup may be executed by one or multiple hardware compute units, making it often the only dimension of importance for smaller kernel launches.

Notice how we explicitly specify that this function does not return a value by adding the return statement. This is necessary for all GPU kernels and we can enforce it by adding a return, return nothing, or even nothing at the end of the kernel. If this statement is omitted, Julia will attempt to return the value of the last evaluated expression, in this case a Float64, which will cause a compilation failure as kernels cannot return values.

Naming conventions

Throughout this example we use terms like "work group" and "work item". These terms are used by the Khronos consortium and their APIs including OpenCL and Vulkan, as well as the HSA foundation.

NVIDIA, on the other hand, uses some different terms in their CUDA API, which might be confusing to some users porting their kernels from CUDA to AMDGPU.

As a quick summary, here is a mapping of the most common terms:

AMDGPUCUDA
workitemIdxthreadIdx
workgroupIdxblockIdx
workgroupDimblockDim
gridItemDimNo equivalent
gridGroupDimgridDim
groupsizethreads
gridsizeblocks
streamstream
+true

The easiest way to launch a GPU kernel is with the @roc macro, specifying groupsize and gridsize to cover the full array, and calling it like a regular function.

Keep in mind that kernel launches are asynchronous, meaning that you need to synchronize before you can use the result (e.g. with AMDGPU.synchronize). However, GPU <-> CPU transfers synchronize implicitly.
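As a minimal sketch (assuming the vadd! kernel and the arrays defined above), an explicit synchronization after an asynchronous launch looks like this:

    @roc groupsize=groupsize gridsize=gridsize vadd!(c_d, a_d, b_d)
    AMDGPU.synchronize()    # wait for the kernel to finish
    c_host = Array(c_d)     # a GPU -> CPU copy would also synchronize implicitly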

The grid is the domain over which the entire kernel executes. The grid is split into multiple workgroups by the hardware automatically, and the kernel does not complete until all workgroups complete.

Like OpenCL, AMDGPU has the concept of "workitems", "workgroups", and the "grid". A workitem is a single thread of execution, capable of performing arithmetic operations. Workitems are grouped into "wavefronts" ("warps" in CUDA), which share the same compute unit and execute the same instructions simultaneously. The workgroup is a logical unit of compute supported by the hardware that comprises multiple wavefronts; its wavefronts share resources (specifically local memory) and can be efficiently synchronized. A workgroup may be executed by one or multiple hardware compute units, making it often the only dimension of importance for smaller kernel launches.
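As a small illustration (not part of the original example), the wavefront size of the current device can be queried with AMDGPU.HIP.wavefrontsize:

    wf = AMDGPU.HIP.wavefrontsize(AMDGPU.device())    # either 32 or 64, depending on the GPU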

Notice how we explicitly specify that this function does not return a value by adding the return statement. This is necessary for all GPU kernels; we can enforce it by ending the kernel with return, return nothing, or even nothing. If this statement is omitted, Julia will attempt to return the value of the last evaluated expression, in this case a Float64, which causes a compilation failure because kernels cannot return values.
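For instance, here is a minimal sketch of a kernel that ends with return nothing (the kernel name scale! is hypothetical and used only for illustration):

    function scale!(x)
        i = workitemIdx().x
        x[i] = 2 * x[i]    # without the return below, the value of this assignment would be returned
        return nothing
    end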

Naming conventions

Throughout this example we use terms like "work group" and "work item". These terms are used by the Khronos consortium and their APIs including OpenCL and Vulkan, as well as the HSA foundation.

NVIDIA, on the other hand, uses different terms in its CUDA API, which might be confusing to users porting their kernels from CUDA to AMDGPU.

As a quick summary, here is a mapping of the most common terms:

AMDGPU       | CUDA
workitemIdx  | threadIdx
workgroupIdx | blockIdx
workgroupDim | blockDim
gridItemDim  | No equivalent
gridGroupDim | gridDim
groupsize    | threads
gridsize     | blocks
stream       | stream
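AMDGPU.Device also provides the CUDA-style names directly for compatibility. A minimal sketch computing a global index with them (the kernel name my_kernel! is hypothetical):

    function my_kernel!(x)
        i = AMDGPU.Device.threadIdx().x +
            (AMDGPU.Device.blockIdx().x - 1) * AMDGPU.Device.blockDim().x
        if i <= length(x)
            @inbounds x[i] += 1
        end
        return
    end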
diff --git a/dev/search_index.js b/dev/search_index.js index 21590641..407b93f8 100644 --- a/dev/search_index.js +++ b/dev/search_index.js @@ -1,3 +1,3 @@ var documenterSearchIndex = {"docs": -[{"location":"exceptions/#Kernel-Exceptions","page":"Exceptions","title":"Kernel Exceptions","text":"","category":"section"},{"location":"exceptions/","page":"Exceptions","title":"Exceptions","text":"Just like regular CPU-executed Julia functions, GPU kernels can throw exceptions!","category":"page"},{"location":"exceptions/","page":"Exceptions","title":"Exceptions","text":"For example, the following kernel will throw an out-of-bounds exception:","category":"page"},{"location":"exceptions/","page":"Exceptions","title":"Exceptions","text":"julia> function ker!(x)\n x[0] = 1\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray([1]);\n\njulia> @roc kerr(x);\n\njulia> AMDGPU.synchronize()\nERROR: GPU Kernel Exception\nStacktrace:\n [1] error(s::String)\n @ Base ./error.jl:35\n [2] throw_if_exception(dev::HIPDevice)\n @ AMDGPU ~/.julia/dev/AMDGPU/src/exception_handler.jl:115\n [3] synchronize(stm::HIPStream)\n @ AMDGPU ~/.julia/dev/AMDGPU/src/highlevel.jl:154\n [4] synchronize()\n @ AMDGPU ~/.julia/dev/AMDGPU/src/highlevel.jl:154\n [5] top-level scope\n @ REPL[5]:1","category":"page"},{"location":"exceptions/","page":"Exceptions","title":"Exceptions","text":"Kernel-thrown exceptions are thrown during the host synchronization AMDGPU.synchronize or on the next kernel launch.","category":"page"},{"location":"exceptions/","page":"Exceptions","title":"Exceptions","text":"Kernels that hit an exception will write its information into a pre-allocated host buffer. Once complete, the wavefront throwing the exception will lock the buffer to prevent other wavefronts from overwriting the exception and stop itself, but other wavefronts will continue executing.","category":"page"},{"location":"caching_allocator/#Caching-Memory-Allocator","page":"Caching Memory Allocator","title":"Caching Memory Allocator","text":"","category":"section"},{"location":"caching_allocator/","page":"Caching Memory Allocator","title":"Caching Memory Allocator","text":"Julia uses Garbage-Collection (GC) for automatic memory management. 
However, it does not know about other memory spaces, therefore it sees no difference between 1 KiB GPU allocation and 1 GiB and doesn't free it in time.","category":"page"},{"location":"caching_allocator/","page":"Caching Memory Allocator","title":"Caching Memory Allocator","text":"This leads to a situations where all of the GPU memory is used, even though your algorithm only requires a fraction of it.","category":"page"},{"location":"caching_allocator/","page":"Caching Memory Allocator","title":"Caching Memory Allocator","text":"Current mechanism of dealing with OOM (Out-Of-Memory) errors during allocations is to manually trigger GC and retry allocating again doing this in several rounds each more aggressive than previous.","category":"page"},{"location":"caching_allocator/","page":"Caching Memory Allocator","title":"Caching Memory Allocator","text":"However, manually triggering GC is very expensive, since it requires scanning all Julia objects, not just ROCArrays, so the actual memory freeing takes a fraction of GC time: (Image: )","category":"page"},{"location":"caching_allocator/","page":"Caching Memory Allocator","title":"Caching Memory Allocator","text":"On the image above, red region is a call to GC and green region is where actual GPU memory is being freed.","category":"page"},{"location":"caching_allocator/","page":"Caching Memory Allocator","title":"Caching Memory Allocator","text":"","category":"page"},{"location":"caching_allocator/","page":"Caching Memory Allocator","title":"Caching Memory Allocator","text":"To help with memory management, we can use caching memory allocator. It is usefult in scenarios where we execute the same function multiple times and have the same memory allocation pattern. One such example is training DL models, where given the model and its parameters we compute loss, gradients w.r.t. loss and perform in-place parameter update. In this case, every iteration performs same operations and memory allocations and with caching allocator we can efficiently re-use them without returning the memory back to OS.","category":"page"},{"location":"caching_allocator/#Example","page":"Caching Memory Allocator","title":"Example","text":"","category":"section"},{"location":"caching_allocator/","page":"Caching Memory Allocator","title":"Caching Memory Allocator","text":"We have a for-loop, where each iteration requires 2 GiB of VRAM. We create a caching allocator with the name :loop and pass a function to execute. First iteration will allocate, but subsequent won't.","category":"page"},{"location":"caching_allocator/","page":"Caching Memory Allocator","title":"Caching Memory Allocator","text":"using AMDGPU\n\nfunction main()\n n = 1024^2 * 256\n for i in 1:1000\n AMDGPU.with_caching_allocator(:loop, n) do n\n sin.(AMDGPU.rand(Float32, n)) # 2 GiB allocation\n return\n end\n end\nend","category":"page"},{"location":"caching_allocator/","page":"Caching Memory Allocator","title":"Caching Memory Allocator","text":"The reason for marking a region of code where to re-use the memory and not extending it to the whole program instead, is because we cannot rely on GC to tell us when the memory is no longer used (it is too slow for that), so we create such region manually.","category":"page"},{"location":"caching_allocator/","page":"Caching Memory Allocator","title":"Caching Memory Allocator","text":"You can free all memory held by allocator, by invalidating it using its name with AMDGPU.invalidate_caching_allocator!. 
Or if you want some region of code within AMDGPU.with_caching_allocator to execute without relying on cache, use AMDGPU.with_no_caching.","category":"page"},{"location":"caching_allocator/","page":"Caching Memory Allocator","title":"Caching Memory Allocator","text":" Without Caching Allocator With Caching Allocator\nVRAM Usage (Image: ) (Image: )\nExecution time (seconds) 12.865149 0.020943","category":"page"},{"location":"caching_allocator/#API","page":"Caching Memory Allocator","title":"API","text":"","category":"section"},{"location":"caching_allocator/","page":"Caching Memory Allocator","title":"Caching Memory Allocator","text":"AMDGPU.with_caching_allocator\nAMDGPU.with_no_caching\nAMDGPU.invalidate_caching_allocator!","category":"page"},{"location":"caching_allocator/#AMDGPU.with_caching_allocator","page":"Caching Memory Allocator","title":"AMDGPU.with_caching_allocator","text":"with_caching_allocator(f, alloc_name::Symbol, args...)\n\nExecute function f with arguments args... using caching allocator given by its name alloc_name.\n\nAll GPU memory allocations will attempt to hit this cache before doing actual allocation (in case of cache miss). After executing f, all \"busy\" memory within the allocator is marked as free, so it can be re-used with the next call.\n\nReturns\n\nResult of the f function.\n\n\n\n\n\n","category":"function"},{"location":"caching_allocator/#AMDGPU.with_no_caching","page":"Caching Memory Allocator","title":"AMDGPU.with_no_caching","text":"with_no_caching(f)\n\nExecute function f, but avoid hitting any caching allocator. This is useful to call from within with_caching_allocator, so that the memory is independent from it.\n\nReturns\n\nResult of the f function.\n\n\n\n\n\n","category":"function"},{"location":"caching_allocator/#AMDGPU.invalidate_caching_allocator!","page":"Caching Memory Allocator","title":"AMDGPU.invalidate_caching_allocator!","text":"invalidate_caching_allocator!(alloc_name::Symbol)\n\nFree all memory held by caching allocator given by it name alloc_name.\n\n\n\n\n\n","category":"function"},{"location":"devices/#Devices","page":"Devices","title":"Devices","text":"","category":"section"},{"location":"devices/","page":"Devices","title":"Devices","text":"In AMDGPU, all GPU devices are auto-detected by the runtime, if they're supported.","category":"page"},{"location":"devices/","page":"Devices","title":"Devices","text":"AMDGPU maintains a global default device. The default device is relevant for all kernel and GPUArray operations. If one is not specified via @roc or an equivalent interface, then the default device is used for those operations, which affects compilation and kernel launch.","category":"page"},{"location":"devices/","page":"Devices","title":"Devices","text":"The device bound to a current Julia task is accessible via AMDGPU.device method. The list of available devices can be queried with AMDGPU.devices method.","category":"page"},{"location":"devices/","page":"Devices","title":"Devices","text":"If you have a HIPDevice object, you can also switch the device with AMDGPU.device!. 
This will switch it only within the task it is called from.","category":"page"},{"location":"devices/","page":"Devices","title":"Devices","text":"xd1 = AMDGPU.ones(Float32, 16) # On `AMDGPU.device()` device.\n\nAMDGPU.device!(AMDGPU.devices()[2]) # Switch to second device.\nxd2 = AMDPGU.ones(Float32, 16) # On second device.","category":"page"},{"location":"devices/","page":"Devices","title":"Devices","text":"Additionally, devices have an associated numeric ID. This value is bounded between 1 and length(AMDGPU.devices()), and device 1 is the default device when AMDGPU is first loaded. The ID of the device associated with the current task can be queried with AMDGPU.device_id and changed with AMDGPU.device_id!.","category":"page"},{"location":"devices/","page":"Devices","title":"Devices","text":"AMDGPU.devices\nAMDGPU.device\nAMDGPU.device!\nAMDGPU.device_id\nAMDGPU.device_id!","category":"page"},{"location":"devices/#AMDGPU.HIP.devices","page":"Devices","title":"AMDGPU.HIP.devices","text":"devices()\n\nGet list of all devices.\n\n\n\n\n\n","category":"function"},{"location":"devices/#AMDGPU.device","page":"Devices","title":"AMDGPU.device","text":"device()::HIPDevice\n\nGet currently active device. This device is used when launching kernels via @roc.\n\n\n\n\n\ndevice(A::ROCArray) -> HIPDevice\n\nReturn the device associated with the array A.\n\n\n\n\n\n","category":"function"},{"location":"devices/#AMDGPU.device!","page":"Devices","title":"AMDGPU.device!","text":"device!(device::HIPDevice)\n\nSwitch current device being used. This switches only for a task inside which it is called.\n\n\n\n\n\n","category":"function"},{"location":"devices/#AMDGPU.device_id","page":"Devices","title":"AMDGPU.device_id","text":"device_id() -> Int\ndevice_id(device::HIPDevice) -> Int\n\nReturns the numerical device ID for device or for the current AMDGPU.device().\n\n\n\n\n\n","category":"function"},{"location":"devices/#AMDGPU.device_id!","page":"Devices","title":"AMDGPU.device_id!","text":"device_id!(idx::Integer)\n\nSets the current device to AMDGPU.devices()[idx]. See device_id for details on the numbering semantics.\n\n\n\n\n\n","category":"function"},{"location":"devices/#Device-Properties","page":"Devices","title":"Device Properties","text":"","category":"section"},{"location":"devices/","page":"Devices","title":"Devices","text":"AMDGPU.HIP.name\nAMDGPU.HIP.wavefrontsize\nAMDGPU.HIP.gcn_arch\nAMDGPU.HIP.device_id\nAMDGPU.HIP.properties","category":"page"},{"location":"devices/#AMDGPU.HIP.name","page":"Devices","title":"AMDGPU.HIP.name","text":"name(dev::HIPDevice)::String\n\nGet name of the device.\n\n\n\n\n\n","category":"function"},{"location":"devices/#AMDGPU.HIP.wavefrontsize","page":"Devices","title":"AMDGPU.HIP.wavefrontsize","text":"wavefrontsize(d::HIPDevice)::Cint\n\nGet size of the wavefront. AMD GPUs support either 32 or 64.\n\n\n\n\n\n","category":"function"},{"location":"devices/#AMDGPU.HIP.gcn_arch","page":"Devices","title":"AMDGPU.HIP.gcn_arch","text":"gcn_arch(d::HIPDevice)::String\n\nGet GCN architecture for the device.\n\n\n\n\n\n","category":"function"},{"location":"devices/#AMDGPU.HIP.device_id","page":"Devices","title":"AMDGPU.HIP.device_id","text":"device_id(d::HIPDevice)\n\nZero-based device ID as expected by HIP functions. Differs from AMDGPU.device_id method by 1.\n\n\n\n\n\n","category":"function"},{"location":"devices/#AMDGPU.HIP.properties","page":"Devices","title":"AMDGPU.HIP.properties","text":"properties(dev::HIPDevice)::hipDeviceProp_t\n\nGet all properties for the device. 
See HIP documentation for hipDeviceProp_t for the meaning of each field.\n\n\n\n\n\n","category":"function"},{"location":"hostcall/#Hostcall","page":"Host-Call","title":"Hostcall","text":"","category":"section"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"Hostcalls provide a means for GPU-CPU communications within running kernels.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"AMDGPU.jl provides its own implementation of hostcalls, relying on HSA signals. Currently, hostcalls are used for device-side allocations, printing and exception reporting.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"Some of the hostcalls (global hostcalls), are launched automatically, if their usage is detected during compilation (e.g. device-side allocations, exception reporting).","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"Hostcalls require careful usage, since they each spawn their own Tasks. There should be no blocking operations during this time.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"For example, using non-blocking synchronization instead of blocking with AMDGPU.synchronize(; blocking=false) (which is also the default).","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"To stop hostcalls after synchronization, provide stop_hostcalls=true keyword argument, otherwise the performance might degrade because of constant pooling of HSA signals in a loop.","category":"page"},{"location":"hostcall/#Example","page":"Host-Call","title":"Example","text":"","category":"section"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"hc = Device.HostCallHolder(Float32, Tuple{Float32}) do x\n return x + 42f0\nend\n\nfunction kernel!(y, hc)\n y[1] = Device.hostcall!(hc, y[1])\n return\nend\n\ny = ROCArray(Float32[0f0])\n@roc kernel!(y, hc)\nAMDGPU.synchronize(; stop_hostcalls=true) # Stop hostcall.\nAMDGPU.Device.free!(hc) # Free hostcall buffers.\n\n@assert Array(y)[1] ≈ 42f0","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"In this example, HostCallHolder is used to create and launch HostCall. HostCallHolder contains the HostCall structure itself that is passed to kernel, a task that is spawned on creation and some additional info for controlling the lifetime of the task.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"First argument is a function we want to execute when we call the hostcall. In this case we add 42f0 to input argument x and return the result.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"Second and third arguments are the return type Float32 and the tuple of types of input arguments Tuple{Float32}.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"hostcall! is used to execute the function on the host, wait on the result, and obtain the return values. At the moment, it is performed once per workgroup.","category":"page"},{"location":"hostcall/#Continuous-Host-Call","page":"Host-Call","title":"Continuous Host-Call","text":"","category":"section"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"By default, hostcalls can be used only once. 
After executing the function on the host, the task finishes and exits.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"However, if you need your hostcall to live indefinitely, pass continuous=true keyword argument to HostCallHolder(...; continuous=true).","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"To then stop the hostcall, call Device.non_continuous!(hc) or Device.finish!(hc) on the HostCallHolder.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"The difference between them is that non_continuous! will allow calling hostcall one more time before exiting, while finish! will exit immediately.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"finish! can be used on any HostCallHolder to force-exit the running hostcall task.","category":"page"},{"location":"hostcall/#Free-hostcall-buffers","page":"Host-Call","title":"Free hostcall buffers","text":"","category":"section"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"For custom hostcalls it is important to call AMDGPU.Device.free! once kernel has finished to free buffers that hostcall used in the process.","category":"page"},{"location":"logging/#Runtime-and-Compiler-Logging","page":"Logging","title":"Runtime and Compiler Logging","text":"","category":"section"},{"location":"logging/","page":"Logging","title":"Logging","text":"AMDGPU.jl has a built-in logging system integrated into various runtime and compiler operations, which is provided by TimespanLogging.jl. Operations such as compilation and linking, signal and buffer allocation/freeing, kernel launch, etc. are instrumented with logging statements, allowing the user to record the start and end of operations.","category":"page"},{"location":"logging/","page":"Logging","title":"Logging","text":"While disabled by default, logging can be enabled by first running AMDGPU.Runtime.enable_logging!() to globally enable logging, after which Julia must be restarted for the changes to take effect.","category":"page"},{"location":"logging/","page":"Logging","title":"Logging","text":"Once logging is globally enabled, AMDGPU.Runtime.start_logging() causes new log events to be saved, while AMDGPU.Runtime.stop_logging() causes new log events to be discarded. Log events can be collected with AMDGPU.Runtime.fetch_logs!(). A more convenient option is AMDGPU.Runtime.log_and_fetch!(f), which can be used to easily log operations within a region of code:","category":"page"},{"location":"logging/","page":"Logging","title":"Logging","text":"logs = AMDGPU.Runtime.log_and_fetch!() do\n A = AMDGPU.ones(3, 4)\n B = copy(A)\n fill!(B, 1f0)\n C = Array(B)\nend\n@show logs[1]","category":"page"},{"location":"api/#AMDGPU-API-Reference","page":"API Reference","title":"AMDGPU API Reference","text":"","category":"section"},{"location":"api/#Indexing","page":"API Reference","title":"Indexing","text":"","category":"section"},{"location":"api/","page":"API Reference","title":"API Reference","text":"AMDGPU.workitemIdx\nAMDGPU.workgroupIdx\nAMDGPU.workgroupDim\nAMDGPU.gridItemDim\nAMDGPU.gridGroupDim","category":"page"},{"location":"api/#AMDGPU.Device.workitemIdx","page":"API Reference","title":"AMDGPU.Device.workitemIdx","text":"workitemIdx()::ROCDim3\n\nReturns the work item index within the work group. 
See also: threadIdx\n\n\n\n\n\n","category":"function"},{"location":"api/#AMDGPU.Device.workgroupIdx","page":"API Reference","title":"AMDGPU.Device.workgroupIdx","text":"workgroupIdx()::ROCDim3\n\nReturns the work group index. See also: blockIdx\n\n\n\n\n\n","category":"function"},{"location":"api/#AMDGPU.Device.workgroupDim","page":"API Reference","title":"AMDGPU.Device.workgroupDim","text":"workgroupDim()::ROCDim3\n\nReturns the size of each workgroup in workitems. See also: blockDim\n\n\n\n\n\n","category":"function"},{"location":"api/#AMDGPU.Device.gridItemDim","page":"API Reference","title":"AMDGPU.Device.gridItemDim","text":"gridItemDim()::ROCDim3\n\nReturns the size of the grid in workitems. This behaviour is different from CUDA where gridDim gives the size of the grid in blocks.\n\n\n\n\n\n","category":"function"},{"location":"api/#AMDGPU.Device.gridGroupDim","page":"API Reference","title":"AMDGPU.Device.gridGroupDim","text":"gridGroupDim()::ROCDim3\n\nReturns the size of the grid in workgroups. This is equivalent to CUDA's gridDim.\n\n\n\n\n\n","category":"function"},{"location":"api/","page":"API Reference","title":"API Reference","text":"Use these functions for compatibility with CUDA.jl.","category":"page"},{"location":"api/","page":"API Reference","title":"API Reference","text":"AMDGPU.Device.threadIdx\nAMDGPU.Device.blockIdx\nAMDGPU.Device.blockDim","category":"page"},{"location":"api/#AMDGPU.Device.threadIdx","page":"API Reference","title":"AMDGPU.Device.threadIdx","text":"threadIdx()::ROCDim3\n\nReturns the thread index within the block. See also: workitemIdx\n\n\n\n\n\n","category":"function"},{"location":"api/#AMDGPU.Device.blockIdx","page":"API Reference","title":"AMDGPU.Device.blockIdx","text":"blockIdx()::ROCDim3\n\nReturns the block index within the grid. See also: workgroupIdx\n\n\n\n\n\n","category":"function"},{"location":"api/#AMDGPU.Device.blockDim","page":"API Reference","title":"AMDGPU.Device.blockDim","text":"blockDim()::ROCDim3\n\nReturns the dimensions of the block. 
See also: workgroupDim\n\n\n\n\n\n","category":"function"},{"location":"api/#Synchronization","page":"API Reference","title":"Synchronization","text":"","category":"section"},{"location":"api/","page":"API Reference","title":"API Reference","text":"AMDGPU.sync_workgroup\nAMDGPU.sync_workgroup_count\nAMDGPU.sync_workgroup_and\nAMDGPU.sync_workgroup_or","category":"page"},{"location":"api/#AMDGPU.Device.sync_workgroup","page":"API Reference","title":"AMDGPU.Device.sync_workgroup","text":"sync_workgroup()\n\nWaits until all wavefronts in a workgroup have reached this call.\n\n\n\n\n\n","category":"function"},{"location":"api/#AMDGPU.Device.sync_workgroup_count","page":"API Reference","title":"AMDGPU.Device.sync_workgroup_count","text":"sync_workgroup_count(predicate::Cint)::Cint\n\nIdentical to sync_workgroup, with the additional feature that it evaluates the predicate for all workitems in the workgroup and returns the number of workitems for which predicate evaluates to non-zero.\n\n\n\n\n\n","category":"function"},{"location":"api/#AMDGPU.Device.sync_workgroup_and","page":"API Reference","title":"AMDGPU.Device.sync_workgroup_and","text":"sync_workgroup_and(predicate::Cint)::Cint\n\nIdentical to sync_workgroup, with the additional feature that it evaluates the predicate for all workitems in the workgroup and returns non-zero if and only if predicate evaluates to non-zero for all of them.\n\n\n\n\n\n","category":"function"},{"location":"api/#AMDGPU.Device.sync_workgroup_or","page":"API Reference","title":"AMDGPU.Device.sync_workgroup_or","text":"sync_workgroup_or(predicate::Cint)::Cint\n\nIdentical to sync_workgroup, with the additional feature that it evaluates the predicate for all workitems in the workgroup and returns non-zero if and only if predicate evaluates to non-zero for any of them.\n\n\n\n\n\n","category":"function"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"DocTestSetup = quote\n using AMDGPU\nend","category":"page"},{"location":"quickstart/#Quick-Start","page":"Quick Start","title":"Quick Start","text":"","category":"section"},{"location":"quickstart/#Simple-example","page":"Quick Start","title":"Simple example","text":"","category":"section"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"As a simple example, let's add two vectors both on CPU and GPU and make sure that the results are the same:","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"First, we do this on CPU:","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"julia> n = 1024;\n\njulia> a = fill(1.0, n);\n\njulia> b = fill(2.0, n);\n\njulia> c = a .+ b;","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"To do the same computation on the GPU, we first need to copy the host arrays to the device and then simply add them together element-wise:","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"julia> a_d = ROCArray(a);\n\njulia> b_d = ROCArray(b);\n\njulia> c_d = a_d .+ b_d;","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"Now, let's check that the results are the same on CPU and GPU by transferring GPU array back to host and comparing the results:","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"julia> Array(c_d) ≈ 
c\ntrue","category":"page"},{"location":"quickstart/#Kernel-example","page":"Quick Start","title":"Kernel example","text":"","category":"section"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"Alternatively, we can perform the same computation by writing our custom GPU kernel:","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"julia> function vadd!(c, a, b)\n i = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x\n c[i] = a[i] + b[i]\n return\n end\nvadd! (generic function with 1 method)","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"The index i of a single workitem can be uniquely identified by its grid index. In this case only one dimension is used, so we take only .x coordinate into account.","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"A kernel is compiled upon its first launch. Subsequent launches re-use it, without recompilation. Let's launch a kernel, but first clear-out the memory of the resulting vector cd.","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"julia> fill!(c_d, 0.0);\n\njulia> groupsize = 256;\n\njulia> gridsize = cld(n, groupsize);\n\njulia> @roc groupsize=groupsize gridsize=gridsize vadd!(c_d, a_d, b_d);\n\njulia> Array(c_d) ≈ c\ntrue","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"The easiest way to launch a GPU kernel is with the @roc macro, specifying groupsize and gridsize to cover full array, and calling it like a regular function.","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"Keep in mind that kernel launches are asynchronous, meaning that you need to synchronize before you can use the result (e.g. with AMDGPU.synchronize). However, GPU <-> CPU transfers synchronize implicitly.","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"The grid is the domain over which the entire kernel executes over. The grid will be split into multiple workgroups by hardware automatically, and the kernel does not complete until all workgroups complete.","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"Like OpenCL, AMDGPU has the concept of \"workitems\", \"workgroups\", and the \"grid\". A workitem is a single thread of execution, capable of performing arithmentic operations. Workitems are grouped into \"wavefronts\" (\"warps\" in CUDA) which share the same compute unit, and execute the same instructions simulatenously. The workgroup is a logical unit of compute supported by hardware which comprises multiple wavefronts, which shares resources (specifically local memory) and can be efficiently synchronized. A workgroup may be executed by one or multiple hardware compute units, making it often the only dimension of importance for smaller kernel launches.","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"Notice how we explicitly specify that this function does not return a value by adding the return statement. This is necessary for all GPU kernels and we can enforce it by adding a return, return nothing, or even nothing at the end of the kernel. 
If this statement is omitted, Julia will attempt to return the value of the last evaluated expression, in this case a Float64, which will cause a compilation failure as kernels cannot return values.","category":"page"},{"location":"quickstart/#Naming-conventions","page":"Quick Start","title":"Naming conventions","text":"","category":"section"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"Throughout this example we use terms like \"work group\" and \"work item\". These terms are used by the Khronos consortium and their APIs including OpenCL and Vulkan, as well as the HSA foundation.","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"NVIDIA, on the other hand, uses some different terms in their CUDA API, which might be confusing to some users porting their kernels from CUDA to AMDGPU.","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"As a quick summary, here is a mapping of the most common terms:","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"AMDGPU CUDA\nworkitemIdx threadIdx\nworkgroupIdx blockIdx\nworkgroupDim blockDim\ngridItemDim No equivalent\ngridGroupDim gridDim\ngroupsize threads\ngridsize blocks\nstream stream","category":"page"},{"location":"memory/#Memory-Allocation-and-Intrinsics","page":"Memory","title":"Memory Allocation and Intrinsics","text":"","category":"section"},{"location":"memory/#Memory-Varieties","page":"Memory","title":"Memory Varieties","text":"","category":"section"},{"location":"memory/","page":"Memory","title":"Memory","text":"GPUs contain various kinds of memory, just like CPUs:","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"Global: Globally accessible by all CUs on a GPU, and possibly accessible from outside of the GPU (by the CPU host, by other GPUs, by PCIe devices, etc.). Slowest form of memory.\nConstant: Same as global memory, but signals to the hardware that it can use special instructions to access and cache this memory. Can be changed between kernel invocations.\nRegion: Also known as Global Data Store (GDS), all wavefronts on a CU can access the same memory region from the same address. Faster than Global/Constant. Automatically allocated by the compiler/runtime, not user accessible.\nLocal: Also known as Local Data Store (LDS), all wavefronts in the same workgroup can access the same memory region from the same address. Faster than GDS.\nPrivate: Uses the hardware scratch space, and is private to each SIMD lane in a wavefront. Fastest form of traditional memory.","category":"page"},{"location":"memory/#Local-Memory","page":"Memory","title":"Local Memory","text":"","category":"section"},{"location":"memory/","page":"Memory","title":"Memory","text":"Local memory may be allocated within a kernel by calling either:","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"@ROCStaticLocalArray(T, dims) - if dims is passed as a constant value, known at compile-time. E.g. @ROCStaticLocalArray(Float32, 8).\n@ROCDynamicLocalArray(T, dims) - otherwise. E.g. @ROCStaticLocalArray(Float32, length(X)).","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"Local memory is zero-initialized by default. 
If this is unnecessary and undesired for performance reasons, disable this, passing false as a last argument: @ROCStaticLocalArray(Float32, 8, false) or @ROCStaticLocalArray(Float32, length(X), false)","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"Local memory does not need to be freed, as it is automatically freed by the hardware.","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"If @ROCDynamicLocalArray is used, then local memory is dynamically allocated at kernel execution time. The shmem option to @roc must be set appropriately to ensure that enough local memory is allocated by the hardware.","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"It is allocated in addition to the local memory that is statically allocated by the kernel.","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"function kernel(C, A)\n # Allocate local memory dynamically\n Ctmp = @ROCDynamicLocalArray(Float64, length(C))\n # Or, allocate local memory statically if the size is known ahead-of-time\n Ctmp = @ROCStaticLocalArray(Float64, 8) # if we want 8 elements\n\n idx = AMDGPU.workitemIdx().x\n Ctmp[idx] = A[idx] + C[1]\n AMDGPU.Device.sync_workgroup()\n\n C[idx] = Ctmp[idx]\n return\nend\n\n...\n# Note: The `shmem` option isn't necessary if `@ROCStaticLocalArray` is used\nshmem = sizeof(Float64) * length(RC)\n@roc groupsize=8 shmem=shmem kernel(RC, RA)","category":"page"},{"location":"memory/#Device-Side-Allocations","page":"Memory","title":"Device-Side Allocations","text":"","category":"section"},{"location":"memory/","page":"Memory","title":"Memory","text":"Global memory may be allocated/freed dynamically from kernels by calling AMDGPU.Device.malloc(::Csize_t)::Ptr{Cvoid} and AMDGPU.Device.free(::Ptr{Cvoid}).","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"This memory allocation/deallocation uses hostcalls to operate, and so is relatively slow, but is also very useful. See Hostcall section for more info about them.","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"Memory allocated with AMDGPU.Device.malloc is a host-pinned memory. Calls to malloc and free are performed once per workgroup, so ensure that enough memory has been allocated to feed the lanes that will be accessing it.","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"As an example, here's how an array could be allocated on-device to store temporary results:","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"function kernel(C, A)\n # Allocate memory dynamically and get a pointer to it.\n Ctmp_ptr = AMDGPU.Device.malloc(Csize_t(sizeof(Float64) * length(C)))\n # Turn a pointer into a device-side array.\n Ctmp = ROCDeviceArray(length(C), reinterpret(Core.LLVMPtr{Float64,1}, Ctmp_ptr))\n\n # Use it\n idx = AMDGPU.workitemIdx().x\n Ctmp[idx] = A[idx] + C[1]\n AMDGPU.Device.sync_workgroup()\n\n C[idx] = Ctmp[idx]\n # Make sure to free it.\n AMDGPU.Device.free(Ctmp_ptr)\n return\nend\n\nRA = AMDGPU.rand(4)\nRC = AMDGPU.rand(4)\nRC_elem = Array(RC)[1]\n@roc groupsize=4 kernel(RC, RA)\n@assert Array(RC) ≈ Array(RA) .+ RC_elem","category":"page"},{"location":"memory/#Memory-Modification-Intrinsics","page":"Memory","title":"Memory Modification Intrinsics","text":"","category":"section"},{"location":"memory/","page":"Memory","title":"Memory","text":"Like C, AMDGPU.jl provides the memset! and memcpy! 
intrinsics, which are useful for setting a memory region to a value, or copying one region to another, respectively. Check test/device/memory.jl for examples of their usage.","category":"page"},{"location":"memory/#Wrapping-in-ROCArray","page":"Memory","title":"Wrapping in ROCArray","text":"","category":"section"},{"location":"memory/","page":"Memory","title":"Memory","text":"You can wrap host array to be accessible (pinned) on the device with:","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"x = rand(Float32, 4, 4)\nxd = unsafe_wrap(ROCArray, pointer(x), size(x))\n\n# Pointer to `xd` is a device-mapped pointer, not host pointer.\n@show pointer(xd) == xd.buf.dev_ptr\n@show pointer(xd) == xd.buf.ptr\n\n# Can be used in kernels, host array `x` is also updated.\nxd .+= 1f0\n\n# Can be used with HIP libraries.\nxd * xd","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"Pinned memory is automatically unregistered upon array destruction. You can't free it, since it is managed by the host.","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"Additionally, you can wrap the device array with:","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"x = AMDGPU.rand(Float32, 4, 4)\nxd = unsafe_wrap(ROCArray, pointer(x), size(x); lock=false)\n\n# Can be used in kernels, `x` is also updated.\nxd .+= 1f0\n\n# Can be used with HIP libraries.\nxd * xd\n\n# Freeing is a no-op for `xd`, since `xd` does not own the underlying memory.\nAMDGPU.unsafe_free!(xd) # No-op.","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"Notice mandatory ; lock=false keyword, this is to be able to differentiate between host & device pointers.","category":"page"},{"location":"printing/#Printing","page":"Printing","title":"Printing","text":"","category":"section"},{"location":"printing/","page":"Printing","title":"Printing","text":"Writing GPU kernels can be a difficult endeavor, owing to the fact that the LLVM GPU backends turn serial code into parallel code automatically. Recognizing this, every good GPU programming interface allows the user's GPU kernels to print output to a buffer, which will be passed to the host for display. With the ability to interpolate variables, this functionality serves as the \"printf of GPUs\". Quite literally, the primary tool for this is @rocprintf. Here's a simple example of printing the current workgroup index:","category":"page"},{"location":"printing/","page":"Printing","title":"Printing","text":"kernel(x) = @rocprintf \"Workgroup index: %d\\n\" workgroupIdx().x","category":"page"},{"location":"printing/","page":"Printing","title":"Printing","text":"The above kernel would print out the string \"Workgroup index: 1\\n\" when run with a single workgroup (where \"\\n\" means a newline).","category":"page"},{"location":"printing/","page":"Printing","title":"Printing","text":"Any number of variables may be passed to @rocprintf, as long as those variables have a printf-compatible implementation in Printf.@printf. 
Calls to @rocprintf are blocking, and will not return control to the kernel until the string has been formatted and sent to the OS runtime for printing (the same as for calls to Printf.@printf).","category":"page"},{"location":"printing/","page":"Printing","title":"Printing","text":"While @rocprintf is printed once per workgroup by default, it's possible to print once per lane, once per wavefront, or once per grid by specifying an execution mode as the first argument:","category":"page"},{"location":"printing/","page":"Printing","title":"Printing","text":"# Once per lane\nkernel(x) = @rocprintf :lane \"My index is: %d\\n\" workitemIdx().x\n\n# Once per wavefront\nkernel(x) = @rocprintf :wave \"My index is: %d\\n\" workitemIdx().x\n\n# Once per workgroup\nkernel(x) = @rocprintf :group \"My index is: %d\\n\" workitemIdx().x\n# OR (:group is the default)\nkernel(x) = @rocprintf \"My index is: %d\\n\" workitemIdx().x\n\n# Once total\nkernel(x) = @rocprintf :grid \"My index is: %d\\n\" workitemIdx().x","category":"page"},{"location":"printing/","page":"Printing","title":"Printing","text":"Executing those kernels with 256 workitems split evenly between 2 workgroups would print out:","category":"page"},{"location":"printing/","page":"Printing","title":"Printing","text":"# :lane\nMy index is 1\nMy index is 2\n...\nMy index is 127\nMy index is 128\nMy index is 1\nMy index is 2\n...\nMy index is 127\nMy index is 128\n\n# :wave\nMy index is 1\nMy index is 65\nMy index is 1\nMy index is 65\n\n# :group\nMy index is 1\nMy index is 1\n\n# :grid\nMy index is 1","category":"page"},{"location":"printing/#Differences-to-@cuprintf","page":"Printing","title":"Differences to @cuprintf","text":"","category":"section"},{"location":"printing/","page":"Printing","title":"Printing","text":"Similar to CUDA's @cuprintf, @rocprintf is a printf-compatible macro which takes a format string and arguments, and commands the host CPU to display it as formatted text. However, in contrast to @cuprintf, we use AMDGPU's hostcall and Julia's Printf stdlib to implement this. This means that anything that Printf can print, so can @rocprintf (assuming such an object can be represented on the GPU). The macro is also handled as a regular hostcall, which means that argument types are checked at compile time (although currently, any errors while printing will be detected on the host, and will terminate the kernel).","category":"page"},{"location":"streams/#Streams","page":"Streams","title":"Streams","text":"","category":"section"},{"location":"streams/","page":"Streams","title":"Streams","text":"Similar to CUDA streams, ROCm has HIP streams, which are buffers used to instruct the GPU hardware which kernels to launch. HIP streams are synchronous, like CUDA streams.","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"Each device has a default stream associated, which is accessible with AMDGPU.stream().","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"There are several ways to specify which stream to launch a kernel on:","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"Using AMDGPU.stream! 
to change default stream to be used within the same Julia task.","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"stream = AMDGPU.HIPStream()\nAMDGPU.stream!(stream) # Change default stream to be used for subsequent operations.\nAMDGPU.ones(Float32, 16) # Will be executed on `stream`.","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"Using AMDGPU.stream! to execute given function and reset to the original stream after completion:","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"stream = AMDGPU.HIPStream()\nx = AMDGPU.stream!(() -> AMDGPU.ones(Float32, 16), stream)","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"Using stream argument to @roc macro:","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"stream = AMDGPU.HIPStream()\n@roc stream=stream kernel(...)","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"Streams also have an inherent priority, which allows control of kernel submission latency and on-device scheduling preference with respect to kernels submitted on other streams. There are three priorities: normal (the default), low, and high priority.","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"Priority of the default stream can be set with AMDGPU.priority!. Alternatively, it can be set at stream creation time:","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"low_prio = HIPStream(:low)\nhigh_prio = HIPStream(:high)\nnormal_prio = HIPStream(:normal) # or just omit \"priority\"","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"AMDGPU.stream\nAMDGPU.stream!\nAMDGPU.priority!\nAMDGPU.HIPStream","category":"page"},{"location":"streams/#AMDGPU.stream","page":"Streams","title":"AMDGPU.stream","text":"stream()::HIPStream\n\nGet the HIP stream that should be used as the default one for the currently executing task.\n\n\n\n\n\n","category":"function"},{"location":"streams/#AMDGPU.stream!","page":"Streams","title":"AMDGPU.stream!","text":"stream!(s::HIPStream)\n\nChange the default stream to be used within the same Julia task.\n\n\n\n\n\nstream!(f::Base.Callable, stream::HIPStream)\n\nChange the default stream to be used within the same Julia task, execute f and revert to the original stream.\n\nReturns:\n\nReturn value of the function f.\n\n\n\n\n\n","category":"function"},{"location":"streams/#AMDGPU.priority!","page":"Streams","title":"AMDGPU.priority!","text":"priority!(p::Symbol)\n\nChange the priority of the default stream. Accepted values are :normal (the default), :low and :high.\n\n\n\n\n\npriority!(f::Base.Callable, priority::Symbol)\n\nChnage the priority of default stream, execute f and revert to the original priority. Accepted values are :normal (the default), :low and :high.\n\nReturns:\n\nReturn value of the function f.\n\n\n\n\n\n","category":"function"},{"location":"streams/#AMDGPU.HIP.HIPStream","page":"Streams","title":"AMDGPU.HIP.HIPStream","text":"HIPStream(priority::Symbol = :normal)\n\nArguments:\n\npriority::Symbol: Priority of the stream: :normal, :high or :low.\n\nCreate HIPStream with given priority. Device is the default device that's currently in use.\n\n\n\n\n\nHIPStream(stream::hipStream_t)\n\nCreate HIPStream from hipStream_t handle. 
Device is the default device that's currently in use.\n\n\n\n\n\n","category":"type"},{"location":"streams/#Synchronization","page":"Streams","title":"Synchronization","text":"","category":"section"},{"location":"streams/","page":"Streams","title":"Streams","text":"AMDGPU.jl by default uses non-blocking stream synchronization with AMDGPU.synchronize to work correctly with TLS and Hostcall.","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"Users, however, can switch to a blocking synchronization globally with nonblocking_synchronization preference or with fine-grained AMDGPU.synchronize(; blocking=true). Blocking synchronization might offer slightly lower latency.","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"You can also perform synchronization of the expression with AMDGPU.@sync macro, which will execute given expression and synchronize afterwards (using AMDGPU.synchronize under the hood).","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"AMDGPU.@sync begin\n @roc ...\nend","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"Finally, you can perform full device synchronization with AMDGPU.device_synchronize.","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"AMDGPU.synchronize\nAMDGPU.@sync\nAMDGPU.device_synchronize","category":"page"},{"location":"streams/#AMDGPU.synchronize","page":"Streams","title":"AMDGPU.synchronize","text":"synchronize(stream::HIPStream = stream(); blocking::Bool = false)\n\nWait until all kernels executing on stream have completed.\n\nIf there are running HostCalls, then blocking must be false. Additionally, if you want to stop host calls afterwards, then provide stop_hostcalls=true keyword argument.\n\n\n\n\n\n","category":"function"},{"location":"streams/#AMDGPU.@sync","page":"Streams","title":"AMDGPU.@sync","text":"@sync ex\n\nRun expression ex on currently active stream and synchronize the GPU on that stream afterwards.\n\nSee also: synchronize.\n\n\n\n\n\n","category":"macro"},{"location":"streams/#AMDGPU.HIP.device_synchronize","page":"Streams","title":"AMDGPU.HIP.device_synchronize","text":"Blocks until all kernels on all streams have completed. 
Uses currently active device.\n\n\n\n\n\n","category":"function"},{"location":"profiling/#rocprof","page":"Profiling","title":"rocprof","text":"","category":"section"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"rocprofv2 allows profiling both HSA & HIP API calls (rocprof being deprecated).","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"Let's profile simple copying kernel saved in profile.jl file:","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"using AMDGPU\n\nfunction mycopy!(dst, src)\n i = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x\n if i ≤ length(dst)\n @inbounds dst[i] = src[i]\n end\n return\nend\n\nfunction main(N)\n src = ROCArray{Float64}(undef, N)\n dst = ROCArray{Float64}(undef, N)\n groupsize = 256 # nthreads\n gridsize = cld(N, groupsize) # nblocks\n\n for i in 1:10\n @roc groupsize=groupsize gridsize=gridsize mycopy!(dst, src)\n AMDGPU.synchronize()\n end\n\n AMDGPU.unsafe_free!(dst)\n AMDGPU.unsafe_free!(src)\n AMDGPU.synchronize()\n return\nend\nmain(2^24)","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"ENABLE_JITPROFILING=1 rocprofv2 --plugin perfetto --hip-trace --hsa-trace --kernel-trace -o prof julia ./profile.jl","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"This will produce prof_output.pftrace file which can be visualized using Perfetto UI.","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"Zoomed out Zoomed in\n(Image: image) (Image: image)","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"Here we can clearly see that host synchronization after each kernel dispatch causes poor device occupancy (empty spaces between kernel dispatches).","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"We can fix this by moving synchronization outside the loop so that it happens only once.","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":" ...\n for i in 1:10\n @roc groupsize=groupsize gridsize=gridsize mycopy!(dst, src)\n end\n AMDGPU.synchronize()\n ...","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"Running profiling again and visualizing results we now see that kernel launches are adjacent to each other and that the average wall duration is lower.","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"Zoomed out Zoomed in\n(Image: image) (Image: image)","category":"page"},{"location":"profiling/#Debugging","page":"Profiling","title":"Debugging","text":"","category":"section"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"Use HIP_LAUNCH_BLOCKING=1 to synchronize immediately after launching GPU kernels. This will allow to pinpoint exact kernel that caused the exception.","category":"page"},{"location":"#Programming-AMD-GPUs-with-Julia","page":"Home","title":"Programming AMD GPUs with Julia","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"Julia support for programming AMD GPUs is currently provided by the AMDGPU.jl package. 
This package contains everything necessary to program for AMD GPUs in Julia, including:","category":"page"},{"location":"","page":"Home","title":"Home","text":"An interface for compiling and running kernels written in Julia through LLVM's AMDGPU backend.\nAn interface for working with the HIP runtime API, necessary for launching compiled kernels and controlling the GPU.\nAn array type implementing the GPUArrays.jl interface, providing high-level array operations.","category":"page"},{"location":"#Installation","page":"Home","title":"Installation","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"Simply add the AMDGPU.jl package to your Julia environment:","category":"page"},{"location":"","page":"Home","title":"Home","text":"using Pkg\nPkg.add(\"AMDGPU\")","category":"page"},{"location":"","page":"Home","title":"Home","text":"To ensure that everything works, you can run the test suite:","category":"page"},{"location":"","page":"Home","title":"Home","text":"using AMDGPU\nusing Pkg\nPkg.test(\"AMDGPU\")","category":"page"},{"location":"#Requirements","page":"Home","title":"Requirements","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"Julia 1.9 or higher (Navi 3 requires Julia 1.10+).\n64-bit Linux or Windows.\nMinimal supported ROCm version is 5.3.\nRequired software:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Linux Windows\nROCm ROCm\n- AMD Software: Adrenalin Edition","category":"page"},{"location":"","page":"Home","title":"Home","text":"On Windows AMD Software: Adrenalin Edition contains HIP library itself, while ROCm provides support for other functionality.","category":"page"},{"location":"#Windows-OS-missing-functionality","page":"Home","title":"Windows OS missing functionality","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"Windows does not yet support Hostcall, which means that some of the functionality does not work, like:","category":"page"},{"location":"","page":"Home","title":"Home","text":"device printing;\ndynamic memory allocation (from kernels).","category":"page"},{"location":"","page":"Home","title":"Home","text":"These hostcalls are sometimes launched when AMDGPU detects that a kernel might throw an exception, specifically during conversions, like: Int32(1f0).","category":"page"},{"location":"","page":"Home","title":"Home","text":"To avoid this, use 'unsafe' conversion option: unsafe_trunc(Int32, 1f0).","category":"page"},{"location":"#ROCm-system-libraries","page":"Home","title":"ROCm system libraries","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"AMDGPU.jl looks into standard directories and uses Libdl.find_library to find ROCm libraries.","category":"page"},{"location":"","page":"Home","title":"Home","text":"Standard path:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Linux: /opt/rocm\nWindows: C:/Program Files/AMD/ROCm/","category":"page"},{"location":"","page":"Home","title":"Home","text":"If you have non-standard path for ROCm, set ROCM_PATH= environment variable before launching Julia. For example, if ROCm is installed in your Linux system root (e.g. on Fedora), set ROCM_PATH=/usr/lib64/rocm/gfx11 or ROCM_PATH=/usr/lib64/rocm/gfx1103, depending on your GPU's architecture. You can query the architecture using the amdgpu-arch command. 
The AMDGPU.versioninfo() function prints the paths of any libraries found.","category":"page"},{"location":"","page":"Home","title":"Home","text":"Depending on your GPU model and the functionality you want to use, you may have to force the GPU architecture by setting the HSA_OVERRIDE_GFX_VERSION variable to a compatible version.","category":"page"},{"location":"#Extra-Setup-Details","page":"Home","title":"Extra Setup Details","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"List of additional steps that may be needed to take to ensure everything is working:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Make sure your user is in the same group as /dev/kfd, other than root.\nFor example, it might be the render group:\ncrw-rw---- 1 root render 234, 0 Aug 5 11:43 kfd\nIn this case, you can add yourself to it:\nsudo usermod -aG render username\nROCm libraries should be in the standard library locations, or in your LD_LIBRARY_PATH.\nIf you get an error message along the lines of GLIB_CXX_... not found, it's possible that the C++ runtime used to build the ROCm stack and the one used by Julia are different. If you built the ROCm stack yourself this is very likely the case since Julia normally ships with its own C++ runtime.\nFor more information, check out this GitHub issue. A quick fix is to use the LD_PRELOAD environment variable to make Julia use the system C++ runtime library, for example:\nLD_PRELOAD=/usr/lib/libstdc++.so julia\nAlternatively, you can build Julia from source as described here. To quickly debug this issue start Julia and try to load a ROCm library:\nusing Libdl Libdl.dlopen(\"/opt/rocm/hsa/lib/libhsa-runtime64.so.1\")","category":"page"},{"location":"","page":"Home","title":"Home","text":"Once all of this is setup properly, you should be able to do using AMDGPU successfully.","category":"page"},{"location":"","page":"Home","title":"Home","text":"See the Quick Start documentation for an introduction to using AMDGPU.jl.","category":"page"},{"location":"#Preferences","page":"Home","title":"Preferences","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"AMDGPU.jl supports setting preferences. 
Template of LocalPreferences.toml with all options:","category":"page"},{"location":"","page":"Home","title":"Home","text":"[AMDGPU]\n# If `true` (default), eagerly run GC to keep the pool from growing too big.\n# GC is triggered during new allocatoins or synchronization points.\neager_gc = false\n# Use non-blocking synchronization for all `AMDGPU.synchronize()` calls.\nnonblocking_synchronization = true\n# Memory limit specifies maximum amount of memory in percentages\n# a current Julia process can use.\n# Default is \"none\", which does not apply any limitation.\nhard_memory_limit = \"none\"\n# Notice a space between the value and percentage sign.\n# hard_memory_limit = \"80 %\"","category":"page"},{"location":"kernel_programming/#Kernel-Programming","page":"Kernel Programming","title":"Kernel Programming","text":"","category":"section"},{"location":"kernel_programming/#Launch-Configuration","page":"Kernel Programming","title":"Launch Configuration","text":"","category":"section"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"While an almost arbitrarily large number of workitems can be executed per kernel launch, the hardware can only support executing a limited number of wavefronts at one time.","category":"page"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"To alleviate this, the compiler calculates the \"occupancy\" of each compiled kernel (which is the number of wavefronts that can be simultaneously executing on the GPU), and passes this information to the hardware; the hardware then launches a limited number of wavefronts at once, based on the kernel's \"occupancy\" values.","category":"page"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"The rest of the wavefronts are not launched until hardware resources become available, which means that a kernel with better occupancy will see more of its wavefronts executing simultaneously (which often leads to better performance). Suffice to say, it's important to know the occupancy of kernels if you want the best performance.","category":"page"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"Like CUDA.jl, AMDGPU.jl has the ability to calculate kernel occupancy, with the launch_configuration function:","category":"page"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"kernel = @roc launch=false mykernel(args...)\noccupancy = AMDGPU.launch_configuration(kernel)\n@show occupancy.gridsize\n@show occupancy.groupsize","category":"page"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"Specifically, launch_configuration calculates the occupancy of mykernel(args...), and then calculates an optimal groupsize based on the occupancy. This value can then be used to select the groupsize for the kernel:","category":"page"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"@roc groupsize=occupancy.groupsize mykernel(args...)","category":"page"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"AMDGPU.@roc\nAMDGPU.Runtime.HIPKernel\nAMDGPU.Compiler.hipfunction","category":"page"},{"location":"kernel_programming/#AMDGPU.@roc","page":"Kernel Programming","title":"AMDGPU.@roc","text":"@roc [kwargs...] 
func(args...)\n\nHigh-level interface for launching kernels on GPU. Upon a first call it will be compiled, subsequent calls will re-use the compiled object.\n\nSeveral keyword arguments are supported:\n\nlaunch::Bool = true: whether to launch the kernel. If false, then returns a compiled kernel which can be launched by calling it and passing arguments.\nArguments that influence kernel compilation, see AMDGPU.Compiler.hipfunction.\nArguments that influence kernel launch, see AMDGPU.Runtime.HIPKernel.\n\n\n\n\n\n","category":"macro"},{"location":"kernel_programming/#AMDGPU.Runtime.HIPKernel","page":"Kernel Programming","title":"AMDGPU.Runtime.HIPKernel","text":"(ker::HIPKernel)(args::Vararg{Any, N}; kwargs...)\n\nLaunch compiled HIPKernel by passing arguments to it.\n\nThe following kwargs are supported:\n\ngridsize::ROCDim = 1: Size of the grid.\ngroupsize::ROCDim = 1: Size of the workgroup.\nshmem::Integer = 0: Amount of dynamically-allocated shared memory in bytes.\nstream::HIP.HIPStream = AMDGPU.stream(): Stream on which to launch the kernel.\n\n\n\n\n\n","category":"type"},{"location":"kernel_programming/#AMDGPU.Compiler.hipfunction","page":"Kernel Programming","title":"AMDGPU.Compiler.hipfunction","text":"hipfunction(f::F, tt::TT = Tuple{}; kwargs...)\n\nCompile Julia function f to a HIP kernel given a tuple of argument's types tt that it accepts.\n\nThe following kwargs are supported:\n\nname::Union{String, Nothing} = nothing: A unique name to give a compiled kernel.\nunsafe_fp_atomics::Bool = true: Whether to use 'unsafe' floating-point atomics. AMD GPU devices support fast atomic read-modify-write (RMW) operations on floating-point values. On single- or double-precision floating-point values this may generate a hardware RMW instruction that is faster than emulating the atomic operation using an atomic compare-and-swap (CAS) loop.\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#Atomics","page":"Kernel Programming","title":"Atomics","text":"","category":"section"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"AMDGPU.jl relies on Atomix.jl for atomics.","category":"page"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"Example of a kernel that computes atomic max:","category":"page"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"using AMDGPU\n\nfunction ker_atomic_max!(target, source, indices)\n i = workitemIdx().x + (workgroupIdx().x - 0x1) * workgroupDim().x\n idx = indices[i]\n v = source[i]\n AMDGPU.@atomic max(target[idx], v)\n return\nend\n\nn, bins = 1024, 32\nsource = ROCArray(rand(UInt32, n))\nindices = ROCArray(rand(1:bins, n))\ntarget = ROCArray(zeros(UInt32, bins))\n@roc groupsize=256 gridsize=4 ker_atomic_max!(target, source, indices)","category":"page"},{"location":"kernel_programming/#Device-Intrinsics","page":"Kernel Programming","title":"Device Intrinsics","text":"","category":"section"},{"location":"kernel_programming/#Wavefront-Level-Primitives","page":"Kernel Programming","title":"Wavefront-Level Primitives","text":"","category":"section"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel 
Programming","text":"AMDGPU.Device.wavefrontsize\nAMDGPU.Device.activelane\n\nAMDGPU.Device.ballot\nAMDGPU.Device.ballot_sync\nAMDGPU.Device.activemask\n\nAMDGPU.Device.bpermute\nAMDGPU.Device.permute\n\nAMDGPU.Device.shfl\nAMDGPU.Device.shfl_sync\nAMDGPU.Device.shfl_up\nAMDGPU.Device.shfl_up_sync\nAMDGPU.Device.shfl_down\nAMDGPU.Device.shfl_down_sync\nAMDGPU.Device.shfl_xor\nAMDGPU.Device.shfl_xor_sync\n\nAMDGPU.Device.any_sync\nAMDGPU.Device.all_sync","category":"page"},{"location":"kernel_programming/#AMDGPU.Device.wavefrontsize","page":"Kernel Programming","title":"AMDGPU.Device.wavefrontsize","text":"wavefrontsize()::Cuint\n\nGet the wavefront size of the device that executes current kernel.\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.activelane","page":"Kernel Programming","title":"AMDGPU.Device.activelane","text":"activelane()::Cuint\n\nGet id of the current lane within a wavefront/warp.\n\njulia> function ker!(x)\n i = AMDGPU.Device.activelane()\n x[i + 1] = i\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{Cint}(undef, 1, 8);\n\njulia> @roc groupsize=8 ker!(x);\n\njulia> Array(x)\n1×8 Matrix{Int32}:\n 0 1 2 3 4 5 6 7\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.ballot","page":"Kernel Programming","title":"AMDGPU.Device.ballot","text":"ballot(predicate::Bool)::UInt64\n\nReturn a value whose Nth bit is set if and only if predicate evaluates to true for the Nth lane and the lane is active.\n\njulia> function ker!(x)\n x[1] = AMDGPU.Device.ballot(true)\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{Culong}(undef, 1);\n\njulia> @roc groupsize=32 ker!(x);\n\njulia> x\n1-element ROCArray{UInt64, 1, AMDGPU.Runtime.Mem.HIPBuffer}:\n 0x00000000ffffffff\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.ballot_sync","page":"Kernel Programming","title":"AMDGPU.Device.ballot_sync","text":"ballot_sync(mask::UInt64, predicate::Bool)::UInt64\n\nEvaluate predicate for all non-exited threads in mask and return an integer whose Nth bit is set if and only if predicate is true for the Nth thread of the wavefront and the Nth thread is active.\n\njulia> function ker!(x)\n i = AMDGPU.Device.activelane()\n if i % 2 == 0\n mask = 0x0000000055555555 # Only even threads.\n x[1] = AMDGPU.Device.ballot_sync(mask, true)\n end\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{UInt64}(undef, 1);\n\njulia> @roc groupsize=32 ker!(x);\n\njulia> bitstring(Array(x)[1])\n\"0000000000000000000000000000000001010101010101010101010101010101\"\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.activemask","page":"Kernel Programming","title":"AMDGPU.Device.activemask","text":"activemask()::UInt64\n\nGet the mask of all active lanes in a warp.\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.bpermute","page":"Kernel Programming","title":"AMDGPU.Device.bpermute","text":"bpermute(addr::Integer, val::Cint)::Cint\n\nRead data stored in val from the lane VGPR (vector general purpose register) given by addr.\n\nThe permute instruction moves data between lanes but still uses the notion of byte addressing, as do other LDS instructions. 
Hence, the value in the addr VGPR should be desired_lane_id * 4, since VGPR values are 4 bytes wide.\n\nExample below shifts all values in the wavefront by 1 to the \"left\".\n\njulia> function ker!(x)\n i::Cint = AMDGPU.Device.activelane()\n # `addr` points to the next immediate lane.\n addr = ((i + 1) % 8) * 4 # VGPRs are 4 bytes wide\n # Read data from the next immediate lane.\n x[i + 1] = AMDGPU.Device.bpermute(addr, i)\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{Cint}(undef, 1, 8);\n\njulia> @roc groupsize=8 ker!(x);\n\njulia> x\n1×8 ROCArray{Int32, 2, AMDGPU.Runtime.Mem.HIPBuffer}:\n 1 2 3 4 5 6 7 0\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.permute","page":"Kernel Programming","title":"AMDGPU.Device.permute","text":"permute(addr::Integer, val::Cint)::Cint\n\nPut data stored in val to the lane VGPR (vector general purpose register) given by addr.\n\nExample below shifts all values in the wavefront by 1 to the \"right\".\n\njulia> function ker!(x)\n i::Cint = AMDGPU.Device.activelane()\n # `addr` points to the next immediate lane.\n addr = ((i + 1) % 8) * 4 # VGPRs are 4 bytes wide\n # Put data into the next immediate lane.\n x[i + 1] = AMDGPU.Device.permute(addr, i)\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{Cint}(undef, 1, 8);\n\njulia> @roc groupsize=8 ker!(x);\n\njulia> x\n1×8 ROCArray{Int32, 2, AMDGPU.Runtime.Mem.HIPBuffer}:\n 7 0 1 2 3 4 5 6\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.shfl","page":"Kernel Programming","title":"AMDGPU.Device.shfl","text":"shfl(val, lane, width = wavefrontsize())\n\nRead data stored in val from a lane (this is a more high-level op than bpermute).\n\nIf lane is outside the range [0:width - 1], the value returned corresponds to the value held by the lane modulo width (within the same subsection).\n\njulia> function ker!(x)\n i::UInt32 = AMDGPU.Device.activelane()\n x[i + 1] = AMDGPU.Device.shfl(i, i + 1)\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{UInt32}(undef, 1, 8);\n\njulia> @roc groupsize=8 ker!(x);\n\njulia> Int.(x)\n1×8 ROCArray{Int64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:\n 1 2 3 4 5 6 7 0\n\nIf width is less than wavefront size then each subsection of the wavefront behaves as a separate entity with a starting logical lane ID of 0.\n\njulia> function ker!(x)\n i::UInt32 = AMDGPU.Device.activelane()\n x[i + 1] = AMDGPU.Device.shfl(i, i + 1, 4) # <-- Notice width = 4.\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{UInt32}(undef, 1, 8);\n\njulia> @roc groupsize=8 ker!(x);\n\njulia> Int.(x)\n1×8 ROCArray{Int64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:\n 1 2 3 0 5 6 7 4\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.shfl_sync","page":"Kernel Programming","title":"AMDGPU.Device.shfl_sync","text":"shfl_sync(mask::UInt64, val, lane, width = wavefrontsize())\n\nSynchronize threads according to a mask and read data stored in val from a lane ID.\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.shfl_up","page":"Kernel Programming","title":"AMDGPU.Device.shfl_up","text":"shfl_up(val, δ, width = wavefrontsize())\n\nSame as shfl, but instead of specifying lane ID, accepts δ that is subtracted from the current lane ID. I.e. 
read from a lane with lower ID relative to the caller.\n\njulia> function ker!(x)\n i = AMDGPU.Device.activelane()\n x[i + 1] = AMDGPU.Device.shfl_up(i, 1)\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{Int}(undef, 1, 8);\n\njulia> @roc groupsize=8 ker!(x);\n\njulia> x\n1×8 ROCArray{Int64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:\n 0 0 1 2 3 4 5 6\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.shfl_up_sync","page":"Kernel Programming","title":"AMDGPU.Device.shfl_up_sync","text":"shfl_up_sync(mask::UInt64, val, δ, width = wavefrontsize())\n\nSynchronize threads according to a mask and read data stored in val from a lane with lower ID relative to the caller.\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.shfl_down","page":"Kernel Programming","title":"AMDGPU.Device.shfl_down","text":"shfl_down(val, δ, width = wavefrontsize())\n\nSame as shfl, but instead of specifying lane ID, accepts δ that is added to the current lane ID. I.e. read from a lane with higher ID relative to the caller.\n\njulia> function ker!(x)\n i = AMDGPU.Device.activelane()\n x[i + 1] = AMDGPU.Device.shfl_down(i, 1, 8)\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{Int}(undef, 1, 8);\n\njulia> @roc groupsize=8 ker!(x);\n\njulia> x\n1×8 ROCArray{Int64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:\n 1 2 3 4 5 6 7 7\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.shfl_down_sync","page":"Kernel Programming","title":"AMDGPU.Device.shfl_down_sync","text":"shfl_down_sync(mask::UInt64, val, δ, width = wavefrontsize())\n\nSynchronize threads according to a mask and read data stored in val from a lane with higher ID relative to the caller.\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.shfl_xor","page":"Kernel Programming","title":"AMDGPU.Device.shfl_xor","text":"shfl_xor(val, lane_mask, width = wavefrontsize())\n\nSame as shfl, but instead of specifying lane ID, performs bitwise XOR of the caller's lane ID with the lane_mask.\n\njulia> function ker!(x)\n i = AMDGPU.Device.activelane()\n x[i + 1] = AMDGPU.Device.shfl_xor(i, 1)\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{Int}(undef, 1, 8);\n\njulia> @roc groupsize=8 ker!(x);\n\njulia> x\n1×8 ROCArray{Int64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:\n 1 0 3 2 5 4 7 6\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.shfl_xor_sync","page":"Kernel Programming","title":"AMDGPU.Device.shfl_xor_sync","text":"shfl_xor_sync(mask::UInt64, val, lane_mask, width = wavefrontsize())\n\nSynchronize threads according to a mask and read data stored in val from a lane according to a bitwise XOR of the caller's lane ID with the lane_mask.\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.any_sync","page":"Kernel Programming","title":"AMDGPU.Device.any_sync","text":"any_sync(mask::UInt64, predicate::Bool)::Bool\n\nEvaluate predicate for all non-exited threads in mask and return non-zero if and only if predicate evaluates to non-zero for any of them.\n\njulia> function ker!(x)\n i = AMDGPU.Device.activelane()\n if i % 2 == 0\n mask = 0x0000000055555555 # Only even threads.\n x[1] = AMDGPU.Device.any_sync(mask, i == 0)\n end\n return\n end\nker! 
(generic function with 1 method)\n\njulia> x = ROCArray{Bool}(undef, 1);\n\njulia> @roc groupsize=32 ker!(x);\n\njulia> x\n1-element ROCArray{Bool, 1, AMDGPU.Runtime.Mem.HIPBuffer}:\n 1\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.all_sync","page":"Kernel Programming","title":"AMDGPU.Device.all_sync","text":"all_sync(mask::UInt64, predicate::Bool)::Bool\n\nEvaluate predicate for all non-exited threads in mask and return non-zero if and only if predicate evaluates to non-zero for all of them.\n\njulia> function ker!(x)\n i = AMDGPU.Device.activelane()\n if i % 2 == 0\n mask = 0x0000000055555555 # Only even threads.\n x[1] = AMDGPU.Device.all_sync(mask, true)\n end\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{Bool}(undef, 1);\n\njulia> @roc groupsize=32 ker!(x);\n\njulia> x\n1-element ROCArray{Bool, 1, AMDGPU.Runtime.Mem.HIPBuffer}:\n 1\n\n\n\n\n\n","category":"function"}] +[{"location":"exceptions/#Kernel-Exceptions","page":"Exceptions","title":"Kernel Exceptions","text":"","category":"section"},{"location":"exceptions/","page":"Exceptions","title":"Exceptions","text":"Just like regular CPU-executed Julia functions, GPU kernels can throw exceptions!","category":"page"},{"location":"exceptions/","page":"Exceptions","title":"Exceptions","text":"For example, the following kernel will throw an out-of-bounds exception:","category":"page"},{"location":"exceptions/","page":"Exceptions","title":"Exceptions","text":"julia> function ker!(x)\n x[0] = 1\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray([1]);\n\njulia> @roc ker!(x);\n\njulia> AMDGPU.synchronize()\nERROR: GPU Kernel Exception\nStacktrace:\n [1] error(s::String)\n @ Base ./error.jl:35\n [2] throw_if_exception(dev::HIPDevice)\n @ AMDGPU ~/.julia/dev/AMDGPU/src/exception_handler.jl:115\n [3] synchronize(stm::HIPStream)\n @ AMDGPU ~/.julia/dev/AMDGPU/src/highlevel.jl:154\n [4] synchronize()\n @ AMDGPU ~/.julia/dev/AMDGPU/src/highlevel.jl:154\n [5] top-level scope\n @ REPL[5]:1","category":"page"},{"location":"exceptions/","page":"Exceptions","title":"Exceptions","text":"Kernel-thrown exceptions are reported during host synchronization (AMDGPU.synchronize) or on the next kernel launch.","category":"page"},{"location":"exceptions/","page":"Exceptions","title":"Exceptions","text":"Kernels that hit an exception will write the exception information into a pre-allocated host buffer. Once complete, the wavefront throwing the exception will lock the buffer to prevent other wavefronts from overwriting the exception and stop itself, but other wavefronts will continue executing.","category":"page"},{"location":"devices/#Devices","page":"Devices","title":"Devices","text":"","category":"section"},{"location":"devices/","page":"Devices","title":"Devices","text":"In AMDGPU, all GPU devices are auto-detected by the runtime, if they're supported.","category":"page"},{"location":"devices/","page":"Devices","title":"Devices","text":"AMDGPU maintains a global default device. The default device is relevant for all kernel and GPUArray operations. If one is not specified via @roc or an equivalent interface, then the default device is used for those operations, which affects compilation and kernel launch.","category":"page"},{"location":"devices/","page":"Devices","title":"Devices","text":"The device bound to the current Julia task is accessible via the AMDGPU.device method. 
The list of available devices can be queried with the AMDGPU.devices method.","category":"page"},{"location":"devices/","page":"Devices","title":"Devices","text":"If you have a HIPDevice object, you can also switch the device with AMDGPU.device!. This will switch it only within the task it is called from.","category":"page"},{"location":"devices/","page":"Devices","title":"Devices","text":"xd1 = AMDGPU.ones(Float32, 16) # On `AMDGPU.device()` device.\n\nAMDGPU.device!(AMDGPU.devices()[2]) # Switch to second device.\nxd2 = AMDGPU.ones(Float32, 16) # On second device.","category":"page"},{"location":"devices/","page":"Devices","title":"Devices","text":"Additionally, devices have an associated numeric ID. This value is bounded between 1 and length(AMDGPU.devices()), and device 1 is the default device when AMDGPU is first loaded. The ID of the device associated with the current task can be queried with AMDGPU.device_id and changed with AMDGPU.device_id!.","category":"page"},{"location":"devices/","page":"Devices","title":"Devices","text":"AMDGPU.devices\nAMDGPU.device\nAMDGPU.device!\nAMDGPU.device_id\nAMDGPU.device_id!","category":"page"},{"location":"devices/#AMDGPU.HIP.devices","page":"Devices","title":"AMDGPU.HIP.devices","text":"devices()\n\nGet the list of all devices.\n\n\n\n\n\n","category":"function"},{"location":"devices/#AMDGPU.device","page":"Devices","title":"AMDGPU.device","text":"device()::HIPDevice\n\nGet the currently active device. This device is used when launching kernels via @roc.\n\n\n\n\n\ndevice(A::ROCArray) -> HIPDevice\n\nReturn the device associated with the array A.\n\n\n\n\n\n","category":"function"},{"location":"devices/#AMDGPU.device!","page":"Devices","title":"AMDGPU.device!","text":"device!(device::HIPDevice)\n\nSwitch the current device being used. This switches only for the task inside which it is called.\n\n\n\n\n\n","category":"function"},{"location":"devices/#AMDGPU.device_id","page":"Devices","title":"AMDGPU.device_id","text":"device_id() -> Int\ndevice_id(device::HIPDevice) -> Int\n\nReturns the numerical device ID for device or for the current AMDGPU.device().\n\n\n\n\n\n","category":"function"},{"location":"devices/#AMDGPU.device_id!","page":"Devices","title":"AMDGPU.device_id!","text":"device_id!(idx::Integer)\n\nSets the current device to AMDGPU.devices()[idx]. See device_id for details on the numbering semantics.\n\n\n\n\n\n","category":"function"},{"location":"devices/#Device-Properties","page":"Devices","title":"Device Properties","text":"","category":"section"},{"location":"devices/","page":"Devices","title":"Devices","text":"AMDGPU.HIP.name\nAMDGPU.HIP.wavefrontsize\nAMDGPU.HIP.gcn_arch\nAMDGPU.HIP.device_id\nAMDGPU.HIP.properties","category":"page"},{"location":"devices/#AMDGPU.HIP.name","page":"Devices","title":"AMDGPU.HIP.name","text":"name(dev::HIPDevice)::String\n\nGet the name of the device.\n\n\n\n\n\n","category":"function"},{"location":"devices/#AMDGPU.HIP.wavefrontsize","page":"Devices","title":"AMDGPU.HIP.wavefrontsize","text":"wavefrontsize(d::HIPDevice)::Cint\n\nGet the size of the wavefront. AMD GPUs support either 32 or 64.\n\n\n\n\n\n","category":"function"},{"location":"devices/#AMDGPU.HIP.gcn_arch","page":"Devices","title":"AMDGPU.HIP.gcn_arch","text":"gcn_arch(d::HIPDevice)::String\n\nGet the GCN architecture for the device.\n\n\n\n\n\n","category":"function"},{"location":"devices/#AMDGPU.HIP.device_id","page":"Devices","title":"AMDGPU.HIP.device_id","text":"device_id(d::HIPDevice)\n\nZero-based device ID as expected by HIP functions. 
Differs from AMDGPU.device_id method by 1.\n\n\n\n\n\n","category":"function"},{"location":"devices/#AMDGPU.HIP.properties","page":"Devices","title":"AMDGPU.HIP.properties","text":"properties(dev::HIPDevice)::hipDeviceProp_t\n\nGet all properties for the device. See HIP documentation for hipDeviceProp_t for the meaning of each field.\n\n\n\n\n\n","category":"function"},{"location":"hostcall/#Hostcall","page":"Host-Call","title":"Hostcall","text":"","category":"section"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"Hostcalls provide a means for GPU-CPU communications within running kernels.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"AMDGPU.jl provides its own implementation of hostcalls, relying on HSA signals. Currently, hostcalls are used for device-side allocations, printing and exception reporting.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"Some hostcalls (global hostcalls) are launched automatically if their usage is detected during compilation (e.g. device-side allocations, exception reporting).","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"Hostcalls require careful usage, since each spawns its own Task; avoid blocking operations on the host while they are active.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"For example, use non-blocking synchronization, AMDGPU.synchronize(; blocking=false) (which is also the default), instead of blocking synchronization.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"To stop hostcalls after synchronization, provide the stop_hostcalls=true keyword argument, otherwise the performance might degrade because of constant polling of HSA signals in a loop.","category":"page"},{"location":"hostcall/#Example","page":"Host-Call","title":"Example","text":"","category":"section"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"hc = Device.HostCallHolder(Float32, Tuple{Float32}) do x\n return x + 42f0\nend\n\nfunction kernel!(y, hc)\n y[1] = Device.hostcall!(hc, y[1])\n return\nend\n\ny = ROCArray(Float32[0f0])\n@roc kernel!(y, hc)\nAMDGPU.synchronize(; stop_hostcalls=true) # Stop hostcall.\nAMDGPU.Device.free!(hc) # Free hostcall buffers.\n\n@assert Array(y)[1] ≈ 42f0","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"In this example, HostCallHolder is used to create and launch a HostCall. HostCallHolder contains the HostCall structure itself that is passed to the kernel, a task that is spawned on creation, and some additional info for controlling the lifetime of the task.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"The first argument is the function we want to execute when we call the hostcall. In this case we add 42f0 to the input argument x and return the result.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"The second and third arguments are the return type Float32 and the tuple of input argument types Tuple{Float32}.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"hostcall! is used to execute the function on the host, wait on the result, and obtain the return values. 
At the moment, it is performed once per workgroup.","category":"page"},{"location":"hostcall/#Continuous-Host-Call","page":"Host-Call","title":"Continuous Host-Call","text":"","category":"section"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"By default, hostcalls can be used only once. After executing the function on the host, the task finishes and exits.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"However, if you need your hostcall to live indefinitely, pass continuous=true keyword argument to HostCallHolder(...; continuous=true).","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"To then stop the hostcall, call Device.non_continuous!(hc) or Device.finish!(hc) on the HostCallHolder.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"The difference between them is that non_continuous! will allow calling hostcall one more time before exiting, while finish! will exit immediately.","category":"page"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"finish! can be used on any HostCallHolder to force-exit the running hostcall task.","category":"page"},{"location":"hostcall/#Free-hostcall-buffers","page":"Host-Call","title":"Free hostcall buffers","text":"","category":"section"},{"location":"hostcall/","page":"Host-Call","title":"Host-Call","text":"For custom hostcalls it is important to call AMDGPU.Device.free! once kernel has finished to free buffers that hostcall used in the process.","category":"page"},{"location":"logging/#Runtime-and-Compiler-Logging","page":"Logging","title":"Runtime and Compiler Logging","text":"","category":"section"},{"location":"logging/","page":"Logging","title":"Logging","text":"AMDGPU.jl has a built-in logging system integrated into various runtime and compiler operations, which is provided by TimespanLogging.jl. Operations such as compilation and linking, signal and buffer allocation/freeing, kernel launch, etc. are instrumented with logging statements, allowing the user to record the start and end of operations.","category":"page"},{"location":"logging/","page":"Logging","title":"Logging","text":"While disabled by default, logging can be enabled by first running AMDGPU.Runtime.enable_logging!() to globally enable logging, after which Julia must be restarted for the changes to take effect.","category":"page"},{"location":"logging/","page":"Logging","title":"Logging","text":"Once logging is globally enabled, AMDGPU.Runtime.start_logging() causes new log events to be saved, while AMDGPU.Runtime.stop_logging() causes new log events to be discarded. Log events can be collected with AMDGPU.Runtime.fetch_logs!(). 
A more convenient option is AMDGPU.Runtime.log_and_fetch!(f), which can be used to easily log operations within a region of code:","category":"page"},{"location":"logging/","page":"Logging","title":"Logging","text":"logs = AMDGPU.Runtime.log_and_fetch!() do\n A = AMDGPU.ones(3, 4)\n B = copy(A)\n fill!(B, 1f0)\n C = Array(B)\nend\n@show logs[1]","category":"page"},{"location":"api/#AMDGPU-API-Reference","page":"API Reference","title":"AMDGPU API Reference","text":"","category":"section"},{"location":"api/#Indexing","page":"API Reference","title":"Indexing","text":"","category":"section"},{"location":"api/","page":"API Reference","title":"API Reference","text":"AMDGPU.workitemIdx\nAMDGPU.workgroupIdx\nAMDGPU.workgroupDim\nAMDGPU.gridItemDim\nAMDGPU.gridGroupDim","category":"page"},{"location":"api/#AMDGPU.Device.workitemIdx","page":"API Reference","title":"AMDGPU.Device.workitemIdx","text":"workitemIdx()::ROCDim3\n\nReturns the work item index within the work group. See also: threadIdx\n\n\n\n\n\n","category":"function"},{"location":"api/#AMDGPU.Device.workgroupIdx","page":"API Reference","title":"AMDGPU.Device.workgroupIdx","text":"workgroupIdx()::ROCDim3\n\nReturns the work group index. See also: blockIdx\n\n\n\n\n\n","category":"function"},{"location":"api/#AMDGPU.Device.workgroupDim","page":"API Reference","title":"AMDGPU.Device.workgroupDim","text":"workgroupDim()::ROCDim3\n\nReturns the size of each workgroup in workitems. See also: blockDim\n\n\n\n\n\n","category":"function"},{"location":"api/#AMDGPU.Device.gridItemDim","page":"API Reference","title":"AMDGPU.Device.gridItemDim","text":"gridItemDim()::ROCDim3\n\nReturns the size of the grid in workitems. This behaviour is different from CUDA where gridDim gives the size of the grid in blocks.\n\n\n\n\n\n","category":"function"},{"location":"api/#AMDGPU.Device.gridGroupDim","page":"API Reference","title":"AMDGPU.Device.gridGroupDim","text":"gridGroupDim()::ROCDim3\n\nReturns the size of the grid in workgroups. This is equivalent to CUDA's gridDim.\n\n\n\n\n\n","category":"function"},{"location":"api/","page":"API Reference","title":"API Reference","text":"Use these functions for compatibility with CUDA.jl.","category":"page"},{"location":"api/","page":"API Reference","title":"API Reference","text":"AMDGPU.Device.threadIdx\nAMDGPU.Device.blockIdx\nAMDGPU.Device.blockDim","category":"page"},{"location":"api/#AMDGPU.Device.threadIdx","page":"API Reference","title":"AMDGPU.Device.threadIdx","text":"threadIdx()::ROCDim3\n\nReturns the thread index within the block. See also: workitemIdx\n\n\n\n\n\n","category":"function"},{"location":"api/#AMDGPU.Device.blockIdx","page":"API Reference","title":"AMDGPU.Device.blockIdx","text":"blockIdx()::ROCDim3\n\nReturns the block index within the grid. See also: workgroupIdx\n\n\n\n\n\n","category":"function"},{"location":"api/#AMDGPU.Device.blockDim","page":"API Reference","title":"AMDGPU.Device.blockDim","text":"blockDim()::ROCDim3\n\nReturns the dimensions of the block. 
See also: workgroupDim\n\n\n\n\n\n","category":"function"},{"location":"api/#Synchronization","page":"API Reference","title":"Synchronization","text":"","category":"section"},{"location":"api/","page":"API Reference","title":"API Reference","text":"AMDGPU.sync_workgroup\nAMDGPU.sync_workgroup_count\nAMDGPU.sync_workgroup_and\nAMDGPU.sync_workgroup_or","category":"page"},{"location":"api/#AMDGPU.Device.sync_workgroup","page":"API Reference","title":"AMDGPU.Device.sync_workgroup","text":"sync_workgroup()\n\nWaits until all wavefronts in a workgroup have reached this call.\n\n\n\n\n\n","category":"function"},{"location":"api/#AMDGPU.Device.sync_workgroup_count","page":"API Reference","title":"AMDGPU.Device.sync_workgroup_count","text":"sync_workgroup_count(predicate::Cint)::Cint\n\nIdentical to sync_workgroup, with the additional feature that it evaluates the predicate for all workitems in the workgroup and returns the number of workitems for which predicate evaluates to non-zero.\n\n\n\n\n\n","category":"function"},{"location":"api/#AMDGPU.Device.sync_workgroup_and","page":"API Reference","title":"AMDGPU.Device.sync_workgroup_and","text":"sync_workgroup_and(predicate::Cint)::Cint\n\nIdentical to sync_workgroup, with the additional feature that it evaluates the predicate for all workitems in the workgroup and returns non-zero if and only if predicate evaluates to non-zero for all of them.\n\n\n\n\n\n","category":"function"},{"location":"api/#AMDGPU.Device.sync_workgroup_or","page":"API Reference","title":"AMDGPU.Device.sync_workgroup_or","text":"sync_workgroup_or(predicate::Cint)::Cint\n\nIdentical to sync_workgroup, with the additional feature that it evaluates the predicate for all workitems in the workgroup and returns non-zero if and only if predicate evaluates to non-zero for any of them.\n\n\n\n\n\n","category":"function"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"DocTestSetup = quote\n using AMDGPU\nend","category":"page"},{"location":"quickstart/#Quick-Start","page":"Quick Start","title":"Quick Start","text":"","category":"section"},{"location":"quickstart/#Simple-example","page":"Quick Start","title":"Simple example","text":"","category":"section"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"As a simple example, let's add two vectors both on CPU and GPU and make sure that the results are the same:","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"First, we do this on CPU:","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"julia> n = 1024;\n\njulia> a = fill(1.0, n);\n\njulia> b = fill(2.0, n);\n\njulia> c = a .+ b;","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"To do the same computation on the GPU, we first need to copy the host arrays to the device and then simply add them together element-wise:","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"julia> a_d = ROCArray(a);\n\njulia> b_d = ROCArray(b);\n\njulia> c_d = a_d .+ b_d;","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"Now, let's check that the results are the same on CPU and GPU by transferring GPU array back to host and comparing the results:","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"julia> Array(c_d) ≈ 
c\ntrue","category":"page"},{"location":"quickstart/#Kernel-example","page":"Quick Start","title":"Kernel example","text":"","category":"section"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"Alternatively, we can perform the same computation by writing our custom GPU kernel:","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"julia> function vadd!(c, a, b)\n i = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x\n c[i] = a[i] + b[i]\n return\n end\nvadd! (generic function with 1 method)","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"The index i of a single workitem can be uniquely identified by its grid index. In this case only one dimension is used, so we take only .x coordinate into account.","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"A kernel is compiled upon its first launch. Subsequent launches re-use it, without recompilation. Let's launch a kernel, but first clear-out the memory of the resulting vector cd.","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"julia> fill!(c_d, 0.0);\n\njulia> groupsize = 256;\n\njulia> gridsize = cld(n, groupsize);\n\njulia> @roc groupsize=groupsize gridsize=gridsize vadd!(c_d, a_d, b_d);\n\njulia> Array(c_d) ≈ c\ntrue","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"The easiest way to launch a GPU kernel is with the @roc macro, specifying groupsize and gridsize to cover full array, and calling it like a regular function.","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"Keep in mind that kernel launches are asynchronous, meaning that you need to synchronize before you can use the result (e.g. with AMDGPU.synchronize). However, GPU <-> CPU transfers synchronize implicitly.","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"The grid is the domain over which the entire kernel executes over. The grid will be split into multiple workgroups by hardware automatically, and the kernel does not complete until all workgroups complete.","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"Like OpenCL, AMDGPU has the concept of \"workitems\", \"workgroups\", and the \"grid\". A workitem is a single thread of execution, capable of performing arithmentic operations. Workitems are grouped into \"wavefronts\" (\"warps\" in CUDA) which share the same compute unit, and execute the same instructions simulatenously. The workgroup is a logical unit of compute supported by hardware which comprises multiple wavefronts, which shares resources (specifically local memory) and can be efficiently synchronized. A workgroup may be executed by one or multiple hardware compute units, making it often the only dimension of importance for smaller kernel launches.","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"Notice how we explicitly specify that this function does not return a value by adding the return statement. This is necessary for all GPU kernels and we can enforce it by adding a return, return nothing, or even nothing at the end of the kernel. 
If this statement is omitted, Julia will attempt to return the value of the last evaluated expression, in this case a Float64, which will cause a compilation failure as kernels cannot return values.","category":"page"},{"location":"quickstart/#Naming-conventions","page":"Quick Start","title":"Naming conventions","text":"","category":"section"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"Throughout this example we use terms like \"work group\" and \"work item\". These terms are used by the Khronos consortium and their APIs including OpenCL and Vulkan, as well as the HSA foundation.","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"NVIDIA, on the other hand, uses some different terms in their CUDA API, which might be confusing to some users porting their kernels from CUDA to AMDGPU.","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"As a quick summary, here is a mapping of the most common terms:","category":"page"},{"location":"quickstart/","page":"Quick Start","title":"Quick Start","text":"AMDGPU CUDA\nworkitemIdx threadIdx\nworkgroupIdx blockIdx\nworkgroupDim blockDim\ngridItemDim No equivalent\ngridGroupDim gridDim\ngroupsize threads\ngridsize blocks\nstream stream","category":"page"},{"location":"memory/#Memory-Allocation-and-Intrinsics","page":"Memory","title":"Memory Allocation and Intrinsics","text":"","category":"section"},{"location":"memory/#Memory-Varieties","page":"Memory","title":"Memory Varieties","text":"","category":"section"},{"location":"memory/","page":"Memory","title":"Memory","text":"GPUs contain various kinds of memory, just like CPUs:","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"Global: Globally accessible by all CUs on a GPU, and possibly accessible from outside of the GPU (by the CPU host, by other GPUs, by PCIe devices, etc.). Slowest form of memory.\nConstant: Same as global memory, but signals to the hardware that it can use special instructions to access and cache this memory. Can be changed between kernel invocations.\nRegion: Also known as Global Data Store (GDS), all wavefronts on a CU can access the same memory region from the same address. Faster than Global/Constant. Automatically allocated by the compiler/runtime, not user accessible.\nLocal: Also known as Local Data Store (LDS), all wavefronts in the same workgroup can access the same memory region from the same address. Faster than GDS.\nPrivate: Uses the hardware scratch space, and is private to each SIMD lane in a wavefront. Fastest form of traditional memory.","category":"page"},{"location":"memory/#Local-Memory","page":"Memory","title":"Local Memory","text":"","category":"section"},{"location":"memory/","page":"Memory","title":"Memory","text":"Local memory may be allocated within a kernel by calling either:","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"@ROCStaticLocalArray(T, dims) - if dims is passed as a constant value, known at compile-time. E.g. @ROCStaticLocalArray(Float32, 8).\n@ROCDynamicLocalArray(T, dims) - otherwise. E.g. @ROCDynamicLocalArray(Float32, length(X)).","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"Local memory is zero-initialized by default. 
If this is unnecessary and undesired for performance reasons, you can disable it by passing false as the last argument: @ROCStaticLocalArray(Float32, 8, false) or @ROCDynamicLocalArray(Float32, length(X), false)","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"Local memory does not need to be freed, as it is automatically freed by the hardware.","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"If @ROCDynamicLocalArray is used, then local memory is dynamically allocated at kernel execution time. The shmem option to @roc must be set appropriately to ensure that enough local memory is allocated by the hardware.","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"It is allocated in addition to the local memory that is statically allocated by the kernel.","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"function kernel(C, A)\n # Allocate local memory dynamically\n Ctmp = @ROCDynamicLocalArray(Float64, length(C))\n # Or, allocate local memory statically if the size is known ahead-of-time\n Ctmp = @ROCStaticLocalArray(Float64, 8) # if we want 8 elements\n\n idx = AMDGPU.workitemIdx().x\n Ctmp[idx] = A[idx] + C[1]\n AMDGPU.Device.sync_workgroup()\n\n C[idx] = Ctmp[idx]\n return\nend\n\n...\n# Note: The `shmem` option isn't necessary if `@ROCStaticLocalArray` is used\nshmem = sizeof(Float64) * length(RC)\n@roc groupsize=8 shmem=shmem kernel(RC, RA)","category":"page"},{"location":"memory/#Device-Side-Allocations","page":"Memory","title":"Device-Side Allocations","text":"","category":"section"},{"location":"memory/","page":"Memory","title":"Memory","text":"Global memory may be allocated/freed dynamically from kernels by calling AMDGPU.Device.malloc(::Csize_t)::Ptr{Cvoid} and AMDGPU.Device.free(::Ptr{Cvoid}).","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"This memory allocation/deallocation uses hostcalls to operate, and so is relatively slow, but is also very useful. See the Hostcall section for more info.","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"Memory allocated with AMDGPU.Device.malloc is host-pinned memory. Calls to malloc and free are performed once per workgroup, so ensure that enough memory has been allocated to feed the lanes that will be accessing it.","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"As an example, here's how an array could be allocated on-device to store temporary results:","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"function kernel(C, A)\n # Allocate memory dynamically and get a pointer to it.\n Ctmp_ptr = AMDGPU.Device.malloc(Csize_t(sizeof(Float64) * length(C)))\n # Turn a pointer into a device-side array.\n Ctmp = ROCDeviceArray(length(C), reinterpret(Core.LLVMPtr{Float64,1}, Ctmp_ptr))\n\n # Use it\n idx = AMDGPU.workitemIdx().x\n Ctmp[idx] = A[idx] + C[1]\n AMDGPU.Device.sync_workgroup()\n\n C[idx] = Ctmp[idx]\n # Make sure to free it.\n AMDGPU.Device.free(Ctmp_ptr)\n return\nend\n\nRA = AMDGPU.rand(4)\nRC = AMDGPU.rand(4)\nRC_elem = Array(RC)[1]\n@roc groupsize=4 kernel(RC, RA)\n@assert Array(RC) ≈ Array(RA) .+ RC_elem","category":"page"},{"location":"memory/#Memory-Modification-Intrinsics","page":"Memory","title":"Memory Modification Intrinsics","text":"","category":"section"},{"location":"memory/","page":"Memory","title":"Memory","text":"Like C, AMDGPU.jl provides the memset! and memcpy! 
intrinsics, which are useful for setting a memory region to a value, or copying one region to another, respectively. Check test/device/memory.jl for examples of their usage.","category":"page"},{"location":"memory/#Wrapping-in-ROCArray","page":"Memory","title":"Wrapping in ROCArray","text":"","category":"section"},{"location":"memory/","page":"Memory","title":"Memory","text":"You can wrap host array to be accessible (pinned) on the device with:","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"x = rand(Float32, 4, 4)\nxd = unsafe_wrap(ROCArray, pointer(x), size(x))\n\n# Pointer to `xd` is a device-mapped pointer, not host pointer.\n@show pointer(xd) == xd.buf.dev_ptr\n@show pointer(xd) == xd.buf.ptr\n\n# Can be used in kernels, host array `x` is also updated.\nxd .+= 1f0\n\n# Can be used with HIP libraries.\nxd * xd","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"Pinned memory is automatically unregistered upon array destruction. You can't free it, since it is managed by the host.","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"Additionally, you can wrap the device array with:","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"x = AMDGPU.rand(Float32, 4, 4)\nxd = unsafe_wrap(ROCArray, pointer(x), size(x); lock=false)\n\n# Can be used in kernels, `x` is also updated.\nxd .+= 1f0\n\n# Can be used with HIP libraries.\nxd * xd\n\n# Freeing is a no-op for `xd`, since `xd` does not own the underlying memory.\nAMDGPU.unsafe_free!(xd) # No-op.","category":"page"},{"location":"memory/","page":"Memory","title":"Memory","text":"Notice mandatory ; lock=false keyword, this is to be able to differentiate between host & device pointers.","category":"page"},{"location":"printing/#Printing","page":"Printing","title":"Printing","text":"","category":"section"},{"location":"printing/","page":"Printing","title":"Printing","text":"Writing GPU kernels can be a difficult endeavor, owing to the fact that the LLVM GPU backends turn serial code into parallel code automatically. Recognizing this, every good GPU programming interface allows the user's GPU kernels to print output to a buffer, which will be passed to the host for display. With the ability to interpolate variables, this functionality serves as the \"printf of GPUs\". Quite literally, the primary tool for this is @rocprintf. Here's a simple example of printing the current workgroup index:","category":"page"},{"location":"printing/","page":"Printing","title":"Printing","text":"kernel(x) = @rocprintf \"Workgroup index: %d\\n\" workgroupIdx().x","category":"page"},{"location":"printing/","page":"Printing","title":"Printing","text":"The above kernel would print out the string \"Workgroup index: 1\\n\" when run with a single workgroup (where \"\\n\" means a newline).","category":"page"},{"location":"printing/","page":"Printing","title":"Printing","text":"Any number of variables may be passed to @rocprintf, as long as those variables have a printf-compatible implementation in Printf.@printf. 
Calls to @rocprintf are blocking, and will not return control to the kernel until the string has been formatted and sent to the OS runtime for printing (the same as for calls to Printf.@printf).","category":"page"},{"location":"printing/","page":"Printing","title":"Printing","text":"While @rocprintf is printed once per workgroup by default, it's possible to print once per lane, once per wavefront, or once per grid by specifying an execution mode as the first argument:","category":"page"},{"location":"printing/","page":"Printing","title":"Printing","text":"# Once per lane\nkernel(x) = @rocprintf :lane \"My index is: %d\\n\" workitemIdx().x\n\n# Once per wavefront\nkernel(x) = @rocprintf :wave \"My index is: %d\\n\" workitemIdx().x\n\n# Once per workgroup\nkernel(x) = @rocprintf :group \"My index is: %d\\n\" workitemIdx().x\n# OR (:group is the default)\nkernel(x) = @rocprintf \"My index is: %d\\n\" workitemIdx().x\n\n# Once total\nkernel(x) = @rocprintf :grid \"My index is: %d\\n\" workitemIdx().x","category":"page"},{"location":"printing/","page":"Printing","title":"Printing","text":"Executing those kernels with 256 workitems split evenly between 2 workgroups would print out:","category":"page"},{"location":"printing/","page":"Printing","title":"Printing","text":"# :lane\nMy index is 1\nMy index is 2\n...\nMy index is 127\nMy index is 128\nMy index is 1\nMy index is 2\n...\nMy index is 127\nMy index is 128\n\n# :wave\nMy index is 1\nMy index is 65\nMy index is 1\nMy index is 65\n\n# :group\nMy index is 1\nMy index is 1\n\n# :grid\nMy index is 1","category":"page"},{"location":"printing/#Differences-to-@cuprintf","page":"Printing","title":"Differences to @cuprintf","text":"","category":"section"},{"location":"printing/","page":"Printing","title":"Printing","text":"Similar to CUDA's @cuprintf, @rocprintf is a printf-compatible macro which takes a format string and arguments, and commands the host CPU to display it as formatted text. However, in contrast to @cuprintf, we use AMDGPU's hostcall and Julia's Printf stdlib to implement this. This means that anything that Printf can print, so can @rocprintf (assuming such an object can be represented on the GPU). The macro is also handled as a regular hostcall, which means that argument types are checked at compile time (although currently, any errors while printing will be detected on the host, and will terminate the kernel).","category":"page"},{"location":"streams/#Streams","page":"Streams","title":"Streams","text":"","category":"section"},{"location":"streams/","page":"Streams","title":"Streams","text":"Similar to CUDA streams, ROCm has HIP streams, which are buffers used to instruct the GPU hardware which kernels to launch. HIP streams are synchronous, like CUDA streams.","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"Each device has a default stream associated, which is accessible with AMDGPU.stream().","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"There are several ways to specify which stream to launch a kernel on:","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"Using AMDGPU.stream! 
to change the default stream to be used within the same Julia task.","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"stream = AMDGPU.HIPStream()\nAMDGPU.stream!(stream) # Change default stream to be used for subsequent operations.\nAMDGPU.ones(Float32, 16) # Will be executed on `stream`.","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"Using AMDGPU.stream! to execute a given function and reset to the original stream after completion:","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"stream = AMDGPU.HIPStream()\nx = AMDGPU.stream!(() -> AMDGPU.ones(Float32, 16), stream)","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"Using the stream argument to the @roc macro:","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"stream = AMDGPU.HIPStream()\n@roc stream=stream kernel(...)","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"Streams also have an inherent priority, which allows control of kernel submission latency and on-device scheduling preference with respect to kernels submitted on other streams. There are three priorities: normal (the default), low, and high priority.","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"The priority of the default stream can be set with AMDGPU.priority!. Alternatively, it can be set at stream creation time:","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"low_prio = HIPStream(:low)\nhigh_prio = HIPStream(:high)\nnormal_prio = HIPStream(:normal) # or just omit \"priority\"","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"AMDGPU.stream\nAMDGPU.stream!\nAMDGPU.priority!\nAMDGPU.HIPStream","category":"page"},{"location":"streams/#AMDGPU.stream","page":"Streams","title":"AMDGPU.stream","text":"stream()::HIPStream\n\nGet the HIP stream that should be used as the default one for the currently executing task.\n\n\n\n\n\n","category":"function"},{"location":"streams/#AMDGPU.stream!","page":"Streams","title":"AMDGPU.stream!","text":"stream!(s::HIPStream)\n\nChange the default stream to be used within the same Julia task.\n\n\n\n\n\nstream!(f::Base.Callable, stream::HIPStream)\n\nChange the default stream to be used within the same Julia task, execute f and revert to the original stream.\n\nReturns:\n\nReturn value of the function f.\n\n\n\n\n\n","category":"function"},{"location":"streams/#AMDGPU.priority!","page":"Streams","title":"AMDGPU.priority!","text":"priority!(p::Symbol)\n\nChange the priority of the default stream. Accepted values are :normal (the default), :low and :high.\n\n\n\n\n\npriority!(f::Base.Callable, priority::Symbol)\n\nChange the priority of the default stream, execute f and revert to the original priority. Accepted values are :normal (the default), :low and :high.\n\nReturns:\n\nReturn value of the function f.\n\n\n\n\n\n","category":"function"},{"location":"streams/#AMDGPU.HIP.HIPStream","page":"Streams","title":"AMDGPU.HIP.HIPStream","text":"HIPStream(priority::Symbol = :normal)\n\nArguments:\n\npriority::Symbol: Priority of the stream: :normal, :high or :low.\n\nCreate a HIPStream with the given priority. Device is the default device that's currently in use.\n\n\n\n\n\nHIPStream(stream::hipStream_t)\n\nCreate a HIPStream from a hipStream_t handle. 
Device is the default device that's currently in use.\n\n\n\n\n\n","category":"type"},{"location":"streams/#Synchronization","page":"Streams","title":"Synchronization","text":"","category":"section"},{"location":"streams/","page":"Streams","title":"Streams","text":"AMDGPU.jl by default uses non-blocking stream synchronization with AMDGPU.synchronize to work correctly with TLS and Hostcall.","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"Users, however, can switch to a blocking synchronization globally with nonblocking_synchronization preference or with fine-grained AMDGPU.synchronize(; blocking=true). Blocking synchronization might offer slightly lower latency.","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"You can also perform synchronization of the expression with AMDGPU.@sync macro, which will execute given expression and synchronize afterwards (using AMDGPU.synchronize under the hood).","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"AMDGPU.@sync begin\n @roc ...\nend","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"Finally, you can perform full device synchronization with AMDGPU.device_synchronize.","category":"page"},{"location":"streams/","page":"Streams","title":"Streams","text":"AMDGPU.synchronize\nAMDGPU.@sync\nAMDGPU.device_synchronize","category":"page"},{"location":"streams/#AMDGPU.synchronize","page":"Streams","title":"AMDGPU.synchronize","text":"synchronize(stream::HIPStream = stream(); blocking::Bool = false)\n\nWait until all kernels executing on stream have completed.\n\nIf there are running HostCalls, then blocking must be false. Additionally, if you want to stop host calls afterwards, then provide stop_hostcalls=true keyword argument.\n\n\n\n\n\n","category":"function"},{"location":"streams/#AMDGPU.@sync","page":"Streams","title":"AMDGPU.@sync","text":"@sync ex\n\nRun expression ex on currently active stream and synchronize the GPU on that stream afterwards.\n\nSee also: synchronize.\n\n\n\n\n\n","category":"macro"},{"location":"streams/#AMDGPU.HIP.device_synchronize","page":"Streams","title":"AMDGPU.HIP.device_synchronize","text":"Blocks until all kernels on all streams have completed. 
Uses currently active device.\n\n\n\n\n\n","category":"function"},{"location":"profiling/#rocprof","page":"Profiling","title":"rocprof","text":"","category":"section"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"rocprofv2 allows profiling both HSA & HIP API calls (rocprof being deprecated).","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"Let's profile simple copying kernel saved in profile.jl file:","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"using AMDGPU\n\nfunction mycopy!(dst, src)\n i = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x\n if i ≤ length(dst)\n @inbounds dst[i] = src[i]\n end\n return\nend\n\nfunction main(N)\n src = ROCArray{Float64}(undef, N)\n dst = ROCArray{Float64}(undef, N)\n groupsize = 256 # nthreads\n gridsize = cld(N, groupsize) # nblocks\n\n for i in 1:10\n @roc groupsize=groupsize gridsize=gridsize mycopy!(dst, src)\n AMDGPU.synchronize()\n end\n\n AMDGPU.unsafe_free!(dst)\n AMDGPU.unsafe_free!(src)\n AMDGPU.synchronize()\n return\nend\nmain(2^24)","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"ENABLE_JITPROFILING=1 rocprofv2 --plugin perfetto --hip-trace --hsa-trace --kernel-trace -o prof julia ./profile.jl","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"This will produce prof_output.pftrace file which can be visualized using Perfetto UI.","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"Zoomed out Zoomed in\n(Image: image) (Image: image)","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"Here we can clearly see that host synchronization after each kernel dispatch causes poor device occupancy (empty spaces between kernel dispatches).","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"We can fix this by moving synchronization outside the loop so that it happens only once.","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":" ...\n for i in 1:10\n @roc groupsize=groupsize gridsize=gridsize mycopy!(dst, src)\n end\n AMDGPU.synchronize()\n ...","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"Running profiling again and visualizing results we now see that kernel launches are adjacent to each other and that the average wall duration is lower.","category":"page"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"Zoomed out Zoomed in\n(Image: image) (Image: image)","category":"page"},{"location":"profiling/#Debugging","page":"Profiling","title":"Debugging","text":"","category":"section"},{"location":"profiling/","page":"Profiling","title":"Profiling","text":"Use HIP_LAUNCH_BLOCKING=1 to synchronize immediately after launching GPU kernels. This will allow to pinpoint exact kernel that caused the exception.","category":"page"},{"location":"#Programming-AMD-GPUs-with-Julia","page":"Home","title":"Programming AMD GPUs with Julia","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"Julia support for programming AMD GPUs is currently provided by the AMDGPU.jl package. 
This package contains everything necessary to program for AMD GPUs in Julia, including:","category":"page"},{"location":"","page":"Home","title":"Home","text":"An interface for compiling and running kernels written in Julia through LLVM's AMDGPU backend.\nAn interface for working with the HIP runtime API, necessary for launching compiled kernels and controlling the GPU.\nAn array type implementing the GPUArrays.jl interface, providing high-level array operations.","category":"page"},{"location":"#Installation","page":"Home","title":"Installation","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"Simply add the AMDGPU.jl package to your Julia environment:","category":"page"},{"location":"","page":"Home","title":"Home","text":"using Pkg\nPkg.add(\"AMDGPU\")","category":"page"},{"location":"","page":"Home","title":"Home","text":"To ensure that everything works, you can run the test suite:","category":"page"},{"location":"","page":"Home","title":"Home","text":"using AMDGPU\nusing Pkg\nPkg.test(\"AMDGPU\")","category":"page"},{"location":"#Requirements","page":"Home","title":"Requirements","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"Julia 1.9 or higher (Navi 3 requires Julia 1.10+).\n64-bit Linux or Windows.\nMinimal supported ROCm version is 5.3.\nRequired software:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Linux Windows\nROCm ROCm\n- AMD Software: Adrenalin Edition","category":"page"},{"location":"","page":"Home","title":"Home","text":"On Windows AMD Software: Adrenalin Edition contains HIP library itself, while ROCm provides support for other functionality.","category":"page"},{"location":"#Windows-OS-missing-functionality","page":"Home","title":"Windows OS missing functionality","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"Windows does not yet support Hostcall, which means that some of the functionality does not work, like:","category":"page"},{"location":"","page":"Home","title":"Home","text":"device printing;\ndynamic memory allocation (from kernels).","category":"page"},{"location":"","page":"Home","title":"Home","text":"These hostcalls are sometimes launched when AMDGPU detects that a kernel might throw an exception, specifically during conversions, like: Int32(1f0).","category":"page"},{"location":"","page":"Home","title":"Home","text":"To avoid this, use 'unsafe' conversion option: unsafe_trunc(Int32, 1f0).","category":"page"},{"location":"#ROCm-system-libraries","page":"Home","title":"ROCm system libraries","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"AMDGPU.jl looks into standard directories and uses Libdl.find_library to find ROCm libraries.","category":"page"},{"location":"","page":"Home","title":"Home","text":"Standard path:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Linux: /opt/rocm\nWindows: C:/Program Files/AMD/ROCm/","category":"page"},{"location":"","page":"Home","title":"Home","text":"If you have non-standard path for ROCm, set ROCM_PATH= environment variable before launching Julia. For example, if ROCm is installed in your Linux system root (e.g. on Fedora), set ROCM_PATH=/usr/lib64/rocm/gfx11 or ROCM_PATH=/usr/lib64/rocm/gfx1103, depending on your GPU's architecture. You can query the architecture using the amdgpu-arch command. 
The AMDGPU.versioninfo() function prints the paths of any libraries found.","category":"page"},{"location":"","page":"Home","title":"Home","text":"Depending on your GPU model and the functionality you want to use, you may have to force the GPU architecture by setting the HSA_OVERRIDE_GFX_VERSION variable to a compatible version.","category":"page"},{"location":"#Extra-Setup-Details","page":"Home","title":"Extra Setup Details","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"List of additional steps that may be needed to take to ensure everything is working:","category":"page"},{"location":"","page":"Home","title":"Home","text":"Make sure your user is in the same group as /dev/kfd, other than root.\nFor example, it might be the render group:\ncrw-rw---- 1 root render 234, 0 Aug 5 11:43 kfd\nIn this case, you can add yourself to it:\nsudo usermod -aG render username\nROCm libraries should be in the standard library locations, or in your LD_LIBRARY_PATH.\nIf you get an error message along the lines of GLIB_CXX_... not found, it's possible that the C++ runtime used to build the ROCm stack and the one used by Julia are different. If you built the ROCm stack yourself this is very likely the case since Julia normally ships with its own C++ runtime.\nFor more information, check out this GitHub issue. A quick fix is to use the LD_PRELOAD environment variable to make Julia use the system C++ runtime library, for example:\nLD_PRELOAD=/usr/lib/libstdc++.so julia\nAlternatively, you can build Julia from source as described here. To quickly debug this issue start Julia and try to load a ROCm library:\nusing Libdl Libdl.dlopen(\"/opt/rocm/hsa/lib/libhsa-runtime64.so.1\")","category":"page"},{"location":"","page":"Home","title":"Home","text":"Once all of this is setup properly, you should be able to do using AMDGPU successfully.","category":"page"},{"location":"","page":"Home","title":"Home","text":"See the Quick Start documentation for an introduction to using AMDGPU.jl.","category":"page"},{"location":"#Preferences","page":"Home","title":"Preferences","text":"","category":"section"},{"location":"","page":"Home","title":"Home","text":"AMDGPU.jl supports setting preferences. 
Template of LocalPreferences.toml with all options:","category":"page"},{"location":"","page":"Home","title":"Home","text":"[AMDGPU]\n# If `true` (default), eagerly run GC to keep the pool from growing too big.\n# GC is triggered during new allocatoins or synchronization points.\neager_gc = false\n# Use non-blocking synchronization for all `AMDGPU.synchronize()` calls.\nnonblocking_synchronization = true\n# Memory limit specifies maximum amount of memory in percentages\n# a current Julia process can use.\n# Default is \"none\", which does not apply any limitation.\nhard_memory_limit = \"none\"\n# Notice a space between the value and percentage sign.\n# hard_memory_limit = \"80 %\"","category":"page"},{"location":"kernel_programming/#Kernel-Programming","page":"Kernel Programming","title":"Kernel Programming","text":"","category":"section"},{"location":"kernel_programming/#Launch-Configuration","page":"Kernel Programming","title":"Launch Configuration","text":"","category":"section"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"While an almost arbitrarily large number of workitems can be executed per kernel launch, the hardware can only support executing a limited number of wavefronts at one time.","category":"page"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"To alleviate this, the compiler calculates the \"occupancy\" of each compiled kernel (which is the number of wavefronts that can be simultaneously executing on the GPU), and passes this information to the hardware; the hardware then launches a limited number of wavefronts at once, based on the kernel's \"occupancy\" values.","category":"page"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"The rest of the wavefronts are not launched until hardware resources become available, which means that a kernel with better occupancy will see more of its wavefronts executing simultaneously (which often leads to better performance). Suffice to say, it's important to know the occupancy of kernels if you want the best performance.","category":"page"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"Like CUDA.jl, AMDGPU.jl has the ability to calculate kernel occupancy, with the launch_configuration function:","category":"page"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"kernel = @roc launch=false mykernel(args...)\noccupancy = AMDGPU.launch_configuration(kernel)\n@show occupancy.gridsize\n@show occupancy.groupsize","category":"page"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"Specifically, launch_configuration calculates the occupancy of mykernel(args...), and then calculates an optimal groupsize based on the occupancy. This value can then be used to select the groupsize for the kernel:","category":"page"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"@roc groupsize=occupancy.groupsize mykernel(args...)","category":"page"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"AMDGPU.@roc\nAMDGPU.Runtime.HIPKernel\nAMDGPU.Compiler.hipfunction","category":"page"},{"location":"kernel_programming/#AMDGPU.@roc","page":"Kernel Programming","title":"AMDGPU.@roc","text":"@roc [kwargs...] 
func(args...)\n\nHigh-level interface for launching kernels on GPU. Upon a first call it will be compiled, subsequent calls will re-use the compiled object.\n\nSeveral keyword arguments are supported:\n\nlaunch::Bool = true: whether to launch the kernel. If false, then returns a compiled kernel which can be launched by calling it and passing arguments.\nArguments that influence kernel compilation, see AMDGPU.Compiler.hipfunction.\nArguments that influence kernel launch, see AMDGPU.Runtime.HIPKernel.\n\n\n\n\n\n","category":"macro"},{"location":"kernel_programming/#AMDGPU.Runtime.HIPKernel","page":"Kernel Programming","title":"AMDGPU.Runtime.HIPKernel","text":"(ker::HIPKernel)(args::Vararg{Any, N}; kwargs...)\n\nLaunch compiled HIPKernel by passing arguments to it.\n\nThe following kwargs are supported:\n\ngridsize::ROCDim = 1: Size of the grid.\ngroupsize::ROCDim = 1: Size of the workgroup.\nshmem::Integer = 0: Amount of dynamically-allocated shared memory in bytes.\nstream::HIP.HIPStream = AMDGPU.stream(): Stream on which to launch the kernel.\n\n\n\n\n\n","category":"type"},{"location":"kernel_programming/#AMDGPU.Compiler.hipfunction","page":"Kernel Programming","title":"AMDGPU.Compiler.hipfunction","text":"hipfunction(f::F, tt::TT = Tuple{}; kwargs...)\n\nCompile Julia function f to a HIP kernel given a tuple of argument's types tt that it accepts.\n\nThe following kwargs are supported:\n\nname::Union{String, Nothing} = nothing: A unique name to give a compiled kernel.\nunsafe_fp_atomics::Bool = true: Whether to use 'unsafe' floating-point atomics. AMD GPU devices support fast atomic read-modify-write (RMW) operations on floating-point values. On single- or double-precision floating-point values this may generate a hardware RMW instruction that is faster than emulating the atomic operation using an atomic compare-and-swap (CAS) loop.\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#Atomics","page":"Kernel Programming","title":"Atomics","text":"","category":"section"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"AMDGPU.jl relies on Atomix.jl for atomics.","category":"page"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"Example of a kernel that computes atomic max:","category":"page"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel Programming","text":"using AMDGPU\n\nfunction ker_atomic_max!(target, source, indices)\n i = workitemIdx().x + (workgroupIdx().x - 0x1) * workgroupDim().x\n idx = indices[i]\n v = source[i]\n AMDGPU.@atomic max(target[idx], v)\n return\nend\n\nn, bins = 1024, 32\nsource = ROCArray(rand(UInt32, n))\nindices = ROCArray(rand(1:bins, n))\ntarget = ROCArray(zeros(UInt32, bins))\n@roc groupsize=256 gridsize=4 ker_atomic_max!(target, source, indices)","category":"page"},{"location":"kernel_programming/#Device-Intrinsics","page":"Kernel Programming","title":"Device Intrinsics","text":"","category":"section"},{"location":"kernel_programming/#Wavefront-Level-Primitives","page":"Kernel Programming","title":"Wavefront-Level Primitives","text":"","category":"section"},{"location":"kernel_programming/","page":"Kernel Programming","title":"Kernel 
Programming","text":"AMDGPU.Device.wavefrontsize\nAMDGPU.Device.activelane\n\nAMDGPU.Device.ballot\nAMDGPU.Device.ballot_sync\nAMDGPU.Device.activemask\n\nAMDGPU.Device.bpermute\nAMDGPU.Device.permute\n\nAMDGPU.Device.shfl\nAMDGPU.Device.shfl_sync\nAMDGPU.Device.shfl_up\nAMDGPU.Device.shfl_up_sync\nAMDGPU.Device.shfl_down\nAMDGPU.Device.shfl_down_sync\nAMDGPU.Device.shfl_xor\nAMDGPU.Device.shfl_xor_sync\n\nAMDGPU.Device.any_sync\nAMDGPU.Device.all_sync","category":"page"},{"location":"kernel_programming/#AMDGPU.Device.wavefrontsize","page":"Kernel Programming","title":"AMDGPU.Device.wavefrontsize","text":"wavefrontsize()::Cuint\n\nGet the wavefront size of the device that executes current kernel.\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.activelane","page":"Kernel Programming","title":"AMDGPU.Device.activelane","text":"activelane()::Cuint\n\nGet id of the current lane within a wavefront/warp.\n\njulia> function ker!(x)\n i = AMDGPU.Device.activelane()\n x[i + 1] = i\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{Cint}(undef, 1, 8);\n\njulia> @roc groupsize=8 ker!(x);\n\njulia> Array(x)\n1×8 Matrix{Int32}:\n 0 1 2 3 4 5 6 7\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.ballot","page":"Kernel Programming","title":"AMDGPU.Device.ballot","text":"ballot(predicate::Bool)::UInt64\n\nReturn a value whose Nth bit is set if and only if predicate evaluates to true for the Nth lane and the lane is active.\n\njulia> function ker!(x)\n x[1] = AMDGPU.Device.ballot(true)\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{Culong}(undef, 1);\n\njulia> @roc groupsize=32 ker!(x);\n\njulia> x\n1-element ROCArray{UInt64, 1, AMDGPU.Runtime.Mem.HIPBuffer}:\n 0x00000000ffffffff\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.ballot_sync","page":"Kernel Programming","title":"AMDGPU.Device.ballot_sync","text":"ballot_sync(mask::UInt64, predicate::Bool)::UInt64\n\nEvaluate predicate for all non-exited threads in mask and return an integer whose Nth bit is set if and only if predicate is true for the Nth thread of the wavefront and the Nth thread is active.\n\njulia> function ker!(x)\n i = AMDGPU.Device.activelane()\n if i % 2 == 0\n mask = 0x0000000055555555 # Only even threads.\n x[1] = AMDGPU.Device.ballot_sync(mask, true)\n end\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{UInt64}(undef, 1);\n\njulia> @roc groupsize=32 ker!(x);\n\njulia> bitstring(Array(x)[1])\n\"0000000000000000000000000000000001010101010101010101010101010101\"\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.activemask","page":"Kernel Programming","title":"AMDGPU.Device.activemask","text":"activemask()::UInt64\n\nGet the mask of all active lanes in a warp.\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.bpermute","page":"Kernel Programming","title":"AMDGPU.Device.bpermute","text":"bpermute(addr::Integer, val::Cint)::Cint\n\nRead data stored in val from the lane VGPR (vector general purpose register) given by addr.\n\nThe permute instruction moves data between lanes but still uses the notion of byte addressing, as do other LDS instructions. 
Hence, the value in the addr VGPR should be desired_lane_id * 4, since VGPR values are 4 bytes wide.\n\nExample below shifts all values in the wavefront by 1 to the \"left\".\n\njulia> function ker!(x)\n i::Cint = AMDGPU.Device.activelane()\n # `addr` points to the next immediate lane.\n addr = ((i + 1) % 8) * 4 # VGPRs are 4 bytes wide\n # Read data from the next immediate lane.\n x[i + 1] = AMDGPU.Device.bpermute(addr, i)\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{Cint}(undef, 1, 8);\n\njulia> @roc groupsize=8 ker!(x);\n\njulia> x\n1×8 ROCArray{Int32, 2, AMDGPU.Runtime.Mem.HIPBuffer}:\n 1 2 3 4 5 6 7 0\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.permute","page":"Kernel Programming","title":"AMDGPU.Device.permute","text":"permute(addr::Integer, val::Cint)::Cint\n\nPut data stored in val to the lane VGPR (vector general purpose register) given by addr.\n\nExample below shifts all values in the wavefront by 1 to the \"right\".\n\njulia> function ker!(x)\n i::Cint = AMDGPU.Device.activelane()\n # `addr` points to the next immediate lane.\n addr = ((i + 1) % 8) * 4 # VGPRs are 4 bytes wide\n # Put data into the next immediate lane.\n x[i + 1] = AMDGPU.Device.permute(addr, i)\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{Cint}(undef, 1, 8);\n\njulia> @roc groupsize=8 ker!(x);\n\njulia> x\n1×8 ROCArray{Int32, 2, AMDGPU.Runtime.Mem.HIPBuffer}:\n 7 0 1 2 3 4 5 6\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.shfl","page":"Kernel Programming","title":"AMDGPU.Device.shfl","text":"shfl(val, lane, width = wavefrontsize())\n\nRead data stored in val from a lane (this is a more high-level op than bpermute).\n\nIf lane is outside the range [0:width - 1], the value returned corresponds to the value held by the lane modulo width (within the same subsection).\n\njulia> function ker!(x)\n i::UInt32 = AMDGPU.Device.activelane()\n x[i + 1] = AMDGPU.Device.shfl(i, i + 1)\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{UInt32}(undef, 1, 8);\n\njulia> @roc groupsize=8 ker!(x);\n\njulia> Int.(x)\n1×8 ROCArray{Int64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:\n 1 2 3 4 5 6 7 0\n\nIf width is less than wavefront size then each subsection of the wavefront behaves as a separate entity with a starting logical lane ID of 0.\n\njulia> function ker!(x)\n i::UInt32 = AMDGPU.Device.activelane()\n x[i + 1] = AMDGPU.Device.shfl(i, i + 1, 4) # <-- Notice width = 4.\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{UInt32}(undef, 1, 8);\n\njulia> @roc groupsize=8 ker!(x);\n\njulia> Int.(x)\n1×8 ROCArray{Int64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:\n 1 2 3 0 5 6 7 4\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.shfl_sync","page":"Kernel Programming","title":"AMDGPU.Device.shfl_sync","text":"shfl_sync(mask::UInt64, val, lane, width = wavefrontsize())\n\nSynchronize threads according to a mask and read data stored in val from a lane ID.\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.shfl_up","page":"Kernel Programming","title":"AMDGPU.Device.shfl_up","text":"shfl_up(val, δ, width = wavefrontsize())\n\nSame as shfl, but instead of specifying lane ID, accepts δ that is subtracted from the current lane ID. I.e. 
read from a lane with lower ID relative to the caller.\n\njulia> function ker!(x)\n i = AMDGPU.Device.activelane()\n x[i + 1] = AMDGPU.Device.shfl_up(i, 1)\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{Int}(undef, 1, 8);\n\njulia> @roc groupsize=8 ker!(x);\n\njulia> x\n1×8 ROCArray{Int64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:\n 0 0 1 2 3 4 5 6\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.shfl_up_sync","page":"Kernel Programming","title":"AMDGPU.Device.shfl_up_sync","text":"shfl_up_sync(mask::UInt64, val, δ, width = wavefrontsize())\n\nSynchronize threads according to a mask and read data stored in val from a lane with lower ID relative to the caller.\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.shfl_down","page":"Kernel Programming","title":"AMDGPU.Device.shfl_down","text":"shfl_down(val, δ, width = wavefrontsize())\n\nSame as shfl, but instead of specifying lane ID, accepts δ that is added to the current lane ID. I.e. read from a lane with higher ID relative to the caller.\n\njulia> function ker!(x)\n i = AMDGPU.Device.activelane()\n x[i + 1] = AMDGPU.Device.shfl_down(i, 1, 8)\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{Int}(undef, 1, 8);\n\njulia> @roc groupsize=8 ker!(x);\n\njulia> x\n1×8 ROCArray{Int64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:\n 1 2 3 4 5 6 7 7\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.shfl_down_sync","page":"Kernel Programming","title":"AMDGPU.Device.shfl_down_sync","text":"shfl_down_sync(mask::UInt64, val, δ, width = wavefrontsize())\n\nSynchronize threads according to a mask and read data stored in val from a lane with higher ID relative to the caller.\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.shfl_xor","page":"Kernel Programming","title":"AMDGPU.Device.shfl_xor","text":"shfl_xor(val, lane_mask, width = wavefrontsize())\n\nSame as shfl, but instead of specifying lane ID, performs bitwise XOR of the caller's lane ID with the lane_mask.\n\njulia> function ker!(x)\n i = AMDGPU.Device.activelane()\n x[i + 1] = AMDGPU.Device.shfl_xor(i, 1)\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{Int}(undef, 1, 8);\n\njulia> @roc groupsize=8 ker!(x);\n\njulia> x\n1×8 ROCArray{Int64, 2, AMDGPU.Runtime.Mem.HIPBuffer}:\n 1 0 3 2 5 4 7 6\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.shfl_xor_sync","page":"Kernel Programming","title":"AMDGPU.Device.shfl_xor_sync","text":"shfl_xor_sync(mask::UInt64, val, lane_mask, width = wavefrontsize())\n\nSynchronize threads according to a mask and read data stored in val from a lane according to a bitwise XOR of the caller's lane ID with the lane_mask.\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.any_sync","page":"Kernel Programming","title":"AMDGPU.Device.any_sync","text":"any_sync(mask::UInt64, predicate::Bool)::Bool\n\nEvaluate predicate for all non-exited threads in mask and return non-zero if and only if predicate evaluates to non-zero for any of them.\n\njulia> function ker!(x)\n i = AMDGPU.Device.activelane()\n if i % 2 == 0\n mask = 0x0000000055555555 # Only even threads.\n x[1] = AMDGPU.Device.any_sync(mask, i == 0)\n end\n return\n end\nker! 
(generic function with 1 method)\n\njulia> x = ROCArray{Bool}(undef, 1);\n\njulia> @roc groupsize=32 ker!(x);\n\njulia> x\n1-element ROCArray{Bool, 1, AMDGPU.Runtime.Mem.HIPBuffer}:\n 1\n\n\n\n\n\n","category":"function"},{"location":"kernel_programming/#AMDGPU.Device.all_sync","page":"Kernel Programming","title":"AMDGPU.Device.all_sync","text":"all_sync(mask::UInt64, predicate::Bool)::Bool\n\nEvaluate predicate for all non-exited threads in mask and return non-zero if and only if predicate evaluates to non-zero for all of them.\n\njulia> function ker!(x)\n i = AMDGPU.Device.activelane()\n if i % 2 == 0\n mask = 0x0000000055555555 # Only even threads.\n x[1] = AMDGPU.Device.all_sync(mask, true)\n end\n return\n end\nker! (generic function with 1 method)\n\njulia> x = ROCArray{Bool}(undef, 1);\n\njulia> @roc groupsize=32 ker!(x);\n\njulia> x\n1-element ROCArray{Bool, 1, AMDGPU.Runtime.Mem.HIPBuffer}:\n 1\n\n\n\n\n\n","category":"function"}] } diff --git a/dev/streams/index.html b/dev/streams/index.html index a54f7976..738feaf9 100644 --- a/dev/streams/index.html +++ b/dev/streams/index.html @@ -3,12 +3,12 @@ function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'UA-154489943-2', {'page_path': location.pathname + location.search + location.hash}); -

Streams

Similar to CUDA streams, ROCm has HIP streams, which are buffers used to instruct the GPU hardware which kernels to launch. Operations submitted to a HIP stream execute in order on the device, while kernel launches are asynchronous with respect to the host, just like CUDA streams.

Each device has an associated default stream, which is accessible with AMDGPU.stream().

There are several ways to specify which stream to launch a kernel on:

  • Using AMDGPU.stream! to change the default stream to be used within the same Julia task.
stream = AMDGPU.HIPStream()
+

Streams

Similar to CUDA streams, ROCm has HIP streams, which are buffers used to instruct the GPU hardware which kernels to launch. Operations submitted to a HIP stream execute in order on the device, while kernel launches are asynchronous with respect to the host, just like CUDA streams.

Each device has an associated default stream, which is accessible with AMDGPU.stream().

There are several ways to specify which stream to launch a kernel on:

  • Using AMDGPU.stream! to change the default stream to be used within the same Julia task.
stream = AMDGPU.HIPStream()
 AMDGPU.stream!(stream) # Change default stream to be used for subsequent operations.
 AMDGPU.ones(Float32, 16) # Will be executed on `stream`.
  • Using AMDGPU.stream! to execute a given function and reset to the original stream after completion:
stream = AMDGPU.HIPStream()
 x = AMDGPU.stream!(() -> AMDGPU.ones(Float32, 16), stream)
  • Using stream argument to @roc macro:
stream = AMDGPU.HIPStream()
 @roc stream=stream kernel(...)

Streams also have an inherent priority, which allows control of kernel submission latency and on-device scheduling preference with respect to kernels submitted on other streams. There are three priorities: normal (the default), low, and high priority.

The priority of the default stream can be set with AMDGPU.priority!. Alternatively, it can be set at stream creation time:

low_prio = HIPStream(:low)
 high_prio = HIPStream(:high)
-normal_prio = HIPStream(:normal) # or just omit "priority"
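
For example, the priority of the default stream can be raised for a region of code and then restored, or changed only for the duration of a function call (a minimal sketch based on the priority! docstrings below; the kernel call is a placeholder):

AMDGPU.priority!(:high)   # Subsequent work on the default stream is submitted with high priority.
 @roc kernel(...)
 AMDGPU.priority!(:normal) # Revert to the default priority.
 x = AMDGPU.priority!(() -> AMDGPU.ones(Float32, 16), :high) # Function form: reverts automatically.
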
AMDGPU.streamFunction
stream()::HIPStream

Get the HIP stream that should be used as the default one for the currently executing task.

source
AMDGPU.stream!Function
stream!(s::HIPStream)

Change the default stream to be used within the same Julia task.

source
stream!(f::Base.Callable, stream::HIPStream)

Change the default stream to be used within the same Julia task, execute f and revert to the original stream.

Returns:

Return value of the function f.

source
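
Since the function is the first argument, stream! also composes with Julia's do-block syntax (a small usage sketch, not part of the original docstring):

stream = AMDGPU.HIPStream()
 x = AMDGPU.stream!(stream) do
     AMDGPU.ones(Float32, 16) # Executed with `stream` as the default; the previous stream is restored afterwards.
 end
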
AMDGPU.priority!Function
priority!(p::Symbol)

Change the priority of the default stream. Accepted values are :normal (the default), :low and :high.

source
priority!(f::Base.Callable, priority::Symbol)

Change the priority of the default stream, execute f and revert to the original priority. Accepted values are :normal (the default), :low and :high.

Returns:

Return value of the function f.

source
AMDGPU.HIP.HIPStreamType
HIPStream(priority::Symbol = :normal)

Arguments:

  • priority::Symbol: Priority of the stream: :normal, :high or :low.

Create HIPStream with given priority. Device is the default device that's currently in use.

source
HIPStream(stream::hipStream_t)

Create HIPStream from hipStream_t handle. Device is the default device that's currently in use.

source
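
This constructor is mainly useful for interoperating with code that hands you a raw stream handle. A hypothetical sketch, where get_external_hip_stream stands in for some external API that returns a hipStream_t:

raw_handle = get_external_hip_stream() # Hypothetical source of a hipStream_t handle.
 s = AMDGPU.HIP.HIPStream(raw_handle)  # Wrap the handle; uses the currently active device.
 AMDGPU.stream!(s)                     # Optionally make it the default stream for this task.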

Synchronization

AMDGPU.jl by default uses non-blocking stream synchronization with AMDGPU.synchronize to work correctly with TLS and Hostcall.

Users, however, can switch to a blocking synchronization globally with nonblocking_synchronization preference or with fine-grained AMDGPU.synchronize(; blocking=true). Blocking synchronization might offer slightly lower latency.
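
For example, a single stream can be synchronized in blocking mode right after work has been submitted to it (a minimal sketch; the kernel call is a placeholder):

s = AMDGPU.HIPStream()
 @roc stream=s kernel(...)
 AMDGPU.synchronize(s; blocking=true) # Wait until all kernels submitted to `s` have completed.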

You can also synchronize an expression with the AMDGPU.@sync macro, which will execute the given expression and synchronize afterwards (using AMDGPU.synchronize under the hood).

AMDGPU.@sync begin
+normal_prio = HIPStream(:normal) # or just omit "priority"
AMDGPU.streamFunction
stream()::HIPStream

Get the HIP stream that should be used as the default one for the currently executing task.

source
AMDGPU.stream!Function
stream!(s::HIPStream)

Change the default stream to be used within the same Julia task.

source
stream!(f::Base.Callable, stream::HIPStream)

Change the default stream to be used within the same Julia task, execute f and revert to the original stream.

Returns:

Return value of the function f.

source
AMDGPU.priority!Function
priority!(p::Symbol)

Change the priority of the default stream. Accepted values are :normal (the default), :low and :high.

source
priority!(f::Base.Callable, priority::Symbol)

Change the priority of the default stream, execute f and revert to the original priority. Accepted values are :normal (the default), :low and :high.

Returns:

Return value of the function f.

source
AMDGPU.HIP.HIPStreamType
HIPStream(priority::Symbol = :normal)

Arguments:

  • priority::Symbol: Priority of the stream: :normal, :high or :low.

Create HIPStream with given priority. Device is the default device that's currently in use.

source
HIPStream(stream::hipStream_t)

Create HIPStream from hipStream_t handle. Device is the default device that's currently in use.

source

Synchronization

AMDGPU.jl by default uses non-blocking stream synchronization with AMDGPU.synchronize to work correctly with TLS and Hostcall.

Users, however, can switch to a blocking synchronization globally with nonblocking_synchronization preference or with fine-grained AMDGPU.synchronize(; blocking=true). Blocking synchronization might offer slightly lower latency.

You can also synchronize an expression with the AMDGPU.@sync macro, which will execute the given expression and synchronize afterwards (using AMDGPU.synchronize under the hood).

AMDGPU.@sync begin
     @roc ...
-end

Finally, you can perform full device synchronization with AMDGPU.device_synchronize.
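
Unlike AMDGPU.synchronize, which waits on a single stream, device_synchronize waits for work on all streams of the currently active device (a minimal sketch; the kernel calls are placeholders):

s1, s2 = AMDGPU.HIPStream(), AMDGPU.HIPStream()
 @roc stream=s1 kernel(...)
 @roc stream=s2 kernel(...)
 AMDGPU.device_synchronize() # Wait until kernels on both streams (and any others) have completed.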

AMDGPU.synchronizeFunction
synchronize(stream::HIPStream = stream(); blocking::Bool = false)

Wait until all kernels executing on stream have completed.

If there are running HostCalls, then blocking must be false. Additionally, if you want to stop host calls afterwards, pass the stop_hostcalls=true keyword argument.

source
AMDGPU.@syncMacro
@sync ex

Run expression ex on currently active stream and synchronize the GPU on that stream afterwards.

See also: synchronize.

source
+end

Finally, you can perform full device synchronization with AMDGPU.device_synchronize.

AMDGPU.synchronizeFunction
synchronize(stream::HIPStream = stream(); blocking::Bool = false)

Wait until all kernels executing on stream have completed.

If there are running HostCalls, then blocking must be false. Additionally, if you want to stop host calls afterwards, pass the stop_hostcalls=true keyword argument.

source
AMDGPU.@syncMacro
@sync ex

Run expression ex on currently active stream and synchronize the GPU on that stream afterwards.

See also: synchronize.

source