From 9ba11951d4d60c9f23f2ad4bb51ce10c533988ea Mon Sep 17 00:00:00 2001
From: Charles Kawczynski
Date: Wed, 26 Feb 2025 13:32:57 -0500
Subject: [PATCH] wip, adding docs

---
 docs/make.jl             |   5 ++++-
 docs/src/shmem_design.md | 117 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+), 1 deletion(-)
 create mode 100644 docs/src/shmem_design.md

diff --git a/docs/make.jl b/docs/make.jl
index fb7d996735..6ae1a759a2 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -80,7 +80,10 @@ withenv("GKSwstype" => "nul") do
         "Remapping" => "remapping.md",
         "MatrixFields" => "matrix_fields.md",
         "API" => "api.md",
-        "Developer docs" => ["Performance tips" => "performance_tips.md"],
+        "Developer docs" => [
+            "Performance tips" => "performance_tips.md",
+            "Shared memory design" => "shmem_design.md",
+        ],
         "Tutorials" => [
             joinpath("tutorials", tutorial * ".md") for
             tutorial in TUTORIALS
diff --git a/docs/src/shmem_design.md b/docs/src/shmem_design.md
new file mode 100644
index 0000000000..e9da4ef986
--- /dev/null
+++ b/docs/src/shmem_design.md
@@ -0,0 +1,117 @@
+# Shared memory design
+
+ClimaCore stencil operators support staggered (or collocated) finite
+difference operations. For example, the `DivergenceF2C` operator takes an
+argument that lives on the cell faces, and the resulting divergence lives on
+the cell centers.
+
+## Motivation
+
+A naive and simplified implementation of this operation looks like
+`div[i] = (f[i+1] - f[i]) / dz[i]`. On the GPU (or CPU), such a calculation
+reads each `f[i]` from global memory twice: once to compute `div[i]` and once
+to compute `div[i-1]`. Moreover, if `f` is a `Broadcasted` object
+(`Broadcasted` objects behave like arrays and support `f[i]` indexing), then
+evaluating `f[i]` may itself require several reads and/or computations.
+
+Reading data from global memory is often the main bottleneck in
+bandwidth-limited CUDA kernels. As such, we use shared memory ("shmem" for
+short) to reduce the number of global memory reads (and repeated computation)
+in our kernels; a kernel-level sketch of this pattern appears at the end of
+this page.
+
+## High-level design
+
+The high-level view of the design is:
+
+  - The `bc::StencilBroadcasted` type has a `work` field, which is used to
+    store shared memory for the `bc.op` operator. The element type of `work`
+    (or of each part of `work`, if it has multiple parts) is the type
+    returned by the operator's `Operator.return_eltype`.
+  - Recursively reconstruct the broadcasted object, allocating shared memory
+    along the way for each `StencilBroadcasted` that supports it (different
+    operators require different arguments, and therefore different types and
+    amounts of shared memory).
+  - Recursively fill the shared memory for all `StencilBroadcasted` objects
+    by reading the argument data via `getidx`.
+  - The destination field is filled with the result of `getidx` (just as it
+    is without shmem), except that `getidx` is overloaded (for supported
+    `StencilBroadcasted` types) to compute its result via
+    `fd_operator_evaluate`, which reads from shmem instead of global memory.
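+
+## Example: staging face values in shared memory
+
+The following is a minimal, self-contained CUDA.jl sketch of the staging
+pattern described above. It is **not** ClimaCore's actual kernel, and the
+names in it (`div_f2c_shmem_kernel!`, `f_sh`) are illustrative: one block
+of `n` threads computes `n` cell-center divergences from `n + 1` face
+values, loading each face value from global memory exactly once.
+
+```julia
+using CUDA
+
+function div_f2c_shmem_kernel!(div, f, dz)
+    i = threadIdx().x
+    # Stage the face values this block needs into shared memory.
+    f_sh = CuStaticSharedArray(Float64, 257) # n + 1 faces for n = 256 centers
+    f_sh[i] = f[i]             # each thread loads one face value...
+    if i == blockDim().x
+        f_sh[i + 1] = f[i + 1] # ...and the last thread loads the extra face
+    end
+    sync_threads()             # make the staged values visible to all threads
+    # The stencil now reads faces from shmem rather than global memory.
+    div[i] = (f_sh[i + 1] - f_sh[i]) / dz[i]
+    return nothing
+end
+
+n = 256
+f = CUDA.rand(Float64, n + 1)  # face values
+dz = CUDA.fill(1.0, n)         # cell heights
+div = CUDA.zeros(Float64, n)   # cell-center result
+@cuda threads = n div_f2c_shmem_kernel!(div, f, dz)
+```
+
+Without the staging step, each interior face value would be read from
+global memory twice (once for `div[i]` and once for `div[i - 1]`); with
+it, each face value is read from global memory once per block.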
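+
+## Example: a toy model of the recursive scheme
+
+The toy CPU model below illustrates the "reconstruct, fill, then read back
+through `getidx`" steps listed above. All names in it (`Stencil`,
+`fill_work!`, and this simplified `getidx`) are hypothetical stand-ins;
+ClimaCore's actual types and signatures differ.
+
+```julia
+# A stencil node: an operator, its (possibly nested) arguments, and a
+# `work` buffer standing in for shared memory.
+struct Stencil{O, A}
+    op::O
+    args::A
+    work::Vector{Float64}
+end
+
+# Leaves (plain arrays) are read directly; stencil nodes are read from
+# their `work` buffer, i.e. from "shmem", rather than recomputed.
+getidx(x::AbstractVector, i) = x[i]
+getidx(s::Stencil, i) = s.work[i]
+
+# Fill the `work` buffers bottom-up: children first, then this node.
+function fill_work!(s::Stencil)
+    foreach(a -> a isa Stencil && fill_work!(a), s.args)
+    for i in eachindex(s.work)
+        s.work[i] = s.op(s.args..., i)
+    end
+    return s
+end
+
+# A face-to-center divergence written as a stencil operator over face
+# values `f` and cell heights `dz`.
+divf2c(f, dz, i) = (getidx(f, i + 1) - getidx(f, i)) / getidx(dz, i)
+
+n = 5
+f = collect(1.0:(n + 1))   # face values: 1.0, 2.0, ..., 6.0
+dz = ones(n)               # cell heights
+div = fill_work!(Stencil(divf2c, (f, dz), zeros(n)))
+getidx(div, 3)             # == 1.0, read from the work buffer
+```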