
Commit 3e5dc36

spcyppt authored and facebook-github-bot committed
Enable int32_t support for reshape_vbe_offsets (pytorch#3782)
Summary:
- Enable int32_t support for reshape_vbe_offsets
- Fix setting learning_rate as learning_rate_tensor in ssd

X-link: facebookresearch/FBGEMM#866
Reviewed By: nautsimon
Differential Revision: D70760386
1 parent de35b3c commit 3e5dc36

File tree: 4 files changed, +37 -21 lines
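At the core of the C++ changes, the hard requirement that offsets be int64 (a TORCH_CHECK against at::kLong) is replaced by a templated reshape_vbe_offsets&lt;index_t&gt;, which the CPU wrappers now call through AT_DISPATCH_INDEX_TYPES so that int32 offsets take the same path. A minimal, self-contained sketch of that dispatch pattern follows; the helper name copy_offsets is hypothetical and only illustrates the mechanism, it is not part of FBGEMM.

// Sketch of the AT_DISPATCH_INDEX_TYPES pattern used by this commit.
// `copy_offsets` is a hypothetical helper; AT_DISPATCH_INDEX_TYPES,
// at::empty_like, and Tensor::accessor are real ATen APIs.
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>

template <typename index_t>
at::Tensor copy_offsets(const at::Tensor& offsets) {
  // Works for whichever index type the caller dispatched to (int32_t or int64_t).
  auto out = at::empty_like(offsets);
  const auto in_acc = offsets.accessor<index_t, 1>();
  auto out_acc = out.accessor<index_t, 1>();
  for (int64_t i = 0; i < offsets.numel(); ++i) {
    out_acc[i] = in_acc[i];
  }
  return out;
}

at::Tensor copy_offsets_any_index_type(const at::Tensor& offsets) {
  at::Tensor out;
  // Expands to a switch over at::kInt / at::kLong and defines `index_t` inside
  // the lambda, mirroring the reshape_vbe_offsets<index_t> calls in the diff.
  AT_DISPATCH_INDEX_TYPES(offsets.scalar_type(), "copy_offsets", [&] {
    out = copy_offsets<index_t>(offsets);
  });
  return out;
}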

fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_cpu_wrapper_template.cpp

+20-8
@@ -26,6 +26,9 @@
 #include "fbgemm_gpu/utils/ops_utils.h"
 #include "fbgemm_gpu/utils/dispatch_macros.h"
 #include "fbgemm_gpu/embedding_common.h"
+// #include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/TensorUtils.h>
 {%- if has_vbe_support %}
 #include "fbgemm_gpu/utils/pt2_autograd_utils.h"
 {%- endif %}
@@ -64,12 +67,15 @@ Tensor split_embedding_codegen_grad_indice_weights{{ vdesc }}_pt2_cpu_wrapper(
 {%- endif %}
 ) {
 {%- if vbe %}
-  const auto offsets_ = reshape_vbe_offsets(
-      offsets,
-      vbe_B_offsets_rank_per_feature,
-      max_B,
-      D_offsets.numel() - 1
-  );
+  Tensor offsets_;
+  AT_DISPATCH_INDEX_TYPES(offsets.scalar_type(), "reshape_vbe_offsets_cpu_grad_indices", [&]() {
+    offsets_ = reshape_vbe_offsets<index_t>(
+        offsets,
+        vbe_B_offsets_rank_per_feature,
+        max_B,
+        D_offsets.numel() - 1
+    );
+  });
   const auto grad_output_ = reshape_vbe_output(
       grad_output,
       max_B,
@@ -126,8 +132,11 @@ Tensor split_embedding_codegen_forward_{{ wdesc }}{{ vdesc }}_pt2_cpu_wrapper(
 {%- endif %}
     const bool /*is_experimental = false*/,
     const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32)) {
+  Tensor offsets_;
 {%- if vbe %}
-  const auto offsets_ = reshape_vbe_offsets(offsets, vbe_B_offsets_rank_per_feature, max_B, D_offsets.numel() - 1);
+  AT_DISPATCH_INDEX_TYPES(offsets.scalar_type(), "reshape_vbe_offsets_cpu_forward", [&]() {
+    offsets_ = reshape_vbe_offsets<index_t>(offsets, vbe_B_offsets_rank_per_feature, max_B, D_offsets.numel() - 1);
+  });
 {%- endif %}
   static auto op =
       torch::Dispatcher::singleton()
@@ -226,7 +235,10 @@ Tensor split_embedding_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vdesc }}_p
 {%- endif %})
 {
 {%- if vbe %}
-  const auto offsets_ = reshape_vbe_offsets(offsets, vbe_B_offsets_rank_per_feature, max_B, D_offsets.numel() - 1);
+  Tensor offsets_;
+  AT_DISPATCH_INDEX_TYPES(offsets.scalar_type(), "reshape_vbe_offsets_cpu_backward", [&]() {
+    offsets_ = reshape_vbe_offsets<index_t>(offsets, vbe_B_offsets_rank_per_feature, max_B, D_offsets.numel() - 1);
+  });
   const auto grad_output_ = reshape_vbe_output(grad_output, max_B, vbe_B_offsets_rank_per_feature, D_offsets);
 {%- endif %}
 {%- set backward_op = "split_embedding_backward_codegen_{}_cpu".format(

fbgemm_gpu/codegen/training/pt2/pt2_autograd_utils.cpp

+15-6
@@ -113,6 +113,7 @@ void checked_memcpy(
 /// size(1) is number of ranks
 /// @param max_B Maximum batch size
 /// @param T Number of embedding tables (features)
+template <typename index_t>
 Tensor reshape_vbe_offsets(
     const Tensor& offsets,
     const Tensor& B_offsets_rank_per_feature,
@@ -125,12 +126,8 @@ Tensor reshape_vbe_offsets(
       B_offsets_rank_per_feature.accessor<int32_t, 2>();
   auto reshaped_offsets = at::empty({T * max_B + 1}, offsets.options());
   // TODO: support other types
-  TORCH_CHECK(
-      offsets.dtype() == at::kLong,
-      "Expected offsets to be int64 but got ",
-      offsets.dtype());
-  auto reshaped_offsets_acc = reshaped_offsets.accessor<int64_t, 1>();
-  auto offsets_acc = offsets.accessor<int64_t, 1>();
+  auto reshaped_offsets_acc = reshaped_offsets.accessor<index_t, 1>();
+  auto offsets_acc = offsets.accessor<index_t, 1>();
   auto begin = 0;
   for (int32_t t = 0; t < T; t++) {
     const auto batch_size =
@@ -167,4 +164,16 @@ Tensor reshape_vbe_offsets(
   return reshaped_offsets;
 }

+template Tensor reshape_vbe_offsets<int32_t>(
+    const Tensor& offsets,
+    const Tensor& B_offsets_rank_per_feature,
+    const int64_t max_B,
+    const int32_t T);
+
+template Tensor reshape_vbe_offsets<int64_t>(
+    const Tensor& offsets,
+    const Tensor& B_offsets_rank_per_feature,
+    const int64_t max_B,
+    const int32_t T);
+
 } // namespace fbgemm_gpu
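Because the reshape_vbe_offsets&lt;index_t&gt; definition stays in this .cpp while callers only see the declaration in pt2_autograd_utils.h, the two explicit instantiations above are what make the int32_t and int64_t symbols available at link time. A small sketch of that header/source split, using a hypothetical sum_offsets helper rather than the FBGEMM function:

// sum_offsets.h -- declaration only (hypothetical example, not FBGEMM code)
#include <ATen/ATen.h>

template <typename index_t>
int64_t sum_offsets(const at::Tensor& offsets);

// sum_offsets.cpp -- definition plus explicit instantiations for the supported index types
template <typename index_t>
int64_t sum_offsets(const at::Tensor& offsets) {
  const auto acc = offsets.accessor<index_t, 1>();
  int64_t total = 0;
  for (int64_t i = 0; i < offsets.numel(); ++i) {
    total += static_cast<int64_t>(acc[i]);
  }
  return total;
}

// Without these, a caller that includes only the header hits an undefined-symbol error.
template int64_t sum_offsets<int32_t>(const at::Tensor& offsets);
template int64_t sum_offsets<int64_t>(const at::Tensor& offsets);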

fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py

+1-1
@@ -1826,7 +1826,7 @@ def _set_learning_rate(self, lr: float) -> float:
         Helper function to script `set_learning_rate`.
         Note that returning None does not work.
         """
-        self.optimizer_args = self.optimizer_args._replace(learning_rate=lr)
+        self.optimizer_args.learning_rate_tensor.fill_(lr)
         return 0.0

     def flush(self) -> None:

fbgemm_gpu/include/fbgemm_gpu/utils/pt2_autograd_utils.h

+1-6
@@ -8,12 +8,6 @@

 #include <ATen/ATen.h>
 #include <ATen/TypeDefault.h>
-// #include <ATen/core/op_registration/op_registration.h>
-// #include <torch/script.h>
-// #include "fbgemm_gpu/embedding_common.h"
-// #include "fbgemm_gpu/utils/dispatch_macros.h"
-// #include "fbgemm_gpu/utils/ops_utils.h"
-// #include "fbgemm_gpu/utils/tensor_utils.h"

 using Tensor = at::Tensor;

@@ -29,6 +23,7 @@ Tensor reshape_vbe_output(
     const Tensor& B_offsets_rank_per_feature,
     const Tensor& D_offsets);

+template <typename index_t>
 Tensor reshape_vbe_offsets(
     const Tensor& offsets,
     const Tensor& B_offsets_rank_per_feature,
