
Commit ce34aac

spcyppt authored and facebook-github-bot committed
Unifying TBE API using List (Frontend) (pytorch#3711)
Summary:
X-link: pytorch/torchrec#2751
X-link: facebookresearch/FBGEMM#793

**Backend**: D68054868

---

As the number of arguments in TBE keeps growing, some of the optimizers run into the argument-count limit (64) during PyTorch operator registration.

**For long-term growth and maintenance, we hence redesign the TBE API by packing some of the arguments into lists. Note that not all arguments are packed.**

We pack the arguments into one list per type. For **common** arguments, we pack
- weights and arguments of type `Momentum` into a TensorList
- other tensors and optional tensors into a list of optional tensors, `aux_tensor`
- `int` arguments into `aux_int`
- `float` arguments into `aux_float`
- `bool` arguments into `aux_bool`.

Similarly, for **optimizer-specific** arguments, we pack
- arguments of type `Momentum` that are *__not__ optional* into a TensorList
- *optional* tensors into a list of optional tensors, `optim_tensor`
- `int` arguments into `optim_int`
- `float` arguments into `optim_float`
- `bool` arguments into `optim_bool`.

We saw issues with PyTorch registration when packing SymInt arguments across the Python/C++ boundary, so SymInt arguments are unrolled and passed individually.

**This significantly reduces the number of arguments.** For example, `split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function`, which currently has 61 arguments, will have only 26 arguments with this API design.

Please refer to the design doc for which arguments are packed and for the full signatures.

Design doc: https://docs.google.com/document/d/1dCBg7dcf7Yq9FHVrvXsAmFtBxkDi9o6u0r-Ptd4UDPE/edit?tab=t.0#heading=h.6bip5pwqq8xb

The full signature for each optimizer lookup function will be provided shortly.

Reviewed By: sryap, nautsimon

Differential Revision: D68055168
1 parent 428e671 commit ce34aac

9 files changed (+343 −318 lines)
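To make the packing concrete, here is a small, self-contained sketch of the convention described in the summary. The list names (`weights`, `aux_tensor`, `aux_int`, `aux_float`, `aux_bool`) follow the summary, but the functions and the placement of individual values inside each list are hypothetical; the real generated signatures are defined by the codegen and the linked design doc.

```python
# Hypothetical sketch of the "pack arguments by type" idea from the commit
# summary; the function names and list contents are illustrative only.
from typing import List, Optional

import torch


# Before: every scalar/tensor is its own argument, so the operator signature
# grows with each new option and eventually hits the 64-argument limit.
def lookup_before_sketch(
    weights: torch.Tensor,
    momentum1: torch.Tensor,
    prev_iter: Optional[torch.Tensor],
    row_counter: Optional[torch.Tensor],
    iter_int: int,
    info_B_num_bits: int,
    eps: float,
    weight_decay: float,
    stochastic_rounding: bool,
    use_homogeneous_placements: bool,
) -> torch.Tensor:
    return weights  # placeholder body


# After: arguments are grouped into one list per type, so adding a new option
# appends to a list instead of adding another operator argument.
def lookup_after_sketch(
    weights: List[torch.Tensor],               # weights + Momentum as a TensorList
    aux_tensor: List[Optional[torch.Tensor]],  # other (optional) tensors
    aux_int: List[int],
    aux_float: List[float],
    aux_bool: List[bool],
) -> torch.Tensor:
    return weights[0]  # placeholder body


# Example: a ten-argument call collapses into five packed arguments.
out = lookup_after_sketch(
    weights=[torch.zeros(4, 8), torch.zeros(4)],
    aux_tensor=[None, None],
    aux_int=[0, 26],
    aux_float=[1e-8, 0.0],
    aux_bool=[True, False],
)
```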

fbgemm_gpu/codegen/genscript/generate_backward_split.py

+1 −1

@@ -447,7 +447,7 @@ def generate() -> None:
         ssd_optimizers.append(optim)
 
         BackwardSplitGenerator.generate_backward_split(
-            ssd_tensors=ssd_tensors, **optimizer
+            ssd_tensors=ssd_tensors, aux_args=aux_args, **optimizer
         )
         BackwardSplitGenerator.generate_rocm_backward_split()

fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu

+1
@@ -601,6 +601,7 @@ Tensor {{ embedding_cuda_op }}(
 
     {%- if "learning_rate" in args.split_kernel_arg_names %}
     // convert `learning rate` to float since `learning rate` is float in kernels
+    TORCH_CHECK(learning_rate_tensor.is_cpu(), "learning_rate_tensor tensor needs to be on CPU. Ensure learning_rate_tensor is on CPU or contact FBGEMM team if you get this error.")
     const float learning_rate = learning_rate_tensor.item<float>();
     {%- endif %}
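For context on the `TORCH_CHECK` added above: the generated operator reads the learning rate on the host via `.item<float>()`, so the frontend keeps the learning rate as a CPU float tensor (see the `learning_rate_tensor` changes below). A minimal frontend-side sketch of the same invariant, assuming only standard PyTorch:

```python
# Sketch of the invariant enforced by the TORCH_CHECK above: keep the learning
# rate as a CPU float tensor so reading it with .item() stays a cheap host-side
# access (calling .item() on a GPU tensor would force a device sync).
import torch

learning_rate_tensor = torch.tensor(0.01, device="cpu", dtype=torch.float)
assert learning_rate_tensor.is_cpu  # mirrors the C++-side check
learning_rate = learning_rate_tensor.item()  # plain Python float for the kernels
```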

fbgemm_gpu/codegen/training/python/split_embedding_codegen_lookup_invoker.template

+283 −293
Large diffs are not rendered by default.

fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops_training.py

+35 −17

@@ -597,7 +597,7 @@ class SplitTableBatchedEmbeddingBagsCodegen(nn.Module):
     """
 
     embedding_specs: List[Tuple[int, int, EmbeddingLocation, ComputeDevice]]
-    optimizer_args: invokers.lookup_args.OptimizerArgs
+    optimizer_args: invokers.lookup_args.OptimizerArgsPT2
     lxu_cache_locations_list: List[Tensor]
     lxu_cache_locations_empty: Tensor
     timesteps_prefetched: List[int]

@@ -926,6 +926,11 @@ def __init__( # noqa C901
             "feature_dims",
             torch.tensor(feature_dims, device="cpu", dtype=torch.int64),
         )
+        (self.info_B_num_bits, self.info_B_mask) = torch.ops.fbgemm.get_infos_metadata(
+            self.D_offsets,  # unused tensor
+            1,  # max_B
+            T,  # T
+        )
 
         # A flag for indicating whether all embedding tables are placed in the
         # same locations

@@ -1070,6 +1075,9 @@ def __init__( # noqa C901
         # which should not be effective when CounterBasedRegularizationDefinition
         # and CowClipDefinition are not used
         counter_halflife = -1
+        learning_rate_tensor = torch.tensor(
+            learning_rate, device=torch.device("cpu"), dtype=torch.float
+        )
 
         # TO DO: Enable this on the new interface
         # learning_rate_tensor = torch.tensor(

@@ -1085,12 +1093,12 @@ def __init__( # noqa C901
                 "`use_rowwise_bias_correction` is only supported for OptimType.ADAM",
             )
 
-        self.optimizer_args = invokers.lookup_args.OptimizerArgs(
+        self.optimizer_args = invokers.lookup_args.OptimizerArgsPT2(
             stochastic_rounding=stochastic_rounding,
             gradient_clipping=gradient_clipping,
             max_gradient=max_gradient,
             max_norm=max_norm,
-            learning_rate=learning_rate,
+            learning_rate_tensor=learning_rate_tensor,
             eps=eps,
             beta1=beta1,
             beta2=beta2,
@@ -1873,7 +1881,7 @@ def forward( # noqa: C901
             if len(self.lxu_cache_locations_list) == 0
             else self.lxu_cache_locations_list.pop(0)
         )
-        common_args = invokers.lookup_args.CommonArgs(
+        common_args = invokers.lookup_args.CommonArgsPT2(
             placeholder_autograd_tensor=self.placeholder_autograd_tensor,
             # pyre-fixme[6]: For 2nd argument expected `Tensor` but got
             # `Union[Module, Tensor]`.

@@ -1920,6 +1928,8 @@ def forward( # noqa: C901
             is_experimental=self.is_experimental,
             use_uniq_cache_locations_bwd=self.use_uniq_cache_locations_bwd,
             use_homogeneous_placements=self.use_homogeneous_placements,
+            info_B_num_bits=self.info_B_num_bits,
+            info_B_mask=self.info_B_mask,
         )
 
         if self.optimizer == OptimType.NONE:

@@ -2032,7 +2042,6 @@ def forward( # noqa: C901
                 momentum1,
                 momentum2,
                 iter_int,
-                self.use_rowwise_bias_correction,
                 row_counter=(
                     row_counter if self.use_rowwise_bias_correction else None
                 ),
@@ -2918,7 +2927,7 @@ def _set_learning_rate(self, lr: float) -> float:
         Helper function to script `set_learning_rate`.
         Note that returning None does not work.
         """
-        self.optimizer_args = self.optimizer_args._replace(learning_rate=lr)
+        self.optimizer_args.learning_rate_tensor.fill_(lr)
         return 0.0
 
     @torch.jit.ignore
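Because the learning rate is now stored as a tensor inside `optimizer_args`, `_set_learning_rate` can update it in place with `fill_` instead of rebuilding the namedtuple with `_replace`. A small standalone sketch of why the in-place update is visible everywhere (the dict below is only a stand-in for `optimizer_args`):

```python
# Sketch: mutating the CPU tensor in place updates the value seen by every
# holder of a reference, without replacing the surrounding container.
import torch

learning_rate_tensor = torch.tensor(0.01, dtype=torch.float)
optimizer_args_stub = {"learning_rate_tensor": learning_rate_tensor}  # stand-in

learning_rate_tensor.fill_(0.25)  # in-place update, as in _set_learning_rate
assert optimizer_args_stub["learning_rate_tensor"].item() == 0.25
```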
@@ -3433,6 +3442,22 @@ def prepare_inputs(
             offsets, batch_size_per_feature_per_rank
         )
 
+        vbe = vbe_metadata.B_offsets is not None
+        # TODO: assert vbe_metadata.B_offsets.numel() - 1 == T
+        #       T = self.D_offsets.numel() - 1
+        #       vbe_metadata.B_offsets causes jit to fail for cogwheel forward compatibility test
+        # max_B = int(vbe_metadata.max_B) if vbe else int(offsets.numel() - 1 / T)
+
+        # TODO: max_B <= self.info_B_mask
+        # cannot use assert as it breaks pt2 compile for dynamic shape
+        # Need to use torch._check for dynamic shape and cannot construct fstring, use constant string.
+        # cannot use lambda as it fails jit script.
+        # torch._check is not supported in jitscript
+        # torch._check(
+        #     max_B <= self.info_B_mask,
+        #     "Not enough infos bits to accommodate T and B.",
+        # )
+
         # TODO: remove this and add an assert after updating
         # bounds_check_indices to support different indices type and offset
         # type

@@ -3460,7 +3485,6 @@ def prepare_inputs(
             per_sample_weights = per_sample_weights.float()
 
         if self.bounds_check_mode_int != BoundsCheckMode.NONE.value:
-            vbe = vbe_metadata.B_offsets is not None
             # Compute B info and VBE metadata for bounds_check_indices only if
             # VBE and bounds check indices v2 are used
             if vbe and self.use_bounds_check_v2:

@@ -3474,11 +3498,7 @@ def prepare_inputs(
                 assert isinstance(
                     output_offsets_feature_rank, Tensor
                 ), "output_offsets_feature_rank must be tensor"
-                info_B_num_bits, info_B_mask = torch.ops.fbgemm.get_infos_metadata(
-                    B_offsets,  # unused tensor
-                    vbe_metadata.max_B,
-                    B_offsets.numel() - 1,  # T
-                )
+
                 row_output_offsets, b_t_map = torch.ops.fbgemm.generate_vbe_metadata(
                     B_offsets,
                     B_offsets_rank_per_feature,

@@ -3487,13 +3507,11 @@ def prepare_inputs(
                     self.max_D,
                     self.is_nobag,
                     vbe_metadata.max_B_feature_rank,
-                    info_B_num_bits,
+                    self.info_B_num_bits,
                     offsets.numel() - 1,  # total_B
                 )
             else:
                 b_t_map = None
-                info_B_num_bits = -1
-                info_B_mask = -1
 
             torch.ops.fbgemm.bounds_check_indices(
                 self.rows_per_table,

@@ -3505,8 +3523,8 @@ def prepare_inputs(
                 B_offsets=vbe_metadata.B_offsets,
                 max_B=vbe_metadata.max_B,
                 b_t_map=b_t_map,
-                info_B_num_bits=info_B_num_bits,
-                info_B_mask=info_B_mask,
+                info_B_num_bits=self.info_B_num_bits,
+                info_B_mask=self.info_B_mask,
                 bounds_check_version=self.bounds_check_version,
             )
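`info_B_num_bits` and `info_B_mask` are now obtained once from `get_infos_metadata` in `__init__` and reused here and in `forward`. The commented-out check above ("Not enough infos bits to accommodate T and B.") reflects the usual bit-packing constraint: if batch indices occupy the low `info_B_num_bits` bits of a packed info word, every batch index must fit under `info_B_mask`. A generic sketch of that constraint follows; the layout is illustrative and not necessarily FBGEMM's exact encoding.

```python
# Generic bit-packing sketch: pack a table index t and a batch index b into one
# integer using num_bits low bits for b. This only illustrates why max_B must
# fit under the mask; the real FBGEMM encoding may differ.
from typing import Tuple

info_B_num_bits = 26
info_B_mask = (1 << info_B_num_bits) - 1  # low 26 bits reserved for b


def pack_info(t: int, b: int) -> int:
    assert b <= info_B_mask, "Not enough info bits to accommodate T and B."
    return (t << info_B_num_bits) | b


def unpack_info(info: int) -> Tuple[int, int]:
    return info >> info_B_num_bits, info & info_B_mask


packed = pack_info(t=3, b=17)
assert unpack_info(packed) == (3, 17)
```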

fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py

+14 −3

@@ -228,6 +228,12 @@ def __init__(
             torch.tensor(feature_dims, device="cpu", dtype=torch.int64),
         )
 
+        (self.info_B_num_bits, self.info_B_mask) = torch.ops.fbgemm.get_infos_metadata(
+            self.D_offsets,  # unused tensor
+            1,  # max_B
+            T,  # T
+        )
+
         assert cache_sets > 0
         element_size = weights_precision.bit_rate() // 8
         assert (

@@ -544,12 +550,15 @@ def __init__(
             )
             cowclip_regularization = CowClipDefinition()
 
-        self.optimizer_args = invokers.lookup_args_ssd.OptimizerArgs(
+        learning_rate_tensor = torch.tensor(
+            learning_rate, device=torch.device("cpu"), dtype=torch.float
+        )
+        self.optimizer_args = invokers.lookup_args_ssd.OptimizerArgsPT2(
             stochastic_rounding=stochastic_rounding,
             gradient_clipping=gradient_clipping,
             max_gradient=max_gradient,
             max_norm=max_norm,
-            learning_rate=learning_rate,
+            learning_rate_tensor=learning_rate_tensor,
             eps=eps,
             beta1=beta1,
             beta2=beta2,

@@ -1630,7 +1639,7 @@ def forward(
             vbe_metadata.max_B,
         )
 
-        common_args = invokers.lookup_args_ssd.CommonArgs(
+        common_args = invokers.lookup_args_ssd.CommonArgsPT2(
             placeholder_autograd_tensor=self.placeholder_autograd_tensor,
             output_dtype=self.output_dtype,
             dev_weights=self.weights_dev,

@@ -1665,6 +1674,8 @@ def forward(
             },
             # pyre-fixme[6]: Expected `lookup_args_ssd.VBEMetadata` but got `lookup_args.VBEMetadata`
             vbe_metadata=vbe_metadata,
+            info_B_num_bits=self.info_B_num_bits,
+            info_B_mask=self.info_B_mask,
         )
 
         self.timesteps_prefetched.pop(0)

fbgemm_gpu/test/tbe/training/backward_adagrad_common.py

+4 −2

@@ -82,8 +82,10 @@
     "use_cache": st.booleans(),
     "cache_algorithm": st.sampled_from(CacheAlgorithm),
     "use_cpu": use_cpu_strategy(),
-    "output_dtype": st.sampled_from(
-        [SparseType.FP32, SparseType.FP16, SparseType.BF16]
+    "output_dtype": (
+        st.sampled_from([SparseType.FP32, SparseType.FP16, SparseType.BF16])
+        if gpu_available
+        else st.sampled_from([SparseType.FP32, SparseType.FP16])
     ),
 }
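The strategy change above limits BF16 outputs to runs where a GPU is available. The same pattern shown standalone with `hypothesis` (here `gpu_available` is derived from `torch.cuda.is_available()` as a stand-in for the test utility's flag, and plain strings stand in for `SparseType`):

```python
# Standalone sketch of choosing a hypothesis strategy based on the environment,
# mirroring the output_dtype change above.
import torch
from hypothesis import given, strategies as st

gpu_available = torch.cuda.is_available()  # stand-in for the test utility flag

output_dtype_strategy = (
    st.sampled_from(["FP32", "FP16", "BF16"])
    if gpu_available
    else st.sampled_from(["FP32", "FP16"])
)


@given(output_dtype=output_dtype_strategy)
def test_output_dtype_choice(output_dtype: str) -> None:
    assert output_dtype in {"FP32", "FP16", "BF16"}
```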

fbgemm_gpu/test/tbe/training/backward_adagrad_global_weight_decay_test.py

+1 −1

@@ -190,7 +190,7 @@ def apply_gwd(
         apply_gwd_per_table(
             prev_iter_values,
             weights_values,
-            emb.optimizer_args.learning_rate,
+            emb.optimizer_args.learning_rate_tensor.item(),
             emb.optimizer_args.weight_decay,
             step,
             emb.current_device,

fbgemm_gpu/test/tbe/training/forward_test.py

+3

@@ -76,6 +76,9 @@
     "test_faketensor__test_forward_gpu_uvm_cache_int8": [
         unittest.skip("Operator not implemented for Meta tensors"),
     ],
+    "test_faketensor__test_forward_cpu_fp32": [
+        unittest.skip("Operator not implemented for Meta tensors"),
+    ],
     # TODO: Make it compatible with opcheck tests
     "test_faketensor__test_forward_gpu_uvm_cache_fp16": [
         unittest.skip(

fbgemm_gpu/test/tbe/utils/split_embeddings_test.py

+1 −1

@@ -594,7 +594,7 @@ def test_update_hyper_parameters(self) -> None:
         } | {"lr": 1.0, "lower_bound": 2.0}
         cc.update_hyper_parameters(updated_parameters)
         self.assertAlmostEqual(
-            cc.optimizer_args.learning_rate, updated_parameters["lr"]
+            cc.optimizer_args.learning_rate_tensor.item(), updated_parameters["lr"]
         )
         self.assertAlmostEqual(cc.optimizer_args.eps, updated_parameters["eps"])
         self.assertAlmostEqual(cc.optimizer_args.beta1, updated_parameters["beta1"])
