Adapt the Gaussian data module to the new noising paradigm.

rousseab · rousseab · commit a26f78f22692 · 2025-01-29T15:55:46.000-05:00
diff --git a/src/diffusion_for_multi_scale_molecular_dynamics/data/diffusion/gaussian_data_module.py b/src/diffusion_for_multi_scale_molecular_dynamics/data/diffusion/gaussian_data_module.py
@@ -9,10 +9,14 @@
 
 from diffusion_for_multi_scale_molecular_dynamics.data.diffusion.data_module_parameters import \
     DataModuleParameters
+from diffusion_for_multi_scale_molecular_dynamics.data.diffusion.noising_transform import \
+    NoisingTransform
 from diffusion_for_multi_scale_molecular_dynamics.data.element_types import \
     ElementTypes
 from diffusion_for_multi_scale_molecular_dynamics.namespace import (
     ATOM_TYPES, CARTESIAN_FORCES, RELATIVE_COORDINATES)
+from diffusion_for_multi_scale_molecular_dynamics.noise_schedulers.noise_parameters import \
+    NoiseParameters
 from diffusion_for_multi_scale_molecular_dynamics.utils.basis_transformations import \
     map_relative_coordinates_to_unit_cell
 
@@ -24,6 +28,9 @@ class GaussianDataModuleParameters(DataModuleParameters):
     """Hyper-parameters for a Gaussian, in memory data module."""
     data_source = "gaussian"
 
+    noise_parameters: NoiseParameters
+    use_optimal_transport: bool
+
     random_seed: int
     # the number of atoms in a configuration.
     number_of_atoms: int
@@ -42,14 +49,18 @@ def __post_init__(self):
         """Post init."""
         assert self.sigma_d > 0.0, "the sigma_d parameter should be positive."
 
-        assert len(self.equilibrium_relative_coordinates) == self.number_of_atoms, \
-            "There should be exactly one list of equilibrium coordinates per atom."
+        assert (
+            len(self.equilibrium_relative_coordinates) == self.number_of_atoms
+        ), "There should be exactly one list of equilibrium coordinates per atom."
 
         for x in self.equilibrium_relative_coordinates:
-            assert len(x) == self.spatial_dimension, \
-                "The equilibrium coordinates should be consistent with the spatial dimension."
+            assert (
+                len(x) == self.spatial_dimension
+            ), "The equilibrium coordinates should be consistent with the spatial dimension."
 
-        assert len(self.elements) == 1, "There can only be one element type for the gaussian data module."
+        assert (
+            len(self.elements) == 1
+        ), "There can only be one element type for the gaussian data module."
 
 
 class GaussianDataModule(pl.LightningDataModule):
@@ -69,8 +80,9 @@ def __init__(
         self.number_of_atoms = hyper_params.number_of_atoms
         self.spatial_dimension = hyper_params.spatial_dimension
         self.sigma_d = hyper_params.sigma_d
-        self.equilibrium_coordinates = torch.tensor(hyper_params.equilibrium_relative_coordinates,
-                                                    dtype=torch.float)
+        self.equilibrium_coordinates = torch.tensor(
+            hyper_params.equilibrium_relative_coordinates, dtype=torch.float
+        )
 
         self.train_dataset_size = hyper_params.train_dataset_size
         self.valid_dataset_size = hyper_params.valid_dataset_size
@@ -85,6 +97,41 @@ def __init__(
 
         self.element_types = ElementTypes(hyper_params.elements)
 
+        self.noising_transform = NoisingTransform(
+            noise_parameters=hyper_params.noise_parameters,
+            num_atom_types=len(hyper_params.elements),
+            spatial_dimension=self.spatial_dimension,
+            use_optimal_transport=hyper_params.use_optimal_transport,
+        )
+
+    def get_raw_dataset(self, batch_size: int, rng: torch.Generator):
+        """Sample batch_size configurations from a Gaussian centered on the equilibrium coordinates."""
+        box = torch.ones(batch_size, self.spatial_dimension, dtype=torch.float)  # one all-ones row per sample
+        atom_types = torch.zeros(batch_size, self.number_of_atoms, dtype=torch.long)  # type index 0 everywhere
+
+        mean = einops.repeat(
+            self.equilibrium_coordinates,
+            "natoms space -> batch natoms space",
+            batch=batch_size,
+        )
+        std = self.sigma_d * torch.ones_like(mean)  # same sigma_d for every coordinate
+        relative_coordinates = map_relative_coordinates_to_unit_cell(  # presumably wraps into the unit cell - confirm
+            torch.normal(mean=mean, std=std, generator=rng).to(torch.float)
+        )
+
+        natoms = self.number_of_atoms * torch.ones(batch_size)  # NOTE(review): default float dtype, not int - confirm
+        potential_energy = torch.zeros(batch_size)  # zero energy for every sample
+
+        raw_dataset = {
+            "natom": natoms,
+            "box": box,
+            RELATIVE_COORDINATES: relative_coordinates,
+            ATOM_TYPES: atom_types,
+            CARTESIAN_FORCES: torch.zeros_like(relative_coordinates),  # zero forces for every atom
+            "potential_energy": potential_energy,
+        }
+        return raw_dataset
+
     def setup(self, stage: Optional[str] = None):
         """Setup method."""
         self.train_dataset = []
@@ -93,29 +140,19 @@ def setup(self, stage: Optional[str] = None):
         rng = torch.Generator()
         rng.manual_seed(self.random_seed)
 
-        box = torch.ones(self.spatial_dimension, dtype=torch.float)
-
-        atom_types = torch.zeros(self.number_of_atoms, dtype=torch.long)
-
-        for dataset, batch_size in zip([self.train_dataset, self.valid_dataset],
-                                       [self.train_dataset_size, self.valid_dataset_size]):
-
-            mean = einops.repeat(self.equilibrium_coordinates,
-                                 "natoms space -> batch natoms space", batch=batch_size)
-            std = self.sigma_d * torch.ones_like(mean)
-            relative_coordinates = map_relative_coordinates_to_unit_cell(
-                torch.normal(mean=mean, std=std, generator=rng).to(torch.float))
-
-            for x in relative_coordinates:
-                row = {
-                    "natom": self.number_of_atoms,
-                    "box": box,
-                    RELATIVE_COORDINATES: x,
-                    ATOM_TYPES: atom_types,
-                    CARTESIAN_FORCES: torch.zeros_like(x),
-                    "potential_energy": torch.tensor([0.0], dtype=torch.float),
-                }
-                dataset.append(row)
+        for dataset, batch_size in zip(
+            [self.train_dataset, self.valid_dataset],
+            [self.train_dataset_size, self.valid_dataset_size],
+        ):
+
+            raw_dataset_as_single_batch = self.get_raw_dataset(batch_size, rng)
+            dataset_as_single_batch = self.noising_transform.transform(
+                raw_dataset_as_single_batch
+            )
+
+            keys = dataset_as_single_batch.keys()
+            for idx in range(batch_size):
+                dataset.append({key: dataset_as_single_batch[key][idx] for key in keys})
 
     def train_dataloader(self) -> DataLoader:
         """Create the training dataloader using the training data parser."""
diff --git a/src/diffusion_for_multi_scale_molecular_dynamics/data/diffusion/instantiate_data_module.py b/src/diffusion_for_multi_scale_molecular_dynamics/data/diffusion/instantiate_data_module.py
@@ -46,7 +46,9 @@ def load_data_module(hyper_params: Dict[AnyStr, Any], args: argparse.Namespace)
                                                        working_cache_dir=args.dataset_working_dir)
 
         case "gaussian":
-            data_params = GaussianDataModuleParameters(**data_config, elements=hyper_params["elements"])
+            data_params = GaussianDataModuleParameters(**data_config,
+                                                       noise_parameters=noise_parameters,
+                                                       elements=hyper_params["elements"])
             data_module = GaussianDataModule(data_params)
         case _:
             raise NotImplementedError(
diff --git a/tests/data/diffusion/test_gaussian_data_module.py b/tests/data/diffusion/test_gaussian_data_module.py
@@ -5,6 +5,8 @@
     GaussianDataModule, GaussianDataModuleParameters)
 from diffusion_for_multi_scale_molecular_dynamics.namespace import \
     RELATIVE_COORDINATES
+from diffusion_for_multi_scale_molecular_dynamics.noise_schedulers.noise_parameters import \
+    NoiseParameters
 
 
 class TestGaussianDataModule:
@@ -33,17 +35,23 @@ def spatial_dimension(self):
     def sigma_d(self):
         return 0.01
 
+    @pytest.fixture(params=[True, False])  # dependent tests run once per value
+    def use_optimal_transport(self, request):
+        return request.param
+
     @pytest.fixture()
     def equilibrium_relative_coordinates(self, number_of_atoms, spatial_dimension):
         list_x = torch.rand(number_of_atoms, spatial_dimension)  # uniform samples in [0, 1)
         equilibrium_relative_coordinates = [list(x) for x in list_x.numpy()]  # one inner list per atom
         return equilibrium_relative_coordinates
 
     @pytest.fixture
-    def data_module_hyperparameters(self, batch_size, train_dataset_size, valid_dataset_size,
+    def data_module_hyperparameters(self, batch_size, train_dataset_size, valid_dataset_size, use_optimal_transport,
                                     number_of_atoms, spatial_dimension, sigma_d, equilibrium_relative_coordinates):
         return GaussianDataModuleParameters(
             batch_size=batch_size,
+            noise_parameters=NoiseParameters(total_time_steps=10),
+            use_optimal_transport=use_optimal_transport,
             random_seed=42,
             num_workers=0,
             sigma_d=sigma_d,