config.py
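"""Configuration settings for training a 3D point cloud adversarial autoencoder (AAE)
on ADIOS-aggregated data (point clouds plus rmsd/fnc scalars)."""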
from pathlib import Path
from typing import List, Optional
from mdlearn.utils import BaseSettings, OptimizerConfig


class Point3dAAEConfig(BaseSettings):
    # File paths
    # Path to ADIOS file
    input_path: Path = Path("./")
    # Path to directory where trainer should write to (cannot already exist)
    output_path: Path = Path("./")
    # Optionally resume training from a checkpoint file
    resume_checkpoint: Optional[Path] = None
    # Storage type, disk | memory
    storage_type: str = "disk"
    dataset_name: str = "point_cloud"
    rmsd_name: str = "rmsd"
    fnc_name: str = "fnc"
    # Number of points per sample. Should be smaller than or equal
    # to the total number of points.
    num_points: int = 200
    # Number of additional per-point features in addition to xyz coords.
    num_features: int = 0
    # Names of scalar datasets.
    scalar_dset_names: List[str] = []
    # If True, subtract center of mass from batch and shift and scale
    # batch by the full dataset statistics.
    cms_transform: bool = True
    # Sets the requires_grad attribute of the torch.Tensor for scalars
    # specified by scalar_dset_names. Set to True to use the scalars for
    # multi-task learning. If the scalars are only needed for plotting,
    # set it to False.
    scalar_requires_grad: bool = False
    # Percentage of data to be used as training data after a random split.
    split_pct: float = 0.8
    # Random seed for shuffling train/validation data
    seed: int = 333
    # Whether or not to shuffle train/validation data
    shuffle: bool = True
    # Number of epochs to train
    epochs: int = 30
    # Training batch size
    batch_size: int = 32
    # Pretrained model weights
    init_weights: Optional[str] = ""
    # AE (encoder/decoder) optimizer params
    ae_optimizer: OptimizerConfig = OptimizerConfig(name="Adam", hparams={"lr": 0.0001})
    # Discriminator optimizer params
    disc_optimizer: OptimizerConfig = OptimizerConfig(
        name="Adam", hparams={"lr": 0.0001}
    )
    # Model hyperparameters
    latent_dim: int = 16
    encoder_bias: bool = True
    encoder_relu_slope: float = 0.0
    encoder_filters: List[int] = [64, 128, 256, 256, 512]
    encoder_kernels: List[int] = [5, 5, 3, 1, 1]
    decoder_bias: bool = True
    decoder_relu_slope: float = 0.0
    decoder_affine_widths: List[int] = [64, 128, 512, 1024]
    discriminator_bias: bool = True
    discriminator_relu_slope: float = 0.0
    discriminator_affine_widths: List[int] = [512, 128, 64]
    # Mean of the prior distribution
    noise_mu: float = 0.0
    # Standard deviation of the prior distribution
    noise_std: float = 1.0
    # Relative weight to put on the gradient penalty
    lambda_gp: float = 10.0
    # Relative weight to put on the reconstruction loss
    lambda_rec: float = 0.5
    # Training settings
    # Number of data loaders for training
    num_data_workers: int = 16
    # Number of samples loaded in advance by each worker
    prefetch_factor: int = 2
    # Log checkpoint file every `checkpoint_log_every` epochs
    # checkpoint_log_every: int = 1
    # Log latent space plot every `plot_log_every` epochs
    # plot_log_every: int = 1
    # Validation settings
    # Method used to visualize latent space
    # plot_method: str = "TSNE"
    # Number of validation samples to run visualization on
    # plot_n_samples: Optional[int] = None
    # Minimum number of steps in each aggregated file before the model is trained
    min_step_increment: int = 5000
    # Take up to this number of samples from each aggregated file to train the model
    max_steps: int = 8000
    # If the loss is greater than this, do not publish the model; retrain it
    # from scratch at the next iteration regardless of the `reinit` value
    max_loss: int = 10000
    # Number of aggregators
    num_agg: int = 12
    # If num_agg ADIOS aggregated files are not available, sleep for timeout1 before trying again
    timeout1: int = 30
    # If fewer than min_step_increment steps are available in each aggregated file, sleep for timeout2 before trying again
    timeout2: int = 10
    # Directory with aggregated task subdirectories
    agg_dir: Path = Path()
    # Where to publish a trained model for the outlier search to pick up
    published_model_dir: Path = Path("./")
    # Temporary directory with model checkpoints
    checkpoint_dir: Path = Path("./")
    # ADIOS XML configuration file for aggregators
    adios_xml_agg: Path = Path("./")
    adios_xml_agg_4ml: Path = Path("./")
    # Retrain the model from scratch at each iteration or start with the previously trained model
    reinit: bool = False
    use_model_checkpoint: bool = True
    read_batch: int = 10000
    experiment_directory: Path = Path("./")
    init_weights_path: Path = Path("./")
    model: str = "aae"
    model_tag: str = "aae"
    node_local_path: Path = Path("/tmp")
    stage_idx: int = 0
    task_idx: int = 0
    embed_interval: int = 1
    tsne_interval: int = 5
    sample_interval: int = 20
    initial_epochs: int = 10
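

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original configuration). It
# assumes only what is visible above: Point3dAAEConfig behaves like a
# pydantic-style BaseSettings model, and OptimizerConfig keeps the `name` and
# `hparams` values passed in the defaults. Resolving the optimizer class from
# torch.optim by name is a common pattern shown here for illustration, not
# the project's confirmed API.
if __name__ == "__main__":
    import torch

    config = Point3dAAEConfig()

    # Placeholder module standing in for the AAE encoder/decoder parameters.
    dummy_model = torch.nn.Linear(config.latent_dim, config.latent_dim)

    # Look up the optimizer class (e.g. torch.optim.Adam) by its configured
    # name and pass the hyperparameters straight through.
    optimizer_cls = getattr(torch.optim, config.ae_optimizer.name)
    ae_optimizer = optimizer_cls(dummy_model.parameters(), **config.ae_optimizer.hparams)

    # Print the resolved settings; pydantic serializes Path fields to strings.
    print(config.json(indent=2))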