forked from otzslayer/lgcns-mlops-template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
135 lines (106 loc) · 4.23 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# TODO: add log output at appropriate places, each at a suitable log level
# sourcery skip: raise-specific-error
import os
import shutil
import sys
import warnings
from distutils.dir_util import copy_tree

import bentoml
import mlflow
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline

from src.common.constants import ARTIFACT_PATH, DATA_PATH, LOG_FILEPATH
from src.common.logger import (
    handle_exception,
    log_feature_importance,
    set_logger,
)
from src.common.metrics import rmse_cv_score
from src.common.utils import get_param_set
from src.preprocess import preprocess_pipeline
# Configure the project logger to write to LOG_FILEPATH/logs.log
# (exact handler setup lives in src.common.logger.set_logger).
logger = set_logger(os.path.join(LOG_FILEPATH, "logs.log"))
# Route uncaught exceptions through the project logger instead of bare stderr.
sys.excepthook = handle_exception
# Suppress third-party warnings (sklearn/mlflow emit many during training).
warnings.filterwarnings(action="ignore")
if __name__ == "__main__":
    # ---- Load raw data --------------------------------------------------
    train_df = pd.read_csv(os.path.join(DATA_PATH, "house_rent_train.csv"))
    logger.debug("Load data...")

    # Drop the target and unused columns; train on log1p(rent) to tame skew.
    _X = train_df.drop(["rent", "area_locality", "posted_on"], axis=1)
    y = np.log1p(train_df["rent"])
    X = preprocess_pipeline.fit_transform(X=_X, y=y)

    # ---- Data storage: persist the engineered feature table -------------
    # exist_ok=True replaces the racy exists()/makedirs() pair.
    os.makedirs(os.path.join(DATA_PATH, "storage"), exist_ok=True)
    X.assign(rent=y).to_csv(
        os.path.join(DATA_PATH, "storage", "house_rent_train_features.csv"),
        index=False,
    )
    logger.debug("Save the feature data..")

    # ---- Hyper-parameter grid -------------------------------------------
    params_candidates = {
        "learning_rate": [0.01, 0.05, 0.1],
        "max_depth": [3, 4, 5, 6],
        "max_features": [1.0, 0.9, 0.8, 0.7],
    }
    param_set = get_param_set(params=params_candidates)

    # Set the tracking URI *before* set_experiment so the experiment is
    # registered in ./mlruns instead of the default store (the original
    # called these in the opposite order).
    mlflow.set_tracking_uri("./mlruns")
    experiment_name = "new_experiment_with_log"  # NOTE: rename for each new experiment
    mlflow.set_experiment(experiment_name=experiment_name)
    logger.info("Set an mlflow experiment..")

    for i, params in enumerate(param_set):
        run_name = f"Run {i}"
        logger.info(f"{run_name}:{params}")
        with mlflow.start_run(run_name=run_name):
            regr = GradientBoostingRegressor(**params)
            # Pipeline order: preprocessing first, then the model.
            pipeline = Pipeline(
                [
                    ("preprocessor", preprocess_pipeline),
                    ("gradient_booster", regr),
                ]
            )
            pipeline.fit(_X, y)

            # Cross-validated RMSE of the fitted regressor on the
            # already-preprocessed features.
            score_cv = rmse_cv_score(regr, X, y)
            logger.info(f"CV RMSE score for Run {i} :{score_cv.mean():.4f}")

            mlflow.set_tag("estimator_name", regr.__class__.__name__)
            # Log only the hyper-parameters that were actually searched.
            mlflow.log_params({key: regr.get_params()[key] for key in params})
            # Evaluation metric.
            mlflow.log_metrics({"RMSE_CV": score_cv.mean()})

            # Training loss per boosting stage. Passing step= preserves the
            # stage ordering; without it every value lands on step 0.
            for stage, loss in enumerate(regr.train_score_):
                mlflow.log_metric("Train Loss", loss, step=stage)

            # Model artifact (full preprocessing + model pipeline).
            mlflow.sklearn.log_model(pipeline, "model")
            # Charts, including the feature-importance plot generated below.
            mlflow.log_artifact(ARTIFACT_PATH)
            log_feature_importance(train=X, model=regr)

    # ---- Select the best run by CV RMSE ---------------------------------
    best_run_df = mlflow.search_runs(
        order_by=["metrics.RMSE_CV ASC"], max_results=1
    )
    if len(best_run_df.index) == 0:
        raise Exception(f"Found no runs for experiment '{experiment_name}'")

    best_run = mlflow.get_run(best_run_df.at[0, "run_id"])
    best_params = best_run.data.params
    logger.info(f"Best hyper-parameter:{best_params}")

    best_model_uri = f"{best_run.info.artifact_uri}/model"
    # Copy the best model into the artifact folder. shutil.copytree with
    # dirs_exist_ok=True replaces distutils.dir_util.copy_tree, which was
    # removed along with distutils in Python 3.12.
    shutil.copytree(
        best_model_uri.replace("file://", ""), ARTIFACT_PATH, dirs_exist_ok=True
    )

    # Register the best model with BentoML for serving.
    bentoml.sklearn.save_model(
        name="house_rent",
        model=mlflow.sklearn.load_model(best_model_uri),
        signatures={"predict": {"batchable": True, "batch_dim": 0}},
        metadata=best_params,
    )