5 Quick Tips to Improve Your MLflow Model Experimentation | by Matt Collins | Mar, 2023

5 Quick Tips to Improve Your MLflow Model Experimentation | by Matt Collins | Mar, 2023

[ad_1]

Use the MLflow Python API to drive better model development

Photo by Adrien Converse on Unsplash

mlflow.start_run()

1. run_id

# Tip 1: run_id — start a run, capture its run_id, and reopen it later.
# NOTE(review): indentation below was reconstructed; the scraped original had
# the `with` bodies flush-left, which does not parse.

# End any existing runs so autolog state doesn't leak between examples
mlflow.end_run()

with mlflow.start_run() as run:
    # Turn autolog on to save model artifacts, requirements, etc.
    mlflow.autolog(log_models=True)

    # The run_id uniquely identifies this run; save it to resume logging later
    print(run.info.run_id)

    diabetes_X = diabetes.data
    diabetes_y = diabetes.target

    # Split data into test/training sets, 3:1 ratio
    (
        diabetes_X_train,
        diabetes_X_test,
        diabetes_y_train,
        diabetes_y_test,
    ) = train_test_split(diabetes_X, diabetes_y, test_size=0.25, random_state=42)

    # Fit a ridge regression with a fixed hyperparameter choice
    alpha = 0.9
    solver = "cholesky"
    regr = linear_model.Ridge(alpha=alpha, solver=solver)

    regr.fit(diabetes_X_train, diabetes_y_train)

    diabetes_y_pred = regr.predict(diabetes_X_test)

    # Log desired metrics
    mlflow.log_metric("mse", mean_squared_error(diabetes_y_test, diabetes_y_pred))
    mlflow.log_metric(
        "rmse", sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred))
    )

# Reopen the earlier run by its run_id to append a metric that was missed.
# NOTE(review): this hard-coded id comes from the author's workspace — replace
# it with the run_id printed above when reproducing.
with mlflow.start_run(run_id="3fcf403e1566422493cd6e625693829d") as run:
    mlflow.log_metric("r2", r2_score(diabetes_y_test, diabetes_y_pred))

2. experiment_id

# Tip 2: experiment_id — create/select a named experiment and log runs into it.
# set_experiment both switches the active experiment and returns its metadata,
# so one call is sufficient (the scraped original called it twice).
my_experiment = mlflow.set_experiment("/mlflow_sdk_test")
experiment_id = my_experiment.experiment_id

# End any existing runs
mlflow.end_run()

# BUGFIX(review): the original omitted `as run`, so the `print` below read a
# stale `run` from a previous example instead of this run's id.
with mlflow.start_run(experiment_id=experiment_id) as run:
    # Turn autolog on to save model artifacts, requirements, etc.
    mlflow.autolog(log_models=True)

    print(run.info.run_id)

    diabetes_X = diabetes.data
    diabetes_y = diabetes.target

    # Split data into test/training sets, 3:1 ratio
    (
        diabetes_X_train,
        diabetes_X_test,
        diabetes_y_train,
        diabetes_y_test,
    ) = train_test_split(diabetes_X, diabetes_y, test_size=0.25, random_state=42)

    # Same model family as Tip 1, different alpha for comparison in the UI
    alpha = 0.8
    solver = "cholesky"
    regr = linear_model.Ridge(alpha=alpha, solver=solver)

    regr.fit(diabetes_X_train, diabetes_y_train)

    diabetes_y_pred = regr.predict(diabetes_X_test)

    # Log desired metrics
    mlflow.log_metric("mse", mean_squared_error(diabetes_y_test, diabetes_y_pred))
    mlflow.log_metric(
        "rmse", sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred))
    )
    mlflow.log_metric("r2", r2_score(diabetes_y_test, diabetes_y_pred))

3. run_name

# Tip 3: run_name — give runs human-readable names instead of auto-generated
# ones. Note run names are NOT guaranteed unique; the run_id still is.

# End any existing runs
mlflow.end_run()

# Explicitly name runs; embedding the date helps distinguish repeated runs
today = dt.today()

run_name = "Ridge Regression " + str(today)

with mlflow.start_run(run_name=run_name) as run:
    # Turn autolog on to save model artifacts, requirements, etc.
    mlflow.autolog(log_models=True)

    print(run.info.run_id)

    diabetes_X = diabetes.data
    diabetes_y = diabetes.target

    # Split data into test/training sets, 3:1 ratio
    (
        diabetes_X_train,
        diabetes_X_test,
        diabetes_y_train,
        diabetes_y_test,
    ) = train_test_split(diabetes_X, diabetes_y, test_size=0.25, random_state=42)

    alpha = 0.5
    solver = "cholesky"
    regr = linear_model.Ridge(alpha=alpha, solver=solver)

    regr.fit(diabetes_X_train, diabetes_y_train)

    diabetes_y_pred = regr.predict(diabetes_X_test)

    # Log desired metrics
    mlflow.log_metric("mse", mean_squared_error(diabetes_y_test, diabetes_y_pred))
    mlflow.log_metric(
        "rmse", sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred))
    )
    mlflow.log_metric("r2", r2_score(diabetes_y_test, diabetes_y_pred))

MLflow experiment table view — Duplicate run names: Image by author

4. nested

MLflow experiment table view — nested experiments: Image by author
# Tip 4: nested — group related child runs (e.g. a hyperparameter sweep)
# under one parent run via nested=True.

# End any existing runs
mlflow.end_run()

# Explicitly name the parent run
run_name = "Ridge Regression Nested"

with mlflow.start_run(run_name=run_name) as parent_run:
    # Keep the parent's run_id — it is needed to attach more children later
    print(parent_run.info.run_id)

    with mlflow.start_run(run_name="Child Run: alpha 0.1", nested=True):
        # Turn autolog on to save model artifacts, requirements, etc.
        mlflow.autolog(log_models=True)

        diabetes_X = diabetes.data
        diabetes_y = diabetes.target

        # Split data into test/training sets, 3:1 ratio
        (
            diabetes_X_train,
            diabetes_X_test,
            diabetes_y_train,
            diabetes_y_test,
        ) = train_test_split(diabetes_X, diabetes_y, test_size=0.25, random_state=42)

        alpha = 0.1
        solver = "cholesky"
        regr = linear_model.Ridge(alpha=alpha, solver=solver)

        regr.fit(diabetes_X_train, diabetes_y_train)

        diabetes_y_pred = regr.predict(diabetes_X_test)

        # Log desired metrics
        mlflow.log_metric("mse", mean_squared_error(diabetes_y_test, diabetes_y_pred))
        mlflow.log_metric(
            "rmse", sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred))
        )
        mlflow.log_metric("r2", r2_score(diabetes_y_test, diabetes_y_pred))

# Tip 4 (continued): reopen an existing parent run by run_id and attach
# another nested child — useful for extending a sweep after the fact.

# End any existing runs
mlflow.end_run()

# NOTE(review): hard-coded parent run_id from the author's workspace —
# substitute the parent id printed by the previous example when reproducing.
with mlflow.start_run(run_id="61d34b13649c45699e7f05290935747c") as parent_run:
    print(parent_run.info.run_id)
    with mlflow.start_run(run_name="Child Run: alpha 0.2", nested=True):
        # Turn autolog on to save model artifacts, requirements, etc.
        mlflow.autolog(log_models=True)

        diabetes_X = diabetes.data
        diabetes_y = diabetes.target

        # Split data into test/training sets, 3:1 ratio
        (
            diabetes_X_train,
            diabetes_X_test,
            diabetes_y_train,
            diabetes_y_test,
        ) = train_test_split(diabetes_X, diabetes_y, test_size=0.25, random_state=42)

        alpha = 0.2
        solver = "cholesky"
        regr = linear_model.Ridge(alpha=alpha, solver=solver)

        regr.fit(diabetes_X_train, diabetes_y_train)

        diabetes_y_pred = regr.predict(diabetes_X_test)

        # Log desired metrics
        mlflow.log_metric("mse", mean_squared_error(diabetes_y_test, diabetes_y_pred))
        mlflow.log_metric(
            "rmse", sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred))
        )
        mlflow.log_metric("r2", r2_score(diabetes_y_test, diabetes_y_pred))

5. mlflow.search_runs()

# Tip 5: mlflow.search_runs — pull run metadata into a pandas DataFrame
# for ad-hoc analysis of your experimentation history.

# Create DataFrame of all runs in *current* experiment, newest first
df = mlflow.search_runs(order_by=["start_time DESC"])

# Print a list of the columns available
# print(list(df.columns))

# Create DataFrame with subset of columns (.copy() so later mutations
# don't warn about writing to a slice of `df`)
runs_df = df[
    [
        "run_id",
        "experiment_id",
        "status",
        "start_time",
        "metrics.mse",
        "tags.mlflow.source.type",
        "tags.mlflow.user",
        "tags.estimator_name",
        "tags.mlflow.rootRunId",
    ]
].copy()
runs_df.head()

# Feature engineering to create some additional columns
runs_df["start_date"] = runs_df["start_time"].dt.date

# A run is a nested *parent* when it is its own root run.
# BUGFIX(review): the scraped original had a syntax error here
# (missing space before `else`) and used `is not None`, which treats
# pandas NaN (missing tag) as a real root id; pd.notnull handles both.
root_ids = runs_df["tags.mlflow.rootRunId"]
runs_df["is_nested_parent"] = (
    (runs_df["run_id"] == root_ids).astype(int)
)
# A run is a nested *child* when it has a root run that is a different run.
runs_df["is_nested_child"] = (
    (root_ids.notnull() & (runs_df["run_id"] != root_ids)).astype(int)
)
runs_df

# Runs per day
pd.DataFrame(runs_df.groupby("start_date")["run_id"].count()).reset_index()
Output of the above query: Image by author
# Runs per estimator type (autolog populates tags.estimator_name)
pd.DataFrame(runs_df.groupby("tags.estimator_name")["run_id"].count()).reset_index()
Output of the above query: Image by author
[ad_2]
Source link

Comments

No comments yet. Why don’t you start the discussion?

Leave a Reply

Your email address will not be published. Required fields are marked *