Example of automatic hyperparameter tuning with pytorch_lightning and optuna

Published on Aug. 22, 2023, 12:11 p.m.

Tuning example:

import optuna
import pytorch_lightning as pl
from optuna.integration import PyTorchLightningPruningCallback
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate
from string import Template
from model.myModel import myModel

"""
执行自动调参

"""

# Reference example: https://www.kaggle.com/code/terrychanorg/optuna-wandb-logger-notebookd54069a901
def objective(trial: optuna.trial.Trial) -> float:
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)  # log-uniform search space
    #     max_epochs = trial.suggest_int("max_epochs", 1, 1)
    batch_size = trial.suggest_categorical("batch_size", [64, 128])
    accumulate_grad_batches = trial.suggest_int("accumulate_grad_batches", 1, 10)
    num_labels = trial.suggest_categorical("num_labels", [28000])
    dropout = trial.suggest_categorical("dropout", [0.1, 0.2, 0.3, 0.4])
    hidden_size = trial.suggest_categorical("hidden_size", [128, 256, 512, 768])
    # "MomentumSGD",
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "AdamW", "RMSprop", "Adagrad", "SGD"])
    #     optimizer_name = trial.suggest_categorical("optimizer_name", ["AdamW"])
    # Pass the sampled hyperparameters to the model
    model = myModel(learning_rate=learning_rate,
                    batch_size=batch_size,
                    optimizer_name=optimizer_name,
                    num_labels=num_labels,
                    dropout=dropout,
                    hidden_size=hidden_size,
                    data_path="/kaggle/data/out",
                    pretrained="uer/chinese_roberta_L-2_H-512"
                    )
    #print("hparams", model.hparams)
    # Start training
    trainer = pl.Trainer(
        gpus=1,
        min_epochs=1,
        precision=16,
        # amp_level='O2',
        callbacks=[PyTorchLightningPruningCallback(trial, monitor="val_loss")],
        deterministic=True,
        max_epochs=1,
        accumulate_grad_batches=accumulate_grad_batches
        )
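    # PyTorchLightningPruningCallback reports the monitored "val_loss" to Optuna
    # after each validation pass and stops (prunes) unpromising trials early.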

    hyperparameters = dict(optimizer_name=optimizer_name,
                           accumulate_grad_batches=accumulate_grad_batches,
                           batch_size=batch_size,
                           lr=learning_rate,
                           dropout=dropout,
                           hidden_size=hidden_size
                           )

    trainer.logger.log_hyperparams(hyperparameters)

    trainer.fit(model)
    return trainer.callback_metrics["val_loss"].item()
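# Note: both PyTorchLightningPruningCallback(trial, monitor="val_loss") and
# trainer.callback_metrics["val_loss"] assume that myModel logs a "val_loss"
# metric during validation. A minimal sketch of the assumed validation_step
# (the actual loss computation inside myModel may differ):
#
#     def validation_step(self, batch, batch_idx):
#         loss = self.compute_loss(batch)  # hypothetical loss helper
#         self.log("val_loss", loss, prog_bar=True)
#         return loss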

study = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner())
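# MedianPruner stops a trial whose intermediate val_loss is worse than the median
# of completed trials at the same step; optimize() below runs at most 100 trials
# or until 6000 seconds have elapsed.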
study.optimize(objective, n_trials=100, timeout=6000)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

# Collect the best parameters into a dict
best_params = {}
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
    best_params[key] = value
# best_params
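# The visualization helpers imported at the top are not used above. A minimal
# sketch of how they could be applied once the study has finished; the HTML
# output paths are assumptions, and optuna.visualization requires plotly.
fig_history = plot_optimization_history(study)
fig_parallel = plot_parallel_coordinate(study)
fig_history.write_html("optuna_history.html")
fig_parallel.write_html("optuna_parallel_coordinate.html")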

#  Automatically save the best parameters to a config file for later training runs; string.Template is used to write the file
s = Template("""
#%%writefile config/config_cuda.yaml
seed_everything: 42
trainer:
  logger:
  - class_path: pytorch_lightning.loggers.WandbLogger
    init_args:
      save_dir: "my_logs"
      offline: false
      project: "zhi"
      log_model: false
      prefix: ''
  checkpoint_callback: true
  callbacks:
  - class_path: pytorch_lightning.callbacks.EarlyStopping
    init_args:
      monitor: val_loss #val_f1
      min_delta: 0.00000
      patience: 150
      verbose: true
      mode: min
      strict: true
      check_finite: true
      check_on_train_epoch_end: false
  - class_path: pytorch_lightning.callbacks.LearningRateMonitor
    init_args:
      logging_interval: step
      log_momentum: false
  - class_path: pytorch_lightning.callbacks.ModelCheckpoint
    init_args:
      filename: '{epoch}-{val_loss:.2f}-{val_acc:.2f}-{val_precision:.2f}'
      monitor: val_loss
      verbose: true
      save_top_k: 1
      save_weights_only: false
      mode: min
      auto_insert_metric_name: true
      save_last: true
  enable_checkpointing: true
  default_root_dir: "out_log"
  gradient_clip_val: null
  gradient_clip_algorithm: null
  process_position: 0
  num_nodes: 1
  num_processes: null
  devices: null
  gpus: [0]
  auto_select_gpus: true
  tpu_cores: null
  ipus: null
  log_gpu_memory: null
  progress_bar_refresh_rate: null
  enable_progress_bar: true
  overfit_batches: 0.0
  track_grad_norm: -1
  check_val_every_n_epoch: 1
  fast_dev_run: false
  accumulate_grad_batches: ${accumulate_grad_batches}
  max_epochs: 300
  min_epochs: null
  max_steps: -1
  min_steps: null
  max_time: "00:7:00:00" # train for at most 7 hours
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  val_check_interval: null
  flush_logs_every_n_steps: null
  log_every_n_steps: 20
  accelerator: null
  strategy: null
  sync_batchnorm: false
  precision: 16 #16
  enable_model_summary: true
  weights_summary: top
  weights_save_path: null
  num_sanity_val_steps: 2
  resume_from_checkpoint: null
  profiler: null
  benchmark: null
  deterministic: false
  reload_dataloaders_every_n_epochs: 1
  auto_lr_find: false
  replace_sampler_ddp: true
  detect_anomaly: false
  auto_scale_batch_size: false
  prepare_data_per_node: null
  plugins: null
  amp_backend: native
  amp_level: null
  move_metrics_to_cpu: false
  multiple_trainloader_mode: max_size_cycle
  stochastic_weight_avg: false
  terminate_on_nan: null
#   log_dir: logs
model:
  learning_rate: ${learning_rate}
  T_max: 5
  optimizer_name: ${optimizer_name}
  batch_size: 128
  data_path: /kaggle/data/out
  pretrained: uer/chinese_roberta_L-2_H-512
  T_mult: 2
  T_0: 500
  num_labels: ${num_labels}
  dropout: ${dropout}
  hidden_size: ${hidden_size}
ckpt_path: null

""")
with open('config/config_cuda_best.yaml', 'w') as f:
    f.write(s.safe_substitute(**best_params))

# !cat config/config_cuda.yaml
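The generated YAML follows the LightningCLI layout (seed_everything / trainer / model / ckpt_path), so it can be fed straight back into a CLI-based training entry point. A minimal sketch of such a launcher, assuming the project trains via LightningCLI; the file name trainer_cli.py and the exact import path depend on the pytorch_lightning 1.x version in use:

# trainer_cli.py -- a sketch, not part of the original snippet
from pytorch_lightning.utilities.cli import LightningCLI
from model.myModel import myModel

if __name__ == "__main__":
    # e.g.  python trainer_cli.py --config config/config_cuda_best.yaml
    LightningCLI(myModel)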

Reference
https://www.notion.so/terrychanorg/pytorch_lightning-optuna-b98f4f71c5b0472880af2cfe655acb9a