tensorboardX 使用流程

1 minute read

Published:

记录一些 tensorboardX 的学习笔记。不想学的话,也可以转用 wandb。

使用torch及tensorboardX 记录实验

pytorch文档 : tensorboard

step1 : 在训练脚本(main.py)中的设置

# Two interchangeable ways to import SummaryWriter:
#   1) PyTorch's built-in wrapper:
#        from torch.utils.tensorboard import SummaryWriter
#   2) the standalone tensorboardX package (used below).
import time

import torch.optim
from tensorboardX import SummaryWriter
from torch.optim.lr_scheduler import StepLR

# Give every run its own sub-directory of LOG_DIR, named by its start time.
# LOG_DIR is expected to be defined elsewhere in the training script.
now = time.strftime('%Y%m%d-%H_%M_%S', time.localtime(time.time()))
writer = SummaryWriter(log_dir=f'{LOG_DIR}/{now}')

# Set up the optimizer (fill in the model parameters and hyper-parameters).
optimizer = torch.optim.SGD(...)
# Set up the learning-rate schedule that wraps the optimizer.
scheduler = StepLR(optimizer, ...)

...
def train(..., epoch):
    ...
    for i, (input, target) in enumerate(train_loader):
      ...
	
  
    writer.add_scalars(main_tag='Loss',tag_scalar_dict={
        'train loss' : loss,
    }, global_step=epoch)

    writer.add_scalars(main_tag='Prec',tag_scalar_dict={
        'train top1' : top1,
        'trian top5' : top5
    }, global_step=epoch)

    writer.add_scalars(main_tag='time/train', tag_scalar_dict={
        'batch_time': batch_time,
        'data_time': data_time,
        },
        global_step=epoch)

def validate(..., epoch):
    ...
    with torch.no_grad():
        ...

      writer.add_scalars(main_tag='Loss', tag_scalar_dict={
          'val loss' : losses.avg,
      }, global_step=epoch)

      writer.add_scalars(main_tag='Prec', tag_scalar_dict={
          'validate top1' : top1.avg,
          'validate top5' : top5.avg
      }, global_step=epoch)

      writer.add_scalar('time/val/batch_time', batch_time.avg, epoch)

    return ...

def main():
    """Run the train/validate loop and log the learning rate every epoch."""
    ...
    try:
        for epoch in range(args.start_epoch, args.epochs):
            ...
            train(..., epoch)
            validate(..., epoch)

            # Read the current lr straight from the first param group; this
            # avoids rebuilding the whole optimizer state dict each epoch.
            writer.add_scalar('learning rate', optimizer.param_groups[0]['lr'], epoch)
            # Step the scheduler after the epoch's lr has been logged.
            scheduler.step()

            ...
    finally:
        # Close the writer exactly once, after every add_* call is done.
        # Doing it in ``finally`` also closes it if training raises.
        writer.close()

step2 : check results 查看结果

tensorboard --logdir='your_logdir' \
            --port XXXX #默认6006端口

然后,在浏览器中打开 `http://localhost:6006/`(若指定了其他端口,则打开 `http://localhost:XXXX/`)。

step3(OPT) : 若远程服务器没有图形界面,可映射到本地端口后查看

ref1 ; ref2

#本地计算机:
ssh -L 本地端口:127.0.0.1:TensorBoard端口 用户名@服务器的IP地址 -p 服务器登录端口

ssh -L 10086:127.0.0.1:6006 user_name@remote_server_ip -p login_port

然后,本地计算机浏览器打开 http://127.0.0.1:10086/