Retry after wandb CommError

This commit is contained in:
Dominik Moritz Roth 2023-09-18 17:16:22 +02:00
parent e7d6326546
commit d812469c7c

View File

@@ -2,6 +2,7 @@ import wandb
import yaml import yaml
import os import os
import math import math
import time
import random import random
import copy import copy
import re import re
@@ -307,16 +308,28 @@ class Slate():
wandbC = self.consume(runnerConf, 'wandb', {}, expand=True, delta_desc=runnerConf.pop('delta_desc', 'BASE')) wandbC = self.consume(runnerConf, 'wandb', {}, expand=True, delta_desc=runnerConf.pop('delta_desc', 'BASE'))
if 'job_type' in wandbC and len(wandbC['job_type']) > 62: if 'job_type' in wandbC and len(wandbC['job_type']) > 62:
wandbC['job_type'] = "..."+wandbC['job_type'][-50:] wandbC['job_type'] = "..."+wandbC['job_type'][-50:]
with wandb.init(
project=project, retry = 5
config=copy.deepcopy(runnerConf), while retry:
reinit=self.consume(wandbC, 'reinit', DEFAULT_REINIT), try:
settings=wandb.Settings(**self.consume(wandbC, 'settings', {})), with wandb.init(
**wandbC project=project,
) as run: config=copy.deepcopy(runnerConf),
runner = Runner(self, runnerConf) reinit=self.consume(wandbC, 'reinit', DEFAULT_REINIT),
runner.setup() settings=wandb.Settings(**self.consume(wandbC, 'settings', {})),
runner.run(run) **wandbC
) as run:
runner = Runner(self, runnerConf)
runner.setup()
runner.run(run)
except wandb.errors.CommError as e:
retry -= 1
if retry:
print('Catched CommErr; retrying...')
time.sleep(int(60*random.random()))
else:
print('Catched CommErr; not retrying')
raise e
if runnerConf != {}: if runnerConf != {}:
msg = ('Config was not completely consumed: ', runnerConf) msg = ('Config was not completely consumed: ', runnerConf)