Retry after wandb CommError

This commit is contained in:
Dominik Moritz Roth 2023-09-18 17:16:22 +02:00
parent e7d6326546
commit d812469c7c

View File

@@ -2,6 +2,7 @@ import wandb
import yaml
import os
import math
import time
import random
import copy
import re
@@ -307,6 +308,10 @@ class Slate():
wandbC = self.consume(runnerConf, 'wandb', {}, expand=True, delta_desc=runnerConf.pop('delta_desc', 'BASE'))
if 'job_type' in wandbC and len(wandbC['job_type']) > 62:
wandbC['job_type'] = "..."+wandbC['job_type'][-50:]
retry = 5
while retry:
try:
with wandb.init(
project=project,
config=copy.deepcopy(runnerConf),
@@ -317,6 +322,14 @@ class Slate():
runner = Runner(self, runnerConf)
runner.setup()
runner.run(run)
except wandb.errors.CommError as e:
retry -= 1
if retry:
print('Catched CommErr; retrying...')
time.sleep(int(60*random.random()))
else:
print('Catched CommErr; not retrying')
raise e
if runnerConf != {}:
msg = ('Config was not completely consumed: ', runnerConf)