Retry after wandb CommError
This commit is contained in:
parent
e7d6326546
commit
d812469c7c
@ -2,6 +2,7 @@ import wandb
|
||||
import yaml
|
||||
import os
|
||||
import math
|
||||
import time
|
||||
import random
|
||||
import copy
|
||||
import re
|
||||
@ -307,6 +308,10 @@ class Slate():
|
||||
wandbC = self.consume(runnerConf, 'wandb', {}, expand=True, delta_desc=runnerConf.pop('delta_desc', 'BASE'))
|
||||
if 'job_type' in wandbC and len(wandbC['job_type']) > 62:
|
||||
wandbC['job_type'] = "..."+wandbC['job_type'][-50:]
|
||||
|
||||
retry = 5
|
||||
while retry:
|
||||
try:
|
||||
with wandb.init(
|
||||
project=project,
|
||||
config=copy.deepcopy(runnerConf),
|
||||
@ -317,6 +322,14 @@ class Slate():
|
||||
runner = Runner(self, runnerConf)
|
||||
runner.setup()
|
||||
runner.run(run)
|
||||
except wandb.errors.CommError as e:
|
||||
retry -= 1
|
||||
if retry:
|
||||
print('Catched CommErr; retrying...')
|
||||
time.sleep(int(60*random.random()))
|
||||
else:
|
||||
print('Catched CommErr; not retrying')
|
||||
raise e
|
||||
|
||||
if runnerConf != {}:
|
||||
msg = ('Config was not completely consumed: ', runnerConf)
|
||||
|
Loading…
Reference in New Issue
Block a user