Retry after wandb CommError
This commit is contained in:
parent
e7d6326546
commit
d812469c7c
@@ -2,6 +2,7 @@ import wandb
|
|||||||
import yaml
|
import yaml
|
||||||
import os
|
import os
|
||||||
import math
|
import math
|
||||||
|
import time
|
||||||
import random
|
import random
|
||||||
import copy
|
import copy
|
||||||
import re
|
import re
|
||||||
@@ -307,16 +308,28 @@ class Slate():
|
|||||||
wandbC = self.consume(runnerConf, 'wandb', {}, expand=True, delta_desc=runnerConf.pop('delta_desc', 'BASE'))
|
wandbC = self.consume(runnerConf, 'wandb', {}, expand=True, delta_desc=runnerConf.pop('delta_desc', 'BASE'))
|
||||||
if 'job_type' in wandbC and len(wandbC['job_type']) > 62:
|
if 'job_type' in wandbC and len(wandbC['job_type']) > 62:
|
||||||
wandbC['job_type'] = "..."+wandbC['job_type'][-50:]
|
wandbC['job_type'] = "..."+wandbC['job_type'][-50:]
|
||||||
with wandb.init(
|
|
||||||
project=project,
|
retry = 5
|
||||||
config=copy.deepcopy(runnerConf),
|
while retry:
|
||||||
reinit=self.consume(wandbC, 'reinit', DEFAULT_REINIT),
|
try:
|
||||||
settings=wandb.Settings(**self.consume(wandbC, 'settings', {})),
|
with wandb.init(
|
||||||
**wandbC
|
project=project,
|
||||||
) as run:
|
config=copy.deepcopy(runnerConf),
|
||||||
runner = Runner(self, runnerConf)
|
reinit=self.consume(wandbC, 'reinit', DEFAULT_REINIT),
|
||||||
runner.setup()
|
settings=wandb.Settings(**self.consume(wandbC, 'settings', {})),
|
||||||
runner.run(run)
|
**wandbC
|
||||||
|
) as run:
|
||||||
|
runner = Runner(self, runnerConf)
|
||||||
|
runner.setup()
|
||||||
|
runner.run(run)
|
||||||
|
except wandb.errors.CommError as e:
|
||||||
|
retry -= 1
|
||||||
|
if retry:
|
||||||
|
print('Catched CommErr; retrying...')
|
||||||
|
time.sleep(int(60*random.random()))
|
||||||
|
else:
|
||||||
|
print('Catched CommErr; not retrying')
|
||||||
|
raise e
|
||||||
|
|
||||||
if runnerConf != {}:
|
if runnerConf != {}:
|
||||||
msg = ('Config was not completely consumed: ', runnerConf)
|
msg = ('Config was not completely consumed: ', runnerConf)
|
||||||
|
Loading…
Reference in New Issue
Block a user