Retry after wandb CommError
This commit is contained in:
		
							parent
							
								
									e7d6326546
								
							
						
					
					
						commit
						d812469c7c
					
				@ -2,6 +2,7 @@ import wandb
 | 
				
			|||||||
import yaml
 | 
					import yaml
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
import math
 | 
					import math
 | 
				
			||||||
 | 
					import time
 | 
				
			||||||
import random
 | 
					import random
 | 
				
			||||||
import copy
 | 
					import copy
 | 
				
			||||||
import re
 | 
					import re
 | 
				
			||||||
@ -307,6 +308,10 @@ class Slate():
 | 
				
			|||||||
            wandbC = self.consume(runnerConf, 'wandb', {}, expand=True, delta_desc=runnerConf.pop('delta_desc', 'BASE'))
 | 
					            wandbC = self.consume(runnerConf, 'wandb', {}, expand=True, delta_desc=runnerConf.pop('delta_desc', 'BASE'))
 | 
				
			||||||
            if 'job_type' in wandbC and len(wandbC['job_type']) > 62:
 | 
					            if 'job_type' in wandbC and len(wandbC['job_type']) > 62:
 | 
				
			||||||
                wandbC['job_type'] = "..."+wandbC['job_type'][-50:]
 | 
					                wandbC['job_type'] = "..."+wandbC['job_type'][-50:]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            retry = 5
 | 
				
			||||||
 | 
					            while retry:
 | 
				
			||||||
 | 
					                try:
 | 
				
			||||||
                    with wandb.init(
 | 
					                    with wandb.init(
 | 
				
			||||||
                        project=project,
 | 
					                        project=project,
 | 
				
			||||||
                        config=copy.deepcopy(runnerConf),
 | 
					                        config=copy.deepcopy(runnerConf),
 | 
				
			||||||
@ -317,6 +322,14 @@ class Slate():
 | 
				
			|||||||
                        runner = Runner(self, runnerConf)
 | 
					                        runner = Runner(self, runnerConf)
 | 
				
			||||||
                        runner.setup()
 | 
					                        runner.setup()
 | 
				
			||||||
                        runner.run(run)
 | 
					                        runner.run(run)
 | 
				
			||||||
 | 
					                except wandb.errors.CommError as e:
 | 
				
			||||||
 | 
					                    retry -= 1
 | 
				
			||||||
 | 
					                    if retry:
 | 
				
			||||||
 | 
					                        print('Catched CommErr; retrying...')
 | 
				
			||||||
 | 
					                        time.sleep(int(60*random.random()))
 | 
				
			||||||
 | 
					                    else:
 | 
				
			||||||
 | 
					                        print('Catched CommErr; not retrying')
 | 
				
			||||||
 | 
					                        raise e
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            if runnerConf != {}:
 | 
					            if runnerConf != {}:
 | 
				
			||||||
                msg = ('Config was not completely consumed: ', runnerConf)
 | 
					                msg = ('Config was not completely consumed: ', runnerConf)
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
		Reference in New Issue
	
	Block a user