Module arvados.retry
Utilities to retry operations.
The core of this module is RetryLoop, a utility class to retry operations
that might fail. It can distinguish between temporary and permanent failures;
provide exponential backoff; and save a series of results.
It also provides utility functions for common operations with RetryLoop:
- check_http_response_success() can be used as a RetryLoop success_check
  for HTTP response codes from the Arvados API server.
- retry_method() can decorate methods to provide a default num_retries
  keyword argument.
Expand source code
"""Utilities to retry operations.
The core of this module is `RetryLoop`, a utility class to retry operations
that might fail. It can distinguish between temporary and permanent failures;
provide exponential backoff; and save a series of results.
It also provides utility functions for common operations with `RetryLoop`:
* `check_http_response_success` can be used as a `RetryLoop` `success_check`
for HTTP response codes from the Arvados API server.
* `retry_method` can decorate methods to provide a default `num_retries`
keyword argument.
"""
# Copyright (C) The Arvados Authors. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
from builtins import range
from builtins import object
import functools
import inspect
import pycurl
import time
from collections import deque
import arvados.errors
# Status codes that count as a successful response: every 2xx value.
_HTTP_SUCCESSES = set(range(200, 300))
# Status codes that check_http_response_success reports as temporary
# failures, i.e. requests that may be retried.
_HTTP_CAN_RETRY = {408, 409, 422, 423, 500, 502, 503, 504}
class RetryLoop(object):
    """Run a piece of code repeatedly, up to a fixed number of attempts.

    A `RetryLoop` is an iterator. It keeps yielding until a result saved
    with `save_result` is judged a success or a permanent failure, or until
    the allowed attempts are used up, whichever happens first.
    Typical use looks like:

        loop = RetryLoop(num_retries=2)
        for tries_left in loop:
            try:
                result = do_something()
            except TemporaryError as error:
                log("error: {} ({} tries left)".format(error, tries_left))
            else:
                loop.save_result(result)
        if loop.success():
            return loop.last_result()

    Arguments:

    num_retries: int
    : How many times the loop may retry after its first attempt, so the
      loop body runs at most `num_retries + 1` times.

    success_check: Callable
    : Called with every saved result. It should return `True` when the
      result means success, `False` when it means permanent failure, and
      `None` when it means temporary failure. The default accepts every
      result, so the loop ends as soon as any result is saved.

    backoff_start: float
    : The base wait time in seconds. At the start of every iteration
      (the first one included) the current wait is multiplied by
      `backoff_growth`, and the next iteration may not begin until that
      many seconds have passed. Default 0, which disables all waiting.

    backoff_growth: float
    : The factor applied to the wait time on each iteration.
      Default 2 (i.e., double the wait time each time).

    save_results: int
    : How many of the most recent saved results to keep. They are
      available through the `results` attribute, oldest first.
      Default 1.

    max_wait: float
    : Upper bound, in seconds, on the wait between iterations. Default 60.
    """

    def __init__(self, num_retries, success_check=lambda r: True,
                 backoff_start=0, backoff_growth=2, save_results=1,
                 max_wait=60):
        # Loop state: no result checked yet, never iterated.
        self._success = None
        self._running = None
        self._attempts = 0
        # Bounded history of saved results; the oldest entry drops off first.
        self.results = deque(maxlen=save_results)
        # Backoff bookkeeping. A start time of 0 is always in the past, so
        # the first iteration never sleeps.
        self.next_start_time = 0
        self.max_wait = max_wait
        self.backoff_growth = backoff_growth
        self.backoff_wait = backoff_start
        self.check_result = success_check
        # The first attempt is not a retry, hence the extra one.
        self.tries_left = num_retries + 1

    def __iter__(self):
        """Return this object, which is its own iterator."""
        return self

    def running(self):
        """Report whether the loop is currently running.

        Returns `None` if the loop has never been iterated, `True` while it
        is in progress, and `False` once it has stopped, whether because a
        successful result or a permanent failure was saved or because the
        tries ran out.
        """
        if not self._running:
            # Either None (never started) or False (already stopped).
            return self._running
        return self._success is None

    def __next__(self):
        """Begin the next attempt.

        While the loop is still running, wait out any remaining backoff
        time, use up one try, and return the number of tries left.
        Otherwise raise `StopIteration`.
        """
        if self._running is None:
            self._running = True
        if self.tries_left < 1 or not self.running():
            self._running = False
            raise StopIteration
        # Sleep until the earliest start time the previous iteration set.
        delay = max(0, self.next_start_time - time.time())
        time.sleep(delay)
        # Grow the wait for the following iteration, capped at max_wait.
        self.backoff_wait = min(self.backoff_wait * self.backoff_growth,
                                self.max_wait)
        self.next_start_time = time.time() + self.backoff_wait
        self.tries_left -= 1
        return self.tries_left

    def save_result(self, result):
        """Store the outcome of the current attempt.

        The result is kept in `results` and passed to the `success_check`
        callable given to `__init__`; a `True` or `False` verdict stops the
        loop at its next iteration.

        Raises `arvados.errors.AssertionError` if the loop is not running,
        i.e. `running()` returns a false value.

        Arguments:

        result: Any
        : The result from this loop attempt to check and save.
        """
        if not self.running():
            raise arvados.errors.AssertionError(
                "recorded a loop result after the loop finished")
        self.results.append(result)
        self._success = self.check_result(result)
        self._attempts += 1

    def success(self):
        """Return the verdict on the most recently saved result.

        That is `True` for success, `False` for permanent failure, and
        `None` for temporary failure or when nothing has been saved yet.
        """
        return self._success

    def last_result(self):
        """Return the newest saved result.

        Raises `arvados.errors.AssertionError` if no result is stored.
        """
        try:
            newest = self.results[-1]
        except IndexError:
            raise arvados.errors.AssertionError(
                "queried loop results before any were recorded")
        return newest

    def attempts(self):
        """Return how many results have been saved.

        Every saved result counts, whatever its verdict: success,
        permanent failure, or temporary failure.
        """
        return self._attempts

    def attempts_str(self):
        """Return the saved-result count as readable text.

        The text is '1 attempt' when exactly one result has been saved and
        'N attempts' for any other count N.
        """
        count = self._attempts
        return '1 attempt' if count == 1 else '{} attempts'.format(count)
def check_http_response_success(status_code):
    """Convert a numeric HTTP status code to a loop control flag.

    This function takes a numeric HTTP status code and returns `True` if
    the code indicates success, `None` if it indicates temporary
    failure, and `False` otherwise. You can use this as the
    `success_check` for a `RetryLoop` that queries the Arvados API server.
    Specifically:

    * Any 2xx result returns `True`.

    * A select few status codes, or any malformed responses, return `None`.
      422 Unprocessable Entity is in this category. This may not meet the
      letter of the HTTP specification, but the Arvados API server will
      use it for various server-side problems like database connection
      errors. A status code that is not a number at all (for example
      `None`) is treated as malformed and also returns `None`.

    * Everything else returns `False`. Note that this includes 1xx and
      3xx status codes. They don't indicate success, and you can't
      retry those requests verbatim.

    Arguments:

    status_code: int
    : A numeric HTTP response code
    """
    try:
        if status_code in _HTTP_SUCCESSES:
            return True
        if status_code in _HTTP_CAN_RETRY:
            return None
        if 100 <= status_code < 600:
            return False
    except TypeError:
        # Unhashable or unorderable values (None, strings, lists, ...) are
        # not status codes. Report them as malformed responses instead of
        # crashing inside the retry loop.
        return None
    return None  # Get well soon, server.
def retry_method(orig_func):
    """Provide a default value for a method's num_retries argument.

    This is a decorator for instance and class methods that accept a
    `num_retries` argument with a `None` default. When the method is
    called without a value for `num_retries` (or with `None`), this
    decorator sets it from the `num_retries` attribute of the underlying
    instance or class. A value passed positionally is respected as well.

    Arguments:

    orig_func: Callable
    : A class or instance method that accepts a `num_retries` keyword argument
    """
    # Find where `num_retries` sits among the positional arguments that
    # follow `self`. Without this, a caller who passes it positionally would
    # collide with the keyword injected below ("got multiple values").
    retries_index = None
    try:
        param_list = list(inspect.signature(orig_func).parameters.values())
    except (AttributeError, TypeError, ValueError):
        # No usable signature (inspect.signature is missing, or the callable
        # cannot be introspected): fall back to keyword-only handling.
        param_list = []
    for position, param in enumerate(param_list):
        if param.name == 'num_retries':
            # Position 0 is `self`/`cls`; *args passed to the wrapper start
            # at the parameter after it.
            if position > 0 and param.kind == param.POSITIONAL_OR_KEYWORD:
                retries_index = position - 1
            break

    @functools.wraps(orig_func)
    def num_retries_setter(self, *args, **kwargs):
        if retries_index is not None and len(args) > retries_index:
            # num_retries was given positionally; only fill in the default
            # when the caller explicitly passed None.
            if args[retries_index] is None:
                args = (args[:retries_index] + (self.num_retries,)
                        + args[retries_index + 1:])
        elif kwargs.get('num_retries') is None:
            kwargs['num_retries'] = self.num_retries
        return orig_func(self, *args, **kwargs)
    return num_retries_setter
Functions
def check_http_response_success(status_code)
-
Convert a numeric HTTP status code to a loop control flag.
This method takes a numeric HTTP status code and returns True if the code indicates success, None if it indicates temporary failure, and False otherwise. You can use this as the success_check for a RetryLoop that queries the Arvados API server. Specifically:
- Any 2xx result returns True.
- A select few status codes, or any malformed responses, return None. 422 Unprocessable Entity is in this category. This may not meet the letter of the HTTP specification, but the Arvados API server will use it for various server-side problems like database connection errors.
- Everything else returns False. Note that this includes 1xx and 3xx status codes. They don't indicate success, and you can't retry those requests verbatim.
Arguments:
- status_code: int
- A numeric HTTP response code
Expand source code
def check_http_response_success(status_code):
    """Convert a numeric HTTP status code to a loop control flag.

    This function takes a numeric HTTP status code and returns `True` if
    the code indicates success, `None` if it indicates temporary
    failure, and `False` otherwise. You can use this as the
    `success_check` for a `RetryLoop` that queries the Arvados API server.
    Specifically:

    * Any 2xx result returns `True`.

    * A select few status codes, or any malformed responses, return `None`.
      422 Unprocessable Entity is in this category. This may not meet the
      letter of the HTTP specification, but the Arvados API server will
      use it for various server-side problems like database connection
      errors. A status code that is not a number at all (for example
      `None`) is treated as malformed and also returns `None`.

    * Everything else returns `False`. Note that this includes 1xx and
      3xx status codes. They don't indicate success, and you can't
      retry those requests verbatim.

    Arguments:

    status_code: int
    : A numeric HTTP response code
    """
    try:
        if status_code in _HTTP_SUCCESSES:
            return True
        if status_code in _HTTP_CAN_RETRY:
            return None
        if 100 <= status_code < 600:
            return False
    except TypeError:
        # Unhashable or unorderable values (None, strings, lists, ...) are
        # not status codes. Report them as malformed responses instead of
        # crashing inside the retry loop.
        return None
    return None  # Get well soon, server.
-
def retry_method(orig_func)
-
Provide a default value for a method's num_retries argument.
This is a decorator for instance and class methods that accept a num_retries keyword argument, with a None default. When the method is called without a value for num_retries, this decorator will set it from the num_retries attribute of the underlying instance or class.
Arguments:
- orig_func: Callable
- A class or instance method that accepts a num_retries keyword argument
Expand source code
def retry_method(orig_func):
    """Provide a default value for a method's num_retries argument.

    This is a decorator for instance and class methods that accept a
    `num_retries` argument with a `None` default. When the method is
    called without a value for `num_retries` (or with `None`), this
    decorator sets it from the `num_retries` attribute of the underlying
    instance or class. A value passed positionally is respected as well.

    Arguments:

    orig_func: Callable
    : A class or instance method that accepts a `num_retries` keyword argument
    """
    # Find where `num_retries` sits among the positional arguments that
    # follow `self`. Without this, a caller who passes it positionally would
    # collide with the keyword injected below ("got multiple values").
    retries_index = None
    try:
        param_list = list(inspect.signature(orig_func).parameters.values())
    except (AttributeError, TypeError, ValueError):
        # No usable signature (inspect.signature is missing, or the callable
        # cannot be introspected): fall back to keyword-only handling.
        param_list = []
    for position, param in enumerate(param_list):
        if param.name == 'num_retries':
            # Position 0 is `self`/`cls`; *args passed to the wrapper start
            # at the parameter after it.
            if position > 0 and param.kind == param.POSITIONAL_OR_KEYWORD:
                retries_index = position - 1
            break

    @functools.wraps(orig_func)
    def num_retries_setter(self, *args, **kwargs):
        if retries_index is not None and len(args) > retries_index:
            # num_retries was given positionally; only fill in the default
            # when the caller explicitly passed None.
            if args[retries_index] is None:
                args = (args[:retries_index] + (self.num_retries,)
                        + args[retries_index + 1:])
        elif kwargs.get('num_retries') is None:
            kwargs['num_retries'] = self.num_retries
        return orig_func(self, *args, **kwargs)
    return num_retries_setter
Classes
class RetryLoop (num_retries, success_check=<function RetryLoop.<lambda>>, backoff_start=0, backoff_growth=2, save_results=1, max_wait=60)
-
Coordinate limited retries of code.
RetryLoop coordinates a loop that runs until it records a successful result or tries too many times, whichever comes first. Typical use looks like:

    loop = RetryLoop(num_retries=2)
    for tries_left in loop:
        try:
            result = do_something()
        except TemporaryError as error:
            log("error: {} ({} tries left)".format(error, tries_left))
        else:
            loop.save_result(result)
    if loop.success():
        return loop.last_result()
Arguments:
- num_retries: int
- The maximum number of times to retry the loop if it doesn't succeed. This means the loop body could run at most num_retries + 1 times.
- success_check: Callable
- This is a function that will be called each time the loop saves a result. The function should return True if the result indicates the code succeeded, False if it represents a permanent failure, and None if it represents a temporary failure. If no function is provided, the loop will end after any result is saved.
- backoff_start: float
- The number of seconds that must pass before the loop's second iteration. Default 0, which disables all waiting.
- backoff_growth: float
- The wait time multiplier after each iteration. Default 2 (i.e., double the wait time each time).
- save_results: int
- Specify a number to store that many saved results from the loop. These are available through the results attribute, oldest first. Default 1.
- max_wait: float
- Maximum number of seconds to wait between retries. Default 60.
Expand source code
class RetryLoop(object):
    """Run a piece of code repeatedly, up to a fixed number of attempts.

    A `RetryLoop` is an iterator. It keeps yielding until a result saved
    with `save_result` is judged a success or a permanent failure, or until
    the allowed attempts are used up, whichever happens first.
    Typical use looks like:

        loop = RetryLoop(num_retries=2)
        for tries_left in loop:
            try:
                result = do_something()
            except TemporaryError as error:
                log("error: {} ({} tries left)".format(error, tries_left))
            else:
                loop.save_result(result)
        if loop.success():
            return loop.last_result()

    Arguments:

    num_retries: int
    : How many times the loop may retry after its first attempt, so the
      loop body runs at most `num_retries + 1` times.

    success_check: Callable
    : Called with every saved result. It should return `True` when the
      result means success, `False` when it means permanent failure, and
      `None` when it means temporary failure. The default accepts every
      result, so the loop ends as soon as any result is saved.

    backoff_start: float
    : The base wait time in seconds. At the start of every iteration
      (the first one included) the current wait is multiplied by
      `backoff_growth`, and the next iteration may not begin until that
      many seconds have passed. Default 0, which disables all waiting.

    backoff_growth: float
    : The factor applied to the wait time on each iteration.
      Default 2 (i.e., double the wait time each time).

    save_results: int
    : How many of the most recent saved results to keep. They are
      available through the `results` attribute, oldest first.
      Default 1.

    max_wait: float
    : Upper bound, in seconds, on the wait between iterations. Default 60.
    """

    def __init__(self, num_retries, success_check=lambda r: True,
                 backoff_start=0, backoff_growth=2, save_results=1,
                 max_wait=60):
        # Loop state: no result checked yet, never iterated.
        self._success = None
        self._running = None
        self._attempts = 0
        # Bounded history of saved results; the oldest entry drops off first.
        self.results = deque(maxlen=save_results)
        # Backoff bookkeeping. A start time of 0 is always in the past, so
        # the first iteration never sleeps.
        self.next_start_time = 0
        self.max_wait = max_wait
        self.backoff_growth = backoff_growth
        self.backoff_wait = backoff_start
        self.check_result = success_check
        # The first attempt is not a retry, hence the extra one.
        self.tries_left = num_retries + 1

    def __iter__(self):
        """Return this object, which is its own iterator."""
        return self

    def running(self):
        """Report whether the loop is currently running.

        Returns `None` if the loop has never been iterated, `True` while it
        is in progress, and `False` once it has stopped, whether because a
        successful result or a permanent failure was saved or because the
        tries ran out.
        """
        if not self._running:
            # Either None (never started) or False (already stopped).
            return self._running
        return self._success is None

    def __next__(self):
        """Begin the next attempt.

        While the loop is still running, wait out any remaining backoff
        time, use up one try, and return the number of tries left.
        Otherwise raise `StopIteration`.
        """
        if self._running is None:
            self._running = True
        if self.tries_left < 1 or not self.running():
            self._running = False
            raise StopIteration
        # Sleep until the earliest start time the previous iteration set.
        delay = max(0, self.next_start_time - time.time())
        time.sleep(delay)
        # Grow the wait for the following iteration, capped at max_wait.
        self.backoff_wait = min(self.backoff_wait * self.backoff_growth,
                                self.max_wait)
        self.next_start_time = time.time() + self.backoff_wait
        self.tries_left -= 1
        return self.tries_left

    def save_result(self, result):
        """Store the outcome of the current attempt.

        The result is kept in `results` and passed to the `success_check`
        callable given to `__init__`; a `True` or `False` verdict stops the
        loop at its next iteration.

        Raises `arvados.errors.AssertionError` if the loop is not running,
        i.e. `running()` returns a false value.

        Arguments:

        result: Any
        : The result from this loop attempt to check and save.
        """
        if not self.running():
            raise arvados.errors.AssertionError(
                "recorded a loop result after the loop finished")
        self.results.append(result)
        self._success = self.check_result(result)
        self._attempts += 1

    def success(self):
        """Return the verdict on the most recently saved result.

        That is `True` for success, `False` for permanent failure, and
        `None` for temporary failure or when nothing has been saved yet.
        """
        return self._success

    def last_result(self):
        """Return the newest saved result.

        Raises `arvados.errors.AssertionError` if no result is stored.
        """
        try:
            newest = self.results[-1]
        except IndexError:
            raise arvados.errors.AssertionError(
                "queried loop results before any were recorded")
        return newest

    def attempts(self):
        """Return how many results have been saved.

        Every saved result counts, whatever its verdict: success,
        permanent failure, or temporary failure.
        """
        return self._attempts

    def attempts_str(self):
        """Return the saved-result count as readable text.

        The text is '1 attempt' when exactly one result has been saved and
        'N attempts' for any other count N.
        """
        count = self._attempts
        return '1 attempt' if count == 1 else '{} attempts'.format(count)
Methods
def attempts(self)
-
Return the number of results that have been saved.
This count includes all kinds of results: success, permanent failure, and temporary failure.
Expand source code
def attempts(self):
    """Return how many results have been saved.

    Every saved result counts, whatever its verdict: success, permanent
    failure, or temporary failure.
    """
    saved_count = self._attempts
    return saved_count
def attempts_str(self)
-
Return a human-friendly string counting saved results.
This method returns '1 attempt' or 'N attempts', where the number in the string is the number of saved results.
Expand source code
def attempts_str(self):
    """Return the saved-result count as readable text.

    The text is '1 attempt' when exactly one result has been saved and
    'N attempts' for any other count N.
    """
    count = self._attempts
    return '1 attempt' if count == 1 else '{} attempts'.format(count)
def last_result(self)
-
Return the most recent result the loop saved.
Raises AssertionError if called before any result has been saved.
Expand source code
def last_result(self):
    """Return the newest saved result.

    Raises `arvados.errors.AssertionError` if no result is stored.
    """
    try:
        newest = self.results[-1]
    except IndexError:
        raise arvados.errors.AssertionError(
            "queried loop results before any were recorded")
    return newest
def running(self)
-
Return whether this loop is running.
Returns None if the loop has never run, True if it is still running, or False if it has stopped—whether that's because it has saved a successful result, a permanent failure, or has run out of retries.
Expand source code
def running(self):
    """Report whether the loop is currently running.

    Returns `None` if the loop has never been iterated, `True` while it is
    in progress, and `False` once it has stopped, whether because a
    successful result or a permanent failure was saved or because the tries
    ran out.
    """
    if not self._running:
        # Either None (never started) or False (already stopped).
        return self._running
    return self._success is None
def save_result(self, result)
-
Record a loop result.
Save the given result, and end the loop if it indicates success or permanent failure. See documentation for the __init__ success_check argument to learn how that's indicated.
Raises AssertionError if called after the loop has already ended.
Arguments:
- result: Any
- The result from this loop attempt to check and save.
Expand source code
def save_result(self, result):
    """Store the outcome of the current attempt.

    The result is appended to `results` and passed to `check_result`; the
    verdict becomes the loop's success state and the attempt counter goes
    up by one.

    Raises `arvados.errors.AssertionError` if `running()` returns a false
    value.

    Arguments:

    result: Any
    : The result from this loop attempt to check and save.
    """
    still_running = self.running()
    if not still_running:
        raise arvados.errors.AssertionError(
            "recorded a loop result after the loop finished")
    self.results.append(result)
    verdict = self.check_result(result)
    self._success = verdict
    self._attempts += 1
def success(self)
-
Return the loop's end state.
Returns True if the loop recorded a successful result, False if it recorded permanent failure, or else None.
Expand source code
def success(self):
    """Return the verdict on the most recently saved result.

    That is `True` for success, `False` for permanent failure, and `None`
    otherwise.
    """
    verdict = self._success
    return verdict