cache.py 2.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
  1. import datetime
  2. import lzma
  3. import pathlib
  4. import time
  5. import typing
  6. import json as stdlib_json
  7. import httpx
  8. import urllib.parse
  9. if typing.TYPE_CHECKING:
  10. import httpx._types
# Convenience interval of exactly one day. Not referenced in the visible part of
# this module; presumably passed as `expiry` to get() by callers -- confirm.
ONE_DAY = datetime.timedelta(days=1)

# Root of the on-disk cache: a `cache` directory next to this source file.
# get() stores entries under CACHE_DIR / <hostname> / <quoted path>[.json].xz.
CACHE_DIR = pathlib.Path(__file__).parent / 'cache'

# Shared HTTP client used by get_with_retries(). The transport-level `retries=3`
# only covers connection failures (ConnectError and ConnectTimeout); read
# timeouts are retried separately in get_with_retries().
client = httpx.Client(transport=httpx.HTTPTransport(http2=True, retries=3), timeout=5) # retry on ConnectError and ConnectTimeout
  14. @typing.overload
  15. def get(url: str, *, json: typing.Literal[True]=True, headers: httpx._types.HeaderTypes|None=None,
  16. expiry: datetime.timedelta=datetime.timedelta(minutes=10)) -> typing.Any:
  17. ...
  18. @typing.overload
  19. def get(url: str, *, json: typing.Literal[False], headers: httpx._types.HeaderTypes|None=None,
  20. expiry: datetime.timedelta=datetime.timedelta(minutes=10)) -> str:
  21. ...
  22. def get(url: str, *, json=True, headers=None, expiry=datetime.timedelta(minutes=10)) -> typing.Any:
  23. parsed = urllib.parse.urlparse(url)
  24. assert parsed.hostname is not None
  25. cache_filename = urllib.parse.quote(parsed.path.removeprefix('/'), safe='')
  26. if json:
  27. cache_filename += '.json'
  28. cache_filename += '.xz'
  29. cache_path = CACHE_DIR / parsed.hostname / cache_filename
  30. try:
  31. if cache_path.stat().st_mtime > time.time() - expiry.total_seconds(): # less than 10 minutes old
  32. with lzma.open(cache_path, 'rb') as f:
  33. if json:
  34. return stdlib_json.loads(f.read())
  35. else:
  36. return f.read().decode('utf-8')
  37. except FileNotFoundError:
  38. pass # fall through
  39. except Exception as e:
  40. # EXTREME DETAIL: We catch exceptions here to gracefully recover from corrupted local files.
  41. # Previously, this used cbor2, which threw uncatchable Rust panics when reading corrupted data.
  42. # Now that we use the standard library json module, standard Exceptions will be caught,
  43. # allowing the system to fall through and fetch fresh data instead of crashing.
  44. print(f"Warning: Corrupted cache detected for {url} ({type(e).__name__}). Fetching fresh data...")
  45. pass # fall through
  46. r = get_with_retries(url, headers)
  47. cache_path.parent.mkdir(parents=True, exist_ok=True)
  48. with lzma.open(cache_path, 'wb') as f:
  49. if json:
  50. data = r.json()
  51. f.write(stdlib_json.dumps(data).encode('utf-8'))
  52. else:
  53. data = r.text
  54. f.write(data.encode('utf-8'))
  55. return data
  56. def get_with_retries(url: str, headers: httpx._types.HeaderTypes|None=None) -> httpx.Response:
  57. for attempt in range(5):
  58. try:
  59. return client.get(url, headers=headers).raise_for_status()
  60. except httpx.ReadTimeout:
  61. if attempt == 4:
  62. raise
  63. else:
  64. print(url, 'attempt', attempt+1, 'timed out; retrying...')
  65. raise AssertionError