cache.py 2.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. import datetime
  2. import lzma
  3. import pathlib
  4. import time
  5. import typing
  6. import cbor2
  7. import httpx
  8. import urllib.parse
  9. if typing.TYPE_CHECKING:
  10. import httpx._types
# One-day timedelta; presumably meant to be passed as `expiry` to get() by
# callers (it is not referenced in the visible part of this file).
ONE_DAY = datetime.timedelta(days=1)
# Root of the on-disk cache: a 'cache' directory located next to this module.
# get() stores its files under CACHE_DIR / <hostname> / <encoded path>.
CACHE_DIR = pathlib.Path(__file__).parent / 'cache'
# Shared HTTP client used by get_with_retries(): HTTP/2 enabled, `timeout=5`,
# and up to 3 transport-level retries for connection failures.
client = httpx.Client(transport=httpx.HTTPTransport(http2=True, retries=3), timeout=5) # retry on ConnectError and ConnectTimeout
  14. @typing.overload
  15. def get(url: str, *, json: typing.Literal[True]=True, headers: httpx._types.HeaderTypes|None=None,
  16. expiry: datetime.timedelta=datetime.timedelta(minutes=10)) -> typing.Any:
  17. ...
  18. @typing.overload
  19. def get(url: str, *, json: typing.Literal[False], headers: httpx._types.HeaderTypes|None=None,
  20. expiry: datetime.timedelta=datetime.timedelta(minutes=10)) -> str:
  21. ...
  22. def get(url: str, *, json=True, headers=None, expiry=datetime.timedelta(minutes=10)) -> typing.Any:
  23. parsed = urllib.parse.urlparse(url)
  24. assert parsed.hostname is not None
  25. cache_filename = urllib.parse.quote(parsed.path.removeprefix('/'), safe='')
  26. if json:
  27. cache_filename += '.cbor'
  28. cache_filename += '.xz'
  29. cache_path = CACHE_DIR / parsed.hostname / cache_filename
  30. try:
  31. if cache_path.stat().st_mtime > time.time() - expiry.total_seconds(): # less than 10 minutes old
  32. with lzma.open(cache_path, 'rb') as f:
  33. if json:
  34. # EXTREME DETAIL: We use cbor2.loads(f.read()) instead of cbor2.load(f).
  35. # The cbor2 PyO3 Rust backend expects a continuous memory buffer. Attempting to
  36. # read directly from an LZMA streaming object feeds it decompressed chunks,
  37. # which causes memory alignment panics ("buffer size mismatch").
  38. # By calling f.read() first, we force Python to fully decompress the file into
  39. # a raw byte string in memory, which cbor2 parses flawlessly.
  40. return cbor2.loads(f.read())
  41. else:
  42. return f.read().decode('utf-8')
  43. except FileNotFoundError:
  44. pass # fall through
  45. except BaseException as e:
  46. if isinstance(e, (KeyboardInterrupt, SystemExit)):
  47. raise
  48. print(f"Warning: Corrupted cache detected for {url} ({type(e).__name__}). Fetching fresh data...")
  49. pass # fall through
  50. r = get_with_retries(url, headers)
  51. cache_path.parent.mkdir(parents=True, exist_ok=True)
  52. with lzma.open(cache_path, 'wb') as f:
  53. if json:
  54. data = r.json()
  55. # EXTREME DETAIL: Similarly, we use cbor2.dumps() to serialize the dictionary to bytes
  56. # in memory first, and then write the entire byte block to the LZMA stream at once.
  57. # This prevents the Rust backend from attempting to manage the compressed stream buffer.
  58. f.write(cbor2.dumps(data))
  59. else:
  60. data = r.text
  61. f.write(data.encode('utf-8'))
  62. return data
  63. def get_with_retries(url: str, headers: httpx._types.HeaderTypes|None=None) -> httpx.Response:
  64. for attempt in range(5):
  65. try:
  66. return client.get(url, headers=headers).raise_for_status()
  67. except httpx.ReadTimeout:
  68. if attempt == 4:
  69. raise
  70. else:
  71. print(url, 'attempt', attempt+1, 'timed out; retrying...')
  72. raise AssertionError