# HTTP GET helper with an on-disk cache: responses are stored under CACHE_DIR
# as LZMA-compressed files (CBOR-encoded when the response is parsed as JSON).

import datetime
import lzma
import pathlib
import time
import typing
import urllib.parse

import cbor2
import httpx

if typing.TYPE_CHECKING:
    import httpx._types

ONE_DAY = datetime.timedelta(days=1)
CACHE_DIR = pathlib.Path(__file__).parent / 'cache'

# Shared client; the transport-level `retries=3` retries on ConnectError and
# ConnectTimeout. Read timeouts are retried separately in get_with_retries().
client = httpx.Client(transport=httpx.HTTPTransport(http2=True, retries=3), timeout=5)


@typing.overload
def get(url: str, *, json: typing.Literal[True]=True, headers: httpx._types.HeaderTypes|None=None, expiry: datetime.timedelta=datetime.timedelta(minutes=10)) -> typing.Any: ...
@typing.overload
def get(url: str, *, json: typing.Literal[False], headers: httpx._types.HeaderTypes|None=None, expiry: datetime.timedelta=datetime.timedelta(minutes=10)) -> str: ...
def get(url: str, *, json=True, headers=None, expiry=datetime.timedelta(minutes=10)) -> typing.Any:
    """Return the body of ``url``, served from the on-disk cache when fresh.

    The cache file is ``CACHE_DIR / <hostname> / <percent-quoted path>`` with
    a ``.cbor.xz`` suffix when ``json`` is true and ``.xz`` otherwise.

    NOTE(review): only the hostname and path form the cache key. The query
    string and ``headers`` are not part of it, so URLs that differ only in
    their query share one cache entry -- confirm this is intended.

    Args:
        url: Absolute URL to fetch; it must contain a hostname.
        json: If true, parse the response as JSON and cache it as CBOR;
            otherwise return (and cache) the response text.
        headers: Optional request headers, forwarded to the HTTP client.
        expiry: Maximum age of a cache file (by mtime) that is still served
            without hitting the network.

    Returns:
        The decoded JSON value when ``json`` is true, else the body as ``str``.

    Raises:
        AssertionError: If ``url`` has no hostname.
        Exception: Whatever ``get_with_retries`` raises, and any error raised
            while decoding or caching the fresh response.
    """
    parsed = urllib.parse.urlparse(url)
    assert parsed.hostname is not None
    cache_filename = urllib.parse.quote(parsed.path.removeprefix('/'), safe='')
    if json:
        cache_filename += '.cbor'
    cache_filename += '.xz'
    cache_path = CACHE_DIR / parsed.hostname / cache_filename

    try:
        # Serve from the cache only while the file's mtime is within `expiry`.
        if cache_path.stat().st_mtime > time.time() - expiry.total_seconds():
            with lzma.open(cache_path, 'rb') as f:
                raw = f.read()
            if json:
                # The file is fully decompressed into memory and decoded with
                # cbor2.loads() rather than streaming via cbor2.load(f).
                # NOTE(review): the original author reported that streaming
                # from the LZMA file object made the cbor2 backend fail with
                # "buffer size mismatch"; not verified here.
                return cbor2.loads(raw)
            return raw.decode('utf-8')
    except FileNotFoundError:
        pass  # no cache entry yet; fall through to the network fetch
    except (KeyboardInterrupt, SystemExit):
        raise
    except BaseException as e:
        # Any other failure while reading the cache is treated as corruption.
        # BaseException (not Exception) is kept from the original; presumably
        # deliberate so that backend errors not derived from Exception are
        # also caught -- TODO confirm.
        print(f"Warning: Corrupted cache detected for {url} ({type(e).__name__}). Fetching fresh data...")

    r = get_with_retries(url, headers)

    # Decode and serialize BEFORE opening the cache file for writing, so a
    # decoding failure cannot truncate an existing cache entry.
    if json:
        data = r.json()
        # Serialize to bytes in memory and write them in one call, mirroring
        # the in-memory decode on the read path.
        payload = cbor2.dumps(data)
    else:
        data = r.text
        payload = data.encode('utf-8')

    cache_path.parent.mkdir(parents=True, exist_ok=True)
    with lzma.open(cache_path, 'wb') as f:
        f.write(payload)
    return data


def get_with_retries(url: str, headers: httpx._types.HeaderTypes|None=None, *, attempts: int=5) -> httpx.Response:
    """GET ``url`` with the shared client, retrying on read timeouts.

    Args:
        url: URL to fetch.
        headers: Optional request headers.
        attempts: Total number of tries before the last ``ReadTimeout`` is
            re-raised; must be at least 1.

    Returns:
        The response, after ``raise_for_status()`` has accepted it.

    Raises:
        ValueError: If ``attempts`` is less than 1.
        httpx.ReadTimeout: If every attempt timed out while reading.
        httpx.HTTPStatusError: If ``raise_for_status()`` rejects the response
            (not retried).
    """
    if attempts < 1:
        raise ValueError('attempts must be at least 1')
    for attempt in range(1, attempts + 1):
        try:
            return client.get(url, headers=headers).raise_for_status()
        except httpx.ReadTimeout:
            if attempt == attempts:
                raise
            print(url, 'attempt', attempt, 'timed out; retrying...')
    # Unreachable: the loop either returns or re-raises on the final attempt.
    raise AssertionError