| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
- import datetime
- import lzma
- import pathlib
- import time
- import typing
- import cbor2
- import httpx
- import urllib.parse
- if typing.TYPE_CHECKING:
- import httpx._types
# A 24-hour interval.
ONE_DAY = datetime.timedelta(days=1)

# Cached responses are stored in a `cache` directory next to this module.
CACHE_DIR = pathlib.Path(__file__).parent.joinpath('cache')

# Shared HTTP client. The transport-level `retries` only cover connection
# failures (ConnectError and ConnectTimeout).
client = httpx.Client(
    transport=httpx.HTTPTransport(http2=True, retries=3),
    timeout=5,
)
@typing.overload
def get(url: str, *, json: typing.Literal[True]=True, headers: httpx._types.HeaderTypes|None=None,
        expiry: datetime.timedelta=datetime.timedelta(minutes=10)) -> typing.Any:
    ...


@typing.overload
def get(url: str, *, json: typing.Literal[False], headers: httpx._types.HeaderTypes|None=None,
        expiry: datetime.timedelta=datetime.timedelta(minutes=10)) -> str:
    ...


def get(url: str, *, json=True, headers=None, expiry=datetime.timedelta(minutes=10)) -> typing.Any:
    """Fetch `url` with an on-disk cache and return the decoded response.

    Responses are stored xz-compressed under
    ``CACHE_DIR / <hostname> / <percent-quoted path[?query]>`` with a
    ``.cbor.xz`` suffix for JSON responses and ``.xz`` for text. A cache
    file whose modification time is within `expiry` of now is returned
    without any network access.

    Args:
        url: Absolute URL to fetch; it must contain a hostname.
        json: If true, parse the response as JSON and cache it as CBOR;
            otherwise return (and cache) the response text.
        headers: Optional request headers passed to `get_with_retries`.
        expiry: Maximum age of a cache entry that may still be served.

    Returns:
        The parsed JSON value when `json` is true, else the body as `str`.

    Raises:
        AssertionError: If `url` has no hostname.
        Exception: Whatever `get_with_retries` raises when the request
            fails, and any error raised while decoding the response or
            writing the cache file.
    """
    parsed = urllib.parse.urlparse(url)
    assert parsed.hostname is not None

    # The query string is part of the resource identity: without it, URLs that
    # differ only in `?query` would share (and overwrite) one cache entry.
    # URLs without a query keep exactly the same filename as before.
    resource = parsed.path.removeprefix('/')
    if parsed.query:
        resource += '?' + parsed.query
    cache_filename = urllib.parse.quote(resource, safe='')
    if json:
        cache_filename += '.cbor'
    cache_filename += '.xz'
    cache_path = CACHE_DIR / parsed.hostname / cache_filename

    try:
        if cache_path.stat().st_mtime > time.time() - expiry.total_seconds():  # still within `expiry`
            with lzma.open(cache_path, 'rb') as f:
                if json:
                    return cbor2.load(f)
                return f.read().decode('utf-8')
    except FileNotFoundError:
        pass  # no cache entry yet; fall through to the network
    except (KeyboardInterrupt, SystemExit):
        raise  # never swallow Ctrl+C or interpreter shutdown
    except BaseException as e:
        # NOTE: BaseException is deliberate. PyO3 maps Rust panics to
        # `pyo3_runtime.PanicException`, a BaseException subclass that
        # `except Exception` would miss; this was reported for cbor2's decoder
        # on corrupt cache files (confirm against the installed cbor2 build).
        print(f"Warning: Corrupted cache detected for {url} ({type(e).__name__}). Fetching fresh data...")

    r = get_with_retries(url, headers)
    # Decode before touching the cache so an unparsable response cannot leave
    # a fresh-looking but broken cache file behind.
    data = r.json() if json else r.text

    cache_path.parent.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file and rename it into place so that an
    # interrupted write never exposes a truncated cache entry.
    tmp_path = cache_path.with_name(cache_path.name + '.tmp')
    try:
        with lzma.open(tmp_path, 'wb') as f:
            if json:
                cbor2.dump(data, f)
            else:
                f.write(data.encode('utf-8'))
        tmp_path.replace(cache_path)
    except BaseException:
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            pass  # cleanup is best-effort; surface the original error
        raise
    return data
def get_with_retries(url: str, headers: httpx._types.HeaderTypes|None=None, *,
                     attempts: int=5) -> httpx.Response:
    """GET `url` with the shared client, retrying on read timeouts.

    Args:
        url: URL to request.
        headers: Optional request headers.
        attempts: Total number of tries before the last read timeout is
            propagated. Must be at least 1; defaults to 5.

    Returns:
        The response, after `raise_for_status()` has succeeded.

    Raises:
        ValueError: If `attempts` is less than 1.
        httpx.ReadTimeout: If every attempt timed out while reading.
        httpx.HTTPStatusError: If the response has an error status
            (raised by `raise_for_status()`; not retried).
    """
    if attempts < 1:
        raise ValueError('attempts must be at least 1')
    for attempt in range(1, attempts + 1):
        try:
            return client.get(url, headers=headers).raise_for_status()
        except httpx.ReadTimeout:
            if attempt == attempts:
                raise  # out of retries: surface the timeout to the caller
            print(url, 'attempt', attempt, 'timed out; retrying...')
    raise AssertionError  # unreachable: the loop always returns or raises
|