| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273 |
- import datetime
- import lzma
- import pathlib
- import time
- import typing
- import json as stdlib_json
- import httpx
- import urllib.parse
- if typing.TYPE_CHECKING:
- import httpx._types
# One-day duration. Nothing in this part of the file uses it; presumably meant
# to be passed as the `expiry` argument of get() — TODO confirm against callers.
- ONE_DAY = datetime.timedelta(days=1)
# Root of the on-disk cache: a `cache` directory next to this module.
# get() stores its entries under CACHE_DIR/<hostname>/.
- CACHE_DIR = pathlib.Path(__file__).parent / 'cache'
# Shared HTTP client used by get_with_retries(): HTTP/2 enabled, 5-second timeout.
- client = httpx.Client(transport=httpx.HTTPTransport(http2=True, retries=3), timeout=5) # transport-level retries cover connection failures only (ConnectError, ConnectTimeout); read timeouts are retried separately in get_with_retries()
@typing.overload
def get(url: str, *, json: typing.Literal[True]=True, headers: httpx._types.HeaderTypes|None=None,
        expiry: datetime.timedelta=datetime.timedelta(minutes=10)) -> typing.Any:
    ...


@typing.overload
def get(url: str, *, json: typing.Literal[False], headers: httpx._types.HeaderTypes|None=None,
        expiry: datetime.timedelta=datetime.timedelta(minutes=10)) -> str:
    ...


def get(url: str, *, json=True, headers=None, expiry=datetime.timedelta(minutes=10)) -> typing.Any:
    """Fetch *url* with a GET request, going through an xz-compressed disk cache.

    Cache entries live at ``CACHE_DIR/<hostname>/<quoted path[?query]>[.json].xz``.
    An entry is served without touching the network when its modification
    time is younger than *expiry*; otherwise the URL is fetched via
    ``get_with_retries`` and the entry is (re)written.

    Args:
        url: Absolute URL to fetch. It must contain a hostname.
        json: If true (the default), the body is parsed as JSON and the parsed
            value is returned; if false, the body is returned as text.
        headers: Optional request headers, passed through to the HTTP client.
            They are not part of the cache key.
        expiry: Maximum age of a cache entry that may still be served.

    Returns:
        The parsed JSON value when *json* is true, otherwise the body as ``str``.

    Raises:
        AssertionError: If *url* has no hostname.
        Exception: Whatever ``get_with_retries`` or the response decoding
            raises; errors while *reading* the cache are reported and recovered
            from by fetching fresh data.
    """
    parsed = urllib.parse.urlparse(url)
    assert parsed.hostname is not None
    # The query string is part of the resource's identity: without it,
    # `/search?q=a` and `/search?q=b` would share one cache file and return
    # each other's data until the entry expires.
    cache_key = parsed.path.removeprefix('/')
    if parsed.query:
        cache_key += '?' + parsed.query
    # safe='' also escapes '/', so the whole key collapses into one filename.
    cache_filename = urllib.parse.quote(cache_key, safe='')
    if json:
        cache_filename += '.json'
    cache_filename += '.xz'
    cache_path = CACHE_DIR / parsed.hostname / cache_filename
    try:
        # Serve from the cache only if the entry is younger than `expiry`.
        if cache_path.stat().st_mtime > time.time() - expiry.total_seconds():
            with lzma.open(cache_path, 'rb') as f:
                cached = f.read()
            if json:
                return stdlib_json.loads(cached)
            return cached.decode('utf-8')
    except FileNotFoundError:
        pass  # no cache entry yet; fall through to the network fetch
    except Exception as e:
        # A truncated or otherwise unreadable cache file must not take the
        # caller down: report it and fall through to fetch fresh data, which
        # then overwrites the bad entry.
        print(f"Warning: Corrupted cache detected for {url} ({type(e).__name__}). Fetching fresh data...")
    r = get_with_retries(url, headers)
    # Decode the response *before* opening the cache file, so that a failure
    # here (e.g. a body that is not valid JSON) cannot leave behind an empty
    # cache entry with a fresh timestamp.
    if json:
        data = r.json()
        payload = stdlib_json.dumps(data).encode('utf-8')
    else:
        data = r.text
        payload = data.encode('utf-8')
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    with lzma.open(cache_path, 'wb') as f:
        f.write(payload)
    return data
def get_with_retries(url: str, headers: httpx._types.HeaderTypes|None=None) -> httpx.Response:
    """GET *url* with the shared client, retrying when the read times out.

    Up to five attempts are made. Each ``httpx.ReadTimeout`` before the last
    attempt is reported on stdout and followed by another try; the fifth one
    is re-raised to the caller. Any other exception, including the one raised
    by ``raise_for_status()``, propagates immediately without a retry.

    Args:
        url: URL to request.
        headers: Optional request headers forwarded to the client.

    Returns:
        The response, after ``raise_for_status()`` has been called on it.
    """
    attempt_number = 0
    while True:
        attempt_number += 1
        try:
            response = client.get(url, headers=headers)
            return response.raise_for_status()
        except httpx.ReadTimeout:
            # Out of attempts: let the timeout reach the caller.
            if attempt_number >= 5:
                raise
            print(url, 'attempt', attempt_number, 'timed out; retrying...')
|