-
Notifications
You must be signed in to change notification settings - Fork 61
Replace custom cache with CacheControl library #671
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,19 +1,17 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import calendar | ||
| import contextlib | ||
| import hashlib | ||
| import io | ||
| import logging | ||
| import os | ||
| import platform | ||
| import shutil | ||
| import tempfile | ||
| import time | ||
| import typing as t | ||
|
|
||
| import cachecontrol | ||
| import requests | ||
| from cachecontrol.caches.file_cache import FileCache | ||
|
|
||
| _LASTMOD_FMT = "%a, %d %b %Y %H:%M:%S %Z" | ||
| log = logging.getLogger(__name__) | ||
|
|
||
|
|
||
| def _base_cache_dir() -> str | None: | ||
|
|
@@ -42,26 +40,23 @@ def _resolve_cache_dir(dirname: str) -> str | None: | |
| return cache_dir | ||
|
|
||
|
|
||
| def _lastmod_from_response(response: requests.Response) -> float: | ||
| try: | ||
| return calendar.timegm( | ||
| time.strptime(response.headers["last-modified"], _LASTMOD_FMT) | ||
| ) | ||
| # OverflowError: time outside of platform-specific bounds | ||
| # ValueError: malformed/unparseable | ||
| # LookupError: no such header | ||
| except (OverflowError, ValueError, LookupError): | ||
| return 0.0 | ||
|
|
||
|
|
||
| def _get_request( | ||
| file_url: str, *, response_ok: t.Callable[[requests.Response], bool] | ||
| session: requests.Session, | ||
| file_url: str, | ||
| *, | ||
| response_ok: t.Callable[[requests.Response], bool], | ||
| ) -> requests.Response: | ||
| num_retries = 2 | ||
| r: requests.Response | None = None | ||
| for _attempt in range(num_retries + 1): | ||
| try: | ||
| r = requests.get(file_url, stream=True) | ||
| # On retries, bypass CacheControl's local cache to avoid | ||
| # re-serving a cached bad response. Ideally we'd delete the | ||
| # cache entry directly, but CacheControl doesn't expose a public | ||
| # API for this (see https://github.com/psf/cachecontrol/issues/219). | ||
| # The no-cache header forces revalidation with the origin server. | ||
| headers = {"Cache-Control": "no-cache"} if _attempt > 0 else {} | ||
| r = session.get(file_url, headers=headers) | ||
| except requests.RequestException as e: | ||
| if _attempt == num_retries: | ||
| raise FailedDownloadError("encountered error during download") from e | ||
|
|
@@ -74,48 +69,6 @@ def _get_request( | |
| ) | ||
|
|
||
|
|
||
| def _atomic_write(dest: str, content: bytes) -> None: | ||
| # download to a temp file and then move to the dest | ||
| # this makes the download safe if run in parallel (parallel runs | ||
| # won't create a new empty file for writing and cause failures) | ||
| fp = tempfile.NamedTemporaryFile(mode="wb", delete=False) | ||
| fp.write(content) | ||
| fp.close() | ||
| shutil.copy(fp.name, dest) | ||
| os.remove(fp.name) | ||
|
|
||
|
|
||
| def _cache_hit(cachefile: str, response: requests.Response) -> bool: | ||
| # no file? miss | ||
| if not os.path.exists(cachefile): | ||
| return False | ||
|
|
||
| # compare mtime on any cached file against the remote last-modified time | ||
| # it is considered a hit if the local file is at least as new as the remote file | ||
| local_mtime = os.path.getmtime(cachefile) | ||
| remote_mtime = _lastmod_from_response(response) | ||
| return local_mtime >= remote_mtime | ||
|
|
||
|
|
||
| def url_to_cache_filename(ref_url: str) -> str: | ||
| """ | ||
| Given a schema URL, convert it to a filename for caching in a cache dir. | ||
|
|
||
| Rules are as follows: | ||
| - the base filename is an sha256 hash of the URL | ||
| - if the filename ends in an extension (.json, .yaml, etc) that extension | ||
| is appended to the hash | ||
|
|
||
| Preserving file extensions preserves the extension-based logic used for parsing, and | ||
| it also helps a local editor (browsing the cache) identify filetypes. | ||
| """ | ||
| filename = hashlib.sha256(ref_url.encode()).hexdigest() | ||
| if "." in (last_part := ref_url.rpartition("/")[-1]): | ||
| _, _, extension = last_part.rpartition(".") | ||
| filename = f"{filename}.{extension}" | ||
| return filename | ||
|
|
||
|
|
||
| class FailedDownloadError(Exception): | ||
| pass | ||
|
|
||
|
|
@@ -124,59 +77,42 @@ class CacheDownloader: | |
| def __init__(self, cache_dir: str, *, disable_cache: bool = False) -> None: | ||
| self._cache_dir = _resolve_cache_dir(cache_dir) | ||
| self._disable_cache = disable_cache | ||
|
|
||
| def _download( | ||
| self, | ||
| file_url: str, | ||
| filename: str, | ||
| response_ok: t.Callable[[requests.Response], bool], | ||
| ) -> str: | ||
| assert self._cache_dir is not None | ||
| os.makedirs(self._cache_dir, exist_ok=True) | ||
| dest = os.path.join(self._cache_dir, filename) | ||
|
|
||
| def check_response_for_download(r: requests.Response) -> bool: | ||
| # if the response indicates a cache hit, treat it as valid | ||
| # this ensures that we short-circuit any further evaluation immediately on | ||
| # a hit | ||
| if _cache_hit(dest, r): | ||
| return True | ||
| # we now know it's not a hit, so validate the content (forces download) | ||
| return response_ok(r) | ||
|
|
||
| response = _get_request(file_url, response_ok=check_response_for_download) | ||
| # check to see if we have a file which matches the connection | ||
| # only download if we do not (cache miss, vs hit) | ||
| if not _cache_hit(dest, response): | ||
| _atomic_write(dest, response.content) | ||
|
|
||
| return dest | ||
| self._cached_session: requests.Session | None = None | ||
|
|
||
| @property | ||
| def _session(self) -> requests.Session: | ||
| if self._cached_session is None: | ||
| self._cached_session = self._build_session() | ||
| return self._cached_session | ||
|
|
||
| def _build_session(self) -> requests.Session: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could the property+builder collapse together with |
||
| session = requests.Session() | ||
| if self._cache_dir and not self._disable_cache: | ||
| log.debug("using cache dir: %s", self._cache_dir) | ||
| os.makedirs(self._cache_dir, exist_ok=True) | ||
| session = cachecontrol.CacheControl( | ||
| session, cache=FileCache(self._cache_dir) | ||
| ) | ||
| else: | ||
| log.debug("caching disabled") | ||
| return session | ||
|
|
||
| @contextlib.contextmanager | ||
| def open( | ||
| self, | ||
| file_url: str, | ||
| filename: str, | ||
| validate_response: t.Callable[[requests.Response], bool], | ||
| ) -> t.Iterator[t.IO[bytes]]: | ||
| if (not self._cache_dir) or self._disable_cache: | ||
| yield io.BytesIO( | ||
| _get_request(file_url, response_ok=validate_response).content | ||
| ) | ||
| else: | ||
| with open( | ||
| self._download(file_url, filename, response_ok=validate_response), "rb" | ||
| ) as fp: | ||
| yield fp | ||
| response = _get_request(self._session, file_url, response_ok=validate_response) | ||
| yield io.BytesIO(response.content) | ||
|
|
||
| def bind( | ||
| self, | ||
| file_url: str, | ||
| filename: str | None = None, | ||
| validation_callback: t.Callable[[bytes], t.Any] | None = None, | ||
| ) -> BoundCacheDownloader: | ||
| return BoundCacheDownloader( | ||
| file_url, self, filename=filename, validation_callback=validation_callback | ||
| file_url, self, validation_callback=validation_callback | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -186,27 +122,28 @@ def __init__( | |
| file_url: str, | ||
| downloader: CacheDownloader, | ||
| *, | ||
| filename: str | None = None, | ||
| validation_callback: t.Callable[[bytes], t.Any] | None = None, | ||
| ) -> None: | ||
| self._file_url = file_url | ||
| self._filename = filename or url_to_cache_filename(file_url) | ||
| self._downloader = downloader | ||
| self._validation_callback = validation_callback | ||
|
|
||
| @contextlib.contextmanager | ||
| def open(self) -> t.Iterator[t.IO[bytes]]: | ||
| with self._downloader.open( | ||
| self._file_url, | ||
| self._filename, | ||
| validate_response=self._validate_response, | ||
| ) as fp: | ||
| yield fp | ||
|
|
||
| def _validate_response(self, response: requests.Response) -> bool: | ||
| if not self._validation_callback: | ||
| return True | ||
|
|
||
| # CacheControl sets from_cache=True on cache hits; skip re-validation. | ||
| # Plain requests.Session (used when disable_cache=True) doesn't set this | ||
| # attribute at all, so we use getattr with a default. | ||
| if getattr(response, "from_cache", False): | ||
| return True | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This makes me nervous, and relates to my note above about bad data getting into the cache. Now the question I have is: how do we get bad data out of the cache? There's only one bit of code which calls the CacheDownloader in practice. Parsing is actually a dynamic problem! JSON5 parsers are used if available, but not if they aren't. And YAML parsing with In general, I think we need to be prepared for the possibility that data which passed validation yesterday might not be valid today. I'm not quite sure what to do on this front. We can remove this I think we may need the ability to do that. It would mean reaching "down the stack", to I'm also going to ask on the PyPA discord (I see some of the (past?) maintainers of
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Okay, I got two pieces of information from a quick exchange with one of the maintainers.
(2) means that this has a pretty simple resolution. Use (1) is a suggestion, but I think |
||
| try: | ||
| self._validation_callback(response.content) | ||
| return True | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import logging | ||
| import os | ||
| import textwrap | ||
| import typing as t | ||
|
|
@@ -43,6 +44,18 @@ def set_color_mode(ctx: click.Context, param: str, value: str) -> None: | |
| }[value] | ||
|
|
||
|
|
||
| def configure_logging( | ||
| ctx: click.Context, param: click.Parameter, value: str | None | ||
| ) -> None: | ||
| if value is None: | ||
| return | ||
| level = getattr(logging, value.upper()) | ||
| logging.basicConfig( | ||
| level=level, | ||
| format="%(name)s [%(levelname)s]: %(message)s", | ||
| ) | ||
|
|
||
|
|
||
| def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str: | ||
| return textwrap.indent( | ||
| "\n".join( | ||
|
|
@@ -88,12 +101,20 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str: | |
| ) | ||
| @click.help_option("-h", "--help") | ||
| @click.version_option() | ||
| @click.option( | ||
| "--log-level", | ||
| help="Set the log level for debug output (e.g., DEBUG, INFO, WARNING).", | ||
| type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR"], case_sensitive=False), | ||
| callback=configure_logging, | ||
| expose_value=False, | ||
| is_eager=True, | ||
| ) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't mind having an option for setting Python log-level, but I think we should make it I've already exposed possibly a couple too many knobs for logging and output control -- we have |
||
| @click.option( | ||
| "--schemafile", | ||
| help=( | ||
| "The path to a file containing the JSON Schema to use or an " | ||
| "HTTP(S) URI for the schema. If a remote file is used, " | ||
| "it will be downloaded and cached locally based on mtime. " | ||
| "it will be downloaded and cached locally. " | ||
| "Use '-' for stdin." | ||
| ), | ||
| metavar="[PATH|URI]", | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm still reading
`cachecontrol` to figure out a bit more about how it works, but I think `no-cache` will cause it to not cache the response that it gets, even to update/clear an existing cache entry. I'm actually slightly concerned about this. I'm looking at this line, where the
`cached_response` is produced, and the definition site for that includes this bit which checks `no-cache`. The bad scenario is what happens if the response fails to satisfy the
`response_ok()` check. With `no-cache` set on the request, the cache won't be updated. So the first request gets data, which goes into the cache, and then subsequent requests will be unable to clear that data away, even though it fails the `response_ok()` callback. I think I have a solution for this, but not knowing
`cachecontrol`, I'm not 100% certain. If the
`response_ok` callback + the `.ok` check could be encoded as a Caching Heuristic, I think that could stop the cache from being populated with data which fails the check. If this sounds correct, we should update to use a custom heuristic to avoid caching known-bad data.
If it doesn't sound right, help me learn where I've gone wrong in my reasoning! 😁