commit 6be5d42db3
parent 9ee3c207a9
Author: Shengqi Chen <harry-chen@outlook.com>
Date:   2025-09-02 23:05:13 +08:00

Signed-off-by: Shengqi Chen <harry-chen@outlook.com>


@@ -24,6 +24,7 @@ import tomllib
 from copy import deepcopy
 import functools
 from http.client import HTTPConnection
+import socket
 import requests
 import click
@@ -43,9 +44,10 @@ IOWORKERS = int(os.environ.get("SHADOWMIRE_IOWORKERS", "2"))
 # A safety net -- to avoid upstream issues causing too many packages to be removed when determining the sync plan.
 MAX_DELETION = int(os.environ.get("SHADOWMIRE_MAX_DELETION", "50000"))
 # Sometimes PyPI is not consistent -- new packages could fail to be fetched. This option tries to avoid permanently marking that kind of package as nonexistent.
-IGNORE_THRESHOLD = int(os.environ.get("SHADOWMIRE_IGNORE_THRESHOLD", "1024"))
+IGNORE_THRESHOLD = int(os.environ.get("SHADOWMIRE_IGNORE_THRESHOLD", "10000"))
 # https://github.com/pypa/bandersnatch/blob/a05af547f8d1958217ef0dc0028890b1839e6116/src/bandersnatch_filter_plugins/prerelease_name.py#L18C1-L23C6
+# These patterns shall work the same in both re.match() and re.search(), as they begin with .+
 PRERELEASE_PATTERNS = (
     re.compile(r".+rc\d+$"),
     re.compile(r".+a(lpha)?\d+$"),
@@ -298,8 +300,9 @@ class CustomXMLRPCTransport(xmlrpc.client.Transport):
     def make_connection(self, host: tuple[str, dict[str, str]] | str) -> HTTPConnection:
         conn = super().make_connection(host)
-        if conn.timeout is None:
-            # 2 min timeout
+        if socket.getdefaulttimeout() is None:
+            # By default conn.timeout is socket._GLOBAL_DEFAULT_TIMEOUT instead of None,
+            # so check whether a global default timeout is set and, if not, add a 2-minute timeout.
             conn.timeout = 120
         return conn
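The distinction matters because the old `conn.timeout is None` check could never fire. A minimal standard-library sketch of the sentinel behavior the new comment describes:

import socket
import xmlrpc.client

# http.client.HTTPConnection defaults its timeout to the private sentinel
# socket._GLOBAL_DEFAULT_TIMEOUT rather than None; the process-wide default
# is reported by socket.getdefaulttimeout() instead.
conn = xmlrpc.client.Transport().make_connection("pypi.org")
print(conn.timeout is None)                # False: it is the sentinel object
print(socket.getdefaulttimeout() is None)  # True, unless setdefaulttimeout() was called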
@@ -488,8 +491,13 @@ class Plan:
 def match_patterns(
     s: str, ps: list[re.Pattern[str]] | tuple[re.Pattern[str], ...]
 ) -> bool:
+    """
+    Check if any of the patterns matches the string `s`.
+    Uses re.search(), so patterns may match anywhere in the string.
+    """
     for p in ps:
-        if p.match(s):
+        if p.search(s):
             return True
     return False
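Since user-supplied exclusion patterns are not necessarily anchored, the switch from re.match() to re.search() is what lets them hit anywhere in a wheel filename. A small illustration (pattern and filename are hypothetical):

import re

# Patterns shaped like the built-in PRERELEASE_PATTERNS (".+<suffix>$")
# answer identically under match() and search():
anchored = re.compile(r".+rc\d+$")
assert anchored.match("1.2.0rc1") and anchored.search("1.2.0rc1")

# An unanchored user pattern only works with search(), because match()
# is pinned to position 0 of the string:
unanchored = re.compile(r"manylinux1")
name = "numpy-1.26.0-cp311-cp311-manylinux1_x86_64.whl"
assert unanchored.match(name) is None
assert unanchored.search(name) is not None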
@@ -575,6 +583,7 @@ class SyncBase:
         self,
         package_names: list[str],
         prerelease_excludes: list[re.Pattern[str]],
+        excluded_wheel_filenames: list[re.Pattern[str]],
         json_files: set[str],
         packages_pathcache: set[str],
         compare_size: bool,
@@ -609,6 +618,30 @@ class SyncBase:
             # something unexpected happens...
             logger.info("add %s as its indexes are not consistent", package_name)
             return False
+        # Check against the JSON meta, ensuring that the package file list is consistent
+        json_meta_path = self.jsonmeta_dir / package_name
+        try:
+            with open(json_meta_path, "r") as f:
+                meta = json.load(f)
+            meta_filters(meta, package_name, prerelease_excludes, excluded_wheel_filenames)
+            release_files = PyPI.get_release_files_from_meta(meta)
+            hrefs_from_meta = {
+                PyPI.file_url_to_local_url(i["url"]) for i in release_files
+            }
+        except (json.JSONDecodeError, FileNotFoundError, KeyError):
+            logger.info(
+                "add %s as its JSON meta is not valid",
+                package_name,
+            )
+            return False
+        for href in hrefs_html:
+            if href not in hrefs_from_meta:
+                logger.info(
+                    "add %s as its HTML index has href %s not in JSON meta",
+                    package_name,
+                    href,
+                )
+                return False
         # OK, check if all hrefs have corresponding files
         if self.sync_packages:
@@ -665,16 +698,25 @@ class SyncBase:
             exit_with_futures(futures)
         logger.info("%s packages to update in check_and_update()", len(to_update))
-        return self.parallel_update(to_update, prerelease_excludes)
+        return self.parallel_update(
+            to_update, prerelease_excludes, excluded_wheel_filenames
+        )

     def parallel_update(
-        self, package_names: list[str], prerelease_excludes: list[re.Pattern[str]]
+        self,
+        package_names: list[str],
+        prerelease_excludes: list[re.Pattern[str]],
+        excluded_wheel_filenames: list[re.Pattern[str]],
     ) -> bool:
         success = True
         with ThreadPoolExecutor(max_workers=WORKERS) as executor:
             futures = {
                 executor.submit(
-                    self.do_update, package_name, prerelease_excludes, False
+                    self.do_update,
+                    package_name,
+                    prerelease_excludes,
+                    excluded_wheel_filenames,
+                    False,
                 ): (
                     idx,
                     package_name,
@@ -705,7 +747,10 @@ class SyncBase:
         return success

     def do_sync_plan(
-        self, plan: Plan, prerelease_excludes: list[re.Pattern[str]]
+        self,
+        plan: Plan,
+        prerelease_excludes: list[re.Pattern[str]],
+        excluded_wheel_filenames: list[re.Pattern[str]],
     ) -> bool:
         to_remove = plan.remove
         to_update = plan.update
@@ -713,7 +758,9 @@ class SyncBase:
         for package_name in to_remove:
             self.do_remove(package_name)
-        return self.parallel_update(to_update, prerelease_excludes)
+        return self.parallel_update(
+            to_update, prerelease_excludes, excluded_wheel_filenames
+        )

     def do_remove(
         self, package_name: str, use_db: bool = True, remove_packages: bool = True
@@ -742,6 +789,7 @@ class SyncBase:
         self,
         package_name: str,
         prerelease_excludes: list[re.Pattern[str]],
+        excluded_wheel_filenames: list[re.Pattern[str]],
         use_db: bool = True,
     ) -> Optional[int]:
         raise NotImplementedError
@@ -828,10 +876,51 @@ def download(
 def filter_release_from_meta(
     meta: dict, patterns: list[re.Pattern[str]] | tuple[re.Pattern[str], ...]
-) -> None:
+) -> bool:
+    """
+    Returns True if meta changes, False otherwise.
+    """
+    changed = False
     for release in list(meta["releases"].keys()):
         if match_patterns(release, patterns):
             del meta["releases"][release]
+            changed = True
+    return changed
+
+
+def filter_wheel_file_from_meta(
+    meta: dict, patterns: list[re.Pattern[str]] | tuple[re.Pattern[str], ...]
+) -> bool:
+    """
+    Returns True if meta changes, False otherwise.
+    """
+    changed = False
+    for release_infos in meta["releases"].values():
+        for release_idx in range(len(release_infos) - 1, -1, -1):
+            release_info = release_infos[release_idx]
+            filename = release_info["filename"]
+            if match_patterns(filename, patterns):
+                del release_infos[release_idx]
+                changed = True
+    return changed
+
+
+def meta_filters(
+    meta: dict,
+    package_name: str,
+    prerelease_excludes: list[re.Pattern[str]],
+    excluded_wheel_filenames: list[re.Pattern[str]],
+) -> bool:
+    """
+    Apply filters to the package meta.
+    Returns True if meta changes, False otherwise.
+    """
+    changed = False
+    if match_patterns(package_name, prerelease_excludes):
+        changed |= filter_release_from_meta(meta, PRERELEASE_PATTERNS)
+    if excluded_wheel_filenames:
+        changed |= filter_wheel_file_from_meta(meta, excluded_wheel_filenames)
+    return changed


 class SyncPyPI(SyncBase):
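A hypothetical round trip through the new helpers, with a meta dict trimmed to just the PyPI JSON API shape the filters touch (package name, filenames, and patterns are made up):

import re

meta = {
    "releases": {
        "1.0.0rc1": [{"filename": "pkg-1.0.0rc1-py3-none-any.whl"}],
        "1.0.0": [
            {"filename": "pkg-1.0.0-py3-none-any.whl"},
            {"filename": "pkg-1.0.0-cp311-cp311-win32.whl"},
        ],
    }
}
changed = meta_filters(
    meta,
    "pkg",  # matches the prerelease exclude below, so rc releases are dropped
    prerelease_excludes=[re.compile(r"^pkg$")],
    excluded_wheel_filenames=[re.compile(r".+win32\.whl$")],
)
assert changed
assert list(meta["releases"]) == ["1.0.0"]
assert [f["filename"] for f in meta["releases"]["1.0.0"]] == [
    "pkg-1.0.0-py3-none-any.whl"
]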
@@ -857,6 +946,7 @@ class SyncPyPI(SyncBase):
         self,
         package_name: str,
         prerelease_excludes: list[re.Pattern[str]],
+        excluded_wheel_filenames: list[re.Pattern[str]],
         use_db: bool = True,
     ) -> Optional[int]:
         logger.info("updating %s", package_name)
@@ -900,9 +990,8 @@ class SyncPyPI(SyncBase):
             self.local_db.set(package_name, -1)
             return None
-        # filter prerelease, if necessary
-        if match_patterns(package_name, prerelease_excludes):
-            filter_release_from_meta(meta, PRERELEASE_PATTERNS)
+        # filter prereleases and wheel files, if necessary
+        meta_filters(meta, package_name, prerelease_excludes, excluded_wheel_filenames)

         if self.sync_packages:
             # sync packages first, then sync index
@@ -911,7 +1000,7 @@ class SyncPyPI(SyncBase):
             release_files = PyPI.get_release_files_from_meta(meta)
             # remove packages that no longer exist remotely
             remote_hrefs = [
-                self.pypi.file_url_to_local_url(i["url"]) for i in release_files
+                PyPI.file_url_to_local_url(i["url"]) for i in release_files
             ]
             should_remove = list(set(existing_hrefs) - set(remote_hrefs))
             for href in should_remove:
@@ -942,6 +1031,9 @@ class SyncPyPI(SyncBase):
         json_meta_path = self.jsonmeta_dir / package_name
         with overwrite(json_meta_path) as f:
             # Note that we're writing meta_original here!
+            # This is also the case for SyncPlainHTTP.
+            # When syncing, we want to keep the original meta (json/),
+            # but index.v1_json and index.v1_html are generated from the modified meta.
             json.dump(meta_original, f)

         if use_db:
@@ -987,6 +1079,7 @@ class SyncPlainHTTP(SyncBase):
         self,
         package_name: str,
         prerelease_excludes: list[re.Pattern[str]],
+        excluded_wheel_filenames: list[re.Pattern[str]],
         use_db: bool = True,
     ) -> Optional[int]:
         logger.info("updating %s", package_name)
@@ -1009,9 +1102,8 @@ class SyncPlainHTTP(SyncBase):
             return None
         assert resp
         meta = resp.json()
-        # filter prerelease, if necessary
-        if match_patterns(package_name, prerelease_excludes):
-            filter_release_from_meta(meta, PRERELEASE_PATTERNS)
+        # filter prereleases and wheel files, if necessary
+        meta_filters(meta, package_name, prerelease_excludes, excluded_wheel_filenames)

         if self.sync_packages:
             release_files = PyPI.get_release_files_from_meta(meta)
@@ -1034,8 +1126,24 @@ class SyncPlainHTTP(SyncBase):
                 dest.parent.mkdir(parents=True, exist_ok=True)
                 success, resp = download(self.session, url, dest)
                 if not success:
-                    logger.warning("skipping %s as it fails downloading", package_name)
-                    return None
+                    if resp and resp.status_code == 404:
+                        # handle special case: upstream filters out some files
+                        logger.warning(
+                            "cannot find %s at upstream, fallback to pypi", url
+                        )
+                        url = i["url"]  # original pypi URL
+                        success, resp = download(self.session, url, dest)
+                        if not success:
+                            logger.warning(
+                                "skipping %s as it fails downloading (from pypi)",
+                                package_name,
+                            )
+                            return None
+                    else:
+                        logger.warning(
+                            "skipping %s as it fails downloading", package_name
+                        )
+                        return None

         # OK, now it's safe to rename
         (self.jsonmeta_dir / (package_name + ".new")).rename(
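The fallback path relies on download() returning both a success flag and the response object, so the caller can distinguish a 404 (the upstream filtered that file out) from other failures. A minimal sketch of that contract (hypothetical stand-in; the real helper may also verify sizes and write via temporary files):

from pathlib import Path
from typing import Optional
import requests

def download_sketch(
    session: requests.Session, url: str, dest: Path
) -> tuple[bool, Optional[requests.Response]]:
    # Stream url to dest, returning (ok, response). Returning the response
    # even on failure is what enables the resp.status_code == 404 check above.
    try:
        resp = session.get(url, stream=True, timeout=(5, 60))
    except requests.RequestException:
        return False, None
    if resp.status_code != 200:
        return False, resp
    with open(dest, "wb") as f:
        for chunk in resp.iter_content(chunk_size=1 << 16):
            f.write(chunk)
    return True, resp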
@@ -1095,6 +1203,11 @@ def sync_shared_args(func: Callable[..., Any]) -> Callable[..., Any]:
             multiple=True,
             help="Package names of which prereleases will be excluded. Regex.",
         ),
+        click.option(
+            "--excluded-wheel-filename",
+            multiple=True,
+            help="Specify patterns to exclude wheel files (applies to all packages). Regex.",
+        ),
     ]
     for option in shared_options[::-1]:
         func = option(func)
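With the option wired into sync_shared_args, wheel-filename excludes can be stacked on the command line; for example (entry point and patterns are illustrative):

./shadowmire.py sync --sync-packages \
    --excluded-wheel-filename '.+-win32\.whl$' \
    --excluded-wheel-filename '.+musllinux.+'

Each occurrence is collected by click's multiple=True and compiled into the excluded_wheel_filenames regex list via exclude_to_excludes().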
@@ -1193,12 +1306,14 @@ def sync(
     shadowmire_upstream: Optional[str],
     exclude: tuple[str],
     prerelease_exclude: tuple[str],
+    excluded_wheel_filename: tuple[str],
     use_pypi_index: bool,
 ) -> None:
     basedir: Path = ctx.obj["basedir"]
     local_db: LocalVersionKV = ctx.obj["local_db"]
     excludes = exclude_to_excludes(exclude)
     prerelease_excludes = exclude_to_excludes(prerelease_exclude)
+    excluded_wheel_filenames = exclude_to_excludes(excluded_wheel_filename)
     syncer = get_syncer(
         basedir, local_db, sync_packages, shadowmire_upstream, use_pypi_index
     )
@@ -1207,7 +1322,7 @@ def sync(
     # save plan for debugging
     with overwrite(basedir / "plan.json") as f:
         json.dump(plan, f, default=vars, indent=2)
-    success = syncer.do_sync_plan(plan, prerelease_excludes)
+    success = syncer.do_sync_plan(plan, prerelease_excludes, excluded_wheel_filenames)
     syncer.finalize()
     logger.info("Synchronization finished. Success: %s", success)
@@ -1277,6 +1392,7 @@ def verify(
     shadowmire_upstream: Optional[str],
     exclude: tuple[str],
     prerelease_exclude: tuple[str],
+    excluded_wheel_filename: tuple[str],
     remove_not_in_local: bool,
     compare_size: bool,
     use_pypi_index: bool,
@@ -1285,6 +1401,7 @@ def verify(
     local_db: LocalVersionKV = ctx.obj["local_db"]
     excludes = exclude_to_excludes(exclude)
     prerelease_excludes = exclude_to_excludes(prerelease_exclude)
+    excluded_wheel_filenames = exclude_to_excludes(excluded_wheel_filename)
     syncer = get_syncer(
         basedir, local_db, sync_packages, shadowmire_upstream, use_pypi_index
     )
@@ -1341,7 +1458,9 @@ def verify(
             return res

         futures = {
-            executor.submit(packages_iterate, first_dir.name, idx % IOWORKERS): first_dir.name  # type: ignore
+            executor.submit(
+                packages_iterate, first_dir.name, idx % IOWORKERS
+            ): first_dir.name  # type: ignore
             for idx, first_dir in enumerate(fast_iterdir((basedir / "packages"), "dir"))
         }
         try:
@@ -1364,6 +1483,7 @@ def verify(
     success = syncer.check_and_update(
         list(local_names),
         prerelease_excludes,
+        excluded_wheel_filenames,
         json_files,
         packages_pathcache,
         compare_size,
@@ -1393,7 +1513,8 @@ def verify(
         # MyPy does not enjoy the same variable name having different types, even with --allow-redefinition
         # Ignore here to make mypy happy
         futures = {
-            executor.submit(iterate_simple, sname): sname for sname in simple_dirs  # type: ignore
+            executor.submit(iterate_simple, sname): sname
+            for sname in simple_dirs  # type: ignore
         }
         try:
             for future in tqdm(
@@ -1418,7 +1539,7 @@ def verify(
         for path in tqdm(packages_pathcache, desc="Iterating path cache"):
             if path not in ref_set:
                 logger.info("removing unreferenced file %s", path)
-                Path(path).unlink()
+                Path(path).unlink(missing_ok=True)

     logger.info("Verification finished. Success: %s", success)
@@ -1436,6 +1557,7 @@ def do_update(
     shadowmire_upstream: Optional[str],
     exclude: tuple[str],
     prerelease_exclude: tuple[str],
+    excluded_wheel_filename: tuple[str],
     use_pypi_index: bool,
     package_name: str,
 ) -> None:
@@ -1445,10 +1567,11 @@ def do_update(
     if excludes:
         logger.warning("--exclude is ignored in do_update()")
     prerelease_excludes = exclude_to_excludes(prerelease_exclude)
+    excluded_wheel_filenames = exclude_to_excludes(excluded_wheel_filename)
     syncer = get_syncer(
         basedir, local_db, sync_packages, shadowmire_upstream, use_pypi_index
     )
-    syncer.do_update(package_name, prerelease_excludes)
+    syncer.do_update(package_name, prerelease_excludes, excluded_wheel_filenames)


 @cli.command(help="Manual remove given package for debugging purpose")
@@ -1461,12 +1584,13 @@ def do_remove(
     shadowmire_upstream: Optional[str],
     exclude: tuple[str],
     prerelease_exclude: tuple[str],
+    excluded_wheel_filename: tuple[str],
     use_pypi_index: bool,
     package_name: str,
 ) -> None:
     basedir = ctx.obj["basedir"]
     local_db = ctx.obj["local_db"]
-    if exclude or prerelease_exclude:
+    if exclude or prerelease_exclude or excluded_wheel_filename:
         logger.warning("exclusion rules are ignored in do_remove()")
     syncer = get_syncer(
         basedir, local_db, sync_packages, shadowmire_upstream, use_pypi_index