diff --git a/shadowmire.py b/shadowmire.py
index 7bf595b..df2e4a6 100755
--- a/shadowmire.py
+++ b/shadowmire.py
@@ -36,6 +36,9 @@ logger = logging.getLogger("shadowmire")
USER_AGENT = "Shadowmire (https://github.com/taoky/shadowmire)"
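+# filenames of the local version database and its exports; SyncPlainHTTP fetches local.json and local.db.serial from its upstream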
+LOCAL_DB_NAME = "local.db"
+LOCAL_JSON_NAME = "local.json"
+LOCAL_DB_SERIAL_NAME = "local.db.serial"
# Note that it's suggested to use only 3 workers for PyPI.
WORKERS = int(os.environ.get("SHADOWMIRE_WORKERS", "3"))
@@ -450,6 +453,7 @@ class PyPI:
"files": [],
"meta": {
"api-version": "1.1",
+ # not required by PEP 691, but bandersnatch includes it
"_last-serial": str(package_meta["last_serial"]),
},
"name": package_meta["info"]["name"],
@@ -486,6 +490,7 @@ ShadowmirePackageItem = tuple[str, int]
class Plan:
remove: list[str]
update: list[str]
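+ # last changelog serial reported by the remote when this plan was generated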
+ remote_last_serial: int
def match_patterns(
@@ -535,15 +540,15 @@ class SyncBase:
"""
local should NOT skip invalid (-1) serials
"""
- remote = self.fetch_remote_versions()
- remote = self.filter_remote_with_excludes(remote, excludes)
+ remote_sn, remote_pkgs = self.fetch_remote_versions()
+ remote_pkgs = self.filter_remote_with_excludes(remote_pkgs, excludes)
with open(self.basedir / "remote_excluded.json", "w") as f:
- json.dump(remote, f)
+ json.dump(remote_pkgs, f)
to_remove = []
to_update = []
local_keys = set(local.keys())
- remote_keys = set(remote.keys())
+ remote_keys = set(remote_pkgs.keys())
for i in local_keys - remote_keys:
to_remove.append(i)
local_keys.remove(i)
@@ -566,17 +571,21 @@ class SyncBase:
to_update.append(i)
for i in local_keys:
local_serial = local[i]
- remote_serial = remote[i]
+ remote_serial = remote_pkgs[i]
if local_serial != remote_serial:
if local_serial == -1:
logger.info("skip %s, as it's marked as not exist at upstream", i)
to_remove.append(i)
else:
to_update.append(i)
- output = Plan(remove=to_remove, update=to_update)
+ output = Plan(remove=to_remove, update=to_update, remote_last_serial=remote_sn)
return output
- def fetch_remote_versions(self) -> dict[str, int]:
+ def fetch_remote_versions(self) -> tuple[int, dict[str, int]]:
+ # returns (last_serial, {package_name: serial, ...})
+ raise NotImplementedError
+
+ def get_package_metadata(self, package_name: str) -> dict:
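+ # returns PyPI-compatible JSON metadata for one package; implementations raise PackageNotFoundError when the package is missing upstream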
raise NotImplementedError
def check_and_update(
@@ -623,7 +632,9 @@ class SyncBase:
try:
with open(json_meta_path, "r") as f:
meta = json.load(f)
- meta_filters(meta, package_name, prerelease_excludes, excluded_wheel_filenames)
+ meta_filters(
+ meta, package_name, prerelease_excludes, excluded_wheel_filenames
+ )
release_files = PyPI.get_release_files_from_meta(meta)
hrefs_from_meta = {
PyPI.file_url_to_local_url(i["url"]) for i in release_files
@@ -811,12 +822,12 @@ class SyncBase:
index_html_path.unlink()
index_html_path.symlink_to("index.v1_html")
- def finalize(self) -> None:
+ def finalize(self, index_serial: int) -> None:
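+ # index_serial: the remote's last changelog serial, carried over from the sync plan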
local_names = self.local_db.keys()
- # generate index.html at basedir
- index_path = self.basedir / "simple" / "index.html"
+ # generate v1_html index
+ v1_html_index_path = self.basedir / "simple" / "index.v1_html"
# modified from bandersnatch
- with overwrite(index_path) as f:
+ with overwrite(v1_html_index_path) as f:
f.write("<!DOCTYPE html>\n")
f.write("<html>\n")
f.write("  <head>\n")
@@ -830,6 +841,25 @@ class SyncBase:
# We're really trusty that this is all encoded in UTF-8. :/
f.write(f'    <a href="{pkg}/">{pkg}</a><br/>\n')
f.write("  </body></html>\n")
+ # always link index.html to index.v1_html
+ html_simple_path = self.basedir / "simple" / "index.html"
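+ # an existing symlink is assumed to already point at index.v1_html; only regular files are replaced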
+ if not html_simple_path.is_symlink():
+ html_simple_path.unlink(missing_ok=True)
+ html_simple_path.symlink_to("index.v1_html")
+
+ # generate v1_json index and local.db{,.serial} for downstream use
+ v1_json_index_path = self.basedir / "simple" / "index.v1_json"
+ with overwrite(v1_json_index_path) as f:
+ index_json: dict[str, Any] = {
+ "meta": {
+ "api-version": "1.1",
+ "_last-serial": index_serial,
+ },
+ "projects": [{"name": n} for n in sorted(local_names)],
+ }
+ json.dump(index_json, f)
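+ # publish the serial as plain text so downstream SyncPlainHTTP mirrors can pick it up in fetch_remote_versions()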
+ with overwrite(self.basedir / LOCAL_DB_SERIAL_NAME) as f:
+ f.write(str(index_serial))
self.local_db.dump_json()
def skip_this_package(self, i: dict, dest: Path) -> bool:
@@ -933,14 +963,17 @@ class SyncPyPI(SyncBase):
self.remote_packages: Optional[dict[str, int]] = None
super().__init__(basedir, local_db, sync_packages)
- def fetch_remote_versions(self) -> dict[str, int]:
+ def fetch_remote_versions(self) -> tuple[int, dict[str, int]]:
self.last_serial = self.pypi.changelog_last_serial()
self.remote_packages = self.pypi.list_packages_with_serial()
logger.info("Remote has %s packages", len(self.remote_packages))
with overwrite(self.basedir / "remote.json") as f:
json.dump(self.remote_packages, f)
logger.info("File saved to remote.json.")
- return self.remote_packages
+ return self.last_serial, self.remote_packages
+
+ def get_package_metadata(self, package_name: str) -> dict:
+ return self.pypi.get_package_metadata(package_name)
def do_update(
self,
@@ -953,7 +986,7 @@ class SyncPyPI(SyncBase):
package_simple_path = self.simple_dir / package_name
package_simple_path.mkdir(exist_ok=True)
try:
- meta = self.pypi.get_package_metadata(package_name)
+ meta = self.get_package_metadata(package_name)
meta_original = deepcopy(meta)
logger.debug("%s meta: %s", package_name, meta)
except PackageNotFoundError:
@@ -999,9 +1032,7 @@ class SyncPyPI(SyncBase):
existing_hrefs = [] if existing_hrefs is None else existing_hrefs
release_files = PyPI.get_release_files_from_meta(meta)
# remove packages that no longer exist remotely
- remote_hrefs = [
- PyPI.file_url_to_local_url(i["url"]) for i in release_files
- ]
+ remote_hrefs = [PyPI.file_url_to_local_url(i["url"]) for i in release_files]
should_remove = list(set(existing_hrefs) - set(remote_hrefs))
for href in should_remove:
p = unquote(href)
@@ -1060,20 +1091,49 @@ class SyncPlainHTTP(SyncBase):
self.pypi = None
super().__init__(basedir, local_db, sync_packages)
- def fetch_remote_versions(self) -> dict[str, int]:
- remote: dict[str, int]
+ def fetch_remote_versions(self) -> tuple[int, dict[str, int]]:
+ remote_pkgs: dict[str, int]
if not self.pypi:
- remote_url = urljoin(self.upstream, "local.json")
- resp = self.session.get(remote_url)
+ remote_pkg_db_url = urljoin(self.upstream, LOCAL_JSON_NAME)
+ resp = self.session.get(remote_pkg_db_url)
resp.raise_for_status()
- remote = resp.json()
+ remote_pkgs = resp.json()
+ # start with the max per-package serial in remote_pkgs as a fallback
+ serial = max(remote_pkgs.values()) if remote_pkgs else -1
+ # then prefer the exact last serial published by the upstream, if available
+ remote_last_serial_url = urljoin(self.upstream, LOCAL_DB_SERIAL_NAME)
+ try:
+ resp = self.session.get(remote_last_serial_url)
+ resp.raise_for_status()
+ serial = int(resp.text.strip())
+ except (requests.RequestException, ValueError):
+ logger.warning(
+ f"cannot get last_serial from upstream, fallback to max package serial in {LOCAL_JSON_NAME}",
+ exc_info=True,
+ )
else:
- remote = self.pypi.list_packages_with_serial()
- logger.info("Remote has %s packages", len(remote))
+ serial = self.pypi.changelog_last_serial()
+ remote_pkgs = self.pypi.list_packages_with_serial()
+ logger.info("Remote has %s packages", len(remote_pkgs))
with overwrite(self.basedir / "remote.json") as f:
- json.dump(remote, f)
+ json.dump(remote_pkgs, f)
logger.info("File saved to remote.json.")
- return remote
+ return serial, remote_pkgs
+
+ def get_package_metadata(self, package_name: str) -> dict:
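+ # fetch JSON metadata from the upstream mirror's json/ endpoint; the raw response is also saved under jsonmeta_dir as "<name>.new"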
+ file_url = urljoin(self.upstream, f"json/{package_name}")
+ success, resp = download(
+ self.session, file_url, self.jsonmeta_dir / (package_name + ".new")
+ )
+ if not success:
+ logger.error(
+ "download %s JSON meta fails with code %s",
+ package_name,
+ resp.status_code if resp else None,
+ )
+ raise PackageNotFoundError
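+ # download() always returns the response object when success is True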
+ assert resp
+ return resp.json()
def do_update(
self,
@@ -1089,19 +1149,10 @@ class SyncPlainHTTP(SyncBase):
hrefs = get_existing_hrefs(package_simple_path)
existing_hrefs = [] if hrefs is None else hrefs
# Download JSON meta
- file_url = urljoin(self.upstream, f"json/{package_name}")
- success, resp = download(
- self.session, file_url, self.jsonmeta_dir / (package_name + ".new")
- )
- if not success:
- logger.error(
- "download %s JSON meta fails with code %s",
- package_name,
- resp.status_code if resp else None,
- )
+ try:
+ meta = self.get_package_metadata(package_name)
+ except PackageNotFoundError:
return None
- assert resp
- meta = resp.json()
# filter prerelease and wheel files, if necessary
meta_filters(meta, package_name, prerelease_excludes, excluded_wheel_filenames)
@@ -1264,8 +1315,7 @@ def cli(ctx: click.Context, repo: str) -> None:
# Make sure basedir is absolute
basedir = Path(repo).resolve()
- local_db = LocalVersionKV(basedir / "local.db", basedir / "local.json")
-
+ local_db = LocalVersionKV(basedir / LOCAL_DB_NAME, basedir / LOCAL_JSON_NAME)
ctx.obj["basedir"] = basedir
ctx.obj["local_db"] = local_db
@@ -1323,7 +1373,7 @@ def sync(
with overwrite(basedir / "plan.json") as f:
json.dump(plan, f, default=vars, indent=2)
success = syncer.do_sync_plan(plan, prerelease_excludes, excluded_wheel_filenames)
- syncer.finalize()
+ syncer.finalize(plan.remote_last_serial)
logger.info("Synchronization finished. Success: %s", success)
@@ -1488,7 +1538,7 @@ def verify(
packages_pathcache,
compare_size,
)
- syncer.finalize()
+ syncer.finalize(plan.remote_last_serial)
logger.info(
"====== Step 5. Remove any unreferenced files in `packages` folder ======"