aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Zac Medico <zmedico@gentoo.org> 2021-02-24 11:56:38 -0800
committer Zac Medico <zmedico@gentoo.org> 2021-02-26 23:43:23 -0800
commit fd04c5fb1619f86381b5d5e6ff66b20fa3967c43 (patch)
tree c8f94ee19d8c4932317ca54f890d75bb923523ea
parent emirrordist: support minimal object as options for use in unit tests (diff)
download portage-fd04c5fb.tar.gz
portage-fd04c5fb.tar.bz2
portage-fd04c5fb.zip
emirrordist: add --content-db option required for content-hash layout (bug 756778)
Add a --content-db option which is required for the content-hash layout because its file listings return content digests instead of distfile names. The content db serves to translate content digests to distfiles names, and distfiles names to content digests. All keys have one or more prefixes separated by colons. For a digest key, the first prefix is "digest" and the second prefix is the hash algorithm name. For a filename key, the prefix is "filename". The value associated with a digest key is a set of file names. The value associated with a distfile key is a set of content revisions. Each content revision is expressed as a dictionary of digests which is suitable for construction of a DistfileName instance. A given content digest will translate to multiple distfile names if multiple associations have been created via the content db add method. The relationship between a content digest and a distfile name is similar to the relationship between an inode and a hardlink. Bug: https://bugs.gentoo.org/756778 Signed-off-by: Zac Medico <zmedico@gentoo.org>
-rw-r--r-- lib/portage/_emirrordist/Config.py 6
-rw-r--r-- lib/portage/_emirrordist/ContentDB.py 196
-rw-r--r-- lib/portage/_emirrordist/DeletionIterator.py 25
-rw-r--r-- lib/portage/_emirrordist/DeletionTask.py 8
-rw-r--r-- lib/portage/_emirrordist/FetchTask.py 5
-rw-r--r-- lib/portage/_emirrordist/main.py 15
-rw-r--r-- lib/portage/package/ebuild/fetch.py 8
-rw-r--r-- lib/portage/tests/ebuild/test_fetch.py 148
-rw-r--r-- man/emirrordist.1 6
9 files changed, 407 insertions, 10 deletions
diff --git a/lib/portage/_emirrordist/Config.py b/lib/portage/_emirrordist/Config.py
index 1c7a27d66..a4b75809f 100644
--- a/lib/portage/_emirrordist/Config.py
+++ b/lib/portage/_emirrordist/Config.py
@@ -10,6 +10,7 @@ import time
from portage import os
from portage.package.ebuild.fetch import MirrorLayoutConfig
from portage.util import grabdict, grablines
+from .ContentDB import ContentDB
class Config:
def __init__(self, options, portdb, event_loop):
@@ -65,6 +66,11 @@ class Config:
self.distfiles_db = self._open_shelve(
options.distfiles_db, 'distfiles')
+ self.content_db = None
+ if getattr(options, 'content_db', None) is not None:
+ self.content_db = ContentDB(self._open_shelve(
+ options.content_db, 'content'))
+
self.deletion_db = None
if getattr(options, 'deletion_db', None) is not None:
self.deletion_db = self._open_shelve(
diff --git a/lib/portage/_emirrordist/ContentDB.py b/lib/portage/_emirrordist/ContentDB.py
new file mode 100644
index 000000000..d9ce3cc45
--- /dev/null
+++ b/lib/portage/_emirrordist/ContentDB.py
@@ -0,0 +1,196 @@
+# Copyright 2021 Gentoo Authors
+# Distributed under the terms of the GNU General Public License v2
+
+import logging
+import operator
+import shelve
+import typing
+
+from portage.package.ebuild.fetch import DistfileName
+
+
+class ContentDB:
+ """
+ The content db serves to translate content digests to distfiles
+ names, and distfiles names to content digests. All keys have one or
+ more prefixes separated by colons. For a digest key, the first
+ prefix is "digest" and the second prefix is the hash algorithm name.
+ For a filename key, the prefix is "filename".
+
+ The value associated with a digest key is a set of file names. The
+ value associated with a distfile key is a set of content revisions.
+ Each content revision is expressed as a dictionary of digests which
+ is suitable for construction of a DistfileName instance.
+ """
+
+ def __init__(self, shelve_instance: shelve.Shelf):
+ self._shelve = shelve_instance
+
+ def add(self, filename: DistfileName):
+ """
+ Add file name and digests, creating a new content revision, or
+ incrementing the reference count to an identical content revision
+ if one exists. If the file name had previous content revisions,
+ then they continue to exist independently of the new one.
+
+ @param filename: file name with digests attribute
+ """
+ distfile_str = str(filename)
+ distfile_key = "filename:{}".format(distfile_str)
+ for k, v in filename.digests.items():
+ if k != "size":
+ digest_key = "digest:{}:{}".format(k.upper(), v.lower())
+ try:
+ digest_files = self._shelve[digest_key]
+ except KeyError:
+ digest_files = set()
+ digest_files.add(distfile_str)
+ self._shelve[digest_key] = digest_files
+ try:
+ content_revisions = self._shelve[distfile_key]
+ except KeyError:
+ content_revisions = set()
+
+ revision_key = tuple(
+ sorted(
+ (
+ (algo.upper(), filename.digests[algo.upper()].lower())
+ for algo in filename.digests
+ if algo != "size"
+ ),
+ key=operator.itemgetter(0),
+ )
+ )
+ content_revisions.add(revision_key)
+ self._shelve[distfile_key] = content_revisions
+
+ def remove(self, filename: DistfileName):
+ """
+ Remove a file name and digests from the database. If identical
+ content is still referenced by one or more other file names,
+ then those references are preserved (like removing one of many
+ hardlinks). Also, this file name may reference other content
+ revisions with different digests, and those content revisions
+ will remain as well.
+
+ @param filename: file name with digests attribute
+ """
+ distfile_key = "filename:{}".format(filename)
+ try:
+ content_revisions = self._shelve[distfile_key]
+ except KeyError:
+ pass
+ else:
+ remaining = set()
+ for revision_key in content_revisions:
+ if not any(digest_item in revision_key for digest_item in filename.digests.items()):
+ remaining.add(revision_key)
+ continue
+ for k, v in revision_key:
+ digest_key = "digest:{}:{}".format(k, v)
+ try:
+ digest_files = self._shelve[digest_key]
+ except KeyError:
+ digest_files = set()
+
+ try:
+ digest_files.remove(filename)
+ except KeyError:
+ pass
+
+ if digest_files:
+ self._shelve[digest_key] = digest_files
+ else:
+ try:
+ del self._shelve[digest_key]
+ except KeyError:
+ pass
+
+ if remaining:
+ logging.debug(("drop '%s' revision(s) from content db") % filename)
+ self._shelve[distfile_key] = remaining
+ else:
+ logging.debug(("drop '%s' from content db") % filename)
+ try:
+ del self._shelve[distfile_key]
+ except KeyError:
+ pass
+
+ def get_filenames_translate(
+ self, filename: typing.Union[str, DistfileName]
+ ) -> typing.Generator[DistfileName, None, None]:
+ """
+ Translate distfiles content digests to zero or more distfile names.
+ If filename is already a distfile name, then it will pass
+ through unchanged.
+
+ A given content digest will translate to multiple distfile names if
+ multiple associations have been created via the add method. The
+ relationship between a content digest and a distfile name is similar
+ to the relationship between an inode and a hardlink.
+
+ @param filename: A filename listed by layout get_filenames
+ """
+ if not isinstance(filename, DistfileName):
+ filename = DistfileName(filename)
+
+ # Match content digests with zero or more content revisions.
+ matched_revisions = {}
+
+ for k, v in filename.digests.items():
+ digest_item = (k.upper(), v.lower())
+ digest_key = "digest:{}:{}".format(*digest_item)
+ try:
+ digest_files = self._shelve[digest_key]
+ except KeyError:
+ continue
+
+ for distfile_str in digest_files:
+ matched_revisions.setdefault(distfile_str, set())
+ try:
+ content_revisions = self._shelve["filename:{}".format(distfile_str)]
+ except KeyError:
+ pass
+ else:
+ for revision_key in content_revisions:
+ if (
+ digest_item in revision_key
+ and revision_key not in matched_revisions[distfile_str]
+ ):
+ matched_revisions[distfile_str].add(revision_key)
+ yield DistfileName(distfile_str, digests=dict(revision_key))
+
+ if not any(matched_revisions.values()):
+ # Since filename matched zero content revisions, allow
+ # it to pass through unchanged (on the path toward deletion).
+ yield filename
+
+ def __len__(self):
+ return len(self._shelve)
+
+ def __contains__(self, k):
+ return k in self._shelve
+
+ def __iter__(self):
+ return self._shelve.__iter__()
+
+ def items(self):
+ return self._shelve.items()
+
+ def __setitem__(self, k, v):
+ self._shelve[k] = v
+
+ def __getitem__(self, k):
+ return self._shelve[k]
+
+ def __delitem__(self, k):
+ del self._shelve[k]
+
+ def get(self, k, *args):
+ return self._shelve.get(k, *args)
+
+ def close(self):
+ self._shelve.close()
+
+ def clear(self):
+ self._shelve.clear()
diff --git a/lib/portage/_emirrordist/DeletionIterator.py b/lib/portage/_emirrordist/DeletionIterator.py
index 08985ed6c..ab4309f9a 100644
--- a/lib/portage/_emirrordist/DeletionIterator.py
+++ b/lib/portage/_emirrordist/DeletionIterator.py
@@ -1,10 +1,12 @@
-# Copyright 2013-2019 Gentoo Authors
+# Copyright 2013-2021 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2
+import itertools
import logging
import stat
from portage import os
+from portage.package.ebuild.fetch import DistfileName
from .DeletionTask import DeletionTask
class DeletionIterator:
@@ -21,8 +23,25 @@ class DeletionIterator:
deletion_delay = self._config.options.deletion_delay
start_time = self._config.start_time
distfiles_set = set()
- for layout in self._config.layouts:
- distfiles_set.update(layout.get_filenames(distdir))
+ distfiles_set.update(
+ (
+ filename
+ if isinstance(filename, DistfileName)
+ else DistfileName(filename)
+ for filename in itertools.chain.from_iterable(
+ layout.get_filenames(distdir) for layout in self._config.layouts
+ )
+ )
+ if self._config.content_db is None
+ else itertools.chain.from_iterable(
+ (
+ self._config.content_db.get_filenames_translate(filename)
+ for filename in itertools.chain.from_iterable(
+ layout.get_filenames(distdir) for layout in self._config.layouts
+ )
+ )
+ )
+ )
for filename in distfiles_set:
# require at least one successful stat()
exceptions = []
diff --git a/lib/portage/_emirrordist/DeletionTask.py b/lib/portage/_emirrordist/DeletionTask.py
index 5eb01d840..73493c5a1 100644
--- a/lib/portage/_emirrordist/DeletionTask.py
+++ b/lib/portage/_emirrordist/DeletionTask.py
@@ -5,6 +5,7 @@ import errno
import logging
from portage import os
+from portage.package.ebuild.fetch import ContentHashLayout
from portage.util._async.FileCopier import FileCopier
from _emerge.CompositeTask import CompositeTask
@@ -99,6 +100,10 @@ class DeletionTask(CompositeTask):
def _delete_links(self):
success = True
for layout in self.config.layouts:
+ if isinstance(layout, ContentHashLayout) and not self.distfile.digests:
+ logging.debug(("_delete_links: '%s' has "
+ "no digests") % self.distfile)
+ continue
distfile_path = os.path.join(
self.config.options.distfiles,
layout.get_path(self.distfile))
@@ -134,6 +139,9 @@ class DeletionTask(CompositeTask):
logging.debug(("drop '%s' from "
"distfiles db") % self.distfile)
+ if self.config.content_db is not None:
+ self.config.content_db.remove(self.distfile)
+
if self.config.deletion_db is not None:
try:
del self.config.deletion_db[self.distfile]
diff --git a/lib/portage/_emirrordist/FetchTask.py b/lib/portage/_emirrordist/FetchTask.py
index 997762082..5a48f91cd 100644
--- a/lib/portage/_emirrordist/FetchTask.py
+++ b/lib/portage/_emirrordist/FetchTask.py
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Gentoo Authors
+# Copyright 2013-2021 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2
import collections
@@ -47,6 +47,9 @@ class FetchTask(CompositeTask):
# Convert _pkg_str to str in order to prevent pickle problems.
self.config.distfiles_db[self.distfile] = str(self.cpv)
+ if self.config.content_db is not None:
+ self.config.content_db.add(self.distfile)
+
if not self._have_needed_digests():
msg = "incomplete digests: %s" % " ".join(self.digests)
self.scheduler.output(msg, background=self.background,
diff --git a/lib/portage/_emirrordist/main.py b/lib/portage/_emirrordist/main.py
index 8d00a05f5..2200ec715 100644
--- a/lib/portage/_emirrordist/main.py
+++ b/lib/portage/_emirrordist/main.py
@@ -1,4 +1,4 @@
-# Copyright 2013-2020 Gentoo Authors
+# Copyright 2013-2021 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2
import argparse
@@ -7,6 +7,7 @@ import sys
import portage
from portage import os
+from portage.package.ebuild.fetch import ContentHashLayout
from portage.util import normalize_path, _recursive_file_list
from portage.util._async.run_main_scheduler import run_main_scheduler
from portage.util._async.SchedulerInterface import SchedulerInterface
@@ -152,6 +153,12 @@ common_options = (
"metavar" : "FILE"
},
{
+ "longopt" : "--content-db",
+ "help" : "database file used to map content digests to "
+ "distfiles names (required for content-hash layout)",
+ "metavar" : "FILE"
+ },
+ {
"longopt" : "--recycle-dir",
"help" : "directory for extended retention of files that "
"are removed from distdir with the --delete option",
@@ -441,6 +448,12 @@ def emirrordist_main(args):
if not options.mirror:
parser.error('No action specified')
+ if options.delete and config.content_db is None:
+ for layout in config.layouts:
+ if isinstance(layout, ContentHashLayout):
+ parser.error("content-hash layout requires "
+ "--content-db to be specified")
+
returncode = os.EX_OK
if options.mirror:
diff --git a/lib/portage/package/ebuild/fetch.py b/lib/portage/package/ebuild/fetch.py
index a683793f0..73abec595 100644
--- a/lib/portage/package/ebuild/fetch.py
+++ b/lib/portage/package/ebuild/fetch.py
@@ -365,10 +365,10 @@ class DistfileName(str):
In order to prepare for a migration from filename-hash to
content-hash layout, all consumers of the layout get_filenames
method need to be updated to work with content digests as a
- substitute for distfile names. For example, in order to prepare
- emirrordist for content-hash, a key-value store needs to be
- added as a means to associate distfile names with content
- digest values yielded by the content-hash get_filenames
+ substitute for distfile names. For example, emirrordist requires
+ the --content-db option when working with a content-hash layout,
+ which serves as a means to associate distfile names
+ with content digest values yielded by the content-hash get_filenames
implementation.
"""
def __new__(cls, s, digests=None):
diff --git a/lib/portage/tests/ebuild/test_fetch.py b/lib/portage/tests/ebuild/test_fetch.py
index d50a4cbfc..24990e4db 100644
--- a/lib/portage/tests/ebuild/test_fetch.py
+++ b/lib/portage/tests/ebuild/test_fetch.py
@@ -4,6 +4,7 @@
import functools
import io
import tempfile
+import types
import portage
from portage import shutil, os
@@ -28,6 +29,7 @@ from portage.package.ebuild.fetch import (
FlatLayout,
MirrorLayoutConfig,
)
+from portage._emirrordist.Config import Config as EmirrordistConfig
from _emerge.EbuildFetcher import EbuildFetcher
from _emerge.Package import Package
@@ -172,6 +174,16 @@ class EbuildFetchTestCase(TestCase):
with open(os.path.join(settings['DISTDIR'], 'layout.conf'), 'wt') as f:
f.write(layout_data)
+ if any(isinstance(layout, ContentHashLayout) for layout in layouts):
+ content_db = os.path.join(playground.eprefix, 'var/db/emirrordist/content.db')
+ os.makedirs(os.path.dirname(content_db), exist_ok=True)
+ try:
+ os.unlink(content_db)
+ except OSError:
+ pass
+ else:
+ content_db = None
+
# Demonstrate that fetch preserves a stale file in DISTDIR when no digests are given.
foo_uri = {'foo': ('{scheme}://{host}:{port}/distfiles/foo'.format(scheme=scheme, host=host, port=server.server_port),)}
foo_path = os.path.join(settings['DISTDIR'], 'foo')
@@ -233,9 +245,13 @@ class EbuildFetchTestCase(TestCase):
os.path.join(self.bindir, 'emirrordist'),
'--distfiles', settings['DISTDIR'],
'--config-root', settings['EPREFIX'],
+ '--delete',
'--repositories-configuration', settings.repositories.config_string(),
'--repo', 'test_repo', '--mirror')
+ if content_db is not None:
+ emirrordist_cmd = emirrordist_cmd + ('--content-db', content_db,)
+
env = settings.environ()
env['PYTHONPATH'] = ':'.join(
filter(None, [PORTAGE_PYM_PATH] + os.environ.get('PYTHONPATH', '').split(':')))
@@ -253,6 +269,19 @@ class EbuildFetchTestCase(TestCase):
with open(os.path.join(settings['DISTDIR'], layouts[0].get_path(k)), 'rb') as f:
self.assertEqual(f.read(), distfiles[k])
+ if content_db is not None:
+ loop.run_until_complete(
+ self._test_content_db(
+ emirrordist_cmd,
+ env,
+ layouts,
+ content_db,
+ distfiles,
+ settings,
+ portdb,
+ )
+ )
+
# Tests only work with one ebuild at a time, so the config
# pool only needs a single config instance.
class config_pool:
@@ -427,6 +456,125 @@ class EbuildFetchTestCase(TestCase):
settings.features.remove('skiprocheck')
settings.features.add('distlocks')
+ async def _test_content_db(
+ self, emirrordist_cmd, env, layouts, content_db, distfiles, settings, portdb
+ ):
+ # Simulate distfile digest change for ContentDB.
+ emdisopts = types.SimpleNamespace(
+ content_db=content_db, distfiles=settings["DISTDIR"]
+ )
+ with EmirrordistConfig(
+ emdisopts, portdb, asyncio.get_event_loop()
+ ) as emdisconf:
+ # Copy revisions from bar to foo.
+ for revision_key in emdisconf.content_db["filename:{}".format("bar")]:
+ emdisconf.content_db.add(
+ DistfileName("foo", digests=dict(revision_key))
+ )
+
+ # Copy revisions from foo to bar.
+ for revision_key in emdisconf.content_db["filename:{}".format("foo")]:
+ emdisconf.content_db.add(
+ DistfileName("bar", digests=dict(revision_key))
+ )
+
+ content_db_state = dict(emdisconf.content_db.items())
+ self.assertEqual(content_db_state, dict(emdisconf.content_db.items()))
+ self.assertEqual(
+ [
+ k[len("filename:") :]
+ for k in content_db_state
+ if k.startswith("filename:")
+ ],
+ ["bar", "foo"],
+ )
+ self.assertEqual(
+ content_db_state["filename:foo"], content_db_state["filename:bar"]
+ )
+ self.assertEqual(len(content_db_state["filename:foo"]), 2)
+
+ for k in distfiles:
+ try:
+ os.unlink(os.path.join(settings["DISTDIR"], k))
+ except OSError:
+ pass
+
+ proc = await asyncio.create_subprocess_exec(*emirrordist_cmd, env=env)
+ self.assertEqual(await proc.wait(), 0)
+
+ for k in distfiles:
+ with open(
+ os.path.join(settings["DISTDIR"], layouts[0].get_path(k)), "rb"
+ ) as f:
+ self.assertEqual(f.read(), distfiles[k])
+
+ with EmirrordistConfig(
+ emdisopts, portdb, asyncio.get_event_loop()
+ ) as emdisconf:
+ self.assertEqual(content_db_state, dict(emdisconf.content_db.items()))
+
+ # Verify that remove works as expected
+ filename = [filename for filename in distfiles if filename == "foo"][0]
+ self.assertTrue(bool(filename.digests))
+ emdisconf.content_db.remove(filename)
+ # foo should still have a content revision corresponding to bar's content.
+ self.assertEqual(
+ [
+ k[len("filename:") :]
+ for k in emdisconf.content_db
+ if k.startswith("filename:")
+ ],
+ ["bar", "foo"],
+ )
+ self.assertEqual(len(emdisconf.content_db["filename:foo"]), 1)
+ self.assertEqual(
+ len(
+ [
+ revision_key
+ for revision_key in emdisconf.content_db["filename:foo"]
+ if not filename.digests_equal(
+ DistfileName(
+ "foo",
+ digests=dict(revision_key),
+ )
+ )
+ ]
+ ),
+ 1,
+ )
+ # bar should still have a content revision corresponding to foo's content.
+ self.assertEqual(len(emdisconf.content_db["filename:bar"]), 2)
+ self.assertEqual(
+ len(
+ [
+ revision_key
+ for revision_key in emdisconf.content_db["filename:bar"]
+ if filename.digests_equal(
+ DistfileName(
+ "bar",
+ digests=dict(revision_key),
+ )
+ )
+ ]
+ ),
+ 1,
+ )
+ # remove the foo which refers to bar's content
+ bar = [filename for filename in distfiles if filename == "bar"][0]
+ foo_remaining = DistfileName("foo", digests=bar.digests)
+ emdisconf.content_db.remove(foo_remaining)
+ self.assertEqual(
+ [
+ k[len("filename:") :]
+ for k in emdisconf.content_db
+ if k.startswith("filename:")
+ ],
+ ["bar"],
+ )
+ self.assertRaises(KeyError, emdisconf.content_db.__getitem__, "filename:foo")
+ # bar should still have a content revision corresponding to foo's content.
+ self.assertEqual(len(emdisconf.content_db["filename:bar"]), 2)
+
def test_flat_layout(self):
self.assertTrue(FlatLayout.verify_args(('flat',)))
self.assertFalse(FlatLayout.verify_args(('flat', 'extraneous-arg')))
diff --git a/man/emirrordist.1 b/man/emirrordist.1
index 45108ef8c..7ad10dfd0 100644
--- a/man/emirrordist.1
+++ b/man/emirrordist.1
@@ -1,4 +1,4 @@
-.TH "EMIRRORDIST" "1" "Dec 2015" "Portage VERSION" "Portage"
+.TH "EMIRRORDIST" "1" "Feb 2021" "Portage VERSION" "Portage"
.SH "NAME"
emirrordist \- a fetch tool for mirroring of package distfiles
.SH SYNOPSIS
@@ -66,6 +66,10 @@ reporting purposes. Opened in append mode.
Log file for scheduled deletions, with tab\-delimited output, for
reporting purposes. Overwritten with each run.
.TP
+\fB\-\-content\-db\fR=\fIFILE\fR
+Database file used to pair content digests with distfiles names
+(required for content\-hash layout).
+.TP
\fB\-\-delete\fR
Enable deletion of unused distfiles.
.TP