diff options
author | Daniel Robbins <drobbins@funtoo.org> | 2017-10-14 17:38:05 -0600 |
---|---|---|
committer | Zac Medico <zmedico@gentoo.org> | 2017-10-14 17:53:40 -0700 |
commit | c5a2a0edc4f4b01b16a274268431fa21f7f678b2 (patch) | |
tree | da18689e723369b115621770fab74928f24d2d55 | |
parent | man: Update URI of GLEP references. (diff) | |
download | portage-c5a2a0ed.tar.gz portage-c5a2a0ed.tar.bz2 portage-c5a2a0ed.zip |
portdbapi: factor out _better_cache class
Better_cache -- now even better :) This version scans individual
categories only on demand. I have addressed concerns about PMS compliance by
enhancing the documentation so that developers are aware of which
assumptions to make (and not to make) when using better_cache.
Closes: https://github.com/gentoo/portage/pull/219
-rw-r--r-- | pym/portage/dbapi/porttree.py | 124 |
1 file changed, 71 insertions, 53 deletions
diff --git a/pym/portage/dbapi/porttree.py b/pym/portage/dbapi/porttree.py index 53edcd18f..f5979d2d0 100644 --- a/pym/portage/dbapi/porttree.py +++ b/pym/portage/dbapi/porttree.py @@ -16,7 +16,7 @@ portage.proxy.lazyimport.lazyimport(globals(), 'portage.package.ebuild.doebuild:doebuild', 'portage.util:ensure_dirs,shlex_split,writemsg,writemsg_level', 'portage.util.listdir:listdir', - 'portage.versions:best,catpkgsplit,_pkgsplit@pkgsplit,ver_regexp,_pkg_str', + 'portage.versions:best,catsplit,catpkgsplit,_pkgsplit@pkgsplit,ver_regexp,_pkg_str', ) from portage.cache import volatile @@ -103,6 +103,68 @@ class _dummy_list(list): except ValueError: pass + +class _better_cache(object): + + """ + The purpose of better_cache is to locate catpkgs in repositories using ``os.listdir()`` as much as possible, which + is less expensive IO-wise than exhaustively doing a stat on each repo for a particular catpkg. better_cache stores a + list of repos in which particular catpkgs appear. Various dbapi methods use better_cache to locate repositories of + interest related to particular catpkg rather than performing an exhaustive scan of all repos/overlays. + + Better_cache.items data may look like this:: + + { "sys-apps/portage" : [ repo1, repo2 ] } + + Without better_cache, Portage will get slower and slower (due to excessive IO) as more overlays are added. + + Also note that it is OK if this cache has some 'false positive' catpkgs in it. We use it to search for specific + catpkgs listed in ebuilds. The likelihood of a false positive catpkg in our cache causing a problem is extremely + low, because the user of our cache is passing us a catpkg that came from somewhere and has already undergone some + validation, and even then will further interrogate the short-list of repos we return to gather more information + on the catpkg. + + Thus, the code below is optimized for speed rather than painstaking correctness. 
I have added a note to + ``dbapi.getRepositories()`` to ensure that developers are aware of this just in case. + + The better_cache has been redesigned to perform on-demand scans -- it will only scan a category at a time, as + needed. This should further optimize IO performance by not scanning category directories that are not needed by + Portage. + """ + + def __init__(self, repositories): + self._items = collections.defaultdict(list) + self._scanned_cats = set() + + # ordered list of all portree locations we'll scan: + self._repo_list = [repo for repo in reversed(list(repositories)) + if repo.location is not None] + + def __getitem__(self, catpkg): + result = self._items.get(catpkg) + if result is not None: + return result + + cat, pkg = catsplit(catpkg) + if cat not in self._scanned_cats: + self._scan_cat(cat) + return self._items[catpkg] + + def _scan_cat(self, cat): + for repo in self._repo_list: + cat_dir = repo.location + "/" + cat + try: + pkg_list = os.listdir(cat_dir) + except OSError as e: + if e.errno not in (errno.ENOTDIR, errno.ENOENT, errno.ESTALE): + raise + continue + for p in pkg_list: + if os.path.isdir(cat_dir + "/" + p): + self._items[cat + "/" + p].append(repo) + self._scanned_cats.add(cat) + + class portdbapi(dbapi): """this tree will scan a portage directory located at root (passed to init)""" portdbapi_instances = _dummy_list() @@ -346,11 +408,14 @@ class portdbapi(dbapi): return None def getRepositories(self, catpkg=None): + """ With catpkg=None, this will return a complete list of repositories in this dbapi. With catpkg set to a value, this method will return a short-list of repositories that contain this catpkg. Use this second approach if possible, to avoid exhaustively searching all repos for a particular catpkg. It's faster for this method to - find the catpkg than for you do it yourself. + find the catpkg than for you do it yourself. 
When specifying catpkg, you should have reasonable assurance that + the category is valid and PMS-compliant as the caching mechanism we use does not perform validation checks for + categories. This function is required for GLEP 42 compliance. @@ -358,7 +423,8 @@ class portdbapi(dbapi): catpkg; if None, return a list of all Repositories that contain a particular catpkg. @return: a list of repositories. """ - if catpkg is not None and self._better_cache is not None and catpkg in self._better_cache: + + if catpkg is not None and self._better_cache is not None: return [repo.name for repo in self._better_cache[catpkg]] return self._ordered_repo_name_list @@ -796,12 +862,7 @@ class portdbapi(dbapi): elif self._better_cache is None: mytrees = self.porttrees else: - try: - repos = self._better_cache[mycp] - except KeyError: - mytrees = [] - else: - mytrees = [repo.location for repo in repos] + mytrees = [repo.location for repo in self._better_cache[mycp]] for oroot in mytrees: try: file_list = os.listdir(os.path.join(oroot, mycp)) @@ -850,50 +911,7 @@ class portdbapi(dbapi): "minimum-all-ignore-profile", "minimum-visible"): self.xcache[x]={} self.frozen=1 - self._better_cache = better_cache = collections.defaultdict(list) - - # The purpose of self._better_cache is to perform an initial quick scan of all repositories - # using os.listdir(), which is less expensive IO-wise than exhaustively doing a stat on each - # repo. self._better_cache stores a list of repos in which particular catpkgs appear. - # - # For example, better_cache data may look like this: - # - # { "sys-apps/portage" : [ repo1, repo2 ] } - # - # Without this tweak, Portage will get slower and slower as more overlays are added. - # - # Also note that it is OK if this cache has some 'false positive' catpkgs in it. We use it - # to search for specific catpkgs listed in ebuilds. The likelihood of a false positive catpkg - # in our cache causing a problem is extremely low. 
Thus, the code below is optimized for - # speed rather than painstaking correctness. - - valid_categories = self.settings.categories - for repo_loc in reversed(self.porttrees): - repo = self.repositories.get_repo_for_location(repo_loc) - try: - categories = os.listdir(repo_loc) - except OSError as e: - if e.errno not in (errno.ENOTDIR, errno.ENOENT, errno.ESTALE): - raise - continue - - for cat in categories: - if cat not in valid_categories: - continue - cat_dir = repo_loc + "/" + cat - try: - pkg_list = os.listdir(cat_dir) - except OSError as e: - if e.errno != errno.ENOTDIR: - raise - continue - - for p in pkg_list: - catpkg_dir = cat_dir + "/" + p - if not os.path.isdir(catpkg_dir): - continue - catpkg = cat + "/" + p - better_cache[catpkg].append(repo) + self._better_cache = _better_cache(self.repositories) def melt(self): self.xcache = {} |