From c788a835067c5ffe8859f38078b390f06a223f5d Mon Sep 17 00:00:00 2001 From: Zac Medico Date: Sat, 3 Oct 2015 15:21:01 -0700 Subject: dohtml: handle unicode (bug 561846) Decode all arguments and listdir results as UTF-8, and return unsuccessfully if anything fails to decode as UTF-8. Use portage os and shutil wrappers to encode file names as UTF-8 regardless of locale. X-Gentoo-Bug: 561846 X-Gentoo-Bug-URL: https://bugs.gentoo.org/show_bug.cgi?id=561846 Acked-by: Brian Dolbec --- bin/dohtml.py | 47 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/bin/dohtml.py b/bin/dohtml.py index 5359f5e89..dfcaa6026 100755 --- a/bin/dohtml.py +++ b/bin/dohtml.py @@ -28,13 +28,13 @@ # - will do as 'dohtml -r', but ignore directories named CVS, SCCS, RCS # -from __future__ import print_function +from __future__ import print_function, unicode_literals -import os -import shutil +import os as _os import sys -from portage.util import normalize_path +from portage import _unicode_encode, _unicode_decode, os, shutil +from portage.util import normalize_path, writemsg # Change back to original cwd _after_ all imports (bug #469338). os.chdir(os.environ["__PORTAGE_HELPER_CWD"]) @@ -92,7 +92,13 @@ def install(basename, dirname, options, prefix=""): skipped_files.append(fullpath) elif options.recurse and os.path.isdir(fullpath) and \ basename not in options.disallowed_dirs: - for i in os.listdir(fullpath): + for i in _os.listdir(_unicode_encode(fullpath)): + try: + i = _unicode_decode(i, errors='strict') + except UnicodeDecodeError: + writemsg('dohtml: argument is not encoded as UTF-8: %s\n' % + _unicode_decode(i), noiselevel=-1) + sys.exit(1) pfx = basename if prefix: pfx = os.path.join(prefix, pfx) @@ -155,12 +161,29 @@ def print_help(): print() def parse_args(): + argv = sys.argv[:] + + if sys.hexversion >= 0x3000000: + # We can't trust that the filesystem encoding (locale dependent) + # correctly matches the arguments, so use surrogateescape to + # pass through the original argv bytes for Python 3. + fs_encoding = sys.getfilesystemencoding() + argv = [x.encode(fs_encoding, 'surrogateescape') for x in argv] + + for x, arg in enumerate(argv): + try: + argv[x] = _unicode_decode(arg, errors='strict') + except UnicodeDecodeError: + writemsg('dohtml: argument is not encoded as UTF-8: %s\n' % + _unicode_decode(arg), noiselevel=-1) + sys.exit(1) + options = OptionsClass() args = [] x = 1 - while x < len(sys.argv): - arg = sys.argv[x] + while x < len(argv): + arg = argv[x] if arg in ["-h","-r","-V"]: if arg == "-h": print_help() @@ -169,17 +192,17 @@ def parse_args(): options.recurse = True elif arg == "-V": options.verbose = True - elif sys.argv[x] in ["-A","-a","-f","-x","-p"]: + elif argv[x] in ["-A","-a","-f","-x","-p"]: x += 1 - if x == len(sys.argv): + if x == len(argv): print_help() sys.exit(0) elif arg == "-p": - options.doc_prefix = sys.argv[x] + options.doc_prefix = argv[x] if options.doc_prefix: options.doc_prefix = normalize_path(options.doc_prefix) else: - values = sys.argv[x].split(",") + values = argv[x].split(",") if arg == "-A": options.allowed_exts.extend(values) elif arg == "-a": @@ -189,7 +212,7 @@ def parse_args(): elif arg == "-x": options.disallowed_dirs = values else: - args.append(sys.argv[x]) + args.append(argv[x]) x += 1 return (options, args) -- cgit v1.2.3-65-gdbad