From: Martin Liska
Date: Wed, 6 Jan 2021 07:11:57 +0000 (+0100)
Subject: gcc-changelog: workaround for utf8 filenames
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=57706dd7e001d8302b596521217827855324e748;p=gcc.git

gcc-changelog: workaround for utf8 filenames

contrib/ChangeLog:

	* gcc-changelog/git_commit.py: Add decode_path function.
	* gcc-changelog/git_email.py: Use it in order to solve
	utf8 encoding filename issues.
	* gcc-changelog/git_repository.py: Likewise.
	* gcc-changelog/test_email.py: Test it.
---

diff --git a/contrib/gcc-changelog/git_commit.py b/contrib/gcc-changelog/git_commit.py
index d2e5dbe294a..ee1973371be 100755
--- a/contrib/gcc-changelog/git_commit.py
+++ b/contrib/gcc-changelog/git_commit.py
@@ -174,6 +174,24 @@ REVIEW_PREFIXES = ('reviewed-by: ', 'reviewed-on: ', 'signed-off-by: ',
 DATE_FORMAT = '%Y-%m-%d'
 
 
+def decode_path(path):
+    # When core.quotepath is true (default value), utf8 chars are encoded like:
+    # "b/ko\304\215ka.txt"
+    #
+    # The upstream bug is fixed:
+    # https://github.com/gitpython-developers/GitPython/issues/1099
+    #
+    # but we still need a workaround for older versions of the library.
+    # Please take a look at the explanation of the transformation:
+    # https://stackoverflow.com/questions/990169/how-do-convert-unicode-escape-sequences-to-unicode-characters-in-a-python-string
+
+    if path.startswith('"') and path.endswith('"'):
+        return (path.strip('"').encode('utf8').decode('unicode-escape')
+                .encode('latin-1').decode('utf8'))
+    else:
+        return path
+
+
 class Error:
     def __init__(self, message, line=None):
         self.message = message
@@ -303,14 +321,6 @@ class GitCommit:
                                      'separately from normal commits'))
             return
 
-        # check for an encoded utf-8 filename
-        hint = 'git config --global core.quotepath false'
-        for modified, _ in self.info.modified_files:
-            if modified.startswith('"') or modified.endswith('"'):
-                self.errors.append(Error('Quoted UTF8 filename, please set: '
-                                         f'"{hint}"', modified))
-                return
-
         all_are_ignored = (len(project_files) + len(ignored_files)
                            == len(self.info.modified_files))
         self.parse_lines(all_are_ignored)
diff --git a/contrib/gcc-changelog/git_email.py b/contrib/gcc-changelog/git_email.py
index 5b53ca4a6a9..00ad00458f4 100755
--- a/contrib/gcc-changelog/git_email.py
+++ b/contrib/gcc-changelog/git_email.py
@@ -22,7 +22,7 @@ from itertools import takewhile
 
 from dateutil.parser import parse
 
-from git_commit import GitCommit, GitInfo
+from git_commit import GitCommit, GitInfo, decode_path
 
 from unidiff import PatchSet, PatchedFile
 
@@ -52,8 +52,8 @@ class GitEmail(GitCommit):
         modified_files = []
         for f in diff:
             # Strip "a/" and "b/" prefixes
-            source = f.source_file[2:]
-            target = f.target_file[2:]
+            source = decode_path(f.source_file)[2:]
+            target = decode_path(f.target_file)[2:]
 
             if f.is_added_file:
                 t = 'A'
diff --git a/contrib/gcc-changelog/git_repository.py b/contrib/gcc-changelog/git_repository.py
index 8edcff91ad6..a0e293d756d 100755
--- a/contrib/gcc-changelog/git_repository.py
+++ b/contrib/gcc-changelog/git_repository.py
@@ -26,7 +26,7 @@ except ImportError:
     print(' Debian, Ubuntu: python3-git')
     exit(1)
 
-from git_commit import GitCommit, GitInfo
+from git_commit import GitCommit, GitInfo, decode_path
 
 
 def parse_git_revisions(repo_path, revisions, strict=True):
@@ -51,11 +51,11 @@ def parse_git_revisions(repo_path, revisions, strict=True):
                     # Consider that renamed files are two operations:
                     # the deletion of the original name
                     # and the addition of the new one.
-                    modified_files.append((file.a_path, 'D'))
+                    modified_files.append((decode_path(file.a_path), 'D'))
                     t = 'A'
                 else:
                     t = 'M'
-                modified_files.append((file.b_path, t))
+                modified_files.append((decode_path(file.b_path), t))
 
             date = datetime.utcfromtimestamp(c.committed_date)
             author = '%s <%s>' % (c.author.name, c.author.email)
diff --git a/contrib/gcc-changelog/test_email.py b/contrib/gcc-changelog/test_email.py
index 2053531452c..5db56caef9e 100755
--- a/contrib/gcc-changelog/test_email.py
+++ b/contrib/gcc-changelog/test_email.py
@@ -402,4 +402,5 @@ class TestGccChangelog(unittest.TestCase):
 
     def test_bad_unicode_chars_in_filename(self):
         email = self.from_patch_glob('0001-Add-horse2.patch')
-        assert email.errors[0].message.startswith('Quoted UTF8 filename')
+        assert not email.errors
+        assert email.changelog_entries[0].files == ['koníček.txt']