gcc-changelog: workaround for utf8 filenames

author Martin Liska <mliska@suse.cz>

Wed, 6 Jan 2021 07:11:57 +0000 (08:11 +0100)

committer Martin Liska <mliska@suse.cz>

Wed, 6 Jan 2021 07:26:10 +0000 (08:26 +0100)
author Martin Liska <mliska@suse.cz>
Wed, 6 Jan 2021 07:11:57 +0000 (08:11 +0100)
committer Martin Liska <mliska@suse.cz>
Wed, 6 Jan 2021 07:26:10 +0000 (08:26 +0100)
diff --git a/contrib/gcc-changelog/git_commit.py b/contrib/gcc-changelog/git_commit.py

index d2e5dbe294a3d0d325f86290f869095685b53e1b..ee1973371be47315dccdb1dbdd177482f35e3c44 100755 (executable)
--- a/contrib/gcc-changelog/git_commit.py
+++ b/contrib/gcc-changelog/git_commit.py
@@ -174,6 +174,24 @@ REVIEW_PREFIXES = ('reviewed-by: ', 'reviewed-on: ', 'signed-off-by: ',
  DATE_FORMAT = '%Y-%m-%d'
  
  
+def decode_path(path):
+    # Undo git's core.quotepath quoting of PATH and return the result.
+    # When core.quotepath is true (default value), utf8 chars are encoded
+    # as octal-escaped bytes inside double quotes, like:
+    # "b/ko\304\215ka.txt"
+    # The upstream bug is fixed:
+    # https://github.com/gitpython-developers/GitPython/issues/1099
+    # but we still need a workaround for older versions of the library.
+    # Please take a look at the explanation of the transformation:
+    # https://stackoverflow.com/questions/990169/how-do-convert-unicode-escape-sequences-to-unicode-characters-in-a-python-string
+    # Only one quote per side is removed: strip('"') would also eat an
+    # escaped trailing quote (\") and leave a dangling backslash.
+    if len(path) >= 2 and path.startswith('"') and path.endswith('"'):
+        return (path[1:-1].encode('utf8').decode('unicode-escape')
+                .encode('latin-1').decode('utf8'))
+    return path
+
+
  class Error:
      def __init__(self, message, line=None):
          self.message = message
@@ -303,14 +321,6 @@ class GitCommit:
                                       'separately from normal commits'))
              return
  
-        # check for an encoded utf-8 filename
-        hint = 'git config --global core.quotepath false'
-        for modified, _ in self.info.modified_files:
-            if modified.startswith('"') or modified.endswith('"'):
-                self.errors.append(Error('Quoted UTF8 filename, please set: '
-                                         f'"{hint}"', modified))
-                return
-
          all_are_ignored = (len(project_files) + len(ignored_files)
                             == len(self.info.modified_files))
          self.parse_lines(all_are_ignored)
diff --git a/contrib/gcc-changelog/git_email.py b/contrib/gcc-changelog/git_email.py

index 5b53ca4a6a9c89923b37e0a167ad3139a580da37..00ad00458f407758b6b42bc9e642974f6e8e997e 100755 (executable)
--- a/contrib/gcc-changelog/git_email.py
+++ b/contrib/gcc-changelog/git_email.py
@@ -22,7 +22,7 @@ from itertools import takewhile
  
  from dateutil.parser import parse
  
-from git_commit import GitCommit, GitInfo
+from git_commit import GitCommit, GitInfo, decode_path
  
  from unidiff import PatchSet, PatchedFile
  
@@ -52,8 +52,8 @@ class GitEmail(GitCommit):
          modified_files = []
          for f in diff:
              # Strip "a/" and "b/" prefixes
-            source = f.source_file[2:]
-            target = f.target_file[2:]
+            source = decode_path(f.source_file)[2:]
+            target = decode_path(f.target_file)[2:]
  
              if f.is_added_file:
                  t = 'A'
diff --git a/contrib/gcc-changelog/git_repository.py b/contrib/gcc-changelog/git_repository.py

index 8edcff91ad6720bf33e85a86102ac9b6ed6d2357..a0e293d756d8b82d9229ae15b2f25066b8631d4e 100755 (executable)
--- a/contrib/gcc-changelog/git_repository.py
+++ b/contrib/gcc-changelog/git_repository.py
@@ -26,7 +26,7 @@ except ImportError:
      print('  Debian, Ubuntu: python3-git')
      exit(1)
  
-from git_commit import GitCommit, GitInfo
+from git_commit import GitCommit, GitInfo, decode_path
  
  
  def parse_git_revisions(repo_path, revisions, strict=True):
@@ -51,11 +51,11 @@ def parse_git_revisions(repo_path, revisions, strict=True):
                      # Consider that renamed files are two operations:
                      # the deletion of the original name
                      # and the addition of the new one.
-                    modified_files.append((file.a_path, 'D'))
+                    modified_files.append((decode_path(file.a_path), 'D'))
                      t = 'A'
                  else:
                      t = 'M'
-                modified_files.append((file.b_path, t))
+                modified_files.append((decode_path(file.b_path), t))
  
              date = datetime.utcfromtimestamp(c.committed_date)
              author = '%s  <%s>' % (c.author.name, c.author.email)
diff --git a/contrib/gcc-changelog/test_email.py b/contrib/gcc-changelog/test_email.py

index 2053531452c9d0c3fbfce89a9cafbc4efef700ec..5db56caef9e16a97cf7bcd4ed63f0e94dc19dd63 100755 (executable)
--- a/contrib/gcc-changelog/test_email.py
+++ b/contrib/gcc-changelog/test_email.py
@@ -402,4 +402,5 @@ class TestGccChangelog(unittest.TestCase):
  
      def test_bad_unicode_chars_in_filename(self):
          email = self.from_patch_glob('0001-Add-horse2.patch')
-        assert email.errors[0].message.startswith('Quoted UTF8 filename')
+        assert not email.errors
+        assert email.changelog_entries[0].files == ['koníček.txt']
author	Martin Liska <mliska@suse.cz>
	Wed, 6 Jan 2021 07:11:57 +0000 (08:11 +0100)
committer	Martin Liska <mliska@suse.cz>
	Wed, 6 Jan 2021 07:26:10 +0000 (08:26 +0100)
contrib/gcc-changelog/git_commit.py		patch \| blob \| history
contrib/gcc-changelog/git_email.py		patch \| blob \| history
contrib/gcc-changelog/git_repository.py		patch \| blob \| history
contrib/gcc-changelog/test_email.py		patch \| blob \| history