From 51e8dbe1fbe7d8955589703140ca5eba7b4f1bd7 Mon Sep 17 00:00:00 2001
From: Andrew Burgess <aburgess@redhat.com>
Date: Mon, 16 May 2022 19:26:54 +0100
Subject: [PATCH] gdb/python: improve formatting of help text for user defined
 commands

Consider this command defined in Python (in the file test-cmd.py):

  class test_cmd (gdb.Command):
    """
    This is the first line.
      Indented second line.
    This is the third line.
    """

    def __init__ (self):
      super ().__init__ ("test-cmd", gdb.COMMAND_OBSCURE)

    def invoke (self, arg, from_tty):
      print ("In test-cmd")

  test_cmd()

Now, within a GDB session:

  (gdb) source test-cmd.py
  (gdb) help test-cmd

    This is the first line.
      Indented second line.
    This is the third line.

  (gdb)

I think there's three things wrong here:

  1. The leading blank line,
  2. The trailing blank line, and
  3. Every line is indented from the left edge slightly.

The problem of course, is that GDB is using the Python doc string
verbatim as its help text.  While the user has formatted the help text
so that it appears clear within the .py file, this means that the text
appear less well formatted when displayed in the "help" output.

The same problem can be observed for gdb.Parameter objects in their
set/show output.

In this commit I aim to improve the "help" output for commands and
parameters.

To do this I have added gdbpy_fix_doc_string_indentation, a new
function that rewrites the doc string text following the following
rules:

  1. Leading blank lines are removed,
  2. Trailing blank lines are removed, and
  3. Leading whitespace is removed in a "smart" way such that the
  relative indentation of lines is retained.

With this commit in place the above example now looks like this:

  (gdb) source ~/tmp/test-cmd.py
  (gdb) help test-cmd
  This is the first line.
    Indented second line.
  This is the third line.
  (gdb)

Which I think is much neater.  Notice that the indentation of the
second line is retained.  Any blank lines within the help text (not
leading or trailing) will be retained.

I've added a NEWS entry to note that there has been a change in
behaviour, but I didn't update the manual.  The existing manual is
suitably vague about how the doc string is used, so I think the new
behaviour is covered just as well by the existing text.
---
 gdb/NEWS                                     |   7 +
 gdb/python/py-cmd.c                          |   1 +
 gdb/python/py-param.c                        |   2 +
 gdb/python/py-utils.c                        | 194 +++++++++++++
 gdb/python/python-internal.h                 |  21 ++
 gdb/testsuite/gdb.python/py-doc-reformat.exp | 282 +++++++++++++++++++
 6 files changed, 507 insertions(+)
 create mode 100644 gdb/testsuite/gdb.python/py-doc-reformat.exp

diff --git a/gdb/NEWS b/gdb/NEWS
index 52ffdc4c83a..dac6dabfa42 100644
--- a/gdb/NEWS
+++ b/gdb/NEWS
@@ -27,6 +27,13 @@
   emit to indicate where a breakpoint should be placed to break in a function
   past its prologue.
 
+* Python API
+
+  ** GDB will now reformat the doc string for gdb.Command and
+     gdb.Parameter sub-classes to remove unnecessary leading
+     whitespace from each line before using the string as the help
+     output.
+
 * New commands
 
 maintenance set ignore-prologue-end-flag on|off
diff --git a/gdb/python/py-cmd.c b/gdb/python/py-cmd.c
index bc48c2ea9f9..f80318374d3 100644
--- a/gdb/python/py-cmd.c
+++ b/gdb/python/py-cmd.c
@@ -493,6 +493,7 @@ cmdpy_init (PyObject *self, PyObject *args, PyObject *kw)
 	  docstring = python_string_to_host_string (ds_obj.get ());
 	  if (docstring == nullptr)
 	    return -1;
+	  docstring = gdbpy_fix_doc_string_indentation (std::move (docstring));
 	}
     }
   if (docstring == nullptr)
diff --git a/gdb/python/py-param.c b/gdb/python/py-param.c
index cac9bd209a4..5d509ba4658 100644
--- a/gdb/python/py-param.c
+++ b/gdb/python/py-param.c
@@ -385,6 +385,8 @@ get_doc_string (PyObject *object, enum doc_string_type doc_type,
 	  result = python_string_to_host_string (ds_obj.get ());
 	  if (result == NULL)
 	    gdbpy_print_stack ();
+	  else if (doc_type == doc_string_description)
+	    result = gdbpy_fix_doc_string_indentation (std::move (result));
 	}
     }
 
diff --git a/gdb/python/py-utils.c b/gdb/python/py-utils.c
index 63eb4e8078c..1bd7b477ecb 100644
--- a/gdb/python/py-utils.c
+++ b/gdb/python/py-utils.c
@@ -400,3 +400,197 @@ gdbpy_handle_exception ()
   else
     error ("%s", msg.get ());
 }
+
+/* See python-internal.h.  */
+
+gdb::unique_xmalloc_ptr<char>
+gdbpy_fix_doc_string_indentation (gdb::unique_xmalloc_ptr<char> doc)
+{
+  /* A structure used to track the white-space information on each line of
+     DOC.  */
+  struct line_whitespace
+  {
+    /* Constructor.  OFFSET is the offset from the start of DOC, WS_COUNT
+       is the number of whitespace characters starting at OFFSET.  */
+    line_whitespace (size_t offset, int ws_count)
+      : m_offset (offset),
+	m_ws_count (ws_count)
+    { /* Nothing.  */ }
+
+    /* The offset from the start of DOC.  */
+    size_t offset () const
+    { return m_offset; }
+
+    /* The number of white-space characters at the start of this line.  */
+    int ws () const
+    { return m_ws_count; }
+
+  private:
+    /* The offset from the start of DOC to the first character of this
+       line.  */
+    size_t m_offset;
+
+    /* White space count on this line, the first character of this
+       whitespace is at OFFSET.  */
+    int m_ws_count;
+  };
+
+  /* Count the number of white-space character starting at TXT.  We
+     currently only count true single space characters, things like tabs,
+     newlines, etc are not counted.  */
+  auto count_whitespace = [] (const char *txt) -> int
+  {
+    int count = 0;
+
+    while (*txt == ' ')
+      {
+	++txt;
+	++count;
+      }
+
+    return count;
+  };
+
+  /* In MIN_WHITESPACE we track the smallest number of whitespace
+     characters seen at the start of a line (that has actual content), this
+     is the number of characters that we can delete off all lines without
+     altering the relative indentation of all lines in DOC.
+
+     The first line often has no indentation, but instead starts immediates
+     after the 3-quotes marker within the Python doc string, so, if the
+     first line has zero white-space then we just ignore it, and don't set
+     MIN_WHITESPACE to zero.
+
+     Lines without any content should (ideally) have no white-space at
+     all, but if they do then they might have an artificially low number
+     (user left a single stray space at the start of an otherwise blank
+     line), we don't consider lines without content when updating the
+     MIN_WHITESPACE value.  */
+  gdb::optional<int> min_whitespace;
+
+  /* The index into WS_INFO at which the processing of DOC can be
+     considered "all done", that is, after this point there are no further
+     lines with useful content and we should just stop.  */
+  gdb::optional<size_t> all_done_idx;
+
+  /* White-space information for each line in DOC.  */
+  std::vector<line_whitespace> ws_info;
+
+  /* Now look through DOC and collect the required information.  */
+  const char *tmp = doc.get ();
+  while (*tmp != '\0')
+    {
+      /* Add an entry for the offset to the start of this line, and how
+	 much white-space there is at the start of this line.  */
+      size_t offset = tmp - doc.get ();
+      int ws_count = count_whitespace (tmp);
+      ws_info.emplace_back (offset, ws_count);
+
+      /* Skip over the white-space.  */
+      tmp += ws_count;
+
+      /* Remember where the content of this line starts, and skip forward
+	 to either the end of this line (newline) or the end of the DOC
+	 string (null character), whichever comes first.  */
+      const char *content_start = tmp;
+      while (*tmp != '\0' && *tmp != '\n')
+	++tmp;
+
+      /* If this is not the first line, and if this line has some content,
+	 then update MIN_WHITESPACE, this reflects the smallest number of
+	 whitespace characters we can delete from all lines without
+	 impacting the relative indentation of all the lines of DOC.  */
+      if (offset > 0 && tmp > content_start)
+	{
+	  if (!min_whitespace.has_value ())
+	    min_whitespace = ws_count;
+	  else
+	    min_whitespace = std::min (*min_whitespace, ws_count);
+	}
+
+      /* Each time we encounter a line that has some content we update
+	 ALL_DONE_IDX to be the index of the next line.  If the last lines
+	 of DOC don't contain any content then ALL_DONE_IDX will be left
+	 pointing at an earlier line.  When we rewrite DOC, when we reach
+	 ALL_DONE_IDX then we can stop, the allows us to trim any blank
+	 lines from the end of DOC.  */
+      if (tmp > content_start)
+	all_done_idx = ws_info.size ();
+
+      /* If we reached a newline then skip forward to the start of the next
+	 line.  The other possibility at this point is that we're at the
+	 very end of the DOC string (null terminator).  */
+      if (*tmp == '\n')
+	++tmp;
+    }
+
+  /* We found no lines with content, fail safe by just returning the
+     original documentation string.  */
+  if (!all_done_idx.has_value () || !min_whitespace.has_value ())
+    return doc;
+
+  /* Setup DST and SRC, both pointing into the DOC string.  We're going to
+     rewrite DOC in-place, as we only ever make DOC shorter (by removing
+     white-space), thus we know this will not overflow.  */
+  char *dst = doc.get ();
+  char *src = doc.get ();
+
+  /* Array indices used with DST, SRC, and WS_INFO respectively.  */
+  size_t dst_offset = 0;
+  size_t src_offset = 0;
+  size_t ws_info_offset = 0;
+
+  /* Now, walk over the source string, this is the original DOC.  */
+  while (src[src_offset] != '\0')
+    {
+      /* If we are at the start of the next line (in WS_INFO), then we may
+	 need to skip some white-space characters.  */
+      if (src_offset == ws_info[ws_info_offset].offset ())
+	{
+	  /* If a line has leading white-space then we need to skip over
+	     some number of characters now.  */
+	  if (ws_info[ws_info_offset].ws () > 0)
+	    {
+	      /* If the line is entirely white-space then we skip all of
+		 the white-space, the next character to copy will be the
+		 newline or null character.  Otherwise, we skip the just
+		 some portion of the leading white-space.  */
+	      if (src[src_offset + ws_info[ws_info_offset].ws ()] == '\n'
+		  || src[src_offset + ws_info[ws_info_offset].ws ()] == '\0')
+		src_offset += ws_info[ws_info_offset].ws ();
+	      else
+		src_offset += std::min (*min_whitespace,
+					ws_info[ws_info_offset].ws ());
+
+	      /* If we skipped white-space, and are now at the end of the
+		 input, then we're done.  */
+	      if (src[src_offset] == '\0')
+		break;
+	    }
+	  if (ws_info_offset < (ws_info.size () - 1))
+	    ++ws_info_offset;
+	  if (ws_info_offset > *all_done_idx)
+	    break;
+	}
+
+      /* Don't copy a newline to the start of the DST string, this would
+	 result in a leading blank line.  But in all other cases, copy the
+	 next character into the destination string.  */
+      if ((dst_offset > 0 || src[src_offset] != '\n'))
+	{
+	  dst[dst_offset] = src[src_offset];
+	  ++dst_offset;
+	}
+
+      /* Move to the next source character.  */
+      ++src_offset;
+    }
+
+  /* Remove the trailing newline character(s), and ensure we have a null
+     terminator in place.  */
+  while (dst_offset > 1 && dst[dst_offset - 1] == '\n')
+    --dst_offset;
+  dst[dst_offset] = '\0';
+
+  return doc;
+}
diff --git a/gdb/python/python-internal.h b/gdb/python/python-internal.h
index d947b96033b..217bc15bb28 100644
--- a/gdb/python/python-internal.h
+++ b/gdb/python/python-internal.h
@@ -822,4 +822,25 @@ extern bool gdbpy_is_architecture (PyObject *obj);
 
 extern bool gdbpy_is_progspace (PyObject *obj);
 
+/* Take DOC, the documentation string for a GDB command defined in Python,
+   and return an (possibly) modified version of that same string.
+
+   When a command is defined in Python, the documentation string will
+   usually be indented based on the indentation of the surrounding Python
+   code.  However, the documentation string is a literal string, all the
+   white-space added for indentation is included within the documentation
+   string.
+
+   This indentation is then included in the help text that GDB displays,
+   which looks odd out of the context of the original Python source code.
+
+   This function analyses DOC and tries to figure out what white-space
+   within DOC was added as part of the indentation, and then removes that
+   white-space from the copy that is returned.
+
+   If the analysis of DOC fails then DOC will be returned unmodified.  */
+
+extern gdb::unique_xmalloc_ptr<char> gdbpy_fix_doc_string_indentation
+  (gdb::unique_xmalloc_ptr<char> doc);
+
 #endif /* PYTHON_PYTHON_INTERNAL_H */
diff --git a/gdb/testsuite/gdb.python/py-doc-reformat.exp b/gdb/testsuite/gdb.python/py-doc-reformat.exp
new file mode 100644
index 00000000000..aae3939bd07
--- /dev/null
+++ b/gdb/testsuite/gdb.python/py-doc-reformat.exp
@@ -0,0 +1,282 @@
+# Copyright 2022 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Test how GDB reformats the documentation string of commands
+# implemented in Python for use as the help text of those same
+# commands.
+
+load_lib gdb-python.exp
+
+# A global counter used to number the tests.
+set idx 0
+
+# Run a test, create both a command and a parameter with INPUT_DOCS as
+# the Python documentation string, load the command and parameter into
+# GDB, and then ask for the help text of the command, and the show and
+# set help for the parameter.  The output from GDB should match
+# EXPECTED_OUTPUT in each case.
+#
+# The INPUT_DOCS must start with 2 space characters, followed by the
+# 3-quote characters to start the doc string, and must end with the
+# final 3-quote characters.  If this is not true then an error is
+# thrown, and this function immediately returns.
+proc test { input_docs expected_output } {
+    global idx
+
+    set local_idx $idx
+    incr idx
+
+    with_test_prefix "test ${local_idx}" {
+
+	# The tests in this script don't have descriptive names,
+	# rather they use the global IDX to generate a unique name for
+	# each test.  To make it easier to track down a failing test,
+	# we print out the test name and the input string.
+	verbose -log "Start of: test_cmd ${local_idx}"
+	verbose -log "Input:\n${input_docs}"
+
+	if { [string range $input_docs 0 4] != "  \"\"\"" } {
+	    perror "Input string does not start with the required pattern"
+	    return
+	}
+
+	if { [string range $input_docs end-2 end] != "\"\"\"" } {
+	    perror "Input string does not end with the required pattern"
+	    return
+	}
+
+	set python_filename [standard_output_file "${::gdb_test_file_name}-${local_idx}.py"]
+
+	set fd [open $python_filename w]
+
+	puts $fd "class test_cmd (gdb.Command):"
+	puts $fd $input_docs
+	puts $fd ""
+	puts $fd "  def __init__ (self):"
+	puts $fd "    super ().__init__ (\"test-cmd\", gdb.COMMAND_OBSCURE)"
+	puts $fd ""
+	puts $fd "  def invoke (self, arg, from_tty):"
+	puts $fd "    print (\"In test-cmd\")"
+	puts $fd ""
+
+	puts $fd "class test_param (gdb.Parameter):"
+	puts $fd $input_docs
+	puts $fd ""
+	puts $fd "  show_doc = \"This is the show doc line\""
+	puts $fd "  set_doc = \"This is the set doc line\""
+	puts $fd ""
+	puts $fd "  def __init__ (self):"
+	puts $fd "    super ().__init__ (\"test-param\", gdb.COMMAND_OBSCURE, gdb.PARAM_BOOLEAN)"
+	puts $fd ""
+	puts $fd "  def get_show_string (self, value):"
+	puts $fd "    return \"The state is '\" + value + \"'\""
+
+	puts $fd ""
+	puts $fd "test_param ()"
+	puts $fd "test_cmd ()"
+
+	puts $fd ""
+	puts $fd "print(\"Loaded OK.\")"
+
+	close $fd
+
+	set remote_python_file \
+	    [gdb_remote_download host $python_filename]
+
+	clean_restart
+
+	gdb_test "source ${remote_python_file}" \
+	    "Loaded OK\\." \
+	    "source python command file"
+
+	# Use gdb_test_multiple here as plain gdb_test will allow
+	# excess blank lines between the expected_output pattern and
+	# the gdb_prompt - a core part of this test is that we are
+	# checking that such blank lines are removed - so we can't use
+	# gdb_test here.
+	gdb_test_multiple "help test-cmd" "" {
+	    -re "^help test-cmd\r\n" {
+		exp_continue
+	    }
+	    -re "^$expected_output\r\n$::gdb_prompt $" {
+		pass $gdb_test_name
+	    }
+	}
+
+	gdb_test_multiple "help set test-param" "" {
+	    -re "^help set test-param\r\n" {
+		exp_continue
+	    }
+	    -re "^This is the set doc line\r\n" {
+		exp_continue
+	    }
+	    -re "^$expected_output\r\n$::gdb_prompt $" {
+		pass $gdb_test_name
+	    }
+	}
+
+	gdb_test_multiple "help show test-param" "" {
+	    -re "^help show test-param\r\n" {
+		exp_continue
+	    }
+	    -re "^This is the show doc line\r\n" {
+		exp_continue
+	    }
+	    -re "^$expected_output\r\n$::gdb_prompt $" {
+		pass $gdb_test_name
+	    }
+	}
+    }
+}
+
+# The first test is pretty vanilla. First line starts immediately
+# after the opening quotes, and closing quotes are immediately after
+# the last line.
+test \
+    [multi_line_input \
+	 "  \"\"\"This is the first line." \
+	 "" \
+	 "  This is the third line.\"\"\""] \
+    [multi_line \
+	 "This is the first line\\." \
+	 "" \
+	 "This is the third line\\."]
+
+# In this test the first line starts on the line after the opening
+# quotes.  The blank line in the middle is still completely empty.
+test \
+    [multi_line_input \
+	 "  \"\"\"" \
+	 "  This is the first line." \
+	 "" \
+	 "  This is the third line.\"\"\""] \
+    [multi_line \
+	 "This is the first line\\." \
+	 "" \
+	 "This is the third line\\."]
+
+# This test adds an indented line in the middle, we expect the
+# relative indentation to be retained.
+test\
+    [multi_line_input \
+	 "  \"\"\"" \
+	 "  This is the first line." \
+	 "    Indented second line." \
+	 "  This is the third line.\"\"\""] \
+    [multi_line \
+	 "This is the first line\\." \
+	 "  Indented second line\\." \
+	 "This is the third line\\."]
+
+# The indentation moves to the first line.
+test\
+    [multi_line_input \
+	 "  \"\"\"" \
+	 "    Indented first line." \
+	 "  This is the second line." \
+	 "  This is the third line.\"\"\""] \
+    [multi_line \
+	 "  Indented first line\\." \
+	 "This is the second line\\." \
+	 "This is the third line\\."]
+
+# The indentation moves to the last line.
+test\
+    [multi_line_input \
+	 "  \"\"\"" \
+	 "  This is the first line." \
+	 "  This is the second line." \
+	 "    Indented third line.\"\"\""] \
+    [multi_line \
+	 "This is the first line\\." \
+	 "This is the second line\\." \
+	 "  Indented third line\\."]
+
+# Tests using different amounts of white-space on a line of its own.
+# We test with the white-space at the beginning, end, and in the
+# middle of the doc string.
+foreach line [list "" " " "  " "   " "    "] {
+    # Blank lines at the beginning should be removed.
+    test \
+	[multi_line_input \
+	     "  \"\"\"" \
+	     $line \
+	     "  This is the first line." \
+	     "    Indented second line." \
+	     "  This is the third line.\"\"\""] \
+	[multi_line \
+	     "This is the first line\\." \
+	     "  Indented second line\\." \
+	     "This is the third line\\."]
+
+    # As should blank lines at the end.
+    test \
+	[multi_line_input \
+	     "  \"\"\"" \
+	     "  This is the first line." \
+	     "    Indented second line." \
+	     "  This is the third line." \
+	     $line \
+	     "  \"\"\""] \
+	[multi_line \
+	     "This is the first line\\." \
+	     "  Indented second line\\." \
+	     "This is the third line\\."]
+
+    # While blank lines in the middle should have all white-space
+    # removed.
+    test \
+	[multi_line_input \
+	     "  \"\"\"" \
+	     "  This is the first line." \
+	     $line \
+	     "  This is the third line." \
+	     $line \
+	     "  \"\"\""] \
+	[multi_line \
+	     "This is the first line\\." \
+	     "" \
+	     "This is the third line\\."]
+}
+
+# Check for correct handling of closing quotes being on a line of
+# their own.
+test\
+    [multi_line_input \
+	 "  \"\"\"  " \
+	 "  This is the first line." \
+	 "    Indented second line." \
+	 "  This is the third line." \
+	 "  \"\"\""] \
+    [multi_line \
+	 "This is the first line\\." \
+	 "  Indented second line\\." \
+	 "This is the third line\\." ]
+
+
+# Check with a single 'x' character before the closing quotes.  Make
+# sure we don't drop this character.
+test\
+    [multi_line_input \
+	 "  \"\"\"  " \
+	 "  This is the first line." \
+	 "    Indented second line." \
+	 "  This is the third line." \
+	 "  x\"\"\""] \
+    [multi_line \
+	 "This is the first line\\." \
+	 "  Indented second line\\." \
+	 "This is the third line\\." \
+	 "x"]
-- 
2.30.2