Properly sanatize user names in LFSC (#8080)
authorAndrew Reynolds <andrew.j.reynolds@gmail.com>
Wed, 23 Feb 2022 16:46:50 +0000 (10:46 -0600)
committerGitHub <noreply@github.com>
Wed, 23 Feb 2022 16:46:50 +0000 (16:46 +0000)
Previously, we would get LFSC proof checking failures if we were using an identifier that was valid in SMT-LIB but not LFSC.

Co-authored-by: Alex Ozdemir <aozdemir@hmc.edu>
src/proof/lfsc/lfsc_node_converter.cpp
src/proof/lfsc/lfsc_node_converter.h
test/unit/CMakeLists.txt
test/unit/proof/CMakeLists.txt [new file with mode: 0644]
test/unit/proof/lfsc_node_converter_black.cpp [new file with mode: 0644]

index 1ff5d2a0ffe39d932ebdddc7ef3105318a7debb2..3dc0aa3aa7c3741eb2433de72bf7cebe0a930967 100644 (file)
@@ -15,6 +15,7 @@
 
 #include "proof/lfsc/lfsc_node_converter.h"
 
+#include <iomanip>
 #include <sstream>
 
 #include "expr/array_store_all.h"
@@ -142,10 +143,7 @@ Node LfscNodeConverter::postConvert(Node n)
   }
   else if (n.isVar())
   {
-    std::stringstream ss;
-    ss << n;
-    Node nn = mkInternalSymbol(getNameForUserName(ss.str()), tn);
-    return nn;
+    return mkInternalSymbol(getNameForUserNameOf(n), tn);
   }
   else if (k == APPLY_UF)
   {
@@ -604,9 +602,48 @@ TypeNode LfscNodeConverter::postConvertType(TypeNode tn)
 
 std::string LfscNodeConverter::getNameForUserName(const std::string& name)
 {
-  std::stringstream ss;
-  ss << "cvc." << name;
-  return ss.str();
+  // For user name X, we do cvc.Y, where Y contains an escaped version of X.
+  // Specifically, since LFSC does not allow these characters in identifier
+  // bodies: "() \t\n\f;", each is replaced with an escape sequence "\xXX"
+  // where XX is the zero-padded hex representation of the code point. "\\" is
+  // also escaped.
+  //
+  // See also: https://github.com/cvc5/LFSC/blob/master/src/lexer.flex#L24
+  //
+  // The "cvc." prefix ensures we do not clash with LFSC definitions.
+  //
+  // The escaping ensures that all names are valid LFSC identifiers.
+  std::string sanitized("cvc.");
+  size_t found = sanitized.size();
+  sanitized += name;
+  // The following sanitizes symbols that are not allowed in LFSC identifiers
+  // here, e.g.
+  //   |a b|
+  // is converted to:
+  //   cvc.a\x20b
+  // where "20" is the hex unicode value of " ".
+  do
+  {
+    found = sanitized.find_first_of("() \t\n\f\\;", found);
+    if (found != std::string::npos)
+    {
+      // Emit hex sequence
+      std::stringstream seq;
+      seq << "\\x" << std::setbase(16) << std::setfill('0') << std::setw(2)
+          << static_cast<size_t>(sanitized[found]);
+      sanitized.replace(found, 1, seq.str());
+      // increment found over the escape
+      found += 3;
+    }
+  } while (found != std::string::npos);
+  return sanitized;
+}
+
+std::string LfscNodeConverter::getNameForUserNameOf(Node v)
+{
+  std::string name;
+  v.getAttribute(expr::VarNameAttr(), name);
+  return getNameForUserName(name);
 }
 
 bool LfscNodeConverter::shouldTraverse(Node n)
@@ -871,11 +908,9 @@ Node LfscNodeConverter::getOperatorOfTerm(Node n, bool macroApply)
       {
         Assert(indices.size() == 1);
         // must convert to user name
-        std::stringstream sss;
-        sss << indices[0];
         TypeNode intType = nm->integerType();
         indices[0] =
-            getSymbolInternal(k, intType, getNameForUserName(sss.str()));
+            getSymbolInternal(k, intType, getNameForUserNameOf(indices[0]));
       }
     }
     else if (op.getType().isFunction())
@@ -918,9 +953,8 @@ Node LfscNodeConverter::getOperatorOfTerm(Node n, bool macroApply)
     {
       unsigned index = DType::indexOf(op);
       const DType& dt = DType::datatypeOf(op);
-      std::stringstream ssc;
-      ssc << dt[index].getConstructor();
-      opName << getNameForUserName(ssc.str());
+      // get its variable name
+      opName << getNameForUserNameOf(dt[index].getConstructor());
     }
     else if (k == APPLY_SELECTOR)
     {
index 07eed996c9fe813f9c4e57e391d942ffca6bc271..bbfbaba8e0496f1e5ef7803145ea8c4fb21a717d 100644 (file)
@@ -103,6 +103,8 @@ class LfscNodeConverter : public NodeConverter
 
   /** get name for user name */
   static std::string getNameForUserName(const std::string& name);
+  /** get name for the name of node v, where v should be a variable */
+  static std::string getNameForUserNameOf(Node v);
 
  private:
   /** Should we traverse n? */
index 0ba29544ecf42ea26243cd321f4e1cddbc2e88cf..1fbaf5ba2fc1a5cdcc3e1c9473f53311ee6b2c12 100644 (file)
@@ -102,6 +102,7 @@ if(ENABLE_UNIT_TESTING)
   add_subdirectory(options)
   add_subdirectory(parser)
   add_subdirectory(printer)
+  add_subdirectory(proof)
   add_subdirectory(prop)
   add_subdirectory(theory)
   add_subdirectory(preprocessing)
diff --git a/test/unit/proof/CMakeLists.txt b/test/unit/proof/CMakeLists.txt
new file mode 100644 (file)
index 0000000..749948e
--- /dev/null
@@ -0,0 +1 @@
+cvc5_add_unit_test_black(lfsc_node_converter_black proof)
diff --git a/test/unit/proof/lfsc_node_converter_black.cpp b/test/unit/proof/lfsc_node_converter_black.cpp
new file mode 100644 (file)
index 0000000..31badc8
--- /dev/null
@@ -0,0 +1,60 @@
+/******************************************************************************
+ * Top contributors (to current version):
+ *   Alex Ozdemir
+ *
+ * This file is part of the cvc5 project.
+ *
+ * Copyright (c) 2009-2021 by the authors listed in the file AUTHORS
+ * in the top-level source directory and their institutional affiliations.
+ * All rights reserved.  See the file COPYING in the top-level source
+ * directory for licensing information.
+ * ****************************************************************************
+ *
+ * Black box testing of LFSC proof conversion
+ */
+
+#include <string>
+
+#include "proof/lfsc/lfsc_node_converter.h"
+#include "test.h"
+
+namespace cvc5 {
+using namespace cvc5::proof;
+
+namespace test {
+
+class TestLfscNodeConverterBlack : public TestInternal
+{
+};
+
+TEST_F(TestLfscNodeConverterBlack, ident_sanitize)
+{
+  // LFSC does not allow these characters in identifier bodies: "() \t\n\f;"
+  //
+  // See also: https://github.com/cvc5/LFSC/blob/master/src/lexer.flex#L24
+  //
+  // We use hex escapes that are valid in Python:
+  // https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
+  ASSERT_EQ(LfscNodeConverter::getNameForUserName("there are spaces"),
+            R"x(cvc.there\x20are\x20spaces)x");
+  ASSERT_EQ(LfscNodeConverter::getNameForUserName("  "), R"x(cvc.\x20\x20)x");
+  ASSERT_EQ(LfscNodeConverter::getNameForUserName("there\\are\\slashes"),
+            R"x(cvc.there\x5care\x5cslashes)x");
+  ASSERT_EQ(LfscNodeConverter::getNameForUserName("there\nare\nnewlines"),
+            R"x(cvc.there\x0aare\x0anewlines)x");
+  ASSERT_EQ(LfscNodeConverter::getNameForUserName("there\rare\rCRs"),
+            "cvc.there\rare\rCRs");
+  ASSERT_EQ(LfscNodeConverter::getNameForUserName("there\tare\ttabs"),
+            R"x(cvc.there\x09are\x09tabs)x");
+  ASSERT_EQ(LfscNodeConverter::getNameForUserName("there(are(parens"),
+            R"x(cvc.there\x28are\x28parens)x");
+  ASSERT_EQ(LfscNodeConverter::getNameForUserName("there)are)parens"),
+            R"x(cvc.there\x29are\x29parens)x");
+  ASSERT_EQ(LfscNodeConverter::getNameForUserName("there;are;semis"),
+            R"x(cvc.there\x3bare\x3bsemis)x");
+  ASSERT_EQ(LfscNodeConverter::getNameForUserName(""), R"x(cvc.)x");
+}
+
+}  // namespace test
+
+}  // namespace cvc5