From a9ac3972bd3d0f0328e957bc04d8c0ef12811a51 Mon Sep 17 00:00:00 2001
From: Andrew Reynolds <andrew.j.reynolds@gmail.com>
Date: Mon, 23 Aug 2021 16:57:13 -0500
Subject: [PATCH] Purify substitutions in strings proof reconstruction (#7035)

This fixes an issue in strings proof reconstruction where sequential substitutions are used over possibly non-atomic terms like (str.replace x a b) and x simultaneously.

This leads to cases where we failed to reconstruct proofs, since a substitution x -> c, (str.replace x a b) -> d may unintentionally generate the term (str.replace c a b), after which the second substitution fails to apply.
---
 src/theory/strings/infer_proof_cons.cpp       | 140 ++++++++++++++++--
 src/theory/strings/infer_proof_cons.h         |  62 ++++++++
 test/regress/CMakeLists.txt                   |   1 +
 .../regress0/strings/delta-trust-subs.smt2    |   7 +
 4 files changed, 201 insertions(+), 9 deletions(-)
 create mode 100644 test/regress/regress0/strings/delta-trust-subs.smt2
diff --git a/src/theory/strings/infer_proof_cons.cpp b/src/theory/strings/infer_proof_cons.cpp
index d214babae..b8c0a851c 100644
--- a/src/theory/strings/infer_proof_cons.cpp
+++ b/src/theory/strings/infer_proof_cons.cpp
@@ -84,7 +84,7 @@ void InferProofCons::convert(InferenceId infer,
   {
     Trace("strings-ipc-debug") << "InferProofCons::convert: " << infer
                                << (isRev ? " :rev " : " ") << conc << std::endl;
-    for (const Node& ec : exp)
+    for (const Node& ec : ps.d_children)
     {
       Trace("strings-ipc-debug") << "    e: " << ec << std::endl;
     }
@@ -103,9 +103,17 @@ void InferProofCons::convert(InferenceId infer,
     case InferenceId::STRINGS_NORMAL_FORM:
     case InferenceId::STRINGS_CODE_PROXY:
     {
-      ps.d_args.push_back(conc);
-      // will attempt this rule
-      ps.d_rule = PfRule::MACRO_SR_PRED_INTRO;
+      std::vector<Node> pcs = ps.d_children;
+      Node pconc = conc;
+      // purify core substitution proves conc from pconc if necessary,
+      // we apply MACRO_SR_PRED_INTRO to prove pconc
+      if (purifyCoreSubstitution(pconc, pcs, psb, false))
+      {
+        if (psb.applyPredIntro(pconc, pcs))
+        {
+          useBuffer = true;
+        }
+      }
     }
     break;
     // ========================== substitution + rewriting
@@ -232,14 +240,23 @@ void InferProofCons::convert(InferenceId infer,
         break;
       }
       // apply MACRO_SR_PRED_ELIM using equalities up to the main eq
+      // we purify the core substitution
+      std::vector<Node> pcsr(ps.d_children.begin(),
+                             ps.d_children.begin() + mainEqIndex);
+      Node pmainEq = mainEq;
+      // we transform mainEq to pmainEq and then use this as the first
+      // argument to MACRO_SR_PRED_ELIM.
+      if (!purifyCoreSubstitution(pmainEq, pcsr, psb, true))
+      {
+        break;
+      }
       std::vector<Node> childrenSRew;
-      childrenSRew.push_back(mainEq);
-      childrenSRew.insert(childrenSRew.end(),
-                          ps.d_children.begin(),
-                          ps.d_children.begin() + mainEqIndex);
+      childrenSRew.push_back(pmainEq);
+      childrenSRew.insert(childrenSRew.end(), pcsr.begin(), pcsr.end());
+      // now, conclude the proper equality
       Node mainEqSRew =
           psb.tryStep(PfRule::MACRO_SR_PRED_ELIM, childrenSRew, {});
-      if (CDProof::isSame(mainEqSRew, mainEq))
+      if (CDProof::isSame(mainEqSRew, pmainEq))
       {
         Trace("strings-ipc-core") << "...undo step" << std::endl;
         // the rule added above was not necessary
@@ -1048,6 +1065,111 @@ std::string InferProofCons::identify() const
   return "strings::InferProofCons";
 }
 
+bool InferProofCons::purifyCoreSubstitution(Node& tgt,
+                                            std::vector<Node>& children,
+                                            TheoryProofStepBuffer& psb,
+                                            bool concludeTgtNew) const
+{
+  // collect the terms to purify, which are the LHS of the substitution
+  std::unordered_set<Node> termsToPurify;
+  for (const Node& nc : children)
+  {
+    Assert(nc.getKind() == EQUAL && nc[0].getType().isStringLike());
+    termsToPurify.insert(nc[0]);
+  }
+  // now, purify each of the children of the substitution
+  for (size_t i = 0, nchild = children.size(); i < nchild; i++)
+  {
+    Node pnc = purifyCorePredicate(children[i], true, psb, termsToPurify);
+    if (pnc.isNull())
+    {
+      return false;
+    }
+    if (children[i] != pnc)
+    {
+      Trace("strings-ipc-pure-subs")
+          << "Converted: " << children[i] << " to " << pnc << std::endl;
+      children[i] = pnc;
+    }
+    // we now should have a substitution with only atomic terms
+    Assert(children[i][0].getNumChildren() == 0);
+  }
+  // now, purify the target predicate
+  tgt = purifyCorePredicate(tgt, concludeTgtNew, psb, termsToPurify);
+  return !tgt.isNull();
+}
+
+Node InferProofCons::purifyCorePredicate(
+    Node lit,
+    bool concludeNew,
+    TheoryProofStepBuffer& psb,
+    std::unordered_set<Node>& termsToPurify) const
+{
+  bool pol = lit.getKind() != NOT;
+  Node atom = pol ? lit : lit[0];
+  if (atom.getKind() != EQUAL || !atom[0].getType().isStringLike())
+  {
+    // we only purify string (dis)equalities
+    return lit;
+  }
+  // purify both sides of the equality
+  std::vector<Node> pcs;
+  bool childChanged = false;
+  for (const Node& lc : atom)
+  {
+    Node plc = purifyCoreTerm(lc, termsToPurify);
+    childChanged = childChanged || plc != lc;
+    pcs.push_back(plc);
+  }
+  if (!childChanged)
+  {
+    return lit;
+  }
+  NodeManager* nm = NodeManager::currentNM();
+  Node newLit = nm->mkNode(EQUAL, pcs);
+  if (!pol)
+  {
+    newLit = newLit.notNode();
+  }
+  Assert(lit != newLit);
+  // prove by transformation, should always succeed
+  if (!psb.applyPredTransform(
+          concludeNew ? lit : newLit, concludeNew ? newLit : lit, {}))
+  {
+    // failed, return null
+    return Node::null();
+  }
+  return newLit;
+}
+
+Node InferProofCons::purifyCoreTerm(
+    Node n, std::unordered_set<Node>& termsToPurify) const
+{
+  Assert(n.getType().isStringLike());
+  if (n.getNumChildren() == 0)
+  {
+    return n;
+  }
+  NodeManager* nm = NodeManager::currentNM();
+  if (n.getKind() == STRING_CONCAT)
+  {
+    std::vector<Node> pcs;
+    for (const Node& nc : n)
+    {
+      pcs.push_back(purifyCoreTerm(nc, termsToPurify));
+    }
+    return nm->mkNode(STRING_CONCAT, pcs);
+  }
+  if (termsToPurify.find(n) == termsToPurify.end())
+  {
+    // did not need to purify
+    return n;
+  }
+  SkolemManager* sm = nm->getSkolemManager();
+  Node k = sm->mkPurifySkolem(n, "k");
+  return k;
+}
+
 }  // namespace strings
 }  // namespace theory
 }  // namespace cvc5
diff --git a/src/theory/strings/infer_proof_cons.h b/src/theory/strings/infer_proof_cons.h
index bba03dd28..6bc64a085 100644
--- a/src/theory/strings/infer_proof_cons.h
+++ b/src/theory/strings/infer_proof_cons.h
@@ -120,6 +120,68 @@ class InferProofCons : public ProofGenerator
    * conclusion, or null if we were not able to construct a TRANS step.
    */
   Node convertTrans(Node eqa, Node eqb, TheoryProofStepBuffer& psb);
+  /**
+   * Purify core substitution.
+   *
+   * When reconstructing proofs for the core strings calculus, we rely on
+   * sequential substitutions for constructing proofs involving recursive
+   * computation of normal forms. However, this can be incorrect in cases where
+   * a term like (str.replace x a b) is being treated as an atomic term,
+   * and a substitution applied over (str.replace x a b) -> c, x -> d.
+   * This can lead to the term (str.replace d a b) being generated instead of
+   * c.
+   *
+   * As an example of this method, given input:
+   *   tgt = (= x (str.++ (f x) c))
+   *   children = { (= (f x) a), (= x (str.++ b (f x))) }
+   *   concludeTgtNew = true
+   * This method updates:
+   *   tgt = (= x (str.++ k c))
+   *   children = { (= k a), (= x (str.++ b k)) }
+   * where k is the purification skolem for (f x). Additionally, it ensures
+   * that psb has a proof of:
+   *   (= x (str.++ k c)) from (= x (str.++ (f x) c))
+   *      ...note the direction, since concludeTgtNew = true
+   *   (= k a) from (= (f x) a)
+   *   (= x (str.++ b k)) from (= x (str.++ b (f x)))
+   * Notice that the resulting substitution can now be safely used as a
+   * sequential substution, since (f x) has been purified with k. The proofs
+   * in psb ensure that a proof step involving the purified substitution will
+   * have the same net effect as a proof step using the original substitution.
+   *
+   * @param tgt The term we were originally going to apply the substitution to.
+   * @param children The premises corresponding to the substitution.
+   * @param psb The proof step buffer
+   * @param concludeTgtNew Whether we require proving the purified form of
+   * tgt from tgt or vice versa.
+   * @return true if we successfully purified the substitution and the target
+   * term. Additionally, if successful, we ensure psb contains proofs of
+   * children'[i] from children[i] for all i, and tgt' from tgt (or vice versa
+   * based on concludeTgtNew).
+   */
+  bool purifyCoreSubstitution(Node& tgt,
+                              std::vector<Node>& children,
+                              TheoryProofStepBuffer& psb,
+                              bool concludeTgtNew = false) const;
+  /**
+   * Return the purified form of the predicate lit with respect to a set of
+   * terms to purify, call the returned literal lit'.
+   * If concludeNew is true, then we add a proof of lit' from lit in psb;
+   * otherwise we add a proof of lit from lit'.
+   * Note that string predicates that require purification are string
+   * (dis)equalities only.
+   */
+  Node purifyCorePredicate(Node lit,
+                           bool concludeNew,
+                           TheoryProofStepBuffer& psb,
+                           std::unordered_set<Node>& termsToPurify) const;
+  /**
+   * Purify term with respect to a set of terms to purify. This replaces
+   * all terms to purify with their purification variables that occur in
+   * positions that are relevant for the core calculus of strings (direct
+   * children of concat or equal).
+   */
+  Node purifyCoreTerm(Node n, std::unordered_set<Node>& termsToPurify) const;
   /** the proof node manager */
   ProofNodeManager* d_pnm;
   /** The lazy fact map */
diff --git a/test/regress/CMakeLists.txt b/test/regress/CMakeLists.txt
index 55649c2f0..ce047c877 100644
--- a/test/regress/CMakeLists.txt
+++ b/test/regress/CMakeLists.txt
@@ -1149,6 +1149,7 @@ set(regress_0_tests
   regress0/strings/code-perf.smt2
   regress0/strings/code-sat-neg-one.smt2
   regress0/strings/complement-simple.smt2
+  regress0/strings/delta-trust-subs.smt2
   regress0/strings/escchar_25.smt2
   regress0/strings/escchar.smt2
   regress0/strings/from_code.smt2
diff --git a/test/regress/regress0/strings/delta-trust-subs.smt2 b/test/regress/regress0/strings/delta-trust-subs.smt2
new file mode 100644
index 000000000..eb3c90374
--- /dev/null
+++ b/test/regress/regress0/strings/delta-trust-subs.smt2
@@ -0,0 +1,7 @@
+; COMMAND-LINE: --strings-exp
+; EXPECT: unsat
+(set-logic ALL)
+(set-info :status unsat)
+(declare-fun a () String)
+(assert (= (str.++ a "0" (str.++ a a) "A") (str.++ "0" (str.from_code (str.len a)) (str.replace "A" (str.++ a "A") a))))
+(check-sat)
-- 
2.30.2