src/theory/strings/strings_entail.h

   1 /*********************                                                        */
   2 /*! \file strings_entail.h
   3  ** \verbatim
   4  ** Top contributors (to current version):
   5  **   Andrew Reynolds, Andres Noetzli
   6  ** This file is part of the CVC4 project.
   7  ** Copyright (c) 2009-2021 by the authors listed in the file AUTHORS
   8  ** in the top-level source directory and their institutional affiliations.
   9  ** All rights reserved.  See the file COPYING in the top-level source
  10  ** directory for licensing information.\endverbatim
  11  **
  12  ** \brief Entailment tests involving strings
  13  **/
  14
  15 #include "cvc4_private.h"
  16
  17 #ifndef CVC4__THEORY__STRINGS__STRING_ENTAIL_H
  18 #define CVC4__THEORY__STRINGS__STRING_ENTAIL_H
  19
  20 #include <vector>
  21
  22 #include "expr/node.h"
  23
  24 namespace CVC5 {
  25 namespace theory {
  26 namespace strings {
  27
  28 class SequencesRewriter;
  29
  30 /**
  31  * Entailment tests involving strings.
  32  * Some of these techniques are described in Reynolds et al, "High Level
  33  * Abstractions for Simplifying Extended String Constraints in SMT", CAV 2019.
  34  */
  35 class StringsEntail
  36 {
  37  public:
  38   StringsEntail(SequencesRewriter& rewriter);
  39
  40   /** can constant contain list
  41    * return true if constant c can contain the list l in order
  42    * firstc/lastc store which indices in l were used to determine the return
  43    * value.
  44    *   (This is typically used when this function returns false, for minimizing
  45    * explanations)
  46    *
  47    * For example:
  48    *   canConstantContainList( "abc", { x, "c", y } ) returns true
  49    *     firstc/lastc are updated to 1/1
  50    *   canConstantContainList( "abc", { x, "d", y } ) returns false
  51    *     firstc/lastc are updated to 1/1
  52    *   canConstantContainList( "abcdef", { x, "b", y, "a", z, "c", w }
  53    *     returns false
  54    *     firstc/lastc are updated to 1/3
  55    *   canConstantContainList( "abcdef", { x, "b", y, "e", z, "c", w }
  56    *     returns false
  57    *     firstc/lastc are updated to 1/5
  58    */
  59   static bool canConstantContainList(Node c,
  60                                      std::vector<Node>& l,
  61                                      int& firstc,
  62                                      int& lastc);
  63   /** can constant contain concat
  64    * same as above but with n = str.++( l ) instead of l
  65    */
  66   static bool canConstantContainConcat(Node c, Node n, int& firstc, int& lastc);
  67
  68   /** strip symbolic length
  69    *
  70    * This function strips off components of n1 whose length is less than or
  71    * equal to argument curr, and stores them in nr. The direction dir
  72    * determines whether the components are removed from the start or end of n1.
  73    * If `strict` is set to true, then the function only returns true if full
  74    * length `curr` can be stripped.
  75    *
  76    * In detail, this function updates n1 to n1' such that:
  77    *   If dir=1,
  78    *     n1 = str.++( nr, n1' )
  79    *   If dir=-1
  80    *     n1 = str.++( n1', nr )
  81    * It updates curr to curr' such that:
  82    *   curr' = curr - str.len( str.++( nr ) ), and
  83    *   curr' >= 0
  84    * where the latter fact is determined by checkArithEntail.
  85    *
  86    * This function returns true if n1 is modified.
  87    *
  88    * For example:
  89    *
  90    *  stripSymbolicLength( { x, "abc", y }, {}, 1, str.len(x)+1 )
  91    *    returns true
  92    *    n1 is updated to { "bc", y }
  93    *    nr is updated to { x, "a" }
  94    *    curr is updated to 0   *
  95    *
  96    * stripSymbolicLength( { x, "abc", y }, {}, 1, str.len(x)-1 )
  97    *    returns false
  98    *
  99    *  stripSymbolicLength( { y, "abc", x }, {}, 1, str.len(x)+1 )
 100    *    returns false
 101    *
 102    *  stripSymbolicLength( { x, "abc", y }, {}, -1, 2*str.len(y)+4 )
 103    *    returns true
 104    *    n1 is updated to { x }
 105    *    nr is updated to { "abc", y }
 106    *    curr is updated to str.len(y)+1
 107    */
 108   static bool stripSymbolicLength(std::vector<Node>& n1,
 109                                   std::vector<Node>& nr,
 110                                   int dir,
 111                                   Node& curr,
 112                                   bool strict = false);
 113   /** component contains
 114    * This function is used when rewriting str.contains( t1, t2 ), where
 115    * n1 is the vector form of t1
 116    * n2 is the vector form of t2
 117    *
 118    * If this function returns n>=0 for some n, then
 119    *    n1 = { x1...x{n-1} xn...x{n+s} x{n+s+1}...xm },
 120    *    n2 = { y1...ys },
 121    *    y1 is a suffix of xn,
 122    *    y2...y{s-1} = x{n+1}...x{n+s-1}, and
 123    *    ys is a prefix of x{n+s}
 124    * Otherwise it returns -1.
 125    *
 126    * This function may update n1 if computeRemainder = true.
 127    *   We maintain the invariant that the resulting value n1'
 128    *   of n1 after this function is such that:
 129    *     n1 = str.++( nb, n1', ne )
 130    * The vectors nb and ne have the following properties.
 131    * If computeRemainder = true, then
 132    *   If remainderDir != -1, then
 133    *     ne is { x{n+s}' x{n+s+1}...xm }
 134    *     where x{n+s} = str.++( ys, x{n+s}' ).
 135    *   If remainderDir != 1, then
 136    *     nb is { x1, ..., x{n-1}, xn' }
 137    *     where xn = str.++( xn', y1 ).
 138    *
 139    * For example:
 140    *
 141    * componentContains({ x, "abc", x }, { "b" }, {}, true, 0)
 142    *   returns 1,
 143    *   n1 is updated to { "b" },
 144    *   nb is updated to { x, "a" },
 145    *   ne is updated to { "c", x }
 146    *
 147    * componentContains({ x, "abc", x }, { "b" }, {}, true, 1)
 148    *   returns 1,
 149    *   n1 is updated to { x, "ab" },
 150    *   ne is updated to { "c", x }
 151    *
 152    * componentContains({ y, z, "abc", x, "def" }, { "c", x, "de" }, {}, true, 1)
 153    *   returns 2,
 154    *   n1 is updated to { y, z, "abc", x, "de" },
 155    *   ne is updated to { "f" }
 156    *
 157    * componentContains({ y, "abc", x, "def" }, { "c", x, "de" }, {}, true, -1)
 158    *   returns 1,
 159    *   n1 is updated to { "c", x, "def" },
 160    *   nb is updated to { y, "ab" }
 161    */
 162   int componentContains(std::vector<Node>& n1,
 163                         std::vector<Node>& n2,
 164                         std::vector<Node>& nb,
 165                         std::vector<Node>& ne,
 166                         bool computeRemainder = false,
 167                         int remainderDir = 0);
 168   /** strip constant endpoints
 169    * This function is used when rewriting str.contains( t1, t2 ), where
 170    * n1 is the vector form of t1
 171    * n2 is the vector form of t2
 172    *
 173    * It modifies n1 to a new vector n1' such that:
 174    * (1) str.contains( str.++( n1 ), str.++( n2 ) ) is equivalent to
 175    * str.contains( str.++( n1' ), str.++( n2 ) )
 176    * (2) str.++( n1 ) = str.++( nb, n1', ne )
 177    *
 178    * "dir" is the direction in which we can modify n1:
 179    * if dir = 1, then we allow dropping components from the front of n1,
 180    * if dir = -1, then we allow dropping components from the back of n1,
 181    * if dir = 0, then we allow dropping components from either.
 182    *
 183    * It returns true if n1 is modified.
 184    *
 185    * For example:
 186    * stripConstantEndpoints({ "ab", x, "de" }, { "c" }, {}, {}, 1)
 187    *   returns true,
 188    *   n1 is updated to { x, "de" }
 189    *   nb is updated to { "ab" }
 190    * stripConstantEndpoints({ "ab", x, "de" }, { "bd" }, {}, {}, 0)
 191    *   returns true,
 192    *   n1 is updated to { "b", x, "d" }
 193    *   nb is updated to { "a" }
 194    *   ne is updated to { "e" }
 195    * stripConstantEndpoints({ "ad", substr("ccc",x,y) }, { "d" }, {}, {}, -1)
 196    *   returns true,
 197    *   n1 is updated to {"ad"}
 198    *   ne is updated to { substr("ccc",x,y) }
 199    */
 200   static bool stripConstantEndpoints(std::vector<Node>& n1,
 201                                      std::vector<Node>& n2,
 202                                      std::vector<Node>& nb,
 203                                      std::vector<Node>& ne,
 204                                      int dir = 0);
 205
 206   /**
 207    * Checks whether a string term `a` is entailed to contain or not contain a
 208    * string term `b`.
 209    *
 210    * @param a The string that is checked whether it contains `b`
 211    * @param b The string that is checked whether it is contained in `a`
 212    * @param fullRewriter Determines whether the function can use the full
 213    * rewriter or only `rewriteContains()` (useful for avoiding loops)
 214    * @return true node if it can be shown that `a` contains `b`, false node if
 215    * it can be shown that `a` does not contain `b`, null node otherwise
 216    */
 217   Node checkContains(Node a, Node b, bool fullRewriter = true);
 218
 219   /** entail non-empty
 220    *
 221    * Checks whether string a is entailed to be non-empty. Is equivalent to
 222    * the call checkArithEntail( len( a ), true ).
 223    */
 224   static bool checkNonEmpty(Node a);
 225
 226   /**
 227    * Checks whether string has at most/exactly length one. Length one strings
 228    * can be used for more aggressive rewriting because there is guaranteed that
 229    * it cannot be overlap multiple components in a string concatenation.
 230    *
 231    * @param s The string to check
 232    * @param strict If true, the string must have exactly length one, otherwise
 233    * at most length one
 234    * @return True if the string has at most/exactly length one, false otherwise
 235    */
 236   static bool checkLengthOne(Node s, bool strict = false);
 237
 238   /**
 239    * Checks whether it is always true that `a` is a strict subset of `b` in the
 240    * multiset domain.
 241    *
 242    * Examples:
 243    *
 244    * a = (str.++ "A" x), b = (str.++ "A" x "B") ---> true
 245    * a = (str.++ "A" x), b = (str.++ "B" x "AA") ---> true
 246    * a = (str.++ "A" x), b = (str.++ "B" y "AA") ---> false
 247    *
 248    * @param a The term for which it should be checked if it is a strict subset
 249    * of `b` in the multiset domain
 250    * @param b The term for which it should be checked if it is a strict
 251    * superset of `a` in the multiset domain
 252    * @return True if it is always the case that `a` is a strict subset of `b`,
 253    * false otherwise.
 254    */
 255   static bool checkMultisetSubset(Node a, Node b);
 256
 257   /**
 258    * Returns a character `c` if it is always the case that str.in.re(a, c*),
 259    * i.e. if all possible values of `a` only consist of `c` characters, and the
 260    * null node otherwise. If `a` is the empty string, the function returns an
 261    * empty string.
 262    *
 263    * @param a The node to check for homogeneity
 264    * @return If `a` is homogeneous, the only character that it may contain, the
 265    * empty string if `a` is empty, and the null node otherwise
 266    */
 267   static Node checkHomogeneousString(Node a);
 268
 269   /**
 270    * Overapproximates the possible values of node n. This overapproximation
 271    * assumes that n can return a value x or the empty string and tries to find
 272    * the simplest x such that this holds. In the general case, x is the same as
 273    * the input n. This overapproximation can be used to sort terms with the
 274    * same possible values in string concatenation for example.
 275    *
 276    * Example:
 277    *
 278    * getStringOrEmpty( (str.replace "" x y) ) --> y because (str.replace "" x y)
 279    * either returns y or ""
 280    *
 281    * getStringOrEmpty( (str.substr "ABC" x y) ) --> (str.substr "ABC" x y)
 282    * because the function could not compute a simpler
 283    */
 284   static Node getStringOrEmpty(Node n);
 285
 286   /**
 287    * Infers a conjunction of equalities that correspond to (str.contains x y)
 288    * if it can show that the length of y is greater or equal to the length of
 289    * x. If y is a concatentation, we get x = y1 ++ ... ++ yn, the conjunction
 290    * is of the form:
 291    *
 292    * (and (= x (str.++ y1' ... ym')) (= y1'' "") ... (= yk'' ""))
 293    *
 294    * where each yi'' are yi that must be empty for (= x y) to hold and yi' are
 295    * yi that the function could not infer anything about.  Returns a null node
 296    * if the function cannot infer that str.len(y) >= str.len(x). Returns (= x
 297    * y) if the function can infer that str.len(y) >= str.len(x) but cannot
 298    * infer that any of the yi must be empty.
 299    */
 300   static Node inferEqsFromContains(Node x, Node y);
 301
 302  private:
 303   /** component contains base
 304    *
 305    * This function is a helper for the above function.
 306    *
 307    * It returns true if n2 is contained in n1 with the following
 308    * restrictions:
 309    *   If dir=1, then n2 must be a suffix of n1.
 310    *   If dir=-1, then n2 must be a prefix of n1.
 311    *
 312    * If computeRemainder is true, then n1rb and n1re are
 313    * updated such that :
 314    *   n1 = str.++( n1rb, n2, n1re )
 315    * where a null value of n1rb and n1re indicates the
 316    * empty string.
 317    *
 318    * For example:
 319    *
 320    * componentContainsBase("cabe", "ab", n1rb, n1re, 1, false)
 321    *   returns false.
 322    *
 323    * componentContainsBase("cabe", "ab", n1rb, n1re, 0, true)
 324    *   returns true,
 325    *   n1rb is set to "c",
 326    *   n1re is set to "e".
 327    *
 328    * componentContainsBase(y, str.substr(y,0,5), n1rb, n1re, -1, true)
 329    *   returns true,
 330    *   n1re is set to str.substr(y,5,str.len(y)).
 331    *
 332    *
 333    * Notice that this function may return false when it cannot compute a
 334    * remainder when it otherwise would have returned true. For example:
 335    *
 336    * componentContainsBase(y, str.substr(y,x,z), n1rb, n1re, 0, false)
 337    *   returns true.
 338    *
 339    * Hence, we know that str.substr(y,x,z) is contained in y. However:
 340    *
 341    * componentContainsBase(y, str.substr(y,x,z), n1rb, n1re, 0, true)
 342    *   returns false.
 343    *
 344    * The reason is since computeRemainder=true, it must be that
 345    *   y = str.++( n1rb, str.substr(y,x,z), n1re )
 346    * for some n1rb, n1re. However, to construct such n1rb, n1re would require
 347    * e.g. the terms:
 348    *   y = str.++( ite( x+z < 0 OR x < 0, "", str.substr(y,0,x) ),
 349    *               str.substr(y,x,z),
 350    *               ite( x+z < 0 OR x < 0, y, str.substr(y,x+z,len(y)) ) )
 351    *
 352    * Since we do not wish to introduce ITE terms in the rewriter, we instead
 353    * return false, indicating that we cannot compute the remainder.
 354    */
 355   bool componentContainsBase(
 356       Node n1, Node n2, Node& n1rb, Node& n1re, int dir, bool computeRemainder);
 357   /**
 358    * Simplifies a given node `a` s.t. the result is a concatenation of string
 359    * terms that can be interpreted as a multiset and which contains all
 360    * multisets that `a` could form.
 361    *
 362    * Examples:
 363    *
 364    * (str.substr "AA" 0 n) ---> "AA"
 365    * (str.replace "AAA" x "BB") ---> (str.++ "AAA" "BB")
 366    *
 367    * @param a The node to simplify
 368    * @return A concatenation that can be interpreted as a multiset
 369    */
 370   static Node getMultisetApproximation(Node a);
 371
 372  private:
 373   /**
 374    * Reference to the sequences rewriter that owns this `StringsEntail`
 375    * instance.
 376    */
 377   SequencesRewriter& d_rewriter;
 378 };
 379
 380 }  // namespace strings
 381 }  // namespace theory
 382 }  // namespace CVC5
 383
 384 #endif /* CVC4__THEORY__STRINGS__STRING_ENTAIL_H */