1 /********************* */
2 /*! \file strings_entail.cpp
4 ** Top contributors (to current version):
5 ** Andrew Reynolds, Andres Noetzli
6 ** This file is part of the CVC4 project.
7 ** Copyright (c) 2009-2019 by the authors listed in the file AUTHORS
8 ** in the top-level source directory) and their institutional affiliations.
9 ** All rights reserved. See the file COPYING in the top-level source
10 ** directory for licensing information.\endverbatim
12 ** \brief Implementation of entailment tests involving strings.
15 #include "theory/strings/strings_entail.h"
17 #include "expr/node_builder.h"
18 #include "theory/rewriter.h"
19 #include "theory/strings/arith_entail.h"
20 #include "theory/strings/sequences_rewriter.h"
21 #include "theory/strings/theory_strings_utils.h"
22 #include "theory/strings/word.h"
24 using namespace CVC4::kind
;
30 StringsEntail::StringsEntail(SequencesRewriter
& rewriter
) : d_rewriter(rewriter
)
34 bool StringsEntail::canConstantContainConcat(Node c
,
40 CVC4::String t
= c
.getConst
<String
>();
41 const std::vector
<unsigned>& tvec
= t
.getVec();
42 Assert(n
.getKind() == STRING_CONCAT
);
43 // must find constant components in order
47 for (unsigned i
= 0; i
< n
.getNumChildren(); i
++)
51 firstc
= firstc
== -1 ? i
: firstc
;
53 CVC4::String s
= n
[i
].getConst
<String
>();
54 size_t new_pos
= t
.find(s
, pos
);
55 if (new_pos
== std::string::npos
)
61 pos
= new_pos
+ s
.size();
64 else if (n
[i
].getKind() == STRING_ITOS
&& ArithEntail::check(n
[i
][0]))
66 // find the first occurrence of a digit starting at pos
67 while (pos
< tvec
.size() && !String::isDigit(tvec
[pos
]))
71 if (pos
== tvec
.size())
75 // must consume at least one digit here
82 bool StringsEntail::canConstantContainList(Node c
,
88 // must find constant components in order
92 for (unsigned i
= 0; i
< l
.size(); i
++)
96 firstc
= firstc
== -1 ? i
: firstc
;
98 size_t new_pos
= Word::find(c
, l
[i
], pos
);
99 if (new_pos
== std::string::npos
)
105 pos
= new_pos
+ Word::getLength(l
[i
]);
112 bool StringsEntail::stripSymbolicLength(std::vector
<Node
>& n1
,
113 std::vector
<Node
>& nr
,
117 Assert(dir
== 1 || dir
== -1);
119 Node zero
= NodeManager::currentNM()->mkConst(CVC4::Rational(0));
125 Assert(!curr
.isNull());
127 if (curr
!= zero
&& sindex
< n1
.size())
129 unsigned sindex_use
= dir
== 1 ? sindex
: ((n1
.size() - 1) - sindex
);
130 if (n1
[sindex_use
].isConst())
132 // could strip part of a constant
134 ArithEntail::getConstantBound(Rewriter::rewrite(curr
));
135 if (!lowerBound
.isNull())
137 Assert(lowerBound
.isConst());
138 Rational lbr
= lowerBound
.getConst
<Rational
>();
141 Assert(ArithEntail::check(curr
, true));
142 CVC4::String s
= n1
[sindex_use
].getConst
<String
>();
144 NodeManager::currentNM()->mkConst(CVC4::Rational(s
.size()));
146 NodeManager::currentNM()->mkNode(MINUS
, lowerBound
, ncl
);
147 next_s
= Rewriter::rewrite(next_s
);
148 Assert(next_s
.isConst());
149 // we can remove the entire constant
150 if (next_s
.getConst
<Rational
>().sgn() >= 0)
152 curr
= Rewriter::rewrite(
153 NodeManager::currentNM()->mkNode(MINUS
, curr
, ncl
));
159 // we can remove part of the constant
160 // lower bound minus the length of a concrete string is negative,
161 // hence lowerBound cannot be larger than long max
162 Assert(lbr
< Rational(String::maxSize()));
163 curr
= Rewriter::rewrite(
164 NodeManager::currentNM()->mkNode(MINUS
, curr
, lowerBound
));
165 uint32_t lbsize
= lbr
.getNumerator().toUnsignedInt();
166 Assert(lbsize
< s
.size());
169 // strip partially from the front
171 NodeManager::currentNM()->mkConst(s
.prefix(lbsize
)));
172 n1
[sindex_use
] = NodeManager::currentNM()->mkConst(
173 s
.suffix(s
.size() - lbsize
));
177 // strip partially from the back
179 NodeManager::currentNM()->mkConst(s
.suffix(lbsize
)));
180 n1
[sindex_use
] = NodeManager::currentNM()->mkConst(
181 s
.prefix(s
.size() - lbsize
));
185 Assert(ArithEntail::check(curr
));
189 // we cannot remove the constant
195 Node next_s
= NodeManager::currentNM()->mkNode(
198 NodeManager::currentNM()->mkNode(STRING_LENGTH
, n1
[sindex_use
]));
199 next_s
= Rewriter::rewrite(next_s
);
200 if (ArithEntail::check(next_s
))
213 nr
.insert(nr
.begin(), n1
.begin(), n1
.begin() + sindex
);
214 n1
.erase(n1
.begin(), n1
.begin() + sindex
);
218 nr
.insert(nr
.end(), n1
.end() - sindex
, n1
.end());
219 n1
.erase(n1
.end() - sindex
, n1
.end());
226 int StringsEntail::componentContains(std::vector
<Node
>& n1
,
227 std::vector
<Node
>& n2
,
228 std::vector
<Node
>& nb
,
229 std::vector
<Node
>& ne
,
230 bool computeRemainder
,
235 // if n2 is a singleton, we can do optimized version here
238 for (unsigned i
= 0; i
< n1
.size(); i
++)
242 if (componentContainsBase(n1
[i
], n2
[0], n1rb
, n1re
, 0, computeRemainder
))
244 if (computeRemainder
)
247 if (remainderDir
!= -1)
253 ne
.insert(ne
.end(), n1
.begin() + i
+ 1, n1
.end());
254 n1
.erase(n1
.begin() + i
+ 1, n1
.end());
256 else if (!n1re
.isNull())
258 n1
[i
] = Rewriter::rewrite(
259 NodeManager::currentNM()->mkNode(STRING_CONCAT
, n1
[i
], n1re
));
261 if (remainderDir
!= 1)
263 nb
.insert(nb
.end(), n1
.begin(), n1
.begin() + i
);
264 n1
.erase(n1
.begin(), n1
.begin() + i
);
270 else if (!n1rb
.isNull())
272 n1
[i
] = Rewriter::rewrite(
273 NodeManager::currentNM()->mkNode(STRING_CONCAT
, n1rb
, n1
[i
]));
280 else if (n1
.size() >= n2
.size())
282 unsigned diff
= n1
.size() - n2
.size();
283 for (unsigned i
= 0; i
<= diff
; i
++)
287 // first component of n2 must be a suffix
288 if (componentContainsBase(n1
[i
],
293 computeRemainder
&& remainderDir
!= 1))
295 Assert(n1re_first
.isNull());
296 for (unsigned j
= 1; j
< n2
.size(); j
++)
298 // are we in the last component?
299 if (j
+ 1 == n2
.size())
303 // last component of n2 must be a prefix
304 if (componentContainsBase(n1
[i
+ j
],
309 computeRemainder
&& remainderDir
!= -1))
311 Assert(n1rb_last
.isNull());
312 if (computeRemainder
)
314 if (remainderDir
!= -1)
316 if (!n1re_last
.isNull())
318 ne
.push_back(n1re_last
);
320 ne
.insert(ne
.end(), n1
.begin() + i
+ j
+ 1, n1
.end());
321 n1
.erase(n1
.begin() + i
+ j
+ 1, n1
.end());
324 if (remainderDir
!= 1)
327 nb
.insert(nb
.end(), n1
.begin(), n1
.begin() + i
);
328 n1
.erase(n1
.begin(), n1
.begin() + i
);
329 if (!n1rb_first
.isNull())
331 nb
.push_back(n1rb_first
);
342 else if (n1
[i
+ j
] != n2
[j
])
353 bool StringsEntail::componentContainsBase(
354 Node n1
, Node n2
, Node
& n1rb
, Node
& n1re
, int dir
, bool computeRemainder
)
356 Assert(n1rb
.isNull());
357 Assert(n1re
.isNull());
359 NodeManager
* nm
= NodeManager::currentNM();
367 if (n1
.isConst() && n2
.isConst())
369 size_t len1
= Word::getLength(n1
);
370 size_t len2
= Word::getLength(n2
);
375 if (Word::suffix(n1
, len2
) == n2
)
377 if (computeRemainder
)
379 n1rb
= Word::prefix(n1
, len1
- len2
);
386 if (Word::prefix(n1
, len2
) == n2
)
388 if (computeRemainder
)
390 n1re
= Word::suffix(n1
, len1
- len2
);
397 size_t f
= Word::find(n1
, n2
);
398 if (f
!= std::string::npos
)
400 if (computeRemainder
)
404 n1rb
= Word::prefix(n1
, f
);
408 n1re
= Word::suffix(n1
, len1
- (f
+ len2
));
419 // n1 = x containing n2 = substr( x, n2[1], n2[2] )
420 if (n2
.getKind() == STRING_SUBSTR
)
425 Node start_pos
= n2
[1];
426 Node end_pos
= nm
->mkNode(PLUS
, n2
[1], n2
[2]);
427 Node len_n2s
= nm
->mkNode(STRING_LENGTH
, n2
[0]);
430 // To be a suffix, start + length must be greater than
431 // or equal to the length of the string.
432 success
= ArithEntail::check(end_pos
, len_n2s
);
436 // To be a prefix, must literally start at 0, since
437 // if we knew it started at <0, it should be rewritten to "",
438 // if we knew it started at 0, then n2[1] should be rewritten to
440 success
= start_pos
.isConst()
441 && start_pos
.getConst
<Rational
>().sgn() == 0;
445 if (computeRemainder
)
447 // we can only compute the remainder if start_pos and end_pos
448 // are known to be non-negative.
449 if (!ArithEntail::check(start_pos
)
450 || !ArithEntail::check(end_pos
))
457 STRING_SUBSTR
, n2
[0], nm
->mkConst(Rational(0)), start_pos
);
461 n1re
= nm
->mkNode(STRING_SUBSTR
, n2
[0], end_pos
, len_n2s
);
469 if (!computeRemainder
&& dir
== 0)
471 if (n1
.getKind() == STRING_STRREPL
)
473 // (str.contains (str.replace x y z) w) ---> true
474 // if (str.contains x w) --> true and (str.contains z w) ---> true
475 Node xCtnW
= checkContains(n1
[0], n2
);
476 if (!xCtnW
.isNull() && xCtnW
.getConst
<bool>())
478 Node zCtnW
= checkContains(n1
[2], n2
);
479 if (!zCtnW
.isNull() && zCtnW
.getConst
<bool>())
491 bool StringsEntail::stripConstantEndpoints(std::vector
<Node
>& n1
,
492 std::vector
<Node
>& n2
,
493 std::vector
<Node
>& nb
,
494 std::vector
<Node
>& ne
,
500 NodeManager
* nm
= NodeManager::currentNM();
501 bool changed
= false;
502 // for ( forwards, backwards ) direction
503 for (unsigned r
= 0; r
< 2; r
++)
505 if (dir
== 0 || (r
== 0 && dir
== 1) || (r
== 1 && dir
== -1))
507 unsigned index0
= r
== 0 ? 0 : n1
.size() - 1;
508 unsigned index1
= r
== 0 ? 0 : n2
.size() - 1;
509 bool removeComponent
= false;
510 Node n1cmp
= n1
[index0
];
512 if (n1cmp
.isConst() && n1cmp
.getConst
<String
>().size() == 0)
517 std::vector
<Node
> sss
;
518 std::vector
<Node
> sls
;
519 n1cmp
= utils::decomposeSubstrChain(n1cmp
, sss
, sls
);
520 Trace("strings-rewrite-debug2")
521 << "stripConstantEndpoints : Compare " << n1cmp
<< " " << n2
[index1
]
522 << ", dir = " << dir
<< std::endl
;
525 CVC4::String s
= n1cmp
.getConst
<String
>();
526 // overlap is an overapproximation of the number of characters
527 // n2[index1] can match in s
528 unsigned overlap
= s
.size();
529 if (n2
[index1
].isConst())
531 CVC4::String t
= n2
[index1
].getConst
<String
>();
532 std::size_t ret
= r
== 0 ? s
.find(t
) : s
.rfind(t
);
533 if (ret
== std::string::npos
)
537 // can remove everything
538 // e.g. str.contains( "abc", str.++( "ba", x ) ) -->
539 // str.contains( "", str.++( "ba", x ) )
540 removeComponent
= true;
542 else if (sss
.empty()) // only if not substr
544 // check how much overlap there is
545 // This is used to partially strip off the endpoint
546 // e.g. str.contains( str.++( "abc", x ), str.++( "cd", y ) ) -->
547 // str.contains( str.++( "c", x ), str.++( "cd", y ) )
548 overlap
= r
== 0 ? s
.overlap(t
) : t
.overlap(s
);
552 // if we are looking at a substring, we can remove the component
553 // if there is no overlap
554 // e.g. str.contains( str.++( str.substr( "c", i, j ), x), "a" )
555 // --> str.contains( x, "a" )
556 removeComponent
= ((r
== 0 ? s
.overlap(t
) : t
.overlap(s
)) == 0);
559 else if (sss
.empty()) // only if not substr
561 Assert(ret
< s
.size());
562 // can strip off up to the find position, e.g.
563 // str.contains( str.++( "abc", x ), str.++( "b", y ) ) -->
564 // str.contains( str.++( "bc", x ), str.++( "b", y ) ),
566 // str.contains( str.++( x, "abbd" ), str.++( y, "b" ) ) -->
567 // str.contains( str.++( x, "abb" ), str.++( y, "b" ) )
568 overlap
= s
.size() - ret
;
575 // process the overlap
576 if (overlap
< s
.size())
581 removeComponent
= true;
585 // can drop the prefix (resp. suffix) from the first (resp. last)
589 nb
.push_back(nm
->mkConst(s
.prefix(s
.size() - overlap
)));
590 n1
[index0
] = nm
->mkConst(s
.suffix(overlap
));
594 ne
.push_back(nm
->mkConst(s
.suffix(s
.size() - overlap
)));
595 n1
[index0
] = nm
->mkConst(s
.prefix(overlap
));
600 else if (n1cmp
.getKind() == STRING_ITOS
)
602 if (n2
[index1
].isConst())
604 CVC4::String t
= n2
[index1
].getConst
<String
>();
608 // if n1.size()==1, then if n2[index1] is not a number, we can drop
609 // the entire component
610 // e.g. str.contains( int.to.str(x), "123a45") --> false
613 removeComponent
= true;
618 const std::vector
<unsigned>& tvec
= t
.getVec();
619 Assert(tvec
.size() > 0);
621 // if n1.size()>1, then if the first (resp. last) character of
623 // is not a digit, we can drop the entire component, e.g.:
624 // str.contains( str.++( int.to.str(x), y ), "a12") -->
625 // str.contains( y, "a12" )
626 // str.contains( str.++( y, int.to.str(x) ), "a0b") -->
627 // str.contains( y, "a0b" )
628 unsigned i
= r
== 0 ? 0 : (tvec
.size() - 1);
629 if (!String::isDigit(tvec
[i
]))
631 removeComponent
= true;
638 // can drop entire first (resp. last) component
641 nb
.push_back(n1
[index0
]);
642 n1
.erase(n1
.begin(), n1
.begin() + 1);
646 ne
.push_back(n1
[index0
]);
651 // if we've removed everything, just return (we will rewrite to false)
661 // TODO (#1180) : computing the maximal overlap in this function may be
663 // str.contains( str.++( str.to.int(x), str.substr(y,0,3) ), "2aaaa" ) --->
665 // ...since str.to.int(x) can contain at most 1 character from "2aaaa",
666 // leaving 4 characters
667 // which is larger that the upper bound for length of str.substr(y,0,3),
672 Node
StringsEntail::checkContains(Node a
, Node b
, bool fullRewriter
)
674 NodeManager
* nm
= NodeManager::currentNM();
675 Node ctn
= nm
->mkNode(STRING_STRCTN
, a
, b
);
679 ctn
= Rewriter::rewrite(ctn
);
687 ctn
= d_rewriter
.rewriteContains(ctn
);
688 } while (prev
!= ctn
&& ctn
.getKind() == STRING_STRCTN
);
691 Assert(ctn
.getType().isBoolean());
692 return ctn
.isConst() ? ctn
: Node::null();
695 bool StringsEntail::checkNonEmpty(Node a
)
697 Node len
= NodeManager::currentNM()->mkNode(STRING_LENGTH
, a
);
698 len
= Rewriter::rewrite(len
);
699 return ArithEntail::check(len
, true);
702 bool StringsEntail::checkLengthOne(Node s
, bool strict
)
704 NodeManager
* nm
= NodeManager::currentNM();
705 Node one
= nm
->mkConst(Rational(1));
706 Node len
= nm
->mkNode(STRING_LENGTH
, s
);
707 len
= Rewriter::rewrite(len
);
708 return ArithEntail::check(one
, len
)
709 && (!strict
|| ArithEntail::check(len
, true));
712 bool StringsEntail::checkMultisetSubset(Node a
, Node b
)
714 std::vector
<Node
> avec
;
715 utils::getConcat(getMultisetApproximation(a
), avec
);
716 std::vector
<Node
> bvec
;
717 utils::getConcat(b
, bvec
);
719 std::map
<Node
, unsigned> num_nconst
[2];
720 std::map
<Node
, unsigned> num_const
[2];
721 for (unsigned j
= 0; j
< 2; j
++)
723 std::vector
<Node
>& jvec
= j
== 0 ? avec
: bvec
;
724 for (const Node
& cc
: jvec
)
736 bool ms_success
= true;
737 for (std::pair
<const Node
, unsigned>& nncp
: num_nconst
[0])
739 if (nncp
.second
> num_nconst
[1][nncp
.first
])
747 // count the number of constant characters in the first argument
748 std::map
<Node
, unsigned> count_const
[2];
749 std::vector
<Node
> chars
;
750 for (unsigned j
= 0; j
< 2; j
++)
752 for (std::pair
<const Node
, unsigned>& ncp
: num_const
[j
])
755 Assert(cn
.isConst());
756 std::vector
<Node
> cnChars
= Word::getChars(cn
);
757 for (const Node
& ch
: cnChars
)
759 count_const
[j
][ch
] += ncp
.second
;
760 if (std::find(chars
.begin(), chars
.end(), ch
) == chars
.end())
767 Trace("strings-entail-ms-ss")
768 << "For " << a
<< " and " << b
<< " : " << std::endl
;
769 for (const Node
& ch
: chars
)
771 Trace("strings-entail-ms-ss") << " # occurrences of substring ";
772 Trace("strings-entail-ms-ss") << ch
<< " in arguments is ";
773 Trace("strings-entail-ms-ss")
774 << count_const
[0][ch
] << " / " << count_const
[1][ch
] << std::endl
;
775 if (count_const
[0][ch
] < count_const
[1][ch
])
781 // TODO (#1180): count the number of 2,3,4,.. character substrings
783 // str.contains( str.++( x, "cbabc" ), str.++( "cabbc", x ) ) ---> false
784 // since the second argument contains more occurrences of "bb".
785 // note this is orthogonal reasoning to inductive reasoning
786 // via regular membership reduction in Liang et al CAV 2015.
791 Node
StringsEntail::checkHomogeneousString(Node a
)
793 std::vector
<Node
> avec
;
794 utils::getConcat(getMultisetApproximation(a
), avec
);
798 for (const Node
& ac
: avec
)
802 std::vector
<Node
> acv
= Word::getChars(ac
);
803 for (const Node
& cc
: acv
)
812 // Found a different character
819 // Could produce a different character
826 return Word::mkEmptyWord(a
.getType());
832 Node
StringsEntail::getMultisetApproximation(Node a
)
834 NodeManager
* nm
= NodeManager::currentNM();
835 if (a
.getKind() == STRING_SUBSTR
)
839 else if (a
.getKind() == STRING_STRREPL
)
841 return getMultisetApproximation(nm
->mkNode(STRING_CONCAT
, a
[0], a
[2]));
843 else if (a
.getKind() == STRING_CONCAT
)
845 NodeBuilder
<> nb(STRING_CONCAT
);
846 for (const Node
& ac
: a
)
848 nb
<< getMultisetApproximation(ac
);
850 return nb
.constructNode();
858 Node
StringsEntail::getStringOrEmpty(Node n
)
867 if (Word::isEmpty(n
[0]))
869 // (str.replace "" x y) --> y
874 if (checkLengthOne(n
[0]) && Word::isEmpty(n
[2]))
876 // (str.replace "A" x "") --> "A"
886 if (checkLengthOne(n
[0]))
888 // (str.substr "A" x y) --> "A"
905 Node
StringsEntail::inferEqsFromContains(Node x
, Node y
)
907 NodeManager
* nm
= NodeManager::currentNM();
908 Node emp
= Word::mkEmptyWord(x
.getType());
909 Assert(x
.getType() == y
.getType());
910 TypeNode stype
= x
.getType();
912 Node xLen
= nm
->mkNode(STRING_LENGTH
, x
);
913 std::vector
<Node
> yLens
;
914 if (y
.getKind() != STRING_CONCAT
)
916 yLens
.push_back(nm
->mkNode(STRING_LENGTH
, y
));
920 for (const Node
& yi
: y
)
922 yLens
.push_back(nm
->mkNode(STRING_LENGTH
, yi
));
926 std::vector
<Node
> zeroLens
;
929 // If x is the empty string, then all ys must be empty, too, and we can
930 // skip the expensive checks. Note that this is just a performance
932 zeroLens
.swap(yLens
);
936 // Check if we can infer that str.len(x) <= str.len(y). If that is the
937 // case, try to minimize the sum in str.len(x) <= str.len(y1) + ... +
938 // str.len(yn) (where y = y1 ++ ... ++ yn) while keeping the inequality
939 // true. The terms that can have length zero without making the inequality
940 // false must be all be empty if (str.contains x y) is true.
941 if (!ArithEntail::inferZerosInSumGeq(xLen
, yLens
, zeroLens
))
943 // We could not prove that the inequality holds
946 else if (yLens
.size() == y
.getNumChildren())
948 // We could only prove that the inequality holds but not that any of the
950 return nm
->mkNode(EQUAL
, x
, y
);
954 if (y
.getKind() != STRING_CONCAT
)
956 if (zeroLens
.size() == 1)
958 // y is not a concatenation and we found that it must be empty, so just
960 Assert(zeroLens
[0][0] == y
);
961 return nm
->mkNode(EQUAL
, y
, emp
);
965 Assert(yLens
.size() == 1 && yLens
[0][0] == y
);
966 return nm
->mkNode(EQUAL
, x
, y
);
970 std::vector
<Node
> cs
;
971 for (const Node
& yiLen
: yLens
)
973 Assert(std::find(y
.begin(), y
.end(), yiLen
[0]) != y
.end());
974 cs
.push_back(yiLen
[0]);
977 NodeBuilder
<> nb(AND
);
978 // (= x (str.++ y1' ... ym'))
981 nb
<< nm
->mkNode(EQUAL
, x
, utils::mkConcat(cs
, stype
));
983 // (= y1'' "") ... (= yk'' "")
984 for (const Node
& zeroLen
: zeroLens
)
986 Assert(std::find(y
.begin(), y
.end(), zeroLen
[0]) != y
.end());
987 nb
<< nm
->mkNode(EQUAL
, zeroLen
[0], emp
);
990 // (and (= x (str.++ y1' ... ym')) (= y1'' "") ... (= yk'' ""))
991 return nb
.constructNode();
994 } // namespace strings
995 } // namespace theory