From b320aa323923822a7702997bbca05e8512da55a4 Mon Sep 17 00:00:00 2001 From: Andrew Reynolds Date: Wed, 26 Feb 2020 13:54:41 -0600 Subject: [PATCH] Basic support for regular expression complement (#3437) Fixes #3408 . Adds basic rewriter, parsing, type checking and implements the required cases in regexp_operation.cpp. It also adds some missing documentation in regexp_operation.h --- src/api/cvc4cpp.cpp | 2 + src/api/cvc4cppkind.h | 8 ++ src/parser/cvc/Cvc.g | 3 + src/parser/smt2/smt2.cpp | 1 + src/printer/smt2/smt2_printer.cpp | 2 + src/theory/strings/kinds | 2 + src/theory/strings/regexp_operation.cpp | 74 +++++++++++++++---- src/theory/strings/regexp_operation.h | 21 +++++- .../strings/theory_strings_rewriter.cpp | 13 +++- test/regress/CMakeLists.txt | 2 + .../regress0/strings/complement-simple.smt2 | 5 ++ .../regress1/strings/complement-test.smt2 | 12 +++ 12 files changed, 128 insertions(+), 17 deletions(-) create mode 100644 test/regress/regress0/strings/complement-simple.smt2 create mode 100644 test/regress/regress1/strings/complement-test.smt2 diff --git a/src/api/cvc4cpp.cpp b/src/api/cvc4cpp.cpp index 622d48ac9..4e5f4604e 100644 --- a/src/api/cvc4cpp.cpp +++ b/src/api/cvc4cpp.cpp @@ -277,6 +277,7 @@ const static std::unordered_map s_kinds{ {REGEXP_LOOP, CVC4::Kind::REGEXP_LOOP}, {REGEXP_EMPTY, CVC4::Kind::REGEXP_EMPTY}, {REGEXP_SIGMA, CVC4::Kind::REGEXP_SIGMA}, + {REGEXP_COMPLEMENT, CVC4::Kind::REGEXP_COMPLEMENT}, /* Quantifiers --------------------------------------------------------- */ {FORALL, CVC4::Kind::FORALL}, {EXISTS, CVC4::Kind::EXISTS}, @@ -540,6 +541,7 @@ const static std::unordered_map {CVC4::Kind::REGEXP_LOOP, REGEXP_LOOP}, {CVC4::Kind::REGEXP_EMPTY, REGEXP_EMPTY}, {CVC4::Kind::REGEXP_SIGMA, REGEXP_SIGMA}, + {CVC4::Kind::REGEXP_COMPLEMENT, REGEXP_COMPLEMENT}, /* Quantifiers ----------------------------------------------------- */ {CVC4::Kind::FORALL, FORALL}, {CVC4::Kind::EXISTS, EXISTS}, diff --git a/src/api/cvc4cppkind.h b/src/api/cvc4cppkind.h index 591ff9b1e..ca5696537 100644 --- a/src/api/cvc4cppkind.h +++ b/src/api/cvc4cppkind.h @@ -2172,6 +2172,14 @@ enum CVC4_PUBLIC Kind : int32_t * mkTerm(Kind kind) */ REGEXP_SIGMA, + /** + * Regexp complement. + * Parameters: 1 + * -[1]: Term of sort RegExp + * Create with: + * mkTerm(Kind kind, Term child1) + */ + REGEXP_COMPLEMENT, #if 0 /* regexp rv (internal use only) */ REGEXP_RV, diff --git a/src/parser/cvc/Cvc.g b/src/parser/cvc/Cvc.g index 3fde52bbe..33ca7bcf2 100644 --- a/src/parser/cvc/Cvc.g +++ b/src/parser/cvc/Cvc.g @@ -240,6 +240,7 @@ tokens { REGEXP_LOOP_TOK = 'RE_LOOP'; REGEXP_EMPTY_TOK = 'RE_EMPTY'; REGEXP_SIGMA_TOK = 'RE_SIGMA'; + REGEXP_COMPLEMENT_TOK = 'RE_COMPLEMENT'; SETS_CARD_TOK = 'CARD'; @@ -2045,6 +2046,8 @@ stringTerm[CVC4::Expr& f] { f = MK_EXPR(CVC4::kind::REGEXP_RANGE, f, f2); } | REGEXP_LOOP_TOK LPAREN formula[f] COMMA formula[f2] COMMA formula[f3] RPAREN { f = MK_EXPR(CVC4::kind::REGEXP_LOOP, f, f2, f3); } + | REGEXP_COMPLEMENT_TOK LPAREN formula[f] RPAREN + { f = MK_EXPR(CVC4::kind::REGEXP_COMPLEMENT, f); } | REGEXP_EMPTY_TOK { f = MK_EXPR(CVC4::kind::REGEXP_EMPTY, std::vector()); } | REGEXP_SIGMA_TOK diff --git a/src/parser/smt2/smt2.cpp b/src/parser/smt2/smt2.cpp index 42111f581..8faeab491 100644 --- a/src/parser/smt2/smt2.cpp +++ b/src/parser/smt2/smt2.cpp @@ -200,6 +200,7 @@ void Smt2::addStringOperators() { addOperator(kind::REGEXP_OPT, "re.opt"); addOperator(kind::REGEXP_RANGE, "re.range"); addOperator(kind::REGEXP_LOOP, "re.loop"); + addOperator(kind::REGEXP_COMPLEMENT, "re.comp"); addOperator(kind::STRING_LT, "str.<"); addOperator(kind::STRING_LEQ, "str.<="); } diff --git a/src/printer/smt2/smt2_printer.cpp b/src/printer/smt2/smt2_printer.cpp index 07422cd8b..e9a4d0a83 100644 --- a/src/printer/smt2/smt2_printer.cpp +++ b/src/printer/smt2/smt2_printer.cpp @@ -665,6 +665,7 @@ void Smt2Printer::toStream(std::ostream& out, case kind::REGEXP_OPT: case kind::REGEXP_RANGE: case kind::REGEXP_LOOP: + case kind::REGEXP_COMPLEMENT: case kind::REGEXP_EMPTY: case kind::REGEXP_SIGMA: out << smtKindString(k, d_variant) << " "; break; @@ -1236,6 +1237,7 @@ static string smtKindString(Kind k, Variant v) case kind::REGEXP_OPT: return "re.opt"; case kind::REGEXP_RANGE: return "re.range"; case kind::REGEXP_LOOP: return "re.loop"; + case kind::REGEXP_COMPLEMENT: return "re.comp"; //sep theory case kind::SEP_STAR: return "sep"; diff --git a/src/theory/strings/kinds b/src/theory/strings/kinds index aa1e2627a..052b75302 100644 --- a/src/theory/strings/kinds +++ b/src/theory/strings/kinds @@ -68,6 +68,7 @@ operator REGEXP_PLUS 1 "regexp +" operator REGEXP_OPT 1 "regexp ?" operator REGEXP_RANGE 2 "regexp range" operator REGEXP_LOOP 2:3 "regexp loop" +operator REGEXP_COMPLEMENT 1 "regexp complement" operator REGEXP_EMPTY 0 "regexp empty" operator REGEXP_SIGMA 0 "regexp all characters" @@ -85,6 +86,7 @@ typerule REGEXP_PLUS "SimpleTypeRule" typerule REGEXP_OPT "SimpleTypeRule" typerule REGEXP_RANGE ::CVC4::theory::strings::RegExpRangeTypeRule typerule REGEXP_LOOP "SimpleTypeRule>" +typerule REGEXP_COMPLEMENT "SimpleTypeRule" typerule STRING_TO_REGEXP "SimpleTypeRule" diff --git a/src/theory/strings/regexp_operation.cpp b/src/theory/strings/regexp_operation.cpp index 1b2de0eb5..048dc88b6 100644 --- a/src/theory/strings/regexp_operation.cpp +++ b/src/theory/strings/regexp_operation.cpp @@ -68,9 +68,9 @@ RegExpConstType RegExpOpr::getRegExpConstType(Node r) visit.pop_back(); it = d_constCache.find(cur); + Kind ck = cur.getKind(); if (it == d_constCache.end()) { - Kind ck = cur.getKind(); if (ck == STRING_TO_REGEXP) { Node tmp = Rewriter::rewrite(cur[0]); @@ -104,7 +104,7 @@ RegExpConstType RegExpOpr::getRegExpConstType(Node r) } else if (it->second == RE_C_UNKNOWN) { - RegExpConstType ret = RE_C_CONRETE_CONSTANT; + RegExpConstType ret = ck == REGEXP_COMPLEMENT ? RE_C_CONSTANT : RE_C_CONRETE_CONSTANT; for (const Node& cn : cur) { it = d_constCache.find(cn); @@ -126,7 +126,8 @@ bool RegExpOpr::isRegExpKind(Kind k) return k == REGEXP_EMPTY || k == REGEXP_SIGMA || k == STRING_TO_REGEXP || k == REGEXP_CONCAT || k == REGEXP_UNION || k == REGEXP_INTER || k == REGEXP_STAR || k == REGEXP_PLUS || k == REGEXP_OPT - || k == REGEXP_RANGE || k == REGEXP_LOOP || k == REGEXP_RV; + || k == REGEXP_RANGE || k == REGEXP_LOOP || k == REGEXP_RV + || k == REGEXP_COMPLEMENT; } // 0-unknown, 1-yes, 2-no @@ -264,6 +265,14 @@ int RegExpOpr::delta( Node r, Node &exp ) { } break; } + case kind::REGEXP_COMPLEMENT: + { + int tmp = delta(r[0], exp); + // flip the result if known + tmp = tmp == 0 ? 0 : (3 - tmp); + exp = exp.isNull() ? exp : exp.negate(); + break; + } default: { Assert(!isRegExpKind(k)); break; @@ -504,6 +513,12 @@ int RegExpOpr::derivativeS( Node r, CVC4::String c, Node &retNode ) { } break; } + case kind::REGEXP_COMPLEMENT: + { + // don't know result + return 0; + break; + } default: { Assert(!isRegExpKind(r.getKind())); return 0; @@ -679,6 +694,7 @@ Node RegExpOpr::derivativeSingle( Node r, CVC4::String c ) { //Trace("regexp-derive") << "RegExp-derive : REGEXP_LOOP returns /" << mkString(retNode) << "/" << std::endl; break; } + case kind::REGEXP_COMPLEMENT: default: { Trace("strings-error") << "Unsupported term: " << mkString( r ) << " in derivative of RegExp." << std::endl; Unreachable(); @@ -786,12 +802,13 @@ void RegExpOpr::firstChars(Node r, std::set &pcset, SetNodes &pvset) break; } case kind::REGEXP_SIGMA: + case kind::REGEXP_COMPLEMENT: default: { // we do not expect to call this function on regular expressions that // aren't a standard regular expression kind. However, if we do, then // the following code is conservative and says that the current // regular expression can begin with any character. - Assert(k == REGEXP_SIGMA); + Assert(isRegExpKind(k)); // can start with any character Assert(d_lastchar < std::numeric_limits::max()); for (unsigned i = 0; i <= d_lastchar; i++) @@ -1046,6 +1063,12 @@ void RegExpOpr::simplifyNRegExp( Node s, Node r, std::vector< Node > &new_nodes } break; } + case kind::REGEXP_COMPLEMENT: + { + // ~( s in complement(R) ) ---> s in R + conc = nm->mkNode(STRING_IN_REGEXP, s, r[0]); + break; + } default: { Assert(!isRegExpKind(k)); break; @@ -1240,6 +1263,12 @@ void RegExpOpr::simplifyPRegExp( Node s, Node r, std::vector< Node > &new_nodes } break; } + case kind::REGEXP_COMPLEMENT: + { + // s in complement(R) ---> ~( s in R ) + conc = nm->mkNode(STRING_IN_REGEXP, s, r[0]).negate(); + break; + } default: { Assert(!isRegExpKind(k)); break; @@ -1305,10 +1334,14 @@ void RegExpOpr::convert2(unsigned cnt, Node n, Node &r1, Node &r2) { if(n == d_emptyRegexp) { r1 = d_emptyRegexp; r2 = d_emptyRegexp; + return; } else if(n == d_emptySingleton) { r1 = d_emptySingleton; r2 = d_emptySingleton; - } else if(n.getKind() == kind::REGEXP_RV) { + } + Kind nk = n.getKind(); + if (nk == REGEXP_RV) + { Assert(n[0].getConst() <= Rational(String::maxSize())) << "Exceeded UINT32_MAX in RegExpOpr::convert2"; unsigned y = n[0].getConst().getNumerator().toUnsignedInt(); @@ -1318,7 +1351,9 @@ void RegExpOpr::convert2(unsigned cnt, Node n, Node &r1, Node &r2) { } else { r2 = n; } - } else if(n.getKind() == kind::REGEXP_CONCAT) { + } + else if (nk == REGEXP_CONCAT) + { bool flag = true; std::vector vr1, vr2; for( unsigned i=0; i vr1, vr2; for( unsigned i=0; imkNode(kind::REGEXP_UNION, vr1); r2 = NodeManager::currentNM()->mkNode(kind::REGEXP_UNION, vr2); - } else if(n.getKind() == kind::STRING_TO_REGEXP || n.getKind() == kind::REGEXP_SIGMA || n.getKind() == kind::REGEXP_RANGE) { - r1 = d_emptySingleton; - r2 = n; - } else if(n.getKind() == kind::REGEXP_LOOP) { - //TODO:LOOP + } + else if (nk == STRING_TO_REGEXP || nk == REGEXP_SIGMA || nk == REGEXP_RANGE + || nk == REGEXP_COMPLEMENT || nk == REGEXP_LOOP) + { + // this leaves n unchanged r1 = d_emptySingleton; r2 = n; - //Unreachable(); - } else { + } + else + { //is it possible? Unreachable(); } @@ -1541,6 +1579,7 @@ Node RegExpOpr::removeIntersection(Node r) { case REGEXP_CONCAT: case REGEXP_UNION: case REGEXP_STAR: + case REGEXP_COMPLEMENT: { NodeBuilder<> nb(rk); for (const Node& rc : r) @@ -1696,6 +1735,13 @@ std::string RegExpOpr::mkString( Node r ) { retStr += ">"; break; } + case REGEXP_COMPLEMENT: + { + retStr += "^("; + retStr += mkString(r[0]); + retStr += ")"; + break; + } default: { std::stringstream ss; diff --git a/src/theory/strings/regexp_operation.h b/src/theory/strings/regexp_operation.h index 91d5df744..b9dbedba5 100644 --- a/src/theory/strings/regexp_operation.h +++ b/src/theory/strings/regexp_operation.h @@ -41,10 +41,13 @@ namespace strings { */ enum RegExpConstType { - // the regular expression doesn't contain variables or re.allchar or re.range + // the regular expression doesn't contain variables or re.comp, + // re.allchar or re.range (call these three operators "non-concrete + // operators"). Notice that re.comp is a non-concrete operator + // since it can be seen as indirectly defined in terms of re.allchar. RE_C_CONRETE_CONSTANT, // the regular expression doesn't contain variables, but may contain - // re.allchar or re.range + // re.comp, re.allchar or re.range RE_C_CONSTANT, // the regular expression may contain variables RE_C_VARIABLE, @@ -122,6 +125,20 @@ class RegExpOpr { /** is k a native operator whose return type is a regular expression? */ static bool isRegExpKind(Kind k); void simplify(Node t, std::vector< Node > &new_nodes, bool polarity); + /** + * This method returns 1 if the empty string is in r, 2 if the empty string + * is not in r, or 0 if it is unknown whether the empty string is in r. + * TODO (project #2): refactor the return value of this function. + * + * If this method returns 0, then exp is updated to an explanation that + * would imply that the empty string is in r. + * + * For example, + * - delta( (re.inter (str.to.re x) (re.* "A")) ) returns 0 and sets exp to + * x = "", + * - delta( (re.++ (str.to.re "A") R) ) returns 2, + * - delta( (re.union (re.* "A") R) ) returns 1. + */ int delta( Node r, Node &exp ); int derivativeS( Node r, CVC4::String c, Node &retNode ); Node derivativeSingle( Node r, CVC4::String c ); diff --git a/src/theory/strings/theory_strings_rewriter.cpp b/src/theory/strings/theory_strings_rewriter.cpp index 339d11dd2..9cd4c1ecc 100644 --- a/src/theory/strings/theory_strings_rewriter.cpp +++ b/src/theory/strings/theory_strings_rewriter.cpp @@ -1329,6 +1329,11 @@ bool TheoryStringsRewriter::testConstStringInRegExp( CVC4::String &s, unsigned i } } } + case REGEXP_COMPLEMENT: + { + return !testConstStringInRegExp(s, index_start, r[0]); + break; + } default: { Assert(!RegExpOpr::isRegExpKind(k)); return false; @@ -1469,7 +1474,13 @@ Node TheoryStringsRewriter::rewriteMembership(TNode node) { retNode = nm->mkNode(AND, nm->mkNode(LEQ, nm->mkNode(STRING_CODE, r[0]), xcode), nm->mkNode(LEQ, xcode, nm->mkNode(STRING_CODE, r[1]))); - }else if(x != node[0] || r != node[1]) { + } + else if (r.getKind() == REGEXP_COMPLEMENT) + { + retNode = nm->mkNode(STRING_IN_REGEXP, x, r[0]).negate(); + } + else if (x != node[0] || r != node[1]) + { retNode = NodeManager::currentNM()->mkNode( kind::STRING_IN_REGEXP, x, r ); } diff --git a/test/regress/CMakeLists.txt b/test/regress/CMakeLists.txt index 3fe4321d0..1113717ce 100644 --- a/test/regress/CMakeLists.txt +++ b/test/regress/CMakeLists.txt @@ -898,6 +898,7 @@ set(regress_0_tests regress0/strings/code-eval.smt2 regress0/strings/code-perf.smt2 regress0/strings/code-sat-neg-one.smt2 + regress0/strings/complement-simple.smt2 regress0/strings/escchar.smt2 regress0/strings/escchar_25.smt2 regress0/strings/hconst-092618.smt2 @@ -1661,6 +1662,7 @@ set(regress_1_tests regress1/strings/cmu-inc-nlpp-071516.smt2 regress1/strings/cmu-substr-rw.smt2 regress1/strings/code-sequence.smt2 + regress1/strings/complement-test.smt2 regress1/strings/crash-1019.smt2 regress1/strings/csp-prefix-exp-bug.smt2 regress1/strings/double-replace.smt2 diff --git a/test/regress/regress0/strings/complement-simple.smt2 b/test/regress/regress0/strings/complement-simple.smt2 new file mode 100644 index 000000000..c19699448 --- /dev/null +++ b/test/regress/regress0/strings/complement-simple.smt2 @@ -0,0 +1,5 @@ +(set-logic SLIA) +(set-info :status sat) +(declare-fun x () String) +(assert (str.in.re x (re.comp (str.to.re "ABC")))) +(check-sat) diff --git a/test/regress/regress1/strings/complement-test.smt2 b/test/regress/regress1/strings/complement-test.smt2 new file mode 100644 index 000000000..1fcbdc2c3 --- /dev/null +++ b/test/regress/regress1/strings/complement-test.smt2 @@ -0,0 +1,12 @@ +(set-logic SLIA) +(set-info :status sat) +(declare-fun x () String) +(declare-fun y () String) +(declare-fun z () String) +(assert (= x (str.++ "AB" y))) +(assert (or (= y "C") (= y (str.++ "D" z)) (= (str.len y) 10))) +(assert (str.in.re x + (re.inter + (re.comp (str.to.re "AB")) + (re.comp (re.++ (str.to.re "AB") (re.range "A" "E") (re.* re.allchar)))))) +(check-sat) -- 2.30.2