From 96338f1975a1543a0547df82b9d0f2c5d206fb1b Mon Sep 17 00:00:00 2001 From: Emmanuel Briot Date: Wed, 6 Jun 2007 12:30:52 +0200 Subject: [PATCH] g-regpat.adb (Quote): Fix improper quoting of '.' 2007-04-20 Emmanuel Briot * g-regpat.adb (Quote): Fix improper quoting of '.' From-SVN: r125423 --- gcc/ada/g-regpat.adb | 3609 +----------------------------------------- 1 file changed, 5 insertions(+), 3604 deletions(-) diff --git a/gcc/ada/g-regpat.adb b/gcc/ada/g-regpat.adb index 053ee930d84..e7a488a1435 100644 --- a/gcc/ada/g-regpat.adb +++ b/gcc/ada/g-regpat.adb @@ -7,7 +7,7 @@ -- B o d y -- -- -- -- Copyright (C) 1986 by University of Toronto. -- --- Copyright (C) 1999-2006, AdaCore -- +-- Copyright (C) 1999-2007, AdaCore -- -- -- -- GNAT is free software; you can redistribute it and/or modify it under -- -- terms of the GNU General Public License as published by the Free Soft- -- @@ -32,3607 +32,8 @@ -- -- ------------------------------------------------------------------------------ --- This is an altered Ada 95 version of the original V8 style regular --- expression library written in C by Henry Spencer. Apart from the --- translation to Ada, the interface has been considerably changed to --- use the Ada String type instead of C-style nul-terminated strings. +-- This package does not require a body, since it is a package renaming. We +-- provide a dummy file containing a No_Body pragma so that previous versions +-- of the body (which did exist) will not intefere. --- Beware that some of this code is subtly aware of the way operator --- precedence is structured in regular expressions. Serious changes in --- regular-expression syntax might require a total rethink. - -with System.IO; use System.IO; -with Ada.Characters.Handling; use Ada.Characters.Handling; -with Unchecked_Conversion; - -package body GNAT.Regpat is - - MAGIC : constant Character := Character'Val (10#0234#); - -- The first byte of the regexp internal "program" is actually - -- this magic number; the start node begins in the second byte. - -- - -- This is used to make sure that a regular expression was correctly - -- compiled. - - ---------------------------- - -- Implementation details -- - ---------------------------- - - -- This is essentially a linear encoding of a nondeterministic - -- finite-state machine, also known as syntax charts or - -- "railroad normal form" in parsing technology. - - -- Each node is an opcode plus a "next" pointer, possibly plus an - -- operand. "Next" pointers of all nodes except BRANCH implement - -- concatenation; a "next" pointer with a BRANCH on both ends of it - -- is connecting two alternatives. - - -- The operand of some types of node is a literal string; for others, - -- it is a node leading into a sub-FSM. In particular, the operand of - -- a BRANCH node is the first node of the branch. - -- (NB this is *not* a tree structure: the tail of the branch connects - -- to the thing following the set of BRANCHes). - - -- You can see the exact byte-compiled version by using the Dump - -- subprogram. However, here are a few examples: - - -- (a|b): 1 : MAGIC - -- 2 : BRANCH (next at 10) - -- 5 : EXACT (next at 18) operand=a - -- 10 : BRANCH (next at 18) - -- 13 : EXACT (next at 18) operand=b - -- 18 : EOP (next at 0) - -- - -- (ab)*: 1 : MAGIC - -- 2 : CURLYX (next at 26) { 0, 32767} - -- 9 : OPEN 1 (next at 13) - -- 13 : EXACT (next at 19) operand=ab - -- 19 : CLOSE 1 (next at 23) - -- 23 : WHILEM (next at 0) - -- 26 : NOTHING (next at 29) - -- 29 : EOP (next at 0) - - -- The opcodes are: - - type Opcode is - - -- Name Operand? Meaning - - (EOP, -- no End of program - MINMOD, -- no Next operator is not greedy - - -- Classes of characters - - ANY, -- no Match any one character except newline - SANY, -- no Match any character, including new line - ANYOF, -- class Match any character in this class - EXACT, -- str Match this string exactly - EXACTF, -- str Match this string (case-folding is one) - NOTHING, -- no Match empty string - SPACE, -- no Match any whitespace character - NSPACE, -- no Match any non-whitespace character - DIGIT, -- no Match any numeric character - NDIGIT, -- no Match any non-numeric character - ALNUM, -- no Match any alphanumeric character - NALNUM, -- no Match any non-alphanumeric character - - -- Branches - - BRANCH, -- node Match this alternative, or the next - - -- Simple loops (when the following node is one character in length) - - STAR, -- node Match this simple thing 0 or more times - PLUS, -- node Match this simple thing 1 or more times - CURLY, -- 2num node Match this simple thing between n and m times. - - -- Complex loops - - CURLYX, -- 2num node Match this complex thing {n,m} times - -- The nums are coded on two characters each - - WHILEM, -- no Do curly processing and see if rest matches - - -- Matches after or before a word - - BOL, -- no Match "" at beginning of line - MBOL, -- no Same, assuming mutiline (match after \n) - SBOL, -- no Same, assuming single line (don't match at \n) - EOL, -- no Match "" at end of line - MEOL, -- no Same, assuming mutiline (match before \n) - SEOL, -- no Same, assuming single line (don't match at \n) - - BOUND, -- no Match "" at any word boundary - NBOUND, -- no Match "" at any word non-boundary - - -- Parenthesis groups handling - - REFF, -- num Match some already matched string, folded - OPEN, -- num Mark this point in input as start of #n - CLOSE); -- num Analogous to OPEN - - for Opcode'Size use 8; - - -- Opcode notes: - - -- BRANCH - -- The set of branches constituting a single choice are hooked - -- together with their "next" pointers, since precedence prevents - -- anything being concatenated to any individual branch. The - -- "next" pointer of the last BRANCH in a choice points to the - -- thing following the whole choice. This is also where the - -- final "next" pointer of each individual branch points; each - -- branch starts with the operand node of a BRANCH node. - - -- STAR,PLUS - -- '?', and complex '*' and '+', are implemented with CURLYX. - -- branches. Simple cases (one character per match) are implemented with - -- STAR and PLUS for speed and to minimize recursive plunges. - - -- OPEN,CLOSE - -- ...are numbered at compile time. - - -- EXACT, EXACTF - -- There are in fact two arguments, the first one is the length (minus - -- one of the string argument), coded on one character, the second - -- argument is the string itself, coded on length + 1 characters. - - -- A node is one char of opcode followed by two chars of "next" pointer. - -- "Next" pointers are stored as two 8-bit pieces, high order first. The - -- value is a positive offset from the opcode of the node containing it. - -- An operand, if any, simply follows the node. (Note that much of the - -- code generation knows about this implicit relationship.) - - -- Using two bytes for the "next" pointer is vast overkill for most - -- things, but allows patterns to get big without disasters. - - ----------------------- - -- Character classes -- - ----------------------- - -- This is the implementation for character classes ([...]) in the - -- syntax for regular expressions. Each character (0..256) has an - -- entry into the table. This makes for a very fast matching - -- algorithm. - - type Class_Byte is mod 256; - type Character_Class is array (Class_Byte range 0 .. 31) of Class_Byte; - - type Bit_Conversion_Array is array (Class_Byte range 0 .. 7) of Class_Byte; - Bit_Conversion : constant Bit_Conversion_Array := - (1, 2, 4, 8, 16, 32, 64, 128); - - type Std_Class is (ANYOF_NONE, - ANYOF_ALNUM, -- Alphanumeric class [a-zA-Z0-9] - ANYOF_NALNUM, - ANYOF_SPACE, -- Space class [ \t\n\r\f] - ANYOF_NSPACE, - ANYOF_DIGIT, -- Digit class [0-9] - ANYOF_NDIGIT, - ANYOF_ALNUMC, -- Alphanumeric class [a-zA-Z0-9] - ANYOF_NALNUMC, - ANYOF_ALPHA, -- Alpha class [a-zA-Z] - ANYOF_NALPHA, - ANYOF_ASCII, -- Ascii class (7 bits) 0..127 - ANYOF_NASCII, - ANYOF_CNTRL, -- Control class - ANYOF_NCNTRL, - ANYOF_GRAPH, -- Graphic class - ANYOF_NGRAPH, - ANYOF_LOWER, -- Lower case class [a-z] - ANYOF_NLOWER, - ANYOF_PRINT, -- printable class - ANYOF_NPRINT, - ANYOF_PUNCT, -- - ANYOF_NPUNCT, - ANYOF_UPPER, -- Upper case class [A-Z] - ANYOF_NUPPER, - ANYOF_XDIGIT, -- Hexadecimal digit - ANYOF_NXDIGIT - ); - - procedure Set_In_Class - (Bitmap : in out Character_Class; - C : Character); - -- Set the entry to True for C in the class Bitmap - - function Get_From_Class - (Bitmap : Character_Class; - C : Character) return Boolean; - -- Return True if the entry is set for C in the class Bitmap - - procedure Reset_Class (Bitmap : out Character_Class); - -- Clear all the entries in the class Bitmap - - pragma Inline (Set_In_Class); - pragma Inline (Get_From_Class); - pragma Inline (Reset_Class); - - ----------------------- - -- Local Subprograms -- - ----------------------- - - function "=" (Left : Character; Right : Opcode) return Boolean; - - function Is_Alnum (C : Character) return Boolean; - -- Return True if C is an alphanum character or an underscore ('_') - - function Is_White_Space (C : Character) return Boolean; - -- Return True if C is a whitespace character - - function Is_Printable (C : Character) return Boolean; - -- Return True if C is a printable character - - function Operand (P : Pointer) return Pointer; - -- Return a pointer to the first operand of the node at P - - function String_Length - (Program : Program_Data; - P : Pointer) return Program_Size; - -- Return the length of the string argument of the node at P - - function String_Operand (P : Pointer) return Pointer; - -- Return a pointer to the string argument of the node at P - - procedure Bitmap_Operand - (Program : Program_Data; - P : Pointer; - Op : out Character_Class); - -- Return a pointer to the string argument of the node at P - - function Get_Next_Offset - (Program : Program_Data; - IP : Pointer) return Pointer; - -- Get the offset field of a node. Used by Get_Next - - function Get_Next - (Program : Program_Data; - IP : Pointer) return Pointer; - -- Dig the next instruction pointer out of a node - - procedure Optimize (Self : in out Pattern_Matcher); - -- Optimize a Pattern_Matcher by noting certain special cases - - function Read_Natural - (Program : Program_Data; - IP : Pointer) return Natural; - -- Return the 2-byte natural coded at position IP - - -- All of the subprograms above are tiny and should be inlined - - pragma Inline ("="); - pragma Inline (Is_Alnum); - pragma Inline (Is_White_Space); - pragma Inline (Get_Next); - pragma Inline (Get_Next_Offset); - pragma Inline (Operand); - pragma Inline (Read_Natural); - pragma Inline (String_Length); - pragma Inline (String_Operand); - - type Expression_Flags is record - Has_Width, -- Known never to match null string - Simple, -- Simple enough to be STAR/PLUS operand - SP_Start : Boolean; -- Starts with * or + - end record; - - Worst_Expression : constant Expression_Flags := (others => False); - -- Worst case - - --------- - -- "=" -- - --------- - - function "=" (Left : Character; Right : Opcode) return Boolean is - begin - return Character'Pos (Left) = Opcode'Pos (Right); - end "="; - - -------------------- - -- Bitmap_Operand -- - -------------------- - - procedure Bitmap_Operand - (Program : Program_Data; - P : Pointer; - Op : out Character_Class) - is - function Convert is new Unchecked_Conversion - (Program_Data, Character_Class); - - begin - Op (0 .. 31) := Convert (Program (P + 3 .. P + 34)); - end Bitmap_Operand; - - ------------- - -- Compile -- - ------------- - - procedure Compile - (Matcher : out Pattern_Matcher; - Expression : String; - Final_Code_Size : out Program_Size; - Flags : Regexp_Flags := No_Flags) - is - -- We can't allocate space until we know how big the compiled form - -- will be, but we can't compile it (and thus know how big it is) - -- until we've got a place to put the code. So we cheat: we compile - -- it twice, once with code generation turned off and size counting - -- turned on, and once "for real". - - -- This also means that we don't allocate space until we are sure - -- that the thing really will compile successfully, and we never - -- have to move the code and thus invalidate pointers into it. - - -- Beware that the optimization-preparation code in here knows - -- about some of the structure of the compiled regexp. - - PM : Pattern_Matcher renames Matcher; - Program : Program_Data renames PM.Program; - - Emit_Code : constant Boolean := PM.Size > 0; - Emit_Ptr : Pointer := Program_First; - - Parse_Pos : Natural := Expression'First; -- Input-scan pointer - Parse_End : constant Natural := Expression'Last; - - ---------------------------- - -- Subprograms for Create -- - ---------------------------- - - procedure Emit (B : Character); - -- Output the Character B to the Program. If code-generation is - -- disabled, simply increments the program counter. - - function Emit_Node (Op : Opcode) return Pointer; - -- If code-generation is enabled, Emit_Node outputs the - -- opcode Op and reserves space for a pointer to the next node. - -- Return value is the location of new opcode, ie old Emit_Ptr. - - procedure Emit_Natural (IP : Pointer; N : Natural); - -- Split N on two characters at position IP - - procedure Emit_Class (Bitmap : Character_Class); - -- Emits a character class - - procedure Case_Emit (C : Character); - -- Emit C, after converting is to lower-case if the regular - -- expression is case insensitive. - - procedure Parse - (Parenthesized : Boolean; - Flags : out Expression_Flags; - IP : out Pointer); - -- Parse regular expression, i.e. main body or parenthesized thing - -- Caller must absorb opening parenthesis. - - procedure Parse_Branch - (Flags : out Expression_Flags; - First : Boolean; - IP : out Pointer); - -- Implements the concatenation operator and handles '|' - -- First should be true if this is the first item of the alternative. - - procedure Parse_Piece - (Expr_Flags : out Expression_Flags; - IP : out Pointer); - -- Parse something followed by possible [*+?] - - procedure Parse_Atom - (Expr_Flags : out Expression_Flags; - IP : out Pointer); - -- Parse_Atom is the lowest level parse procedure. - -- Optimization: gobbles an entire sequence of ordinary characters - -- so that it can turn them into a single node, which is smaller to - -- store and faster to run. Backslashed characters are exceptions, - -- each becoming a separate node; the code is simpler that way and - -- it's not worth fixing. - - procedure Insert_Operator - (Op : Opcode; - Operand : Pointer; - Greedy : Boolean := True); - -- Insert_Operator inserts an operator in front of an - -- already-emitted operand and relocates the operand. - -- This applies to PLUS and STAR. - -- If Minmod is True, then the operator is non-greedy. - - procedure Insert_Curly_Operator - (Op : Opcode; - Min : Natural; - Max : Natural; - Operand : Pointer; - Greedy : Boolean := True); - -- Insert an operator for CURLY ({Min}, {Min,} or {Min,Max}). - -- If Minmod is True, then the operator is non-greedy. - - procedure Link_Tail (P, Val : Pointer); - -- Link_Tail sets the next-pointer at the end of a node chain - - procedure Link_Operand_Tail (P, Val : Pointer); - -- Link_Tail on operand of first argument; nop if operandless - - function Next_Instruction (P : Pointer) return Pointer; - -- Dig the "next" pointer out of a node - - procedure Fail (M : String); - pragma No_Return (Fail); - -- Fail with a diagnostic message, if possible - - function Is_Curly_Operator (IP : Natural) return Boolean; - -- Return True if IP is looking at a '{' that is the beginning - -- of a curly operator, ie it matches {\d+,?\d*} - - function Is_Mult (IP : Natural) return Boolean; - -- Return True if C is a regexp multiplier: '+', '*' or '?' - - procedure Get_Curly_Arguments - (IP : Natural; - Min : out Natural; - Max : out Natural; - Greedy : out Boolean); - -- Parse the argument list for a curly operator. - -- It is assumed that IP is indeed pointing at a valid operator. - -- So what is IP and how come IP is not referenced in the body ??? - - procedure Parse_Character_Class (IP : out Pointer); - -- Parse a character class. - -- The calling subprogram should consume the opening '[' before. - - procedure Parse_Literal - (Expr_Flags : out Expression_Flags; - IP : out Pointer); - -- Parse_Literal encodes a string of characters to be matched exactly - - function Parse_Posix_Character_Class return Std_Class; - -- Parse a posic character class, like [:alpha:] or [:^alpha:]. - -- The called is suppoed to absorbe the opening [. - - pragma Inline (Is_Mult); - pragma Inline (Emit_Natural); - pragma Inline (Parse_Character_Class); -- since used only once - - --------------- - -- Case_Emit -- - --------------- - - procedure Case_Emit (C : Character) is - begin - if (Flags and Case_Insensitive) /= 0 then - Emit (To_Lower (C)); - - else - -- Dump current character - - Emit (C); - end if; - end Case_Emit; - - ---------- - -- Emit -- - ---------- - - procedure Emit (B : Character) is - begin - if Emit_Code then - Program (Emit_Ptr) := B; - end if; - - Emit_Ptr := Emit_Ptr + 1; - end Emit; - - ---------------- - -- Emit_Class -- - ---------------- - - procedure Emit_Class (Bitmap : Character_Class) is - subtype Program31 is Program_Data (0 .. 31); - - function Convert is new Unchecked_Conversion - (Character_Class, Program31); - - begin - if Emit_Code then - Program (Emit_Ptr .. Emit_Ptr + 31) := Convert (Bitmap); - end if; - - Emit_Ptr := Emit_Ptr + 32; - end Emit_Class; - - ------------------ - -- Emit_Natural -- - ------------------ - - procedure Emit_Natural (IP : Pointer; N : Natural) is - begin - if Emit_Code then - Program (IP + 1) := Character'Val (N / 256); - Program (IP) := Character'Val (N mod 256); - end if; - end Emit_Natural; - - --------------- - -- Emit_Node -- - --------------- - - function Emit_Node (Op : Opcode) return Pointer is - Result : constant Pointer := Emit_Ptr; - - begin - if Emit_Code then - Program (Emit_Ptr) := Character'Val (Opcode'Pos (Op)); - Program (Emit_Ptr + 1) := ASCII.NUL; - Program (Emit_Ptr + 2) := ASCII.NUL; - end if; - - Emit_Ptr := Emit_Ptr + 3; - return Result; - end Emit_Node; - - ---------- - -- Fail -- - ---------- - - procedure Fail (M : String) is - begin - raise Expression_Error with M; - end Fail; - - ------------------------- - -- Get_Curly_Arguments -- - ------------------------- - - procedure Get_Curly_Arguments - (IP : Natural; - Min : out Natural; - Max : out Natural; - Greedy : out Boolean) - is - pragma Unreferenced (IP); - - Save_Pos : Natural := Parse_Pos + 1; - - begin - Min := 0; - Max := Max_Curly_Repeat; - - while Expression (Parse_Pos) /= '}' - and then Expression (Parse_Pos) /= ',' - loop - Parse_Pos := Parse_Pos + 1; - end loop; - - Min := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1)); - - if Expression (Parse_Pos) = ',' then - Save_Pos := Parse_Pos + 1; - while Expression (Parse_Pos) /= '}' loop - Parse_Pos := Parse_Pos + 1; - end loop; - - if Save_Pos /= Parse_Pos then - Max := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1)); - end if; - - else - Max := Min; - end if; - - if Parse_Pos < Expression'Last - and then Expression (Parse_Pos + 1) = '?' - then - Greedy := False; - Parse_Pos := Parse_Pos + 1; - - else - Greedy := True; - end if; - end Get_Curly_Arguments; - - --------------------------- - -- Insert_Curly_Operator -- - --------------------------- - - procedure Insert_Curly_Operator - (Op : Opcode; - Min : Natural; - Max : Natural; - Operand : Pointer; - Greedy : Boolean := True) - is - Dest : constant Pointer := Emit_Ptr; - Old : Pointer; - Size : Pointer := 7; - - begin - -- If the operand is not greedy, insert an extra operand before it - - if not Greedy then - Size := Size + 3; - end if; - - -- Move the operand in the byte-compilation, so that we can insert - -- the operator before it. - - if Emit_Code then - Program (Operand + Size .. Emit_Ptr + Size) := - Program (Operand .. Emit_Ptr); - end if; - - -- Insert the operator at the position previously occupied by the - -- operand. - - Emit_Ptr := Operand; - - if not Greedy then - Old := Emit_Node (MINMOD); - Link_Tail (Old, Old + 3); - end if; - - Old := Emit_Node (Op); - Emit_Natural (Old + 3, Min); - Emit_Natural (Old + 5, Max); - - Emit_Ptr := Dest + Size; - end Insert_Curly_Operator; - - --------------------- - -- Insert_Operator -- - --------------------- - - procedure Insert_Operator - (Op : Opcode; - Operand : Pointer; - Greedy : Boolean := True) - is - Dest : constant Pointer := Emit_Ptr; - Old : Pointer; - Size : Pointer := 3; - - Discard : Pointer; - pragma Warnings (Off, Discard); - - begin - -- If not greedy, we have to emit another opcode first - - if not Greedy then - Size := Size + 3; - end if; - - -- Move the operand in the byte-compilation, so that we can insert - -- the operator before it. - - if Emit_Code then - Program (Operand + Size .. Emit_Ptr + Size) := - Program (Operand .. Emit_Ptr); - end if; - - -- Insert the operator at the position previously occupied by the - -- operand. - - Emit_Ptr := Operand; - - if not Greedy then - Old := Emit_Node (MINMOD); - Link_Tail (Old, Old + 3); - end if; - - Discard := Emit_Node (Op); - Emit_Ptr := Dest + Size; - end Insert_Operator; - - ----------------------- - -- Is_Curly_Operator -- - ----------------------- - - function Is_Curly_Operator (IP : Natural) return Boolean is - Scan : Natural := IP; - - begin - if Expression (Scan) /= '{' - or else Scan + 2 > Expression'Last - or else not Is_Digit (Expression (Scan + 1)) - then - return False; - end if; - - Scan := Scan + 1; - - -- The first digit - - loop - Scan := Scan + 1; - - if Scan > Expression'Last then - return False; - end if; - - exit when not Is_Digit (Expression (Scan)); - end loop; - - if Expression (Scan) = ',' then - loop - Scan := Scan + 1; - - if Scan > Expression'Last then - return False; - end if; - - exit when not Is_Digit (Expression (Scan)); - end loop; - end if; - - return Expression (Scan) = '}'; - end Is_Curly_Operator; - - ------------- - -- Is_Mult -- - ------------- - - function Is_Mult (IP : Natural) return Boolean is - C : constant Character := Expression (IP); - - begin - return C = '*' - or else C = '+' - or else C = '?' - or else (C = '{' and then Is_Curly_Operator (IP)); - end Is_Mult; - - ----------------------- - -- Link_Operand_Tail -- - ----------------------- - - procedure Link_Operand_Tail (P, Val : Pointer) is - begin - if Emit_Code and then Program (P) = BRANCH then - Link_Tail (Operand (P), Val); - end if; - end Link_Operand_Tail; - - --------------- - -- Link_Tail -- - --------------- - - procedure Link_Tail (P, Val : Pointer) is - Scan : Pointer; - Temp : Pointer; - Offset : Pointer; - - begin - if not Emit_Code then - return; - end if; - - -- Find last node - - Scan := P; - loop - Temp := Next_Instruction (Scan); - exit when Temp = 0; - Scan := Temp; - end loop; - - Offset := Val - Scan; - - Emit_Natural (Scan + 1, Natural (Offset)); - end Link_Tail; - - ---------------------- - -- Next_Instruction -- - ---------------------- - - function Next_Instruction (P : Pointer) return Pointer is - Offset : Pointer; - - begin - if not Emit_Code then - return 0; - end if; - - Offset := Get_Next_Offset (Program, P); - - if Offset = 0 then - return 0; - end if; - - return P + Offset; - end Next_Instruction; - - ----------- - -- Parse -- - ----------- - - -- Combining parenthesis handling with the base level - -- of regular expression is a trifle forced, but the - -- need to tie the tails of the branches to what follows - -- makes it hard to avoid. - - procedure Parse - (Parenthesized : Boolean; - Flags : out Expression_Flags; - IP : out Pointer) - is - E : String renames Expression; - Br : Pointer; - Ender : Pointer; - Par_No : Natural; - New_Flags : Expression_Flags; - Have_Branch : Boolean := False; - - begin - Flags := (Has_Width => True, others => False); -- Tentatively - - -- Make an OPEN node, if parenthesized - - if Parenthesized then - if Matcher.Paren_Count > Max_Paren_Count then - Fail ("too many ()"); - end if; - - Par_No := Matcher.Paren_Count + 1; - Matcher.Paren_Count := Matcher.Paren_Count + 1; - IP := Emit_Node (OPEN); - Emit (Character'Val (Par_No)); - - else - IP := 0; - Par_No := 0; - end if; - - -- Pick up the branches, linking them together - - Parse_Branch (New_Flags, True, Br); - - if Br = 0 then - IP := 0; - return; - end if; - - if Parse_Pos <= Parse_End - and then E (Parse_Pos) = '|' - then - Insert_Operator (BRANCH, Br); - Have_Branch := True; - end if; - - if IP /= 0 then - Link_Tail (IP, Br); -- OPEN -> first - else - IP := Br; - end if; - - if not New_Flags.Has_Width then - Flags.Has_Width := False; - end if; - - Flags.SP_Start := Flags.SP_Start or New_Flags.SP_Start; - - while Parse_Pos <= Parse_End - and then (E (Parse_Pos) = '|') - loop - Parse_Pos := Parse_Pos + 1; - Parse_Branch (New_Flags, False, Br); - - if Br = 0 then - IP := 0; - return; - end if; - - Link_Tail (IP, Br); -- BRANCH -> BRANCH - - if not New_Flags.Has_Width then - Flags.Has_Width := False; - end if; - - Flags.SP_Start := Flags.SP_Start or New_Flags.SP_Start; - end loop; - - -- Make a closing node, and hook it on the end - - if Parenthesized then - Ender := Emit_Node (CLOSE); - Emit (Character'Val (Par_No)); - else - Ender := Emit_Node (EOP); - end if; - - Link_Tail (IP, Ender); - - if Have_Branch then - - -- Hook the tails of the branches to the closing node - - Br := IP; - loop - exit when Br = 0; - Link_Operand_Tail (Br, Ender); - Br := Next_Instruction (Br); - end loop; - end if; - - -- Check for proper termination - - if Parenthesized then - if Parse_Pos > Parse_End or else E (Parse_Pos) /= ')' then - Fail ("unmatched ()"); - end if; - - Parse_Pos := Parse_Pos + 1; - - elsif Parse_Pos <= Parse_End then - if E (Parse_Pos) = ')' then - Fail ("unmatched ()"); - else - Fail ("junk on end"); -- "Can't happen" - end if; - end if; - end Parse; - - ---------------- - -- Parse_Atom -- - ---------------- - - procedure Parse_Atom - (Expr_Flags : out Expression_Flags; - IP : out Pointer) - is - C : Character; - - begin - -- Tentatively set worst expression case - - Expr_Flags := Worst_Expression; - - C := Expression (Parse_Pos); - Parse_Pos := Parse_Pos + 1; - - case (C) is - when '^' => - if (Flags and Multiple_Lines) /= 0 then - IP := Emit_Node (MBOL); - elsif (Flags and Single_Line) /= 0 then - IP := Emit_Node (SBOL); - else - IP := Emit_Node (BOL); - end if; - - when '$' => - if (Flags and Multiple_Lines) /= 0 then - IP := Emit_Node (MEOL); - elsif (Flags and Single_Line) /= 0 then - IP := Emit_Node (SEOL); - else - IP := Emit_Node (EOL); - end if; - - when '.' => - if (Flags and Single_Line) /= 0 then - IP := Emit_Node (SANY); - else - IP := Emit_Node (ANY); - end if; - - Expr_Flags.Has_Width := True; - Expr_Flags.Simple := True; - - when '[' => - Parse_Character_Class (IP); - Expr_Flags.Has_Width := True; - Expr_Flags.Simple := True; - - when '(' => - declare - New_Flags : Expression_Flags; - - begin - Parse (True, New_Flags, IP); - - if IP = 0 then - return; - end if; - - Expr_Flags.Has_Width := - Expr_Flags.Has_Width or New_Flags.Has_Width; - Expr_Flags.SP_Start := - Expr_Flags.SP_Start or New_Flags.SP_Start; - end; - - when '|' | ASCII.LF | ')' => - Fail ("internal urp"); -- Supposed to be caught earlier - - when '?' | '+' | '*' => - Fail (C & " follows nothing"); - - when '{' => - if Is_Curly_Operator (Parse_Pos - 1) then - Fail (C & " follows nothing"); - else - Parse_Literal (Expr_Flags, IP); - end if; - - when '\' => - if Parse_Pos > Parse_End then - Fail ("trailing \"); - end if; - - Parse_Pos := Parse_Pos + 1; - - case Expression (Parse_Pos - 1) is - when 'b' => - IP := Emit_Node (BOUND); - - when 'B' => - IP := Emit_Node (NBOUND); - - when 's' => - IP := Emit_Node (SPACE); - Expr_Flags.Simple := True; - Expr_Flags.Has_Width := True; - - when 'S' => - IP := Emit_Node (NSPACE); - Expr_Flags.Simple := True; - Expr_Flags.Has_Width := True; - - when 'd' => - IP := Emit_Node (DIGIT); - Expr_Flags.Simple := True; - Expr_Flags.Has_Width := True; - - when 'D' => - IP := Emit_Node (NDIGIT); - Expr_Flags.Simple := True; - Expr_Flags.Has_Width := True; - - when 'w' => - IP := Emit_Node (ALNUM); - Expr_Flags.Simple := True; - Expr_Flags.Has_Width := True; - - when 'W' => - IP := Emit_Node (NALNUM); - Expr_Flags.Simple := True; - Expr_Flags.Has_Width := True; - - when 'A' => - IP := Emit_Node (SBOL); - - when 'G' => - IP := Emit_Node (SEOL); - - when '0' .. '9' => - IP := Emit_Node (REFF); - - declare - Save : constant Natural := Parse_Pos - 1; - - begin - while Parse_Pos <= Expression'Last - and then Is_Digit (Expression (Parse_Pos)) - loop - Parse_Pos := Parse_Pos + 1; - end loop; - - Emit (Character'Val (Natural'Value - (Expression (Save .. Parse_Pos - 1)))); - end; - - when others => - Parse_Pos := Parse_Pos - 1; - Parse_Literal (Expr_Flags, IP); - end case; - - when others => - Parse_Literal (Expr_Flags, IP); - end case; - end Parse_Atom; - - ------------------ - -- Parse_Branch -- - ------------------ - - procedure Parse_Branch - (Flags : out Expression_Flags; - First : Boolean; - IP : out Pointer) - is - E : String renames Expression; - Chain : Pointer; - Last : Pointer; - New_Flags : Expression_Flags; - - Discard : Pointer; - pragma Warnings (Off, Discard); - - begin - Flags := Worst_Expression; -- Tentatively - - if First then - IP := Emit_Ptr; - else - IP := Emit_Node (BRANCH); - end if; - - Chain := 0; - - while Parse_Pos <= Parse_End - and then E (Parse_Pos) /= ')' - and then E (Parse_Pos) /= ASCII.LF - and then E (Parse_Pos) /= '|' - loop - Parse_Piece (New_Flags, Last); - - if Last = 0 then - IP := 0; - return; - end if; - - Flags.Has_Width := Flags.Has_Width or New_Flags.Has_Width; - - if Chain = 0 then -- First piece - Flags.SP_Start := Flags.SP_Start or New_Flags.SP_Start; - else - Link_Tail (Chain, Last); - end if; - - Chain := Last; - end loop; - - -- Case where loop ran zero CURLY - - if Chain = 0 then - Discard := Emit_Node (NOTHING); - end if; - end Parse_Branch; - - --------------------------- - -- Parse_Character_Class -- - --------------------------- - - procedure Parse_Character_Class (IP : out Pointer) is - Bitmap : Character_Class; - Invert : Boolean := False; - In_Range : Boolean := False; - Named_Class : Std_Class := ANYOF_NONE; - Value : Character; - Last_Value : Character := ASCII.Nul; - - begin - Reset_Class (Bitmap); - - -- Do we have an invert character class ? - - if Parse_Pos <= Parse_End - and then Expression (Parse_Pos) = '^' - then - Invert := True; - Parse_Pos := Parse_Pos + 1; - end if; - - -- First character can be ] or - without closing the class - - if Parse_Pos <= Parse_End - and then (Expression (Parse_Pos) = ']' - or else Expression (Parse_Pos) = '-') - then - Set_In_Class (Bitmap, Expression (Parse_Pos)); - Parse_Pos := Parse_Pos + 1; - end if; - - -- While we don't have the end of the class - - while Parse_Pos <= Parse_End - and then Expression (Parse_Pos) /= ']' - loop - Named_Class := ANYOF_NONE; - Value := Expression (Parse_Pos); - Parse_Pos := Parse_Pos + 1; - - -- Do we have a Posix character class - if Value = '[' then - Named_Class := Parse_Posix_Character_Class; - - elsif Value = '\' then - if Parse_Pos = Parse_End then - Fail ("Trailing \"); - end if; - Value := Expression (Parse_Pos); - Parse_Pos := Parse_Pos + 1; - - case Value is - when 'w' => Named_Class := ANYOF_ALNUM; - when 'W' => Named_Class := ANYOF_NALNUM; - when 's' => Named_Class := ANYOF_SPACE; - when 'S' => Named_Class := ANYOF_NSPACE; - when 'd' => Named_Class := ANYOF_DIGIT; - when 'D' => Named_Class := ANYOF_NDIGIT; - when 'n' => Value := ASCII.LF; - when 'r' => Value := ASCII.CR; - when 't' => Value := ASCII.HT; - when 'f' => Value := ASCII.FF; - when 'e' => Value := ASCII.ESC; - when 'a' => Value := ASCII.BEL; - - -- when 'x' => ??? hexadecimal value - -- when 'c' => ??? control character - -- when '0'..'9' => ??? octal character - - when others => null; - end case; - end if; - - -- Do we have a character class? - - if Named_Class /= ANYOF_NONE then - - -- A range like 'a-\d' or 'a-[:digit:] is not a range - - if In_Range then - Set_In_Class (Bitmap, Last_Value); - Set_In_Class (Bitmap, '-'); - In_Range := False; - end if; - - -- Expand the range - - case Named_Class is - when ANYOF_NONE => null; - - when ANYOF_ALNUM | ANYOF_ALNUMC => - for Value in Class_Byte'Range loop - if Is_Alnum (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_NALNUM | ANYOF_NALNUMC => - for Value in Class_Byte'Range loop - if not Is_Alnum (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_SPACE => - for Value in Class_Byte'Range loop - if Is_White_Space (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_NSPACE => - for Value in Class_Byte'Range loop - if not Is_White_Space (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_DIGIT => - for Value in Class_Byte'Range loop - if Is_Digit (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_NDIGIT => - for Value in Class_Byte'Range loop - if not Is_Digit (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_ALPHA => - for Value in Class_Byte'Range loop - if Is_Letter (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_NALPHA => - for Value in Class_Byte'Range loop - if not Is_Letter (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_ASCII => - for Value in 0 .. 127 loop - Set_In_Class (Bitmap, Character'Val (Value)); - end loop; - - when ANYOF_NASCII => - for Value in 128 .. 255 loop - Set_In_Class (Bitmap, Character'Val (Value)); - end loop; - - when ANYOF_CNTRL => - for Value in Class_Byte'Range loop - if Is_Control (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_NCNTRL => - for Value in Class_Byte'Range loop - if not Is_Control (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_GRAPH => - for Value in Class_Byte'Range loop - if Is_Graphic (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_NGRAPH => - for Value in Class_Byte'Range loop - if not Is_Graphic (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_LOWER => - for Value in Class_Byte'Range loop - if Is_Lower (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_NLOWER => - for Value in Class_Byte'Range loop - if not Is_Lower (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_PRINT => - for Value in Class_Byte'Range loop - if Is_Printable (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_NPRINT => - for Value in Class_Byte'Range loop - if not Is_Printable (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_PUNCT => - for Value in Class_Byte'Range loop - if Is_Printable (Character'Val (Value)) - and then not Is_White_Space (Character'Val (Value)) - and then not Is_Alnum (Character'Val (Value)) - then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_NPUNCT => - for Value in Class_Byte'Range loop - if not Is_Printable (Character'Val (Value)) - or else Is_White_Space (Character'Val (Value)) - or else Is_Alnum (Character'Val (Value)) - then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_UPPER => - for Value in Class_Byte'Range loop - if Is_Upper (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_NUPPER => - for Value in Class_Byte'Range loop - if not Is_Upper (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_XDIGIT => - for Value in Class_Byte'Range loop - if Is_Hexadecimal_Digit (Character'Val (Value)) then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - when ANYOF_NXDIGIT => - for Value in Class_Byte'Range loop - if not Is_Hexadecimal_Digit - (Character'Val (Value)) - then - Set_In_Class (Bitmap, Character'Val (Value)); - end if; - end loop; - - end case; - - -- Not a character range - - elsif not In_Range then - Last_Value := Value; - - if Expression (Parse_Pos) = '-' - and then Parse_Pos < Parse_End - and then Expression (Parse_Pos + 1) /= ']' - then - Parse_Pos := Parse_Pos + 1; - - -- Do we have a range like '\d-a' and '[:space:]-a' - -- which is not a real range - - if Named_Class /= ANYOF_NONE then - Set_In_Class (Bitmap, '-'); - else - In_Range := True; - end if; - - else - Set_In_Class (Bitmap, Value); - - end if; - - -- Else in a character range - - else - if Last_Value > Value then - Fail ("Invalid Range [" & Last_Value'Img - & "-" & Value'Img & "]"); - end if; - - while Last_Value <= Value loop - Set_In_Class (Bitmap, Last_Value); - Last_Value := Character'Succ (Last_Value); - end loop; - - In_Range := False; - - end if; - - end loop; - - -- Optimize case-insensitive ranges (put the upper case or lower - -- case character into the bitmap) - - if (Flags and Case_Insensitive) /= 0 then - for C in Character'Range loop - if Get_From_Class (Bitmap, C) then - Set_In_Class (Bitmap, To_Lower (C)); - Set_In_Class (Bitmap, To_Upper (C)); - end if; - end loop; - end if; - - -- Optimize inverted classes - - if Invert then - for J in Bitmap'Range loop - Bitmap (J) := not Bitmap (J); - end loop; - end if; - - Parse_Pos := Parse_Pos + 1; - - -- Emit the class - - IP := Emit_Node (ANYOF); - Emit_Class (Bitmap); - end Parse_Character_Class; - - ------------------- - -- Parse_Literal -- - ------------------- - - -- This is a bit tricky due to quoted chars and due to - -- the multiplier characters '*', '+', and '?' that - -- take the SINGLE char previous as their operand. - - -- On entry, the character at Parse_Pos - 1 is going to go - -- into the string, no matter what it is. It could be - -- following a \ if Parse_Atom was entered from the '\' case. - - -- Basic idea is to pick up a good char in C and examine - -- the next char. If Is_Mult (C) then twiddle, if it's a \ - -- then frozzle and if it's another magic char then push C and - -- terminate the string. If none of the above, push C on the - -- string and go around again. - - -- Start_Pos is used to remember where "the current character" - -- starts in the string, if due to an Is_Mult we need to back - -- up and put the current char in a separate 1-character string. - -- When Start_Pos is 0, C is the only char in the string; - -- this is used in Is_Mult handling, and in setting the SIMPLE - -- flag at the end. - - procedure Parse_Literal - (Expr_Flags : out Expression_Flags; - IP : out Pointer) - is - Start_Pos : Natural := 0; - C : Character; - Length_Ptr : Pointer; - - Has_Special_Operator : Boolean := False; - - begin - Parse_Pos := Parse_Pos - 1; -- Look at current character - - if (Flags and Case_Insensitive) /= 0 then - IP := Emit_Node (EXACTF); - else - IP := Emit_Node (EXACT); - end if; - - Length_Ptr := Emit_Ptr; - Emit_Ptr := String_Operand (IP); - - Parse_Loop : - loop - C := Expression (Parse_Pos); -- Get current character - - case C is - when '.' | '[' | '(' | ')' | '|' | ASCII.LF | '$' | '^' => - - if Start_Pos = 0 then - Start_Pos := Parse_Pos; - Emit (C); -- First character is always emitted - else - exit Parse_Loop; -- Else we are done - end if; - - when '?' | '+' | '*' | '{' => - - if Start_Pos = 0 then - Start_Pos := Parse_Pos; - Emit (C); -- First character is always emitted - - -- Are we looking at an operator, or is this - -- simply a normal character ? - - elsif not Is_Mult (Parse_Pos) then - Start_Pos := Parse_Pos; - Case_Emit (C); - - else - -- We've got something like "abc?d". Mark this as a - -- special case. What we want to emit is a first - -- constant string for "ab", then one for "c" that will - -- ultimately be transformed with a CURLY operator, A - -- special case has to be handled for "a?", since there - -- is no initial string to emit. - - Has_Special_Operator := True; - exit Parse_Loop; - end if; - - when '\' => - Start_Pos := Parse_Pos; - - if Parse_Pos = Parse_End then - Fail ("Trailing \"); - - else - case Expression (Parse_Pos + 1) is - when 'b' | 'B' | 's' | 'S' | 'd' | 'D' - | 'w' | 'W' | '0' .. '9' | 'G' | 'A' - => exit Parse_Loop; - when 'n' => Emit (ASCII.LF); - when 't' => Emit (ASCII.HT); - when 'r' => Emit (ASCII.CR); - when 'f' => Emit (ASCII.FF); - when 'e' => Emit (ASCII.ESC); - when 'a' => Emit (ASCII.BEL); - when others => Emit (Expression (Parse_Pos + 1)); - end case; - - Parse_Pos := Parse_Pos + 1; - end if; - - when others => - Start_Pos := Parse_Pos; - Case_Emit (C); - end case; - - exit Parse_Loop when Emit_Ptr - Length_Ptr = 254; - - Parse_Pos := Parse_Pos + 1; - - exit Parse_Loop when Parse_Pos > Parse_End; - end loop Parse_Loop; - - -- Is the string followed by a '*+?{' operator ? If yes, and if there - -- is an initial string to emit, do it now. - - if Has_Special_Operator - and then Emit_Ptr >= Length_Ptr + 3 - then - Emit_Ptr := Emit_Ptr - 1; - Parse_Pos := Start_Pos; - end if; - - if Emit_Code then - Program (Length_Ptr) := Character'Val (Emit_Ptr - Length_Ptr - 2); - end if; - - Expr_Flags.Has_Width := True; - - -- Slight optimization when there is a single character - - if Emit_Ptr = Length_Ptr + 2 then - Expr_Flags.Simple := True; - end if; - end Parse_Literal; - - ----------------- - -- Parse_Piece -- - ----------------- - - -- Note that the branching code sequences used for '?' and the - -- general cases of '*' and + are somewhat optimized: they use - -- the same NOTHING node as both the endmarker for their branch - -- list and the body of the last branch. It might seem that - -- this node could be dispensed with entirely, but the endmarker - -- role is not redundant. - - procedure Parse_Piece - (Expr_Flags : out Expression_Flags; - IP : out Pointer) - is - Op : Character; - New_Flags : Expression_Flags; - Greedy : Boolean := True; - - begin - Parse_Atom (New_Flags, IP); - - if IP = 0 then - return; - end if; - - if Parse_Pos > Parse_End - or else not Is_Mult (Parse_Pos) - then - Expr_Flags := New_Flags; - return; - end if; - - Op := Expression (Parse_Pos); - - if Op /= '+' then - Expr_Flags := (SP_Start => True, others => False); - else - Expr_Flags := (Has_Width => True, others => False); - end if; - - -- Detect non greedy operators in the easy cases - - if Op /= '{' - and then Parse_Pos + 1 <= Parse_End - and then Expression (Parse_Pos + 1) = '?' - then - Greedy := False; - Parse_Pos := Parse_Pos + 1; - end if; - - -- Generate the byte code - - case Op is - when '*' => - - if New_Flags.Simple then - Insert_Operator (STAR, IP, Greedy); - else - Link_Tail (IP, Emit_Node (WHILEM)); - Insert_Curly_Operator - (CURLYX, 0, Max_Curly_Repeat, IP, Greedy); - Link_Tail (IP, Emit_Node (NOTHING)); - end if; - - when '+' => - - if New_Flags.Simple then - Insert_Operator (PLUS, IP, Greedy); - else - Link_Tail (IP, Emit_Node (WHILEM)); - Insert_Curly_Operator - (CURLYX, 1, Max_Curly_Repeat, IP, Greedy); - Link_Tail (IP, Emit_Node (NOTHING)); - end if; - - when '?' => - if New_Flags.Simple then - Insert_Curly_Operator (CURLY, 0, 1, IP, Greedy); - else - Link_Tail (IP, Emit_Node (WHILEM)); - Insert_Curly_Operator (CURLYX, 0, 1, IP, Greedy); - Link_Tail (IP, Emit_Node (NOTHING)); - end if; - - when '{' => - declare - Min, Max : Natural; - - begin - Get_Curly_Arguments (Parse_Pos, Min, Max, Greedy); - - if New_Flags.Simple then - Insert_Curly_Operator (CURLY, Min, Max, IP, Greedy); - else - Link_Tail (IP, Emit_Node (WHILEM)); - Insert_Curly_Operator (CURLYX, Min, Max, IP, Greedy); - Link_Tail (IP, Emit_Node (NOTHING)); - end if; - end; - - when others => - null; - end case; - - Parse_Pos := Parse_Pos + 1; - - if Parse_Pos <= Parse_End - and then Is_Mult (Parse_Pos) - then - Fail ("nested *+{"); - end if; - end Parse_Piece; - - --------------------------------- - -- Parse_Posix_Character_Class -- - --------------------------------- - - function Parse_Posix_Character_Class return Std_Class is - Invert : Boolean := False; - Class : Std_Class := ANYOF_NONE; - E : String renames Expression; - - -- Class names. Note that code assumes that the length of all - -- classes starting with the same letter have the same length. - - Alnum : constant String := "alnum:]"; - Alpha : constant String := "alpha:]"; - Ascii_C : constant String := "ascii:]"; - Cntrl : constant String := "cntrl:]"; - Digit : constant String := "digit:]"; - Graph : constant String := "graph:]"; - Lower : constant String := "lower:]"; - Print : constant String := "print:]"; - Punct : constant String := "punct:]"; - Space : constant String := "space:]"; - Upper : constant String := "upper:]"; - Word : constant String := "word:]"; - Xdigit : constant String := "xdigit:]"; - - begin - -- Case of character class specified - - if Parse_Pos <= Parse_End - and then Expression (Parse_Pos) = ':' - then - Parse_Pos := Parse_Pos + 1; - - -- Do we have something like: [[:^alpha:]] - - if Parse_Pos <= Parse_End - and then Expression (Parse_Pos) = '^' - then - Invert := True; - Parse_Pos := Parse_Pos + 1; - end if; - - -- Check for class names based on first letter - - case Expression (Parse_Pos) is - when 'a' => - - -- All 'a' classes have the same length (Alnum'Length) - - if Parse_Pos + Alnum'Length - 1 <= Parse_End then - if - E (Parse_Pos .. Parse_Pos + Alnum'Length - 1) = Alnum - then - if Invert then - Class := ANYOF_NALNUMC; - else - Class := ANYOF_ALNUMC; - end if; - - Parse_Pos := Parse_Pos + Alnum'Length; - - elsif - E (Parse_Pos .. Parse_Pos + Alpha'Length - 1) = Alpha - then - if Invert then - Class := ANYOF_NALPHA; - else - Class := ANYOF_ALPHA; - end if; - - Parse_Pos := Parse_Pos + Alpha'Length; - - elsif E (Parse_Pos .. Parse_Pos + Ascii_C'Length - 1) = - Ascii_C - then - if Invert then - Class := ANYOF_NASCII; - else - Class := ANYOF_ASCII; - end if; - - Parse_Pos := Parse_Pos + Ascii_C'Length; - - else - Fail ("Invalid character class: " & E); - end if; - - else - Fail ("Invalid character class: " & E); - end if; - - when 'c' => - if Parse_Pos + Cntrl'Length - 1 <= Parse_End - and then - E (Parse_Pos .. Parse_Pos + Cntrl'Length - 1) = Cntrl - then - if Invert then - Class := ANYOF_NCNTRL; - else - Class := ANYOF_CNTRL; - end if; - - Parse_Pos := Parse_Pos + Cntrl'Length; - - else - Fail ("Invalid character class: " & E); - end if; - - when 'd' => - if Parse_Pos + Digit'Length - 1 <= Parse_End - and then - E (Parse_Pos .. Parse_Pos + Digit'Length - 1) = Digit - then - if Invert then - Class := ANYOF_NDIGIT; - else - Class := ANYOF_DIGIT; - end if; - - Parse_Pos := Parse_Pos + Digit'Length; - end if; - - when 'g' => - if Parse_Pos + Graph'Length - 1 <= Parse_End - and then - E (Parse_Pos .. Parse_Pos + Graph'Length - 1) = Graph - then - if Invert then - Class := ANYOF_NGRAPH; - else - Class := ANYOF_GRAPH; - end if; - - Parse_Pos := Parse_Pos + Graph'Length; - - else - Fail ("Invalid character class: " & E); - end if; - - when 'l' => - if Parse_Pos + Lower'Length - 1 <= Parse_End - and then - E (Parse_Pos .. Parse_Pos + Lower'Length - 1) = Lower - then - if Invert then - Class := ANYOF_NLOWER; - else - Class := ANYOF_LOWER; - end if; - - Parse_Pos := Parse_Pos + Lower'Length; - - else - Fail ("Invalid character class: " & E); - end if; - - when 'p' => - - -- All 'p' classes have the same length - - if Parse_Pos + Print'Length - 1 <= Parse_End then - if - E (Parse_Pos .. Parse_Pos + Print'Length - 1) = Print - then - if Invert then - Class := ANYOF_NPRINT; - else - Class := ANYOF_PRINT; - end if; - - Parse_Pos := Parse_Pos + Print'Length; - - elsif - E (Parse_Pos .. Parse_Pos + Punct'Length - 1) = Punct - then - if Invert then - Class := ANYOF_NPUNCT; - else - Class := ANYOF_PUNCT; - end if; - - Parse_Pos := Parse_Pos + Punct'Length; - - else - Fail ("Invalid character class: " & E); - end if; - - else - Fail ("Invalid character class: " & E); - end if; - - when 's' => - if Parse_Pos + Space'Length - 1 <= Parse_End - and then - E (Parse_Pos .. Parse_Pos + Space'Length - 1) = Space - then - if Invert then - Class := ANYOF_NSPACE; - else - Class := ANYOF_SPACE; - end if; - - Parse_Pos := Parse_Pos + Space'Length; - - else - Fail ("Invalid character class: " & E); - end if; - - when 'u' => - if Parse_Pos + Upper'Length - 1 <= Parse_End - and then - E (Parse_Pos .. Parse_Pos + Upper'Length - 1) = Upper - then - if Invert then - Class := ANYOF_NUPPER; - else - Class := ANYOF_UPPER; - end if; - - Parse_Pos := Parse_Pos + Upper'Length; - - else - Fail ("Invalid character class: " & E); - end if; - - when 'w' => - if Parse_Pos + Word'Length - 1 <= Parse_End - and then - E (Parse_Pos .. Parse_Pos + Word'Length - 1) = Word - then - if Invert then - Class := ANYOF_NALNUM; - else - Class := ANYOF_ALNUM; - end if; - - Parse_Pos := Parse_Pos + Word'Length; - - else - Fail ("Invalid character class: " & E); - end if; - - when 'x' => - if Parse_Pos + Xdigit'Length - 1 <= Parse_End - and then - E (Parse_Pos .. Parse_Pos + Xdigit'Length - 1) = Xdigit - then - if Invert then - Class := ANYOF_NXDIGIT; - else - Class := ANYOF_XDIGIT; - end if; - - Parse_Pos := Parse_Pos + Xdigit'Length; - - else - Fail ("Invalid character class: " & E); - end if; - - when others => - Fail ("Invalid character class: " & E); - end case; - - -- Character class not specified - - else - return ANYOF_NONE; - end if; - - return Class; - end Parse_Posix_Character_Class; - - Expr_Flags : Expression_Flags; - Result : Pointer; - - -- Start of processing for Compile - - begin - Emit (MAGIC); - Parse (False, Expr_Flags, Result); - - if Result = 0 then - Fail ("Couldn't compile expression"); - end if; - - Final_Code_Size := Emit_Ptr - 1; - - -- Do we want to actually compile the expression, or simply get the - -- code size ??? - - if Emit_Code then - Optimize (PM); - end if; - - PM.Flags := Flags; - end Compile; - - function Compile - (Expression : String; - Flags : Regexp_Flags := No_Flags) return Pattern_Matcher - is - Size : Program_Size; - Dummy : Pattern_Matcher (0); - - begin - Compile (Dummy, Expression, Size, Flags); - - declare - Result : Pattern_Matcher (Size); - begin - Compile (Result, Expression, Size, Flags); - return Result; - end; - end Compile; - - procedure Compile - (Matcher : out Pattern_Matcher; - Expression : String; - Flags : Regexp_Flags := No_Flags) - is - Size : Program_Size; - begin - Compile (Matcher, Expression, Size, Flags); - end Compile; - - ---------- - -- Dump -- - ---------- - - procedure Dump (Self : Pattern_Matcher) is - Op : Opcode; - Program : Program_Data renames Self.Program; - - procedure Dump_Until - (Start : Pointer; - Till : Pointer; - Indent : Natural := 0); - -- Dump the program until the node Till (not included) is met. - -- Every line is indented with Index spaces at the beginning - -- Dumps till the end if Till is 0. - - ---------------- - -- Dump_Until -- - ---------------- - - procedure Dump_Until - (Start : Pointer; - Till : Pointer; - Indent : Natural := 0) - is - Next : Pointer; - Index : Pointer; - Local_Indent : Natural := Indent; - Length : Pointer; - - begin - Index := Start; - while Index < Till loop - Op := Opcode'Val (Character'Pos ((Self.Program (Index)))); - - if Op = CLOSE then - Local_Indent := Local_Indent - 3; - end if; - - declare - Point : constant String := Pointer'Image (Index); - - begin - for J in 1 .. 6 - Point'Length loop - Put (' '); - end loop; - - Put (Point - & " : " - & (1 .. Local_Indent => ' ') - & Opcode'Image (Op)); - end; - - -- Print the parenthesis number - - if Op = OPEN or else Op = CLOSE or else Op = REFF then - Put (Natural'Image (Character'Pos (Program (Index + 3)))); - end if; - - Next := Index + Get_Next_Offset (Program, Index); - - if Next = Index then - Put (" (next at 0)"); - else - Put (" (next at " & Pointer'Image (Next) & ")"); - end if; - - case Op is - - -- Character class operand - - when ANYOF => null; - declare - Bitmap : Character_Class; - Last : Character := ASCII.Nul; - Current : Natural := 0; - - Current_Char : Character; - - begin - Bitmap_Operand (Program, Index, Bitmap); - Put (" operand="); - - while Current <= 255 loop - Current_Char := Character'Val (Current); - - -- First item in a range - - if Get_From_Class (Bitmap, Current_Char) then - Last := Current_Char; - - -- Search for the last item in the range - - loop - Current := Current + 1; - exit when Current > 255; - Current_Char := Character'Val (Current); - exit when - not Get_From_Class (Bitmap, Current_Char); - - end loop; - - if Last <= ' ' then - Put (Last'Img); - else - Put (Last); - end if; - - if Character'Succ (Last) /= Current_Char then - Put ("-" & Character'Pred (Current_Char)); - end if; - - else - Current := Current + 1; - end if; - end loop; - - New_Line; - Index := Index + 3 + Bitmap'Length; - end; - - -- string operand - - when EXACT | EXACTF => - Length := String_Length (Program, Index); - Put (" operand (length:" & Program_Size'Image (Length + 1) - & ") =" - & String (Program (String_Operand (Index) - .. String_Operand (Index) - + Length))); - Index := String_Operand (Index) + Length + 1; - New_Line; - - -- Node operand - - when BRANCH => - New_Line; - Dump_Until (Index + 3, Next, Local_Indent + 3); - Index := Next; - - when STAR | PLUS => - New_Line; - - -- Only one instruction - - Dump_Until (Index + 3, Index + 4, Local_Indent + 3); - Index := Next; - - when CURLY | CURLYX => - Put (" {" - & Natural'Image (Read_Natural (Program, Index + 3)) - & "," - & Natural'Image (Read_Natural (Program, Index + 5)) - & "}"); - New_Line; - Dump_Until (Index + 7, Next, Local_Indent + 3); - Index := Next; - - when OPEN => - New_Line; - Index := Index + 4; - Local_Indent := Local_Indent + 3; - - when CLOSE | REFF => - New_Line; - Index := Index + 4; - - when EOP => - Index := Index + 3; - New_Line; - exit; - - -- No operand - - when others => - Index := Index + 3; - New_Line; - end case; - end loop; - end Dump_Until; - - -- Start of processing for Dump - - begin - pragma Assert (Self.Program (Program_First) = MAGIC, - "Corrupted Pattern_Matcher"); - - Put_Line ("Must start with (Self.First) = " - & Character'Image (Self.First)); - - if (Self.Flags and Case_Insensitive) /= 0 then - Put_Line (" Case_Insensitive mode"); - end if; - - if (Self.Flags and Single_Line) /= 0 then - Put_Line (" Single_Line mode"); - end if; - - if (Self.Flags and Multiple_Lines) /= 0 then - Put_Line (" Multiple_Lines mode"); - end if; - - Put_Line (" 1 : MAGIC"); - Dump_Until (Program_First + 1, Self.Program'Last + 1); - end Dump; - - -------------------- - -- Get_From_Class -- - -------------------- - - function Get_From_Class - (Bitmap : Character_Class; - C : Character) return Boolean - is - Value : constant Class_Byte := Character'Pos (C); - begin - return - (Bitmap (Value / 8) and Bit_Conversion (Value mod 8)) /= 0; - end Get_From_Class; - - -------------- - -- Get_Next -- - -------------- - - function Get_Next (Program : Program_Data; IP : Pointer) return Pointer is - Offset : constant Pointer := Get_Next_Offset (Program, IP); - begin - if Offset = 0 then - return 0; - else - return IP + Offset; - end if; - end Get_Next; - - --------------------- - -- Get_Next_Offset -- - --------------------- - - function Get_Next_Offset - (Program : Program_Data; - IP : Pointer) return Pointer - is - begin - return Pointer (Read_Natural (Program, IP + 1)); - end Get_Next_Offset; - - -------------- - -- Is_Alnum -- - -------------- - - function Is_Alnum (C : Character) return Boolean is - begin - return Is_Alphanumeric (C) or else C = '_'; - end Is_Alnum; - - ------------------ - -- Is_Printable -- - ------------------ - - function Is_Printable (C : Character) return Boolean is - begin - -- Printable if space or graphic character or other whitespace - -- Other white space includes (HT/LF/VT/FF/CR = codes 9-13) - - return C in Character'Val (32) .. Character'Val (126) - or else C in ASCII.HT .. ASCII.CR; - end Is_Printable; - - -------------------- - -- Is_White_Space -- - -------------------- - - function Is_White_Space (C : Character) return Boolean is - begin - -- Note: HT = 9, LF = 10, VT = 11, FF = 12, CR = 13 - - return C = ' ' or else C in ASCII.HT .. ASCII.CR; - end Is_White_Space; - - ----------- - -- Match -- - ----------- - - procedure Match - (Self : Pattern_Matcher; - Data : String; - Matches : out Match_Array; - Data_First : Integer := -1; - Data_Last : Positive := Positive'Last) - is - pragma Assert (Matches'First = 0); - - Program : Program_Data renames Self.Program; -- Shorter notation - - First_In_Data : constant Integer := Integer'Max (Data_First, Data'First); - Last_In_Data : constant Integer := Integer'Min (Data_Last, Data'Last); - - -- Global work variables - - Input_Pos : Natural; -- String-input pointer - BOL_Pos : Natural; -- Beginning of input, for ^ check - Matched : Boolean := False; -- Until proven True - - Matches_Full : Match_Array (0 .. Natural'Max (Self.Paren_Count, - Matches'Last)); - -- Stores the value of all the parenthesis pairs. - -- We do not use directly Matches, so that we can also use back - -- references (REFF) even if Matches is too small. - - type Natural_Array is array (Match_Count range <>) of Natural; - Matches_Tmp : Natural_Array (Matches_Full'Range); - -- Save the opening position of parenthesis - - Last_Paren : Natural := 0; - -- Last parenthesis seen - - Greedy : Boolean := True; - -- True if the next operator should be greedy - - type Current_Curly_Record; - type Current_Curly_Access is access all Current_Curly_Record; - type Current_Curly_Record is record - Paren_Floor : Natural; -- How far back to strip parenthesis data - Cur : Integer; -- How many instances of scan we've matched - Min : Natural; -- Minimal number of scans to match - Max : Natural; -- Maximal number of scans to match - Greedy : Boolean; -- Whether to work our way up or down - Scan : Pointer; -- The thing to match - Next : Pointer; -- What has to match after it - Lastloc : Natural; -- Where we started matching this scan - Old_Cc : Current_Curly_Access; -- Before we started this one - end record; - -- Data used to handle the curly operator and the plus and star - -- operators for complex expressions. - - Current_Curly : Current_Curly_Access := null; - -- The curly currently being processed - - ----------------------- - -- Local Subprograms -- - ----------------------- - - function Index (Start : Positive; C : Character) return Natural; - -- Find character C in Data starting at Start and return position - - function Repeat - (IP : Pointer; - Max : Natural := Natural'Last) return Natural; - -- Repeatedly match something simple, report how many - -- It only matches on things of length 1. - -- Starting from Input_Pos, it matches at most Max CURLY. - - function Try (Pos : Positive) return Boolean; - -- Try to match at specific point - - function Match (IP : Pointer) return Boolean; - -- This is the main matching routine. Conceptually the strategy - -- is simple: check to see whether the current node matches, - -- call self recursively to see whether the rest matches, - -- and then act accordingly. - -- - -- In practice Match makes some effort to avoid recursion, in - -- particular by going through "ordinary" nodes (that don't - -- need to know whether the rest of the match failed) by - -- using a loop instead of recursion. - -- Why is the above comment part of the spec rather than body ??? - - function Match_Whilem (IP : Pointer) return Boolean; - -- Return True if a WHILEM matches - -- How come IP is unreferenced in the body ??? - - function Recurse_Match (IP : Pointer; From : Natural) return Boolean; - pragma Inline (Recurse_Match); - -- Calls Match recursively. It saves and restores the parenthesis - -- status and location in the input stream correctly, so that - -- backtracking is possible - - function Match_Simple_Operator - (Op : Opcode; - Scan : Pointer; - Next : Pointer; - Greedy : Boolean) return Boolean; - -- Return True it the simple operator (possibly non-greedy) matches - - pragma Inline (Index); - pragma Inline (Repeat); - - -- These are two complex functions, but used only once - - pragma Inline (Match_Whilem); - pragma Inline (Match_Simple_Operator); - - ----------- - -- Index -- - ----------- - - function Index (Start : Positive; C : Character) return Natural is - begin - for J in Start .. Last_In_Data loop - if Data (J) = C then - return J; - end if; - end loop; - - return 0; - end Index; - - ------------------- - -- Recurse_Match -- - ------------------- - - function Recurse_Match (IP : Pointer; From : Natural) return Boolean is - L : constant Natural := Last_Paren; - - Tmp_F : constant Match_Array := - Matches_Full (From + 1 .. Matches_Full'Last); - - Start : constant Natural_Array := - Matches_Tmp (From + 1 .. Matches_Tmp'Last); - Input : constant Natural := Input_Pos; - - begin - if Match (IP) then - return True; - end if; - - Last_Paren := L; - Matches_Full (Tmp_F'Range) := Tmp_F; - Matches_Tmp (Start'Range) := Start; - Input_Pos := Input; - return False; - end Recurse_Match; - - ----------- - -- Match -- - ----------- - - function Match (IP : Pointer) return Boolean is - Scan : Pointer := IP; - Next : Pointer; - Op : Opcode; - - begin - State_Machine : - loop - pragma Assert (Scan /= 0); - - -- Determine current opcode and count its usage in debug mode - - Op := Opcode'Val (Character'Pos (Program (Scan))); - - -- Calculate offset of next instruction. - -- Second character is most significant in Program_Data. - - Next := Get_Next (Program, Scan); - - case Op is - when EOP => - return True; -- Success ! - - when BRANCH => - if Program (Next) /= BRANCH then - Next := Operand (Scan); -- No choice, avoid recursion - - else - loop - if Recurse_Match (Operand (Scan), 0) then - return True; - end if; - - Scan := Get_Next (Program, Scan); - exit when Scan = 0 or else Program (Scan) /= BRANCH; - end loop; - - exit State_Machine; - end if; - - when NOTHING => - null; - - when BOL => - exit State_Machine when Input_Pos /= BOL_Pos - and then ((Self.Flags and Multiple_Lines) = 0 - or else Data (Input_Pos - 1) /= ASCII.LF); - - when MBOL => - exit State_Machine when Input_Pos /= BOL_Pos - and then Data (Input_Pos - 1) /= ASCII.LF; - - when SBOL => - exit State_Machine when Input_Pos /= BOL_Pos; - - when EOL => - exit State_Machine when Input_Pos <= Data'Last - and then ((Self.Flags and Multiple_Lines) = 0 - or else Data (Input_Pos) /= ASCII.LF); - - when MEOL => - exit State_Machine when Input_Pos <= Data'Last - and then Data (Input_Pos) /= ASCII.LF; - - when SEOL => - exit State_Machine when Input_Pos <= Data'Last; - - when BOUND | NBOUND => - - -- Was last char in word ? - - declare - N : Boolean := False; - Ln : Boolean := False; - - begin - if Input_Pos /= First_In_Data then - N := Is_Alnum (Data (Input_Pos - 1)); - end if; - - if Input_Pos > Last_In_Data then - Ln := False; - else - Ln := Is_Alnum (Data (Input_Pos)); - end if; - - if Op = BOUND then - if N = Ln then - exit State_Machine; - end if; - else - if N /= Ln then - exit State_Machine; - end if; - end if; - end; - - when SPACE => - exit State_Machine when Input_Pos > Last_In_Data - or else not Is_White_Space (Data (Input_Pos)); - Input_Pos := Input_Pos + 1; - - when NSPACE => - exit State_Machine when Input_Pos > Last_In_Data - or else Is_White_Space (Data (Input_Pos)); - Input_Pos := Input_Pos + 1; - - when DIGIT => - exit State_Machine when Input_Pos > Last_In_Data - or else not Is_Digit (Data (Input_Pos)); - Input_Pos := Input_Pos + 1; - - when NDIGIT => - exit State_Machine when Input_Pos > Last_In_Data - or else Is_Digit (Data (Input_Pos)); - Input_Pos := Input_Pos + 1; - - when ALNUM => - exit State_Machine when Input_Pos > Last_In_Data - or else not Is_Alnum (Data (Input_Pos)); - Input_Pos := Input_Pos + 1; - - when NALNUM => - exit State_Machine when Input_Pos > Last_In_Data - or else Is_Alnum (Data (Input_Pos)); - Input_Pos := Input_Pos + 1; - - when ANY => - exit State_Machine when Input_Pos > Last_In_Data - or else Data (Input_Pos) = ASCII.LF; - Input_Pos := Input_Pos + 1; - - when SANY => - exit State_Machine when Input_Pos > Last_In_Data; - Input_Pos := Input_Pos + 1; - - when EXACT => - declare - Opnd : Pointer := String_Operand (Scan); - Current : Positive := Input_Pos; - - Last : constant Pointer := - Opnd + String_Length (Program, Scan); - - begin - while Opnd <= Last loop - exit State_Machine when Current > Last_In_Data - or else Program (Opnd) /= Data (Current); - Current := Current + 1; - Opnd := Opnd + 1; - end loop; - - Input_Pos := Current; - end; - - when EXACTF => - declare - Opnd : Pointer := String_Operand (Scan); - Current : Positive := Input_Pos; - - Last : constant Pointer := - Opnd + String_Length (Program, Scan); - - begin - while Opnd <= Last loop - exit State_Machine when Current > Last_In_Data - or else Program (Opnd) /= To_Lower (Data (Current)); - Current := Current + 1; - Opnd := Opnd + 1; - end loop; - - Input_Pos := Current; - end; - - when ANYOF => - declare - Bitmap : Character_Class; - begin - Bitmap_Operand (Program, Scan, Bitmap); - exit State_Machine when Input_Pos > Last_In_Data - or else not Get_From_Class (Bitmap, Data (Input_Pos)); - Input_Pos := Input_Pos + 1; - end; - - when OPEN => - declare - No : constant Natural := - Character'Pos (Program (Operand (Scan))); - begin - Matches_Tmp (No) := Input_Pos; - end; - - when CLOSE => - declare - No : constant Natural := - Character'Pos (Program (Operand (Scan))); - - begin - Matches_Full (No) := (Matches_Tmp (No), Input_Pos - 1); - - if Last_Paren < No then - Last_Paren := No; - end if; - end; - - when REFF => - declare - No : constant Natural := - Character'Pos (Program (Operand (Scan))); - - Data_Pos : Natural; - - begin - -- If we haven't seen that parenthesis yet - - if Last_Paren < No then - return False; - end if; - - Data_Pos := Matches_Full (No).First; - - while Data_Pos <= Matches_Full (No).Last loop - if Input_Pos > Last_In_Data - or else Data (Input_Pos) /= Data (Data_Pos) - then - return False; - end if; - - Input_Pos := Input_Pos + 1; - Data_Pos := Data_Pos + 1; - end loop; - end; - - when MINMOD => - Greedy := False; - - when STAR | PLUS | CURLY => - declare - Greed : constant Boolean := Greedy; - begin - Greedy := True; - return Match_Simple_Operator (Op, Scan, Next, Greed); - end; - - when CURLYX => - - -- Looking at something like: - - -- 1: CURLYX {n,m} (->4) - -- 2: code for complex thing (->3) - -- 3: WHILEM (->0) - -- 4: NOTHING - - declare - Min : constant Natural := - Read_Natural (Program, Scan + 3); - Max : constant Natural := - Read_Natural (Program, Scan + 5); - Cc : aliased Current_Curly_Record; - - Has_Match : Boolean; - - begin - Cc := (Paren_Floor => Last_Paren, - Cur => -1, - Min => Min, - Max => Max, - Greedy => Greedy, - Scan => Scan + 7, - Next => Next, - Lastloc => 0, - Old_Cc => Current_Curly); - Current_Curly := Cc'Unchecked_Access; - - Has_Match := Match (Next - 3); - - -- Start on the WHILEM - - Current_Curly := Cc.Old_Cc; - return Has_Match; - end; - - when WHILEM => - return Match_Whilem (IP); - end case; - - Scan := Next; - end loop State_Machine; - - -- If we get here, there is no match. - -- For successful matches when EOP is the terminating point. - - return False; - end Match; - - --------------------------- - -- Match_Simple_Operator -- - --------------------------- - - function Match_Simple_Operator - (Op : Opcode; - Scan : Pointer; - Next : Pointer; - Greedy : Boolean) return Boolean - is - Next_Char : Character := ASCII.Nul; - Next_Char_Known : Boolean := False; - No : Integer; -- Can be negative - Min : Natural; - Max : Natural := Natural'Last; - Operand_Code : Pointer; - Old : Natural; - Last_Pos : Natural; - Save : constant Natural := Input_Pos; - - begin - -- Lookahead to avoid useless match attempts - -- when we know what character comes next. - - if Program (Next) = EXACT then - Next_Char := Program (String_Operand (Next)); - Next_Char_Known := True; - end if; - - -- Find the minimal and maximal values for the operator - - case Op is - when STAR => - Min := 0; - Operand_Code := Operand (Scan); - - when PLUS => - Min := 1; - Operand_Code := Operand (Scan); - - when others => - Min := Read_Natural (Program, Scan + 3); - Max := Read_Natural (Program, Scan + 5); - Operand_Code := Scan + 7; - end case; - - -- Non greedy operators - - if not Greedy then - - -- Test the minimal repetitions - - if Min /= 0 - and then Repeat (Operand_Code, Min) < Min - then - return False; - end if; - - Old := Input_Pos; - - -- Find the place where 'next' could work - - if Next_Char_Known then - -- Last position to check - - if Max = Natural'Last then - Last_Pos := Last_In_Data; - else - Last_Pos := Input_Pos + Max; - - if Last_Pos > Last_In_Data then - Last_Pos := Last_In_Data; - end if; - end if; - - -- Look for the first possible opportunity - - loop - -- Find the next possible position - - while Input_Pos <= Last_Pos - and then Data (Input_Pos) /= Next_Char - loop - Input_Pos := Input_Pos + 1; - end loop; - - if Input_Pos > Last_Pos then - return False; - end if; - - -- Check that we still match if we stop - -- at the position we just found. - - declare - Num : constant Natural := Input_Pos - Old; - - begin - Input_Pos := Old; - - if Repeat (Operand_Code, Num) < Num then - return False; - end if; - end; - - -- Input_Pos now points to the new position - - if Match (Get_Next (Program, Scan)) then - return True; - end if; - - Old := Input_Pos; - Input_Pos := Input_Pos + 1; - end loop; - - -- We know what the next character is - - else - while Max >= Min loop - - -- If the next character matches - - if Match (Next) then - return True; - end if; - - Input_Pos := Save + Min; - - -- Could not or did not match -- move forward - - if Repeat (Operand_Code, 1) /= 0 then - Min := Min + 1; - else - return False; - end if; - end loop; - end if; - - return False; - - -- Greedy operators - - else - No := Repeat (Operand_Code, Max); - - -- ??? Perl has some special code here in case the - -- next instruction is of type EOL, since $ and \Z - -- can match before *and* after newline at the end. - - -- ??? Perl has some special code here in case (paren) - -- is True. - - -- Else, if we don't have any parenthesis - - while No >= Min loop - if not Next_Char_Known - or else (Input_Pos <= Last_In_Data - and then Data (Input_Pos) = Next_Char) - then - if Match (Next) then - return True; - end if; - end if; - - -- Could not or did not work, we back up - - No := No - 1; - Input_Pos := Save + No; - end loop; - - return False; - end if; - end Match_Simple_Operator; - - ------------------ - -- Match_Whilem -- - ------------------ - - -- This is really hard to understand, because after we match what we - -- are trying to match, we must make sure the rest of the REx is going - -- to match for sure, and to do that we have to go back UP the parse - -- tree by recursing ever deeper. And if it fails, we have to reset - -- our parent's current state that we can try again after backing off. - - function Match_Whilem (IP : Pointer) return Boolean is - pragma Unreferenced (IP); - - Cc : constant Current_Curly_Access := Current_Curly; - N : constant Natural := Cc.Cur + 1; - Ln : Natural := 0; - - Lastloc : constant Natural := Cc.Lastloc; - -- Detection of 0-len - - begin - -- If degenerate scan matches "", assume scan done - - if Input_Pos = Cc.Lastloc - and then N >= Cc.Min - then - -- Temporarily restore the old context, and check that we - -- match was comes after CURLYX. - - Current_Curly := Cc.Old_Cc; - - if Current_Curly /= null then - Ln := Current_Curly.Cur; - end if; - - if Match (Cc.Next) then - return True; - end if; - - if Current_Curly /= null then - Current_Curly.Cur := Ln; - end if; - - Current_Curly := Cc; - return False; - end if; - - -- First, just match a string of min scans - - if N < Cc.Min then - Cc.Cur := N; - Cc.Lastloc := Input_Pos; - - if Match (Cc.Scan) then - return True; - end if; - - Cc.Cur := N - 1; - Cc.Lastloc := Lastloc; - return False; - end if; - - -- Prefer next over scan for minimal matching - - if not Cc.Greedy then - Current_Curly := Cc.Old_Cc; - - if Current_Curly /= null then - Ln := Current_Curly.Cur; - end if; - - if Recurse_Match (Cc.Next, Cc.Paren_Floor) then - return True; - end if; - - if Current_Curly /= null then - Current_Curly.Cur := Ln; - end if; - - Current_Curly := Cc; - - -- Maximum greed exceeded ? - - if N >= Cc.Max then - return False; - end if; - - -- Try scanning more and see if it helps - Cc.Cur := N; - Cc.Lastloc := Input_Pos; - - if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then - return True; - end if; - - Cc.Cur := N - 1; - Cc.Lastloc := Lastloc; - return False; - end if; - - -- Prefer scan over next for maximal matching - - if N < Cc.Max then -- more greed allowed ? - Cc.Cur := N; - Cc.Lastloc := Input_Pos; - - if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then - return True; - end if; - end if; - - -- Failed deeper matches of scan, so see if this one works - - Current_Curly := Cc.Old_Cc; - - if Current_Curly /= null then - Ln := Current_Curly.Cur; - end if; - - if Match (Cc.Next) then - return True; - end if; - - if Current_Curly /= null then - Current_Curly.Cur := Ln; - end if; - - Current_Curly := Cc; - Cc.Cur := N - 1; - Cc.Lastloc := Lastloc; - return False; - end Match_Whilem; - - ------------ - -- Repeat -- - ------------ - - function Repeat - (IP : Pointer; - Max : Natural := Natural'Last) return Natural - is - Scan : Natural := Input_Pos; - Last : Natural; - Op : constant Opcode := Opcode'Val (Character'Pos (Program (IP))); - Count : Natural; - C : Character; - Is_First : Boolean := True; - Bitmap : Character_Class; - - begin - if Max = Natural'Last or else Scan + Max - 1 > Last_In_Data then - Last := Last_In_Data; - else - Last := Scan + Max - 1; - end if; - - case Op is - when ANY => - while Scan <= Last - and then Data (Scan) /= ASCII.LF - loop - Scan := Scan + 1; - end loop; - - when SANY => - Scan := Last + 1; - - when EXACT => - - -- The string has only one character if Repeat was called - - C := Program (String_Operand (IP)); - while Scan <= Last - and then C = Data (Scan) - loop - Scan := Scan + 1; - end loop; - - when EXACTF => - - -- The string has only one character if Repeat was called - - C := Program (String_Operand (IP)); - while Scan <= Last - and then To_Lower (C) = Data (Scan) - loop - Scan := Scan + 1; - end loop; - - when ANYOF => - if Is_First then - Bitmap_Operand (Program, IP, Bitmap); - Is_First := False; - end if; - - while Scan <= Last - and then Get_From_Class (Bitmap, Data (Scan)) - loop - Scan := Scan + 1; - end loop; - - when ALNUM => - while Scan <= Last - and then Is_Alnum (Data (Scan)) - loop - Scan := Scan + 1; - end loop; - - when NALNUM => - while Scan <= Last - and then not Is_Alnum (Data (Scan)) - loop - Scan := Scan + 1; - end loop; - - when SPACE => - while Scan <= Last - and then Is_White_Space (Data (Scan)) - loop - Scan := Scan + 1; - end loop; - - when NSPACE => - while Scan <= Last - and then not Is_White_Space (Data (Scan)) - loop - Scan := Scan + 1; - end loop; - - when DIGIT => - while Scan <= Last - and then Is_Digit (Data (Scan)) - loop - Scan := Scan + 1; - end loop; - - when NDIGIT => - while Scan <= Last - and then not Is_Digit (Data (Scan)) - loop - Scan := Scan + 1; - end loop; - - when others => - raise Program_Error; - end case; - - Count := Scan - Input_Pos; - Input_Pos := Scan; - return Count; - end Repeat; - - --------- - -- Try -- - --------- - - function Try (Pos : Positive) return Boolean is - begin - Input_Pos := Pos; - Last_Paren := 0; - Matches_Full := (others => No_Match); - - if Match (Program_First + 1) then - Matches_Full (0) := (Pos, Input_Pos - 1); - return True; - end if; - - return False; - end Try; - - -- Start of processing for Match - - begin - -- Do we have the regexp Never_Match? - - if Self.Size = 0 then - Matches (0) := No_Match; - return; - end if; - - -- Check validity of program - - pragma Assert - (Program (Program_First) = MAGIC, - "Corrupted Pattern_Matcher"); - - -- If there is a "must appear" string, look for it - - if Self.Must_Have_Length > 0 then - declare - First : constant Character := Program (Self.Must_Have); - Must_First : constant Pointer := Self.Must_Have; - Must_Last : constant Pointer := - Must_First + Pointer (Self.Must_Have_Length - 1); - Next_Try : Natural := Index (First_In_Data, First); - - begin - while Next_Try /= 0 - and then Data (Next_Try .. Next_Try + Self.Must_Have_Length - 1) - = String (Program (Must_First .. Must_Last)) - loop - Next_Try := Index (Next_Try + 1, First); - end loop; - - if Next_Try = 0 then - Matches_Full := (others => No_Match); - return; -- Not present - end if; - end; - end if; - - -- Mark beginning of line for ^ - - BOL_Pos := Data'First; - - -- Simplest case first: an anchored match need be tried only once - - if Self.Anchored and then (Self.Flags and Multiple_Lines) = 0 then - Matched := Try (First_In_Data); - - elsif Self.Anchored then - declare - Next_Try : Natural := First_In_Data; - begin - -- Test the first position in the buffer - Matched := Try (Next_Try); - - -- Else only test after newlines - - if not Matched then - while Next_Try <= Last_In_Data loop - while Next_Try <= Last_In_Data - and then Data (Next_Try) /= ASCII.LF - loop - Next_Try := Next_Try + 1; - end loop; - - Next_Try := Next_Try + 1; - - if Next_Try <= Last_In_Data then - Matched := Try (Next_Try); - exit when Matched; - end if; - end loop; - end if; - end; - - elsif Self.First /= ASCII.NUL then - -- We know what char it must start with - - declare - Next_Try : Natural := Index (First_In_Data, Self.First); - - begin - while Next_Try /= 0 loop - Matched := Try (Next_Try); - exit when Matched; - Next_Try := Index (Next_Try + 1, Self.First); - end loop; - end; - - else - -- Messy cases: try all locations (including for the empty string) - - Matched := Try (First_In_Data); - - if not Matched then - for S in First_In_Data + 1 .. Last_In_Data loop - Matched := Try (S); - exit when Matched; - end loop; - end if; - end if; - - -- Matched has its value - - for J in Last_Paren + 1 .. Matches'Last loop - Matches_Full (J) := No_Match; - end loop; - - Matches := Matches_Full (Matches'Range); - return; - end Match; - - ----------- - -- Match -- - ----------- - - function Match - (Self : Pattern_Matcher; - Data : String; - Data_First : Integer := -1; - Data_Last : Positive := Positive'Last) return Natural - is - Matches : Match_Array (0 .. 0); - - begin - Match (Self, Data, Matches, Data_First, Data_Last); - if Matches (0) = No_Match then - return Data'First - 1; - else - return Matches (0).First; - end if; - end Match; - - function Match - (Self : Pattern_Matcher; - Data : String; - Data_First : Integer := -1; - Data_Last : Positive := Positive'Last) return Boolean - is - Matches : Match_Array (0 .. 0); - - begin - Match (Self, Data, Matches, Data_First, Data_Last); - return Matches (0).First >= Data'First; - end Match; - - procedure Match - (Expression : String; - Data : String; - Matches : out Match_Array; - Size : Program_Size := Auto_Size; - Data_First : Integer := -1; - Data_Last : Positive := Positive'Last) - is - PM : Pattern_Matcher (Size); - Finalize_Size : Program_Size; - - begin - if Size = 0 then - Match (Compile (Expression), Data, Matches, Data_First, Data_Last); - else - Compile (PM, Expression, Finalize_Size); - Match (PM, Data, Matches, Data_First, Data_Last); - end if; - end Match; - - ----------- - -- Match -- - ----------- - - function Match - (Expression : String; - Data : String; - Size : Program_Size := Auto_Size; - Data_First : Integer := -1; - Data_Last : Positive := Positive'Last) return Natural - is - PM : Pattern_Matcher (Size); - Final_Size : Program_Size; -- unused - - begin - if Size = 0 then - return Match (Compile (Expression), Data, Data_First, Data_Last); - else - Compile (PM, Expression, Final_Size); - return Match (PM, Data, Data_First, Data_Last); - end if; - end Match; - - ----------- - -- Match -- - ----------- - - function Match - (Expression : String; - Data : String; - Size : Program_Size := Auto_Size; - Data_First : Integer := -1; - Data_Last : Positive := Positive'Last) return Boolean - is - Matches : Match_Array (0 .. 0); - PM : Pattern_Matcher (Size); - Final_Size : Program_Size; -- unused - - begin - if Size = 0 then - Match (Compile (Expression), Data, Matches, Data_First, Data_Last); - else - Compile (PM, Expression, Final_Size); - Match (PM, Data, Matches, Data_First, Data_Last); - end if; - - return Matches (0).First >= Data'First; - end Match; - - ------------- - -- Operand -- - ------------- - - function Operand (P : Pointer) return Pointer is - begin - return P + 3; - end Operand; - - -------------- - -- Optimize -- - -------------- - - procedure Optimize (Self : in out Pattern_Matcher) is - Scan : Pointer; - Program : Program_Data renames Self.Program; - - begin - -- Start with safe defaults (no optimization): - -- * No known first character of match - -- * Does not necessarily start at beginning of line - -- * No string known that has to appear in data - - Self.First := ASCII.NUL; - Self.Anchored := False; - Self.Must_Have := Program'Last + 1; - Self.Must_Have_Length := 0; - - Scan := Program_First + 1; -- First instruction (can be anything) - - if Program (Scan) = EXACT then - Self.First := Program (String_Operand (Scan)); - - elsif Program (Scan) = BOL - or else Program (Scan) = SBOL - or else Program (Scan) = MBOL - then - Self.Anchored := True; - end if; - end Optimize; - - ----------------- - -- Paren_Count -- - ----------------- - - function Paren_Count (Regexp : Pattern_Matcher) return Match_Count is - begin - return Regexp.Paren_Count; - end Paren_Count; - - ----------- - -- Quote -- - ----------- - - function Quote (Str : String) return String is - S : String (1 .. Str'Length * 2); - Last : Natural := 0; - - begin - for J in Str'Range loop - case Str (J) is - when '^' | '$' | '|' | '*' | '+' | '?' | '{' | - '}' | '[' | ']' | '(' | ')' | '\' => - - S (Last + 1) := '\'; - S (Last + 2) := Str (J); - Last := Last + 2; - - when others => - S (Last + 1) := Str (J); - Last := Last + 1; - end case; - end loop; - - return S (1 .. Last); - end Quote; - - ------------------ - -- Read_Natural -- - ------------------ - - function Read_Natural - (Program : Program_Data; - IP : Pointer) return Natural - is - begin - return Character'Pos (Program (IP)) + - 256 * Character'Pos (Program (IP + 1)); - end Read_Natural; - - ----------------- - -- Reset_Class -- - ----------------- - - procedure Reset_Class (Bitmap : out Character_Class) is - begin - Bitmap := (others => 0); - end Reset_Class; - - ------------------ - -- Set_In_Class -- - ------------------ - - procedure Set_In_Class - (Bitmap : in out Character_Class; - C : Character) - is - Value : constant Class_Byte := Character'Pos (C); - begin - Bitmap (Value / 8) := Bitmap (Value / 8) - or Bit_Conversion (Value mod 8); - end Set_In_Class; - - ------------------- - -- String_Length -- - ------------------- - - function String_Length - (Program : Program_Data; - P : Pointer) return Program_Size - is - begin - pragma Assert (Program (P) = EXACT or else Program (P) = EXACTF); - return Character'Pos (Program (P + 3)); - end String_Length; - - -------------------- - -- String_Operand -- - -------------------- - - function String_Operand (P : Pointer) return Pointer is - begin - return P + 4; - end String_Operand; - -end GNAT.Regpat; +pragma No_Body; -- 2.30.2