regexp.sa


Generated by gen_html_sa_files from ICSI. Contact gomes@icsi.berkeley.edu for details
 
------------------------->  GNU Sather - sourcefile  <-------------------------
-- Copyright (C) 2000 by K Hopper, University of Waikato, New Zealand        --
-- This file is part of the GNU Sather library. It is free software; you may --
-- redistribute  and/or modify it under the terms of the GNU Library General --
-- Public  License (LGPL)  as published  by the  Free  Software  Foundation; --
-- either version 2 of the license, or (at your option) any later version.   --
-- This  library  is distributed  in the  hope that it will  be  useful, but --
-- WITHOUT ANY WARRANTY without even the implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE. See Doc/LGPL for more details.       --
-- The license text is also available from:  Free Software Foundation, Inc., --
-- 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA                     --
-------------->  Please email comments to <bug-sather@gnu.org>  <--------------


immutable class ERE_LEX_TOKENS < $ENUMS{ERE_LEX_TOKENS}

immutable class ERE_LEX_TOKENS < $ENUMS{ERE_LEX_TOKENS} is -- This class provides a simple mapping facility from characters to -- lexical tokens. -- -- The associated resource file contains only a single character per -- line (after the first which contains the count of remaining lines). -- Version 1.0 May 97. Copyright K Hopper,U of Waikato -- Development History -- ------------------- -- Date Who By Detail -- ---- ------ ------ -- 30 May 97 kh Original for Extended Regular_Expressions include ENUM{ERE_LEX_TOKENS} ; private const val_count : CARD := 18 ; -- The next routines provide the enumeration itself. Left_Bracket : SAME is return enum(1) end ; Right_Bracket : SAME is return enum(2) end ; Left_Brace : SAME is return enum(3) end ; Right_Brace : SAME is return enum(4) end ; Left_Parenthesis : SAME is return enum(5) end ; Right_Parenthesis : SAME is return enum(6) end ; Left_Anchor : SAME is return enum(7) end ; Right_Anchor : SAME is return enum(8) end ; Minus_Sign : SAME is return enum(9) end ; Plus_Sign : SAME is return enum(10) end ; Fullstop : SAME is return enum(11) end ; Vertical_Line : SAME is return enum(12) end ; Asterisk : SAME is return enum(13) end ; Question_Mark : SAME is return enum(14) end ; Escape : SAME is return enum(15) end ; Colon : SAME is return enum(16) end ; Equals_Mark : SAME is return enum(17) end ; Comma : SAME is return enum(18) end ; token( ch : CHAR ) : SAME is -- This routine returns the token value corresponding to the given -- character -- or void if there is no match. return create(ch.str); end ; end ; -- ERE_LEX_TOKENS

immutable class ERE_ERROR_KINDS < $ENUMS{ERE_ERROR_KINDS}

immutable class ERE_ERROR_KINDS < $ENUMS{ERE_ERROR_KINDS} is -- This class provides all of the different kinds of error which may -- may be detected when parsing regular expressions. -- Version 1.0 May 97. Copyright K Hopper,U of Waikato -- Development History -- ------------------- -- Date Who By Detail -- ---- ------ ------ -- 30 May 97 kh Original for Extended Regular_Expressions include ENUM{ERE_ERROR_KINDS} ; const val_count : CARD := 11 ; -- The next routines provide the enumeration itself. Heading : SAME is return enum(1) end ; Trailing_Chars : SAME is return enum(2) end ; Unexpected_Char : SAME is return enum(3) end ; Missing_Brace : SAME is return enum(4) end ; Missing_Dup_No : SAME is return enum(5) end ; Missing_Right_Paren : SAME is return enum(6) end ; Missing_Right_Bracket : SAME is return enum(7) end ; Escape_Last : SAME is return enum(8) end ; Empty_Brackets : SAME is return enum(9) end ; Unexpected_Termination : SAME is return enum(10) end ; Invalid_Class : SAME is return enum(11) end ; end ; -- ERE_ERROR_KINDS

class ERE_ERRORS

class ERE_ERRORS is -- This class provides all of the error reporting (exception raising) -- needed when creating extended regular expressions. It only has the single -- feature - error. -- Version 1.0 May 97. Copyright K Hopper, U of Waikato -- Development History -- ------------------- -- Date Who By Detail -- ---- ------ ------ -- 30 May 97 kh Original for Extended Regular_Expressions error( index : ERE_ERROR_KINDS, val : $FMT ) pre ~index.is_nil and (index /= ERE_ERROR_KINDS::Heading) post false -- Well! an exception is actually raised!! is -- This routine raises an exception when an error is found during -- parsing the extended regular expression. raise FMT::create(ERE_ERROR_KINDS::Heading.str + index.str,val).str end ; end ; -- ERE_ERRORS

class ONE_CHAR_ERE

class ONE_CHAR_ERE is -- This class embodies the concept of a single 'character' Extended -- Regular Expression which may indeed correspond to a single character, -- or be a bracketed expression from which one 'character' must be 'equal' -- to the current source stream character for a match to succeed. -- NOTE The implementation provided in this class is a strictly conforming -- implementation as specified in ISO/IEC 9945-2 and does NOT, therefore, -- support range expressions! -- Version 1.0 May 97. Copyright K Hopper,U of Waikato -- Development History -- ------------------- -- Date Who By Detail -- ---- ------ ------ -- 30 May 97 kh Original for Extended Regular_Expressions private const Any_Char : CHAR := CHAR::create( CONTROL_CODES::PADDING_CHARACTER,LIBCHARS::default) ; private attr char : CHAR ; private attr bracketed : BOOL ; -- ie NOT a single char. private attr matching : BOOL ; -- ie NOT inverse matching -- The following components are non-void only if bracketed is true! -- The bracketed expression may be a list of either characters, including -- char_names as defined in ISO/IEC 14652, or character classes (see class -- ERE_CLASSES). private attr literal : FLIST{CHAR} ; private attr chclass : CHAR_CLASS ; private bracketed_expression( str_index : STR_CURSOR ) : SAME is -- This private routine handles the contents of a bracketed regular -- expression during creation of a one char ERE. str_index.advance ; -- over left anchor/bracket! res : SAME := self ; -- usually recursive : BOOL := false ; case ERE_LEX_TOKENS::token(str_index.item) when ERE_LEX_TOKENS::Left_Bracket then -- this could be a class! recursive := true ; res := bracketed_expression(str_index) ; when ERE_LEX_TOKENS::Right_Bracket then ERE_ERRORS::error(ERE_ERROR_KINDS::Empty_Brackets,str_index.index) when ERE_LEX_TOKENS::Left_Anchor then -- here it signifies non-matching matching := false ; -- NOTE The caret is left in the stream where it will -- be skipped by the following recursive call. recursive := true ; res := bracketed_expression(str_index) ; when ERE_LEX_TOKENS::Colon then str_index.advance ; loc_ch : CHAR := ERE_LEX_TOKENS::Colon.str[0] ; name : STR := str_index.get_upto_char(loc_ch) ; str_index.advance ; -- over colon recursive := false ; chclass := CHAR_CLASS::create(name) ; if chclass.is_nil then ERE_ERRORS::error(ERE_ERROR_KINDS::Invalid_Class,name) end when ERE_LEX_TOKENS::Fullstop then str_index.advance ; loc_ch : CHAR := ERE_LEX_TOKENS::Fullstop.str[0] ; name : STR := str_index.get_upto_char(loc_ch) ; str_index.advance ; -- over the second full stop! recursive := false ; char := CHAR::build(name.binstr.cursor,CULTURE::default.sather_lib) when ERE_LEX_TOKENS::Equals_Mark then else -- ordinary bracketed chars literal := FLIST{CHAR}::create ; loop -- over chars in brackets if ERE_LEX_TOKENS::token(str_index.item) = ERE_LEX_TOKENS::Escape then str_index.advance ; if str_index.is_done then str_index.retract ; ERE_ERRORS::error( ERE_ERROR_KINDS::Escape_Last,str_index.index) end end ; literal := literal.push(str_index.get_item) ; if str_index.is_done then ERE_ERRORS::error(ERE_ERROR_KINDS::Missing_Right_Bracket, str_index.index) else if ERE_LEX_TOKENS::token(str_index.item) = ERE_LEX_TOKENS::Right_Bracket then str_index.advance ; break! end end end ; return res end ; if str_index.is_done then ERE_ERRORS::error( ERE_ERROR_KINDS::Missing_Right_Bracket,str_index.index) ; return res -- never reached -- error raised!! else if ERE_LEX_TOKENS::token(str_index.item) = ERE_LEX_TOKENS::Right_Bracket then str_index.advance end ; return res end end ; create( str_index : STR_CURSOR ) : SAME is -- This routine creates a single matching expression, either literal or -- bracketed. me : SAME := new ; me.matching := true ; -- assumption! case ERE_LEX_TOKENS::token(str_index.item) when ERE_LEX_TOKENS::Escape then str_index.advance ; -- over the escape char me.char := str_index.get_item when ERE_LEX_TOKENS::Fullstop then str_index.advance ; -- over the full stop me.char := Any_Char -- the 'wild card' char! when ERE_LEX_TOKENS::Left_Bracket then -- it's a bracketed expression me.bracketed := true ; return me.bracketed_expression(str_index) ; else -- just one ordinary char me.char := str_index.get_item ; end ; return me end ; matches_any : BOOL is -- This predicate returns true if and only if this object matches any -- character! return char = Any_Char end ; matches( str_index : STR_CURSOR ) : BOOL is -- This predicate returns true if and only if the stream item at the -- current position matches the expression self. if str_index.is_done then -- nothing to check!! return false end ; if ~chclass.is_nil then res : BOOL := chclass.contains(str_index.item) ; if res then str_index.advance end ; return res elsif bracketed then if matching = literal.contains(str_index.item) then str_index.advance ; return true else return false end else -- just a single char match! if (str_index.item = char) or (char = Any_Char) then str_index.advance ; return true else return false end end end ; end ; -- ONE_CHAR_ERE

class ONE_ERE

class ONE_ERE is -- This class embodies the concept of a single extended regular -- expression which may either be a single character expression, a -- parenthesised expression or a duplicated expression of either. -- Version 1.0 May 97. Copyright K Hopper,U of Waikato -- Development History -- ------------------- -- Date Who By Detail -- ---- ------ ------ -- 30 May 97 kh Original for Extended Regular_Expressions private attr single : ONE_CHAR_ERE ; -- This only if not parenthesised private attr parenthesised : REG_EXP ; -- parenthesised form private const -- private enum for dup kinds None, Any_Num, One_Plus, Zero_One, Exact_Count, At_Least, Range ; private attr dup_kind : CARD ; -- as needed. private attr low, high : CARD ; private const -- a private enum for variant One_Char, Parenthesised, Duplicated ; private const Number_Base : CARD := 10 ; private attr variant : CARD ; private duplicate_expression( str_index : STR_CURSOR ) is -- This private routine handles the duploication expression which may -- be found in the place of the ordinary single character duplication -- symbols. str_index.advance ; -- over left brace! if str_index.is_done then ERE_ERRORS::error(ERE_ERROR_KINDS::Unexpected_Termination, str_index.item) end ; variant := Duplicated ; if CHAR_CLASS::Digit.contains(str_index.item) then low := 0 ; loop -- needed - comma MAY be ignored if CHAR_CLASS::Digit.contains(str_index.item) then low := low * Number_Base + LIBCHARS::default.card(str_index.get_item) else break! end end ; dup_kind := Exact_Count -- so far anyway! else ERE_ERRORS::error(ERE_ERROR_KINDS::Missing_Dup_No,str_index.item) end ; case ERE_LEX_TOKENS::token(str_index.item) when ERE_LEX_TOKENS::Comma then -- another number perhaps? str_index.advance ; if str_index.is_done then ERE_ERRORS::error( ERE_ERROR_KINDS::Unexpected_Termination,str_index.item) end ; dup_kind := At_Least ; if ERE_LEX_TOKENS::token(str_index.item) = ERE_LEX_TOKENS::Right_Brace then str_index.advance else -- might be a second number! if CHAR_CLASS::Digit.contains(str_index.item) then high := str_index.card ; dup_kind := Range else ERE_ERRORS::error(ERE_ERROR_KINDS::Missing_Dup_No, str_index.item) end ; if str_index.is_done then ERE_ERRORS::error(ERE_ERROR_KINDS::Unexpected_Termination, str_index.item) elsif ERE_LEX_TOKENS::token(str_index.item) = ERE_LEX_TOKENS::Right_Brace then str_index.advance else ERE_ERRORS::error( ERE_ERROR_KINDS::Missing_Brace,str_index.item) end end when ERE_LEX_TOKENS::Right_Brace then -- just a single number in braces str_index.advance else ERE_ERRORS::error(ERE_ERROR_KINDS::Unexpected_Char,str_index.item) end end ; create( str_index : STR_CURSOR ) : SAME is -- This routine creates a single extended regular expression from -- the source stream provided, otherwise, if the stream is invalid, raises -- a string exception. me : SAME := new ; if ERE_LEX_TOKENS::token(str_index.item) = ERE_LEX_TOKENS::Left_Parenthesis then str_index.advance ; -- over the parenthesis me.parenthesised := REG_EXP::make_reg_exp(str_index,true) ; if ERE_LEX_TOKENS::token(str_index.item) = ERE_LEX_TOKENS::Right_Parenthesis then me.variant := Parenthesised ; str_index.advance else ERE_ERRORS::error( ERE_ERROR_KINDS::Missing_Right_Paren,str_index.index) end elsif ERE_LEX_TOKENS::token(str_index.item) = ERE_LEX_TOKENS::Vertical_Line then return me else me.single := ONE_CHAR_ERE::create(str_index) end ; if str_index.is_done then return me end ; case ERE_LEX_TOKENS::token(str_index.item) -- duplication tokens/exprn when ERE_LEX_TOKENS::Asterisk then me.dup_kind := Any_Num ; str_index.advance when ERE_LEX_TOKENS::Plus_Sign then me.dup_kind := One_Plus ; str_index.advance when ERE_LEX_TOKENS::Question_Mark then me.dup_kind := Zero_One ; str_index.advance when ERE_LEX_TOKENS::Left_Brace then me.duplicate_expression(str_index) else -- Nothing to do end ; return me end ; matches( str_index : STR_CURSOR ) : BOOL is -- This predicate returns true if and only if the stream provided matches -- the expression contained in this object, setting the cursor to the next -- character beyond the end of the matched string portion. If false is -- returned then str_index is not changed. cnt : CARD := 0 ; matched : BOOL ; start_index : CARD ; -- for backtrack when paren fails loop if void(single) then start_index := str_index.index ; matched := parenthesised.matches(str_index) ; if ~matched then str_index.set_index(start_index) end else matched := single.matches(str_index) end ; if matched then cnt := cnt + 1 end ; success : BOOL ; case dup_kind when None then return matched when Any_Num then success := true when One_Plus then success := (cnt > 0) when Zero_One then success := (cnt < 2) when Exact_Count then success := (cnt = low) when At_Least then success := (cnt >= low) when Range then success := ((cnt >= low) and (cnt <= high)) else end ; if str_index.is_done or ~matched then return success end end end ; end ; -- ONE_ERE

class NON_ANCHORED_ERE

class NON_ANCHORED_ERE is -- This class embodies the concept of a single extended regular -- expression which is not anchored. It may contain a list of single -- expressions. -- Version 1.0 May 97. Copyright K Hopper,U of Waikato -- Development History -- ------------------- -- Date Who By Detail -- ---- ------ ------ -- 30 May 97 kh Original for Extended Regular_Expressions private attr expr : FLIST{ONE_ERE} ; create( str_index : STR_CURSOR, paren : BOOL ) : SAME is -- This routine creates a (list of) non-anchored extended regular -- expressions from the source stream provided, otherwise, if the stream -- is invalid, raises a string exception. -- -- Note that this routine is only entered if the input stream is -- non-empty. me : SAME := new ; me.expr := FLIST{ONE_ERE}::create ; loop case ERE_LEX_TOKENS::token(str_index.item) when ERE_LEX_TOKENS::Vertical_Line then break! when ERE_LEX_TOKENS::Right_Anchor then break! when ERE_LEX_TOKENS::Right_Parenthesis then if paren then break! end else end ; me.expr := me.expr.push(ONE_ERE::create(str_index)) ; if str_index.is_done then break! end end ; return me end ; matches( str_index : STR_CURSOR ) : BOOL is -- This predicate returns true if and only if the stream provided matches -- the list of expressions contained in this object, setting the cursor to -- the character beyond the end of the matched string portion. If false is -- returned then str_index is not changed, loop loc_index : CARD := 0.upto!(expr.size - 1) ; if ~expr[loc_index].matches(str_index) then return false end end ; return true end ; end ; -- NON_ANCHORED_ERE

class EXT_REG_EXP

class EXT_REG_EXP is -- This class embodies the concept of a single extended regular -- expression which may or may not be anchored. The class parameter is -- expected to be a text string type. -- Version 1.0 May 97. Copyright K Hopper,U of Waikato -- Development History -- ------------------- -- Date Who By Detail -- ---- ------ ------ -- 30 May 97 kh Original for Extended Regular_Expressions private attr expr : NON_ANCHORED_ERE ; -- MAY be void! -- One or both of the following may be true if this expression is -- anchored! private attr start : BOOL ; private attr finish : BOOL ; create(str_index : STR_CURSOR, paren : BOOL ) : SAME is -- This routine creates a (list of) non-anchored extended regular -- expressions from the source stream provided, otherwise, if the stream -- is invalid, raises a string exception. If paren is true then detection of -- it in the stream (unescaped) terminates an expression. me : SAME := new ; itm::=str_index.item; tok::=ERE_LEX_TOKENS::token(itm); tok2::=ERE_LEX_TOKENS::Left_Anchor; if tok = tok2 then me.start := true ; str_index.advance; end ; if str_index.is_done then ERE_ERRORS::error(ERE_ERROR_KINDS::Unexpected_Termination,str_index.item) elsif ERE_LEX_TOKENS::token(str_index.item) = ERE_LEX_TOKENS::Right_Anchor then me.finish := true ; -- empty expression str_index.advance; else -- there IS an expression me.expr := NON_ANCHORED_ERE::create(str_index,paren) ; if str_index.is_done then me.finish := false elsif (ERE_LEX_TOKENS::token(str_index.item) = ERE_LEX_TOKENS::Right_Anchor) then me.finish := true ; str_index.advance ; if ~(str_index.is_done or (ERE_LEX_TOKENS::token(str_index.item) = ERE_LEX_TOKENS::Vertical_Line) ) then ERE_ERRORS::error(ERE_ERROR_KINDS::Trailing_Chars, str_index.index) end end end ; return me end ; matches(str_index : STR_CURSOR ) : BOOL is -- This predicate returns true if and only if the stream provided matches -- the list of expressions contained in this object, setting the cursor to -- the character beyond the end of the matched string portion. If false is -- returned then str_index is not changed, if start then if ~(str_index.index = 0) then -- No match return false end end ; if str_index.is_done then -- empty string -- OK? return void(expr) -- Yes if expr also void! else if void(expr) or expr.matches(str_index) then if finish then return str_index.is_done -- Anchored at end? else return true end else return false end end end ; end ; -- EXT_REG_EXP

class REG_EXP < $BINARY

class REG_EXP < $BINARY is -- This class provides POSIX-style string pattern matching, used, for -- example, when matching file names in path searching. It provides a match -- operation to find out if a test string matches the extended regular -- expression. -- -- An extended regular expression consists of one or more of the -- following kinds of components :-- -- -- a. Collating Elements -- any single character that is not -- a meta-character -- b. Duplication count -- a numeric constant -- c. Meta-characters -- ^ - ] -- d. Left Anchor -- ^ meaning the beginning of the expression -- e. Ordinary characters -- f. Quoted characters -- escaped (using a reverse solidus) ^.*[$\ -- g. Right Anchor -- $ meaning the end of the expression. -- h. Special Characters -- .\[^$*()|?{+ -- The syntax of extended regular expressions is defined in the POSIX -- standard ISO/IEC 9945-2:1995 to which reference should be made in -- understanding the following code. -- Version 1.3 May 97. Copyright K Hopper,U of Waikato -- Development History -- ------------------- -- Date Who By Detail -- ---- ------ ------ -- 10 Jun 96 kh Original -- 19 Feb 97 kh Additions for string/char portability. -- 10 Apr 97 kh Modified for INT to CARD, etc -- 29 May 97 kh Parameterised for internationalisation, -- and extended to Extended Regular Expressions. -- 9 Dec 99 kh Revised matching to use pattern lib. include BINARY ; private attr pattern : STR ; -- retained for output to file private attr expr : FLIST{EXT_REG_EXP} ; -- all alternatives! make_reg_exp( str_index : STR_CURSOR, recursive_call : BOOL ) : SAME is -- This routine creates a new regular expression. Its primary purpose -- is to handle expression alternatives and recursive calling! Note that -- it is not necessary to provide a value for pattern in this routine. res : SAME := new ; res.expr := FLIST{EXT_REG_EXP}::create ; loop exp::=EXT_REG_EXP::create(str_index,recursive_call); res.expr := res.expr.push(exp) ; if str_index.is_done or (ERE_LEX_TOKENS::token(str_index.item) = ERE_LEX_TOKENS::Right_Parenthesis) then break! end ; if (ERE_LEX_TOKENS::token(str_index.item) = ERE_LEX_TOKENS::Vertical_Line) then str_index.advance end end ; return res end ; create( inout str : STR ) : SAME is -- This routine creates a new regular expression pattern object unless -- an error has been detected, when void is returned and the parameter set -- to contain the error message. protect me : SAME := make_reg_exp(str.cursor,false) ; me.pattern := str ; return me when STR then str := exception.str ; return void end -- protect! end ; build( index : BIN_CURSOR ) : SAME pre ~void(index) and ~index.is_done post true is -- This routine creates a new regular expression from the binary string -- cursor at the current position. pattern : STR := index.get_sized.str ; if void(pattern) then return void; end; -- nothing there return create(inout pattern) end ; binstr : BINSTR pre ~void(self) post (result.size = (pattern.size + 1)) is -- This routine returns a binary string representation of self suitable -- for filing and re-building! loc_res : BINSTR := pattern.binstr ; return BINSTR::create + OCTET::create(loc_res.size) + loc_res end ; matches( test_cursor : STR_CURSOR ) : BOOL is -- This predicate returns true iff the regular expression matches -- the test cursor, starting at the current buffer position. start_index : CARD ; loop -- over all alternatives! if test_cursor.is_done then return false else start_index := test_cursor.index end ; if expr.elt!.matches(test_cursor) then return true else test_cursor.set_index(start_index) ; -- try another alternative end end ; return false end ; matches( test_str : STR ) : BOOL is -- This predicate converts the test string to be the same encodings -- as the expression pattern if needed, before creating a string cursor and -- invoking the above matching routine. str_index : STR_CURSOR ; if pattern.index_lib = test_str.index_lib then str_index := test_str.cursor else str_index := test_str.convert(pattern.index_lib).cursor end ; return matches(str_index) end ; end ; -- REG_EXP