casa
$Rev:20696$
|
00001 //# Regex.h: Regular expression class 00002 //# Copyright (C) 1993,1994,1995,1996,1997,1999,2000,2001,2003 00003 //# Associated Universities, Inc. Washington DC, USA. 00004 //# 00005 //# This library is free software; you can redistribute it and/or modify it 00006 //# under the terms of the GNU Library General Public License as published by 00007 //# the Free Software Foundation; either version 2 of the License, or (at your 00008 //# option) any later version. 00009 //# 00010 //# This library is distributed in the hope that it will be useful, but WITHOUT 00011 //# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 00012 //# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public 00013 //# License for more details. 00014 //# 00015 //# You should have received a copy of the GNU Library General Public License 00016 //# along with this library; if not, write to the Free Software Foundation, 00017 //# Inc., 675 Massachusetts Ave, Cambridge, MA 02139, USA. 00018 //# 00019 //# Correspondence concerning AIPS++ should be addressed as follows: 00020 //# Internet email: aips2-request@nrao.edu. 00021 //# Postal address: AIPS++ Project Office 00022 //# National Radio Astronomy Observatory 00023 //# 520 Edgemont Road 00024 //# Charlottesville, VA 22903-2475 USA 00025 //# 00026 //# $Id: Regex.h 20739 2009-09-29 01:15:15Z Malte.Marquarding $ 00027 00028 #ifndef CASA_REGEX_H 00029 #define CASA_REGEX_H 00030 00031 //# Includes 00032 #include <casa/aips.h> 00033 #include <casa/BasicSL/RegexBase.h> 00034 #include <casa/iosfwd.h> 00035 00036 namespace casa { //# NAMESPACE CASA - BEGIN 00037 00038 //# Forward declarations. 00039 struct re_pattern_buffer; 00040 struct re_registers; 00041 00042 // <summary> 00043 // Regular expression class 00044 // </summary> 00045 00046 // <use visibility=export> 00047 00048 // <reviewed reviewer="Friso Olnon" date="1995/03/20" tests="tRegex" demos=""> 00049 // </reviewed> 00050 00051 // <synopsis> 00052 // This class provides regular expression functionality, such as 00053 // matching and searching in strings, comparison of expressions, and 00054 // input/output. It is built on the regular expression functions in the 00055 // GNU library (see files cregex.h and cregex.cc). 00056 // <br> Apart from proper regular expressions, it also supports glob patterns 00057 // (UNIX file name patterns) by means of a conversion to a proper regex. 00058 // Also ordinary strings can be converted to a proper regex. 00059 // <p> 00060 // cregex.cc supports many syntaxes. Regex supports 00061 // only one syntax, the extended regular expression with { and not \\{ 00062 // as a special character. The special characters are: 00063 // <dl> 00064 // <dt> ^ 00065 // <dd> matches the beginning of a line. 00066 // <dt> $ 00067 // <dd> matches the end of a line. 00068 // <dt> . 00069 // <dd> matches any character 00070 // <dt> * 00071 // <dd> zero or more times the previous subexpression. 00072 // <dt> + 00073 // <dd> one or more times the previous subexpression. 00074 // <dt> ? 00075 // <dd> zero or one time the previous subexpression. 00076 // <dt> {n,m} 00077 // <dd> interval operator to specify how many times a subexpression 00078 // can match. See man page of egrep or regexp for more detail. 00079 // <dt> [] 00080 // <dd> matches any character inside the brackets; e.g. <src>[abc]</src>. 00081 // A hyphen can be used for a character range; e.g. <src>[a-z]</src>. 00082 // <br> 00083 // A ^ right after the opening bracket indicates "not"; 00084 // e.g. <src>[^abc]</src> means any character but a, b, and c. 00085 // If ^ is not the first character, it is a literal caret. 00086 // If - is the last character, it is a literal hyphen. 00087 // If ] is the first character, it is a literal closing bracket. 00088 // <br> 00089 // Special character classes are 00090 // [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:], 00091 // [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:]. 00092 // The brackets are part of the name; e.g. 00093 // <src>[^[:upper:]]</src> is equal to <src>[^A-Z]</src>. 00094 // Note that [:upper:] is more portable, because A-Z fails 00095 // for the EBCDIC character set. 00096 // <dt> ( ) 00097 // <dd> grouping to change the normal operator precedence. 00098 // <dt> | 00099 // <dd> or operator. Matches left side or right side. 00100 // <dt> \\1 till \\9. Backreference to a subexpression. Matches part of string 00101 // equal to string part that matched the subexpression. 00102 // </dl> 00103 // Special characters have to be escaped with a backslash to use them 00104 // literally. Only inside the square brackets, escaping should not be done. 00105 // See the man page of egrep or regexp for more information about 00106 // regular expressions. 00107 // <p> 00108 // Several global Regex objects are predefined for common functionality. 00109 // <dl> 00110 // <dt> RXwhite 00111 // <dd> one or more whitespace characters 00112 // <dt> RXint 00113 // <dd> integer number (also negative) 00114 // <dt> RXdouble 00115 // <dd> double number (with e or E as exponent) 00116 // <dt> RXalpha 00117 // <dd> one or more alphabetic characters (lowercase and/or uppercase) 00118 // <dt> RXlowercase 00119 // <dd> lowercase alphabetic 00120 // <dt> RXuppercase 00121 // <dd> uppercase alphabetic 00122 // <dt> RXalphanum 00123 // <dd> one or more alphabetic/numeric characters (lowercase and/or uppercase) 00124 // <dt> RXidentifier 00125 // <dd> identifier name (first alphabetic or underscore, then zero or 00126 // more alphanumeric and/or underscores 00127 // </dl> 00128 // The static member function <src>fromPattern</src> converts a shell-like 00129 // pattern to a String which can be used to create a Regex from it. 00130 // A pattern has the following special characters: 00131 // <dl> 00132 // <dt> * 00133 // <dd> Zero or more arbitrary characters. 00134 // <dt> ? 00135 // <dd> One arbitrary character 00136 // <dt> [] 00137 // <dd> The same as [] in a regular expression (see above). 00138 // In addition to ^ a ! can be used to indicate "not". 00139 // <dt> {,} 00140 // <dd> A brace expression which is like brace expansion in some shells. 00141 // It is similar to the | construct in a regular expression. 00142 // <br> 00143 // E.g. <src>{abc,defg}</src> means <src>abc</src> or <src>defg</src>. 00144 // Brace expressions can be nested and can contain other 00145 // special characters. 00146 // <br> 00147 // E.g. St{Man*.{h,cc},Col?*.{h,cc,l,y}} 00148 // <br>A literal comma or brace in a brace expression can be given by 00149 // escaping it with a backslash. 00150 // </dl> 00151 // The static member function <src>fromSQLPattern</src> converts an SQL-like 00152 // pattern to a String which can be used to create a Regex from it. 00153 // A pattern has the following special characters: 00154 // <dl> 00155 // <dt> % 00156 // <dd> Zero or more arbitrary characters. 00157 // <dt> _ 00158 // <dd> One arbitrary character 00159 // </dl> 00160 // The static member function <src>fromString</src> converts a normal 00161 // string to a regular expression. This function escapes characters in 00162 // the string which are special in a regular expression. In this way a 00163 // normal string can be passed to a function taking a regular expression. 00164 // 00165 // The static member function <src>makeCaseInsensitive</src> returns a 00166 // new regular expression string containing the case-insensitive version of 00167 // the given expression string. 00168 // </synopsis> 00169 00170 // <example> 00171 // <srcblock> 00172 // Regex RXwhite("[ \n\t\r\v\f]+", 1); 00173 // (blank, newline, tab, return, vertical tab, formfeed) 00174 // Regex RXint("-?[0-9]+", 1); 00175 // Regex RXdouble("-?(([0-9]+\\.[0-9]*)|([0-9]+)|(\\.[0-9]+))([eE][+-]?[0-9]+)?", 1, 200); 00176 // Regex RXalpha("[A-Za-z]+", 1); 00177 // Regex RXlowercase("[a-z]+", 1); 00178 // Regex RXuppercase("[A-Z]+", 1); 00179 // Regex RXalphanum("[0-9A-Za-z]+", 1); 00180 // Regex RXidentifier("[A-Za-z_][A-Za-z0-9_]*", 1); 00181 // </srcblock> 00182 // In RXdouble the . is escaped via a backslash to get it literally. 00183 // The second backslash is needed to escape the backslash in C++. 00184 // <srcblock> 00185 // Regex rx1 (Regex::fromPattern ("St*.{h,cc}"); 00186 // results in regexp "St.*\.((h)|(cc))" 00187 // Regex rx2 (Regex::fromString ("tRegex.cc"); 00188 // results in regexp "tRegex\.cc" 00189 // </srcblock> 00190 // </example> 00191 00192 // <todo asof="2001/07/15"> 00193 // <li> Let sgi ifdef go 00194 // <li> Decide on documentation of GNU stuff (cregex.h, cregex.cc) 00195 // </todo> 00196 00197 00198 class Regex : public RegexBase { 00199 public: 00200 // Default constructor uses a zero-length regular expression. 00201 // <thrown> 00202 // <li> invalid_argument 00203 // </thrown> 00204 Regex(); 00205 00206 // Construct a regular expression. 00207 // Optionally a fast map can be created, a buffer size can be given 00208 // and a translation table (of 256 chars) can be applied. 00209 // The translation table can, for instance, be used to map 00210 // lowercase characters to uppercase. 00211 // See cregex.cc (the extended regular expression matching and search 00212 // library) for detailed information. 00213 // <thrown> 00214 // <li> invalid_argument 00215 // </thrown> 00216 Regex(const String &exp, Bool fast = False, Int sz = 40, 00217 const Char *translation = 0); 00218 00219 // Copy constructor (copy semantics). 00220 // <thrown> 00221 // <li> invalid_argument 00222 // </thrown> 00223 Regex(const Regex &that); 00224 00225 virtual ~Regex(); 00226 00227 // Assignment (copy semantics). 00228 // <thrown> 00229 // <li> invalid_argument 00230 // </thrown> 00231 // <group> 00232 Regex &operator=(const Regex &that); 00233 Regex &operator=(const String &strng); 00234 // </group> 00235 00236 // Convert a shell-like pattern to a regular expression. 00237 // This is useful for people who are more familiar with patterns 00238 // than with regular expressions. 00239 static String fromPattern(const String &pattern); 00240 00241 // Convert an SQL-like pattern to a regular expression. 00242 // This is useful TaQL which mimics SQL. 00243 static String fromSQLPattern(const String &pattern); 00244 00245 // Convert a normal string to a regular expression. 00246 // This consists of escaping the special characters. 00247 // This is useful when one wants to provide a normal string 00248 // (which may contain special characters) to a function working 00249 // on regular expressions. 00250 static String fromString(const String &strng); 00251 00252 // Create a case-insensitive reular expression string from the given 00253 // regular expression string. 00254 // It does it by inserting the lowercase and uppercase version of 00255 // characters in the input string into the output string. 00256 static String makeCaseInsensitive (const String &strng); 00257 00258 // Get the regular expression string. 00259 const String ®exp() const 00260 { return *str; } 00261 00262 // Get the translation table (can be a zero pointer). 00263 const Char *transtable() const 00264 { return trans; } 00265 00266 // Test if the regular expression matches (part of) string <src>s</src>. 00267 // The return value gives the length of the matching string part, 00268 // or String::npos if there is no match or an error. 00269 // The string has <src>len</src> characters and the test starts at 00270 // position <src>pos</src>. The string may contain null characters. 00271 // Negative p is allowed to match at end. 00272 // 00273 // <note role=tip> 00274 // Use the appropriate <linkto class=String>String</linkto> functions 00275 // to test if a string matches a regular expression. 00276 // <src>Regex::match</src> is pretty low-level. 00277 // </note> 00278 virtual String::size_type match(const Char *s, 00279 String::size_type len, 00280 String::size_type pos=0) const; 00281 00282 // Test if the regular expression occurs in string <src>s</src>. 00283 // The return value gives the position of the first substring 00284 // matching the regular expression. The length of that substring 00285 // is returned in <src>matchlen</src>. 00286 // The string has <src>len</src> characters and the test starts at 00287 // position <src>pos</src>. The string may contain null characters. 00288 // The search will do a reverse search if the pos given is less than 0. 00289 // <note role=tip> 00290 // Use the appropriate <linkto class=String>String</linkto> functions 00291 // to test if a regular expression occurs in a string. 00292 // <src>Regex::search</src> is pretty low-level. 00293 // </note> 00294 // <group> 00295 virtual String::size_type search(const Char *s, String::size_type len, 00296 Int &matchlen, 00297 Int pos=0) const; 00298 virtual String::size_type find(const Char *s, String::size_type len, 00299 Int &matchlen, 00300 String::size_type pos=0) const; 00301 // </group> 00302 00303 // Return some internal <src>cregex</src> info. 00304 Int match_info(Int& start, Int& length, Int nth = 0) const; 00305 00306 // Does it contain a valid Regex? 00307 Bool OK() const; 00308 00309 // Write as ASCII. 00310 friend ostream &operator<<(ostream &ios, const Regex &exp); 00311 00312 protected: 00313 String* str; // the reg. exp. string 00314 Int fastval; // fast flag 00315 Int bufsz; // buffer size given 00316 Char* trans; // possible translation table 00317 re_pattern_buffer* buf; // compiled reg.exp. 00318 re_registers* reg; // internal reg.exp. stuff 00319 00320 // Compile the regular expression 00321 // <thrown> 00322 // <li> invalid_argument 00323 // </thrown> 00324 void create(const String&, Int, Int, const Char*); 00325 00326 // Deallocate the stuff allocated by <src>create</src>. 00327 void dealloc(); 00328 }; 00329 00330 00331 // some built in regular expressions 00332 00333 extern const Regex RXwhite; // = "[ \n\t\r\v\f]+" 00334 extern const Regex RXint; // = "-?[0-9]+" 00335 extern const Regex RXdouble; // = "-?(([0-9]+\\.[0-9]*)| 00336 // ([0-9]+)|(\\.[0-9]+)) 00337 // ([eE][+-]?[0-9]+)?" 00338 extern const Regex RXalpha; // = "[A-Za-z]+" 00339 extern const Regex RXlowercase; // = "[a-z]+" 00340 extern const Regex RXuppercase; // = "[A-Z]+" 00341 extern const Regex RXalphanum; // = "[0-9A-Za-z]+" 00342 extern const Regex RXidentifier; // = "[A-Za-z_][A-Za-z0-9_]*" 00343 00344 00345 } //# NAMESPACE CASA - END 00346 00347 #endif