casa  $Rev:20696$
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
Regex.h
Go to the documentation of this file.
00001 //# Regex.h: Regular expression class
00002 //# Copyright (C) 1993,1994,1995,1996,1997,1999,2000,2001,2003
00003 //# Associated Universities, Inc. Washington DC, USA.
00004 //#
00005 //# This library is free software; you can redistribute it and/or modify it
00006 //# under the terms of the GNU Library General Public License as published by
00007 //# the Free Software Foundation; either version 2 of the License, or (at your
00008 //# option) any later version.
00009 //#
00010 //# This library is distributed in the hope that it will be useful, but WITHOUT
00011 //# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
00012 //# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
00013 //# License for more details.
00014 //#
00015 //# You should have received a copy of the GNU Library General Public License
00016 //# along with this library; if not, write to the Free Software Foundation,
00017 //# Inc., 675 Massachusetts Ave, Cambridge, MA 02139, USA.
00018 //#
00019 //# Correspondence concerning AIPS++ should be addressed as follows:
00020 //#        Internet email: aips2-request@nrao.edu.
00021 //#        Postal address: AIPS++ Project Office
00022 //#                        National Radio Astronomy Observatory
00023 //#                        520 Edgemont Road
00024 //#                        Charlottesville, VA 22903-2475 USA
00025 //#
00026 //# $Id: Regex.h 20739 2009-09-29 01:15:15Z Malte.Marquarding $
00027 
00028 #ifndef CASA_REGEX_H
00029 #define CASA_REGEX_H
00030 
00031 //# Includes
00032 #include <casa/aips.h>
00033 #include <casa/BasicSL/RegexBase.h>
00034 #include <casa/iosfwd.h>
00035 
00036 namespace casa { //# NAMESPACE CASA - BEGIN
00037 
00038 //# Forward declarations; Regex only holds pointers to these types (members buf and reg).
00039 struct re_pattern_buffer;
00040 struct re_registers;
00041 
00042 // <summary>
00043 // Regular expression class
00044 // </summary>
00045 
00046 // <use visibility=export>
00047 
00048 // <reviewed reviewer="Friso Olnon" date="1995/03/20" tests="tRegex" demos="">
00049 // </reviewed>
00050 
00051 // <synopsis> 
00052 // This class provides regular expression functionality, such as
00053 // matching and searching in strings, comparison of expressions, and
00054 // input/output. It is built on the regular expression functions in the
00055 // GNU library (see files cregex.h and cregex.cc).
00056 // <br> Apart from proper regular expressions, it also supports glob patterns
00057 // (UNIX file name patterns) by means of a conversion to a proper regex.
00058 // Also ordinary strings can be converted to a proper regex.
00059 // <p>
00060 // cregex.cc supports many syntaxes. Regex supports
00061 // only one syntax, the extended regular expression with { and not \\{
00062 // as a special character. The special characters are:
00063 // <dl>
00064 //  <dt> ^
00065 //  <dd> matches the beginning of a line.
00066 //  <dt> $
00067 //  <dd> matches the end of a line.
00068 //  <dt> .
00069 //  <dd> matches any character
00070 //  <dt> *
00071 //  <dd> zero or more times the previous subexpression.
00072 //  <dt> +
00073 //  <dd> one or more times the previous subexpression.
00074 //  <dt> ?
00075 //  <dd> zero or one time the previous subexpression.
00076 //  <dt> {n,m}
00077 //  <dd> interval operator to specify how many times a subexpression
00078 //       can match. See man page of egrep or regexp for more detail.
00079 //  <dt> []
00080 //  <dd> matches any character inside the brackets; e.g. <src>[abc]</src>.
00081 //       A hyphen can be used for a character range; e.g. <src>[a-z]</src>.
00082 //       <br>
00083 //       A ^ right after the opening bracket indicates "not";
00084 //       e.g. <src>[^abc]</src> means any character but a, b, and c.
00085 //       If ^ is not the first character, it is a literal caret.
00086 //       If - is the last character, it is a literal hyphen.
00087 //       If ] is the first character, it is a literal closing bracket.
00088 //       <br>
00089 //       Special character classes are
00090 //       [:alpha:], [:upper:], [:lower:],  [:digit:], [:alnum:], [:xdigit:],
00091 //       [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:].
00092 //       The brackets are part of the name; e.g.
00093 //       <src>[^[:upper:]]</src> is equal to <src>[^A-Z]</src>.
00094 //       Note that [:upper:] is more portable, because A-Z fails
00095 //       for the EBCDIC character set.
00096 //  <dt> ( )
00097 //  <dd> grouping to change the normal operator precedence.
00098 //  <dt> |
00099 //  <dd> or operator. Matches left side or right side.
00100 //  <dt> \\1 till \\9
00101 //  <dd> backreference to a subexpression; matches text equal to what that subexpression matched.
00102 // </dl>
00103 // Special characters have to be escaped with a backslash to use them
00104 // literally. Only inside the square brackets, escaping should not be done.
00105 // See the man page of egrep or regexp for more information about
00106 // regular expressions.
00107 // <p>
00108 // Several global Regex objects are predefined for common functionality.
00109 // <dl>
00110 //  <dt> RXwhite
00111 //  <dd> one or more whitespace characters
00112 //  <dt> RXint
00113 //  <dd> integer number (also negative)
00114 //  <dt> RXdouble
00115 //  <dd> double number (with e or E as exponent)
00116 //  <dt> RXalpha
00117 //  <dd> one or more alphabetic characters (lowercase and/or uppercase)
00118 //  <dt> RXlowercase
00119 //  <dd> lowercase alphabetic
00120 //  <dt> RXuppercase
00121 //  <dd> uppercase alphabetic
00122 //  <dt> RXalphanum
00123 //  <dd> one or more alphabetic/numeric characters (lowercase and/or uppercase)
00124 //  <dt> RXidentifier
00125 //  <dd> identifier name (first alphabetic or underscore, then zero or
00126 //       more alphanumeric and/or underscores)
00127 // </dl>
00128 // The static member function <src>fromPattern</src> converts a shell-like
00129 // pattern to a String which can be used to create a Regex from it.
00130 // A pattern has the following special characters:
00131 // <dl>
00132 //  <dt> *
00133 //  <dd> Zero or more arbitrary characters.
00134 //  <dt> ?
00135 //  <dd> One arbitrary character
00136 //  <dt> []
00137 //  <dd> The same as [] in a regular expression (see above).
00138 //       In addition to ^ a ! can be used to indicate "not".
00139 //  <dt> {,}
00140 //  <dd> A brace expression which is like brace expansion in some shells.
00141 //       It is similar to the | construct in a regular expression.
00142 //       <br>
00143 //       E.g. <src>{abc,defg}</src> means <src>abc</src> or <src>defg</src>.
00144 //       Brace expressions can be nested and can contain other
00145 //       special characters.
00146 //       <br>
00147 //       E.g. St{Man*.{h,cc},Col?*.{h,cc,l,y}}
00148 //       <br>A literal comma or brace in a brace expression can be given by
00149 //       escaping it with a backslash.
00150 // </dl>
00151 // The static member function <src>fromSQLPattern</src> converts an SQL-like
00152 // pattern to a String which can be used to create a Regex from it.
00153 // A pattern has the following special characters:
00154 // <dl>
00155 //  <dt> %
00156 //  <dd> Zero or more arbitrary characters.
00157 //  <dt> _
00158 //  <dd> One arbitrary character
00159 // </dl>
00160 // The static member function <src>fromString</src> converts a normal
00161 // string to a regular expression. This function escapes characters in
00162 // the string which are special in a regular expression. In this way a
00163 // normal string can be passed to a function taking a regular expression.
00164 //
00165 // The static member function <src>makeCaseInsensitive</src> returns a
00166 // new regular expression string containing the case-insensitive version of
00167 // the given expression string.
00168 // </synopsis> 
00169 
00170 // <example>
00171 // <srcblock>
00172 // Regex RXwhite("[ \n\t\r\v\f]+", 1);
00173 //        (blank, newline, tab, return, vertical tab, formfeed)
00174 // Regex RXint("-?[0-9]+", 1);
00175 // Regex RXdouble("-?(([0-9]+\\.[0-9]*)|([0-9]+)|(\\.[0-9]+))([eE][+-]?[0-9]+)?", 1, 200);
00176 // Regex RXalpha("[A-Za-z]+", 1);
00177 // Regex RXlowercase("[a-z]+", 1);
00178 // Regex RXuppercase("[A-Z]+", 1);
00179 // Regex RXalphanum("[0-9A-Za-z]+", 1);
00180 // Regex RXidentifier("[A-Za-z_][A-Za-z0-9_]*", 1);
00181 // </srcblock>
00182 // In RXdouble the . is escaped via a backslash to get it literally.
00183 // The second backslash is needed to escape the backslash in C++.
00184 // <srcblock>
00185 // Regex rx1 (Regex::fromPattern ("St*.{h,cc}"));
00186 //            results in regexp "St.*\.((h)|(cc))"
00187 // Regex rx2 (Regex::fromString ("tRegex.cc"));
00188 //            results in regexp "tRegex\.cc"
00189 // </srcblock>
00190 // </example>
00191 
00192 // <todo asof="2001/07/15">
00193 //   <li> Let sgi ifdef go
00194 //   <li> Decide on documentation of GNU stuff (cregex.h, cregex.cc)
00195 // </todo>
00196 
00197 
00198 class Regex : public RegexBase {
00199 public:
00200     // Default constructor uses a zero-length regular expression.
00201     // <thrown>
00202     //  <li> invalid_argument
00203     // </thrown>
00204     Regex();
00205     
00206     // Construct a regular expression.
00207     // Optionally a fast map can be created, a buffer size can be given
00208     // and a translation table (of 256 chars) can be applied.
00209     // The translation table can, for instance, be used to map
00210     // lowercase characters to uppercase.
00211     // See cregex.cc (the extended regular expression matching and search
00212     // library) for detailed information.
00213     // <thrown>
00214     //  <li> invalid_argument
00215     // </thrown>
00216     Regex(const String &exp, Bool fast = False, Int sz = 40, 
00217           const Char *translation = 0);
00218 
00219     // Copy constructor (copy semantics).
00220     // <thrown>
00221     //  <li> invalid_argument
00222     // </thrown>
00223     Regex(const Regex &that);
00224     
00225     virtual ~Regex();
00226     
00227     // Assignment (copy semantics).
00228     // <thrown>
00229     //  <li> invalid_argument
00230     // </thrown>
00231     // <group>
00232     Regex &operator=(const Regex &that);
00233     Regex &operator=(const String &strng);
00234     // </group>
00235 
00236     // Convert a shell-like pattern to a regular expression.
00237     // This is useful for people who are more familiar with patterns
00238     // than with regular expressions.
00239     static String fromPattern(const String &pattern);
00240 
00241     // Convert an SQL-like pattern to a regular expression.
00242     // This is useful for TaQL, which mimics SQL.
00243     static String fromSQLPattern(const String &pattern);
00244 
00245     // Convert a normal string to a regular expression.
00246     // This consists of escaping the special characters.
00247     // This is useful when one wants to provide a normal string
00248     // (which may contain special characters) to a function working
00249     // on regular expressions.
00250     static String fromString(const String &strng);
00251 
00252     // Create a case-insensitive regular expression string from the given
00253     // regular expression string.
00254     // It does it by inserting the lowercase and uppercase version of
00255     // characters in the input string into the output string.
00256     static String makeCaseInsensitive (const String &strng);
00257 
00258     // Get the regular expression string.
00259     const String &regexp() const
00260       { return *str; }
00261     
00262     // Get the translation table (can be a zero pointer).
00263     const Char *transtable() const
00264       { return trans; }
00265     
00266     // Test if the regular expression matches (part of) string <src>s</src>.
00267     // The return value gives the length of the matching string part,
00268     // or String::npos if there is no match or an error.
00269     // The string has <src>len</src> characters and the test starts at
00270     // position <src>pos</src>. The string may contain null characters.
00271     // Negative p is allowed to match at end (TODO confirm: <src>pos</src> is a size_type here).
00272     //
00273     // <note role=tip>
00274     // Use the appropriate <linkto class=String>String</linkto> functions
00275     // to test if a string matches a regular expression. 
00276     // <src>Regex::match</src> is pretty low-level.
00277     // </note>
00278     virtual String::size_type match(const Char *s,
00279                                     String::size_type len,
00280                                     String::size_type pos=0) const;
00281     
00282     // Test if the regular expression occurs in string <src>s</src>.
00283     // The return value gives the position of the first substring
00284     // matching the regular expression. The length of that substring
00285     // is returned in <src>matchlen</src>.
00286     // The string has <src>len</src> characters and the test starts at
00287     // position <src>pos</src>. The string may contain null characters.
00288     // The search will do a reverse search if the pos given is less than 0.
00289     // <note role=tip>
00290     // Use the appropriate <linkto class=String>String</linkto> functions
00291     // to test if a regular expression occurs in a string.
00292     // <src>Regex::search</src> is pretty low-level.
00293     // </note>
00294     // <group>
00295     virtual String::size_type search(const Char *s, String::size_type len,
00296                                      Int &matchlen,
00297                                      Int pos=0) const;
00298     virtual String::size_type find(const Char *s, String::size_type len,
00299                                    Int &matchlen,
00300                                    String::size_type pos=0) const;
00301     // </group>
00302 
00303     // Return some internal <src>cregex</src> info.
00304     Int match_info(Int& start, Int& length, Int nth = 0) const;
00305 
00306     // Does it contain a valid Regex?
00307     Bool OK() const;
00308     
00309     // Write as ASCII.
00310     friend ostream &operator<<(ostream &ios, const Regex &exp);
00311     
00312 protected:
00313     String*            str;                 // the reg. exp. string
00314     Int                fastval;             // fast flag
00315     Int                bufsz;               // buffer size given
00316     Char*              trans;               // possible translation table
00317     re_pattern_buffer* buf;                 // compiled reg.exp.
00318     re_registers*      reg;                 // internal reg.exp. stuff
00319     
00320     // Compile the regular expression
00321     // <thrown>
00322     //  <li> invalid_argument
00323     // </thrown>
00324     void create(const String&, Int, Int, const Char*);
00325     
00326     // Deallocate the stuff allocated by <src>create</src>.
00327     void dealloc();
00328 };
00329 
00330 
00331 // Some built-in regular expressions (only declared here; the pattern of each is shown in its comment).
00332 
00333 extern const Regex RXwhite;          // = "[ \n\t\r\v\f]+"
00334 extern const Regex RXint;            // = "-?[0-9]+"
00335 extern const Regex RXdouble;         // = "-?(([0-9]+\\.[0-9]*)|
00336                                      //    ([0-9]+)|(\\.[0-9]+))
00337                                      //    ([eE][+-]?[0-9]+)?"
00338 extern const Regex RXalpha;          // = "[A-Za-z]+"
00339 extern const Regex RXlowercase;      // = "[a-z]+"
00340 extern const Regex RXuppercase;      // = "[A-Z]+"
00341 extern const Regex RXalphanum;       // = "[0-9A-Za-z]+"
00342 extern const Regex RXidentifier;     // = "[A-Za-z_][A-Za-z0-9_]*"
00343 
00344 
00345 } //# NAMESPACE CASA - END
00346 
00347 #endif