casa  5.7.0-16
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Regex.h
Go to the documentation of this file.
1 //# Regex.h: Regular expression class
2 //# Copyright (C) 1993,1994,1995,1996,1997,1999,2000,2001,2003
3 //# Associated Universities, Inc. Washington DC, USA.
4 //#
5 //# This library is free software; you can redistribute it and/or modify it
6 //# under the terms of the GNU Library General Public License as published by
7 //# the Free Software Foundation; either version 2 of the License, or (at your
8 //# option) any later version.
9 //#
10 //# This library is distributed in the hope that it will be useful, but WITHOUT
11 //# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 //# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
13 //# License for more details.
14 //#
15 //# You should have received a copy of the GNU Library General Public License
16 //# along with this library; if not, write to the Free Software Foundation,
17 //# Inc., 675 Massachusetts Ave, Cambridge, MA 02139, USA.
18 //#
19 //# Correspondence concerning AIPS++ should be addressed as follows:
20 //# Internet email: aips2-request@nrao.edu.
21 //# Postal address: AIPS++ Project Office
22 //# National Radio Astronomy Observatory
23 //# 520 Edgemont Road
24 //# Charlottesville, VA 22903-2475 USA
25 //#
26 //# $Id$
27 
28 #ifndef CASA_REGEX_H
29 #define CASA_REGEX_H
30 
31 //# Includes
32 #include <casacore/casa/aips.h>
34 #include <casacore/casa/iosfwd.h>
35 
36 namespace casacore { //# NAMESPACE CASACORE - BEGIN
37 
38 //# Forward declarations of the cregex.h structures that Regex holds only by pointer.
39 struct re_pattern_buffer;
40 struct re_registers;
41 
42 // <summary>
43 // Regular expression class
44 // </summary>
45 
46 // <use visibility=export>
47 
48 // <reviewed reviewer="Friso Olnon" date="1995/03/20" tests="tRegex" demos="">
49 // </reviewed>
50 
51 // <synopsis>
52 // This class provides regular expression functionality, such as
53 // matching and searching in strings, comparison of expressions, and
54 // input/output. It is built on the regular expression functions in the
55 // GNU library (see files cregex.h and cregex.cc).
56 // <br> Apart from proper regular expressions, it also supports glob patterns
57 // (UNIX file name patterns) by means of a conversion to a proper regex.
58 // Also ordinary strings can be converted to a proper regex.
59 // <p>
60 // cregex.cc supports many syntaxes. Regex supports
61 // only one syntax, the extended regular expression with { and not \\{
62 // as a special character. The special characters are:
63 // <dl>
64 // <dt> ^
65 // <dd> matches the beginning of a line.
66 // <dt> $
67 // <dd> matches the end of a line.
68 // <dt> .
69 // <dd> matches any character
70 // <dt> *
71 // <dd> zero or more times the previous subexpression.
72 // <dt> +
73 // <dd> one or more times the previous subexpression.
74 // <dt> ?
75 // <dd> zero or one time the previous subexpression.
76 // <dt> {n,m}
77 // <dd> interval operator to specify how many times a subexpression
78 // can match. See man page of egrep or regexp for more detail.
79 // <dt> []
80 // <dd> matches any character inside the brackets; e.g. <src>[abc]</src>.
81 // A hyphen can be used for a character range; e.g. <src>[a-z]</src>.
82 // <br>
83 // A ^ right after the opening bracket indicates "not";
84 // e.g. <src>[^abc]</src> means any character but a, b, and c.
85 // If ^ is not the first character, it is a literal caret.
86 // If - is the last character, it is a literal hyphen.
87 // If ] is the first character, it is a literal closing bracket.
88 // <br>
89 // Special character classes are
90 // [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:],
91 // [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:].
92 // The brackets are part of the name; e.g.
93 // <src>[^[:upper:]]</src> is equal to <src>[^A-Z]</src>.
94 // Note that [:upper:] is more portable, because A-Z fails
95 // for the EBCDIC character set.
96 // <dt> ( )
97 // <dd> grouping to change the normal operator precedence.
98 // <dt> |
99 // <dd> or operator. Matches left side or right side.
100 // <dt> \\1 till \\9 <dd> Backreference to a subexpression. Matches the part of the
101 // string equal to the string part that matched the subexpression.
102 // </dl>
103 // Special characters have to be escaped with a backslash to use them
104 // literally. Only inside the square brackets, escaping should not be done.
105 // See the man page of egrep or regexp for more information about
106 // regular expressions.
107 // <p>
108 // Several global Regex objects are predefined for common functionality.
109 // <dl>
110 // <dt> RXwhite
111 // <dd> one or more whitespace characters
112 // <dt> RXint
113 // <dd> integer number (also negative)
114 // <dt> RXdouble
115 // <dd> double number (with e or E as exponent)
116 // <dt> RXalpha
117 // <dd> one or more alphabetic characters (lowercase and/or uppercase)
118 // <dt> RXlowercase
119 // <dd> lowercase alphabetic
120 // <dt> RXuppercase
121 // <dd> uppercase alphabetic
122 // <dt> RXalphanum
123 // <dd> one or more alphabetic/numeric characters (lowercase and/or uppercase)
124 // <dt> RXidentifier
125 // <dd> identifier name (first alphabetic or underscore, then zero or
126 // more alphanumeric and/or underscores)
127 // </dl>
128 // The static member function <src>fromPattern</src> converts a shell-like
129 // pattern to a String which can be used to create a Regex from it.
130 // A pattern has the following special characters:
131 // <dl>
132 // <dt> *
133 // <dd> Zero or more arbitrary characters.
134 // <dt> ?
135 // <dd> One arbitrary character
136 // <dt> []
137 // <dd> The same as [] in a regular expression (see above).
138 // In addition to ^ a ! can be used to indicate "not".
139 // <dt> {,}
140 // <dd> A brace expression which is like brace expansion in some shells.
141 // It is similar to the | construct in a regular expression.
142 // <br>
143 // E.g. <src>{abc,defg}</src> means <src>abc</src> or <src>defg</src>.
144 // Brace expressions can be nested and can contain other
145 // special characters.
146 // <br>
147 // E.g. St{Man*.{h,cc},Col?*.{h,cc,l,y}}
148 // <br>A literal comma or brace in a brace expression can be given by
149 // escaping it with a backslash.
150 // </dl>
151 // The static member function <src>fromSQLPattern</src> converts an SQL-like
152 // pattern to a String which can be used to create a Regex from it.
153 // A pattern has the following special characters:
154 // <dl>
155 // <dt> %
156 // <dd> Zero or more arbitrary characters.
157 // <dt> _
158 // <dd> One arbitrary character
159 // </dl>
160 // The static member function <src>fromString</src> converts a normal
161 // string to a regular expression. This function escapes characters in
162 // the string which are special in a regular expression. In this way a
163 // normal string can be passed to a function taking a regular expression.
164 //
165 // The static member function <src>makeCaseInsensitive</src> returns a
166 // new regular expression string containing the case-insensitive version of
167 // the given expression string.
168 // </synopsis>
169 
170 // <example>
171 // <srcblock>
172 // Regex RXwhite("[ \n\t\r\v\f]+", 1);
173 // (blank, newline, tab, return, vertical tab, formfeed)
174 // Regex RXint("-?[0-9]+", 1);
175 // Regex RXdouble("-?(([0-9]+\\.[0-9]*)|([0-9]+)|(\\.[0-9]+))([eE][+-]?[0-9]+)?", 1, 200);
176 // Regex RXalpha("[A-Za-z]+", 1);
177 // Regex RXlowercase("[a-z]+", 1);
178 // Regex RXuppercase("[A-Z]+", 1);
179 // Regex RXalphanum("[0-9A-Za-z]+", 1);
180 // Regex RXidentifier("[A-Za-z_][A-Za-z0-9_]*", 1);
181 // </srcblock>
182 // In RXdouble the . is escaped via a backslash to get it literally.
183 // The second backslash is needed to escape the backslash in C++.
184 // <srcblock>
185 // Regex rx1 (Regex::fromPattern ("St*.{h,cc}"));
186 // results in regexp "St.*\.((h)|(cc))"
187 // Regex rx2 (Regex::fromString ("tRegex.cc"));
188 // results in regexp "tRegex\.cc"
189 // </srcblock>
190 // </example>
191 
192 // <todo asof="2001/07/15">
193 // <li> Let sgi ifdef go
194 // <li> Decide on documentation of GNU stuff (cregex.h, cregex.cc)
195 // </todo>
196 
197 
198 class Regex : public RegexBase {
199 public:
200  // Default constructor uses a zero-length regular expression.
201  // <thrown>
202  // <li> invalid_argument
203  // </thrown>
204  Regex();
205 
206  // Construct a regular expression.
207  // Optionally a fast map can be created, a buffer size can be given
208  // and a translation table (of 256 chars) can be applied.
209  // The translation table can, for instance, be used to map
210  // lowercase characters to uppercase.
211  // See cregex.cc (the extended regular expression matching and search
212  // library) for detailed information.
213  // <thrown>
214  // <li> invalid_argument
215  // </thrown>
216  Regex(const String &exp, Bool fast = False, Int sz = 40,
217  const Char *translation = 0);
218 
219  // Copy constructor (copy semantics).
220  // <thrown>
221  // <li> invalid_argument
222  // </thrown>
223  Regex(const Regex &that);
224 
225  virtual ~Regex();
226 
227  // Assignment (copy semantics).
228  // <thrown>
229  // <li> invalid_argument
230  // </thrown>
231  // <group>
232  Regex &operator=(const Regex &that);
233  Regex &operator=(const String &strng);
234  // </group>
235 
236  // Convert a shell-like pattern to a regular expression.
237  // This is useful for people who are more familiar with patterns
238  // than with regular expressions.
239  static String fromPattern(const String &pattern);
240 
241  // Convert an SQL-like pattern to a regular expression.
242  // This is useful for TaQL, which mimics SQL.
243  static String fromSQLPattern(const String &pattern);
244 
245  // Convert a normal string to a regular expression.
246  // This consists of escaping the special characters.
247  // This is useful when one wants to provide a normal string
248  // (which may contain special characters) to a function working
249  // on regular expressions.
250  static String fromString(const String &strng);
251 
252  // Create a case-insensitive regular expression string from the given
253  // regular expression string.
254  // It does it by inserting the lowercase and uppercase version of
255  // characters in the input string into the output string.
256  static String makeCaseInsensitive (const String &strng);
257 
258  // Get the regular expression string.
259  const String &regexp() const
260  { return str; }
261 
262  // Get the translation table (can be a zero pointer).
263  const Char *transtable() const
264  { return trans; }
265 
266  // Test if the regular expression matches (part of) string <src>s</src>.
267  // The return value gives the length of the matching string part,
268  // or String::npos if there is no match or an error.
269  // The string has <src>len</src> characters and the test starts at
270  // position <src>pos</src>. The string may contain null characters.
271  // A negative pos was documented as allowed (to match at end), but pos is an unsigned String::size_type here - TODO confirm.
272  //
273  // <note role=tip>
274  // Use the appropriate <linkto class=String>String</linkto> functions
275  // to test if a string matches a regular expression.
276  // <src>Regex::match</src> is pretty low-level.
277  // </note>
278  virtual String::size_type match(const Char *s,
279  String::size_type len,
280  String::size_type pos=0) const;
281 
282  // Test if the regular expression occurs in string <src>s</src>.
283  // The return value gives the position of the first substring
284  // matching the regular expression. The length of that substring
285  // is returned in <src>matchlen</src>.
286  // The string has <src>len</src> characters and the test starts at
287  // position <src>pos</src>. The string may contain null characters.
288  // The search will do a reverse search if the pos given is less than 0.
289  // <note role=tip>
290  // Use the appropriate <linkto class=String>String</linkto> functions
291  // to test if a regular expression occurs in a string.
292  // <src>Regex::search</src> is pretty low-level.
293  // </note>
294  // <group>
295  virtual String::size_type search(const Char *s, String::size_type len,
296  Int &matchlen,
297  Int pos=0) const;
298  virtual String::size_type find(const Char *s, String::size_type len,
299  Int &matchlen,
300  String::size_type pos=0) const;
301  // </group>
302 
303  // Return some internal <src>cregex</src> info.
304  Int match_info(Int& start, Int& length, Int nth = 0) const;
305 
306  // Does it contain a valid Regex?
307  Bool OK() const;
308 
309  // Write as ASCII.
310  friend ostream &operator<<(ostream &ios, const Regex &exp);
311 
312 protected:
313  String str; // the reg. exp. string
314  Int fastval; // fast flag
315  Int bufsz; // buffer size given
316  Char* trans; // possible translation table
317  re_pattern_buffer* buf; // compiled reg.exp.
318  re_registers* reg; // internal reg.exp. stuff
319 
320  // Compile the regular expression. The unnamed arguments presumably mirror the constructor's (exp, fast, sz, translation) - TODO confirm.
321  // <thrown>
322  // <li> invalid_argument
323  // </thrown>
324  void create(const String&, Int, Int, const Char*);
325 
326  // Deallocate the stuff allocated by <src>create</src>.
327  void dealloc();
328 };
329 
330 
331 // Some built-in regular expressions, declared here as global constants.
332 
333 extern const Regex RXwhite; //# = "[ \n\t\r\v\f]+"
334 extern const Regex RXint; //# = "-?[0-9]+"
335 extern const Regex RXdouble; //# = "-?(([0-9]+\\.[0-9]*)|
336  //# ([0-9]+)|(\\.[0-9]+))
337  //# ([eE][+-]?[0-9]+)?"
338 extern const Regex RXalpha; //# = "[A-Za-z]+"
339 extern const Regex RXlowercase; //# = "[a-z]+"
340 extern const Regex RXuppercase; //# = "[A-Z]+"
341 extern const Regex RXalphanum; //# = "[0-9A-Za-z]+"
342 extern const Regex RXidentifier; //# = "[A-Za-z_][A-Za-z0-9_]*"
343 
344 
345 } //# NAMESPACE CASACORE - END
346 
347 #endif
virtual String::size_type search(const Char *s, String::size_type len, Int &matchlen, Int pos=0) const
Test if the regular expression occurs in string s.
const Regex RXlowercase
int Int
Definition: aipstype.h:50
static String fromSQLPattern(const String &pattern)
Convert an SQL-like pattern to a regular expression.
static String makeCaseInsensitive(const String &strng)
Create a case-insensitive regular expression string from the given regular expression string...
Data structure to store register contents data in.
Definition: cregex.h:252
virtual ~Regex()
This data structure is used to represent a compiled pattern.
Definition: cregex.h:202
virtual String::size_type find(const Char *s, String::size_type len, Int &matchlen, String::size_type pos=0) const
Search string s of length len, starting at position pos.
char Char
Definition: aipstype.h:46
LatticeExprNode exp(const LatticeExprNode &expr)
const Regex RXalpha
static String fromString(const String &strng)
Convert a normal string to a regular expression.
re_registers * reg
Definition: Regex.h:318
const Regex RXwhite
some built in regular expressions
TableExprNode pattern(const TableExprNode &node)
Definition: ExprNode.h:1444
Regex()
Default constructor uses a zero-length regular expression.
const Regex RXuppercase
const Regex RXalphanum
Regex & operator=(const Regex &that)
Assignment (copy semantics).
static String fromPattern(const String &pattern)
Convert a shell-like pattern to a regular expression.
friend ostream & operator<<(ostream &ios, const Regex &exp)
Write as ASCII.
const Regex RXdouble
string::size_type size_type
Definition: String.h:231
Regular expression class.
Definition: Regex.h:198
LatticeExprNode length(const LatticeExprNode &expr, const LatticeExprNode &axis)
2-argument function to get the length of an axis.
String str
Definition: Regex.h:313
const Regex RXint
bool Bool
Define the standard types used by Casacore.
Definition: aipstype.h:42
const Regex RXidentifier
Int match_info(Int &start, Int &length, Int nth=0) const
Return some internal cregex info.
const Bool False
Definition: aipstype.h:44
Bool OK() const
Does it contain a valid Regex?
void create(const String &, Int, Int, const Char *)
Compile the regular expression.
const String & regexp() const
Get the regular expression string.
Definition: Regex.h:259
virtual String::size_type match(const Char *s, String::size_type len, String::size_type pos=0) const
Test if the regular expression matches (part of) string s.
const Char * transtable() const
Get the translation table (can be a zero pointer).
Definition: Regex.h:263
Abstract interface class to regular expressions for String.
Definition: RegexBase.h:132
String: the storage and methods of handling collections of characters.
Definition: String.h:223
void dealloc()
Deallocate the stuff allocated by create.
re_pattern_buffer * buf
Definition: Regex.h:317
Char * trans
Definition: Regex.h:316
#define casacore
<X11/Intrinsic.h> #defines true, false, casacore::Bool, and String.
Definition: X11Intrinsic.h:42