casa  $Rev:20696$
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
StringDistance.h
Go to the documentation of this file.
00001 //# StringDistance.h: Class to deal with Levensthein distance of strings
00002 //# Copyright (C) 2009
00003 //# Associated Universities, Inc. Washington DC, USA.
00004 //#
00005 //# This library is free software; you can redistribute it and/or modify it
00006 //# under the terms of the GNU Library General Public License as published by
00007 //# the Free Software Foundation; either version 2 of the License, or (at your
00008 //# option) any later version.
00009 //#
00010 //# This library is distributed in the hope that it will be useful, but WITHOUT
00011 //# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
00012 //# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
00013 //# License for more details.
00014 //#
00015 //# You should have received a copy of the GNU Library General Public License
00016 //# along with this library; if not, write to the Free Software Foundation,
00017 //# Inc., 675 Massachusetts Ave, Cambridge, MA 02139, USA.
00018 //#
00019 //# Correspondence concerning AIPS++ should be addressed as follows:
00020 //#        Internet email: aips2-request@nrao.edu.
00021 //#        Postal address: AIPS++ Project Office
00022 //#                        National Radio Astronomy Observatory
00023 //#                        520 Edgemont Road
00024 //#                        Charlottesville, VA 22903-2475 USA
00025 //#
00026 //# $Id: StringDistance.h 20739 2009-09-29 01:15:15Z Malte.Marquarding $
00027 
00028 #ifndef CASA_STRINGDISTANCE_H
00029 #define CASA_STRINGDISTANCE_H
00030 
00031 //# Includes
00032 #include <casa/BasicSL/String.h>
00033 #include <casa/Arrays/Matrix.h>
00034 
00035 namespace casa {
00036 
00037 // <summary>
00038 // Class to deal with Levensthein distance of strings.
00039 // </summary>
00040 
00041 // <synopsis>
00042 // The Levenshtein Distance is a metric telling how similar strings are.
00043 // It is also known as the Edit Distance.
00044 //
00045 // The distance tells how many operations (i.e., character substitutions,
00046 // insertions, and deletions are needed to transform one string into another.
00047 // <br>There are several extensions to the basic definition:
00048 // <ul>
00049 //  <li> Treat a swap of adjacent characters as one operation.
00050 //  <li> Give a weight to operations (e.g., insertions and deletions have
00051 //       half the weight of the other operations).
00052 //  <li> Take locality into account. For example, successive substitutions
00053 //       weigh more than non-successive.
00054 //  <li> Extend to wildcarded strings.
00055 // </ul> 
00056 // This class optionally uses the swap extension. Furthermore one can
00057 // optionally ignore blanks. By default both options are used.
00058 //
00059 // The code is based on code written by Anders Sewerin Johansen.
00060 // Calculating the distance is an expensive O(N^2) operation, thus
00061 // should be used with care.
00062 //
00063 // The class is constructed with the source string to compare against.
00064 // Thereafter its <code>match</code> or <code>distance</code>
00065 // function can be used for each target string.
00066 // </synopsis>
00067 
00068 class StringDistance
00069 {
00070 public:
00071   // Default constructor sets maxDistance to 0.
00072   StringDistance();
00073 
00074   // Construct from the source string and maximum distance.
00075   // If the maximum distance is negative, it defaults to 1+strlength/3.
00076   // Note that maximum distance 0 means that the strings must match exactly.
00077   explicit StringDistance (const String& source, Int maxDistance=-1,
00078                            Bool countSwaps=True, Bool ignoreBlanks=True,
00079                            Bool caseInsensitive=False);
00080 
00081   // Get data members.
00082   // <group>
00083   const string& source() const
00084     { return itsSource; }
00085   Int maxDistance() const
00086     { return itsMaxDistance; }
00087   const Matrix<Int>& matrix() const
00088     { return itsMatrix; }
00089   // </group>
00090   
00091   // Test if the given target string is within the maximum distance.
00092   Bool match (const String& target) const;
00093 
00094   // Calculate the distance from the string to the string given in the constructor.
00095   // If the length of target exceeds source length + maxDistance,
00096   // the difference in lengths is returned.
00097   Int distance (const String& target) const;
00098 
00099   // Calculate the distance between the two strings.
00100   // This is slower than the <src>distance</src> member function, because
00101   // it has to allocate the underlying Matrix for each invocation.
00102   static Int distance (const String& source, const String& target,
00103                        Bool countSwaps=True);
00104 
00105   // Remove blanks from the given string.
00106   static String removeBlanks (const String& source);
00107 
00108 private:
00109   // Calculate the distance.
00110   static Int doDistance (const String& source, const String& target,
00111                          Bool countSwaps, Matrix<Int>& matrix);
00112 
00113 
00114 private:
00115   String              itsSource;
00116   mutable Matrix<Int> itsMatrix;
00117   Int                 itsMaxDistance;
00118   Bool                itsCountSwaps;
00119   Bool                itsIgnoreBlanks;
00120   Bool                itsCaseInsensitive;
00121 };
00122 
00123 } //# end namespace
00124 
00125 #endif