casa
$Rev:20696$
|
00001 //# StringDistance.h: Class to deal with Levensthein distance of strings 00002 //# Copyright (C) 2009 00003 //# Associated Universities, Inc. Washington DC, USA. 00004 //# 00005 //# This library is free software; you can redistribute it and/or modify it 00006 //# under the terms of the GNU Library General Public License as published by 00007 //# the Free Software Foundation; either version 2 of the License, or (at your 00008 //# option) any later version. 00009 //# 00010 //# This library is distributed in the hope that it will be useful, but WITHOUT 00011 //# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 00012 //# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public 00013 //# License for more details. 00014 //# 00015 //# You should have received a copy of the GNU Library General Public License 00016 //# along with this library; if not, write to the Free Software Foundation, 00017 //# Inc., 675 Massachusetts Ave, Cambridge, MA 02139, USA. 00018 //# 00019 //# Correspondence concerning AIPS++ should be addressed as follows: 00020 //# Internet email: aips2-request@nrao.edu. 00021 //# Postal address: AIPS++ Project Office 00022 //# National Radio Astronomy Observatory 00023 //# 520 Edgemont Road 00024 //# Charlottesville, VA 22903-2475 USA 00025 //# 00026 //# $Id: StringDistance.h 20739 2009-09-29 01:15:15Z Malte.Marquarding $ 00027 00028 #ifndef CASA_STRINGDISTANCE_H 00029 #define CASA_STRINGDISTANCE_H 00030 00031 //# Includes 00032 #include <casa/BasicSL/String.h> 00033 #include <casa/Arrays/Matrix.h> 00034 00035 namespace casa { 00036 00037 // <summary> 00038 // Class to deal with Levensthein distance of strings. 00039 // </summary> 00040 00041 // <synopsis> 00042 // The Levenshtein Distance is a metric telling how similar strings are. 00043 // It is also known as the Edit Distance. 00044 // 00045 // The distance tells how many operations (i.e., character substitutions, 00046 // insertions, and deletions are needed to transform one string into another. 00047 // <br>There are several extensions to the basic definition: 00048 // <ul> 00049 // <li> Treat a swap of adjacent characters as one operation. 00050 // <li> Give a weight to operations (e.g., insertions and deletions have 00051 // half the weight of the other operations). 00052 // <li> Take locality into account. For example, successive substitutions 00053 // weigh more than non-successive. 00054 // <li> Extend to wildcarded strings. 00055 // </ul> 00056 // This class optionally uses the swap extension. Furthermore one can 00057 // optionally ignore blanks. By default both options are used. 00058 // 00059 // The code is based on code written by Anders Sewerin Johansen. 00060 // Calculating the distance is an expensive O(N^2) operation, thus 00061 // should be used with care. 00062 // 00063 // The class is constructed with the source string to compare against. 00064 // Thereafter its <code>match</code> or <code>distance</code> 00065 // function can be used for each target string. 00066 // </synopsis> 00067 00068 class StringDistance 00069 { 00070 public: 00071 // Default constructor sets maxDistance to 0. 00072 StringDistance(); 00073 00074 // Construct from the source string and maximum distance. 00075 // If the maximum distance is negative, it defaults to 1+strlength/3. 00076 // Note that maximum distance 0 means that the strings must match exactly. 00077 explicit StringDistance (const String& source, Int maxDistance=-1, 00078 Bool countSwaps=True, Bool ignoreBlanks=True, 00079 Bool caseInsensitive=False); 00080 00081 // Get data members. 00082 // <group> 00083 const string& source() const 00084 { return itsSource; } 00085 Int maxDistance() const 00086 { return itsMaxDistance; } 00087 const Matrix<Int>& matrix() const 00088 { return itsMatrix; } 00089 // </group> 00090 00091 // Test if the given target string is within the maximum distance. 00092 Bool match (const String& target) const; 00093 00094 // Calculate the distance from the string to the string given in the constructor. 00095 // If the length of target exceeds source length + maxDistance, 00096 // the difference in lengths is returned. 00097 Int distance (const String& target) const; 00098 00099 // Calculate the distance between the two strings. 00100 // This is slower than the <src>distance</src> member function, because 00101 // it has to allocate the underlying Matrix for each invocation. 00102 static Int distance (const String& source, const String& target, 00103 Bool countSwaps=True); 00104 00105 // Remove blanks from the given string. 00106 static String removeBlanks (const String& source); 00107 00108 private: 00109 // Calculate the distance. 00110 static Int doDistance (const String& source, const String& target, 00111 Bool countSwaps, Matrix<Int>& matrix); 00112 00113 00114 private: 00115 String itsSource; 00116 mutable Matrix<Int> itsMatrix; 00117 Int itsMaxDistance; 00118 Bool itsCountSwaps; 00119 Bool itsIgnoreBlanks; 00120 Bool itsCaseInsensitive; 00121 }; 00122 00123 } //# end namespace 00124 00125 #endif