001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 * 
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 * 
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoder;
022
023/**
024 * Encodes a string into a metaphone value. 
025 * <p>
026 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. 
027 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
028 * </p>
029 * <p>
030 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, p
031 * 39.</CITE>
032 * </p>
033 * <p>
034 * Note, that this does not match the algorithm that ships with PHP, or the algorithm 
035 * found in the Perl <a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>.
036 * They have had undocumented changes from the originally published algorithm. 
037 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
038 * </p>
039 * 
040 * @author Apache Software Foundation
041 * @version $Id: Metaphone.java 797690 2009-07-24 23:28:35Z ggregory $
042 */
043public class Metaphone implements StringEncoder {
044
045    /**
046     * Five values in the English language 
047     */
048    private static final String VOWELS = "AEIOU" ;
049
050    /**
051     * Variable used in Metaphone algorithm
052     */
053    private static final String FRONTV = "EIY"   ;
054
055    /**
056     * Variable used in Metaphone algorithm
057     */
058    private static final String VARSON = "CSPTG" ;
059
060    /**
061     * The max code length for metaphone is 4
062     */
063    private int maxCodeLen = 4 ;
064
065    /**
066     * Creates an instance of the Metaphone encoder
067     */
068    public Metaphone() {
069        super();
070    }
071
072    /**
073     * Find the metaphone value of a String. This is similar to the
074     * soundex algorithm, but better at finding similar sounding words.
075     * All input is converted to upper case.
076     * Limitations: Input format is expected to be a single ASCII word
077     * with only characters in the A - Z range, no punctuation or numbers.
078     *
079     * @param txt String to find the metaphone code for
080     * @return A metaphone code corresponding to the String supplied
081     */
082    public String metaphone(String txt) {
083        boolean hard = false ;
084        if ((txt == null) || (txt.length() == 0)) {
085            return "" ;
086        }
087        // single character is itself
088        if (txt.length() == 1) {
089            return txt.toUpperCase(java.util.Locale.ENGLISH) ;
090        }
091      
092        char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray() ;
093      
094        StringBuffer local = new StringBuffer(40); // manipulate
095        StringBuffer code = new StringBuffer(10) ; //   output
096        // handle initial 2 characters exceptions
097        switch(inwd[0]) {
098        case 'K' : 
099        case 'G' : 
100        case 'P' : /* looking for KN, etc*/
101            if (inwd[1] == 'N') {
102                local.append(inwd, 1, inwd.length - 1);
103            } else {
104                local.append(inwd);
105            }
106            break;
107        case 'A': /* looking for AE */
108            if (inwd[1] == 'E') {
109                local.append(inwd, 1, inwd.length - 1);
110            } else {
111                local.append(inwd);
112            }
113            break;
114        case 'W' : /* looking for WR or WH */
115            if (inwd[1] == 'R') {   // WR -> R
116                local.append(inwd, 1, inwd.length - 1); 
117                break ;
118            }
119            if (inwd[1] == 'H') {
120                local.append(inwd, 1, inwd.length - 1);
121                local.setCharAt(0, 'W'); // WH -> W
122            } else {
123                local.append(inwd);
124            }
125            break;
126        case 'X' : /* initial X becomes S */
127            inwd[0] = 'S';
128            local.append(inwd);
129            break ;
130        default :
131            local.append(inwd);
132        } // now local has working string with initials fixed
133
134        int wdsz = local.length();
135        int n = 0 ;
136
137        while ((code.length() < this.getMaxCodeLen()) && 
138               (n < wdsz) ) { // max code size of 4 works well
139            char symb = local.charAt(n) ;
140            // remove duplicate letters except C
141            if ((symb != 'C') && (isPreviousChar( local, n, symb )) ) {
142                n++ ;
143            } else { // not dup
144                switch(symb) {
145                case 'A' : case 'E' : case 'I' : case 'O' : case 'U' :
146                    if (n == 0) { 
147                        code.append(symb);
148                    }
149                    break ; // only use vowel if leading char
150                case 'B' :
151                    if ( isPreviousChar(local, n, 'M') && 
152                         isLastChar(wdsz, n) ) { // B is silent if word ends in MB
153                        break;
154                    }
155                    code.append(symb);
156                    break;
157                case 'C' : // lots of C special cases
158                    /* discard if SCI, SCE or SCY */
159                    if ( isPreviousChar(local, n, 'S') && 
160                         !isLastChar(wdsz, n) && 
161                         (FRONTV.indexOf(local.charAt(n + 1)) >= 0) ) { 
162                        break;
163                    }
164                    if (regionMatch(local, n, "CIA")) { // "CIA" -> X
165                        code.append('X'); 
166                        break;
167                    }
168                    if (!isLastChar(wdsz, n) && 
169                        (FRONTV.indexOf(local.charAt(n + 1)) >= 0)) {
170                        code.append('S');
171                        break; // CI,CE,CY -> S
172                    }
173                    if (isPreviousChar(local, n, 'S') &&
174                        isNextChar(local, n, 'H') ) { // SCH->sk
175                        code.append('K') ; 
176                        break ;
177                    }
178                    if (isNextChar(local, n, 'H')) { // detect CH
179                        if ((n == 0) && 
180                            (wdsz >= 3) && 
181                            isVowel(local,2) ) { // CH consonant -> K consonant
182                            code.append('K');
183                        } else { 
184                            code.append('X'); // CHvowel -> X
185                        }
186                    } else { 
187                        code.append('K');
188                    }
189                    break ;
190                case 'D' :
191                    if (!isLastChar(wdsz, n + 1) && 
192                        isNextChar(local, n, 'G') && 
193                        (FRONTV.indexOf(local.charAt(n + 2)) >= 0)) { // DGE DGI DGY -> J 
194                        code.append('J'); n += 2 ;
195                    } else { 
196                        code.append('T');
197                    }
198                    break ;
199                case 'G' : // GH silent at end or before consonant
200                    if (isLastChar(wdsz, n + 1) && 
201                        isNextChar(local, n, 'H')) {
202                        break;
203                    }
204                    if (!isLastChar(wdsz, n + 1) &&  
205                        isNextChar(local,n,'H') && 
206                        !isVowel(local,n+2)) {
207                        break;
208                    }
209                    if ((n > 0) && 
210                        ( regionMatch(local, n, "GN") ||
211                          regionMatch(local, n, "GNED") ) ) {
212                        break; // silent G
213                    }
214                    if (isPreviousChar(local, n, 'G')) {
215                        // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
216                        hard = true ;
217                    } else {
218                        hard = false ;
219                    }
220                    if (!isLastChar(wdsz, n) && 
221                        (FRONTV.indexOf(local.charAt(n + 1)) >= 0) && 
222                        (!hard)) {
223                        code.append('J');
224                    } else {
225                        code.append('K');
226                    }
227                    break ;
228                case 'H':
229                    if (isLastChar(wdsz, n)) {
230                        break ; // terminal H
231                    }
232                    if ((n > 0) && 
233                        (VARSON.indexOf(local.charAt(n - 1)) >= 0)) {
234                        break;
235                    }
236                    if (isVowel(local,n+1)) {
237                        code.append('H'); // Hvowel
238                    }
239                    break;
240                case 'F': 
241                case 'J' : 
242                case 'L' :
243                case 'M': 
244                case 'N' : 
245                case 'R' :
246                    code.append(symb); 
247                    break;
248                case 'K' :
249                    if (n > 0) { // not initial
250                        if (!isPreviousChar(local, n, 'C')) {
251                            code.append(symb);
252                        }
253                    } else {
254                        code.append(symb); // initial K
255                    }
256                    break ;
257                case 'P' :
258                    if (isNextChar(local,n,'H')) {
259                        // PH -> F
260                        code.append('F');
261                    } else {
262                        code.append(symb);
263                    }
264                    break ;
265                case 'Q' :
266                    code.append('K');
267                    break;
268                case 'S' :
269                    if (regionMatch(local,n,"SH") || 
270                        regionMatch(local,n,"SIO") || 
271                        regionMatch(local,n,"SIA")) {
272                        code.append('X');
273                    } else {
274                        code.append('S');
275                    }
276                    break;
277                case 'T' :
278                    if (regionMatch(local,n,"TIA") || 
279                        regionMatch(local,n,"TIO")) {
280                        code.append('X'); 
281                        break;
282                    }
283                    if (regionMatch(local,n,"TCH")) {
284                        // Silent if in "TCH"
285                        break;
286                    }
287                    // substitute numeral 0 for TH (resembles theta after all)
288                    if (regionMatch(local,n,"TH")) {
289                        code.append('0');
290                    } else {
291                        code.append('T');
292                    }
293                    break ;
294                case 'V' :
295                    code.append('F'); break ;
296                case 'W' : case 'Y' : // silent if not followed by vowel
297                    if (!isLastChar(wdsz,n) && 
298                        isVowel(local,n+1)) {
299                        code.append(symb);
300                    }
301                    break ;
302                case 'X' :
303                    code.append('K'); code.append('S');
304                    break ;
305                case 'Z' :
306                    code.append('S'); break ;
307                } // end switch
308                n++ ;
309            } // end else from symb != 'C'
310            if (code.length() > this.getMaxCodeLen()) { 
311                code.setLength(this.getMaxCodeLen()); 
312            }
313        }
314        return code.toString();
315    }
316
317    private boolean isVowel(StringBuffer string, int index) {
318        return VOWELS.indexOf(string.charAt(index)) >= 0;
319    }
320
321    private boolean isPreviousChar(StringBuffer string, int index, char c) {
322        boolean matches = false;
323        if( index > 0 &&
324            index < string.length() ) {
325            matches = string.charAt(index - 1) == c;
326        }
327        return matches;
328    }
329
330    private boolean isNextChar(StringBuffer string, int index, char c) {
331        boolean matches = false;
332        if( index >= 0 &&
333            index < string.length() - 1 ) {
334            matches = string.charAt(index + 1) == c;
335        }
336        return matches;
337    }
338
339    private boolean regionMatch(StringBuffer string, int index, String test) {
340        boolean matches = false;
341        if( index >= 0 &&
342            (index + test.length() - 1) < string.length() ) {
343            String substring = string.substring( index, index + test.length());
344            matches = substring.equals( test );
345        }
346        return matches;
347    }
348
349    private boolean isLastChar(int wdsz, int n) {
350        return n + 1 == wdsz;
351    } 
352    
353    
354    /**
355     * Encodes an Object using the metaphone algorithm.  This method
356     * is provided in order to satisfy the requirements of the
357     * Encoder interface, and will throw an EncoderException if the
358     * supplied object is not of type java.lang.String.
359     *
360     * @param pObject Object to encode
361     * @return An object (or type java.lang.String) containing the 
362     *         metaphone code which corresponds to the String supplied.
363     * @throws EncoderException if the parameter supplied is not
364     *                          of type java.lang.String
365     */
366    public Object encode(Object pObject) throws EncoderException {
367        if (!(pObject instanceof String)) {
368            throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); 
369        }
370        return metaphone((String) pObject);
371    }
372
373    /**
374     * Encodes a String using the Metaphone algorithm. 
375     *
376     * @param pString String object to encode
377     * @return The metaphone code corresponding to the String supplied
378     */
379    public String encode(String pString) {
380        return metaphone(pString);   
381    }
382
383    /**
384     * Tests is the metaphones of two strings are identical.
385     *
386     * @param str1 First of two strings to compare
387     * @param str2 Second of two strings to compare
388     * @return <code>true</code> if the metaphones of these strings are identical, 
389     *        <code>false</code> otherwise.
390     */
391    public boolean isMetaphoneEqual(String str1, String str2) {
392        return metaphone(str1).equals(metaphone(str2));
393    }
394
395    /**
396     * Returns the maxCodeLen.
397     * @return int
398     */
399    public int getMaxCodeLen() { return this.maxCodeLen; }
400
401    /**
402     * Sets the maxCodeLen.
403     * @param maxCodeLen The maxCodeLen to set
404     */
405    public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
406
407}