001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 * 
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 * 
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoder;
022
023/**
024 * Encodes a string into a Refined Soundex value. A refined soundex code is
025 * optimized for spell checking words. Soundex method originally developed by
026 * <CITE>Margaret Odell</CITE> and <CITE>Robert Russell</CITE>.
027 * 
028 * @author Apache Software Foundation
029 * @version $Id: RefinedSoundex.java 797690 2009-07-24 23:28:35Z ggregory $
030 */
031public class RefinedSoundex implements StringEncoder {
032
033    /**
034     * @since 1.4
035     */
036    public static final String US_ENGLISH_MAPPING_STRING = "01360240043788015936020505";
037
038   /**
039     * RefinedSoundex is *refined* for a number of reasons one being that the
040     * mappings have been altered. This implementation contains default
041     * mappings for US English.
042     */
043    public static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
044
045    /**
046     * Every letter of the alphabet is "mapped" to a numerical value. This char
047     * array holds the values to which each letter is mapped. This
048     * implementation contains a default map for US_ENGLISH
049     */
050    private final char[] soundexMapping;
051
052    /**
053     * This static variable contains an instance of the RefinedSoundex using
054     * the US_ENGLISH mapping.
055     */
056    public static final RefinedSoundex US_ENGLISH = new RefinedSoundex();
057
058     /**
059     * Creates an instance of the RefinedSoundex object using the default US
060     * English mapping.
061     */
062    public RefinedSoundex() {
063        this.soundexMapping = US_ENGLISH_MAPPING;
064    }
065
066    /**
067     * Creates a refined soundex instance using a custom mapping. This
068     * constructor can be used to customize the mapping, and/or possibly
069     * provide an internationalized mapping for a non-Western character set.
070     * 
071     * @param mapping
072     *                  Mapping array to use when finding the corresponding code for
073     *                  a given character
074     */
075    public RefinedSoundex(char[] mapping) {
076        this.soundexMapping = new char[mapping.length];
077        System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length);
078    }
079
080    /**
081     * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping,
082     * and/or possibly provide an internationalized mapping for a non-Western character set.
083     * 
084     * @param mapping
085     *            Mapping string to use when finding the corresponding code for a given character
086     * @since 1.4
087     */
088    public RefinedSoundex(String mapping) {
089        this.soundexMapping = mapping.toCharArray();
090    }
091
092    /**
093     * Returns the number of characters in the two encoded Strings that are the
094     * same. This return value ranges from 0 to the length of the shortest
095     * encoded String: 0 indicates little or no similarity, and 4 out of 4 (for
096     * example) indicates strong similarity or identical values. For refined
097     * Soundex, the return value can be greater than 4.
098     * 
099     * @param s1
100     *                  A String that will be encoded and compared.
101     * @param s2
102     *                  A String that will be encoded and compared.
103     * @return The number of characters in the two encoded Strings that are the
104     *             same from 0 to to the length of the shortest encoded String.
105     * 
106     * @see SoundexUtils#difference(StringEncoder,String,String)
107     * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
108     *          MS T-SQL DIFFERENCE</a>
109     * 
110     * @throws EncoderException
111     *                  if an error occurs encoding one of the strings
112     * @since 1.3
113     */
114    public int difference(String s1, String s2) throws EncoderException {
115        return SoundexUtils.difference(this, s1, s2);
116    }
117
118    /**
119     * Encodes an Object using the refined soundex algorithm. This method is
120     * provided in order to satisfy the requirements of the Encoder interface,
121     * and will throw an EncoderException if the supplied object is not of type
122     * java.lang.String.
123     * 
124     * @param pObject
125     *                  Object to encode
126     * @return An object (or type java.lang.String) containing the refined
127     *             soundex code which corresponds to the String supplied.
128     * @throws EncoderException
129     *                  if the parameter supplied is not of type java.lang.String
130     */
131    public Object encode(Object pObject) throws EncoderException {
132        if (!(pObject instanceof String)) {
133            throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String");
134        }
135        return soundex((String) pObject);
136    }
137
138    /**
139     * Encodes a String using the refined soundex algorithm.
140     * 
141     * @param pString
142     *                  A String object to encode
143     * @return A Soundex code corresponding to the String supplied
144     */
145    public String encode(String pString) {
146        return soundex(pString);
147    }
148
149    /**
150     * Returns the mapping code for a given character. The mapping codes are
151     * maintained in an internal char array named soundexMapping, and the
152     * default values of these mappings are US English.
153     * 
154     * @param c
155     *                  char to get mapping for
156     * @return A character (really a numeral) to return for the given char
157     */
158    char getMappingCode(char c) {
159        if (!Character.isLetter(c)) {
160            return 0;
161        }
162        return this.soundexMapping[Character.toUpperCase(c) - 'A'];
163    }
164
165    /**
166     * Retreives the Refined Soundex code for a given String object.
167     * 
168     * @param str
169     *                  String to encode using the Refined Soundex algorithm
170     * @return A soundex code for the String supplied
171     */
172    public String soundex(String str) {
173        if (str == null) {
174            return null;
175        }
176        str = SoundexUtils.clean(str);
177        if (str.length() == 0) {
178            return str;
179        }
180
181        StringBuffer sBuf = new StringBuffer();
182        sBuf.append(str.charAt(0));
183
184        char last, current;
185        last = '*';
186
187        for (int i = 0; i < str.length(); i++) {
188
189            current = getMappingCode(str.charAt(i));
190            if (current == last) {
191                continue;
192            } else if (current != 0) {
193                sBuf.append(current);
194            }
195
196            last = current;
197
198        }
199
200        return sBuf.toString();
201    }
202}