001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 * 
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 * 
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.io.input;
018
019import java.io.IOException;
020import java.io.InputStream;
021import java.util.Arrays;
022import java.util.Comparator;
023import java.util.List;
024
025import org.apache.commons.io.ByteOrderMark;
026
027/**
028 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
029 * 
030 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
031 * first byte in the stream.
032 * 
033 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs:
034 * <ul>
035 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
036 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16LE}</li>
037 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16BE}</li>
038 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32LE}</li>
039 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32BE}</li>
040 * </ul>
041 * 
042 * 
043 * <h3>Example 1 - Detect and exclude a UTF-8 BOM</h3>
044 * 
045 * <pre>
046 * BOMInputStream bomIn = new BOMInputStream(in);
047 * if (bomIn.hasBOM()) {
048 *     // has a UTF-8 BOM
049 * }
050 * </pre>
051 * 
052 * <h3>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h3>
053 * 
054 * <pre>
055 * boolean include = true;
056 * BOMInputStream bomIn = new BOMInputStream(in, include);
057 * if (bomIn.hasBOM()) {
058 *     // has a UTF-8 BOM
059 * }
060 * </pre>
061 * 
062 * <h3>Example 3 - Detect Multiple BOMs</h3>
063 * 
064 * <pre>
065 * BOMInputStream bomIn = new BOMInputStream(in, 
066 *   ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
067 *   ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE
068 *   );
069 * if (bomIn.hasBOM() == false) {
070 *     // No BOM found
071 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
072 *     // has a UTF-16LE BOM
073 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
074 *     // has a UTF-16BE BOM
075 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
076 *     // has a UTF-32LE BOM
077 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
078 *     // has a UTF-32BE BOM
079 * }
080 * </pre>
081 * 
082 * @see org.apache.commons.io.ByteOrderMark
083 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
084 * @version $Id: BOMInputStream.java 1346400 2012-06-05 14:48:01Z ggregory $
085 * @since 2.0
086 */
087public class BOMInputStream extends ProxyInputStream {
088    private final boolean include;
089    /**
090     * BOMs are sorted from longest to shortest.
091     */
092    private final List<ByteOrderMark> boms;
093    private ByteOrderMark byteOrderMark;
094    private int[] firstBytes;
095    private int fbLength;
096    private int fbIndex;
097    private int markFbIndex;
098    private boolean markedAtStart;
099
100    /**
101     * Constructs a new BOM InputStream that excludes a {@link ByteOrderMark#UTF_8} BOM.
102     * 
103     * @param delegate
104     *            the InputStream to delegate to
105     */
106    public BOMInputStream(InputStream delegate) {
107        this(delegate, false, ByteOrderMark.UTF_8);
108    }
109
110    /**
111     * Constructs a new BOM InputStream that detects a a {@link ByteOrderMark#UTF_8} and optionally includes it.
112     * 
113     * @param delegate
114     *            the InputStream to delegate to
115     * @param include
116     *            true to include the UTF-8 BOM or false to exclude it
117     */
118    public BOMInputStream(InputStream delegate, boolean include) {
119        this(delegate, include, ByteOrderMark.UTF_8);
120    }
121
122    /**
123     * Constructs a new BOM InputStream that excludes the specified BOMs.
124     * 
125     * @param delegate
126     *            the InputStream to delegate to
127     * @param boms
128     *            The BOMs to detect and exclude
129     */
130    public BOMInputStream(InputStream delegate, ByteOrderMark... boms) {
131        this(delegate, false, boms);
132    }
133
134    /**
135     * Compares ByteOrderMark objects in descending length order.
136     */
137    private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = new Comparator<ByteOrderMark>() {
138
139        public int compare(ByteOrderMark bom1, ByteOrderMark bom2) {
140            int len1 = bom1.length();
141            int len2 = bom2.length();
142            if (len1 > len2) {
143                return -1;
144            }
145            if (len2 > len1) {
146                return 1;
147            }
148            return 0;
149        }
150    };
151
152    /**
153     * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
154     * 
155     * @param delegate
156     *            the InputStream to delegate to
157     * @param include
158     *            true to include the specified BOMs or false to exclude them
159     * @param boms
160     *            The BOMs to detect and optionally exclude
161     */
162    public BOMInputStream(InputStream delegate, boolean include, ByteOrderMark... boms) {
163        super(delegate);
164        if (boms == null || boms.length == 0) {
165            throw new IllegalArgumentException("No BOMs specified");
166        }
167        this.include = include;
168        // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
169        Arrays.sort(boms, ByteOrderMarkLengthComparator);
170        this.boms = Arrays.asList(boms);
171
172    }
173
174    /**
175     * Indicates whether the stream contains one of the specified BOMs.
176     * 
177     * @return true if the stream has one of the specified BOMs, otherwise false if it does not
178     * @throws IOException
179     *             if an error reading the first bytes of the stream occurs
180     */
181    public boolean hasBOM() throws IOException {
182        return getBOM() != null;
183    }
184
185    /**
186     * Indicates whether the stream contains the specified BOM.
187     * 
188     * @param bom
189     *            The BOM to check for
190     * @return true if the stream has the specified BOM, otherwise false if it does not
191     * @throws IllegalArgumentException
192     *             if the BOM is not one the stream is configured to detect
193     * @throws IOException
194     *             if an error reading the first bytes of the stream occurs
195     */
196    public boolean hasBOM(ByteOrderMark bom) throws IOException {
197        if (!boms.contains(bom)) {
198            throw new IllegalArgumentException("Stream not configure to detect " + bom);
199        }
200        return byteOrderMark != null && getBOM().equals(bom);
201    }
202
203    /**
204     * Return the BOM (Byte Order Mark).
205     * 
206     * @return The BOM or null if none
207     * @throws IOException
208     *             if an error reading the first bytes of the stream occurs
209     */
210    public ByteOrderMark getBOM() throws IOException {
211        if (firstBytes == null) {
212            fbLength = 0;
213            // BOMs are sorted from longest to shortest
214            final int maxBomSize = boms.get(0).length();
215            firstBytes = new int[maxBomSize];
216            // Read first maxBomSize bytes
217            for (int i = 0; i < firstBytes.length; i++) {
218                firstBytes[i] = in.read();
219                fbLength++;
220                if (firstBytes[i] < 0) {
221                    break;
222                }
223            }
224            // match BOM in firstBytes
225            byteOrderMark = find();
226            if (byteOrderMark != null) {
227                if (!include) {
228                    if (byteOrderMark.length() < firstBytes.length) {
229                        fbIndex = byteOrderMark.length();
230                    } else {
231                        fbLength = 0;
232                    }
233                }
234            }
235        }
236        return byteOrderMark;
237    }
238
239    /**
240     * Return the BOM charset Name - {@link ByteOrderMark#getCharsetName()}.
241     * 
242     * @return The BOM charset Name or null if no BOM found
243     * @throws IOException
244     *             if an error reading the first bytes of the stream occurs
245     * 
246     */
247    public String getBOMCharsetName() throws IOException {
248        getBOM();
249        return byteOrderMark == null ? null : byteOrderMark.getCharsetName();
250    }
251
252    /**
253     * This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte
254     * <code>read()</code> method, either returning a valid byte or -1 to indicate that the initial bytes have been
255     * processed already.
256     * 
257     * @return the byte read (excluding BOM) or -1 if the end of stream
258     * @throws IOException
259     *             if an I/O error occurs
260     */
261    private int readFirstBytes() throws IOException {
262        getBOM();
263        return fbIndex < fbLength ? firstBytes[fbIndex++] : -1;
264    }
265
266    /**
267     * Find a BOM with the specified bytes.
268     * 
269     * @return The matched BOM or null if none matched
270     */
271    private ByteOrderMark find() {
272        for (ByteOrderMark bom : boms) {
273            if (matches(bom)) {
274                return bom;
275            }
276        }
277        return null;
278    }
279
280    /**
281     * Check if the bytes match a BOM.
282     * 
283     * @param bom
284     *            The BOM
285     * @return true if the bytes match the bom, otherwise false
286     */
287    private boolean matches(ByteOrderMark bom) {
288        // if (bom.length() != fbLength) {
289        // return false;
290        // }
291        // firstBytes may be bigger than the BOM bytes
292        for (int i = 0; i < bom.length(); i++) {
293            if (bom.get(i) != firstBytes[i]) {
294                return false;
295            }
296        }
297        return true;
298    }
299
300    // ----------------------------------------------------------------------------
301    // Implementation of InputStream
302    // ----------------------------------------------------------------------------
303
304    /**
305     * Invokes the delegate's <code>read()</code> method, detecting and optionally skipping BOM.
306     * 
307     * @return the byte read (excluding BOM) or -1 if the end of stream
308     * @throws IOException
309     *             if an I/O error occurs
310     */
311    @Override
312    public int read() throws IOException {
313        int b = readFirstBytes();
314        return b >= 0 ? b : in.read();
315    }
316
317    /**
318     * Invokes the delegate's <code>read(byte[], int, int)</code> method, detecting and optionally skipping BOM.
319     * 
320     * @param buf
321     *            the buffer to read the bytes into
322     * @param off
323     *            The start offset
324     * @param len
325     *            The number of bytes to read (excluding BOM)
326     * @return the number of bytes read or -1 if the end of stream
327     * @throws IOException
328     *             if an I/O error occurs
329     */
330    @Override
331    public int read(byte[] buf, int off, int len) throws IOException {
332        int firstCount = 0;
333        int b = 0;
334        while (len > 0 && b >= 0) {
335            b = readFirstBytes();
336            if (b >= 0) {
337                buf[off++] = (byte) (b & 0xFF);
338                len--;
339                firstCount++;
340            }
341        }
342        int secondCount = in.read(buf, off, len);
343        return secondCount < 0 ? firstCount > 0 ? firstCount : -1 : firstCount + secondCount;
344    }
345
346    /**
347     * Invokes the delegate's <code>read(byte[])</code> method, detecting and optionally skipping BOM.
348     * 
349     * @param buf
350     *            the buffer to read the bytes into
351     * @return the number of bytes read (excluding BOM) or -1 if the end of stream
352     * @throws IOException
353     *             if an I/O error occurs
354     */
355    @Override
356    public int read(byte[] buf) throws IOException {
357        return read(buf, 0, buf.length);
358    }
359
360    /**
361     * Invokes the delegate's <code>mark(int)</code> method.
362     * 
363     * @param readlimit
364     *            read ahead limit
365     */
366    @Override
367    public synchronized void mark(int readlimit) {
368        markFbIndex = fbIndex;
369        markedAtStart = firstBytes == null;
370        in.mark(readlimit);
371    }
372
373    /**
374     * Invokes the delegate's <code>reset()</code> method.
375     * 
376     * @throws IOException
377     *             if an I/O error occurs
378     */
379    @Override
380    public synchronized void reset() throws IOException {
381        fbIndex = markFbIndex;
382        if (markedAtStart) {
383            firstBytes = null;
384        }
385
386        in.reset();
387    }
388
389    /**
390     * Invokes the delegate's <code>skip(long)</code> method, detecting and optionallyskipping BOM.
391     * 
392     * @param n
393     *            the number of bytes to skip
394     * @return the number of bytes to skipped or -1 if the end of stream
395     * @throws IOException
396     *             if an I/O error occurs
397     */
398    @Override
399    public long skip(long n) throws IOException {
400        while (n > 0 && readFirstBytes() >= 0) {
401            n--;
402        }
403        return in.skip(n);
404    }
405}