001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.gzip;
020
021import java.io.ByteArrayOutputStream;
022import java.io.IOException;
023import java.io.EOFException;
024import java.io.InputStream;
025import java.io.DataInputStream;
026import java.io.BufferedInputStream;
027import java.util.zip.DataFormatException;
028import java.util.zip.Deflater;
029import java.util.zip.Inflater;
030import java.util.zip.CRC32;
031
032import org.apache.commons.compress.compressors.CompressorInputStream;
033import org.apache.commons.compress.utils.CharsetNames;
034
035/**
036 * Input stream that decompresses .gz files.
037 * This supports decompressing concatenated .gz files which is important
038 * when decompressing standalone .gz files.
039 * <p>
040 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
041 * files: it stops after the first member and silently ignores the rest.
042 * It doesn't leave the read position to point to the beginning of the next
043 * member, which makes it difficult workaround the lack of concatenation
044 * support.
045 * <p>
046 * Instead of using <code>GZIPInputStream</code>, this class has its own .gz
047 * container format decoder. The actual decompression is done with
048 * {@link java.util.zip.Inflater}.
049 */
050public class GzipCompressorInputStream extends CompressorInputStream {
051    // Header flags
052    // private static final int FTEXT = 0x01; // Uninteresting for us
053    private static final int FHCRC = 0x02;
054    private static final int FEXTRA = 0x04;
055    private static final int FNAME = 0x08;
056    private static final int FCOMMENT = 0x10;
057    private static final int FRESERVED = 0xE0;
058
059    // Compressed input stream, possibly wrapped in a BufferedInputStream
060    private final InputStream in;
061
062    // True if decompressing multimember streams.
063    private final boolean decompressConcatenated;
064
065    // Buffer to hold the input data
066    private final byte[] buf = new byte[8192];
067
068    // Amount of data in buf.
069    private int bufUsed = 0;
070
071    // Decompressor
072    private Inflater inf = new Inflater(true);
073
074    // CRC32 from uncompressed data
075    private final CRC32 crc = new CRC32();
076
077    private int memberSize;
078
079    // True once everything has been decompressed
080    private boolean endReached = false;
081
082    // used in no-arg read method
083    private final byte[] oneByte = new byte[1];
084
085    private final GzipParameters parameters = new GzipParameters();
086
087    /**
088     * Constructs a new input stream that decompresses gzip-compressed data
089     * from the specified input stream.
090     * <p>
091     * This is equivalent to
092     * <code>GzipCompressorInputStream(inputStream, false)</code> and thus
093     * will not decompress concatenated .gz files.
094     *
095     * @param inputStream  the InputStream from which this object should
096     *                     be created of
097     *
098     * @throws IOException if the stream could not be created
099     */
100    public GzipCompressorInputStream(InputStream inputStream)
101            throws IOException {
102        this(inputStream, false);
103    }
104
105    /**
106     * Constructs a new input stream that decompresses gzip-compressed data
107     * from the specified input stream.
108     * <p>
109     * If <code>decompressConcatenated</code> is {@code false}:
110     * This decompressor might read more input than it will actually use.
111     * If <code>inputStream</code> supports <code>mark</code> and
112     * <code>reset</code>, then the input position will be adjusted
113     * so that it is right after the last byte of the compressed stream.
114     * If <code>mark</code> isn't supported, the input position will be
115     * undefined.
116     *
117     * @param inputStream  the InputStream from which this object should
118     *                     be created of
119     * @param decompressConcatenated
120     *                     if true, decompress until the end of the input;
121     *                     if false, stop after the first .gz member
122     *
123     * @throws IOException if the stream could not be created
124     */
125    public GzipCompressorInputStream(InputStream inputStream,
126                                     boolean decompressConcatenated)
127            throws IOException {
128        // Mark support is strictly needed for concatenated files only,
129        // but it's simpler if it is always available.
130        if (inputStream.markSupported()) {
131            in = inputStream;
132        } else {
133            in = new BufferedInputStream(inputStream);
134        }
135
136        this.decompressConcatenated = decompressConcatenated;
137        init(true);
138    }
139
140    /**
141     * Provides the stream's meta data - may change with each stream
142     * when decompressing concatenated streams.
143     * @return the stream's meta data
144     * @since 1.8
145     */
146    public GzipParameters getMetaData() {
147        return parameters;
148    }
149
150    private boolean init(boolean isFirstMember) throws IOException {
151        assert isFirstMember || decompressConcatenated;
152
153        // Check the magic bytes without a possibility of EOFException.
154        int magic0 = in.read();
155        int magic1 = in.read();
156
157        // If end of input was reached after decompressing at least
158        // one .gz member, we have reached the end of the file successfully.
159        if (magic0 == -1 && !isFirstMember) {
160            return false;
161        }
162
163        if (magic0 != 31 || magic1 != 139) {
164            throw new IOException(isFirstMember
165                                  ? "Input is not in the .gz format"
166                                  : "Garbage after a valid .gz stream");
167        }
168
169        // Parsing the rest of the header may throw EOFException.
170        DataInputStream inData = new DataInputStream(in);
171        int method = inData.readUnsignedByte();
172        if (method != Deflater.DEFLATED) {
173            throw new IOException("Unsupported compression method "
174                                  + method + " in the .gz header");
175        }
176
177        int flg = inData.readUnsignedByte();
178        if ((flg & FRESERVED) != 0) {
179            throw new IOException(
180                    "Reserved flags are set in the .gz header");
181        }
182
183        parameters.setModificationTime(readLittleEndianInt(inData) * 1000);
184        switch (inData.readUnsignedByte()) { // extra flags
185        case 2:
186            parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
187            break;
188        case 4:
189            parameters.setCompressionLevel(Deflater.BEST_SPEED);
190            break;
191        default:
192            // ignored for now
193            break;
194        }
195        parameters.setOperatingSystem(inData.readUnsignedByte());
196
197        // Extra field, ignored
198        if ((flg & FEXTRA) != 0) {
199            int xlen = inData.readUnsignedByte();
200            xlen |= inData.readUnsignedByte() << 8;
201
202            // This isn't as efficient as calling in.skip would be,
203            // but it's lazier to handle unexpected end of input this way.
204            // Most files don't have an extra field anyway.
205            while (xlen-- > 0) {
206                inData.readUnsignedByte();
207            }
208        }
209
210        // Original file name
211        if ((flg & FNAME) != 0) {
212            parameters.setFilename(new String(readToNull(inData),
213                                              CharsetNames.ISO_8859_1));
214        }
215
216        // Comment
217        if ((flg & FCOMMENT) != 0) {
218            parameters.setComment(new String(readToNull(inData),
219                                             CharsetNames.ISO_8859_1));
220        }
221
222        // Header "CRC16" which is actually a truncated CRC32 (which isn't
223        // as good as real CRC16). I don't know if any encoder implementation
224        // sets this, so it's not worth trying to verify it. GNU gzip 1.4
225        // doesn't support this field, but zlib seems to be able to at least
226        // skip over it.
227        if ((flg & FHCRC) != 0) {
228            inData.readShort();
229        }
230
231        // Reset
232        inf.reset();
233        crc.reset();
234        memberSize = 0;
235
236        return true;
237    }
238
239    private byte[] readToNull(DataInputStream inData) throws IOException {
240        ByteArrayOutputStream bos = new ByteArrayOutputStream();
241        int b = 0;
242        while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD
243            bos.write(b);
244        }
245        return bos.toByteArray();
246    }
247
248    private int readLittleEndianInt(DataInputStream inData) throws IOException {
249        return inData.readUnsignedByte()
250            | (inData.readUnsignedByte() << 8)
251            | (inData.readUnsignedByte() << 16)
252            | (inData.readUnsignedByte() << 24);
253    }
254
255    @Override
256    public int read() throws IOException {
257        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
258    }
259
260    /**
261     * {@inheritDoc}
262     *
263     * @since 1.1
264     */
265    @Override
266    public int read(byte[] b, int off, int len) throws IOException {
267        if (endReached) {
268            return -1;
269        }
270
271        int size = 0;
272
273        while (len > 0) {
274            if (inf.needsInput()) {
275                // Remember the current position because we may need to
276                // rewind after reading too much input.
277                in.mark(buf.length);
278
279                bufUsed = in.read(buf);
280                if (bufUsed == -1) {
281                    throw new EOFException();
282                }
283
284                inf.setInput(buf, 0, bufUsed);
285            }
286
287            int ret;
288            try {
289                ret = inf.inflate(b, off, len);
290            } catch (DataFormatException e) {
291                throw new IOException("Gzip-compressed data is corrupt");
292            }
293
294            crc.update(b, off, ret);
295            memberSize += ret;
296            off += ret;
297            len -= ret;
298            size += ret;
299            count(ret);
300
301            if (inf.finished()) {
302                // We may have read too many bytes. Rewind the read
303                // position to match the actual amount used.
304                //
305                // NOTE: The "if" is there just in case. Since we used
306                // in.mark earler, it should always skip enough.
307                in.reset();
308
309                int skipAmount = bufUsed - inf.getRemaining();
310                if (in.skip(skipAmount) != skipAmount) {
311                    throw new IOException();
312                }
313
314                bufUsed = 0;
315
316                DataInputStream inData = new DataInputStream(in);
317
318                // CRC32
319                long crcStored = 0;
320                for (int i = 0; i < 4; ++i) {
321                    crcStored |= (long)inData.readUnsignedByte() << (i * 8);
322                }
323
324                if (crcStored != crc.getValue()) {
325                    throw new IOException("Gzip-compressed data is corrupt "
326                                          + "(CRC32 error)");
327                }
328
329                // Uncompressed size modulo 2^32 (ISIZE in the spec)
330                int isize = 0;
331                for (int i = 0; i < 4; ++i) {
332                    isize |= inData.readUnsignedByte() << (i * 8);
333                }
334
335                if (isize != memberSize) {
336                    throw new IOException("Gzip-compressed data is corrupt"
337                                          + "(uncompressed size mismatch)");
338                }
339
340                // See if this is the end of the file.
341                if (!decompressConcatenated || !init(false)) {
342                    inf.end();
343                    inf = null;
344                    endReached = true;
345                    return size == 0 ? -1 : size;
346                }
347            }
348        }
349
350        return size;
351    }
352
353    /**
354     * Checks if the signature matches what is expected for a .gz file.
355     *
356     * @param signature the bytes to check
357     * @param length    the number of bytes to check
358     * @return          true if this is a .gz stream, false otherwise
359     *
360     * @since 1.1
361     */
362    public static boolean matches(byte[] signature, int length) {
363
364        if (length < 2) {
365            return false;
366        }
367
368        if (signature[0] != 31) {
369            return false;
370        }
371
372        if (signature[1] != -117) {
373            return false;
374        }
375
376        return true;
377    }
378
379    /**
380     * Closes the input stream (unless it is System.in).
381     *
382     * @since 1.2
383     */
384    @Override
385    public void close() throws IOException {
386        if (inf != null) {
387            inf.end();
388            inf = null;
389        }
390
391        if (this.in != System.in) {
392            this.in.close();
393        }
394    }
395}