/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.gzip;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.EOFException;
import java.io.InputStream;
import java.io.DataInputStream;
import java.io.BufferedInputStream;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;
import java.util.zip.CRC32;

import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.utils.CharsetNames;

/**
 * Input stream that decompresses .gz files.
 * This supports decompressing concatenated .gz files which is important
 * when decompressing standalone .gz files.
 * <p>
 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
 * files: it stops after the first member and silently ignores the rest.
 * It doesn't leave the read position to point to the beginning of the next
 * member, which makes it difficult to work around the lack of concatenation
 * support.
 * <p>
 * Instead of using <code>GZIPInputStream</code>, this class has its own .gz
 * container format decoder. The actual decompression is done with
 * {@link java.util.zip.Inflater}.
 */
public class GzipCompressorInputStream extends CompressorInputStream {
    // Header flags
    // private static final int FTEXT = 0x01; // Uninteresting for us
    private static final int FHCRC = 0x02;
    private static final int FEXTRA = 0x04;
    private static final int FNAME = 0x08;
    private static final int FCOMMENT = 0x10;
    private static final int FRESERVED = 0xE0;

    // Mask that reinterprets a 32-bit int as an unsigned value held in a long
    private static final long UNSIGNED_INT_MASK = 0xFFFFFFFFL;

    // Compressed input stream, possibly wrapped in a BufferedInputStream
    private final InputStream in;

    // True if decompressing multimember streams.
    private final boolean decompressConcatenated;

    // Buffer to hold the input data
    private final byte[] buf = new byte[8192];

    // Amount of data in buf.
    private int bufUsed = 0;

    // Decompressor; set to null once it has been released
    private Inflater inf = new Inflater(true);

    // CRC32 from uncompressed data
    private final CRC32 crc = new CRC32();

    // Number of uncompressed bytes produced for the current member.
    // int arithmetic wraps modulo 2^32, which is how it is compared
    // against the 32-bit size stored in the member trailer.
    private int memberSize;

    // True once everything has been decompressed
    private boolean endReached = false;

    // used in no-arg read method
    private final byte[] oneByte = new byte[1];

    private final GzipParameters parameters = new GzipParameters();

    /**
     * Constructs a new input stream that decompresses gzip-compressed data
     * from the specified input stream.
     * <p>
     * This is equivalent to
     * <code>GzipCompressorInputStream(inputStream, false)</code> and thus
     * will not decompress concatenated .gz files.
     *
     * @param inputStream  the InputStream from which this object should
     *                     be created of
     *
     * @throws IOException if the stream could not be created
     */
    public GzipCompressorInputStream(InputStream inputStream)
            throws IOException {
        this(inputStream, false);
    }

    /**
     * Constructs a new input stream that decompresses gzip-compressed data
     * from the specified input stream.
     * <p>
     * If <code>decompressConcatenated</code> is {@code false}:
     * This decompressor might read more input than it will actually use.
     * If <code>inputStream</code> supports <code>mark</code> and
     * <code>reset</code>, then the input position will be adjusted
     * so that it is right after the last byte of the compressed stream.
     * If <code>mark</code> isn't supported, the input position will be
     * undefined.
     *
     * @param inputStream  the InputStream from which this object should
     *                     be created of
     * @param decompressConcatenated
     *                     if true, decompress until the end of the input;
     *                     if false, stop after the first .gz member
     *
     * @throws IOException if the stream could not be created
     */
    public GzipCompressorInputStream(InputStream inputStream,
                                     boolean decompressConcatenated)
            throws IOException {
        // Mark support is strictly needed for concatenated files only,
        // but it's simpler if it is always available.
        if (inputStream.markSupported()) {
            in = inputStream;
        } else {
            in = new BufferedInputStream(inputStream);
        }

        this.decompressConcatenated = decompressConcatenated;

        // If the header cannot be parsed the caller never receives an
        // object to close, so release the Inflater here instead.
        boolean initialized = false;
        try {
            init(true);
            initialized = true;
        } finally {
            if (!initialized) {
                inf.end();
            }
        }
    }

    /**
     * Provides the stream's meta data - may change with each stream
     * when decompressing concatenated streams.
     * @return the stream's meta data
     * @since 1.8
     */
    public GzipParameters getMetaData() {
        return parameters;
    }

    /**
     * Parses the header of the next .gz member, stores its meta data in
     * {@link #parameters} and resets the inflater, the CRC and the member
     * size counter.
     *
     * @param isFirstMember true when parsing the very first member
     * @return {@code false} if the end of the input was reached cleanly
     *         before a further member started, {@code true} if a header
     *         was parsed
     * @throws IOException if the magic bytes, the compression method or
     *         the flags are not acceptable, or if the input ends inside
     *         the header
     */
    private boolean init(boolean isFirstMember) throws IOException {
        assert isFirstMember || decompressConcatenated;

        // Check the magic bytes without a possibility of EOFException.
        int magic0 = in.read();
        int magic1 = in.read();

        // If end of input was reached after decompressing at least
        // one .gz member, we have reached the end of the file successfully.
        if (magic0 == -1 && !isFirstMember) {
            return false;
        }

        if (magic0 != 31 || magic1 != 139) {
            throw new IOException(isFirstMember
                                  ? "Input is not in the .gz format"
                                  : "Garbage after a valid .gz stream");
        }

        // Parsing the rest of the header may throw EOFException.
        DataInputStream inData = new DataInputStream(in);
        int method = inData.readUnsignedByte();
        if (method != Deflater.DEFLATED) {
            throw new IOException("Unsupported compression method "
                                  + method + " in the .gz header");
        }

        int flg = inData.readUnsignedByte();
        if ((flg & FRESERVED) != 0) {
            throw new IOException(
                    "Reserved flags are set in the .gz header");
        }

        // The header stores the time as 32 bits. Widen to an unsigned
        // long BEFORE scaling by 1000; doing the multiplication in int
        // arithmetic overflows for practically every real timestamp.
        long mtime = readLittleEndianInt(inData) & UNSIGNED_INT_MASK;
        parameters.setModificationTime(mtime * 1000L);

        switch (inData.readUnsignedByte()) { // extra flags
        case 2:
            parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
            break;
        case 4:
            parameters.setCompressionLevel(Deflater.BEST_SPEED);
            break;
        default:
            // ignored for now
            break;
        }
        parameters.setOperatingSystem(inData.readUnsignedByte());

        // Extra field, ignored
        if ((flg & FEXTRA) != 0) {
            int xlen = inData.readUnsignedByte();
            xlen |= inData.readUnsignedByte() << 8;

            // This isn't as efficient as calling in.skip would be,
            // but it's lazier to handle unexpected end of input this way.
            // Most files don't have an extra field anyway.
            while (xlen-- > 0) {
                inData.readUnsignedByte();
            }
        }

        // Original file name
        if ((flg & FNAME) != 0) {
            parameters.setFilename(new String(readToNull(inData),
                                              CharsetNames.ISO_8859_1));
        }

        // Comment
        if ((flg & FCOMMENT) != 0) {
            parameters.setComment(new String(readToNull(inData),
                                             CharsetNames.ISO_8859_1));
        }

        // Header "CRC16" which is actually a truncated CRC32 (which isn't
        // as good as real CRC16). I don't know if any encoder implementation
        // sets this, so it's not worth trying to verify it. GNU gzip 1.4
        // doesn't support this field, but zlib seems to be able to at least
        // skip over it.
        if ((flg & FHCRC) != 0) {
            inData.readShort();
        }

        // Reset
        inf.reset();
        crc.reset();
        memberSize = 0;

        return true;
    }

    /**
     * Reads bytes up to and including the next zero byte.
     *
     * @param inData the stream to read from
     * @return the bytes read, without the terminating zero byte
     * @throws IOException if reading fails or the input ends before a
     *         zero byte was seen
     */
    private byte[] readToNull(DataInputStream inData) throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        int b;
        while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD
            bos.write(b);
        }
        return bos.toByteArray();
    }

    /**
     * Reads four bytes and combines them into an int, least significant
     * byte first.
     *
     * @param inData the stream to read from
     * @return the assembled 32-bit value; mask it with
     *         {@link #UNSIGNED_INT_MASK} to obtain the unsigned value
     * @throws IOException if reading fails or the input ends early
     */
    private int readLittleEndianInt(DataInputStream inData) throws IOException {
        return inData.readUnsignedByte()
            | (inData.readUnsignedByte() << 8)
            | (inData.readUnsignedByte() << 16)
            | (inData.readUnsignedByte() << 24);
    }

    @Override
    public int read() throws IOException {
        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
    }

    /**
     * {@inheritDoc}
     *
     * @since 1.1
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        if (endReached) {
            return -1;
        }

        int size = 0;

        while (len > 0) {
            if (inf.needsInput()) {
                // Remember the current position because we may need to
                // rewind after reading too much input.
                in.mark(buf.length);

                bufUsed = in.read(buf);
                if (bufUsed == -1) {
                    throw new EOFException();
                }

                inf.setInput(buf, 0, bufUsed);
            }

            int ret;
            try {
                ret = inf.inflate(b, off, len);
            } catch (DataFormatException e) {
                // Keep the inflater's diagnosis attached as the cause.
                IOException corrupt =
                    new IOException("Gzip-compressed data is corrupt");
                corrupt.initCause(e);
                throw corrupt;
            }

            crc.update(b, off, ret);
            memberSize += ret;
            off += ret;
            len -= ret;
            size += ret;
            count(ret);

            if (inf.finished()) {
                // We may have read too many bytes. Rewind the read
                // position to match the actual amount used.
                //
                // NOTE: The "if" is there just in case. Since we used
                // in.mark earlier, it should always skip enough.
                in.reset();

                int skipAmount = bufUsed - inf.getRemaining();
                if (in.skip(skipAmount) != skipAmount) {
                    throw new IOException("Unable to reposition the input"
                                          + " after the compressed data");
                }

                bufUsed = 0;

                DataInputStream inData = new DataInputStream(in);

                // CRC32, stored as an unsigned 32-bit little-endian value
                long crcStored = readLittleEndianInt(inData) & UNSIGNED_INT_MASK;

                if (crcStored != crc.getValue()) {
                    throw new IOException("Gzip-compressed data is corrupt "
                                          + "(CRC32 error)");
                }

                // Uncompressed size modulo 2^32 (ISIZE in the spec)
                int isize = readLittleEndianInt(inData);

                if (isize != memberSize) {
                    throw new IOException("Gzip-compressed data is corrupt "
                                          + "(uncompressed size mismatch)");
                }

                // See if this is the end of the file.
                if (!decompressConcatenated || !init(false)) {
                    inf.end();
                    inf = null;
                    endReached = true;
                    return size == 0 ? -1 : size;
                }
            }
        }

        return size;
    }

    /**
     * Checks if the signature matches what is expected for a .gz file.
     *
     * @param signature the bytes to check
     * @param length    the number of bytes to check
     * @return          true if this is a .gz stream, false otherwise
     *
     * @since 1.1
     */
    public static boolean matches(byte[] signature, int length) {
        // -117 is the magic byte 139 (0x8B) seen as a signed Java byte.
        return length >= 2
            && signature[0] == 31
            && signature[1] == -117;
    }

    /**
     * Closes the input stream (unless it is System.in).
     *
     * @since 1.2
     */
    @Override
    public void close() throws IOException {
        if (inf != null) {
            inf.end();
            inf = null;
        }

        if (this.in != System.in) {
            this.in.close();
        }
    }
}