/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */

/*
 * This package is based on the work done by Timothy Gerard Endres
 * (time@ice.com) to whom the Ant project is very grateful for his great code.
 */

package org.apache.commons.compress.archivers.tar;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.ArchiveUtils;
import org.apache.commons.compress.utils.CharsetNames;
import org.apache.commons.compress.utils.IOUtils;

/**
 * The TarInputStream reads a UNIX tar archive as an InputStream.
 * Methods are provided to position at each successive entry in
 * the archive, and then read each entry as a normal input stream
 * using read().
 * @NotThreadSafe
 */
public class TarArchiveInputStream extends ArchiveInputStream {

    private static final int SMALL_BUFFER_SIZE = 256;

    /** Scratch buffer used while collecting GNU long name/link data. */
    private final byte[] SMALL_BUF = new byte[SMALL_BUFFER_SIZE];

    /** The size of a single record; entry headers are read one record at a time */
    private final int recordSize;

    /** The size of a block */
    private final int blockSize;

    /** True if file has hit EOF */
    private boolean hasHitEOF;

    /** Size of the current entry */
    private long entrySize;

    /** How far into the entry the stream is at */
    private long entryOffset;

    /** An input stream to read from */
    private final InputStream is;

    /** The meta-data about the current entry */
    private TarArchiveEntry currEntry;

    /** The encoding of the file */
    private final ZipEncoding encoding;

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     */
    public TarArchiveInputStream(InputStream is) {
        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(InputStream is, String encoding) {
        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE,
             encoding);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     */
    public TarArchiveInputStream(InputStream is, int blockSize) {
        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(InputStream is, int blockSize,
                                 String encoding) {
        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE, encoding);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     */
    public TarArchiveInputStream(InputStream is, int blockSize, int recordSize) {
        this(is, blockSize, recordSize, null);
    }

    /**
     * Constructor for TarInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(InputStream is, int blockSize, int recordSize,
                                 String encoding) {
        this.is = is;
        this.hasHitEOF = false;
        this.encoding = ZipEncodingHelper.getZipEncoding(encoding);
        this.recordSize = recordSize;
        this.blockSize = blockSize;
    }

    /**
     * Closes this stream by closing the underlying input stream.
     * @throws IOException on error
     */
    @Override
    public void close() throws IOException {
        is.close();
    }

    /**
     * Get the record size being used by this stream.
     *
     * @return The record size.
     */
    public int getRecordSize() {
        return recordSize;
    }

    /**
     * Get the available data that can be read from the current
     * entry in the archive. This does not indicate how much data
     * is left in the entire archive, only in the current entry.
     * This value is determined from the entry's size header field
     * and the amount of data already read from the current entry.
     * Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE
     * bytes are left in the current entry in the archive.
     *
     * @return The number of available bytes for the current entry.
     * @throws IOException never thrown by this implementation; declared
     * only to match the overridden method's signature
     */
    @Override
    public int available() throws IOException {
        if (entrySize - entryOffset > Integer.MAX_VALUE) {
            return Integer.MAX_VALUE;
        }
        return (int) (entrySize - entryOffset);
    }


    /**
     * Skips over and discards <code>n</code> bytes of data from this input
     * stream. The <code>skip</code> method may, for a variety of reasons, end
     * up skipping over some smaller number of bytes, possibly <code>0</code>.
     * This may result from any of a number of conditions; reaching end of file
     * or end of entry before <code>n</code> bytes have been skipped are only
     * two possibilities. The actual number of bytes skipped is returned. If
     * <code>n</code> is negative, no bytes are skipped.
     *
     *
     * @param n
     *            the number of bytes to be skipped.
     * @return the actual number of bytes skipped.
     * @exception IOException
     *                if some other I/O error occurs.
     */
    @Override
    public long skip(final long n) throws IOException {
        if (n <= 0) {
            return 0;
        }

        // never skip past the end of the current entry
        final long available = entrySize - entryOffset;
        final long skipped = is.skip(Math.min(n, available));
        count(skipped);
        entryOffset += skipped;
        return skipped;
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     */
    @Override
    public synchronized void reset() {
    }

    /**
     * Get the next entry in this tar archive. This will skip
     * over any remaining data in the current entry, if there
     * is one, and place the input stream at the header of the
     * next entry, and read the header and instantiate a new
     * TarEntry from the header bytes and return that entry.
     * If there are no more entries in the archive, null will
     * be returned to indicate that the end of the archive has
     * been reached.
     *
     * <p>Null is also returned when the archive ends right after a
     * GNU long name/link entry, a Pax header or a GNU sparse
     * extension header, i.e. when the entry those headers describe
     * is missing.</p>
     *
     * @return The next TarEntry in the archive, or null.
     * @throws IOException on error
     */
    public TarArchiveEntry getNextTarEntry() throws IOException {
        if (hasHitEOF) {
            return null;
        }

        if (currEntry != null) {
            /* Skip will only go to the end of the current entry */
            IOUtils.skip(this, Long.MAX_VALUE);

            /* skip to the end of the last record */
            skipRecordPadding();
        }

        byte[] headerBuf = getRecord();

        if (headerBuf == null) {
            /* hit EOF */
            currEntry = null;
            return null;
        }

        try {
            currEntry = new TarArchiveEntry(headerBuf, encoding);
        } catch (IllegalArgumentException e) {
            IOException ioe = new IOException("Error detected parsing the header");
            ioe.initCause(e);
            throw ioe;
        }

        entryOffset = 0;
        entrySize = currEntry.getSize();

        if (currEntry.isGNULongLinkEntry()) {
            byte[] longLinkData = getLongNameData();
            if (longLinkData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long link entry name not followed by
                // entry
                return null;
            }
            currEntry.setLinkName(encoding.decode(longLinkData));
        }

        if (currEntry.isGNULongNameEntry()) {
            byte[] longNameData = getLongNameData();
            if (longNameData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long entry name not followed by
                // entry
                return null;
            }
            currEntry.setName(encoding.decode(longNameData));
        }

        if (currEntry.isPaxHeader()){ // Process Pax headers
            paxHeaders();
            if (currEntry == null) {
                // Malformed tar file - Pax header not followed by entry;
                // treated like the long name case above
                return null;
            }
        }

        if (currEntry.isGNUSparse()){ // Process sparse files
            readGNUSparse();
            if (currEntry == null) {
                // Archive ended while reading the sparse extension headers
                return null;
            }
        }

        // If the size of the next element in the archive has changed
        // due to a new size being reported in the posix header
        // information, we update entrySize here so that it contains
        // the correct value.
        entrySize = currEntry.getSize();

        return currEntry;
    }

    /**
     * The last record block should be written at the full size, so skip any
     * additional space used to fill a record after an entry
     */
    private void skipRecordPadding() throws IOException {
        if (this.entrySize > 0 && this.entrySize % this.recordSize != 0) {
            long numRecords = (this.entrySize / this.recordSize) + 1;
            long padding = (numRecords * this.recordSize) - this.entrySize;
            long skipped = IOUtils.skip(is, padding);
            count(skipped);
        }
    }

    /**
     * Reads the data of the current entry as longname data and then
     * advances to the next entry in this tar archive.
     *
     * @return The data of the current entry with trailing null bytes
     * removed, or null if no entry follows it.
     * @throws IOException on error
     */
    protected byte[] getLongNameData() throws IOException {
        // read in the name
        ByteArrayOutputStream longName = new ByteArrayOutputStream();
        int length = 0;
        while ((length = read(SMALL_BUF)) >= 0) {
            longName.write(SMALL_BUF, 0, length);
        }
        getNextEntry();
        if (currEntry == null) {
            // Bugzilla: 40334
            // Malformed tar file - long entry name not followed by entry
            return null;
        }
        byte[] longNameData = longName.toByteArray();
        // remove trailing null terminator(s)
        length = longNameData.length;
        while (length > 0 && longNameData[length - 1] == 0) {
            --length;
        }
        if (length != longNameData.length) {
            byte[] l = new byte[length];
            System.arraycopy(longNameData, 0, l, 0, length);
            longNameData = l;
        }
        return longNameData;
    }

    /**
     * Get the next record in this tar archive.
     *
     * <p>If the record marks the end of the archive, or no complete
     * record could be read, null will be returned to indicate that
     * the end of the archive has been reached.  At the same time the
     * {@code hasHitEOF} marker will be set to true.</p>
     *
     * @return The next header in the archive, or null.
     * @throws IOException on error
     */
    private byte[] getRecord() throws IOException {
        byte[] headerBuf = readRecord();
        hasHitEOF = isEOFRecord(headerBuf);
        if (hasHitEOF && headerBuf != null) {
            // a real (all-zero) EOF record was read: consume what follows it
            tryToConsumeSecondEOFRecord();
            consumeRemainderOfLastBlock();
            headerBuf = null;
        }
        return headerBuf;
    }

    /**
     * Determine if an archive record indicate End of Archive. End of
     * archive is indicated by a record that consists entirely of null bytes.
     * A null record is treated as End of Archive as well.
     *
     * @param record The record data to check.
     * @return true if the record data is an End of Archive
     */
    protected boolean isEOFRecord(byte[] record) {
        return record == null || ArchiveUtils.isArrayZero(record, recordSize);
    }

    /**
     * Read a record from the input stream and return the data.
     *
     * @return The record data or null if fewer bytes than a full record
     * could be read.
     * @throws IOException on error
     */
    protected byte[] readRecord() throws IOException {

        byte[] record = new byte[recordSize];

        int readNow = IOUtils.readFully(is, record);
        count(readNow);
        if (readNow != recordSize) {
            return null;
        }

        return record;
    }

    /**
     * Parses the Pax headers stored in the current entry, advances to the
     * entry they describe and applies them to it.
     *
     * <p>If no entry follows the Pax header, {@code currEntry} is left
     * null and nothing is applied.</p>
     *
     * @throws IOException on error
     */
    private void paxHeaders() throws IOException{
        Map<String, String> headers = parsePaxHeaders(this);
        getNextEntry(); // Get the actual file entry
        if (currEntry != null) {
            applyPaxHeadersToCurrentEntry(headers);
        }
    }

    /**
     * Parses Pax header records of the form "length keyword=value\n"
     * until the given stream is exhausted.
     *
     * @param i the stream to read the records from
     * @return the parsed headers, keyword mapped to value
     * @throws IOException if reading fails, a record is shorter than
     * announced, or a record's declared length leaves no room for its value
     */
    Map<String, String> parsePaxHeaders(InputStream i) throws IOException {
        Map<String, String> headers = new HashMap<String, String>();
        // Format is "length keyword=value\n";
        while(true){ // get length
            int ch;
            int len = 0;
            int read = 0;
            while((ch = i.read()) != -1) {
                read++;
                if (ch == ' '){ // End of length string
                    // Get keyword
                    ByteArrayOutputStream coll = new ByteArrayOutputStream();
                    while((ch = i.read()) != -1) {
                        read++;
                        if (ch == '='){ // end of keyword
                            String keyword = coll.toString(CharsetNames.UTF_8);
                            // Get rest of entry
                            final int restLen = len - read;
                            if (restLen <= 0) {
                                // the value must at least hold the trailing NL;
                                // fail as an I/O error instead of a
                                // NegativeArraySize/StringIndexOutOfBounds exception
                                throw new IOException("Failed to read "
                                                      + "Paxheader. Declared length "
                                                      + len
                                                      + " leaves no room for a value after "
                                                      + read
                                                      + " bytes");
                            }
                            byte[] rest = new byte[restLen];
                            int got = IOUtils.readFully(i, rest);
                            if (got != restLen){
                                throw new IOException("Failed to read "
                                                      + "Paxheader. Expected "
                                                      + restLen
                                                      + " bytes, read "
                                                      + got);
                            }
                            // Drop trailing NL
                            String value = new String(rest, 0,
                                                      restLen - 1, CharsetNames.UTF_8);
                            headers.put(keyword, value);
                            break;
                        }
                        coll.write((byte) ch);
                    }
                    break; // Processed single header
                }
                len *= 10;
                len += ch - '0';
            }
            if (ch == -1){ // EOF
                break;
            }
        }
        return headers;
    }

    /**
     * Copies the supported Pax header values onto the current entry.
     * Keywords that are not listed below are ignored.
     *
     * @param headers the parsed Pax headers, keyword mapped to value
     */
    private void applyPaxHeadersToCurrentEntry(Map<String, String> headers) {
        /*
         * The following headers are defined for Pax.
         * atime, ctime, charset: cannot use these without changing TarArchiveEntry fields
         * mtime
         * comment
         * gid, gname
         * linkpath
         * size
         * uid,uname
         * SCHILY.devminor, SCHILY.devmajor: don't have setters/getters for those
         */
        for (Entry<String, String> ent : headers.entrySet()){
            String key = ent.getKey();
            String val = ent.getValue();
            if ("path".equals(key)){
                currEntry.setName(val);
            } else if ("linkpath".equals(key)){
                currEntry.setLinkName(val);
            } else if ("gid".equals(key)){
                currEntry.setGroupId(Integer.parseInt(val));
            } else if ("gname".equals(key)){
                currEntry.setGroupName(val);
            } else if ("uid".equals(key)){
                currEntry.setUserId(Integer.parseInt(val));
            } else if ("uname".equals(key)){
                currEntry.setUserName(val);
            } else if ("size".equals(key)){
                currEntry.setSize(Long.parseLong(val));
            } else if ("mtime".equals(key)){
                // value is in (fractional) seconds, the setter is fed milliseconds
                currEntry.setModTime((long) (Double.parseDouble(val) * 1000));
            } else if ("SCHILY.devminor".equals(key)){
                currEntry.setDevMinor(Integer.parseInt(val));
            } else if ("SCHILY.devmajor".equals(key)){
                currEntry.setDevMajor(Integer.parseInt(val));
            }
        }
    }

    /**
     * Reads past any additional sparse header records following the
     * current entry. The sparse chunks themselves are not collected.
     *
     * <p>If the archive ends while such a record is expected,
     * {@code currEntry} is set to null.</p>
     *
     * @throws IOException on error
     *
     * @todo Sparse files get not yet really processed.
     */
    private void readGNUSparse() throws IOException {
        /* we do not really process sparse files yet
        sparses = new ArrayList();
        sparses.addAll(currEntry.getSparses());
        */
        if (currEntry.isExtended()) {
            TarArchiveSparseEntry entry;
            do {
                byte[] headerBuf = getRecord();
                if (headerBuf == null) {
                    currEntry = null;
                    break;
                }
                entry = new TarArchiveSparseEntry(headerBuf);
                /* we do not really process sparse files yet
                sparses.addAll(entry.getSparses());
                */
            } while (entry.isExtended());
        }
    }

    /**
     * Returns the next Archive Entry in this Stream.
     *
     * @return the next entry,
     *         or {@code null} if there are no more entries
     * @throws IOException if the next entry could not be read
     */
    @Override
    public ArchiveEntry getNextEntry() throws IOException {
        return getNextTarEntry();
    }

    /**
     * Tries to read the next record rewinding the stream if it is not a EOF record.
     *
     * <p>This is meant to protect against cases where a tar
     * implementation has written only one EOF record when two are
     * expected.  Actually this won't help since a non-conforming
     * implementation likely won't fill full blocks consisting of - by
     * default - ten records either so we probably have already read
     * beyond the archive anyway.</p>
     */
    private void tryToConsumeSecondEOFRecord() throws IOException {
        boolean shouldReset = true;
        boolean marked = is.markSupported();
        if (marked) {
            is.mark(recordSize);
        }
        try {
            shouldReset = !isEOFRecord(readRecord());
        } finally {
            if (shouldReset && marked) {
                // undo the byte accounting done by readRecord before rewinding
                pushedBackBytes(recordSize);
                is.reset();
            }
        }
    }

    /**
     * Reads bytes from the current tar archive entry.
     *
     * This method is aware of the boundaries of the current
     * entry in the archive and will deal with them as if they
     * were this stream's start and EOF.
     *
     * @param buf The buffer into which to place bytes read.
     * @param offset The offset at which to place bytes read.
     * @param numToRead The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error, in particular if the underlying
     * stream ends before the current entry has been read completely
     * @throws IllegalStateException if there is no current entry
     */
    @Override
    public int read(byte[] buf, int offset, int numToRead) throws IOException {
        int totalRead = 0;

        if (hasHitEOF || entryOffset >= entrySize) {
            return -1;
        }

        if (currEntry == null) {
            throw new IllegalStateException("No current tar entry");
        }

        // never read beyond the end of the current entry
        numToRead = Math.min(numToRead, available());

        totalRead = is.read(buf, offset, numToRead);

        if (totalRead == -1) {
            if (numToRead > 0) {
                throw new IOException("Truncated TAR archive");
            }
            hasHitEOF = true;
        } else {
            count(totalRead);
            entryOffset += totalRead;
        }

        return totalRead;
    }

    /**
     * Whether this class is able to read the given entry.
     *
     * <p>Returns false if the given entry is a GNU sparse file or is
     * not a {@link TarArchiveEntry}.</p>
     */
    @Override
    public boolean canReadEntryData(ArchiveEntry ae) {
        if (ae instanceof TarArchiveEntry) {
            TarArchiveEntry te = (TarArchiveEntry) ae;
            return !te.isGNUSparse();
        }
        return false;
    }

    /**
     * Get the current TAR Archive Entry that this input stream is processing
     *
     * @return The current Archive Entry
     */
    public TarArchiveEntry getCurrentEntry() {
        return currEntry;
    }

    /**
     * Replaces the entry this stream considers to be the current one.
     *
     * @param e the new current entry
     */
    protected final void setCurrentEntry(TarArchiveEntry e) {
        currEntry = e;
    }

    /**
     * Whether the end of the archive has been reached.
     *
     * @return the value of the EOF marker
     */
    protected final boolean isAtEOF() {
        return hasHitEOF;
    }

    /**
     * Sets the EOF marker.
     *
     * @param b the new value of the EOF marker
     */
    protected final void setAtEOF(boolean b) {
        hasHitEOF = b;
    }

    /**
     * This method is invoked once the end of the archive is hit, it
     * tries to consume the remaining bytes under the assumption that
     * the tool creating this archive has padded the last block.
     */
    private void consumeRemainderOfLastBlock() throws IOException {
        long bytesReadOfLastBlock = getBytesRead() % blockSize;
        if (bytesReadOfLastBlock > 0) {
            long skipped = IOUtils.skip(is, blockSize - bytesReadOfLastBlock);
            count(skipped);
        }
    }

    /**
     * Checks if the signature matches what is expected for a tar file.
     *
     * <p>The magic and version fields are compared against the POSIX,
     * GNU and Ant variants.</p>
     *
     * @param signature
     *            the bytes to check
     * @param length
     *            the number of bytes to check
     * @return true, if this stream is a tar archive stream, false otherwise
     */
    public static boolean matches(byte[] signature, int length) {
        if (length < TarConstants.VERSION_OFFSET+TarConstants.VERSIONLEN) {
            return false;
        }

        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
                ){
            return true;
        }
        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            (
             ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
            ||
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
            )
                ){
            return true;
        }
        // COMPRESS-107 - recognise Ant tar files
        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
                ){
            return true;
        }
        return false;
    }

}