001/*
002 *  Licensed to the Apache Software Foundation (ASF) under one or more
003 *  contributor license agreements.  See the NOTICE file distributed with
004 *  this work for additional information regarding copyright ownership.
005 *  The ASF licenses this file to You under the Apache License, Version 2.0
006 *  (the "License"); you may not use this file except in compliance with
007 *  the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 *  Unless required by applicable law or agreed to in writing, software
012 *  distributed under the License is distributed on an "AS IS" BASIS,
013 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 *  See the License for the specific language governing permissions and
015 *  limitations under the License.
016 *
017 */
018
019/*
020 * This package is based on the work done by Timothy Gerard Endres
021 * (time@ice.com) to whom the Ant project is very grateful for his great code.
022 */
023
024package org.apache.commons.compress.archivers.tar;
025
026import java.io.ByteArrayOutputStream;
027import java.io.IOException;
028import java.io.InputStream;
029import java.util.HashMap;
030import java.util.Map;
031import java.util.Map.Entry;
032
033import org.apache.commons.compress.archivers.ArchiveEntry;
034import org.apache.commons.compress.archivers.ArchiveInputStream;
035import org.apache.commons.compress.archivers.zip.ZipEncoding;
036import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
037import org.apache.commons.compress.utils.ArchiveUtils;
038import org.apache.commons.compress.utils.CharsetNames;
039import org.apache.commons.compress.utils.IOUtils;
040
041/**
 * The TarInputStream reads a UNIX tar archive as an InputStream.
 * Methods are provided to position at each successive entry in
 * the archive, and then read each entry as a normal input stream
 * using read().
046 * @NotThreadSafe
047 */
048public class TarArchiveInputStream extends ArchiveInputStream {
049
050    private static final int SMALL_BUFFER_SIZE = 256;
051
052    private final byte[] SMALL_BUF = new byte[SMALL_BUFFER_SIZE];
053
    /** The size of a TAR record (and thus of a TAR header) in bytes */
055    private final int recordSize;
056
057    /** The size of a block */
058    private final int blockSize;
059
060    /** True if file has hit EOF */
061    private boolean hasHitEOF;
062
063    /** Size of the current entry */
064    private long entrySize;
065
066    /** How far into the entry the stream is at */
067    private long entryOffset;
068
069    /** An input stream to read from */
070    private final InputStream is;
071
072    /** The meta-data about the current entry */
073    private TarArchiveEntry currEntry;
074
075    /** The encoding of the file */
076    private final ZipEncoding encoding;
077
    /**
     * Creates a tar input stream that reads from {@code is} using
     * {@link TarConstants#DEFAULT_BLKSIZE} as block size and
     * {@link TarConstants#DEFAULT_RCDSIZE} as record size.
     *
     * @param is the input stream to use
     */
    public TarArchiveInputStream(InputStream is) {
        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE);
    }
085
    /**
     * Creates a tar input stream that reads from {@code is} using
     * {@link TarConstants#DEFAULT_BLKSIZE} as block size,
     * {@link TarConstants#DEFAULT_RCDSIZE} as record size and the
     * given encoding.
     *
     * @param is the input stream to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(InputStream is, String encoding) {
        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE,
             encoding);
    }
096
    /**
     * Creates a tar input stream that reads from {@code is} using the
     * given block size and {@link TarConstants#DEFAULT_RCDSIZE} as
     * record size.
     *
     * @param is the input stream to use
     * @param blockSize the block size to use
     */
    public TarArchiveInputStream(InputStream is, int blockSize) {
        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE);
    }
105
    /**
     * Creates a tar input stream that reads from {@code is} using the
     * given block size, {@link TarConstants#DEFAULT_RCDSIZE} as record
     * size and the given encoding.
     *
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(InputStream is, int blockSize,
                                 String encoding) {
        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE, encoding);
    }
117
    /**
     * Creates a tar input stream that reads from {@code is} using the
     * given block and record sizes.
     *
     * <p>Delegates to the four-argument constructor passing
     * {@code null} as encoding name; how {@code null} is interpreted
     * is decided by {@code ZipEncodingHelper.getZipEncoding}.</p>
     *
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     */
    public TarArchiveInputStream(InputStream is, int blockSize, int recordSize) {
        this(is, blockSize, recordSize, null);      
    }
127
128    /**
129     * Constructor for TarInputStream.
130     * @param is the input stream to use
131     * @param blockSize the block size to use
132     * @param recordSize the record size to use
133     * @param encoding name of the encoding to use for file names
134     * @since 1.4
135     */
136    public TarArchiveInputStream(InputStream is, int blockSize, int recordSize,
137                                 String encoding) {
138        this.is = is;
139        this.hasHitEOF = false;
140        this.encoding = ZipEncodingHelper.getZipEncoding(encoding);
141        this.recordSize = recordSize;
142        this.blockSize = blockSize;
143    }
144
    /**
     * Closes this stream by closing the underlying input stream the
     * archive is read from.
     *
     * @throws IOException on error
     */
    @Override
    public void close() throws IOException {
        is.close();
    }
153
    /**
     * Get the record size being used by this stream.
     *
     * @return The record size this stream has been created with.
     */
    public int getRecordSize() {
        return recordSize;
    }
162
163    /**
164     * Get the available data that can be read from the current
165     * entry in the archive. This does not indicate how much data
166     * is left in the entire archive, only in the current entry.
167     * This value is determined from the entry's size header field
168     * and the amount of data already read from the current entry.
169     * Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE
170     * bytes are left in the current entry in the archive.
171     *
172     * @return The number of available bytes for the current entry.
173     * @throws IOException for signature
174     */
175    @Override
176    public int available() throws IOException {
177        if (entrySize - entryOffset > Integer.MAX_VALUE) {
178            return Integer.MAX_VALUE;
179        }
180        return (int) (entrySize - entryOffset);
181    }
182
183    
184    /**
185     * Skips over and discards <code>n</code> bytes of data from this input
186     * stream. The <code>skip</code> method may, for a variety of reasons, end
187     * up skipping over some smaller number of bytes, possibly <code>0</code>.
188     * This may result from any of a number of conditions; reaching end of file
189     * or end of entry before <code>n</code> bytes have been skipped; are only
190     * two possibilities. The actual number of bytes skipped is returned. If
191     * <code>n</code> is negative, no bytes are skipped.
192     * 
193     * 
194     * @param n
195     *            the number of bytes to be skipped.
196     * @return the actual number of bytes skipped.
197     * @exception IOException
198     *                if some other I/O error occurs.
199     */
200    @Override
201    public long skip(final long n) throws IOException {
202        if (n <= 0) {
203            return 0;
204        }
205
206        final long available = entrySize - entryOffset;
207        final long skipped = is.skip(Math.min(n, available)); 
208        count(skipped);
209        entryOffset += skipped;
210        return skipped;
211    }
212
    /**
     * Does nothing.
     *
     * <p>Marking is not supported by this class yet, so there is no
     * position to return to; the call is silently ignored and the
     * stream position is left untouched.</p>
     */
    @Override
    public synchronized void reset() {
    }
219
220    /**
221     * Get the next entry in this tar archive. This will skip
222     * over any remaining data in the current entry, if there
223     * is one, and place the input stream at the header of the
224     * next entry, and read the header and instantiate a new
225     * TarEntry from the header bytes and return that entry.
226     * If there are no more entries in the archive, null will
227     * be returned to indicate that the end of the archive has
228     * been reached.
229     *
230     * @return The next TarEntry in the archive, or null.
231     * @throws IOException on error
232     */
233    public TarArchiveEntry getNextTarEntry() throws IOException {
234        if (hasHitEOF) {
235            return null;
236        }
237
238        if (currEntry != null) {
239            /* Skip will only go to the end of the current entry */
240            IOUtils.skip(this, Long.MAX_VALUE);
241
242            /* skip to the end of the last record */
243            skipRecordPadding();
244        }
245
246        byte[] headerBuf = getRecord();
247
248        if (headerBuf == null) {
249            /* hit EOF */
250            currEntry = null;
251            return null;
252        }
253
254        try {
255            currEntry = new TarArchiveEntry(headerBuf, encoding);
256        } catch (IllegalArgumentException e) {
257            IOException ioe = new IOException("Error detected parsing the header");
258            ioe.initCause(e);
259            throw ioe;
260        }
261
262        entryOffset = 0;
263        entrySize = currEntry.getSize();
264
265        if (currEntry.isGNULongLinkEntry()) {
266            byte[] longLinkData = getLongNameData();
267            if (longLinkData == null) {
268                // Bugzilla: 40334
269                // Malformed tar file - long link entry name not followed by
270                // entry
271                return null;
272            }
273            currEntry.setLinkName(encoding.decode(longLinkData));
274        }
275
276        if (currEntry.isGNULongNameEntry()) {
277            byte[] longNameData = getLongNameData();
278            if (longNameData == null) {
279                // Bugzilla: 40334
280                // Malformed tar file - long entry name not followed by
281                // entry
282                return null;
283            }
284            currEntry.setName(encoding.decode(longNameData));
285        }
286
287        if (currEntry.isPaxHeader()){ // Process Pax headers
288            paxHeaders();
289        }
290
291        if (currEntry.isGNUSparse()){ // Process sparse files
292            readGNUSparse();
293        }
294
295        // If the size of the next element in the archive has changed
296        // due to a new size being reported in the posix header
297        // information, we update entrySize here so that it contains
298        // the correct value.
299        entrySize = currEntry.getSize();
300
301        return currEntry;
302    }
303    
304    /**
305     * The last record block should be written at the full size, so skip any
306     * additional space used to fill a record after an entry
307     */
308    private void skipRecordPadding() throws IOException {
309        if (this.entrySize > 0 && this.entrySize % this.recordSize != 0) {
310            long numRecords = (this.entrySize / this.recordSize) + 1;
311            long padding = (numRecords * this.recordSize) - this.entrySize;
312            long skipped = IOUtils.skip(is, padding);
313            count(skipped);
314        }
315    }
316
317    /**
318     * Get the next entry in this tar archive as longname data.
319     *
320     * @return The next entry in the archive as longname data, or null.
321     * @throws IOException on error
322     */
323    protected byte[] getLongNameData() throws IOException {
324        // read in the name
325        ByteArrayOutputStream longName = new ByteArrayOutputStream();
326        int length = 0;
327        while ((length = read(SMALL_BUF)) >= 0) {
328            longName.write(SMALL_BUF, 0, length);
329        }
330        getNextEntry();
331        if (currEntry == null) {
332            // Bugzilla: 40334
333            // Malformed tar file - long entry name not followed by entry
334            return null;
335        }
336        byte[] longNameData = longName.toByteArray();
337        // remove trailing null terminator(s)
338        length = longNameData.length;
339        while (length > 0 && longNameData[length - 1] == 0) {
340            --length;
341        }
342        if (length != longNameData.length) {
343            byte[] l = new byte[length];
344            System.arraycopy(longNameData, 0, l, 0, length);
345            longNameData = l;
346        }
347        return longNameData;
348    }
349
350    /**
351     * Get the next record in this tar archive. This will skip
352     * over any remaining data in the current entry, if there
353     * is one, and place the input stream at the header of the
354     * next entry.
355     *
356     * <p>If there are no more entries in the archive, null will be
357     * returned to indicate that the end of the archive has been
358     * reached.  At the same time the {@code hasHitEOF} marker will be
359     * set to true.</p>
360     *
361     * @return The next header in the archive, or null.
362     * @throws IOException on error
363     */
364    private byte[] getRecord() throws IOException {
365        byte[] headerBuf = readRecord();
366        hasHitEOF = isEOFRecord(headerBuf);
367        if (hasHitEOF && headerBuf != null) {
368            tryToConsumeSecondEOFRecord();
369            consumeRemainderOfLastBlock();
370            headerBuf = null;
371        }
372        return headerBuf;
373    }
374
375    /**
376     * Determine if an archive record indicate End of Archive. End of
377     * archive is indicated by a record that consists entirely of null bytes.
378     *
379     * @param record The record data to check.
380     * @return true if the record data is an End of Archive
381     */
382    protected boolean isEOFRecord(byte[] record) {
383        return record == null || ArchiveUtils.isArrayZero(record, recordSize);
384    }
385    
386    /**
387     * Read a record from the input stream and return the data.
388     *
389     * @return The record data or null if EOF has been hit.
390     * @throws IOException on error
391     */
392    protected byte[] readRecord() throws IOException {
393
394        byte[] record = new byte[recordSize];
395
396        int readNow = IOUtils.readFully(is, record);
397        count(readNow);
398        if (readNow != recordSize) {
399            return null;
400        }
401
402        return record;
403    }
404
405    private void paxHeaders() throws IOException{
406        Map<String, String> headers = parsePaxHeaders(this);
407        getNextEntry(); // Get the actual file entry
408        applyPaxHeadersToCurrentEntry(headers);
409    }
410
411    Map<String, String> parsePaxHeaders(InputStream i) throws IOException {
412        Map<String, String> headers = new HashMap<String, String>();
413        // Format is "length keyword=value\n";
414        while(true){ // get length
415            int ch;
416            int len = 0;
417            int read = 0;
418            while((ch = i.read()) != -1) {
419                read++;
420                if (ch == ' '){ // End of length string
421                    // Get keyword
422                    ByteArrayOutputStream coll = new ByteArrayOutputStream();
423                    while((ch = i.read()) != -1) {
424                        read++;
425                        if (ch == '='){ // end of keyword
426                            String keyword = coll.toString(CharsetNames.UTF_8);
427                            // Get rest of entry
428                            byte[] rest = new byte[len - read];
429                            int got = IOUtils.readFully(i, rest);
430                            if (got != len - read){
431                                throw new IOException("Failed to read "
432                                                      + "Paxheader. Expected "
433                                                      + (len - read)
434                                                      + " bytes, read "
435                                                      + got);
436                            }
437                            // Drop trailing NL
438                            String value = new String(rest, 0,
439                                                      len - read - 1, CharsetNames.UTF_8);
440                            headers.put(keyword, value);
441                            break;
442                        }
443                        coll.write((byte) ch);
444                    }
445                    break; // Processed single header
446                }
447                len *= 10;
448                len += ch - '0';
449            }
450            if (ch == -1){ // EOF
451                break;
452            }
453        }
454        return headers;
455    }
456
457    private void applyPaxHeadersToCurrentEntry(Map<String, String> headers) {
458        /*
459         * The following headers are defined for Pax.
460         * atime, ctime, charset: cannot use these without changing TarArchiveEntry fields
461         * mtime
462         * comment
463         * gid, gname
464         * linkpath
465         * size
466         * uid,uname
467         * SCHILY.devminor, SCHILY.devmajor: don't have setters/getters for those
468         */
469        for (Entry<String, String> ent : headers.entrySet()){
470            String key = ent.getKey();
471            String val = ent.getValue();
472            if ("path".equals(key)){
473                currEntry.setName(val);
474            } else if ("linkpath".equals(key)){
475                currEntry.setLinkName(val);
476            } else if ("gid".equals(key)){
477                currEntry.setGroupId(Integer.parseInt(val));
478            } else if ("gname".equals(key)){
479                currEntry.setGroupName(val);
480            } else if ("uid".equals(key)){
481                currEntry.setUserId(Integer.parseInt(val));
482            } else if ("uname".equals(key)){
483                currEntry.setUserName(val);
484            } else if ("size".equals(key)){
485                currEntry.setSize(Long.parseLong(val));
486            } else if ("mtime".equals(key)){
487                currEntry.setModTime((long) (Double.parseDouble(val) * 1000));
488            } else if ("SCHILY.devminor".equals(key)){
489                currEntry.setDevMinor(Integer.parseInt(val));
490            } else if ("SCHILY.devmajor".equals(key)){
491                currEntry.setDevMajor(Integer.parseInt(val));
492            }
493        }
494    }
495
496    /**
497     * Adds the sparse chunks from the current entry to the sparse chunks,
498     * including any additional sparse entries following the current entry.
499     *
500     * @throws IOException on error
501     *
502     * @todo Sparse files get not yet really processed.
503     */
504    private void readGNUSparse() throws IOException {
505        /* we do not really process sparse files yet
506        sparses = new ArrayList();
507        sparses.addAll(currEntry.getSparses());
508        */
509        if (currEntry.isExtended()) {
510            TarArchiveSparseEntry entry;
511            do {
512                byte[] headerBuf = getRecord();
513                if (headerBuf == null) {
514                    currEntry = null;
515                    break;
516                }
517                entry = new TarArchiveSparseEntry(headerBuf);
518                /* we do not really process sparse files yet
519                sparses.addAll(entry.getSparses());
520                */
521            } while (entry.isExtended());
522        }
523    }
524
    /**
     * Returns the next Archive Entry in this Stream.
     *
     * <p>This simply delegates to {@link #getNextTarEntry()}.</p>
     *
     * @return the next entry,
     *         or {@code null} if there are no more entries
     * @throws IOException if the next entry could not be read
     */
    @Override
    public ArchiveEntry getNextEntry() throws IOException {
        return getNextTarEntry();
    }
536    
537    /**
538     * Tries to read the next record rewinding the stream if it is not a EOF record.
539     *
540     * <p>This is meant to protect against cases where a tar
541     * implementation has written only one EOF record when two are
542     * expected.  Actually this won't help since a non-conforming
543     * implementation likely won't fill full blocks consisting of - by
544     * default - ten records either so we probably have already read
545     * beyond the archive anyway.</p>
546     */
547    private void tryToConsumeSecondEOFRecord() throws IOException {
548        boolean shouldReset = true;
549        boolean marked = is.markSupported();
550        if (marked) {
551            is.mark(recordSize);
552        }
553        try {
554            shouldReset = !isEOFRecord(readRecord());
555        } finally {
556            if (shouldReset && marked) {
557                pushedBackBytes(recordSize);
558                is.reset();
559            }
560        }
561    }
562
563    /**
564     * Reads bytes from the current tar archive entry.
565     *
566     * This method is aware of the boundaries of the current
567     * entry in the archive and will deal with them as if they
568     * were this stream's start and EOF.
569     *
570     * @param buf The buffer into which to place bytes read.
571     * @param offset The offset at which to place bytes read.
572     * @param numToRead The number of bytes to read.
573     * @return The number of bytes read, or -1 at EOF.
574     * @throws IOException on error
575     */
576    @Override
577    public int read(byte[] buf, int offset, int numToRead) throws IOException {
578        int totalRead = 0;
579
580        if (hasHitEOF || entryOffset >= entrySize) {
581            return -1;
582        }
583
584        if (currEntry == null) {
585            throw new IllegalStateException("No current tar entry");
586        }
587
588        numToRead = Math.min(numToRead, available());
589        
590        totalRead = is.read(buf, offset, numToRead);
591        
592        if (totalRead == -1) {
593            if (numToRead > 0) {
594                throw new IOException("Truncated TAR archive");
595            }
596            hasHitEOF = true;
597        } else {
598            count(totalRead);
599            entryOffset += totalRead;
600        }
601
602        return totalRead;
603    }
604
605    /**
606     * Whether this class is able to read the given entry.
607     *
608     * <p>May return false if the current entry is a sparse file.</p>
609     */
610    @Override
611    public boolean canReadEntryData(ArchiveEntry ae) {
612        if (ae instanceof TarArchiveEntry) {
613            TarArchiveEntry te = (TarArchiveEntry) ae;
614            return !te.isGNUSparse();
615        }
616        return false;
617    }
618
    /**
     * Get the current TAR Archive Entry that this input stream is processing
     * 
     * @return The current Archive Entry, may be {@code null} if there
     * is no current entry
     */
    public TarArchiveEntry getCurrentEntry() {
        return currEntry;
    }
627
    /**
     * Replaces the entry this stream considers to be the current one.
     *
     * <p>Only the reference is exchanged, the size and offset
     * bookkeeping used while reading is left untouched.</p>
     *
     * @param e the new current entry
     */
    protected final void setCurrentEntry(TarArchiveEntry e) {
        currEntry = e;
    }
631
    /**
     * Whether the end-of-archive marker of this stream is set.
     *
     * @return the current value of the {@code hasHitEOF} marker
     */
    protected final boolean isAtEOF() {
        return hasHitEOF;
    }
635
    /**
     * Sets the end-of-archive marker of this stream.
     *
     * <p>While the marker is set {@code getNextTarEntry} returns
     * {@code null} and {@code read} returns -1.</p>
     *
     * @param b the new value of the {@code hasHitEOF} marker
     */
    protected final void setAtEOF(boolean b) {
        hasHitEOF = b;
    }
639
640    /**
641     * This method is invoked once the end of the archive is hit, it
642     * tries to consume the remaining bytes under the assumption that
643     * the tool creating this archive has padded the last block.
644     */
645    private void consumeRemainderOfLastBlock() throws IOException {
646        long bytesReadOfLastBlock = getBytesRead() % blockSize;
647        if (bytesReadOfLastBlock > 0) {
648            long skipped = IOUtils.skip(is, blockSize - bytesReadOfLastBlock);
649            count(skipped);
650        }
651    }
652
653    /**
654     * Checks if the signature matches what is expected for a tar file.
655     *
656     * @param signature
657     *            the bytes to check
658     * @param length
659     *            the number of bytes to check
660     * @return true, if this stream is a tar archive stream, false otherwise
661     */
662    public static boolean matches(byte[] signature, int length) {
663        if (length < TarConstants.VERSION_OFFSET+TarConstants.VERSIONLEN) {
664            return false;
665        }
666
667        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX,
668                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
669            &&
670            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX,
671                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
672                ){
673            return true;
674        }
675        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU,
676                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
677            &&
678            (
679             ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE,
680                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
681            ||
682            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO,
683                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
684            )
685                ){
686            return true;
687        }
688        // COMPRESS-107 - recognise Ant tar files
689        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT,
690                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
691            &&
692            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT,
693                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
694                ){
695            return true;
696        }
697        return false;
698    }
699
700}