/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */

/*
 * This package is based on the work done by Timothy Gerard Endres
 * (time@ice.com) to whom the Ant project is very grateful for his great code.
 */

package org.apache.commons.compress.archivers.tar;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.ArchiveUtils;
import org.apache.commons.compress.utils.CharsetNames;

/**
 * The TarArchiveInputStream reads a UNIX tar archive as an InputStream.
 * Methods are provided to position the stream at each successive entry
 * in the archive, and then to read each entry as a normal input stream
 * using read().
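 *
 * <p>A minimal usage sketch (assuming a local file named
 * {@code archive.tar}; error handling omitted):</p>
 * <pre>{@code
 * TarArchiveInputStream in =
 *     new TarArchiveInputStream(new FileInputStream("archive.tar"));
 * TarArchiveEntry entry;
 * while ((entry = in.getNextTarEntry()) != null) {
 *     byte[] buf = new byte[4096];
 *     int n;
 *     while ((n = in.read(buf)) != -1) {
 *         // process n bytes of data belonging to entry.getName()
 *     }
 * }
 * in.close();
 * }</pre>
 *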
 * @NotThreadSafe
 */
public class TarArchiveInputStream extends ArchiveInputStream {
    private static final int SMALL_BUFFER_SIZE = 256;
    private static final int BUFFER_SIZE = 8 * 1024;

    private boolean hasHitEOF;
    private long entrySize;
    private long entryOffset;
    private byte[] readBuf;
    protected final TarBuffer buffer;
    private TarArchiveEntry currEntry;
    private final ZipEncoding encoding;

    /**
     * Constructor for TarArchiveInputStream.
     * @param is the input stream to use
     */
    public TarArchiveInputStream(InputStream is) {
        this(is, TarBuffer.DEFAULT_BLKSIZE, TarBuffer.DEFAULT_RCDSIZE);
    }

    /**
     * Constructor for TarArchiveInputStream.
     * @param is the input stream to use
     * @param encoding name of the encoding to use for file names
     * @since Commons Compress 1.4
     */
    public TarArchiveInputStream(InputStream is, String encoding) {
        this(is, TarBuffer.DEFAULT_BLKSIZE, TarBuffer.DEFAULT_RCDSIZE, encoding);
    }

    /**
     * Constructor for TarArchiveInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     */
    public TarArchiveInputStream(InputStream is, int blockSize) {
        this(is, blockSize, TarBuffer.DEFAULT_RCDSIZE);
    }

    /**
     * Constructor for TarArchiveInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param encoding name of the encoding to use for file names
     * @since Commons Compress 1.4
     */
    public TarArchiveInputStream(InputStream is, int blockSize,
                                 String encoding) {
        this(is, blockSize, TarBuffer.DEFAULT_RCDSIZE, encoding);
    }

    /**
     * Constructor for TarArchiveInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     */
    public TarArchiveInputStream(InputStream is, int blockSize, int recordSize) {
        this(is, blockSize, recordSize, null);
    }

    /**
     * Constructor for TarArchiveInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     * @param encoding name of the encoding to use for file names
     * @since Commons Compress 1.4
     */
    public TarArchiveInputStream(InputStream is, int blockSize, int recordSize,
                                 String encoding) {
        this.buffer = new TarBuffer(is, blockSize, recordSize);
        this.readBuf = null;
        this.hasHitEOF = false;
        this.encoding = ZipEncodingHelper.getZipEncoding(encoding);
    }

    /**
     * Closes this stream. Calls the TarBuffer's close() method.
     * @throws IOException on error
     */
    @Override
    public void close() throws IOException {
        buffer.close();
    }

    /**
     * Get the record size being used by this stream's TarBuffer.
     *
     * @return The TarBuffer record size.
     */
    public int getRecordSize() {
        return buffer.getRecordSize();
    }

    /**
     * Get the available data that can be read from the current
     * entry in the archive. This does not indicate how much data
     * is left in the entire archive, only in the current entry.
     * This value is determined from the entry's size header field
     * and the amount of data already read from the current entry.
     * Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE
     * bytes are left in the current entry in the archive.
     *
     * @return The number of available bytes for the current entry.
     * @throws IOException declared for signature compatibility only;
     *         not actually thrown by this implementation
     */
    @Override
    public int available() throws IOException {
        if (entrySize - entryOffset > Integer.MAX_VALUE) {
            return Integer.MAX_VALUE;
        }
        return (int) (entrySize - entryOffset);
    }

    /**
     * Skip bytes in the input buffer. This skips bytes in the
     * current entry's data, not the entire archive, and will
     * stop at the end of the current entry's data if the number
     * to skip extends beyond that point.
     *
     * @param numToSkip The number of bytes to skip.
     * @return the number actually skipped
     * @throws IOException on error
     */
    @Override
    public long skip(long numToSkip) throws IOException {
        // REVIEW
        // This is horribly inefficient, but it ensures that we
        // properly skip over bytes via the TarBuffer...
        //
        byte[] skipBuf = new byte[BUFFER_SIZE];
        long skip = numToSkip;
        while (skip > 0) {
            int realSkip = (int) (skip > skipBuf.length ? skipBuf.length : skip);
            int numRead = read(skipBuf, 0, realSkip);
            if (numRead == -1) {
                break;
            }
            skip -= numRead;
        }
        return (numToSkip - skip);
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     */
    @Override
    public synchronized void reset() {
    }

    /**
     * Get the next entry in this tar archive. This will skip
     * over any remaining data in the current entry, if there
     * is one, position the input stream at the header of the
     * next entry, read the header, and instantiate a new
     * TarArchiveEntry from the header bytes before returning it.
     * If there are no more entries in the archive, null will
     * be returned to indicate that the end of the archive has
     * been reached.
     *
     * @return The next TarArchiveEntry in the archive, or null.
     * @throws IOException on error
     */
    public TarArchiveEntry getNextTarEntry() throws IOException {
        if (hasHitEOF) {
            return null;
        }

        if (currEntry != null) {
            long numToSkip = entrySize - entryOffset;

            while (numToSkip > 0) {
                long skipped = skip(numToSkip);
                if (skipped <= 0) {
                    throw new IOException("failed to skip current tar entry");
                }
                numToSkip -= skipped;
            }

            readBuf = null;
        }

        byte[] headerBuf = getRecord();

        if (hasHitEOF) {
            currEntry = null;
            return null;
        }

        try {
            currEntry = new TarArchiveEntry(headerBuf, encoding);
        } catch (IllegalArgumentException e) {
            IOException ioe = new IOException("Error detected parsing the header");
            ioe.initCause(e);
            throw ioe;
        }
        entryOffset = 0;
        entrySize = currEntry.getSize();

        if (currEntry.isGNULongNameEntry()) {
            // read in the name
            StringBuffer longName = new StringBuffer();
            byte[] buf = new byte[SMALL_BUFFER_SIZE];
            int length = 0;
            while ((length = read(buf)) >= 0) {
                longName.append(new String(buf, 0, length)); // TODO default charset?
            }
            getNextEntry();
            if (currEntry == null) {
                // Bugzilla: 40334
                // Malformed tar file - long entry name not followed by entry
                return null;
            }
            // remove trailing null terminator
            if (longName.length() > 0
                && longName.charAt(longName.length() - 1) == 0) {
                longName.deleteCharAt(longName.length() - 1);
            }
            currEntry.setName(longName.toString());
        }

        if (currEntry.isPaxHeader()){ // Process Pax headers
            paxHeaders();
        }

        if (currEntry.isGNUSparse()){ // Process sparse files
            readGNUSparse();
        }

        // If the size of the next element in the archive has changed
        // due to a new size being reported in the posix header
        // information, we update entrySize here so that it contains
        // the correct value.
        entrySize = currEntry.getSize();
        return currEntry;
    }

    /**
     * Get the next record in this tar archive. This will read the
     * next record from the underlying TarBuffer and check it against
     * the end-of-archive marker.
     * If there are no more records in the archive, null will
     * be returned to indicate that the end of the archive has
     * been reached.
     *
     * @return The next header record in the archive, or null.
     * @throws IOException on error
     */
    private byte[] getRecord() throws IOException {
        if (hasHitEOF) {
            return null;
        }

        byte[] headerBuf = buffer.readRecord();

        if (headerBuf == null) {
            hasHitEOF = true;
        } else if (buffer.isEOFRecord(headerBuf)) {
            hasHitEOF = true;
        }

        return hasHitEOF ? null : headerBuf;
    }

    private void paxHeaders() throws IOException{
        Map<String, String> headers = parsePaxHeaders(this);
        getNextEntry(); // Get the actual file entry
        applyPaxHeadersToCurrentEntry(headers);
    }

    Map<String, String> parsePaxHeaders(InputStream i) throws IOException {
        Map<String, String> headers = new HashMap<String, String>();
        // Format is "length keyword=value\n"
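        // As an illustration (the value is made up, but the arithmetic is real):
        // the record "30 mtime=1321711775.972059463\n" starts with its own
        // total length, 30, which counts the two length digits, the space,
        // the keyword, the '=', the value and the trailing newline.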
        while(true){ // get length
            int ch;
            int len = 0;
            int read = 0;
            while((ch = i.read()) != -1) {
                read++;
                if (ch == ' '){ // End of length string
                    // Get keyword
                    ByteArrayOutputStream coll = new ByteArrayOutputStream();
                    while((ch = i.read()) != -1) {
                        read++;
                        if (ch == '='){ // end of keyword
                            String keyword = coll.toString(CharsetNames.UTF_8);
                            // Get rest of entry
                            byte[] rest = new byte[len - read];
                            int got = i.read(rest);
                            if (got != len - read){
                                throw new IOException("Failed to read "
                                                      + "Paxheader. Expected "
                                                      + (len - read)
                                                      + " bytes, read "
                                                      + got);
                            }
                            // Drop trailing NL
                            String value = new String(rest, 0,
                                                      len - read - 1, CharsetNames.UTF_8);
                            headers.put(keyword, value);
                            break;
                        }
                        coll.write((byte) ch);
                    }
                    break; // Processed single header
                }
                len *= 10;
                len += ch - '0';
            }
            if (ch == -1){ // EOF
                break;
            }
        }
        return headers;
    }

    private void applyPaxHeadersToCurrentEntry(Map<String, String> headers) {
        /*
         * The following headers are defined for Pax.
         * atime, ctime, charset: cannot use these without changing TarArchiveEntry fields
         * mtime
         * comment
         * gid, gname
         * linkpath
         * size
         * uid,uname
         * SCHILY.devminor, SCHILY.devmajor: don't have setters/getters for those
         */
        for (Entry<String, String> ent : headers.entrySet()){
            String key = ent.getKey();
            String val = ent.getValue();
            if ("path".equals(key)){
                currEntry.setName(val);
            } else if ("linkpath".equals(key)){
                currEntry.setLinkName(val);
            } else if ("gid".equals(key)){
                currEntry.setGroupId(Integer.parseInt(val));
            } else if ("gname".equals(key)){
                currEntry.setGroupName(val);
            } else if ("uid".equals(key)){
                currEntry.setUserId(Integer.parseInt(val));
            } else if ("uname".equals(key)){
                currEntry.setUserName(val);
            } else if ("size".equals(key)){
                currEntry.setSize(Long.parseLong(val));
            } else if ("mtime".equals(key)){
                currEntry.setModTime((long) (Double.parseDouble(val) * 1000));
            } else if ("SCHILY.devminor".equals(key)){
                currEntry.setDevMinor(Integer.parseInt(val));
            } else if ("SCHILY.devmajor".equals(key)){
                currEntry.setDevMajor(Integer.parseInt(val));
            }
        }
    }

    /**
     * Adds the sparse chunks from the current entry to the list of
     * sparse chunks, including any additional sparse entries following
     * the current entry.
     *
     * @throws IOException on error
     *
     * @todo Sparse files are not really processed yet.
     */
    private void readGNUSparse() throws IOException {
        /* we do not really process sparse files yet
        sparses = new ArrayList();
        sparses.addAll(currEntry.getSparses());
        */
        if (currEntry.isExtended()) {
            TarArchiveSparseEntry entry;
            do {
                byte[] headerBuf = getRecord();
                if (hasHitEOF) {
                    currEntry = null;
                    break;
                }
                entry = new TarArchiveSparseEntry(headerBuf);
                /* we do not really process sparse files yet
                sparses.addAll(entry.getSparses());
                */
            } while (entry.isExtended());
        }
    }

    @Override
    public ArchiveEntry getNextEntry() throws IOException {
        return getNextTarEntry();
    }

    /**
     * Reads bytes from the current tar archive entry.
     *
     * This method is aware of the boundaries of the current
     * entry in the archive and will deal with them as if they
     * were this stream's start and EOF.
     *
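     * <p>For example, the data of the current entry could be drained
     * like this (a sketch only; {@code in} stands for this stream and
     * {@code out} for any OutputStream, both assumed to exist):</p>
     * <pre>{@code
     * byte[] buf = new byte[4096];
     * int n;
     * while ((n = in.read(buf, 0, buf.length)) != -1) {
     *     out.write(buf, 0, n);
     * }
     * }</pre>
     *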
     * @param buf The buffer into which to place bytes read.
     * @param offset The offset at which to place bytes read.
     * @param numToRead The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    @Override
    public int read(byte[] buf, int offset, int numToRead) throws IOException {
        int totalRead = 0;

        if (entryOffset >= entrySize) {
            return -1;
        }

        if ((numToRead + entryOffset) > entrySize) {
            numToRead = (int) (entrySize - entryOffset);
        }

        if (readBuf != null) {
            int sz = (numToRead > readBuf.length) ? readBuf.length
                : numToRead;

            System.arraycopy(readBuf, 0, buf, offset, sz);

            if (sz >= readBuf.length) {
                readBuf = null;
            } else {
                int newLen = readBuf.length - sz;
                byte[] newBuf = new byte[newLen];

                System.arraycopy(readBuf, sz, newBuf, 0, newLen);

                readBuf = newBuf;
            }

            totalRead += sz;
            numToRead -= sz;
            offset += sz;
        }

        while (numToRead > 0) {
            byte[] rec = buffer.readRecord();

            if (rec == null) {
                // Unexpected EOF!
                throw new IOException("unexpected EOF with " + numToRead
                                      + " bytes unread. Occurred at byte: " + getBytesRead());
            }
            count(rec.length);
            int sz = numToRead;
            int recLen = rec.length;

            if (recLen > sz) {
                System.arraycopy(rec, 0, buf, offset, sz);

                readBuf = new byte[recLen - sz];

                System.arraycopy(rec, sz, readBuf, 0, recLen - sz);
            } else {
                sz = recLen;

                System.arraycopy(rec, 0, buf, offset, recLen);
            }

            totalRead += sz;
            numToRead -= sz;
            offset += sz;
        }

        entryOffset += totalRead;

        return totalRead;
    }

    /**
     * Whether this class is able to read the given entry.
     *
     * <p>May return false if the current entry is a sparse file.</p>
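     *
     * <p>A sketch of how a caller might use this check to skip entries
     * whose data cannot be read ({@code in} stands for this stream and
     * is an assumption of the example):</p>
     * <pre>{@code
     * ArchiveEntry e;
     * while ((e = in.getNextEntry()) != null) {
     *     if (!in.canReadEntryData(e)) {
     *         continue; // e.g. a GNU sparse entry
     *     }
     *     // read the entry's data here
     * }
     * }</pre>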
     */
    @Override
    public boolean canReadEntryData(ArchiveEntry ae) {
        if (ae instanceof TarArchiveEntry) {
            TarArchiveEntry te = (TarArchiveEntry) ae;
            return !te.isGNUSparse();
        }
        return false;
    }

    protected final TarArchiveEntry getCurrentEntry() {
        return currEntry;
    }

    protected final void setCurrentEntry(TarArchiveEntry e) {
        currEntry = e;
    }

    protected final boolean isAtEOF() {
        return hasHitEOF;
    }

    protected final void setAtEOF(boolean b) {
        hasHitEOF = b;
    }

    /**
     * Checks if the signature matches what is expected for a tar file.
     *
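     * <p>A minimal check might read one 512-byte header record into a
     * buffer and test it (a sketch; {@code in} is any InputStream, and
     * short reads and error handling are ignored):</p>
     * <pre>{@code
     * byte[] signature = new byte[512];
     * int read = in.read(signature);
     * boolean isTar = TarArchiveInputStream.matches(signature, read);
     * }</pre>
     *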
     * @param signature
     *            the bytes to check
     * @param length
     *            the number of bytes to check
     * @return true, if this stream is a tar archive stream, false otherwise
     */
    public static boolean matches(byte[] signature, int length) {
        if (length < TarConstants.VERSION_OFFSET+TarConstants.VERSIONLEN) {
            return false;
        }

        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
                ){
            return true;
        }
        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            (
             ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
            ||
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
            )
                ){
            return true;
        }
        // COMPRESS-107 - recognise Ant tar files
        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
                ){
            return true;
        }
        return false;
    }

}