001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     * http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing,
013     * software distributed under the License is distributed on an
014     * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015     * KIND, either express or implied.  See the License for the
016     * specific language governing permissions and limitations
017     * under the License.
018     */
019    package org.apache.commons.compress.compressors.gzip;
020    
021    import java.io.IOException;
022    import java.io.EOFException;
023    import java.io.InputStream;
024    import java.io.DataInputStream;
025    import java.io.BufferedInputStream;
026    import java.util.zip.DataFormatException;
027    import java.util.zip.Inflater;
028    import java.util.zip.CRC32;
029    
030    import org.apache.commons.compress.compressors.CompressorInputStream;
031    
032    /**
033     * Input stream that decompresses .gz files.
034     * This supports decompressing concatenated .gz files which is important
035     * when decompressing standalone .gz files.
036     * <p>
037     * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
038     * files: it stops after the first member and silently ignores the rest.
039     * It doesn't leave the read position to point to the beginning of the next
040     * member, which makes it difficult workaround the lack of concatenation
041     * support.
042     * <p>
043     * Instead of using <code>GZIPInputStream</code>, this class has its own .gz
044     * container format decoder. The actual decompression is done with
045     * {@link java.util.zip.Inflater}.
046     */
047    public class GzipCompressorInputStream extends CompressorInputStream {
048        // Header flags
049        // private static final int FTEXT = 0x01; // Uninteresting for us
050        private static final int FHCRC = 0x02;
051        private static final int FEXTRA = 0x04;
052        private static final int FNAME = 0x08;
053        private static final int FCOMMENT = 0x10;
054        private static final int FRESERVED = 0xE0;
055    
056        // Compressed input stream, possibly wrapped in a BufferedInputStream
057        private final InputStream in;
058    
059        // True if decompressing multimember streams.
060        private final boolean decompressConcatenated;
061    
062        // Buffer to hold the input data
063        private final byte[] buf = new byte[8192];
064    
065        // Amount of data in buf.
066        private int bufUsed = 0;
067    
068        // Decompressor
069        private Inflater inf = new Inflater(true);
070    
071        // CRC32 from uncompressed data
072        private CRC32 crc = new CRC32();
073    
074        private int memberSize;
075    
076        // True once everything has been decompressed
077        private boolean endReached = false;
078    
079        /**
080         * Constructs a new input stream that decompresses gzip-compressed data
081         * from the specified input stream.
082         * <p>
083         * This is equivalent to
084         * <code>GzipCompressorInputStream(inputStream, false)</code> and thus
085         * will not decompress concatenated .gz files.
086         *
087         * @param inputStream  the InputStream from which this object should
088         *                     be created of
089         *
090         * @throws IOException if the stream could not be created
091         */
092        public GzipCompressorInputStream(InputStream inputStream)
093                throws IOException {
094            this(inputStream, false);
095        }
096    
097        /**
098         * Constructs a new input stream that decompresses gzip-compressed data
099         * from the specified input stream.
100         * <p>
101         * If <code>decompressConcatenated</code> is {@code false}:
102         * This decompressor might read more input than it will actually use.
103         * If <code>inputStream</code> supports <code>mark</code> and
104         * <code>reset</code>, then the input position will be adjusted
105         * so that it is right after the last byte of the compressed stream.
106         * If <code>mark</code> isn't supported, the input position will be
107         * undefined.
108         *
109         * @param inputStream  the InputStream from which this object should
110         *                     be created of
111         * @param decompressConcatenated
112         *                     if true, decompress until the end of the input;
113         *                     if false, stop after the first .gz member
114         *
115         * @throws IOException if the stream could not be created
116         */
117        public GzipCompressorInputStream(InputStream inputStream,
118                                         boolean decompressConcatenated)
119                throws IOException {
120            // Mark support is strictly needed for concatenated files only,
121            // but it's simpler if it is always available.
122            if (inputStream.markSupported()) {
123                in = inputStream;
124            } else {
125                in = new BufferedInputStream(inputStream);
126            }
127    
128            this.decompressConcatenated = decompressConcatenated;
129            init(true);
130        }
131    
132        private boolean init(boolean isFirstMember) throws IOException {
133            assert isFirstMember || decompressConcatenated;
134    
135            // Check the magic bytes without a possibility of EOFException.
136            int magic0 = in.read();
137            int magic1 = in.read();
138    
139            // If end of input was reached after decompressing at least
140            // one .gz member, we have reached the end of the file successfully.
141            if (magic0 == -1 && !isFirstMember) {
142                return false;
143            }
144    
145            if (magic0 != 31 || magic1 != 139) {
146                throw new IOException(isFirstMember
147                                      ? "Input is not in the .gz format"
148                                      : "Garbage after a valid .gz stream");
149            }
150    
151            // Parsing the rest of the header may throw EOFException.
152            DataInputStream inData = new DataInputStream(in);
153            int method = inData.readUnsignedByte();
154            if (method != 8) {
155                throw new IOException("Unsupported compression method "
156                                      + method + " in the .gz header");
157            }
158    
159            int flg = inData.readUnsignedByte();
160            if ((flg & FRESERVED) != 0) {
161                throw new IOException(
162                        "Reserved flags are set in the .gz header");
163            }
164    
165            inData.readInt(); // mtime, ignored
166            inData.readUnsignedByte(); // extra flags, ignored
167            inData.readUnsignedByte(); // operating system, ignored
168    
169            // Extra field, ignored
170            if ((flg & FEXTRA) != 0) {
171                int xlen = inData.readUnsignedByte();
172                xlen |= inData.readUnsignedByte() << 8;
173    
174                // This isn't as efficient as calling in.skip would be,
175                // but it's lazier to handle unexpected end of input this way.
176                // Most files don't have an extra field anyway.
177                while (xlen-- > 0) {
178                    inData.readUnsignedByte();
179                }
180            }
181    
182            // Original file name, ignored
183            if ((flg & FNAME) != 0) {
184                readToNull(inData);
185            }
186    
187            // Comment, ignored
188            if ((flg & FCOMMENT) != 0) {
189                readToNull(inData);
190            }
191    
192            // Header "CRC16" which is actually a truncated CRC32 (which isn't
193            // as good as real CRC16). I don't know if any encoder implementation
194            // sets this, so it's not worth trying to verify it. GNU gzip 1.4
195            // doesn't support this field, but zlib seems to be able to at least
196            // skip over it.
197            if ((flg & FHCRC) != 0) {
198                inData.readShort();
199            }
200    
201            // Reset
202            inf.reset();
203            crc.reset();
204            memberSize = 0;
205    
206            return true;
207        }
208    
209        private void readToNull(DataInputStream inData) throws IOException {
210            while (inData.readUnsignedByte() != 0x00) {}
211        }
212    
213        /** {@inheritDoc} */
214        @Override
215        public int read() throws IOException {
216            byte[] buf = new byte[1];
217            return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF);
218        }
219    
220        /**
221         * {@inheritDoc}
222         *
223         * @since 1.1
224         */
225        @Override
226        public int read(byte[] b, int off, int len) throws IOException {
227            if (endReached) {
228                return -1;
229            }
230    
231            int size = 0;
232    
233            while (len > 0) {
234                if (inf.needsInput()) {
235                    // Remember the current position because we may need to
236                    // rewind after reading too much input.
237                    in.mark(buf.length);
238    
239                    bufUsed = in.read(buf);
240                    if (bufUsed == -1) {
241                        throw new EOFException();
242                    }
243    
244                    inf.setInput(buf, 0, bufUsed);
245                }
246    
247                int ret;
248                try {
249                    ret = inf.inflate(b, off, len);
250                } catch (DataFormatException e) {
251                    throw new IOException("Gzip-compressed data is corrupt");
252                }
253    
254                crc.update(b, off, ret);
255                memberSize += ret;
256                off += ret;
257                len -= ret;
258                size += ret;
259                count(ret);
260    
261                if (inf.finished()) {
262                    // We may have read too many bytes. Rewind the read
263                    // position to match the actual amount used.
264                    //
265                    // NOTE: The "if" is there just in case. Since we used
266                    // in.mark earler, it should always skip enough.
267                    in.reset();
268    
269                    int skipAmount = bufUsed - inf.getRemaining();
270                    if (in.skip(skipAmount) != skipAmount) {
271                        throw new IOException();
272                    }
273    
274                    bufUsed = 0;
275    
276                    DataInputStream inData = new DataInputStream(in);
277    
278                    // CRC32
279                    long crcStored = 0;
280                    for (int i = 0; i < 4; ++i) {
281                        crcStored |= (long)inData.readUnsignedByte() << (i * 8);
282                    }
283    
284                    if (crcStored != crc.getValue()) {
285                        throw new IOException("Gzip-compressed data is corrupt "
286                                              + "(CRC32 error)");
287                    }
288    
289                    // Uncompressed size modulo 2^32 (ISIZE in the spec)
290                    int isize = 0;
291                    for (int i = 0; i < 4; ++i) {
292                        isize |= inData.readUnsignedByte() << (i * 8);
293                    }
294    
295                    if (isize != memberSize) {
296                        throw new IOException("Gzip-compressed data is corrupt"
297                                              + "(uncompressed size mismatch)");
298                    }
299    
300                    // See if this is the end of the file.
301                    if (!decompressConcatenated || !init(false)) {
302                        inf.end();
303                        inf = null;
304                        endReached = true;
305                        return size == 0 ? -1 : size;
306                    }
307                }
308            }
309    
310            return size;
311        }
312    
313        /**
314         * Checks if the signature matches what is expected for a .gz file.
315         *
316         * @param signature the bytes to check
317         * @param length    the number of bytes to check
318         * @return          true if this is a .gz stream, false otherwise
319         *
320         * @since 1.1
321         */
322        public static boolean matches(byte[] signature, int length) {
323    
324            if (length < 2) {
325                return false;
326            }
327    
328            if (signature[0] != 31) {
329                return false;
330            }
331    
332            if (signature[1] != -117) {
333                return false;
334            }
335    
336            return true;
337        }
338    
339        /**
340         * Closes the input stream (unless it is System.in).
341         *
342         * @since 1.2
343         */
344        @Override
345        public void close() throws IOException {
346            if (inf != null) {
347                inf.end();
348                inf = null;
349            }
350    
351            if (this.in != System.in) {
352                this.in.close();
353            }
354        }
355    }