001 /* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019 package org.apache.commons.compress.compressors.gzip; 020 021 import java.io.IOException; 022 import java.io.EOFException; 023 import java.io.InputStream; 024 import java.io.DataInputStream; 025 import java.io.BufferedInputStream; 026 import java.util.zip.DataFormatException; 027 import java.util.zip.Inflater; 028 import java.util.zip.CRC32; 029 030 import org.apache.commons.compress.compressors.CompressorInputStream; 031 032 /** 033 * Input stream that decompresses .gz files. 034 * This supports decompressing concatenated .gz files which is important 035 * when decompressing standalone .gz files. 036 * <p> 037 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz 038 * files: it stops after the first member and silently ignores the rest. 039 * It doesn't leave the read position to point to the beginning of the next 040 * member, which makes it difficult workaround the lack of concatenation 041 * support. 042 * <p> 043 * Instead of using <code>GZIPInputStream</code>, this class has its own .gz 044 * container format decoder. The actual decompression is done with 045 * {@link java.util.zip.Inflater}. 046 */ 047 public class GzipCompressorInputStream extends CompressorInputStream { 048 // Header flags 049 // private static final int FTEXT = 0x01; // Uninteresting for us 050 private static final int FHCRC = 0x02; 051 private static final int FEXTRA = 0x04; 052 private static final int FNAME = 0x08; 053 private static final int FCOMMENT = 0x10; 054 private static final int FRESERVED = 0xE0; 055 056 // Compressed input stream, possibly wrapped in a BufferedInputStream 057 private final InputStream in; 058 059 // True if decompressing multimember streams. 060 private final boolean decompressConcatenated; 061 062 // Buffer to hold the input data 063 private final byte[] buf = new byte[8192]; 064 065 // Amount of data in buf. 066 private int bufUsed = 0; 067 068 // Decompressor 069 private Inflater inf = new Inflater(true); 070 071 // CRC32 from uncompressed data 072 private CRC32 crc = new CRC32(); 073 074 private int memberSize; 075 076 // True once everything has been decompressed 077 private boolean endReached = false; 078 079 /** 080 * Constructs a new input stream that decompresses gzip-compressed data 081 * from the specified input stream. 082 * <p> 083 * This is equivalent to 084 * <code>GzipCompressorInputStream(inputStream, false)</code> and thus 085 * will not decompress concatenated .gz files. 086 * 087 * @param inputStream the InputStream from which this object should 088 * be created of 089 * 090 * @throws IOException if the stream could not be created 091 */ 092 public GzipCompressorInputStream(InputStream inputStream) 093 throws IOException { 094 this(inputStream, false); 095 } 096 097 /** 098 * Constructs a new input stream that decompresses gzip-compressed data 099 * from the specified input stream. 100 * <p> 101 * If <code>decompressConcatenated</code> is {@code false}: 102 * This decompressor might read more input than it will actually use. 103 * If <code>inputStream</code> supports <code>mark</code> and 104 * <code>reset</code>, then the input position will be adjusted 105 * so that it is right after the last byte of the compressed stream. 106 * If <code>mark</code> isn't supported, the input position will be 107 * undefined. 108 * 109 * @param inputStream the InputStream from which this object should 110 * be created of 111 * @param decompressConcatenated 112 * if true, decompress until the end of the input; 113 * if false, stop after the first .gz member 114 * 115 * @throws IOException if the stream could not be created 116 */ 117 public GzipCompressorInputStream(InputStream inputStream, 118 boolean decompressConcatenated) 119 throws IOException { 120 // Mark support is strictly needed for concatenated files only, 121 // but it's simpler if it is always available. 122 if (inputStream.markSupported()) { 123 in = inputStream; 124 } else { 125 in = new BufferedInputStream(inputStream); 126 } 127 128 this.decompressConcatenated = decompressConcatenated; 129 init(true); 130 } 131 132 private boolean init(boolean isFirstMember) throws IOException { 133 assert isFirstMember || decompressConcatenated; 134 135 // Check the magic bytes without a possibility of EOFException. 136 int magic0 = in.read(); 137 int magic1 = in.read(); 138 139 // If end of input was reached after decompressing at least 140 // one .gz member, we have reached the end of the file successfully. 141 if (magic0 == -1 && !isFirstMember) { 142 return false; 143 } 144 145 if (magic0 != 31 || magic1 != 139) { 146 throw new IOException(isFirstMember 147 ? "Input is not in the .gz format" 148 : "Garbage after a valid .gz stream"); 149 } 150 151 // Parsing the rest of the header may throw EOFException. 152 DataInputStream inData = new DataInputStream(in); 153 int method = inData.readUnsignedByte(); 154 if (method != 8) { 155 throw new IOException("Unsupported compression method " 156 + method + " in the .gz header"); 157 } 158 159 int flg = inData.readUnsignedByte(); 160 if ((flg & FRESERVED) != 0) { 161 throw new IOException( 162 "Reserved flags are set in the .gz header"); 163 } 164 165 inData.readInt(); // mtime, ignored 166 inData.readUnsignedByte(); // extra flags, ignored 167 inData.readUnsignedByte(); // operating system, ignored 168 169 // Extra field, ignored 170 if ((flg & FEXTRA) != 0) { 171 int xlen = inData.readUnsignedByte(); 172 xlen |= inData.readUnsignedByte() << 8; 173 174 // This isn't as efficient as calling in.skip would be, 175 // but it's lazier to handle unexpected end of input this way. 176 // Most files don't have an extra field anyway. 177 while (xlen-- > 0) { 178 inData.readUnsignedByte(); 179 } 180 } 181 182 // Original file name, ignored 183 if ((flg & FNAME) != 0) { 184 readToNull(inData); 185 } 186 187 // Comment, ignored 188 if ((flg & FCOMMENT) != 0) { 189 readToNull(inData); 190 } 191 192 // Header "CRC16" which is actually a truncated CRC32 (which isn't 193 // as good as real CRC16). I don't know if any encoder implementation 194 // sets this, so it's not worth trying to verify it. GNU gzip 1.4 195 // doesn't support this field, but zlib seems to be able to at least 196 // skip over it. 197 if ((flg & FHCRC) != 0) { 198 inData.readShort(); 199 } 200 201 // Reset 202 inf.reset(); 203 crc.reset(); 204 memberSize = 0; 205 206 return true; 207 } 208 209 private void readToNull(DataInputStream inData) throws IOException { 210 while (inData.readUnsignedByte() != 0x00) {} 211 } 212 213 /** {@inheritDoc} */ 214 @Override 215 public int read() throws IOException { 216 byte[] buf = new byte[1]; 217 return read(buf, 0, 1) == -1 ? -1 : (buf[0] & 0xFF); 218 } 219 220 /** 221 * {@inheritDoc} 222 * 223 * @since 1.1 224 */ 225 @Override 226 public int read(byte[] b, int off, int len) throws IOException { 227 if (endReached) { 228 return -1; 229 } 230 231 int size = 0; 232 233 while (len > 0) { 234 if (inf.needsInput()) { 235 // Remember the current position because we may need to 236 // rewind after reading too much input. 237 in.mark(buf.length); 238 239 bufUsed = in.read(buf); 240 if (bufUsed == -1) { 241 throw new EOFException(); 242 } 243 244 inf.setInput(buf, 0, bufUsed); 245 } 246 247 int ret; 248 try { 249 ret = inf.inflate(b, off, len); 250 } catch (DataFormatException e) { 251 throw new IOException("Gzip-compressed data is corrupt"); 252 } 253 254 crc.update(b, off, ret); 255 memberSize += ret; 256 off += ret; 257 len -= ret; 258 size += ret; 259 count(ret); 260 261 if (inf.finished()) { 262 // We may have read too many bytes. Rewind the read 263 // position to match the actual amount used. 264 // 265 // NOTE: The "if" is there just in case. Since we used 266 // in.mark earler, it should always skip enough. 267 in.reset(); 268 269 int skipAmount = bufUsed - inf.getRemaining(); 270 if (in.skip(skipAmount) != skipAmount) { 271 throw new IOException(); 272 } 273 274 bufUsed = 0; 275 276 DataInputStream inData = new DataInputStream(in); 277 278 // CRC32 279 long crcStored = 0; 280 for (int i = 0; i < 4; ++i) { 281 crcStored |= (long)inData.readUnsignedByte() << (i * 8); 282 } 283 284 if (crcStored != crc.getValue()) { 285 throw new IOException("Gzip-compressed data is corrupt " 286 + "(CRC32 error)"); 287 } 288 289 // Uncompressed size modulo 2^32 (ISIZE in the spec) 290 int isize = 0; 291 for (int i = 0; i < 4; ++i) { 292 isize |= inData.readUnsignedByte() << (i * 8); 293 } 294 295 if (isize != memberSize) { 296 throw new IOException("Gzip-compressed data is corrupt" 297 + "(uncompressed size mismatch)"); 298 } 299 300 // See if this is the end of the file. 301 if (!decompressConcatenated || !init(false)) { 302 inf.end(); 303 inf = null; 304 endReached = true; 305 return size == 0 ? -1 : size; 306 } 307 } 308 } 309 310 return size; 311 } 312 313 /** 314 * Checks if the signature matches what is expected for a .gz file. 315 * 316 * @param signature the bytes to check 317 * @param length the number of bytes to check 318 * @return true if this is a .gz stream, false otherwise 319 * 320 * @since 1.1 321 */ 322 public static boolean matches(byte[] signature, int length) { 323 324 if (length < 2) { 325 return false; 326 } 327 328 if (signature[0] != 31) { 329 return false; 330 } 331 332 if (signature[1] != -117) { 333 return false; 334 } 335 336 return true; 337 } 338 339 /** 340 * Closes the input stream (unless it is System.in). 341 * 342 * @since 1.2 343 */ 344 @Override 345 public void close() throws IOException { 346 if (inf != null) { 347 inf.end(); 348 inf = null; 349 } 350 351 if (this.in != System.in) { 352 this.in.close(); 353 } 354 } 355 }