001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.lz77support; 020 021import java.io.IOException; 022import java.util.Arrays; 023 024/** 025 * Helper class for compression algorithms that use the ideas of LZ77. 026 * 027 * <p>Most LZ77 derived algorithms split input data into blocks of 028 * uncompressed data (called literal blocks) and back-references 029 * (pairs of offsets and lengths) that state "add <code>length</code> 030 * bytes that are the same as those already written starting 031 * <code>offset</code> bytes before the current position. The details 032 * of how those blocks and back-references are encoded are quite 033 * different between the algorithms and some algorithms perform 034 * additional steps (Huffman encoding in the case of DEFLATE for 035 * example).</p> 036 * 037 * <p>This class attempts to extract the core logic - finding 038 * back-references - so it can be re-used. It follows the algorithm 039 * explained in section 4 of RFC 1951 (DEFLATE) and currently doesn't 040 * implement the "lazy match" optimization. The three-byte hash 041 * function used in this class is the same as the one used by zlib and 042 * InfoZIP's ZIP implementation of DEFLATE. The whole class is 043 * strongly inspired by InfoZIP's implementation.</p> 044 * 045 * <p>LZ77 is used vaguely here (as well as many other places that 046 * talk about it :-), LZSS would likely be closer to the truth but 047 * LZ77 has become the synonym for a whole family of algorithms.</p> 048 * 049 * <p>The API consists of a compressor that is fed <code>byte</code>s 050 * and emits {@link Block}s to a registered callback where the blocks 051 * represent either {@link LiteralBlock literal blocks}, {@link 052 * BackReference back-references} or {@link EOD end of data 053 * markers}. In order to ensure the callback receives all information, 054 * the {@code #finish} method must be used once all data has been fed 055 * into the compressor.</p> 056 * 057 * <p>Several parameters influence the outcome of the "compression":</p> 058 * <dl> 059 * 060 * <dt><code>windowSize</code></dt> <dd>the size of the sliding 061 * window, must be a power of two - this determines the maximum 062 * offset a back-reference can take. The compressor maintains a 063 * buffer of twice of <code>windowSize</code> - real world values are 064 * in the area of 32k.</dd> 065 * 066 * <dt><code>minBackReferenceLength</code></dt> 067 * <dd>Minimal length of a back-reference found. A true minimum of 3 is 068 * hard-coded inside of this implemention but bigger lengths can be 069 * configured.</dd> 070 * 071 * <dt><code>maxBackReferenceLength</code></dt> 072 * <dd>Maximal length of a back-reference found.</dd> 073 * 074 * <dt><code>maxOffset</code></dt> 075 * <dd>Maximal offset of a back-reference.</dd> 076 * 077 * <dt><code>maxLiteralLength</code></dt> 078 * <dd>Maximal length of a literal block.</dd> 079 * </dl> 080 * 081 * @see "https://tools.ietf.org/html/rfc1951#section-4" 082 * @since 1.14 083 * @NotThreadSafe 084 */ 085public class LZ77Compressor { 086 087 /** 088 * Base class representing things the compressor may emit. 089 */ 090 public static abstract class Block { } 091 /** 092 * Represents a literal block of data. 093 * 094 * <p>For performance reasons this encapsulates the real data, not 095 * a copy of it. Don't modify the data and process it inside of 096 * {@link Callback#accept} immediately as it will get overwritten 097 * sooner or later.</p> 098 */ 099 public static final class LiteralBlock extends Block { 100 private final byte[] data; 101 private final int offset, length; 102 public LiteralBlock(byte[] data, int offset, int length) { 103 this.data = data; 104 this.offset = offset; 105 this.length = length; 106 } 107 /** 108 * The literal data. 109 * 110 * <p>This returns a life view of the actual data in order to 111 * avoid copying, modify the array at your own risk.</p> 112 * @return the data 113 */ 114 public byte[] getData() { 115 return data; 116 } 117 /** 118 * Offset into data where the literal block starts. 119 * @return the offset 120 */ 121 public int getOffset() { 122 return offset; 123 } 124 /** 125 * Length of literal block. 126 * @return the length 127 */ 128 public int getLength() { 129 return length; 130 } 131 132 @Override 133 public String toString() { 134 return "LiteralBlock starting at " + offset + " with length " + length; 135 } 136 } 137 /** 138 * Represents a back-reference. 139 */ 140 public static final class BackReference extends Block { 141 private final int offset, length; 142 public BackReference(int offset, int length) { 143 this.offset = offset; 144 this.length = length; 145 } 146 /** 147 * Provides the offset of the back-reference. 148 * @return the offset 149 */ 150 public int getOffset() { 151 return offset; 152 } 153 /** 154 * Provides the length of the back-reference. 155 * @return the length 156 */ 157 public int getLength() { 158 return length; 159 } 160 161 @Override 162 public String toString() { 163 return "BackReference with offset " + offset + " and length " + length; 164 } 165 } 166 /** 167 * A simple "we are done" marker. 168 */ 169 public static final class EOD extends Block { } 170 171 private static final EOD THE_EOD = new EOD(); 172 173 /** 174 * Callback invoked while the compressor processes data. 175 * 176 * <p>The callback is invoked on the same thread that receives the 177 * bytes to compress and may be invoked multiple times during the 178 * execution of {@link #compress} or {@link #finish}.</p> 179 */ 180 public interface Callback { 181 /** 182 * Consumes a block. 183 * @param b the block to consume 184 * @throws IOException in case of an error 185 */ 186 void accept(Block b) throws IOException; 187 } 188 189 static final int NUMBER_OF_BYTES_IN_HASH = 3; 190 private static final int NO_MATCH = -1; 191 192 private final Parameters params; 193 private final Callback callback; 194 195 // the sliding window, twice as big as "windowSize" parameter 196 private final byte[] window; 197 // the head of hash-chain - indexed by hash-code, points to the 198 // location inside of window of the latest sequence of bytes with 199 // the given hash. 200 private final int[] head; 201 // for each window-location points to the latest earlier location 202 // with the same hash. Only stores values for the latest 203 // "windowSize" elements, the index is "window location modulo 204 // windowSize". 205 private final int[] prev; 206 207 // bit mask used when indexing into prev 208 private final int wMask; 209 210 private boolean initialized = false; 211 // the position inside of window that shall be encoded right now 212 private int currentPosition; 213 // the number of bytes available to compress including the one at 214 // currentPosition 215 private int lookahead = 0; 216 // the hash of the three bytes stating at the current position 217 private int insertHash = 0; 218 // the position inside of the window where the current literal 219 // block starts (in case we are inside of a literal block). 220 private int blockStart = 0; 221 // position of the current match 222 private int matchStart = NO_MATCH; 223 // number of missed insertString calls for the up to three last 224 // bytes of the last match that can only be performed once more 225 // data has been read 226 private int missedInserts = 0; 227 228 /** 229 * Initializes a compressor with parameters and a callback. 230 * @param params the parameters 231 * @param callback the callback 232 * @throws NullPointerException if either parameter is <code>null</code> 233 */ 234 public LZ77Compressor(Parameters params, Callback callback) { 235 if (params == null) { 236 throw new NullPointerException("params must not be null"); 237 } 238 if (callback == null) { 239 throw new NullPointerException("callback must not be null"); 240 } 241 this.params = params; 242 this.callback = callback; 243 244 final int wSize = params.getWindowSize(); 245 window = new byte[wSize * 2]; 246 wMask = wSize - 1; 247 head = new int[HASH_SIZE]; 248 Arrays.fill(head, NO_MATCH); 249 prev = new int[wSize]; 250 } 251 252 /** 253 * Feeds bytes into the compressor which in turn may emit zero or 254 * more blocks to the callback during the execution of this 255 * method. 256 * @param data the data to compress - must not be null 257 * @throws IOException if the callback throws an exception 258 */ 259 public void compress(byte[] data) throws IOException { 260 compress(data, 0, data.length); 261 } 262 263 /** 264 * Feeds bytes into the compressor which in turn may emit zero or 265 * more blocks to the callback during the execution of this 266 * method. 267 * @param data the data to compress - must not be null 268 * @param off the start offset of the data 269 * @param len the number of bytes to compress 270 * @throws IOException if the callback throws an exception 271 */ 272 public void compress(byte[] data, int off, int len) throws IOException { 273 final int wSize = params.getWindowSize(); 274 while (len > wSize) { // chop into windowSize sized chunks 275 doCompress(data, off, wSize); 276 off += wSize; 277 len -= wSize; 278 } 279 if (len > 0) { 280 doCompress(data, off, len); 281 } 282 } 283 284 /** 285 * Tells the compressor to process all remaining data and signal 286 * end of data to the callback. 287 * 288 * <p>The compressor will in turn emit at least one block ({@link 289 * EOD}) but potentially multiple blocks to the callback during 290 * the execution of this method.</p> 291 * @throws IOException if the callback throws an exception 292 */ 293 public void finish() throws IOException { 294 if (blockStart != currentPosition || lookahead > 0) { 295 currentPosition += lookahead; 296 flushLiteralBlock(); 297 } 298 callback.accept(THE_EOD); 299 } 300 301 /** 302 * Adds some initial data to fill the window with. 303 * 304 * <p>This is used if the stream has been cut into blocks and 305 * back-references of one block may refer to data of the previous 306 * block(s). One such example is the LZ4 frame format using block 307 * dependency.</p> 308 * 309 * @param data the data to fill the window with. 310 * @throws IllegalStateException if the compressor has already started to accept data 311 */ 312 public void prefill(byte[] data) { 313 if (currentPosition != 0 || lookahead != 0) { 314 throw new IllegalStateException("the compressor has already started to accept data, can't prefill anymore"); 315 } 316 317 // don't need more than windowSize for back-references 318 final int len = Math.min(params.getWindowSize(), data.length); 319 System.arraycopy(data, data.length - len, window, 0, len); 320 321 if (len >= NUMBER_OF_BYTES_IN_HASH) { 322 initialize(); 323 final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1; 324 for (int i = 0; i < stop; i++) { 325 insertString(i); 326 } 327 missedInserts = NUMBER_OF_BYTES_IN_HASH - 1; 328 } else { // not enough data to hash anything 329 missedInserts = len; 330 } 331 blockStart = currentPosition = len; 332 } 333 334 // we use a 15 bit hashcode as calculated in updateHash 335 private static final int HASH_SIZE = 1 << 15; 336 private static final int HASH_MASK = HASH_SIZE - 1; 337 private static final int H_SHIFT = 5; 338 339 /** 340 * Assumes we are calculating the hash for three consecutive bytes 341 * as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC 342 * the new hash for BCD is nextHash(H, D). 343 * 344 * <p>The hash is shifted by five bits on each update so all 345 * effects of A have been swapped after the third update.</p> 346 */ 347 private int nextHash(int oldHash, byte nextByte) { 348 final int nextVal = nextByte & 0xFF; 349 return ((oldHash << H_SHIFT) ^ nextVal) & HASH_MASK; 350 } 351 352 // performs the actual algorithm with the pre-condition len <= windowSize 353 private void doCompress(byte[] data, int off, int len) throws IOException { 354 int spaceLeft = window.length - currentPosition - lookahead; 355 if (len > spaceLeft) { 356 slide(); 357 } 358 System.arraycopy(data, off, window, currentPosition + lookahead, len); 359 lookahead += len; 360 if (!initialized && lookahead >= params.getMinBackReferenceLength()) { 361 initialize(); 362 } 363 if (initialized) { 364 compress(); 365 } 366 } 367 368 private void slide() throws IOException { 369 final int wSize = params.getWindowSize(); 370 if (blockStart != currentPosition && blockStart < wSize) { 371 flushLiteralBlock(); 372 blockStart = currentPosition; 373 } 374 System.arraycopy(window, wSize, window, 0, wSize); 375 currentPosition -= wSize; 376 matchStart -= wSize; 377 blockStart -= wSize; 378 for (int i = 0; i < HASH_SIZE; i++) { 379 int h = head[i]; 380 head[i] = h >= wSize ? h - wSize : NO_MATCH; 381 } 382 for (int i = 0; i < wSize; i++) { 383 int p = prev[i]; 384 prev[i] = p >= wSize ? p - wSize : NO_MATCH; 385 } 386 } 387 388 private void initialize() { 389 for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) { 390 insertHash = nextHash(insertHash, window[i]); 391 } 392 initialized = true; 393 } 394 395 private void compress() throws IOException { 396 final int minMatch = params.getMinBackReferenceLength(); 397 final boolean lazy = params.getLazyMatching(); 398 final int lazyThreshold = params.getLazyMatchingThreshold(); 399 400 while (lookahead >= minMatch) { 401 catchUpMissedInserts(); 402 int matchLength = 0; 403 int hashHead = insertString(currentPosition); 404 if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) { 405 // sets matchStart as a side effect 406 matchLength = longestMatch(hashHead); 407 408 if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) { 409 // try to find a longer match using the next position 410 matchLength = longestMatchForNextPosition(matchLength); 411 } 412 } 413 if (matchLength >= minMatch) { 414 if (blockStart != currentPosition) { 415 // emit preceeding literal block 416 flushLiteralBlock(); 417 blockStart = NO_MATCH; 418 } 419 flushBackReference(matchLength); 420 insertStringsInMatch(matchLength); 421 lookahead -= matchLength; 422 currentPosition += matchLength; 423 blockStart = currentPosition; 424 } else { 425 // no match, append to current or start a new literal 426 lookahead--; 427 currentPosition++; 428 if (currentPosition - blockStart >= params.getMaxLiteralLength()) { 429 flushLiteralBlock(); 430 blockStart = currentPosition; 431 } 432 } 433 } 434 } 435 436 /** 437 * Inserts the current three byte sequence into the dictionary and 438 * returns the previous head of the hash-chain. 439 * 440 * <p>Updates <code>insertHash</code> and <code>prev</code> as a 441 * side effect.</p> 442 */ 443 private int insertString(int pos) { 444 insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]); 445 int hashHead = head[insertHash]; 446 prev[pos & wMask] = hashHead; 447 head[insertHash] = pos; 448 return hashHead; 449 } 450 451 private int longestMatchForNextPosition(final int prevMatchLength) { 452 // save a bunch of values to restore them if the next match isn't better than the current one 453 final int prevMatchStart = matchStart; 454 final int prevInsertHash = insertHash; 455 456 lookahead--; 457 currentPosition++; 458 int hashHead = insertString(currentPosition); 459 final int prevHashHead = prev[currentPosition & wMask]; 460 int matchLength = longestMatch(hashHead); 461 462 if (matchLength <= prevMatchLength) { 463 // use the first match, as the next one isn't any better 464 matchLength = prevMatchLength; 465 matchStart = prevMatchStart; 466 467 // restore modified values 468 head[insertHash] = prevHashHead; 469 insertHash = prevInsertHash; 470 currentPosition--; 471 lookahead++; 472 } 473 return matchLength; 474 } 475 476 private void insertStringsInMatch(int matchLength) { 477 // inserts strings contained in current match 478 // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts 479 final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH); 480 // currentPosition has been inserted already 481 for (int i = 1; i <= stop; i++) { 482 insertString(currentPosition + i); 483 } 484 missedInserts = matchLength - stop - 1; 485 } 486 487 private void catchUpMissedInserts() { 488 while (missedInserts > 0) { 489 insertString(currentPosition - missedInserts--); 490 } 491 } 492 493 private void flushBackReference(int matchLength) throws IOException { 494 callback.accept(new BackReference(currentPosition - matchStart, matchLength)); 495 } 496 497 private void flushLiteralBlock() throws IOException { 498 callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart)); 499 } 500 501 /** 502 * Searches the hash chain for real matches and returns the length 503 * of the longest match (0 if none were found) that isn't too far 504 * away (WRT maxOffset). 505 * 506 * <p>Sets matchStart to the index of the start position of the 507 * longest match as a side effect.</p> 508 */ 509 private int longestMatch(int matchHead) { 510 final int minLength = params.getMinBackReferenceLength(); 511 int longestMatchLength = minLength - 1; 512 final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead); 513 final int minIndex = Math.max(0, currentPosition - params.getMaxOffset()); 514 final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength()); 515 final int maxCandidates = params.getMaxCandidates(); 516 for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) { 517 int currentLength = 0; 518 for (int i = 0; i < maxPossibleLength; i++) { 519 if (window[matchHead + i] != window[currentPosition + i]) { 520 break; 521 } 522 currentLength++; 523 } 524 if (currentLength > longestMatchLength) { 525 longestMatchLength = currentLength; 526 matchStart = matchHead; 527 if (currentLength >= niceBackReferenceLength) { 528 // no need to search any further 529 break; 530 } 531 } 532 matchHead = prev[matchHead & wMask]; 533 } 534 return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress() 535 } 536}