1 // Written in the D programming language. 2 3 /** 4 Components that can compress and expand ranges. 5 6 Description: 7 Compression is useful for storage and transmission data formats. 8 This compresses using a variant of the 9 $(WEB en.wikipedia.org/wiki/LZ77_and_LZ78, LZ77) compression algorithm. 10 compress() and expand() are meant to be a matched set. Users should 11 not depend on the particular data format. 12 13 Example: 14 This program takes a filename as an argument, compresses it, 15 expands it, and verifies that the result matches the original 16 contents. 17 --- 18 int main(string args[]) 19 { 20 import std.stdio; 21 import std.algorithm; 22 import std.file; 23 import std.compression.lz77; 24 25 string filename; 26 if (args.length < 2) 27 { 28 printf("need filename argument\n"); 29 return 1; 30 } 31 else 32 filename = args[1]; 33 34 ubyte[] si = cast(ubyte[])std.file.read(filename); 35 36 // Compress 37 auto di = new ubyte[maxCompressedSize(si.length)]; 38 auto result = si.compress().copy(di); 39 40 di = di[0..$ - result.length]; 41 42 writefln("Compression done, srclen = %s compressed = %s", 43 si.length, di.length); 44 45 // Decompress 46 ubyte[] si2 = new ubyte[si.length]; 47 result = di.expand().copy(si2); 48 assert(result.length == 0); 49 50 writefln("Decompression done, dilen = %s decompressed = %s", 51 di.length, si2.length); 52 53 if (si != si2) 54 { 55 writeln("Buffers don't match"); 56 assert(0); 57 } 58 59 return 0; 60 } 61 --- 62 63 References: $(WEB en.wikipedia.org/wiki/LZ77_and_LZ78, Wikipedia) 64 65 Macros: 66 WIKI = Phobos/StdCompress 67 68 Copyright: Copyright Digital Mars 2013. 69 License: $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0). 70 Authors: $(WEB digitalmars.com, Walter Bright) 71 Source: $(SARGONSRC src/sargon/_lz77.d) 72 */ 73 74 module sargon.lz77; 75 76 import std.range; 77 78 //import core.stdc.stdio : printf; 79 80 private struct CircularBuffer(U: T[dim], T, size_t dim) 81 { 82 void put(T e) 83 { 84 assert(length != dim); 85 size_t i = first + length; 86 if (i >= dim) 87 i -= dim; 88 buf[i] = e; 89 length += 1; 90 } 91 92 @property T front() 93 { 94 assert(length); 95 return buf[first]; 96 } 97 98 void popFront() 99 { 100 assert(length); 101 first += 1; 102 if (first == dim) 103 first = 0; 104 --length; 105 } 106 107 @property bool full() 108 { 109 return length == dim; 110 } 111 112 @property bool empty() 113 { 114 return length == 0; 115 } 116 117 T opIndex(size_t i) 118 { 119 assert(i < length); 120 i += first; 121 if (i >= dim) 122 i -= dim; 123 return buf[i]; 124 } 125 126 size_t length; 127 128 private: 129 size_t first; 130 T[dim] buf = void; 131 } 132 133 unittest 134 { 135 CircularBuffer!(ubyte[3]) buf; 136 assert(buf.empty); 137 assert(buf.length == 0); 138 assert(!buf.full); 139 140 buf.put(7); 141 assert(!buf.empty); 142 assert(buf.length == 1); 143 assert(!buf.full); 144 assert(buf[0] == 7); 145 146 buf.put(8); 147 buf.put(9); 148 assert(!buf.empty); 149 assert(buf.length == 3); 150 assert(buf.full); 151 152 assert(buf.front == 7); 153 buf.popFront(); 154 buf.put(10); 155 assert(!buf.empty); 156 assert(buf.length == 3); 157 assert(buf.full); 158 assert(buf[0] == 8); 159 assert(buf[1] == 9); 160 assert(buf[2] == 10); 161 } 162 163 /* *********************************************************************** */ 164 165 private 166 { 167 enum matchLenMax = 0x80; 168 enum offsetMax = 0x8000/4; 169 } 170 171 /********************************* 172 * Compute the max size of a buffer needed to hold the compressed result. 173 * Parameters: 174 * length number of bytes of uncompressed data 175 * Returns: 176 * max size of compressed buffer 177 */ 178 size_t maxCompressedSize(size_t length) 179 { 180 return length + (length >> 3) + 3; 181 } 182 183 /************************************ 184 * Exception thrown on errors in compression functions, 185 * likely caused by corrupted data. 186 */ 187 188 class CompressException : Exception 189 { 190 this(string msg, string file = __FILE__, size_t line = __LINE__, Throwable next = null) 191 { 192 super(msg, file, line, next); 193 } 194 } 195 196 /******************************************************* 197 * Component that returns a Range that will compress src using LZ77 compression. 198 * 199 * Description: 200 * Compresses input of arbitrarily large size. 201 * 202 * compress() does not allocate memory, although it uses a little over 8Kb 203 * of stack if src is not an array. 204 * 205 * Parameters: 206 * src An InputRange of ubytes. 207 * If it is an array, it will go faster and will 208 * use much less stack. 209 * Returns: 210 * InputRange 211 */ 212 213 auto compress(R)(R src) if (isInputRange!R && is(ElementType!R : ubyte)) 214 { 215 enum bool arrayLike = isRandomAccessRange!R && hasLength!R; 216 217 static struct Range 218 { 219 private: 220 R src; 221 222 static if (!arrayLike) 223 /* Need to create a sliding window buffer, since we 224 * cannot look back in src[] 225 */ 226 CircularBuffer!(ubyte[offsetMax + 1 + matchLenMax]) buf; 227 228 ubyte[32] dst; // temp buffer for output 229 size_t dis; // index in dst[] of next ubyte to be emitted 230 size_t dip; // where bits is to go in dst[] 231 size_t di = 1; // current position in dst[] 232 233 size_t si = 0; // current index into buf[] or src[] 234 uint bitsLeft = 8; // space left in bits 235 uint bits; // encoded matchlen and offset 236 237 bool done = false; 238 239 public: 240 241 this(R src) { this.src = src; } 242 243 void popFront() 244 { 245 //printf("dst.popFront()\n"); 246 dis = (dis + 1) & 0x1F; 247 } 248 249 @property ubyte front() 250 { 251 //printf("dst.front()\n"); 252 //printf("%02x\n", dst[dis]); 253 return dst[dis]; 254 } 255 256 @property bool empty() 257 { 258 //printf("dst.empty()\n"); 259 if (dis != dip) 260 return false; 261 262 if (done) 263 return true; 264 265 void putBit(uint c) 266 { 267 bits = (bits << 1) + c; 268 if (--bitsLeft == 0) 269 { 270 dst[dip] = cast(ubyte)bits; 271 dip = di; 272 di = (di + 1) & 0x1F; 273 assert(di != dis); 274 bitsLeft = 8; 275 } 276 } 277 278 while (1) 279 { 280 if (dis != dip) 281 return false; 282 283 static if (arrayLike) 284 { 285 if (si == src.length) 286 break; 287 size_t mlmax = matchLenMax; 288 if (mlmax > src.length - si) 289 mlmax = src.length - si; 290 } 291 else 292 { 293 while (!buf.full && !src.empty) 294 { 295 buf.put(src.front); 296 src.popFront(); 297 } 298 if (si == buf.length) 299 break; 300 301 size_t mlmax = matchLenMax; 302 if (mlmax > buf.length - si) 303 mlmax = buf.length - si; 304 } 305 306 int offsetmax = cast(int)((si > offsetMax) ? -offsetMax : -si); 307 308 /* Look for best match, and compute matchlen and offset 309 */ 310 uint matchlen = 0; 311 uint offset; 312 for (int i = -1; i >= offsetmax; i--) 313 { 314 static if (arrayLike) 315 { 316 if (src[si + i] != src[si]) 317 continue; 318 } 319 else 320 { 321 if (buf[si + i] != buf[si]) 322 continue; 323 } 324 325 // Compute longest match 326 uint j; 327 for (j = 1; j < mlmax; j++) 328 { 329 static if (arrayLike) 330 { 331 if (src[si + i + j] != src[si + j]) 332 break; 333 } 334 else 335 { 336 if (buf[si + i + j] != buf[si + j]) 337 break; 338 } 339 } 340 if (j > matchlen) 341 { 342 matchlen = j; 343 offset = -i - 1; 344 } 345 } 346 347 //if (matchlen > 1) 348 //printf("offset = x%x, matchlen = x%x\n", offset, matchlen); 349 350 void advanceSrc() 351 { 352 static if (arrayLike) 353 ++si; 354 else 355 { 356 if (si < offsetMax) 357 ++si; 358 else 359 buf.popFront(); 360 } 361 } 362 363 switch (matchlen) 364 { 365 case 0: 366 case 1: 367 Lbyte: 368 putBit(1); 369 static if (arrayLike) 370 dst[di] = src[si]; 371 else 372 dst[di] = buf[si]; 373 di = (di + 1) & 0x1F; 374 advanceSrc(); 375 continue; 376 377 case 2: // 000 378 if (offset >= 256) 379 goto Lbyte; 380 putBit(0); 381 putBit(0); 382 putBit(0); 383 dst[di] = cast(ubyte)offset; 384 di = (di + 1) & 0x1F; 385 advanceSrc(); 386 advanceSrc(); 387 continue; 388 389 case 3: // 001 390 putBit(0); 391 putBit(0); 392 putBit(1); 393 break; 394 395 case 4: // 0100 396 case 5: // 0101 397 putBit(0); 398 putBit(1); 399 putBit(0); 400 putBit(matchlen & 1); 401 break; 402 403 case 6: // 01100 404 case 7: // 01101 405 putBit(0); 406 putBit(1); 407 putBit(1); 408 putBit(0); 409 putBit(matchlen & 1); 410 break; 411 412 case 8: // 0111000 413 case 9: // 0111001 414 case 10: // 0111010 415 case 11: // 0111011 416 putBit(0); 417 putBit(1); 418 putBit(1); 419 putBit(1); 420 putBit(0); 421 putBit((matchlen >> 1) & 1); 422 putBit(matchlen & 1); 423 break; 424 425 case 12: .. case 19: // 011110XXX 426 putBit(0); 427 putBit(1); 428 putBit(1); 429 putBit(1); 430 putBit(1); 431 putBit(0); 432 putBit(((matchlen - 12) >> 2) & 1); 433 putBit((matchlen >> 1) & 1); 434 putBit(matchlen & 1); 435 break; 436 437 default: // 011111 438 assert(matchlen <= 0x80); 439 putBit(0); 440 putBit(1); 441 putBit(1); 442 putBit(1); 443 putBit(1); 444 putBit(1); 445 dst[di] = cast(ubyte)matchlen; 446 di = (di + 1) & 0x1F; 447 break; 448 } 449 if (offset < 0x100) // 00 450 { 451 putBit(0); 452 putBit(0); 453 } 454 else if (offset < 0x200) // 010 455 { 456 putBit(0); 457 putBit(1); 458 putBit(0); 459 } 460 else if (offset < 0x400) // 011X 461 { 462 putBit(0); 463 putBit(1); 464 putBit(1); 465 putBit((offset >> 8) & 1); 466 } 467 else if (offset < 0x800) // 100XX 468 { 469 putBit(1); 470 putBit(0); 471 putBit(0); 472 putBit((offset >> 9) & 1); 473 putBit((offset >> 8) & 1); 474 } 475 else if (offset < 0x1000) // 101XXX 476 { 477 putBit(1); 478 putBit(0); 479 putBit(1); 480 putBit((offset >> 10) & 1); 481 putBit((offset >> 9) & 1); 482 putBit((offset >> 8) & 1); 483 } 484 else if (offset < 0x2000) // 110XXXX 485 { 486 putBit(1); 487 putBit(1); 488 putBit(0); 489 putBit((offset >> 11) & 1); 490 putBit((offset >> 10) & 1); 491 putBit((offset >> 9) & 1); 492 putBit((offset >> 8) & 1); 493 } 494 else if (offset < 0x3000) // 1110XXXX 495 { 496 putBit(1); 497 putBit(1); 498 putBit(1); 499 putBit(0); 500 putBit((offset >> 11) & 1); 501 putBit((offset >> 10) & 1); 502 putBit((offset >> 9) & 1); 503 putBit((offset >> 8) & 1); 504 } 505 else if (offset < 0x4000) // 11110XXXX 506 { 507 putBit(1); 508 putBit(1); 509 putBit(1); 510 putBit(1); 511 putBit(0); 512 putBit((offset >> 11) & 1); 513 putBit((offset >> 10) & 1); 514 putBit((offset >> 9) & 1); 515 putBit((offset >> 8) & 1); 516 } 517 else if (offset < 0x8000) // 11111XXXXXX 518 { 519 putBit(1); 520 putBit(1); 521 putBit(1); 522 putBit(1); 523 putBit(1); 524 putBit((offset >> 13) & 1); 525 putBit((offset >> 12) & 1); 526 putBit((offset >> 11) & 1); 527 putBit((offset >> 10) & 1); 528 putBit((offset >> 9) & 1); 529 putBit((offset >> 8) & 1); 530 } 531 else 532 { 533 assert(0); 534 } 535 dst[di] = offset & 0xFF; 536 di = (di + 1) & 0x1F; 537 static if (arrayLike) 538 si += matchlen; 539 else 540 { 541 foreach (i; 0 .. matchlen) 542 { 543 advanceSrc(); 544 } 545 } 546 547 if (dis != dip) 548 return false; 549 } 550 551 // Put end marker, 011111 0x82 552 putBit(0); 553 putBit(1); 554 putBit(1); 555 putBit(1); 556 putBit(1); 557 putBit(1); 558 559 bits <<= bitsLeft; 560 dst[dip] = cast(ubyte)bits; 561 562 dst[di] = 0x82; 563 di = (di + 1) & 0x1F; 564 dip = di; 565 done = true; 566 return false; 567 } 568 } 569 570 return Range(src); 571 } 572 573 /************************************* 574 * Component to expand compressed result of LZ77 Compress. 575 * 576 * Description: 577 * Does not allocate memory. The expand operation is quite a bit 578 * faster than the corresponding compress. 579 * Parameters: 580 * src An InputRange over data generated by compress. 581 * Returns: 582 * An InputRange which provides the expanded data. 583 */ 584 585 586 auto expand(R)(R src) if (isInputRange!R && is(ElementType!R : ubyte)) 587 { 588 enum bool arrayLike = isRandomAccessRange!R && hasLength!R; 589 590 static struct Range 591 { 592 private: 593 R src; 594 size_t di = 0; 595 enum Prime = 9; // magic value to prime the pump 596 uint bitsLeft = Prime; 597 uint bits; 598 599 static if (arrayLike) 600 size_t si = 0; 601 602 CircularBuffer!(ubyte[offsetMax + 1 + matchLenMax]) dst; 603 604 public: 605 this(R src) { this.src = src; } 606 607 @property ubyte front() 608 { 609 return dst.front; 610 } 611 612 void popFront() 613 { 614 dst.popFront(); 615 } 616 617 @property bool empty() 618 { 619 if (dst.length > offsetMax) 620 return false; 621 622 uint count = 0; 623 uint off; 624 uint offh; 625 626 while (dst.length <= offsetMax) 627 { 628 // If no more input 629 static if (arrayLike) 630 { 631 if (si == src.length) 632 break; 633 } 634 else 635 { 636 if (src.empty) 637 break; 638 } 639 640 if (bitsLeft == Prime) // prime the pump 641 { 642 bits = get(); 643 bitsLeft = 8; 644 } 645 646 if (getBit()) 647 { 648 dst.put(get()); // straight byte 649 continue; 650 } 651 652 // 0 653 if (!getBit() || // 00 654 (++count, !getBit()) || // 010 655 (++count, !getBit())) // 0110 656 { 657 count++; // count = 1,2,3 658 count += count + getBit(); // count = 2-7 659 //printf("count = %d\n", count); 660 if (count == 2) // 000 661 { off = 0; 662 goto BACK_COPY; 663 } 664 } 665 else 666 { 667 //0111 668 if (!getBit()) //01110XX is 8-11 669 { 670 count = getBit(); 671 count = (count << 1) | getBit(); 672 count += 8; 673 } 674 //01111 675 else if (!getBit()) //11110XXX is 12-19 676 { 677 count = getBit(); 678 count = (count << 1) | getBit(); 679 count = (count << 1) | getBit(); 680 count += 12; 681 } 682 else 683 { 684 //011111 685 count = get(); 686 //printf("count = %02x, si = %04x\n", count, si - *psi); 687 if (count >= 0x81) 688 { 689 if (count != 0x81) 690 { 691 // Reached the end of the source 692 static if (arrayLike) 693 { if (si != src.length) 694 throw new CompressException("compressed data is corrupt"); 695 } 696 else 697 { if (!src.empty) 698 throw new CompressException("compressed data is corrupt"); 699 } 700 break; 701 } 702 count = 0; 703 continue; 704 } 705 } 706 } 707 708 // Get high byte of offset 709 710 if (getBit()) 711 { 712 // 1 713 if (!getBit()) 714 { 715 // 10 716 offh = 0x402; 717 if (getBit()) // 100XX is 4-7 718 { 719 // 101XXX is 8-0xF 720 offh = 0x803; 721 } 722 } 723 else 724 { 725 // 11 726 offh = 0x1004; 727 if (getBit()) // 110XXXX is 0x10-0x1F 728 { 729 // 111 730 offh = 0x2004; 731 if (getBit()) // 1110XXXX is 0x20-0x2F 732 { 733 // 1111 734 offh = 0x3004; // 11110XXXX is 0x30-0x3F 735 if (getBit()) 736 offh = 0x4006; // 11111XXXXXX is 0x40-0x7F 737 } 738 } 739 } 740 } 741 //0 742 else if (getBit()) 743 { 744 //01 745 off = 0x100; 746 if (!getBit()) 747 goto BACK_COPY; // 010 is 1 748 //011X is 2 or 3 749 offh = 0x201; 750 } 751 else 752 { 753 //00 is 0 754 off = 0; 755 goto BACK_COPY; 756 } 757 off = 0; 758 do 759 { 760 off = (off << 1) | (getBit() << 8); 761 } while ((--offh & 0xFF) != 0); 762 off += offh & 0xFF00; 763 764 765 BACK_COPY: 766 off |= get(); // bottom 8 bits of offset 767 if (off + 1 > dst.length) 768 throw new CompressException("corrupt compressed data"); 769 for (; count; count--) 770 { 771 dst.put(dst[dst.length - (off + 1)]); 772 } 773 } 774 return dst.empty; 775 } 776 777 private: 778 779 /* Get next bit from src[] 780 */ 781 bool getBit() 782 { 783 bool c = (bits & 0x80) != 0; 784 bits <<= 1; 785 if (--bitsLeft == 0) 786 { 787 bitsLeft = 8; 788 bits = get(); 789 } 790 return c; 791 } 792 793 /* Get next ubyte from src[] 794 */ 795 ubyte get() 796 { 797 static if (arrayLike) 798 return src[si++]; 799 else 800 { 801 assert(!src.empty); 802 auto c = src.front; 803 src.popFront(); 804 return c; 805 } 806 } 807 } 808 809 return Range(src); 810 } 811 812 /* =========================================================== */ 813 814 815 version (unittest) 816 { 817 import core.stdc.stdio; 818 819 // Roll our own to avoid importing std.algorithm which pulls in everything 820 ubyte[] copy(R)(R src, ubyte[] dst) 821 { 822 size_t i = 0; 823 while (!src.empty) 824 { 825 dst[i++] = src.front; 826 src.popFront(); 827 } 828 return dst[i .. dst.length]; 829 } 830 831 // Convert array to InputRange in order to test Range code paths 832 struct Adapter 833 { 834 this(ubyte[] r) { this.r = r; } 835 836 @property bool empty() { return r.length == 0; } 837 838 @property ubyte front() { return r[0]; } 839 840 void popFront() { r = r[1 .. $]; } 841 842 @property size_t length() { return r.length; } 843 844 private: 845 846 ubyte[] r; 847 } 848 } 849 850 void test() 851 { 852 ubyte[] src; 853 src.compress(); 854 src.expand(); 855 } 856 857 unittest 858 { 859 void testArray(ubyte[] src) 860 { 861 auto di = new ubyte[maxCompressedSize(src.length)]; 862 auto result = src.compress().copy(di); 863 di = di[0 .. $ - result.length]; 864 ubyte[] src2 = new ubyte[src.length]; 865 result = di.expand().copy(src2); 866 if (src != src2) 867 { 868 printf("Buffers don't match\n"); 869 assert(0); 870 } 871 } 872 873 void testRange(InputRange)(ubyte[] src1, ref InputRange src) 874 { 875 auto di = new ubyte[maxCompressedSize(src.length)]; 876 auto result = src.compress().copy(di); 877 di = di[0 .. $ - result.length]; 878 ubyte[] src2 = new ubyte[src.length]; 879 result = di.expand().copy(src2); 880 if (src1 != src2) 881 { 882 printf("Buffers don't match\n"); 883 assert(0); 884 } 885 } 886 887 foreach (i; 0 .. 20) 888 { 889 ubyte[] src = new ubyte[i]; 890 testArray(src); 891 auto a = Adapter(src); 892 testRange(src, a); 893 } 894 895 static string gettysburg = 896 "Four score and seven years ago our fathers brought forth on this continent a new nation, 897 conceived in liberty, and dedicated to the proposition that all men are created equal. 898 899 Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived 900 and so dedicated, can long endure. We are met on a great battlefield of that war. We have come 901 to dedicate a portion of that field, as a final resting place for those who here gave their 902 lives that that nation might live. It is altogether fitting and proper that we should do this. 903 904 But, in a larger sense, we can not dedicate, we can not consecrate, we can not hallow this 905 ground. The brave men, living and dead, who struggled here, have consecrated it, far above our 906 poor power to add or detract. The world will little note, nor long remember what we say here, 907 but it can never forget what they did here. It is for us the living, rather, to be dedicated 908 here to the unfinished work which they who fought here have thus far so nobly advanced. It is 909 rather for us to be here dedicated to the great task remaining before us-that from these honored 910 dead we take increased devotion to that cause for which they gave the last full measure of 911 devotion-that we here highly resolve that these dead shall not have died in vain-that this 912 nation, under God, shall have a new birth of freedom-and that government of the people, by the 913 people, for the people, shall not perish from the earth."; 914 915 testArray(cast(ubyte[])gettysburg); 916 917 ubyte[] p = new ubyte[0x100000]; 918 size_t s; 919 while (s < p.length) 920 { 921 if (s + gettysburg.length > p.length) 922 break; 923 p[s .. s+gettysburg.length] = (cast(ubyte[])gettysburg)[]; 924 s += gettysburg.length; 925 926 foreach (i; 0 .. s) 927 { 928 if (s + i >= p.length) 929 break; 930 p[s + i] = cast(ubyte)((s + i) / 10); 931 } 932 s += s; 933 } 934 935 testArray(p); 936 }