1 // Written in the D programming language.
2 
3 /**
4     Components that can compress and expand ranges.
5 
6     Description:
7         Compression is useful for storage and transmission data formats.
8         This compresses using a variant of the
9         $(WEB en.wikipedia.org/wiki/LZ77_and_LZ78, LZ77) compression algorithm.
10         compress() and expand() are meant to be a matched set. Users should
11         not depend on the particular data format.
12 
13     Example:
14     This program takes a filename as an argument, compresses it,
15     expands it, and verifies that the result matches the original
16     contents.
17     ---
18     int main(string args[])
19     {
20         import std.stdio;
21         import std.algorithm;
22         import std.file;
23         import std.compression.lz77;
24 
25         string filename;
26         if (args.length < 2)
27         {
28             printf("need filename argument\n");
29             return 1;
30         }
31         else
32             filename = args[1];
33 
34         ubyte[] si = cast(ubyte[])std.file.read(filename);
35 
36         // Compress
37         auto di = new ubyte[maxCompressedSize(si.length)];
38         auto result = si.compress().copy(di);
39 
40         di = di[0..$ - result.length];
41 
42         writefln("Compression done, srclen = %s compressed = %s",
43             si.length, di.length);
44 
45         // Decompress
46         ubyte[] si2 = new ubyte[si.length];
47         result = di.expand().copy(si2);
48         assert(result.length == 0);
49 
50         writefln("Decompression done, dilen = %s decompressed = %s",
51             di.length, si2.length);
52 
53         if (si != si2)
54         {
55             writeln("Buffers don't match");
56             assert(0);
57         }
58 
59         return 0;
60     }
61     ---
62 
63     References: $(WEB en.wikipedia.org/wiki/LZ77_and_LZ78, Wikipedia)
64 
65     Macros:
66         WIKI = Phobos/StdCompress
67 
68     Copyright: Copyright Digital Mars 2013.
69     License:   $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
70     Authors:   $(WEB digitalmars.com, Walter Bright)
71     Source:    $(SARGONSRC src/sargon/_lz77.d)
72    */
73 
74 module sargon.lz77;
75 
76 import std.range;
77 
78 //import core.stdc.stdio : printf;
79 
80 private struct CircularBuffer(U: T[dim], T, size_t dim)
81 {
82     void put(T e)
83     {
84         assert(length != dim);
85         size_t i = first + length;
86         if (i >= dim)
87             i -= dim;
88         buf[i] = e;
89         length += 1;
90     }
91 
92     @property T front()
93     {
94         assert(length);
95         return buf[first];
96     }
97 
98     void popFront()
99     {
100         assert(length);
101         first += 1;
102         if (first == dim)
103             first = 0;
104         --length;
105     }
106 
107     @property bool full()
108     {
109         return length == dim;
110     }
111 
112     @property bool empty()
113     {
114         return length == 0;
115     }
116 
117     T opIndex(size_t i)
118     {
119         assert(i < length);
120         i += first;
121         if (i >= dim)
122             i -= dim;
123         return buf[i];
124     }
125 
126     size_t length;
127 
128   private:
129     size_t first;
130     T[dim] buf = void;
131 }
132 
133 unittest
134 {
135     CircularBuffer!(ubyte[3]) buf;
136     assert(buf.empty);
137     assert(buf.length == 0);
138     assert(!buf.full);
139 
140     buf.put(7);
141     assert(!buf.empty);
142     assert(buf.length == 1);
143     assert(!buf.full);
144     assert(buf[0] == 7);
145 
146     buf.put(8);
147     buf.put(9);
148     assert(!buf.empty);
149     assert(buf.length == 3);
150     assert(buf.full);
151 
152     assert(buf.front == 7);
153     buf.popFront();
154     buf.put(10);
155     assert(!buf.empty);
156     assert(buf.length == 3);
157     assert(buf.full);
158     assert(buf[0] == 8);
159     assert(buf[1] == 9);
160     assert(buf[2] == 10);
161 }
162 
163 /* *********************************************************************** */
164 
165 private
166 {
167     enum matchLenMax = 0x80;
168     enum offsetMax = 0x8000/4;
169 }
170 
171 /*********************************
172  * Compute the max size of a buffer needed to hold the compressed result.
173  * Parameters:
174  *      length  number of bytes of uncompressed data
175  * Returns:
176  *      max size of compressed buffer
177  */
178 size_t maxCompressedSize(size_t length)
179 {
180     return length + (length >> 3) + 3;
181 }
182 
183 /************************************
184  * Exception thrown on errors in compression functions,
185  * likely caused by corrupted data.
186  */
187 
188 class CompressException : Exception
189 {
190     this(string msg, string file = __FILE__, size_t line = __LINE__, Throwable next = null)
191     {
192         super(msg, file, line, next);
193     }
194 }
195 
196 /*******************************************************
197  * Component that returns a Range that will compress src using LZ77 compression.
198  *
199  * Description:
200  *      Compresses input of arbitrarily large size.
201  *
202  *      compress() does not allocate memory, although it uses a little over 8Kb
203  *      of stack if src is not an array.
204  *
205  * Parameters:
206  *      src     An InputRange of ubytes.
207  *              If it is an array, it will go faster and will
208  *              use much less stack.
209  * Returns:
210  *      InputRange
211  */
212 
213 auto compress(R)(R src) if (isInputRange!R && is(ElementType!R : ubyte))
214 {
215     enum bool arrayLike = isRandomAccessRange!R && hasLength!R;
216 
217     static struct Range
218     {
219       private:
220         R src;
221 
222         static if (!arrayLike)
223             /* Need to create a sliding window buffer, since we
224              * cannot look back in src[]
225              */
226             CircularBuffer!(ubyte[offsetMax + 1 + matchLenMax]) buf;
227 
228         ubyte[32] dst;  // temp buffer for output
229         size_t dis;     // index in dst[] of next ubyte to be emitted
230         size_t dip;     // where bits is to go in dst[]
231         size_t di = 1;  // current position in dst[]
232 
233         size_t si = 0;  // current index into buf[] or src[]
234         uint bitsLeft = 8; // space left in bits
235         uint bits;      // encoded matchlen and offset
236 
237         bool done = false;
238 
239       public:
240 
241         this(R src) { this.src = src; }
242 
243         void popFront()
244         {
245             //printf("dst.popFront()\n");
246             dis = (dis + 1) & 0x1F;
247         }
248 
249         @property ubyte front()
250         {
251             //printf("dst.front()\n");
252             //printf("%02x\n", dst[dis]);
253             return dst[dis];
254         }
255 
256         @property bool empty()
257         {
258             //printf("dst.empty()\n");
259             if (dis != dip)
260                 return false;
261 
262             if (done)
263                 return true;
264 
265             void putBit(uint c)
266             {
267                 bits = (bits << 1) + c;
268                 if (--bitsLeft == 0)
269                 {
270                     dst[dip] = cast(ubyte)bits;
271                     dip = di;
272                     di = (di + 1) & 0x1F;
273                     assert(di != dis);
274                     bitsLeft = 8;
275                 }
276             }
277 
278             while (1)
279             {
280                 if (dis != dip)
281                     return false;
282 
283                 static if (arrayLike)
284                 {
285                     if (si == src.length)
286                         break;
287                     size_t mlmax = matchLenMax;
288                     if (mlmax > src.length - si)
289                         mlmax = src.length - si;
290                 }
291                 else
292                 {
293                     while (!buf.full && !src.empty)
294                     {
295                         buf.put(src.front);
296                         src.popFront();
297                     }
298                     if (si == buf.length)
299                         break;
300 
301                     size_t mlmax = matchLenMax;
302                     if (mlmax > buf.length - si)
303                         mlmax = buf.length - si;
304                 }
305 
306                 int offsetmax = cast(int)((si > offsetMax) ? -offsetMax : -si);
307 
308                 /* Look for best match, and compute matchlen and offset
309                  */
310                 uint matchlen = 0;
311                 uint offset;
312                 for (int i = -1; i >= offsetmax; i--)
313                 {
314                     static if (arrayLike)
315                     {
316                         if (src[si + i] != src[si])
317                             continue;
318                     }
319                     else
320                     {
321                         if (buf[si + i] != buf[si])
322                             continue;
323                     }
324 
325                     // Compute longest match
326                     uint j;
327                     for (j = 1; j < mlmax; j++)
328                     {
329                         static if (arrayLike)
330                         {
331                             if (src[si + i + j] != src[si + j])
332                                 break;
333                         }
334                         else
335                         {
336                             if (buf[si + i + j] != buf[si + j])
337                                 break;
338                         }
339                     }
340                     if (j > matchlen)
341                     {
342                         matchlen = j;
343                         offset = -i - 1;
344                     }
345                 }
346 
347                 //if (matchlen > 1)
348                     //printf("offset = x%x, matchlen = x%x\n", offset, matchlen);
349 
350                 void advanceSrc()
351                 {
352                     static if (arrayLike)
353                         ++si;
354                     else
355                     {
356                         if (si < offsetMax)
357                             ++si;
358                         else
359                             buf.popFront();
360                     }
361                 }
362 
363                 switch (matchlen)
364                 {
365                     case 0:
366                     case 1:
367                     Lbyte:
368                         putBit(1);
369                         static if (arrayLike)
370                             dst[di] = src[si];
371                         else
372                             dst[di] = buf[si];
373                         di = (di + 1) & 0x1F;
374                         advanceSrc();
375                         continue;
376 
377                     case 2:             // 000
378                         if (offset >= 256)
379                             goto Lbyte;
380                         putBit(0);
381                         putBit(0);
382                         putBit(0);
383                         dst[di] = cast(ubyte)offset;
384                         di = (di + 1) & 0x1F;
385                         advanceSrc();
386                         advanceSrc();
387                         continue;
388 
389                     case 3:             // 001
390                         putBit(0);
391                         putBit(0);
392                         putBit(1);
393                         break;
394 
395                     case 4:             // 0100
396                     case 5:             // 0101
397                         putBit(0);
398                         putBit(1);
399                         putBit(0);
400                         putBit(matchlen & 1);
401                         break;
402 
403                     case 6:             // 01100
404                     case 7:             // 01101
405                         putBit(0);
406                         putBit(1);
407                         putBit(1);
408                         putBit(0);
409                         putBit(matchlen & 1);
410                         break;
411 
412                     case 8:             // 0111000
413                     case 9:             // 0111001
414                     case 10:            // 0111010
415                     case 11:            // 0111011
416                         putBit(0);
417                         putBit(1);
418                         putBit(1);
419                         putBit(1);
420                         putBit(0);
421                         putBit((matchlen >> 1) & 1);
422                         putBit(matchlen & 1);
423                         break;
424 
425                     case 12: .. case 19:  // 011110XXX
426                         putBit(0);
427                         putBit(1);
428                         putBit(1);
429                         putBit(1);
430                         putBit(1);
431                         putBit(0);
432                         putBit(((matchlen - 12) >> 2) & 1);
433                         putBit((matchlen >> 1) & 1);
434                         putBit(matchlen & 1);
435                         break;
436 
437                     default:            // 011111
438                         assert(matchlen <= 0x80);
439                         putBit(0);
440                         putBit(1);
441                         putBit(1);
442                         putBit(1);
443                         putBit(1);
444                         putBit(1);
445                         dst[di] = cast(ubyte)matchlen;
446                         di = (di + 1) & 0x1F;
447                         break;
448                 }
449                 if (offset < 0x100)             // 00
450                 {
451                     putBit(0);
452                     putBit(0);
453                 }
454                 else if (offset < 0x200)        // 010
455                 {
456                     putBit(0);
457                     putBit(1);
458                     putBit(0);
459                 }
460                 else if (offset < 0x400)        // 011X
461                 {
462                     putBit(0);
463                     putBit(1);
464                     putBit(1);
465                     putBit((offset >> 8) & 1);
466                 }
467                 else if (offset < 0x800)        // 100XX
468                 {
469                     putBit(1);
470                     putBit(0);
471                     putBit(0);
472                     putBit((offset >> 9) & 1);
473                     putBit((offset >> 8) & 1);
474                 }
475                 else if (offset < 0x1000)       // 101XXX
476                 {
477                     putBit(1);
478                     putBit(0);
479                     putBit(1);
480                     putBit((offset >> 10) & 1);
481                     putBit((offset >> 9) & 1);
482                     putBit((offset >> 8) & 1);
483                 }
484                 else if (offset < 0x2000)       // 110XXXX
485                 {
486                     putBit(1);
487                     putBit(1);
488                     putBit(0);
489                     putBit((offset >> 11) & 1);
490                     putBit((offset >> 10) & 1);
491                     putBit((offset >> 9) & 1);
492                     putBit((offset >> 8) & 1);
493                 }
494                 else if (offset < 0x3000)       // 1110XXXX
495                 {
496                     putBit(1);
497                     putBit(1);
498                     putBit(1);
499                     putBit(0);
500                     putBit((offset >> 11) & 1);
501                     putBit((offset >> 10) & 1);
502                     putBit((offset >> 9) & 1);
503                     putBit((offset >> 8) & 1);
504                 }
505                 else if (offset < 0x4000)       // 11110XXXX
506                 {
507                     putBit(1);
508                     putBit(1);
509                     putBit(1);
510                     putBit(1);
511                     putBit(0);
512                     putBit((offset >> 11) & 1);
513                     putBit((offset >> 10) & 1);
514                     putBit((offset >> 9) & 1);
515                     putBit((offset >> 8) & 1);
516                 }
517                 else if (offset < 0x8000)       // 11111XXXXXX
518                 {
519                     putBit(1);
520                     putBit(1);
521                     putBit(1);
522                     putBit(1);
523                     putBit(1);
524                     putBit((offset >> 13) & 1);
525                     putBit((offset >> 12) & 1);
526                     putBit((offset >> 11) & 1);
527                     putBit((offset >> 10) & 1);
528                     putBit((offset >> 9) & 1);
529                     putBit((offset >> 8) & 1);
530                 }
531                 else
532                 {
533                     assert(0);
534                 }
535                 dst[di] = offset & 0xFF;
536                 di = (di + 1) & 0x1F;
537                 static if (arrayLike)
538                     si += matchlen;
539                 else
540                 {
541                     foreach (i; 0 .. matchlen)
542                     {
543                         advanceSrc();
544                     }
545                 }
546 
547                 if (dis != dip)
548                     return false;
549             }
550 
551             // Put end marker, 011111 0x82
552             putBit(0);
553             putBit(1);
554             putBit(1);
555             putBit(1);
556             putBit(1);
557             putBit(1);
558 
559             bits <<= bitsLeft;
560             dst[dip] = cast(ubyte)bits;
561 
562             dst[di] = 0x82;
563             di = (di + 1) & 0x1F;
564             dip = di;
565             done = true;
566             return false;
567         }
568     }
569 
570     return Range(src);
571 }
572 
573 /*************************************
574  * Component to expand compressed result of LZ77 Compress.
575  *
576  * Description:
577  *      Does not allocate memory. The expand operation is quite a bit
578  *      faster than the corresponding compress.
579  * Parameters:
580  *      src     An InputRange over data generated by compress.
581  * Returns:
582  *      An InputRange which provides the expanded data.
583  */
584 
585 
586 auto expand(R)(R src) if (isInputRange!R && is(ElementType!R : ubyte))
587 {
588     enum bool arrayLike = isRandomAccessRange!R && hasLength!R;
589 
590     static struct Range
591     {
592       private:
593         R src;
594         size_t di = 0;
595         enum Prime = 9;         // magic value to prime the pump
596         uint bitsLeft = Prime;
597         uint bits;
598 
599         static if (arrayLike)
600             size_t si = 0;
601 
602         CircularBuffer!(ubyte[offsetMax + 1 + matchLenMax]) dst;
603 
604       public:
605         this(R src) { this.src = src; }
606 
607         @property ubyte front()
608         {
609             return dst.front;
610         }
611 
612         void popFront()
613         {
614             dst.popFront();
615         }
616 
617         @property bool empty()
618         {
619             if (dst.length > offsetMax)
620                 return false;
621 
622             uint count = 0;
623             uint off;
624             uint offh;
625 
626             while (dst.length <= offsetMax)
627             {
628                 // If no more input
629                 static if (arrayLike)
630                 {
631                     if (si == src.length)
632                         break;
633                 }
634                 else
635                 {
636                     if (src.empty)
637                         break;
638                 }
639 
640                 if (bitsLeft == Prime)                  // prime the pump
641                 {
642                     bits = get();
643                     bitsLeft = 8;
644                 }
645 
646                 if (getBit())
647                 {
648                     dst.put(get());                     // straight byte
649                     continue;
650                 }
651 
652                 // 0
653                 if (!getBit() ||                       // 00
654                     (++count, !getBit()) ||            // 010
655                     (++count, !getBit()))              // 0110
656                 {
657                     count++;            // count = 1,2,3
658                     count += count + getBit(); // count = 2-7
659                     //printf("count = %d\n", count);
660                     if (count == 2)     // 000
661                     {   off = 0;
662                         goto BACK_COPY;
663                     }
664                 }
665                 else
666                 {
667                     //0111
668                     if (!getBit())                     //01110XX is 8-11
669                     {
670                         count = getBit();
671                         count = (count << 1) | getBit();
672                         count += 8;
673                     }
674                     //01111
675                     else if (!getBit())                //11110XXX is 12-19
676                     {
677                         count = getBit();
678                         count = (count << 1) | getBit();
679                         count = (count << 1) | getBit();
680                         count += 12;
681                     }
682                     else
683                     {
684                         //011111
685                         count = get();
686                         //printf("count = %02x, si = %04x\n", count, si - *psi);
687                         if (count >= 0x81)
688                         {
689                             if (count != 0x81)
690                             {
691                                 // Reached the end of the source
692                                 static if (arrayLike)
693                                 {   if (si != src.length)
694                                         throw new CompressException("compressed data is corrupt");
695                                 }
696                                 else
697                                 {   if (!src.empty)
698                                         throw new CompressException("compressed data is corrupt");
699                                 }
700                                 break;
701                             }
702                             count = 0;
703                             continue;
704                         }
705                     }
706                 }
707 
708                 // Get high byte of offset
709 
710                 if (getBit())
711                 {
712                     // 1
713                     if (!getBit())
714                     {
715                         // 10
716                         offh = 0x402;
717                         if (getBit())                // 100XX is 4-7
718                         {
719                             // 101XXX is 8-0xF
720                             offh = 0x803;
721                         }
722                     }
723                     else
724                     {
725                         // 11
726                         offh = 0x1004;
727                         if (getBit())                // 110XXXX is 0x10-0x1F
728                         {
729                             // 111
730                             offh = 0x2004;
731                             if (getBit())            // 1110XXXX is 0x20-0x2F
732                             {
733                                 // 1111
734                                 offh = 0x3004;       // 11110XXXX is 0x30-0x3F
735                                 if (getBit())
736                                     offh = 0x4006;   // 11111XXXXXX is 0x40-0x7F
737                             }
738                         }
739                     }
740                 }
741                 //0
742                 else if (getBit())
743                 {
744                     //01
745                     off = 0x100;
746                     if (!getBit())
747                         goto BACK_COPY;         // 010 is 1
748                     //011X  is 2 or 3
749                     offh = 0x201;
750                 }
751                 else
752                 {
753                     //00 is 0
754                     off = 0;
755                     goto BACK_COPY;
756                 }
757                 off = 0;
758                 do
759                 {
760                     off = (off << 1) | (getBit() << 8);
761                 } while ((--offh & 0xFF) != 0);
762                 off += offh & 0xFF00;
763 
764 
765             BACK_COPY:
766                 off |= get();           // bottom 8 bits of offset
767                 if (off + 1 > dst.length)
768                     throw new CompressException("corrupt compressed data");
769                 for (; count; count--)
770                 {
771                     dst.put(dst[dst.length - (off + 1)]);
772                 }
773             }
774             return dst.empty;
775         }
776 
777       private:
778 
779         /* Get next bit from src[]
780          */
781         bool getBit()
782         {
783             bool c = (bits & 0x80) != 0;
784             bits <<= 1;
785             if (--bitsLeft == 0)
786             {
787                 bitsLeft = 8;
788                 bits = get();
789             }
790             return c;
791         }
792 
793         /* Get next ubyte from src[]
794          */
795         ubyte get()
796         {
797             static if (arrayLike)
798                 return src[si++];
799             else
800             {
801                 assert(!src.empty);
802                 auto c = src.front;
803                 src.popFront();
804                 return c;
805             }
806         }
807     }
808 
809     return Range(src);
810 }
811 
812 /* =========================================================== */
813 
814 
815 version (unittest)
816 {
817     import core.stdc.stdio;
818 
819     // Roll our own to avoid importing std.algorithm which pulls in everything
820     ubyte[] copy(R)(R src, ubyte[] dst)
821     {
822         size_t i = 0;
823         while (!src.empty)
824         {
825             dst[i++] = src.front;
826             src.popFront();
827         }
828         return dst[i .. dst.length];
829     }
830 
831     // Convert array to InputRange in order to test Range code paths
832     struct Adapter
833     {
834         this(ubyte[] r) { this.r = r; }
835 
836         @property bool empty() { return r.length == 0; }
837 
838         @property ubyte front() { return r[0]; }
839 
840         void popFront() { r = r[1 .. $]; }
841 
842         @property size_t length() { return r.length; }
843 
844       private:
845 
846         ubyte[] r;
847     }
848 }
849 
850 void test()
851 {
852     ubyte[] src;
853     src.compress();
854     src.expand();
855 }
856 
857 unittest
858 {
859     void testArray(ubyte[] src)
860     {
861         auto di = new ubyte[maxCompressedSize(src.length)];
862         auto result = src.compress().copy(di);
863         di = di[0 .. $ - result.length];
864         ubyte[] src2 = new ubyte[src.length];
865         result = di.expand().copy(src2);
866         if (src != src2)
867         {
868             printf("Buffers don't match\n");
869             assert(0);
870         }
871     }
872 
873     void testRange(InputRange)(ubyte[] src1, ref InputRange src)
874     {
875         auto di = new ubyte[maxCompressedSize(src.length)];
876         auto result = src.compress().copy(di);
877         di = di[0 .. $ - result.length];
878         ubyte[] src2 = new ubyte[src.length];
879         result = di.expand().copy(src2);
880         if (src1 != src2)
881         {
882             printf("Buffers don't match\n");
883             assert(0);
884         }
885     }
886 
887     foreach (i; 0 .. 20)
888     {
889         ubyte[] src = new ubyte[i];
890         testArray(src);
891         auto a = Adapter(src);
892         testRange(src, a);
893     }
894 
895     static string gettysburg =
896     "Four score and seven years ago our fathers brought forth on this continent a new nation,
897     conceived in liberty, and dedicated to the proposition that all men are created equal.
898 
899     Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived
900     and so dedicated, can long endure. We are met on a great battlefield of that war. We have come
901     to dedicate a portion of that field, as a final resting place for those who here gave their
902     lives that that nation might live. It is altogether fitting and proper that we should do this.
903 
904     But, in a larger sense, we can not dedicate, we can not consecrate, we can not hallow this
905     ground. The brave men, living and dead, who struggled here, have consecrated it, far above our
906     poor power to add or detract. The world will little note, nor long remember what we say here,
907     but it can never forget what they did here. It is for us the living, rather, to be dedicated
908     here to the unfinished work which they who fought here have thus far so nobly advanced. It is
909     rather for us to be here dedicated to the great task remaining before us-that from these honored
910     dead we take increased devotion to that cause for which they gave the last full measure of
911     devotion-that we here highly resolve that these dead shall not have died in vain-that this
912     nation, under God, shall have a new birth of freedom-and that government of the people, by the
913     people, for the people, shall not perish from the earth.";
914 
915     testArray(cast(ubyte[])gettysburg);
916 
917     ubyte[] p = new ubyte[0x100000];
918     size_t s;
919     while (s < p.length)
920     {
921         if (s + gettysburg.length > p.length)
922             break;
923         p[s .. s+gettysburg.length] = (cast(ubyte[])gettysburg)[];
924         s += gettysburg.length;
925 
926         foreach (i; 0 .. s)
927         {
928             if (s + i >= p.length)
929                 break;
930             p[s + i] = cast(ubyte)((s + i) / 10);
931         }
932         s += s;
933     }
934 
935     testArray(p);
936 }