1 
2 /**
3  * Implement IEEE 754 half-precision binary floating point format binary16.
4  *
5  * This a 16 bit type, and consists of a sign bit, a 5 bit exponent, and a
6  * 10 bit significand.
7  * All operations on HalfFloat are CTFE'able.
8  *
9  * References:
10  *      $(WEB en.wikipedia.org/wiki/Half-precision_floating-point_format, Wikipedia)
11  * Copyright: Copyright Digital Mars 2012-2014
12  * License:   $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0)
13  * Authors:   $(WEB digitalmars.com, Walter Bright)
14  * Source:    $(SARGONSRC src/sargon/_halffloat.d)
15  * Macros:
16  *   WIKI=Phobos/StdHalffloat
17  */
18 
19 module sargon.halffloat;
20 
21 /**
22  * The half precision floating point type.
23  *
24  * The only operations are:
25  * $(UL
26  * $(LI explicit conversion of float to HalfFloat)
27  * $(LI implicit conversion of HalfFloat to float)
28  * )
29  * It operates in an analogous manner to shorts, which are converted to ints
30  * before performing any operations, and explicitly cast back to shorts.
31  * The half float is considered essentially a storage type, not a computation type.
32  * Example:
33  * ---
34     HalfFloat h = hf!27.2f;
35     HalfFloat j = cast(HalfFloat)( hf!3.5f + hf!5 );
36     HalfFloat f = HalfFloat(0.0f);
37  * ---
38  * Bugs:
39  *      The only rounding mode currently supported is Round To Nearest.
40  *      The exceptions OVERFLOW, UNDERFLOW and INEXACT are not thrown.
41  */
42 
43 struct HalfFloat {
44 
45     /* Provide implicit conversion of HalfFloat to float
46      */
47 
48     @property float toFloat() { return shortToFloat(s); }
49     alias toFloat this;
50 
51     /* Done as a template in order to prevent implicit conversion
52      * of argument to float.
53      */
54 
55     this(T : float)(T f)
56     {
57         static assert(is(T == float));
58         s = floatToShort(f);
59     }
60 
61     /* These are done as properties to avoid
62      * circular reference problems.
63      */
64 
65     ///
66     static @property HalfFloat min_normal() { HalfFloat hf = void; hf.s = 0x0400; return hf; }
67     unittest { assert(min_normal == hf!0x1p-14); }
68 
69     ///
70     static @property HalfFloat max()        { HalfFloat hf = void; hf.s = 0x7BFF; return hf; }
71     unittest { assert(max == hf!0x1.FFCp+15); }
72 
73     ///
74     static @property HalfFloat nan()        { HalfFloat hf = void; hf.s = EXPMASK | 1; return hf; }
75     unittest { assert(nan != hf!(float.nan)); }
76 
77     ///
78     static @property HalfFloat infinity()   { HalfFloat hf = void; hf.s = EXPMASK; return hf; }
79     unittest { assert(infinity == hf!(float.infinity)); }
80 
81     ///
82     static @property HalfFloat epsilon()    { HalfFloat hf = void; hf.s = 0x1400; return hf; }
83     unittest { assert(epsilon == hf!0x1p-10); }
84 
85     enum dig =        3;        ///
86     enum mant_dig =   11;       ///
87     enum max_10_exp = 5;        ///
88     enum max_exp =    16;       ///
89     enum min_10_exp = -5;       ///
90     enum min_exp =    -14;      ///
91 
92   private:
93     ushort s = EXPMASK | 1;     // .init is HalfFloat.nan
94 }
95 
96 /********************
97  * User defined literal for Half Float.
98  * Example:
99  * ---
100  * auto h = hf!1.3f;
101  * ---
102  */
103 
104 template hf(float v)
105 {
106     enum hf = HalfFloat(v);
107 }
108 
109 private:
110 
111 // Half float values
112 enum SIGNMASK  = 0x8000;
113 enum EXPMASK   = 0x7C00;
114 enum MANTMASK  = 0x03FF;
115 enum HIDDENBIT = 0x0400;
116 
117 // float values
118 enum FSIGNMASK  = 0x80000000;
119 enum FEXPMASK   = 0x7F800000;
120 enum FMANTMASK  = 0x007FFFFF;
121 enum FHIDDENBIT = 0x00800000;
122 
123 // Rounding mode
124 enum ROUND { TONEAREST, UPWARD, DOWNWARD, TOZERO };
125 enum ROUNDMODE = ROUND.TONEAREST;
126 
127 ushort floatToShort(float f)
128 {
129     /* If the target CPU has a conversion instruction, this code could be
130      * replaced with inline asm or a compiler intrinsic, but leave this
131      * as the CTFE path so CTFE can work on it.
132      */
133 
134     /* The code currently does not set INEXACT, UNDERFLOW, or OVERFLOW,
135      * but is marked where those would go.
136      */
137 
138     uint s = *cast(uint*)&f;
139 
140     ushort u = (s & FSIGNMASK) ? SIGNMASK : 0;
141     int exp = s & FEXPMASK;
142     if (exp == FEXPMASK)  // if nan or infinity
143     {
144         if ((s & FMANTMASK) == 0)       // if infinity
145         {
146             u |= EXPMASK;
147         }
148         else                            // else nan
149         {
150             u |= EXPMASK | 1;
151         }
152         return u;
153     }
154 
155     uint significand = s & FMANTMASK;
156 
157     if (exp == 0)                       // if subnormal or zero
158     {
159         if (significand == 0)           // if zero
160             return u;
161 
162         /* A subnormal float is going to give us a zero result anyway,
163          * so just set UNDERFLOW and INEXACT and return +-0.
164          */
165         return u;
166     }
167     else                                // else normal
168     {
169         // normalize exponent and remove bias
170         exp = (exp >> 23) - 127;
171         significand |= FHIDDENBIT;
172     }
173 
174     exp += 15;                          // bias the exponent
175 
176     bool guard = false;                 // guard bit
177     bool sticky = false;                // sticky bit
178 
179     uint shift = 13;                    // lop off rightmost 13 bits
180     if (exp <= 0)                       // if subnormal
181     {   shift += -exp + 1;              // more bits to lop off
182         exp = 0;
183     }
184     if (shift > 23)
185     {
186         // Set UNDERFLOW, INEXACT, return +-0
187         return u;
188     }
189 
190 //printf("exp = x%x significand = x%x\n", exp, significand);
191 
192     // Lop off rightmost 13 bits, but save guard and sticky bits
193     guard = (significand & (1 << (shift - 1))) != 0;
194     sticky = (significand & ((1 << (shift - 1)) - 1)) != 0;
195     significand >>= shift;
196 
197 //printf("guard = %d, sticky = %d\n", guard, sticky);
198 //printf("significand = x%x\n", significand);
199 
200     if (guard || sticky)
201     {
202         // Lost some bits, so set INEXACT and round the result
203         switch (ROUNDMODE)
204         {
205             case ROUND.TONEAREST:
206                 if (guard && (sticky || (significand & 1)))
207                     ++significand;
208                 break;
209 
210             case ROUND.UPWARD:
211                 if (!(s & FSIGNMASK))
212                     ++significand;
213                 break;
214 
215             case ROUND.DOWNWARD:
216                 if (s & FSIGNMASK)
217                     ++significand;
218                 break;
219 
220             case ROUND.TOZERO:
221                 break;
222 
223             default:
224                 assert(0);
225         }
226         if (exp == 0)                           // if subnormal
227         {
228             if (significand & HIDDENBIT)        // and not a subnormal no more
229                 ++exp;
230         }
231         else if (significand & (HIDDENBIT << 1))
232         {
233             significand >>= 1;
234             ++exp;
235         }
236     }
237 
238     if (exp > 30)
239     {   // Set OVERFLOW and INEXACT, return +-infinity
240         return u | EXPMASK;
241     }
242 
243     /* Add exponent and significand into result.
244      */
245 
246     u |= exp << 10;                             // exponent
247     u |= (significand & ~HIDDENBIT);            // significand
248 
249     return u;
250 }
251 
252 unittest
253 {
254     static struct S { ushort u; float f; }
255 
256     static S[] tests =
257     [
258         { 0x3C00,  1.0f },
259         { 0x3C01,  1.0009765625f },
260         { 0xC000, -2.0f },
261         { 0x7BFF,  65504.0f },
262         { 0x0400,  6.10352e-5f },
263         { 0x03FF,  6.09756e-5f },
264         { 0x0001,  5.9604644775e-8f },
265         { 0x0000,  0.0f },
266         { 0x8000, -0.0f },
267         { 0x7C00,  float.infinity },
268         { 0xFC00, -float.infinity },
269         { 0x3555,  0.333252f },
270         { 0x7C01,  float.nan },
271         { 0xFC01, -float.nan },
272         { 0x0000,  1.0e-8f },
273         { 0x8000, -1.0e-8f },
274         { 0x7C00,  1.0e31f },
275         { 0xFC00, -1.0e31f },
276         { 0x0000,  1.0e-37f / 10.0f },  // subnormal float
277         { 0x8000, -1.0e-37f / 10.0f },
278         { 0x6800,  0x1002p-1 }, // guard
279         { 0x6801,  0x1003p-1 }, // guard && sticky
280         { 0x6802,  0x1006p-1 }, // guard && (significand & 1)
281         { 0x6802,  0x1007p-1 }, // guard && sticky && (significand & 1)
282         { 0x0400,  0x1FFFp-27 }, // round up subnormal to normal
283         { 0x0800,  0x3FFFp-27 }, // lose bit, add one to exp
284         //{ , },
285     ];
286 
287     foreach (i, s; tests)
288     {
289         ushort u = floatToShort(s.f);
290         if (u != s.u)
291         {
292             printf("[%d] %g %04x expected %04x\n", i, s.f, u, s.u);
293             assert(0);
294         }
295     }
296 }
297 
298 float shortToFloat(ushort s)
299 {
300     /* If the target CPU has a conversion instruction, this code could be
301      * replaced with inline asm or a compiler intrinsic, but leave this
302      * as the CTFE path so CTFE can work on it.
303      */
304     /* This one is fairly easy because there are no possible errors
305      * and no necessary rounding.
306      */
307 
308     int exp = s & EXPMASK;
309     if (exp == EXPMASK)  // if nan or infinity
310     {
311         float f;
312         if ((s & MANTMASK) == 0)        // if infinity
313         {
314             f = float.infinity;
315         }
316         else                            // else nan
317         {
318             f = float.nan;
319         }
320         return (s & SIGNMASK) ? -f : f;
321     }
322 
323     uint significand = s & MANTMASK;
324 
325     if (exp == 0)                       // if subnormal or zero
326     {
327         if (significand == 0)           // if zero
328             return (s & SIGNMASK) ? -0.0f : 0.0f;
329 
330         // Normalize by shifting until the hidden bit is 1
331         while (!(significand & HIDDENBIT))
332         {
333             significand <<= 1;
334             --exp;
335         }
336         significand &= ~HIDDENBIT;      // hidden bit is, well, hidden
337         exp -= 14;
338     }
339     else                                // else normal
340     {
341         // normalize exponent and remove bias
342         exp = (exp >> 10) - 15;
343     }
344 
345     /* Assemble sign, exponent, and significand into float.
346      * Don't have to deal with overflow, inexact, or subnormal
347      * because the range of floats is big enough.
348      */
349 
350     assert(-126 <= exp && exp <= 127);  // just to be sure
351 
352     //printf("exp = %d, significand = x%x\n", exp, significand);
353 
354     uint u = (s & SIGNMASK) << 16;      // sign bit
355     u |= (exp + 127) << 23;             // bias the exponent and shift into position
356     u |= significand << (23 - 10);
357 
358     return *cast(float*)&u;
359 }
360 
361 unittest
362 {
363     static struct S { ushort u; float f; }
364 
365     static S[] tests =
366     [
367         { 0x3C00,  1.0f },
368         { 0xC000, -2.0f },
369         { 0x7BFF,  65504f },
370         { 0x0000,  0.0f },
371         { 0x8000, -0.0f },
372         { 0x7C00,  float.infinity},
373         { 0xFC00,  -float.infinity},
374         //{ , },
375     ];
376 
377     foreach (i, s; tests)
378     {
379         float f = shortToFloat(s.u);
380         if (f != s.f)
381         {
382             printf("[%d] %04x %g expected %g\n", i, s.u, f, s.f);
383             assert(0);
384         }
385     }
386 }
387 
388 
389 version (unittest) import std.stdio;
390 
391 unittest
392 {
393     HalfFloat h = hf!27.2;
394     HalfFloat j = cast(HalfFloat)( hf!3.5 + hf!5 );
395     HalfFloat f = HalfFloat(0.0f);
396 
397     float k = j + h;
398 
399     f.s = 0x1400;
400     writeln("1.0009765625 ", 1.0f + f);
401     assert(f == HalfFloat.epsilon);
402 
403     f.s = 0x0400;
404     writeln("6.10352e-5 ", cast(float)f);
405     assert(f == HalfFloat.min_normal);
406 
407     f.s = 0x03FF;
408     writeln("6.09756e-5 ", cast(float)f);
409 
410     f.s = 1;
411     writefln("5.96046e-8 %.10e", cast(float)f);
412 
413     f.s = 0;
414     writeln("0 ", cast(float)f);
415     assert(f == 0.0f);
416 
417     f.s = 0x8000;
418     writeln("-0 ", cast(float)f);
419     assert(f == -0.0f);
420 
421     f.s = 0x3555;
422     writeln("0.33325 ", cast(float)f);
423 
424     f = HalfFloat.nan();
425     assert(f.s == 0x7C01);
426     float fl = f;
427     writefln("%x", *cast(uint*)&fl);
428     assert(*cast(uint*)&fl == 0x7FC0_0000);
429 }
430