1 2 /** 3 * Implement IEEE 754 half-precision binary floating point format binary16. 4 * 5 * This a 16 bit type, and consists of a sign bit, a 5 bit exponent, and a 6 * 10 bit significand. 7 * All operations on HalfFloat are CTFE'able. 8 * 9 * References: 10 * $(WEB en.wikipedia.org/wiki/Half-precision_floating-point_format, Wikipedia) 11 * Copyright: Copyright Digital Mars 2012-2014 12 * License: $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0) 13 * Authors: $(WEB digitalmars.com, Walter Bright) 14 * Source: $(SARGONSRC src/sargon/_halffloat.d) 15 * Macros: 16 * WIKI=Phobos/StdHalffloat 17 */ 18 19 module sargon.halffloat; 20 21 /** 22 * The half precision floating point type. 23 * 24 * The only operations are: 25 * $(UL 26 * $(LI explicit conversion of float to HalfFloat) 27 * $(LI implicit conversion of HalfFloat to float) 28 * ) 29 * It operates in an analogous manner to shorts, which are converted to ints 30 * before performing any operations, and explicitly cast back to shorts. 31 * The half float is considered essentially a storage type, not a computation type. 32 * Example: 33 * --- 34 HalfFloat h = hf!27.2f; 35 HalfFloat j = cast(HalfFloat)( hf!3.5f + hf!5 ); 36 HalfFloat f = HalfFloat(0.0f); 37 * --- 38 * Bugs: 39 * The only rounding mode currently supported is Round To Nearest. 40 * The exceptions OVERFLOW, UNDERFLOW and INEXACT are not thrown. 41 */ 42 43 struct HalfFloat { 44 45 /* Provide implicit conversion of HalfFloat to float 46 */ 47 48 @property float toFloat() { return shortToFloat(s); } 49 alias toFloat this; 50 51 /* Done as a template in order to prevent implicit conversion 52 * of argument to float. 53 */ 54 55 this(T : float)(T f) 56 { 57 static assert(is(T == float)); 58 s = floatToShort(f); 59 } 60 61 /* These are done as properties to avoid 62 * circular reference problems. 63 */ 64 65 /// 66 static @property HalfFloat min_normal() { HalfFloat hf = void; hf.s = 0x0400; return hf; } 67 unittest { assert(min_normal == hf!0x1p-14); } 68 69 /// 70 static @property HalfFloat max() { HalfFloat hf = void; hf.s = 0x7BFF; return hf; } 71 unittest { assert(max == hf!0x1.FFCp+15); } 72 73 /// 74 static @property HalfFloat nan() { HalfFloat hf = void; hf.s = EXPMASK | 1; return hf; } 75 unittest { assert(nan != hf!(float.nan)); } 76 77 /// 78 static @property HalfFloat infinity() { HalfFloat hf = void; hf.s = EXPMASK; return hf; } 79 unittest { assert(infinity == hf!(float.infinity)); } 80 81 /// 82 static @property HalfFloat epsilon() { HalfFloat hf = void; hf.s = 0x1400; return hf; } 83 unittest { assert(epsilon == hf!0x1p-10); } 84 85 enum dig = 3; /// 86 enum mant_dig = 11; /// 87 enum max_10_exp = 5; /// 88 enum max_exp = 16; /// 89 enum min_10_exp = -5; /// 90 enum min_exp = -14; /// 91 92 private: 93 ushort s = EXPMASK | 1; // .init is HalfFloat.nan 94 } 95 96 /******************** 97 * User defined literal for Half Float. 98 * Example: 99 * --- 100 * auto h = hf!1.3f; 101 * --- 102 */ 103 104 template hf(float v) 105 { 106 enum hf = HalfFloat(v); 107 } 108 109 private: 110 111 // Half float values 112 enum SIGNMASK = 0x8000; 113 enum EXPMASK = 0x7C00; 114 enum MANTMASK = 0x03FF; 115 enum HIDDENBIT = 0x0400; 116 117 // float values 118 enum FSIGNMASK = 0x80000000; 119 enum FEXPMASK = 0x7F800000; 120 enum FMANTMASK = 0x007FFFFF; 121 enum FHIDDENBIT = 0x00800000; 122 123 // Rounding mode 124 enum ROUND { TONEAREST, UPWARD, DOWNWARD, TOZERO }; 125 enum ROUNDMODE = ROUND.TONEAREST; 126 127 ushort floatToShort(float f) 128 { 129 /* If the target CPU has a conversion instruction, this code could be 130 * replaced with inline asm or a compiler intrinsic, but leave this 131 * as the CTFE path so CTFE can work on it. 132 */ 133 134 /* The code currently does not set INEXACT, UNDERFLOW, or OVERFLOW, 135 * but is marked where those would go. 136 */ 137 138 uint s = *cast(uint*)&f; 139 140 ushort u = (s & FSIGNMASK) ? SIGNMASK : 0; 141 int exp = s & FEXPMASK; 142 if (exp == FEXPMASK) // if nan or infinity 143 { 144 if ((s & FMANTMASK) == 0) // if infinity 145 { 146 u |= EXPMASK; 147 } 148 else // else nan 149 { 150 u |= EXPMASK | 1; 151 } 152 return u; 153 } 154 155 uint significand = s & FMANTMASK; 156 157 if (exp == 0) // if subnormal or zero 158 { 159 if (significand == 0) // if zero 160 return u; 161 162 /* A subnormal float is going to give us a zero result anyway, 163 * so just set UNDERFLOW and INEXACT and return +-0. 164 */ 165 return u; 166 } 167 else // else normal 168 { 169 // normalize exponent and remove bias 170 exp = (exp >> 23) - 127; 171 significand |= FHIDDENBIT; 172 } 173 174 exp += 15; // bias the exponent 175 176 bool guard = false; // guard bit 177 bool sticky = false; // sticky bit 178 179 uint shift = 13; // lop off rightmost 13 bits 180 if (exp <= 0) // if subnormal 181 { shift += -exp + 1; // more bits to lop off 182 exp = 0; 183 } 184 if (shift > 23) 185 { 186 // Set UNDERFLOW, INEXACT, return +-0 187 return u; 188 } 189 190 //printf("exp = x%x significand = x%x\n", exp, significand); 191 192 // Lop off rightmost 13 bits, but save guard and sticky bits 193 guard = (significand & (1 << (shift - 1))) != 0; 194 sticky = (significand & ((1 << (shift - 1)) - 1)) != 0; 195 significand >>= shift; 196 197 //printf("guard = %d, sticky = %d\n", guard, sticky); 198 //printf("significand = x%x\n", significand); 199 200 if (guard || sticky) 201 { 202 // Lost some bits, so set INEXACT and round the result 203 switch (ROUNDMODE) 204 { 205 case ROUND.TONEAREST: 206 if (guard && (sticky || (significand & 1))) 207 ++significand; 208 break; 209 210 case ROUND.UPWARD: 211 if (!(s & FSIGNMASK)) 212 ++significand; 213 break; 214 215 case ROUND.DOWNWARD: 216 if (s & FSIGNMASK) 217 ++significand; 218 break; 219 220 case ROUND.TOZERO: 221 break; 222 223 default: 224 assert(0); 225 } 226 if (exp == 0) // if subnormal 227 { 228 if (significand & HIDDENBIT) // and not a subnormal no more 229 ++exp; 230 } 231 else if (significand & (HIDDENBIT << 1)) 232 { 233 significand >>= 1; 234 ++exp; 235 } 236 } 237 238 if (exp > 30) 239 { // Set OVERFLOW and INEXACT, return +-infinity 240 return u | EXPMASK; 241 } 242 243 /* Add exponent and significand into result. 244 */ 245 246 u |= exp << 10; // exponent 247 u |= (significand & ~HIDDENBIT); // significand 248 249 return u; 250 } 251 252 unittest 253 { 254 static struct S { ushort u; float f; } 255 256 static S[] tests = 257 [ 258 { 0x3C00, 1.0f }, 259 { 0x3C01, 1.0009765625f }, 260 { 0xC000, -2.0f }, 261 { 0x7BFF, 65504.0f }, 262 { 0x0400, 6.10352e-5f }, 263 { 0x03FF, 6.09756e-5f }, 264 { 0x0001, 5.9604644775e-8f }, 265 { 0x0000, 0.0f }, 266 { 0x8000, -0.0f }, 267 { 0x7C00, float.infinity }, 268 { 0xFC00, -float.infinity }, 269 { 0x3555, 0.333252f }, 270 { 0x7C01, float.nan }, 271 { 0xFC01, -float.nan }, 272 { 0x0000, 1.0e-8f }, 273 { 0x8000, -1.0e-8f }, 274 { 0x7C00, 1.0e31f }, 275 { 0xFC00, -1.0e31f }, 276 { 0x0000, 1.0e-37f / 10.0f }, // subnormal float 277 { 0x8000, -1.0e-37f / 10.0f }, 278 { 0x6800, 0x1002p-1 }, // guard 279 { 0x6801, 0x1003p-1 }, // guard && sticky 280 { 0x6802, 0x1006p-1 }, // guard && (significand & 1) 281 { 0x6802, 0x1007p-1 }, // guard && sticky && (significand & 1) 282 { 0x0400, 0x1FFFp-27 }, // round up subnormal to normal 283 { 0x0800, 0x3FFFp-27 }, // lose bit, add one to exp 284 //{ , }, 285 ]; 286 287 foreach (i, s; tests) 288 { 289 ushort u = floatToShort(s.f); 290 if (u != s.u) 291 { 292 printf("[%d] %g %04x expected %04x\n", i, s.f, u, s.u); 293 assert(0); 294 } 295 } 296 } 297 298 float shortToFloat(ushort s) 299 { 300 /* If the target CPU has a conversion instruction, this code could be 301 * replaced with inline asm or a compiler intrinsic, but leave this 302 * as the CTFE path so CTFE can work on it. 303 */ 304 /* This one is fairly easy because there are no possible errors 305 * and no necessary rounding. 306 */ 307 308 int exp = s & EXPMASK; 309 if (exp == EXPMASK) // if nan or infinity 310 { 311 float f; 312 if ((s & MANTMASK) == 0) // if infinity 313 { 314 f = float.infinity; 315 } 316 else // else nan 317 { 318 f = float.nan; 319 } 320 return (s & SIGNMASK) ? -f : f; 321 } 322 323 uint significand = s & MANTMASK; 324 325 if (exp == 0) // if subnormal or zero 326 { 327 if (significand == 0) // if zero 328 return (s & SIGNMASK) ? -0.0f : 0.0f; 329 330 // Normalize by shifting until the hidden bit is 1 331 while (!(significand & HIDDENBIT)) 332 { 333 significand <<= 1; 334 --exp; 335 } 336 significand &= ~HIDDENBIT; // hidden bit is, well, hidden 337 exp -= 14; 338 } 339 else // else normal 340 { 341 // normalize exponent and remove bias 342 exp = (exp >> 10) - 15; 343 } 344 345 /* Assemble sign, exponent, and significand into float. 346 * Don't have to deal with overflow, inexact, or subnormal 347 * because the range of floats is big enough. 348 */ 349 350 assert(-126 <= exp && exp <= 127); // just to be sure 351 352 //printf("exp = %d, significand = x%x\n", exp, significand); 353 354 uint u = (s & SIGNMASK) << 16; // sign bit 355 u |= (exp + 127) << 23; // bias the exponent and shift into position 356 u |= significand << (23 - 10); 357 358 return *cast(float*)&u; 359 } 360 361 unittest 362 { 363 static struct S { ushort u; float f; } 364 365 static S[] tests = 366 [ 367 { 0x3C00, 1.0f }, 368 { 0xC000, -2.0f }, 369 { 0x7BFF, 65504f }, 370 { 0x0000, 0.0f }, 371 { 0x8000, -0.0f }, 372 { 0x7C00, float.infinity}, 373 { 0xFC00, -float.infinity}, 374 //{ , }, 375 ]; 376 377 foreach (i, s; tests) 378 { 379 float f = shortToFloat(s.u); 380 if (f != s.f) 381 { 382 printf("[%d] %04x %g expected %g\n", i, s.u, f, s.f); 383 assert(0); 384 } 385 } 386 } 387 388 389 version (unittest) import std.stdio; 390 391 unittest 392 { 393 HalfFloat h = hf!27.2; 394 HalfFloat j = cast(HalfFloat)( hf!3.5 + hf!5 ); 395 HalfFloat f = HalfFloat(0.0f); 396 397 float k = j + h; 398 399 f.s = 0x1400; 400 writeln("1.0009765625 ", 1.0f + f); 401 assert(f == HalfFloat.epsilon); 402 403 f.s = 0x0400; 404 writeln("6.10352e-5 ", cast(float)f); 405 assert(f == HalfFloat.min_normal); 406 407 f.s = 0x03FF; 408 writeln("6.09756e-5 ", cast(float)f); 409 410 f.s = 1; 411 writefln("5.96046e-8 %.10e", cast(float)f); 412 413 f.s = 0; 414 writeln("0 ", cast(float)f); 415 assert(f == 0.0f); 416 417 f.s = 0x8000; 418 writeln("-0 ", cast(float)f); 419 assert(f == -0.0f); 420 421 f.s = 0x3555; 422 writeln("0.33325 ", cast(float)f); 423 424 f = HalfFloat.nan(); 425 assert(f.s == 0x7C01); 426 float fl = f; 427 writefln("%x", *cast(uint*)&fl); 428 assert(*cast(uint*)&fl == 0x7FC0_0000); 429 } 430