Nie możesz wybrać więcej, niż 25 tematów Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.
 
 
 

252 wiersze
7.5 KiB

  1. //
  2. // Lol Engine
  3. //
  4. // Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
  5. // This program is free software; you can redistribute it and/or
  6. // modify it under the terms of the Do What The Fuck You Want To
  7. // Public License, Version 2, as published by Sam Hocevar. See
  8. // http://sam.zoy.org/projects/COPYING.WTFPL for more details.
  9. //
  10. #if defined HAVE_CONFIG_H
  11. # include "config.h"
  12. #endif
  13. #include "core.h"
  14. using namespace std;
  15. namespace lol
  16. {
  /* Finite "iterator" macros used to generate lookup tables with the
   * preprocessor. SN(base) expands to a comma-separated list of N
   * invocations of the S1() macro, which the user must define beforehand,
   * with arguments base, base+1, ..., base+N-1. For instance, S64(0)
   * invokes S1() for every value between 0 and 63.
   * Each level expands the previous one four times, so the generated text
   * grows exponentially and may put significant stress on the compiler. */
  #define S4(base) \
      S1((base)), S1((base)+1), S1((base)+2), S1((base)+3)
  #define S16(base) \
      S4((base)), S4((base)+4), S4((base)+8), S4((base)+12)
  #define S64(base) \
      S16((base)), S16((base)+16), S16((base)+32), S16((base)+48)
  #define S256(base) \
      S64((base)), S64((base)+64), S64((base)+128), S64((base)+192)
  #define S1024(base) \
      S256((base)), S256((base)+256), S256((base)+512), S256((base)+768)
  27. /* Lookup table-based algorithm from “Fast Half Float Conversions”
  28. * by Jeroen van der Zijp, November 2008. No rounding is performed,
  29. * and some NaN values may be incorrectly converted to Inf. */
  30. static inline uint16_t float_to_half_nobranch(uint32_t x)
  31. {
  32. static uint16_t const basetable[512] =
  33. {
  34. #define S1(i) (((i) < 103) ? 0x0000: \
  35. ((i) < 113) ? 0x0400 >> (113 - (i)) : \
  36. ((i) < 143) ? ((i) - 112) << 10 : 0x7c00)
  37. S256(0),
  38. #undef S1
  39. #define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \
  40. ((i) < 113) ? 0x0400 >> (113 - (i)): \
  41. ((i) < 143) ? ((i) - 112) << 10 : 0x7c00))
  42. S256(0),
  43. #undef S1
  44. };
  45. static uint8_t const shifttable[512] =
  46. {
  47. #define S1(i) (((i) < 103) ? 24 : \
  48. ((i) < 113) ? 126 - (i) : \
  49. ((i) < 143 || (i) == 255) ? 13 : 24)
  50. S256(0), S256(0),
  51. #undef S1
  52. };
  53. uint16_t bits = basetable[(x >> 23) & 0x1ff];
  54. bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff];
  55. return bits;
  56. }
  /* Branching float-to-half conversion working on raw bit patterns: x is
   * the bit pattern of a 32-bit float, the result is the bit pattern of a
   * 16-bit half.
   * This method is faster than the OpenEXR implementation (very often used,
   * eg. in Ogre), with the additional benefit of rounding: one extra
   * mantissa bit is kept and added back to the result. Inspired by James
   * Tursa's half-precision code. */
  static inline uint16_t float_to_half_branch(uint32_t x)
  {
      uint16_t const sign = (x >> 16) & 0x8000;
      /* 10 mantissa bits plus one extra low bit used for rounding */
      uint16_t mantissa = (x >> 12) & 0x07ff;
      /* Using int is faster here */
      unsigned int const exponent = (x >> 23) & 0xff;

      /* Zero, float denormal, or a value too small even for a half
       * denormal: the result is a signed zero. */
      if (exponent < 103)
          return sign;

      /* Exponent overflow, Inf or NaN. */
      if (exponent > 142)
      {
          uint16_t special = sign;
          special |= 0x7c00u;
          /* An exponent of 0xff with any mantissa bit set means NaN, not
           * Inf, so make sure one half mantissa bit is set too. */
          special |= exponent == 255 && (x & 0x007fffffu);
          return special;
      }

      /* Small exponent that still fits a half denormal. */
      if (exponent < 113)
      {
          /* Make the implicit leading one explicit before shifting */
          mantissa |= 0x0800u;
          unsigned int const truncated = mantissa >> (114 - exponent);
          unsigned int const round_bit = (mantissa >> (113 - exponent)) & 1;
          /* Rounding may overflow, clearing the mantissa and setting the
           * exponent to 1, which is the correct result. */
          return sign | (truncated + round_bit);
      }

      /* Normal value: rebias the exponent, drop the rounding bit... */
      uint16_t normal = sign;
      normal |= ((exponent - 112) << 10) | (mantissa >> 1);
      /* ... and add it back. An overflow clears the mantissa and increments
       * the exponent, which is the correct result. */
      normal += mantissa & 1;
      return normal;
  }
  /* Lookup data used to normalise half denormals. Take a non-zero 10-bit
   * mantissa with every bit below its highest set bit also set, multiply it
   * by shiftmagic and keep the 5 topmost bits of the 32-bit product: the
   * result indexes shifttable, which holds the left shift that moves the
   * highest set bit to bit 23 (23 for bit 0, down to 14 for bit 9). Only
   * ten slots are reachable that way; the others are zero. */
  static int const shifttable[32] =
  {
      23, 14, 22,  0,  0,  0, 21,  0,
       0,  0,  0,  0,  0,  0, 20,  0,
      15,  0,  0,  0,  0,  0,  0, 16,
       0,  0,  0, 17,  0, 18, 19,  0,
  };

  static uint32_t const shiftmagic = 0x07c4acddu;
  99. /* Lookup table-based algorithm from “Fast Half Float Conversions”
  100. * by Jeroen van der Zijp, November 2008. Tables are generated using
  101. * the C++ preprocessor, thanks to a branchless implementation also
  102. * used in half_to_float_branch(). This code is very fast when performing
  103. * conversions on arrays of values. */
  104. static inline uint32_t half_to_float_nobranch(uint16_t x)
  105. {
  106. #define M3(i) ((i) | ((i) >> 1))
  107. #define M7(i) (M3(i) | (M3(i) >> 2))
  108. #define MF(i) (M7(i) | (M7(i) >> 4))
  109. #define MFF(i) (MF(i) | (MF(i) >> 8))
  110. #define E(i) shifttable[(unsigned int)(MFF(i) * shiftmagic) >> 27]
  111. static uint32_t const mantissatable[2048] =
  112. {
  113. #define S1(i) (((i) == 0) ? 0 : ((125 - E(i)) << 23) + ((i) << E(i)))
  114. S1024(0),
  115. #undef S1
  116. #define S1(i) (0x38000000u + ((i) << 13))
  117. S1024(0),
  118. #undef S1
  119. };
  120. static uint32_t const exponenttable[64] =
  121. {
  122. #define S1(i) (((i) == 0) ? 0 : \
  123. ((i) < 31) ? ((i) << 23) : \
  124. ((i) == 31) ? 0x47800000u : \
  125. ((i) == 32) ? 0x80000000u : \
  126. ((i) < 63) ? (0x80000000u + (((i) - 32) << 23)) : 0xc7800000)
  127. S64(0),
  128. #undef S1
  129. };
  130. static int const offsettable[64] =
  131. {
  132. #define S1(i) (((i) == 0 || (i) == 32) ? 0 : 1024)
  133. S64(0),
  134. #undef S1
  135. };
  136. return mantissatable[offsettable[x >> 10] + (x & 0x3ff)]
  137. + exponenttable[x >> 10];
  138. }
  139. /* This algorithm is similar to the OpenEXR implementation, except it
  140. * uses branchless code in the denormal path. This is slower than the
  141. * table version, but will be more friendly to the cache for occasional
  142. * uses. */
  143. static inline uint32_t half_to_float_branch(uint16_t x)
  144. {
  145. uint32_t s = (x & 0x8000u) << 16;
  146. if ((x & 0x7fffu) == 0)
  147. return (uint32_t)x << 16;
  148. uint32_t e = x & 0x7c00u;
  149. uint32_t m = x & 0x03ffu;
  150. if (e == 0)
  151. {
  152. uint32_t v = m | (m >> 1);
  153. v |= v >> 2;
  154. v |= v >> 4;
  155. v |= v >> 8;
  156. e = shifttable[(v * shiftmagic) >> 27];
  157. /* We don't have to remove the 10th mantissa bit because it gets
  158. * added to our underestimated exponent. */
  159. return s | (((125 - e) << 23) + (m << e));
  160. }
  161. if (e == 0x7c00u)
  162. {
  163. /* The amd64 pipeline likes the if() better than a ternary operator
  164. * or any other trick I could find. --sam */
  165. if (m == 0)
  166. return s | 0x7f800000u;
  167. return s | 0x7fc00000u;
  168. }
  169. return s | (((e >> 10) + 112) << 23) | (m << 13);
  170. }
  171. /* Constructor from float. Uses the non-branching version because benchmarks
  172. * indicate it is always twice as fast. The penalty of loading the lookup
  173. * tables does not seem important. */
  174. half half::makefast(float f)
  175. {
  176. union { float f; uint32_t x; } u = { f };
  177. return makebits(float_to_half_nobranch(u.x));
  178. }
  179. /* Constructor from float with better precision. */
  180. half half::makeslow(float f)
  181. {
  182. union { float f; uint32_t x; } u = { f };
  183. return makebits(float_to_half_branch(u.x));
  184. }
  185. /* Cast to float. Uses the branching version because loading the tables
  186. * for only one value is going to be cache-expensive. */
  187. half::operator float() const
  188. {
  189. /* FIXME: there is a hidden "this" in this method. Export more
  190. * code so that it can all work in registers instead. */
  191. union { float f; uint32_t x; } u;
  192. u.x = half_to_float_branch(bits);
  193. return u.f;
  194. }
  195. size_t half::convert(half *dst, float const *src, size_t nelem)
  196. {
  197. for (size_t i = 0; i < nelem; i++)
  198. {
  199. union { float f; uint32_t x; } u;
  200. u.f = *src++;
  201. *dst++ = makebits(float_to_half_nobranch(u.x));
  202. }
  203. return nelem;
  204. }
  205. size_t half::convert(float *dst, half const *src, size_t nelem)
  206. {
  207. for (size_t i = 0; i < nelem; i++)
  208. {
  209. union { float f; uint32_t x; } u;
  210. u.x = half_to_float_nobranch((*src++).bits);
  211. *dst++ = u.f;
  212. }
  213. return nelem;
  214. }
  215. } /* namespace lol */