//
// Lol Engine
//
// Copyright: (c) 2010-2013 Sam Hocevar <sam@hocevar.net>
// This program is free software; you can redistribute it and/or
// modify it under the terms of the Do What The Fuck You Want To
// Public License, Version 2, as published by Sam Hocevar. See
// http://www.wtfpl.net/ for more details.
//

#if defined HAVE_CONFIG_H
#   include "config.h"
#endif

#if defined __CELLOS_LV2__
#   if defined __SNC__
#       include <ppu_altivec_internals.h>
#   else
#       include <altivec.h>
#   endif
#endif

#include "core.h"

using namespace std;

namespace lol
{

/* These macros implement a finite iterator useful to build lookup
 * tables. For instance, S64(0) will call S1(x) for all values of x
 * between 0 and 63.
 * Due to the exponential growth of the expansion, the stress on the
 * compiler may be significant. */
#define S4(x)    S1((x)), S1((x)+1), S1((x)+2), S1((x)+3)
#define S16(x)   S4((x)), S4((x)+4), S4((x)+8), S4((x)+12)
#define S64(x)   S16((x)), S16((x)+16), S16((x)+32), S16((x)+48)
#define S256(x)  S64((x)), S64((x)+64), S64((x)+128), S64((x)+192)
#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768)
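/* For illustration: with a hypothetical trivial expansion such as
 * "#define S1(i) (i)", S4(0) expands to "(0), ((0)+1), ((0)+2), ((0)+3)",
 * i.e. the values 0, 1, 2, 3, and S256(0) enumerates 0 to 255. This is
 * how the 512- and 2048-entry tables below are filled at compile time. */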
/* Lookup table-based algorithm from “Fast Half Float Conversions”
 * by Jeroen van der Zijp, November 2008. No rounding is performed,
 * and some NaN values may be incorrectly converted to Inf (because
 * the lowest order bits in the mantissa are ignored). */
static inline uint16_t float_to_half_nobranch(uint32_t x)
{
    static uint16_t const basetable[512] =
    {
#define S1(i) (((i) < 103) ? 0x0000u : \
               ((i) < 113) ? 0x0400u >> (0x1f & (113 - (i))) : \
               ((i) < 143) ? ((i) - 112) << 10 : 0x7c00u)
        S256(0),
#undef S1
#define S1(i) (uint16_t)(0x8000u | basetable[i])
        S256(0),
#undef S1
    };

    static uint8_t const shifttable[512] =
    {
#define S1(i) (((i) < 103) ? 24 : \
               ((i) < 113) ? 126 - (i) : \
               ((i) < 143 || (i) == 255) ? 13 : 24)
        S256(0), S256(0),
#undef S1
    };

    uint16_t bits = basetable[(x >> 23) & 0x1ff];
    bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff];
    return bits;
}
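/* Worked example: 1.0f is 0x3f800000, so the table index is
 * (0x3f800000 >> 23) & 0x1ff == 127, basetable[127] == (127 - 112) << 10
 * == 0x3c00, and shifttable[127] == 13; the zero mantissa contributes
 * nothing and the result is 0x3c00, the half encoding of 1.0. The NaN
 * caveat above shows up with e.g. 0x7f800001, a NaN with only low
 * mantissa bits set: the 13-bit shift discards them and the function
 * returns 0x7c00, i.e. +Inf. */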
/* This method is faster than the OpenEXR implementation (very often
 * used, e.g. in Ogre), with the additional benefit of rounding, inspired
 * by James Tursa’s half-precision code. */
static inline uint16_t float_to_half_branch(uint32_t x)
{
    uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */
    uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
    unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */

    /* If zero, or denormal, or exponent underflows too much for a denormal
     * half, return signed zero. */
    if (e < 103)
        return bits;

    /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
    if (e > 142)
    {
        bits |= 0x7c00u;
        /* If exponent was 0xff and one mantissa bit was set, it means NaN,
         * not Inf, so make sure we set one mantissa bit too. */
        bits |= e == 255 && (x & 0x007fffffu);
        return bits;
    }

    /* If exponent underflows but not too much, return a denormal */
    if (e < 113)
    {
        m |= 0x0800u;
        /* Extra rounding may overflow and set mantissa to 0 and exponent
         * to 1, which is OK. */
        bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
        return bits;
    }

    bits |= ((e - 112) << 10) | (m >> 1);
    /* Extra rounding. An overflow will set mantissa to 0 and increment
     * the exponent, which is OK. */
    bits += m & 1;
    return bits;
}
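/* Worked rounding example: 1.00146484375f (1 + 3/2048) is 0x3f803000 and
 * lies exactly halfway between two representable halves. Here e == 127 and
 * m == 3, so bits becomes ((127 - 112) << 10) | (3 >> 1) == 0x3c01, and the
 * final "bits += m & 1" rounds up to 0x3c02 (1.001953125), whereas
 * float_to_half_nobranch() would have truncated to 0x3c01 (1.0009765625). */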
/* We use this magic table, inspired by De Bruijn sequences, to compute a
 * branchless integer log2. The actual value fetched is 24-log2(x+1) for x
 * in 1, 3, 7, f, 1f, 3f, 7f, ff, 1fe, 1ff, 3fc, 3fd, 3fe, 3ff. See
 * http://lolengine.net/blog/2012/04/03/beyond-de-bruijn for an explanation
 * of how the value 0x5a1a1a2u was obtained. */
static uint32_t const shifttable[16] =
{
    23, 22, 21, 15, 0, 20, 18, 14, 14, 16, 19, 0, 17, 0, 0, 0,
};
static uint32_t const shiftmagic = 0x5a1a1a2u;
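/* For example, with x == 0x3ff the 32-bit product 0x3ff * 0x5a1a1a2 is
 * 0x80e4e65e, whose top four bits give index 8, and shifttable[8] == 14
 * == 24 - log2(0x3ff + 1), as described above. */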
/* Lookup table-based algorithm from “Fast Half Float Conversions”
 * by Jeroen van der Zijp, November 2008. Tables are generated using
 * the C++ preprocessor, thanks to a branchless implementation also
 * used in half_to_float_branch(). This code is very fast when performing
 * conversions on arrays of values. */
static inline uint32_t half_to_float_nobranch(uint16_t x)
{
#define M3(i) ((i) | ((i) >> 1))
#define M7(i) (M3(i) | (M3(i) >> 2))
#define MF(i) (M7(i) | (M7(i) >> 4))
#define E(i) shifttable[(uint32_t)((uint64_t)MF(i) * shiftmagic) >> 28]

    static uint32_t const mantissatable[2048] =
    {
#define S1(i) (((i) == 0) ? 0 : ((125 - E(i)) << 23) + ((i) << E(i)))
        S1024(0),
#undef S1
#define S1(i) (0x38000000u + ((i) << 13))
        S1024(0),
#undef S1
    };

    static uint32_t const exponenttable[64] =
    {
#define S1(i) (((i) == 0) ? 0 : \
               ((i) < 31) ? ((uint32_t)(i) << 23) : \
               ((i) == 31) ? 0x47800000u : \
               ((i) == 32) ? 0x80000000u : \
               ((i) < 63) ? (0x80000000u | (((i) - 32) << 23)) : 0xc7800000)
        S64(0),
#undef S1
    };

    static int const offsettable[64] =
    {
#define S1(i) (((i) == 0 || (i) == 32) ? 0 : 1024)
        S64(0),
#undef S1
    };

    return mantissatable[offsettable[x >> 10] + (x & 0x3ff)]
           + exponenttable[x >> 10];
}
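/* Worked example: the half value 0x3c00 (1.0) has exponent field 15, so
 * offsettable[15] == 1024 selects the second half of mantissatable, where
 * entry 1024 is 0x38000000; adding exponenttable[15] == 15 << 23 gives
 * 0x3f800000, the single-precision encoding of 1.0f. */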
/* This algorithm is similar to the OpenEXR implementation, except it
 * uses branchless code in the denormal path. This is slower than the
 * table version, but is friendlier to the cache when used only
 * occasionally. */
static inline uint32_t half_to_float_branch(uint16_t x)
{
    uint32_t s = (x & 0x8000u) << 16;

    if ((x & 0x7fffu) == 0)
        return (uint32_t)x << 16;

    uint32_t e = x & 0x7c00u;
    uint32_t m = x & 0x03ffu;

    if (e == 0)
    {
        /* m has 10 significant bits but replicating the leading bit to
         * 8 positions instead of 16 works just as well because of our
         * handcrafted shiftmagic table. */
        uint32_t v = m | (m >> 1);
        v |= v >> 2;
        v |= v >> 4;
        e = shifttable[(v * shiftmagic) >> 28];

        /* We don't have to remove the 10th mantissa bit because it gets
         * added to our underestimated exponent. */
        return s | (((125 - e) << 23) + (m << e));
    }

    if (e == 0x7c00u)
    {
        /* The amd64 pipeline likes the if() better than a ternary operator
         * or any other trick I could find. --sam */
        if (m == 0)
            return s | 0x7f800000u;
        return s | 0x7fc00000u;
    }

    return s | (((e >> 10) + 112) << 23) | (m << 13);
}
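/* Worked example of the denormal path: the smallest subnormal half, 0x0001,
 * gives e == 0 and m == 1; the lookup yields shifttable[0] == 23, and
 * ((125 - 23) << 23) + (1 << 23) == 0x33800000, i.e. 2^-24, the correct
 * float value. The "+ (m << e)" term is what carries the leading mantissa
 * bit into the exponent field. */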
/* Constructor from float. Uses the non-branching version because benchmarks
 * indicate it is about 80% faster on amd64, and 20% faster on the PS3. The
 * penalty of loading the lookup tables does not seem significant. */
half half::makefast(float f)
{
    union { float f; uint32_t x; } u = { f };
    return makebits(float_to_half_nobranch(u.x));
}

/* Constructor from float with better precision. */
half half::makeaccurate(float f)
{
    union { float f; uint32_t x; } u = { f };
    return makebits(float_to_half_branch(u.x));
}

/* Cast to float. Uses the branching version because loading the tables
 * for only one value is going to be cache-expensive. */
half::operator float() const
{
    union { float f; uint32_t x; } u;
    u.x = half_to_float_branch(bits);
    return u.f;
}
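/* Usage sketch for the scalar conversions defined above (illustrative
 * only; the half class itself is declared in the engine headers):
 *
 *     half h = half::makefast(1.5f);     // table-based, truncating
 *     half g = half::makeaccurate(1.5f); // branching, with rounding
 *     float f = (float)h;                // 0x3e00 converts back to 1.5f
 */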
size_t half::convert(half *dst, float const *src, size_t nelem)
{
    for (size_t i = 0; i < nelem; i++)
    {
        union { float f; uint32_t x; } u;
        u.f = *src++;
        *dst++ = makebits(float_to_half_nobranch(u.x));
    }

    return nelem;
}

size_t half::convert(float *dst, half const *src, size_t nelem)
{
    for (size_t i = 0; i < nelem; i++)
    {
        union { float f; uint32_t x; } u;
#if !defined __CELLOS_LV2__
        /* This code is really too slow on the PS3, even with the denormal
         * handling stripped off. */
        u.x = half_to_float_nobranch((*src++).bits);
#else
        u.x = half_to_float_branch((*src++).bits);
#endif
        *dst++ = u.f;
    }

    return nelem;
}
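/* Bulk conversion sketch (hypothetical buffers, for illustration):
 *
 *     float src[256] = { 0.0f };
 *     half dst[256];
 *     half::convert(dst, src, 256); // table-based float-to-half path
 */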
} /* namespace lol */