//
// Lol Engine
//
// Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
// This program is free software; you can redistribute it and/or
// modify it under the terms of the Do What The Fuck You Want To
// Public License, Version 2, as published by Sam Hocevar. See
// http://sam.zoy.org/projects/COPYING.WTFPL for more details.
//

#if defined HAVE_CONFIG_H
# include "config.h"
#endif

#if defined __CELLOS_LV2__
# include <ppu_altivec_internals.h>
#endif

#include "core.h"

using namespace std;

namespace lol
{

/* These macros implement a finite iterator useful to build lookup
 * tables. For instance, S64(0) will call S1(x) for all values of x
 * between 0 and 63.
 * Because the number of macro expansions grows exponentially, the
 * stress on the compiler may be significant. */
#define S4(x)    S1((x)), S1((x)+1), S1((x)+2), S1((x)+3)
#define S16(x)   S4((x)), S4((x)+4), S4((x)+8), S4((x)+12)
#define S64(x)   S16((x)), S16((x)+16), S16((x)+32), S16((x)+48)
#define S256(x)  S64((x)), S64((x)+64), S64((x)+128), S64((x)+192)
#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768)
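/* For instance, S16(0) expands to S4(0), S4(4), S4(8), S4(12), which in
 * turn expands to S1(0), S1(1), S1(2), ... S1(15). */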

/* Lookup table-based algorithm from “Fast Half Float Conversions”
 * by Jeroen van der Zijp, November 2008. No rounding is performed,
 * and some NaN values may be incorrectly converted to Inf. */
static inline uint16_t float_to_half_nobranch(uint32_t x)
{
    static uint16_t const basetable[512] =
    {
#define S1(i) (((i) < 103) ? 0x0000 : \
               ((i) < 113) ? 0x0400 >> (113 - (i)) : \
               ((i) < 143) ? ((i) - 112) << 10 : 0x7c00)
        S256(0),
#undef S1
#define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \
                         ((i) < 113) ? 0x0400 >> (113 - (i)) : \
                         ((i) < 143) ? ((i) - 112) << 10 : 0x7c00))
        S256(0),
#undef S1
    };
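
    /* For each of the 512 sign/exponent combinations, how far the 23-bit
     * float mantissa must be shifted right: 13 keeps the top ten bits
     * (normal halves and NaN), 126 - e builds denormals, and 24 discards
     * the mantissa entirely (zero, underflow and Inf). */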
    static uint8_t const shifttable[512] =
    {
#define S1(i) (((i) < 103) ? 24 : \
               ((i) < 113) ? 126 - (i) : \
               ((i) < 143 || (i) == 255) ? 13 : 24)
        S256(0), S256(0),
#undef S1
    };

    uint16_t bits = basetable[(x >> 23) & 0x1ff];
    bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff];

    return bits;
}
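
/* Sanity check: float_to_half_nobranch(0x3f800000u), the bit pattern of
 * 1.0f, looks up basetable[127] = 0x3c00 and a mantissa shift of 13,
 * yielding 0x3c00, which is half-precision 1.0. */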

/* This method is faster than the OpenEXR implementation (very often
 * used, e.g. in Ogre), with the additional benefit of rounding, and is
 * inspired by James Tursa’s half-precision code. */
static inline uint16_t float_to_half_branch(uint32_t x)
{
    uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */
    uint16_t m = (x >> 12) & 0x07ff;    /* Keep one extra bit for rounding */
    unsigned int e = (x >> 23) & 0xff;  /* Using int is faster here */

    /* If zero, or denormal, or exponent underflows too much for a denormal
     * half, return signed zero. */
    if (e < 103)
        return bits;

    /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
    if (e > 142)
    {
        bits |= 0x7c00u;
        /* If exponent was 0xff and one mantissa bit was set, it means NaN,
         * not Inf, so make sure we set one mantissa bit too. */
        bits |= e == 255 && (x & 0x007fffffu);
        return bits;
    }

    /* If exponent underflows but not too much, return a denormal */
    if (e < 113)
    {
        m |= 0x0800u;
        /* Extra rounding may overflow and set mantissa to 0 and exponent
         * to 1, which is OK. */
        bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
        return bits;
    }

    bits |= ((e - 112) << 10) | (m >> 1);
    /* Extra rounding. An overflow will set mantissa to 0 and increment
     * the exponent, which is OK. */
    bits += m & 1;
    return bits;
}
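
/* Unlike the table version, this one rounds: for 0x3f801000u (the float
 * just above 1.0f with mantissa bit 12 set), float_to_half_branch()
 * returns 0x3c01 whereas float_to_half_nobranch() truncates to 0x3c00. */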

#if 0
static inline void float_to_half_vector(half *dst, float const *src)
{
    vector unsigned int const v7 = vec_splat_u32(7);
    vector unsigned short const v6 = vec_splat_u16(6);
#if _XBOX
    vector signed short const v9 = vec_splat_u16(9);
    vector unsigned short const v10 = vec_splat_u16(10);
#else
    vector signed short const v0x0040 = {
        0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040, 0x0040};
    vector unsigned short const v0x0400 = {
        0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400};
#endif
    vector unsigned char const shuffle_high = {
        0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
    vector unsigned char const shuffle_low = {
        2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
    vector unsigned char const v0xbf70 = {
        0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70,
        0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70, 0xbf, 0x70};

    vector unsigned short v_mant, v_ret;
    vector signed short v_exp;
    vector unsigned int in0 = (vector unsigned int)vec_ld(0, src);
    vector unsigned int in1 = (vector unsigned int)vec_ld(16, src);

    in0 = vec_sra(in0, v7);
    in1 = vec_sra(in1, v7);
    v_exp = (vector signed short)vec_perm(in0, in1, shuffle_high);
    v_mant = (vector unsigned short)vec_perm(in0, in1, shuffle_low);
    v_exp = (vector signed short)vec_subs((vector unsigned char)v_exp, v0xbf70);
#if _XBOX
    v_ret = (vector unsigned short)vec_or(v_exp, vec_sr(v_exp, v9));
#else
    v_ret = (vector unsigned short)vec_madds(v_exp, v0x0040, v_exp);
#endif
    v_mant = vec_sr(v_mant, v6);
#if _XBOX
    v_ret = vec_or(v_mant, vec_sl(v_ret, v10));
#else
    v_ret = vec_mladd(v_ret, v0x0400, v_mant);
#endif
    vec_st(v_ret, 0, (uint16_t *)dst);
}
#endif
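
/* The multiply-and-shift below is a de Bruijn-style hash: once the bits
 * below the leading set bit have been smeared down (v |= v >> 1, etc.),
 * multiplying by shiftmagic and keeping the top five bits yields a unique
 * index into shifttable, which stores the shift needed to normalise a
 * denormal mantissa. */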
static int const shifttable[32] =
{
    23, 14, 22, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0, 20, 0,
    15, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 17, 0, 18, 19, 0,
};
static uint32_t const shiftmagic = 0x07c4acddu;

/* Lookup table-based algorithm from “Fast Half Float Conversions”
 * by Jeroen van der Zijp, November 2008. Tables are generated using
 * the C++ preprocessor, thanks to a branchless implementation also
 * used in half_to_float_branch(). This code is very fast when
 * performing conversions on arrays of values. */
static inline uint32_t half_to_float_nobranch(uint16_t x)
{
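    /* M3 through MFF smear the leading set bit of i across all the bits
     * below it, and E(i) applies the same de Bruijn hash as above, so the
     * preprocessor can compute the normalising shift for each mantissa. */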
#define M3(i)  ((i) | ((i) >> 1))
#define M7(i)  (M3(i) | (M3(i) >> 2))
#define MF(i)  (M7(i) | (M7(i) >> 4))
#define MFF(i) (MF(i) | (MF(i) >> 8))
#define E(i)   shifttable[(unsigned int)(MFF(i) * shiftmagic) >> 27]

    static uint32_t const mantissatable[2048] =
    {
#define S1(i) (((i) == 0) ? 0 : ((125 - E(i)) << 23) + ((i) << E(i)))
        S1024(0),
#undef S1
#define S1(i) (0x38000000u + ((i) << 13))
        S1024(0),
#undef S1
    };
    static uint32_t const exponenttable[64] =
    {
#define S1(i) (((i) == 0) ? 0 : \
               ((i) < 31) ? ((i) << 23) : \
               ((i) == 31) ? 0x47800000u : \
               ((i) == 32) ? 0x80000000u : \
               ((i) < 63) ? (0x80000000u + (((i) - 32) << 23)) : 0xc7800000u)
        S64(0),
#undef S1
    };
    static int const offsettable[64] =
    {
#define S1(i) (((i) == 0 || (i) == 32) ? 0 : 1024)
        S64(0),
#undef S1
    };

    return mantissatable[offsettable[x >> 10] + (x & 0x3ff)]
            + exponenttable[x >> 10];
}
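
/* Sanity check: half_to_float_nobranch(0x3c00), half-precision 1.0, hits
 * offsettable[15] = 1024, mantissatable[1024] = 0x38000000 and
 * exponenttable[15] = 0x07800000, which sum to 0x3f800000, i.e. 1.0f. */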

/* This algorithm is similar to the OpenEXR implementation, except it
 * uses branchless code in the denormal path. It is slower than the
 * table version, but more cache-friendly for occasional use. */
static inline uint32_t half_to_float_branch(uint16_t x)
{
    uint32_t s = (x & 0x8000u) << 16;
    if ((x & 0x7fffu) == 0)
        return (uint32_t)x << 16;

    uint32_t e = x & 0x7c00u;
    uint32_t m = x & 0x03ffu;
    if (e == 0)
    {
#if !defined __CELLOS_LV2__
        uint32_t v = m | (m >> 1);
        v |= v >> 2;
        v |= v >> 4;
        v |= v >> 8;
        e = shifttable[(v * shiftmagic) >> 27];

        /* We don't have to remove the 10th mantissa bit because it gets
         * added to our underestimated exponent. */
        return s | (((125 - e) << 23) + (m << e));
#else
        /* PS3 don't know bout my denormals */
        return s;
#endif
    }

    if (e == 0x7c00u)
    {
        /* The amd64 pipeline likes the if() better than a ternary operator
         * or any other trick I could find. --sam */
        if (m == 0)
            return s | 0x7f800000u;
        return s | 0x7fc00000u;
    }

    return s | (((e >> 10) + 112) << 23) | (m << 13);
}
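
/* For special values the branch version matches IEEE 754: 0x7c00 (+Inf)
 * maps to 0x7f800000 and 0xfc00 (-Inf) to 0xff800000, while any NaN
 * input returns a quiet NaN, 0x7fc00000 with the input's sign. */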

/* Constructor from float. Uses the non-branching version because benchmarks
 * indicate it is about 80% faster on amd64, and 20% faster on the PS3. The
 * penalty of loading the lookup tables does not seem significant. */
half half::makefast(float f)
{
    union { float f; uint32_t x; } u = { f };
    return makebits(float_to_half_nobranch(u.x));
}

/* Constructor from float with better precision. */
half half::makeaccurate(float f)
{
    union { float f; uint32_t x; } u = { f };
    return makebits(float_to_half_branch(u.x));
}

/* Cast to float. Uses the branching version because loading the tables
 * for only one value is going to be cache-expensive. */
float half::tofloat(half h)
{
    union { float f; uint32_t x; } u;
    u.x = half_to_float_branch(h.bits);
    return u.f;
}
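
/* A minimal usage sketch of the scalar entry points:
 *
 *     half h = half::makefast(3.14f);     // fast, truncating
 *     half g = half::makeaccurate(3.14f); // slower, rounds to nearest
 *     float f = half::tofloat(h);
 */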

size_t half::convert(half *dst, float const *src, size_t nelem)
{
    for (size_t i = 0; i < nelem; i++)
    {
        union { float f; uint32_t x; } u;
        u.f = *src++;
        *dst++ = makebits(float_to_half_nobranch(u.x));
#if 0
        /* AltiVec code. Will work one day. */
        float_to_half_vector(dst, src);
        src += 8;
        dst += 8;
        i += 7;
#endif
    }

    return nelem;
}

size_t half::convert(float *dst, half const *src, size_t nelem)
{
    for (size_t i = 0; i < nelem; i++)
    {
        union { float f; uint32_t x; } u;
#if !defined __CELLOS_LV2__
        /* This code is really too slow on the PS3, even with the denormal
         * handling stripped off. */
        u.x = half_to_float_nobranch((*src++).bits);
#else
        u.x = half_to_float_branch((*src++).bits);
#endif
        *dst++ = u.f;
    }

    return nelem;
}
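
/* Example: round-tripping a small buffer through half precision:
 *
 *     float src[4] = { 0.0f, 0.5f, 1.0f, 2.0f };
 *     half tmp[4];
 *     float dst[4];
 *     half::convert(tmp, src, 4);
 *     half::convert(dst, tmp, 4);
 */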

} /* namespace lol */