You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

388 lines
12 KiB

  1. //
  2. // Lol Engine
  3. //
  4. // Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
  5. // This program is free software; you can redistribute it and/or
  6. // modify it under the terms of the Do What The Fuck You Want To
  7. // Public License, Version 2, as published by Sam Hocevar. See
  8. // http://www.wtfpl.net/ for more details.
  9. //
  10. #include <lol/engine-internal.h>
  11. #if defined HAVE_FASTMATH_H
  12. # include <fastmath.h>
  13. #endif
  14. // Optimisation helpers
  15. #if defined __GNUC__
  16. # define __likely(x) __builtin_expect(!!(x), 1)
  17. # define __unlikely(x) __builtin_expect(!!(x), 0)
  18. # define INLINEATTR __attribute__((always_inline))
  19. # if defined __x86_64__
  20. # define FP_USE(x) __asm__("" : "+x" (x))
  21. # elif defined __i386__ /* FIXME: this isn't good */
  22. # define FP_USE(x) __asm__("" : "+m" (x))
  23. # else
  24. # define FP_USE(x) (void)(x)
  25. # endif
  26. #else
  27. # define __likely(x) x
  28. # define __unlikely(x) x
  29. # define INLINEATTR
  30. # define FP_USE(x) (void)(x)
  31. #endif
  32. namespace lol
  33. {
  34. static const double PI_2 = 1.57079632679489661923132;
  35. static const double PI_4 = 0.785398163397448309615661;
  36. static const double INV_PI = 0.318309886183790671537768;
  37. static const double ROOT3 = 1.73205080756887729352745;
  38. static const double ZERO = 0.0;
  39. static const double ONE = 1.0;
  40. static const double NEG_ONE = -1.0;
  41. static const double HALF = 0.5;
  42. static const double QUARTER = 0.25;
  43. static const double TWO = 2.0;
  44. #if defined __GNUC__
  45. static const double VERY_SMALL_NUMBER = 0x1.0p-128;
  46. #else
  47. static const double VERY_SMALL_NUMBER = 3e-39;
  48. #endif
  49. static const double TWO_EXP_52 = 4503599627370496.0;
  50. static const double TWO_EXP_54 = 18014398509481984.0;
  51. /** sin Taylor series coefficients. */
  52. static const double SC[] =
  53. {
  54. -1.6449340668482264364724e-0, // π^2/3!
  55. +8.1174242528335364363700e-1, // π^4/5!
  56. -1.9075182412208421369647e-1, // π^6/7!
  57. +2.6147847817654800504653e-2, // π^8/9!
  58. -2.3460810354558236375089e-3, // π^10/11!
  59. +1.4842879303107100368487e-4, // π^12/13!
  60. -6.9758736616563804745344e-6, // π^14/15!
  61. +2.5312174041370276513517e-7, // π^16/17!
  62. };
  63. /* Note: the last value should be -1.3878952462213772114468e-7 (ie.
  64. * π^18/18!) but we tweak it in order to get the better average precision
  65. * required for tan() computations when close to π/2+kπ values. */
  66. static const double CC[] =
  67. {
  68. -4.9348022005446793094172e-0, // π^2/2!
  69. +4.0587121264167682181850e-0, // π^4/4!
  70. -1.3352627688545894958753e-0, // π^6/6!
  71. +2.3533063035889320454188e-1, // π^8/8!
  72. -2.5806891390014060012598e-2, // π^10/10!
  73. +1.9295743094039230479033e-3, // π^12/12!
  74. -1.0463810492484570711802e-4, // π^14/14!
  75. +4.3030695870329470072978e-6, // π^16/16!
  76. -1.3777e-7,
  77. };
  78. /* These coefficients use Sloane’s http://oeis.org/A002430 and
  79. * http://oeis.org/A036279 sequences for the Taylor series of tan().
  80. * Note: the last value should be 2.12485922978838540352881e5 (ie.
  81. * 443861162*π^18/1856156927625), but we tweak it in order to get
  82. * sub 1e-11 average precision in a larger range. */
  83. static const double TC[] =
  84. {
  85. 3.28986813369645287294483e0, // π^2/3
  86. 1.29878788045336582981920e1, // 2*π^4/15
  87. 5.18844961612069061254404e1, // 17*π^6/315
  88. 2.07509320280908496804928e2, // 62*π^8/2835
  89. 8.30024701695986756361561e2, // 1382*π^10/155925
  90. 3.32009324029001216460018e3, // 21844*π^12/6081075
  91. 1.32803704909665483598490e4, // 929569*π^14/638512875
  92. 5.31214808666037709352112e4, // 6404582*π^16/10854718875
  93. 2.373e5,
  94. };
  95. static inline double lol_fabs(double x) INLINEATTR;
  96. #if defined __GNUC__
  97. static inline double lol_round(double x) INLINEATTR;
  98. static inline double lol_trunc(double x) INLINEATTR;
  99. #endif
  100. static inline double lol_fabs(double x)
  101. {
  102. #if defined __GNUC__
  103. return __builtin_fabs(x);
  104. #else
  105. using std::fabs;
  106. return fabs(x);
  107. #endif
  108. }
  109. #if defined __GNUC__
  110. static inline double lol_round(double x)
  111. {
  112. return __builtin_round(x);
  113. }
  114. static inline double lol_trunc(double x)
  115. {
  116. return __builtin_trunc(x);
  117. }
  118. #endif
  119. double lol_sin(double x)
  120. {
  121. double absx = lol_fabs(x * INV_PI);
  122. /* If branches are cheap, skip the cycle count when |x| < π/4,
  123. * and only do the Taylor series up to the required precision. */
  124. #if LOL_FEATURE_CHEAP_BRANCHES
  125. if (absx < QUARTER)
  126. {
  127. /* Computing x^4 is one multiplication too many we do, but it helps
  128. * interleave the Taylor series operations a lot better. */
  129. double x2 = absx * absx;
  130. double x4 = x2 * x2;
  131. double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
  132. double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  133. double taylor = sub2 * x2 + sub1;
  134. return x * taylor;
  135. }
  136. #endif
  137. /* Wrap |x| to the range [-1, 1] and keep track of the number of
  138. * cycles required. If odd, we'll need to change the sign of the
  139. * result. */
  140. double num_cycles = absx + TWO_EXP_52;
  141. FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
  142. double is_even = TWO * num_cycles - ONE;
  143. FP_USE(is_even); is_even += TWO_EXP_54;
  144. FP_USE(is_even); is_even -= TWO_EXP_54;
  145. FP_USE(is_even);
  146. is_even -= TWO * num_cycles - ONE;
  147. double sign = is_even;
  148. absx -= num_cycles;
  149. /* If branches are very cheap, we have the option to do the Taylor
  150. * series at a much lower degree by splitting. */
  151. #if LOL_FEATURE_VERY_CHEAP_BRANCHES
  152. if (lol_fabs(absx) > QUARTER)
  153. {
  154. sign = (x * absx >= 0.0) ? sign : -sign;
  155. double x1 = HALF - lol_fabs(absx);
  156. double x2 = x1 * x1;
  157. double x4 = x2 * x2;
  158. double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  159. double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  160. double taylor = sub2 * x2 + sub1;
  161. return taylor * sign;
  162. }
  163. #endif
  164. sign *= (x >= 0.0) ? D_PI : -D_PI;
  165. /* Compute a Tailor series for sin() and combine sign information. */
  166. double x2 = absx * absx;
  167. double x4 = x2 * x2;
  168. #if LOL_FEATURE_VERY_CHEAP_BRANCHES
  169. double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
  170. double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  171. #else
  172. double sub1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  173. double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
  174. #endif
  175. double taylor = sub2 * x2 + sub1;
  176. return absx * taylor * sign;
  177. }
  178. double lol_cos(double x)
  179. {
  180. double absx = lol_fabs(x * INV_PI);
  181. #if LOL_FEATURE_CHEAP_BRANCHES
  182. if (absx < QUARTER)
  183. {
  184. double x2 = absx * absx;
  185. double x4 = x2 * x2;
  186. double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
  187. double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  188. double taylor = (sub1 * x2 + sub2) * x2 + ONE;
  189. return taylor;
  190. }
  191. #endif
  192. double num_cycles = absx + TWO_EXP_52;
  193. FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
  194. double is_even = TWO * num_cycles - ONE;
  195. FP_USE(is_even); is_even += TWO_EXP_54;
  196. FP_USE(is_even); is_even -= TWO_EXP_54;
  197. FP_USE(is_even);
  198. is_even -= TWO * num_cycles - ONE;
  199. double sign = is_even;
  200. absx -= num_cycles;
  201. #if LOL_FEATURE_VERY_CHEAP_BRANCHES
  202. if (lol_fabs(absx) > QUARTER)
  203. {
  204. double x1 = HALF - lol_fabs(absx);
  205. double x2 = x1 * x1;
  206. double x4 = x2 * x2;
  207. double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
  208. double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  209. double taylor = sub2 * x2 + sub1;
  210. return x1 * taylor * sign * D_PI;
  211. }
  212. #endif
  213. double x2 = absx * absx;
  214. double x4 = x2 * x2;
  215. #if LOL_FEATURE_VERY_CHEAP_BRANCHES
  216. double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  217. double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  218. #else
  219. double sub1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  220. double sub2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
  221. #endif
  222. double taylor = sub2 * x2 + sub1;
  223. return taylor * sign;
  224. }
  225. void lol_sincos(double x, double *sinx, double *cosx)
  226. {
  227. double absx = lol_fabs(x * INV_PI);
  228. #if LOL_FEATURE_CHEAP_BRANCHES
  229. if (absx < QUARTER)
  230. {
  231. double x2 = absx * absx;
  232. double x4 = x2 * x2;
  233. /* Computing the Taylor series to the 11th order is enough to get
  234. * x * 1e-11 precision, but we push it to the 13th order so that
  235. * tan() has a better precision. */
  236. double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  237. double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  238. double taylors = subs2 * x2 + subs1;
  239. *sinx = x * taylors;
  240. double subc1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
  241. double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  242. double taylorc = (subc1 * x2 + subc2) * x2 + ONE;
  243. *cosx = taylorc;
  244. return;
  245. }
  246. #endif
  247. double num_cycles = absx + TWO_EXP_52;
  248. FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
  249. double is_even = TWO * num_cycles - ONE;
  250. FP_USE(is_even); is_even += TWO_EXP_54;
  251. FP_USE(is_even); is_even -= TWO_EXP_54;
  252. FP_USE(is_even);
  253. is_even -= TWO * num_cycles - ONE;
  254. double sin_sign = is_even;
  255. double cos_sign = is_even;
  256. absx -= num_cycles;
  257. #if LOL_FEATURE_VERY_CHEAP_BRANCHES
  258. if (lol_fabs(absx) > QUARTER)
  259. {
  260. cos_sign = sin_sign;
  261. sin_sign = (x * absx >= 0.0) ? sin_sign : -sin_sign;
  262. double x1 = HALF - lol_fabs(absx);
  263. double x2 = x1 * x1;
  264. double x4 = x2 * x2;
  265. double subs1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  266. double subs2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  267. double taylors = subs2 * x2 + subs1;
  268. *sinx = taylors * sin_sign;
  269. double subc1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  270. double subc2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  271. double taylorc = subc2 * x2 + subc1;
  272. *cosx = x1 * taylorc * cos_sign * D_PI;
  273. return;
  274. }
  275. #endif
  276. sin_sign *= (x >= 0.0) ? D_PI : -D_PI;
  277. double x2 = absx * absx;
  278. double x4 = x2 * x2;
  279. #if LOL_FEATURE_VERY_CHEAP_BRANCHES
  280. double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  281. double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  282. double subc1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  283. double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  284. #else
  285. double subs1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  286. double subs2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
  287. /* Push Taylor series to the 19th order to enhance tan() accuracy. */
  288. double subc1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  289. double subc2 = (((CC[8] * x4 + CC[6]) * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
  290. #endif
  291. double taylors = subs2 * x2 + subs1;
  292. *sinx = absx * taylors * sin_sign;
  293. double taylorc = subc2 * x2 + subc1;
  294. *cosx = taylorc * cos_sign;
  295. }
  296. void lol_sincos(float x, float *sinx, float *cosx)
  297. {
  298. double x2 = static_cast<double>(x);
  299. double s2, c2;
  300. lol_sincos(x2, &s2, &c2);
  301. *sinx = static_cast<float>(s2);
  302. *cosx = static_cast<float>(c2);
  303. }
  304. double lol_tan(double x)
  305. {
  306. #if LOL_FEATURE_CHEAP_BRANCHES
  307. double absx = lol_fabs(x * INV_PI);
  308. /* This value was determined empirically to ensure an error of no
  309. * more than x * 1e-11 in this range. */
  310. if (absx < 0.163)
  311. {
  312. double x2 = absx * absx;
  313. double x4 = x2 * x2;
  314. double sub1 = (((TC[7] * x4 + TC[5]) * x4
  315. + TC[3]) * x4 + TC[1]) * x4 + ONE;
  316. double sub2 = (((TC[8] * x4 + TC[6]) * x4
  317. + TC[4]) * x4 + TC[2]) * x4 + TC[0];
  318. double taylor = sub2 * x2 + sub1;
  319. return x * taylor;
  320. }
  321. #endif
  322. double sinx, cosx;
  323. lol_sincos(x, &sinx, &cosx);
  324. /* Ensure cosx isn't zero. FIXME: we lose the cosx sign here. */
  325. double absc = lol_fabs(cosx);
  326. if (__unlikely(absc < VERY_SMALL_NUMBER))
  327. cosx = VERY_SMALL_NUMBER;
  328. return sinx / cosx;
  329. }
  330. } /* namespace lol */