515 líneas
15 KiB

  1. //
  2. // Lol Engine
  3. //
  4. // Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
  5. // This program is free software; you can redistribute it and/or
  6. // modify it under the terms of the Do What The Fuck You Want To
  7. // Public License, Version 2, as published by Sam Hocevar. See
  8. // http://www.wtfpl.net/ for more details.
  9. //
  10. #if defined HAVE_CONFIG_H
  11. # include "config.h"
  12. #endif
  13. #if defined HAVE_FASTMATH_H
  14. # include <fastmath.h>
  15. #endif
  16. #include "core.h"
  17. using namespace std;
  18. namespace lol
  19. {
  /* Fractions of π and other irrational constants. */
  static const double PI_2   = 1.57079632679489661923132;   /* π/2 */
  static const double PI_4   = 0.785398163397448309615661;  /* π/4 */
  static const double INV_PI = 0.318309886183790671537768;  /* 1/π */
  static const double ROOT3  = 1.73205080756887729352745;   /* √3 */

  /* Small exact values, kept as named constants for the polynomial code. */
  static const double NEG_ONE = -1.0;
  static const double ZERO    = 0.0;
  static const double QUARTER = 0.25;
  static const double HALF    = 0.5;
  static const double ONE     = 1.0;
  static const double TWO     = 2.0;

  /* Threshold below which lol_tan() treats a cosine as zero. The GCC
   * variant is written as a hexadecimal literal (2^-128). */
  #if !defined __GNUC__
  static const double VERY_SMALL_NUMBER = 3e-39;
  #else
  static const double VERY_SMALL_NUMBER = 0x1.0p-128;
  #endif

  /* 2^52 and 2^54: added and then subtracted by the trigonometric functions
   * to round a value to a multiple of 1 or of 4 without branching. */
  static const double TWO_EXP_52 = 4503599627370496.0;
  static const double TWO_EXP_54 = 18014398509481984.0;
  /** Taylor coefficients of sin(πt)/(πt) in powers of t²:
   *  SC[k] = (-1)^(k+1) · π^(2k+2) / (2k+3)!  */
  static const double SC[] =
  {
      -1.6449340668482264364724e-0,  /* -π^2/3!   */
       8.1174242528335364363700e-1,  /* +π^4/5!   */
      -1.9075182412208421369647e-1,  /* -π^6/7!   */
       2.6147847817654800504653e-2,  /* +π^8/9!   */
      -2.3460810354558236375089e-3,  /* -π^10/11! */
       1.4842879303107100368487e-4,  /* +π^12/13! */
      -6.9758736616563804745344e-6,  /* -π^14/15! */
       2.5312174041370276513517e-7,  /* +π^16/17! */
  };
  /** Taylor coefficients of cos(πt) in powers of t²:
   *  CC[k] = (-1)^(k+1) · π^(2k+2) / (2k+2)!
   *
   *  The final entry is deliberately not the exact value: it should be
   *  -1.3878952462213772114468e-7 (i.e. π^18/18!), but it is tweaked to get
   *  the better average precision that tan() computations need when close
   *  to π/2+kπ values. */
  static const double CC[] =
  {
      -4.9348022005446793094172e-0,  /* -π^2/2!   */
       4.0587121264167682181850e-0,  /* +π^4/4!   */
      -1.3352627688545894958753e-0,  /* -π^6/6!   */
       2.3533063035889320454188e-1,  /* +π^8/8!   */
      -2.5806891390014060012598e-2,  /* -π^10/10! */
       1.9295743094039230479033e-3,  /* +π^12/12! */
      -1.0463810492484570711802e-4,  /* -π^14/14! */
       4.3030695870329470072978e-6,  /* +π^16/16! */
      -1.3777e-7,                    /* tweaked, see above */
  };
  /** Taylor coefficients of tan(πt)/(πt) in powers of t².
   *
   *  They are built from Sloane's http://oeis.org/A002430 and
   *  http://oeis.org/A036279 sequences for the Taylor series of tan().
   *  The final entry is deliberately not the exact value: it should be
   *  2.12485922978838540352881e5 (i.e. 443861162*π^18/1856156927625), but
   *  it is tweaked in order to get sub 1e-11 average precision in a larger
   *  range. */
  static const double TC[] =
  {
      3.28986813369645287294483e0,  /* π^2/3                     */
      1.29878788045336582981920e1,  /* 2*π^4/15                  */
      5.18844961612069061254404e1,  /* 17*π^6/315                */
      2.07509320280908496804928e2,  /* 62*π^8/2835               */
      8.30024701695986756361561e2,  /* 1382*π^10/155925          */
      3.32009324029001216460018e3,  /* 21844*π^12/6081075        */
      1.32803704909665483598490e4,  /* 929569*π^14/638512875     */
      5.31214808666037709352112e4,  /* 6404582*π^16/10854718875  */
      2.373e5,                      /* tweaked, see above        */
  };
  81. #if defined __CELLOS_LV2__
  82. static inline double lol_fctid(double x) INLINEATTR;
  83. static inline double lol_fctidz(double x) INLINEATTR;
  84. static inline double lol_fcfid(double x) INLINEATTR;
  85. static inline double lol_frsqrte(double x) INLINEATTR;
  86. static inline double lol_fsel(double c, double gte, double lt) INLINEATTR;
  87. static inline double lol_fres(double x) INLINEATTR;
  88. static inline double lol_fdiv(double a, double b) INLINEATTR;
  89. #endif
  90. static inline double lol_fabs(double x) INLINEATTR;
  91. #if defined __GNUC__
  92. static inline double lol_round(double x) INLINEATTR;
  93. static inline double lol_trunc(double x) INLINEATTR;
  94. #endif
  95. #if defined __CELLOS_LV2__
  /** Apply the `fctid` instruction to x, through the compiler builtin when
   *  building with SNC and through inline assembly otherwise.
   *  lol_round() feeds the result to lol_fcfid() to round a double.
   *  NOTE(review): presumably this converts to an integer using the current
   *  rounding mode; confirm against the ISA manual. */
  static inline double lol_fctid(double x)
  {
  #if defined __SNC__
      double converted = __builtin_fctid(x);
      return converted;
  #else
      double converted;
      __asm__ ("fctid %0, %1"
               : "=f" (converted) : "f" (x));
      return converted;
  #endif
  }
  /** Apply the `fctidz` instruction to x, through the compiler builtin when
   *  building with SNC and through inline assembly otherwise.
   *  lol_trunc() feeds the result to lol_fcfid() to truncate a double.
   *  NOTE(review): presumably this converts to an integer rounding towards
   *  zero; confirm against the ISA manual.
   *
   *  Spelled `static inline` to agree with its prototype and with
   *  lol_fctid(). */
  static inline double lol_fctidz(double x)
  {
      double r;
  #if defined __SNC__
      r = __builtin_fctidz(x);
  #else
      __asm__ ("fctidz %0, %1"
               : "=f" (r) : "f" (x));
  #endif
      return r;
  }
  /** Apply the `fcfid` instruction to x, through the compiler builtin when
   *  building with SNC and through inline assembly otherwise.
   *  lol_round() and lol_trunc() use it on the output of lol_fctid() and
   *  lol_fctidz() to get back a floating-point value.
   *
   *  Spelled `static inline` to agree with its prototype and with
   *  lol_fctid(). */
  static inline double lol_fcfid(double x)
  {
      double r;
  #if defined __SNC__
      r = __builtin_fcfid(x);
  #else
      __asm__ ("fcfid %0, %1"
               : "=f" (r) : "f" (x));
  #endif
      return r;
  }
  /** Apply the `frsqrte` instruction to x, through the compiler builtin when
   *  building with SNC and through inline assembly otherwise.
   *  NOTE(review): presumably a reciprocal square root estimate; confirm
   *  against the ISA manual. Not called anywhere in this file.
   *
   *  Spelled `static inline` to agree with its prototype and with
   *  lol_fctid(). */
  static inline double lol_frsqrte(double x)
  {
  #if defined __SNC__
      return __builtin_frsqrte(x);
  #else
      double r;
      __asm__ ("frsqrte %0, %1"
               : "=f" (r) : "f" (x));
      return r;
  #endif
  }
  /** Branch-free select: yields `gte` when c >= 0 and `lt` otherwise.
   *  Uses the `fsel` instruction (builtin or inline assembly) when
   *  __CELLOS_LV2__ is defined, and a plain comparison elsewhere. */
  static inline double lol_fsel(double c, double gte, double lt)
  {
  #if !defined __CELLOS_LV2__
      if (c >= 0)
          return gte;
      return lt;
  #elif defined __SNC__
      return __fsel(c, gte, lt);
  #else
      double picked;
      __asm__ ("fsel %0, %1, %2, %3"
               : "=f" (picked) : "f" (c), "f" (gte), "f" (lt));
      return picked;
  #endif
  }
  /** Reciprocal estimate of x: `__builtin_fre` when building with SNC, the
   *  `fres` instruction through inline assembly otherwise. lol_fdiv() uses
   *  the result as the starting point of its Newton-Raphson refinement. */
  static inline double lol_fres(double x)
  {
  #if defined __SNC__
      double estimate = __builtin_fre(x);
      return estimate;
  #else
      double estimate;
      __asm__ ("fres %0, %1"
               : "=f" (estimate) : "f" (x));
      return estimate;
  #endif
  }
  164. static inline double lol_fdiv(double a, double b)
  165. {
  166. /* Estimate */
  167. double x0 = lol_fres(b);
  168. /* Two steps of Newton-Raphson */
  169. x0 = (b * x0 - ONE) * -x0 + x0;
  170. x0 = (b * x0 - ONE) * -x0 + x0;
  171. return a * x0;
  172. }
  173. #endif /* __CELLOS_LV2__ */
  /** Absolute value of x, using the cheapest primitive for the platform:
   *  the `fabs` instruction or builtin when __CELLOS_LV2__ is defined, the
   *  GCC builtin, or std::fabs as the portable fallback. */
  static inline double lol_fabs(double x)
  {
  #if defined __CELLOS_LV2__
  # if defined __SNC__
      return __fabs(x);
  # else
      double magnitude;
      __asm__ ("fabs %0, %1"
               : "=f" (magnitude) : "f" (x));
      return magnitude;
  # endif
  #elif defined __GNUC__
      return __builtin_fabs(x);
  #else
      return std::fabs(x);
  #endif
  }
  190. #if defined __GNUC__
  /** Round x to an integral value: the GCC builtin in general, or an
   *  lol_fctid() / lol_fcfid() round trip when __CELLOS_LV2__ is defined. */
  static inline double lol_round(double x)
  {
  #if !defined __CELLOS_LV2__
      return __builtin_round(x);
  #else
      double as_integer = lol_fctid(x);
      return lol_fcfid(as_integer);
  #endif
  }
  /** Drop the fractional part of x: the GCC builtin in general, or an
   *  lol_fctidz() / lol_fcfid() round trip when __CELLOS_LV2__ is defined. */
  static inline double lol_trunc(double x)
  {
  #if !defined __CELLOS_LV2__
      return __builtin_trunc(x);
  #else
      double as_integer = lol_fctidz(x);
      return lol_fcfid(as_integer);
  #endif
  }
  207. #endif
  208. double lol_sin(double x)
  209. {
  210. double absx = lol_fabs(x * INV_PI);
  211. /* If branches are cheap, skip the cycle count when |x| < π/4,
  212. * and only do the Taylor series up to the required precision. */
  213. #if defined LOL_FEATURE_CHEAP_BRANCHES
  214. if (absx < QUARTER)
  215. {
  216. /* Computing x^4 is one multiplication too many we do, but it helps
  217. * interleave the Taylor series operations a lot better. */
  218. double x2 = absx * absx;
  219. double x4 = x2 * x2;
  220. double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
  221. double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  222. double taylor = sub2 * x2 + sub1;
  223. return x * taylor;
  224. }
  225. #endif
  226. /* Wrap |x| to the range [-1, 1] and keep track of the number of
  227. * cycles required. If odd, we'll need to change the sign of the
  228. * result. */
  229. #if defined __CELLOS_LV2__
  230. double sign = lol_fsel(x, D_PI, -D_PI);
  231. double num_cycles = lol_round(absx);
  232. double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
  233. sign = lol_fsel(is_even, sign, -sign);
  234. #else
  235. double num_cycles = absx + TWO_EXP_52;
  236. FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
  237. double is_even = TWO * num_cycles - ONE;
  238. FP_USE(is_even); is_even += TWO_EXP_54;
  239. FP_USE(is_even); is_even -= TWO_EXP_54;
  240. FP_USE(is_even);
  241. is_even -= TWO * num_cycles - ONE;
  242. double sign = is_even;
  243. #endif
  244. absx -= num_cycles;
  245. /* If branches are very cheap, we have the option to do the Taylor
  246. * series at a much lower degree by splitting. */
  247. #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
  248. if (lol_fabs(absx) > QUARTER)
  249. {
  250. sign = (x * absx >= 0.0) ? sign : -sign;
  251. double x1 = HALF - lol_fabs(absx);
  252. double x2 = x1 * x1;
  253. double x4 = x2 * x2;
  254. double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  255. double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  256. double taylor = sub2 * x2 + sub1;
  257. return taylor * sign;
  258. }
  259. #endif
  260. #if !defined __CELLOS_LV2__
  261. sign *= (x >= 0.0) ? D_PI : -D_PI;
  262. #endif
  263. /* Compute a Tailor series for sin() and combine sign information. */
  264. double x2 = absx * absx;
  265. double x4 = x2 * x2;
  266. #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
  267. double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
  268. double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  269. #else
  270. double sub1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  271. double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
  272. #endif
  273. double taylor = sub2 * x2 + sub1;
  274. return absx * taylor * sign;
  275. }
  276. double lol_cos(double x)
  277. {
  278. double absx = lol_fabs(x * INV_PI);
  279. #if defined LOL_FEATURE_CHEAP_BRANCHES
  280. if (absx < QUARTER)
  281. {
  282. double x2 = absx * absx;
  283. double x4 = x2 * x2;
  284. double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
  285. double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  286. double taylor = (sub1 * x2 + sub2) * x2 + ONE;
  287. return taylor;
  288. }
  289. #endif
  290. #if defined __CELLOS_LV2__
  291. double num_cycles = lol_round(absx);
  292. double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
  293. double sign = lol_fsel(is_even, ONE, NEG_ONE);
  294. #else
  295. double num_cycles = absx + TWO_EXP_52;
  296. FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
  297. double is_even = TWO * num_cycles - ONE;
  298. FP_USE(is_even); is_even += TWO_EXP_54;
  299. FP_USE(is_even); is_even -= TWO_EXP_54;
  300. FP_USE(is_even);
  301. is_even -= TWO * num_cycles - ONE;
  302. double sign = is_even;
  303. #endif
  304. absx -= num_cycles;
  305. #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
  306. if (lol_fabs(absx) > QUARTER)
  307. {
  308. double x1 = HALF - lol_fabs(absx);
  309. double x2 = x1 * x1;
  310. double x4 = x2 * x2;
  311. double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
  312. double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  313. double taylor = sub2 * x2 + sub1;
  314. return x1 * taylor * sign * D_PI;
  315. }
  316. #endif
  317. double x2 = absx * absx;
  318. double x4 = x2 * x2;
  319. #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
  320. double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  321. double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  322. #else
  323. double sub1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  324. double sub2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
  325. #endif
  326. double taylor = sub2 * x2 + sub1;
  327. return taylor * sign;
  328. }
  329. void lol_sincos(double x, double *sinx, double *cosx)
  330. {
  331. double absx = lol_fabs(x * INV_PI);
  332. #if defined LOL_FEATURE_CHEAP_BRANCHES
  333. if (absx < QUARTER)
  334. {
  335. double x2 = absx * absx;
  336. double x4 = x2 * x2;
  337. /* Computing the Taylor series to the 11th order is enough to get
  338. * x * 1e-11 precision, but we push it to the 13th order so that
  339. * tan() has a better precision. */
  340. double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  341. double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  342. double taylors = subs2 * x2 + subs1;
  343. *sinx = x * taylors;
  344. double subc1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
  345. double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  346. double taylorc = (subc1 * x2 + subc2) * x2 + ONE;
  347. *cosx = taylorc;
  348. return;
  349. }
  350. #endif
  351. #if defined __CELLOS_LV2__
  352. double num_cycles = lol_round(absx);
  353. double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
  354. double sin_sign = lol_fsel(x, D_PI, -D_PI);
  355. sin_sign = lol_fsel(is_even, sin_sign, -sin_sign);
  356. double cos_sign = lol_fsel(is_even, ONE, NEG_ONE);
  357. #else
  358. double num_cycles = absx + TWO_EXP_52;
  359. FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
  360. double is_even = TWO * num_cycles - ONE;
  361. FP_USE(is_even); is_even += TWO_EXP_54;
  362. FP_USE(is_even); is_even -= TWO_EXP_54;
  363. FP_USE(is_even);
  364. is_even -= TWO * num_cycles - ONE;
  365. double sin_sign = is_even;
  366. double cos_sign = is_even;
  367. #endif
  368. absx -= num_cycles;
  369. #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
  370. if (lol_fabs(absx) > QUARTER)
  371. {
  372. cos_sign = sin_sign;
  373. sin_sign = (x * absx >= 0.0) ? sin_sign : -sin_sign;
  374. double x1 = HALF - lol_fabs(absx);
  375. double x2 = x1 * x1;
  376. double x4 = x2 * x2;
  377. double subs1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  378. double subs2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  379. double taylors = subs2 * x2 + subs1;
  380. *sinx = taylors * sin_sign;
  381. double subc1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  382. double subc2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  383. double taylorc = subc2 * x2 + subc1;
  384. *cosx = x1 * taylorc * cos_sign * D_PI;
  385. return;
  386. }
  387. #endif
  388. #if !defined __CELLOS_LV2__
  389. sin_sign *= (x >= 0.0) ? D_PI : -D_PI;
  390. #endif
  391. double x2 = absx * absx;
  392. double x4 = x2 * x2;
  393. #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
  394. double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  395. double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  396. double subc1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  397. double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  398. #else
  399. double subs1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  400. double subs2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
  401. /* Push Taylor series to the 19th order to enhance tan() accuracy. */
  402. double subc1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  403. double subc2 = (((CC[8] * x4 + CC[6]) * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
  404. #endif
  405. double taylors = subs2 * x2 + subs1;
  406. *sinx = absx * taylors * sin_sign;
  407. double taylorc = subc2 * x2 + subc1;
  408. *cosx = taylorc * cos_sign;
  409. }
  410. void lol_sincos(float x, float *sinx, float *cosx)
  411. {
  412. double x2 = static_cast<double>(x);
  413. double s2, c2;
  414. lol_sincos(x2, &s2, &c2);
  415. *sinx = static_cast<float>(s2);
  416. *cosx = static_cast<float>(c2);
  417. }
  418. double lol_tan(double x)
  419. {
  420. #if defined LOL_FEATURE_CHEAP_BRANCHES
  421. double absx = lol_fabs(x * INV_PI);
  422. /* This value was determined empirically to ensure an error of no
  423. * more than x * 1e-11 in this range. */
  424. if (absx < 0.163)
  425. {
  426. double x2 = absx * absx;
  427. double x4 = x2 * x2;
  428. double sub1 = (((TC[7] * x4 + TC[5]) * x4
  429. + TC[3]) * x4 + TC[1]) * x4 + ONE;
  430. double sub2 = (((TC[8] * x4 + TC[6]) * x4
  431. + TC[4]) * x4 + TC[2]) * x4 + TC[0];
  432. double taylor = sub2 * x2 + sub1;
  433. return x * taylor;
  434. }
  435. #endif
  436. double sinx, cosx;
  437. lol_sincos(x, &sinx, &cosx);
  438. /* Ensure cosx isn't zero. FIXME: we lose the cosx sign here. */
  439. double absc = lol_fabs(cosx);
  440. #if defined __CELLOS_LV2__
  441. double is_cos_not_zero = absc - VERY_SMALL_NUMBER;
  442. cosx = lol_fsel(is_cos_not_zero, cosx, VERY_SMALL_NUMBER);
  443. return lol_fdiv(sinx, cosx);
  444. #else
  445. if (__unlikely(absc < VERY_SMALL_NUMBER))
  446. cosx = VERY_SMALL_NUMBER;
  447. return sinx / cosx;
  448. #endif
  449. }
  450. } /* namespace lol */