You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

530 lines
15 KiB

  1. //
  2. // Lol Engine
  3. //
  4. // Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
  5. // This program is free software; you can redistribute it and/or
  6. // modify it under the terms of the Do What The Fuck You Want To
  7. // Public License, Version 2, as published by Sam Hocevar. See
  8. // http://www.wtfpl.net/ for more details.
  9. //
  10. #include <lol/engine-internal.h>
  11. #if defined HAVE_FASTMATH_H
  12. # include <fastmath.h>
  13. #endif
  14. // Optimisation helpers
  15. #if defined __GNUC__
  16. # define __likely(x) __builtin_expect(!!(x), 1)
  17. # define __unlikely(x) __builtin_expect(!!(x), 0)
  18. # define INLINEATTR __attribute__((always_inline))
  19. # if defined __CELLOS_LV2__ && !defined __SNC__
  20. # define FP_USE(x) __asm__("" : "+f" (x))
  21. # elif defined __x86_64__
  22. # define FP_USE(x) __asm__("" : "+x" (x))
  23. # elif defined __i386__ /* FIXME: this isn't good */
  24. # define FP_USE(x) __asm__("" : "+m" (x))
  25. # else
  26. # define FP_USE(x) (void)(x)
  27. # endif
  28. #else
  29. # define __likely(x) x
  30. # define __unlikely(x) x
  31. # define INLINEATTR
  32. # define FP_USE(x) (void)(x)
  33. #endif
  34. namespace lol
  35. {
  36. static const double PI_2 = 1.57079632679489661923132;
  37. static const double PI_4 = 0.785398163397448309615661;
  38. static const double INV_PI = 0.318309886183790671537768;
  39. static const double ROOT3 = 1.73205080756887729352745;
  40. static const double ZERO = 0.0;
  41. static const double ONE = 1.0;
  42. static const double NEG_ONE = -1.0;
  43. static const double HALF = 0.5;
  44. static const double QUARTER = 0.25;
  45. static const double TWO = 2.0;
  46. #if defined __GNUC__
  47. static const double VERY_SMALL_NUMBER = 0x1.0p-128;
  48. #else
  49. static const double VERY_SMALL_NUMBER = 3e-39;
  50. #endif
  51. static const double TWO_EXP_52 = 4503599627370496.0;
  52. static const double TWO_EXP_54 = 18014398509481984.0;
  53. /** sin Taylor series coefficients. */
  54. static const double SC[] =
  55. {
  56. -1.6449340668482264364724e-0, // π^2/3!
  57. +8.1174242528335364363700e-1, // π^4/5!
  58. -1.9075182412208421369647e-1, // π^6/7!
  59. +2.6147847817654800504653e-2, // π^8/9!
  60. -2.3460810354558236375089e-3, // π^10/11!
  61. +1.4842879303107100368487e-4, // π^12/13!
  62. -6.9758736616563804745344e-6, // π^14/15!
  63. +2.5312174041370276513517e-7, // π^16/17!
  64. };
  65. /* Note: the last value should be -1.3878952462213772114468e-7 (ie.
  66. * π^18/18!) but we tweak it in order to get the better average precision
  67. * required for tan() computations when close to π/2+kπ values. */
  68. static const double CC[] =
  69. {
  70. -4.9348022005446793094172e-0, // π^2/2!
  71. +4.0587121264167682181850e-0, // π^4/4!
  72. -1.3352627688545894958753e-0, // π^6/6!
  73. +2.3533063035889320454188e-1, // π^8/8!
  74. -2.5806891390014060012598e-2, // π^10/10!
  75. +1.9295743094039230479033e-3, // π^12/12!
  76. -1.0463810492484570711802e-4, // π^14/14!
  77. +4.3030695870329470072978e-6, // π^16/16!
  78. -1.3777e-7,
  79. };
  80. /* These coefficients use Sloane’s http://oeis.org/A002430 and
  81. * http://oeis.org/A036279 sequences for the Taylor series of tan().
  82. * Note: the last value should be 2.12485922978838540352881e5 (ie.
  83. * 443861162*π^18/1856156927625), but we tweak it in order to get
  84. * sub 1e-11 average precision in a larger range. */
  85. static const double TC[] =
  86. {
  87. 3.28986813369645287294483e0, // π^2/3
  88. 1.29878788045336582981920e1, // 2*π^4/15
  89. 5.18844961612069061254404e1, // 17*π^6/315
  90. 2.07509320280908496804928e2, // 62*π^8/2835
  91. 8.30024701695986756361561e2, // 1382*π^10/155925
  92. 3.32009324029001216460018e3, // 21844*π^12/6081075
  93. 1.32803704909665483598490e4, // 929569*π^14/638512875
  94. 5.31214808666037709352112e4, // 6404582*π^16/10854718875
  95. 2.373e5,
  96. };
  97. #if defined __CELLOS_LV2__
  98. static inline double lol_fctid(double x) INLINEATTR;
  99. static inline double lol_fctidz(double x) INLINEATTR;
  100. static inline double lol_fcfid(double x) INLINEATTR;
  101. static inline double lol_frsqrte(double x) INLINEATTR;
  102. static inline double lol_fsel(double c, double gte, double lt) INLINEATTR;
  103. static inline double lol_fres(double x) INLINEATTR;
  104. static inline double lol_fdiv(double a, double b) INLINEATTR;
  105. #endif
  106. static inline double lol_fabs(double x) INLINEATTR;
  107. #if defined __GNUC__
  108. static inline double lol_round(double x) INLINEATTR;
  109. static inline double lol_trunc(double x) INLINEATTR;
  110. #endif
  111. #if defined __CELLOS_LV2__
  112. static inline double lol_fctid(double x)
  113. {
  114. double r;
  115. #if defined __SNC__
  116. r = __builtin_fctid(x);
  117. #else
  118. __asm__ ("fctid %0, %1"
  119. : "=f" (r) : "f" (x));
  120. #endif
  121. return r;
  122. }
  123. static double lol_fctidz(double x)
  124. {
  125. double r;
  126. #if defined __SNC__
  127. r = __builtin_fctidz(x);
  128. #else
  129. __asm__ ("fctidz %0, %1"
  130. : "=f" (r) : "f" (x));
  131. #endif
  132. return r;
  133. }
  134. static double lol_fcfid(double x)
  135. {
  136. double r;
  137. #if defined __SNC__
  138. r = __builtin_fcfid(x);
  139. #else
  140. __asm__ ("fcfid %0, %1"
  141. : "=f" (r) : "f" (x));
  142. #endif
  143. return r;
  144. }
  145. static double lol_frsqrte(double x)
  146. {
  147. #if defined __SNC__
  148. return __builtin_frsqrte(x);
  149. #else
  150. double r;
  151. __asm__ ("frsqrte %0, %1"
  152. : "=f" (r) : "f" (x));
  153. return r;
  154. #endif
  155. }
  156. static inline double lol_fsel(double c, double gte, double lt)
  157. {
  158. #if defined __CELLOS_LV2__ && defined __SNC__
  159. return __fsel(c, gte, lt);
  160. #elif defined __CELLOS_LV2__
  161. double r;
  162. __asm__ ("fsel %0, %1, %2, %3"
  163. : "=f" (r) : "f" (c), "f" (gte), "f" (lt));
  164. return r;
  165. #else
  166. return (c >= 0) ? gte : lt;
  167. #endif
  168. }
  169. static inline double lol_fres(double x)
  170. {
  171. double ret;
  172. #if defined __SNC__
  173. ret = __builtin_fre(x);
  174. #else
  175. __asm__ ("fres %0, %1"
  176. : "=f" (ret) : "f" (x));
  177. #endif
  178. return ret;
  179. }
  180. static inline double lol_fdiv(double a, double b)
  181. {
  182. /* Estimate */
  183. double x0 = lol_fres(b);
  184. /* Two steps of Newton-Raphson */
  185. x0 = (b * x0 - ONE) * -x0 + x0;
  186. x0 = (b * x0 - ONE) * -x0 + x0;
  187. return a * x0;
  188. }
  189. #endif /* __CELLOS_LV2__ */
  190. static inline double lol_fabs(double x)
  191. {
  192. #if defined __CELLOS_LV2__ && defined __SNC__
  193. return __fabs(x);
  194. #elif defined __CELLOS_LV2__
  195. double r;
  196. __asm__ ("fabs %0, %1"
  197. : "=f" (r) : "f" (x));
  198. return r;
  199. #elif defined __GNUC__
  200. return __builtin_fabs(x);
  201. #else
  202. using std::fabs;
  203. return fabs(x);
  204. #endif
  205. }
  206. #if defined __GNUC__
  207. static inline double lol_round(double x)
  208. {
  209. #if defined __CELLOS_LV2__
  210. return lol_fcfid(lol_fctid(x));
  211. #else
  212. return __builtin_round(x);
  213. #endif
  214. }
  215. static inline double lol_trunc(double x)
  216. {
  217. #if defined __CELLOS_LV2__
  218. return lol_fcfid(lol_fctidz(x));
  219. #else
  220. return __builtin_trunc(x);
  221. #endif
  222. }
  223. #endif
  224. double lol_sin(double x)
  225. {
  226. double absx = lol_fabs(x * INV_PI);
  227. /* If branches are cheap, skip the cycle count when |x| < π/4,
  228. * and only do the Taylor series up to the required precision. */
  229. #if LOL_FEATURE_CHEAP_BRANCHES
  230. if (absx < QUARTER)
  231. {
  232. /* Computing x^4 is one multiplication too many we do, but it helps
  233. * interleave the Taylor series operations a lot better. */
  234. double x2 = absx * absx;
  235. double x4 = x2 * x2;
  236. double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
  237. double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  238. double taylor = sub2 * x2 + sub1;
  239. return x * taylor;
  240. }
  241. #endif
  242. /* Wrap |x| to the range [-1, 1] and keep track of the number of
  243. * cycles required. If odd, we'll need to change the sign of the
  244. * result. */
  245. #if defined __CELLOS_LV2__
  246. double sign = lol_fsel(x, D_PI, -D_PI);
  247. double num_cycles = lol_round(absx);
  248. double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
  249. sign = lol_fsel(is_even, sign, -sign);
  250. #else
  251. double num_cycles = absx + TWO_EXP_52;
  252. FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
  253. double is_even = TWO * num_cycles - ONE;
  254. FP_USE(is_even); is_even += TWO_EXP_54;
  255. FP_USE(is_even); is_even -= TWO_EXP_54;
  256. FP_USE(is_even);
  257. is_even -= TWO * num_cycles - ONE;
  258. double sign = is_even;
  259. #endif
  260. absx -= num_cycles;
  261. /* If branches are very cheap, we have the option to do the Taylor
  262. * series at a much lower degree by splitting. */
  263. #if LOL_FEATURE_VERY_CHEAP_BRANCHES
  264. if (lol_fabs(absx) > QUARTER)
  265. {
  266. sign = (x * absx >= 0.0) ? sign : -sign;
  267. double x1 = HALF - lol_fabs(absx);
  268. double x2 = x1 * x1;
  269. double x4 = x2 * x2;
  270. double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  271. double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  272. double taylor = sub2 * x2 + sub1;
  273. return taylor * sign;
  274. }
  275. #endif
  276. #if !defined __CELLOS_LV2__
  277. sign *= (x >= 0.0) ? D_PI : -D_PI;
  278. #endif
  279. /* Compute a Tailor series for sin() and combine sign information. */
  280. double x2 = absx * absx;
  281. double x4 = x2 * x2;
  282. #if LOL_FEATURE_VERY_CHEAP_BRANCHES
  283. double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
  284. double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  285. #else
  286. double sub1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  287. double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
  288. #endif
  289. double taylor = sub2 * x2 + sub1;
  290. return absx * taylor * sign;
  291. }
  292. double lol_cos(double x)
  293. {
  294. double absx = lol_fabs(x * INV_PI);
  295. #if LOL_FEATURE_CHEAP_BRANCHES
  296. if (absx < QUARTER)
  297. {
  298. double x2 = absx * absx;
  299. double x4 = x2 * x2;
  300. double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
  301. double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  302. double taylor = (sub1 * x2 + sub2) * x2 + ONE;
  303. return taylor;
  304. }
  305. #endif
  306. #if defined __CELLOS_LV2__
  307. double num_cycles = lol_round(absx);
  308. double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
  309. double sign = lol_fsel(is_even, ONE, NEG_ONE);
  310. #else
  311. double num_cycles = absx + TWO_EXP_52;
  312. FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
  313. double is_even = TWO * num_cycles - ONE;
  314. FP_USE(is_even); is_even += TWO_EXP_54;
  315. FP_USE(is_even); is_even -= TWO_EXP_54;
  316. FP_USE(is_even);
  317. is_even -= TWO * num_cycles - ONE;
  318. double sign = is_even;
  319. #endif
  320. absx -= num_cycles;
  321. #if LOL_FEATURE_VERY_CHEAP_BRANCHES
  322. if (lol_fabs(absx) > QUARTER)
  323. {
  324. double x1 = HALF - lol_fabs(absx);
  325. double x2 = x1 * x1;
  326. double x4 = x2 * x2;
  327. double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
  328. double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  329. double taylor = sub2 * x2 + sub1;
  330. return x1 * taylor * sign * D_PI;
  331. }
  332. #endif
  333. double x2 = absx * absx;
  334. double x4 = x2 * x2;
  335. #if LOL_FEATURE_VERY_CHEAP_BRANCHES
  336. double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  337. double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  338. #else
  339. double sub1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  340. double sub2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
  341. #endif
  342. double taylor = sub2 * x2 + sub1;
  343. return taylor * sign;
  344. }
  345. void lol_sincos(double x, double *sinx, double *cosx)
  346. {
  347. double absx = lol_fabs(x * INV_PI);
  348. #if LOL_FEATURE_CHEAP_BRANCHES
  349. if (absx < QUARTER)
  350. {
  351. double x2 = absx * absx;
  352. double x4 = x2 * x2;
  353. /* Computing the Taylor series to the 11th order is enough to get
  354. * x * 1e-11 precision, but we push it to the 13th order so that
  355. * tan() has a better precision. */
  356. double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  357. double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  358. double taylors = subs2 * x2 + subs1;
  359. *sinx = x * taylors;
  360. double subc1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
  361. double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  362. double taylorc = (subc1 * x2 + subc2) * x2 + ONE;
  363. *cosx = taylorc;
  364. return;
  365. }
  366. #endif
  367. #if defined __CELLOS_LV2__
  368. double num_cycles = lol_round(absx);
  369. double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
  370. double sin_sign = lol_fsel(x, D_PI, -D_PI);
  371. sin_sign = lol_fsel(is_even, sin_sign, -sin_sign);
  372. double cos_sign = lol_fsel(is_even, ONE, NEG_ONE);
  373. #else
  374. double num_cycles = absx + TWO_EXP_52;
  375. FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
  376. double is_even = TWO * num_cycles - ONE;
  377. FP_USE(is_even); is_even += TWO_EXP_54;
  378. FP_USE(is_even); is_even -= TWO_EXP_54;
  379. FP_USE(is_even);
  380. is_even -= TWO * num_cycles - ONE;
  381. double sin_sign = is_even;
  382. double cos_sign = is_even;
  383. #endif
  384. absx -= num_cycles;
  385. #if LOL_FEATURE_VERY_CHEAP_BRANCHES
  386. if (lol_fabs(absx) > QUARTER)
  387. {
  388. cos_sign = sin_sign;
  389. sin_sign = (x * absx >= 0.0) ? sin_sign : -sin_sign;
  390. double x1 = HALF - lol_fabs(absx);
  391. double x2 = x1 * x1;
  392. double x4 = x2 * x2;
  393. double subs1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  394. double subs2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  395. double taylors = subs2 * x2 + subs1;
  396. *sinx = taylors * sin_sign;
  397. double subc1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  398. double subc2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  399. double taylorc = subc2 * x2 + subc1;
  400. *cosx = x1 * taylorc * cos_sign * D_PI;
  401. return;
  402. }
  403. #endif
  404. #if !defined __CELLOS_LV2__
  405. sin_sign *= (x >= 0.0) ? D_PI : -D_PI;
  406. #endif
  407. double x2 = absx * absx;
  408. double x4 = x2 * x2;
  409. #if LOL_FEATURE_VERY_CHEAP_BRANCHES
  410. double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  411. double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  412. double subc1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  413. double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  414. #else
  415. double subs1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  416. double subs2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
  417. /* Push Taylor series to the 19th order to enhance tan() accuracy. */
  418. double subc1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  419. double subc2 = (((CC[8] * x4 + CC[6]) * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
  420. #endif
  421. double taylors = subs2 * x2 + subs1;
  422. *sinx = absx * taylors * sin_sign;
  423. double taylorc = subc2 * x2 + subc1;
  424. *cosx = taylorc * cos_sign;
  425. }
  426. void lol_sincos(float x, float *sinx, float *cosx)
  427. {
  428. double x2 = static_cast<double>(x);
  429. double s2, c2;
  430. lol_sincos(x2, &s2, &c2);
  431. *sinx = static_cast<float>(s2);
  432. *cosx = static_cast<float>(c2);
  433. }
  434. double lol_tan(double x)
  435. {
  436. #if LOL_FEATURE_CHEAP_BRANCHES
  437. double absx = lol_fabs(x * INV_PI);
  438. /* This value was determined empirically to ensure an error of no
  439. * more than x * 1e-11 in this range. */
  440. if (absx < 0.163)
  441. {
  442. double x2 = absx * absx;
  443. double x4 = x2 * x2;
  444. double sub1 = (((TC[7] * x4 + TC[5]) * x4
  445. + TC[3]) * x4 + TC[1]) * x4 + ONE;
  446. double sub2 = (((TC[8] * x4 + TC[6]) * x4
  447. + TC[4]) * x4 + TC[2]) * x4 + TC[0];
  448. double taylor = sub2 * x2 + sub1;
  449. return x * taylor;
  450. }
  451. #endif
  452. double sinx, cosx;
  453. lol_sincos(x, &sinx, &cosx);
  454. /* Ensure cosx isn't zero. FIXME: we lose the cosx sign here. */
  455. double absc = lol_fabs(cosx);
  456. #if defined __CELLOS_LV2__
  457. double is_cos_not_zero = absc - VERY_SMALL_NUMBER;
  458. cosx = lol_fsel(is_cos_not_zero, cosx, VERY_SMALL_NUMBER);
  459. return lol_fdiv(sinx, cosx);
  460. #else
  461. if (__unlikely(absc < VERY_SMALL_NUMBER))
  462. cosx = VERY_SMALL_NUMBER;
  463. return sinx / cosx;
  464. #endif
  465. }
  466. } /* namespace lol */