Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

518 строки
15 KiB

  1. //
  2. // Lol Engine
  3. //
  4. // Copyright: (c) 2010-2011 Sam Hocevar <sam@hocevar.net>
  5. // This program is free software; you can redistribute it and/or
  6. // modify it under the terms of the Do What The Fuck You Want To
  7. // Public License, Version 2, as published by Sam Hocevar. See
  8. // http://sam.zoy.org/projects/COPYING.WTFPL for more details.
  9. //
  10. #if defined HAVE_CONFIG_H
  11. # include "config.h"
  12. #endif
  13. #if defined HAVE_FASTMATH_H
  14. # include <fastmath.h>
  15. #endif
  16. #include <cmath>
  17. #include "core.h"
  18. using namespace std;
  19. namespace lol
  20. {
  21. static const double PI = 3.14159265358979323846264;
  22. static const double NEG_PI = -3.14159265358979323846264;
  23. static const double PI_2 = 1.57079632679489661923132;
  24. static const double PI_4 = 0.785398163397448309615661;
  25. static const double INV_PI = 0.318309886183790671537768;
  26. static const double ROOT3 = 1.73205080756887729352745;
  27. static const double ZERO = 0.0;
  28. static const double ONE = 1.0;
  29. static const double NEG_ONE = -1.0;
  30. static const double HALF = 0.5;
  31. static const double QUARTER = 0.25;
  32. static const double TWO = 2.0;
  33. #if defined __GNUC__
  34. static const double VERY_SMALL_NUMBER = 0x1.0p-128;
  35. #else
  36. static const double VERY_SMALL_NUMBER = 3e-39;
  37. #endif
  38. static const double TWO_EXP_52 = 4503599627370496.0;
  39. static const double TWO_EXP_54 = 18014398509481984.0;
  40. /** sin Taylor series coefficients. */
  41. static const double SC[] =
  42. {
  43. -1.6449340668482264364724e-0, // π^2/3!
  44. +8.1174242528335364363700e-1, // π^4/5!
  45. -1.9075182412208421369647e-1, // π^6/7!
  46. +2.6147847817654800504653e-2, // π^8/9!
  47. -2.3460810354558236375089e-3, // π^10/11!
  48. +1.4842879303107100368487e-4, // π^12/13!
  49. -6.9758736616563804745344e-6, // π^14/15!
  50. +2.5312174041370276513517e-7, // π^16/17!
  51. };
  52. /* Note: the last value should be -1.3878952462213772114468e-7 (ie.
  53. * π^18/18!) but we tweak it in order to get the better average precision
  54. * required for tan() computations when close to π/2+kπ values. */
  55. static const double CC[] =
  56. {
  57. -4.9348022005446793094172e-0, // π^2/2!
  58. +4.0587121264167682181850e-0, // π^4/4!
  59. -1.3352627688545894958753e-0, // π^6/6!
  60. +2.3533063035889320454188e-1, // π^8/8!
  61. -2.5806891390014060012598e-2, // π^10/10!
  62. +1.9295743094039230479033e-3, // π^12/12!
  63. -1.0463810492484570711802e-4, // π^14/14!
  64. +4.3030695870329470072978e-6, // π^16/16!
  65. -1.3777e-7,
  66. };
  67. /* These coefficients use Sloane’s http://oeis.org/A002430 and
  68. * http://oeis.org/A036279 sequences for the Taylor series of tan().
  69. * Note: the last value should be 2.12485922978838540352881e5 (ie.
  70. * 443861162*π^18/1856156927625), but we tweak it in order to get
  71. * sub 1e-11 average precision in a larger range. */
  72. static const double TC[] =
  73. {
  74. 3.28986813369645287294483e0, // π^2/3
  75. 1.29878788045336582981920e1, // 2*π^4/15
  76. 5.18844961612069061254404e1, // 17*π^6/315
  77. 2.07509320280908496804928e2, // 62*π^8/2835
  78. 8.30024701695986756361561e2, // 1382*π^10/155925
  79. 3.32009324029001216460018e3, // 21844*π^12/6081075
  80. 1.32803704909665483598490e4, // 929569*π^14/638512875
  81. 5.31214808666037709352112e4, // 6404582*π^16/10854718875
  82. 2.373e5,
  83. };
  84. #if defined __CELLOS_LV2__
  85. static inline double lol_fctid(double x) INLINEATTR;
  86. static inline double lol_fctidz(double x) INLINEATTR;
  87. static inline double lol_fcfid(double x) INLINEATTR;
  88. static inline double lol_frsqrte(double x) INLINEATTR;
  89. static inline double lol_fsel(double c, double gte, double lt) INLINEATTR;
  90. static inline double lol_fres(double x) INLINEATTR;
  91. static inline double lol_fdiv(double a, double b) INLINEATTR;
  92. #endif
  93. static inline double lol_fabs(double x) INLINEATTR;
  94. #if defined __GNUC__
  95. static inline double lol_round(double x) INLINEATTR;
  96. static inline double lol_trunc(double x) INLINEATTR;
  97. #endif
  98. #if defined __CELLOS_LV2__
  99. static inline double lol_fctid(double x)
  100. {
  101. double r;
  102. #if defined __SNC__
  103. r = __builtin_fctid(x);
  104. #else
  105. __asm__ ("fctid %0, %1"
  106. : "=f" (r) : "f" (x));
  107. #endif
  108. return r;
  109. }
  110. static double lol_fctidz(double x)
  111. {
  112. double r;
  113. #if defined __SNC__
  114. r = __builtin_fctidz(x);
  115. #else
  116. __asm__ ("fctidz %0, %1"
  117. : "=f" (r) : "f" (x));
  118. #endif
  119. return r;
  120. }
  121. static double lol_fcfid(double x)
  122. {
  123. double r;
  124. #if defined __SNC__
  125. r = __builtin_fcfid(x);
  126. #else
  127. __asm__ ("fcfid %0, %1"
  128. : "=f" (r) : "f" (x));
  129. #endif
  130. return r;
  131. }
  132. static double lol_frsqrte(double x)
  133. {
  134. #if defined __SNC__
  135. return __builtin_frsqrte(x);
  136. #else
  137. double r;
  138. __asm__ ("frsqrte %0, %1"
  139. : "=f" (r) : "f" (x));
  140. return r;
  141. #endif
  142. }
  143. static inline double lol_fsel(double c, double gte, double lt)
  144. {
  145. #if defined __CELLOS_LV2__ && defined __SNC__
  146. return __fsel(c, gte, lt);
  147. #elif defined __CELLOS_LV2__
  148. double r;
  149. __asm__ ("fsel %0, %1, %2, %3"
  150. : "=f" (r) : "f" (c), "f" (gte), "f" (lt));
  151. return r;
  152. #else
  153. return (c >= 0) ? gte : lt;
  154. #endif
  155. }
  156. static inline double lol_fres(double x)
  157. {
  158. double ret;
  159. #if defined __SNC__
  160. ret = __builtin_fre(x);
  161. #else
  162. __asm__ ("fres %0, %1"
  163. : "=f" (ret) : "f" (x));
  164. #endif
  165. return ret;
  166. }
  167. static inline double lol_fdiv(double a, double b)
  168. {
  169. /* Estimate */
  170. double x0 = lol_fres(b);
  171. /* Two steps of Newton-Raphson */
  172. x0 = (b * x0 - ONE) * -x0 + x0;
  173. x0 = (b * x0 - ONE) * -x0 + x0;
  174. return a * x0;
  175. }
  176. #endif /* __CELLOS_LV2__ */
  177. static inline double lol_fabs(double x)
  178. {
  179. #if defined __CELLOS_LV2__ && defined __SNC__
  180. return __fabs(x);
  181. #elif defined __CELLOS_LV2__
  182. double r;
  183. __asm__ ("fabs %0, %1"
  184. : "=f" (r) : "f" (x));
  185. return r;
  186. #elif defined __GNUC__
  187. return __builtin_fabs(x);
  188. #else
  189. return fabs(x);
  190. #endif
  191. }
  192. #if defined __GNUC__
  193. static inline double lol_round(double x)
  194. {
  195. #if defined __CELLOS_LV2__
  196. return lol_fcfid(lol_fctid(x));
  197. #else
  198. return __builtin_round(x);
  199. #endif
  200. }
  201. static inline double lol_trunc(double x)
  202. {
  203. #if defined __CELLOS_LV2__
  204. return lol_fcfid(lol_fctidz(x));
  205. #else
  206. return __builtin_trunc(x);
  207. #endif
  208. }
  209. #endif
  210. double lol_sin(double x)
  211. {
  212. double absx = lol_fabs(x * INV_PI);
  213. /* If branches are cheap, skip the cycle count when |x| < π/4,
  214. * and only do the Taylor series up to the required precision. */
  215. #if defined LOL_FEATURE_CHEAP_BRANCHES
  216. if (absx < QUARTER)
  217. {
  218. /* Computing x^4 is one multiplication too many we do, but it helps
  219. * interleave the Taylor series operations a lot better. */
  220. double x2 = absx * absx;
  221. double x4 = x2 * x2;
  222. double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
  223. double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  224. double taylor = sub2 * x2 + sub1;
  225. return x * taylor;
  226. }
  227. #endif
  228. /* Wrap |x| to the range [-1, 1] and keep track of the number of
  229. * cycles required. If odd, we'll need to change the sign of the
  230. * result. */
  231. #if defined __CELLOS_LV2__
  232. double sign = lol_fsel(x, PI, NEG_PI);
  233. double num_cycles = lol_round(absx);
  234. double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
  235. sign = lol_fsel(is_even, sign, -sign);
  236. #else
  237. double num_cycles = absx + TWO_EXP_52;
  238. FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
  239. double is_even = TWO * num_cycles - ONE;
  240. FP_USE(is_even); is_even += TWO_EXP_54;
  241. FP_USE(is_even); is_even -= TWO_EXP_54;
  242. FP_USE(is_even);
  243. is_even -= TWO * num_cycles - ONE;
  244. double sign = is_even;
  245. #endif
  246. absx -= num_cycles;
  247. /* If branches are very cheap, we have the option to do the Taylor
  248. * series at a much lower degree by splitting. */
  249. #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
  250. if (lol_fabs(absx) > QUARTER)
  251. {
  252. sign = (x * absx >= 0.0) ? sign : -sign;
  253. double x1 = HALF - lol_fabs(absx);
  254. double x2 = x1 * x1;
  255. double x4 = x2 * x2;
  256. double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  257. double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  258. double taylor = sub2 * x2 + sub1;
  259. return taylor * sign;
  260. }
  261. #endif
  262. #if !defined __CELLOS_LV2__
  263. sign *= (x >= 0.0) ? PI : NEG_PI;
  264. #endif
  265. /* Compute a Tailor series for sin() and combine sign information. */
  266. double x2 = absx * absx;
  267. double x4 = x2 * x2;
  268. #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
  269. double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
  270. double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  271. #else
  272. double sub1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  273. double sub2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
  274. #endif
  275. double taylor = sub2 * x2 + sub1;
  276. return absx * taylor * sign;
  277. }
  278. double lol_cos(double x)
  279. {
  280. double absx = lol_fabs(x * INV_PI);
  281. #if defined LOL_FEATURE_CHEAP_BRANCHES
  282. if (absx < QUARTER)
  283. {
  284. double x2 = absx * absx;
  285. double x4 = x2 * x2;
  286. double sub1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
  287. double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  288. double taylor = (sub1 * x2 + sub2) * x2 + ONE;
  289. return taylor;
  290. }
  291. #endif
  292. #if defined __CELLOS_LV2__
  293. double num_cycles = lol_round(absx);
  294. double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
  295. double sign = lol_fsel(is_even, ONE, NEG_ONE);
  296. #else
  297. double num_cycles = absx + TWO_EXP_52;
  298. FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
  299. double is_even = TWO * num_cycles - ONE;
  300. FP_USE(is_even); is_even += TWO_EXP_54;
  301. FP_USE(is_even); is_even -= TWO_EXP_54;
  302. FP_USE(is_even);
  303. is_even -= TWO * num_cycles - ONE;
  304. double sign = is_even;
  305. #endif
  306. absx -= num_cycles;
  307. #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
  308. if (lol_fabs(absx) > QUARTER)
  309. {
  310. double x1 = HALF - lol_fabs(absx);
  311. double x2 = x1 * x1;
  312. double x4 = x2 * x2;
  313. double sub1 = (SC[3] * x4 + SC[1]) * x4 + ONE;
  314. double sub2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  315. double taylor = sub2 * x2 + sub1;
  316. return x1 * taylor * sign * PI;
  317. }
  318. #endif
  319. double x2 = absx * absx;
  320. double x4 = x2 * x2;
  321. #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
  322. double sub1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  323. double sub2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  324. #else
  325. double sub1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  326. double sub2 = ((CC[6] * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
  327. #endif
  328. double taylor = sub2 * x2 + sub1;
  329. return taylor * sign;
  330. }
  331. void lol_sincos(double x, double *sinx, double *cosx)
  332. {
  333. double absx = lol_fabs(x * INV_PI);
  334. #if defined LOL_FEATURE_CHEAP_BRANCHES
  335. if (absx < QUARTER)
  336. {
  337. double x2 = absx * absx;
  338. double x4 = x2 * x2;
  339. /* Computing the Taylor series to the 11th order is enough to get
  340. * x * 1e-11 precision, but we push it to the 13th order so that
  341. * tan() has a better precision. */
  342. double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  343. double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  344. double taylors = subs2 * x2 + subs1;
  345. *sinx = x * taylors;
  346. double subc1 = (CC[5] * x4 + CC[3]) * x4 + CC[1];
  347. double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  348. double taylorc = (subc1 * x2 + subc2) * x2 + ONE;
  349. *cosx = taylorc;
  350. return;
  351. }
  352. #endif
  353. #if defined __CELLOS_LV2__
  354. double num_cycles = lol_round(absx);
  355. double is_even = lol_trunc(num_cycles * HALF) - (num_cycles * HALF);
  356. double sin_sign = lol_fsel(x, PI, NEG_PI);
  357. sin_sign = lol_fsel(is_even, sin_sign, -sin_sign);
  358. double cos_sign = lol_fsel(is_even, ONE, NEG_ONE);
  359. #else
  360. double num_cycles = absx + TWO_EXP_52;
  361. FP_USE(num_cycles); num_cycles -= TWO_EXP_52;
  362. double is_even = TWO * num_cycles - ONE;
  363. FP_USE(is_even); is_even += TWO_EXP_54;
  364. FP_USE(is_even); is_even -= TWO_EXP_54;
  365. FP_USE(is_even);
  366. is_even -= TWO * num_cycles - ONE;
  367. double sin_sign = is_even;
  368. double cos_sign = is_even;
  369. #endif
  370. absx -= num_cycles;
  371. #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
  372. if (lol_fabs(absx) > QUARTER)
  373. {
  374. cos_sign = sin_sign;
  375. sin_sign = (x * absx >= 0.0) ? sin_sign : -sin_sign;
  376. double x1 = HALF - lol_fabs(absx);
  377. double x2 = x1 * x1;
  378. double x4 = x2 * x2;
  379. double subs1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  380. double subs2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  381. double taylors = subs2 * x2 + subs1;
  382. *sinx = taylors * sin_sign;
  383. double subc1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  384. double subc2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  385. double taylorc = subc2 * x2 + subc1;
  386. *cosx = x1 * taylorc * cos_sign * PI;
  387. return;
  388. }
  389. #endif
  390. #if !defined __CELLOS_LV2__
  391. sin_sign *= (x >= 0.0) ? PI : NEG_PI;
  392. #endif
  393. double x2 = absx * absx;
  394. double x4 = x2 * x2;
  395. #if defined LOL_FEATURE_VERY_CHEAP_BRANCHES
  396. double subs1 = ((SC[5] * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  397. double subs2 = (SC[4] * x4 + SC[2]) * x4 + SC[0];
  398. double subc1 = ((CC[5] * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  399. double subc2 = (CC[4] * x4 + CC[2]) * x4 + CC[0];
  400. #else
  401. double subs1 = (((SC[7] * x4 + SC[5]) * x4 + SC[3]) * x4 + SC[1]) * x4 + ONE;
  402. double subs2 = ((SC[6] * x4 + SC[4]) * x4 + SC[2]) * x4 + SC[0];
  403. /* Push Taylor series to the 19th order to enhance tan() accuracy. */
  404. double subc1 = (((CC[7] * x4 + CC[5]) * x4 + CC[3]) * x4 + CC[1]) * x4 + ONE;
  405. double subc2 = (((CC[8] * x4 + CC[6]) * x4 + CC[4]) * x4 + CC[2]) * x4 + CC[0];
  406. #endif
  407. double taylors = subs2 * x2 + subs1;
  408. *sinx = absx * taylors * sin_sign;
  409. double taylorc = subc2 * x2 + subc1;
  410. *cosx = taylorc * cos_sign;
  411. }
  412. void lol_sincos(float x, float *sinx, float *cosx)
  413. {
  414. double x2 = static_cast<double>(x);
  415. double s2, c2;
  416. lol_sincos(x2, &s2, &c2);
  417. *sinx = static_cast<float>(s2);
  418. *cosx = static_cast<float>(c2);
  419. }
  420. double lol_tan(double x)
  421. {
  422. #if defined LOL_FEATURE_CHEAP_BRANCHES
  423. double absx = lol_fabs(x * INV_PI);
  424. /* This value was determined empirically to ensure an error of no
  425. * more than x * 1e-11 in this range. */
  426. if (absx < 0.163)
  427. {
  428. double x2 = absx * absx;
  429. double x4 = x2 * x2;
  430. double sub1 = (((TC[7] * x4 + TC[5]) * x4
  431. + TC[3]) * x4 + TC[1]) * x4 + ONE;
  432. double sub2 = (((TC[8] * x4 + TC[6]) * x4
  433. + TC[4]) * x4 + TC[2]) * x4 + TC[0];
  434. double taylor = sub2 * x2 + sub1;
  435. return x * taylor;
  436. }
  437. #endif
  438. double sinx, cosx;
  439. lol_sincos(x, &sinx, &cosx);
  440. /* Ensure cosx isn't zero. FIXME: we lose the cosx sign here. */
  441. double absc = lol_fabs(cosx);
  442. #if defined __CELLOS_LV2__
  443. double is_cos_not_zero = absc - VERY_SMALL_NUMBER;
  444. cosx = lol_fsel(is_cos_not_zero, cosx, VERY_SMALL_NUMBER);
  445. return lol_fdiv(sinx, cosx);
  446. #else
  447. if (__unlikely(absc < VERY_SMALL_NUMBER))
  448. cosx = VERY_SMALL_NUMBER;
  449. return sinx / cosx;
  450. #endif
  451. }
  452. } /* namespace lol */