You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

1215 line
36 KiB

  1. /*
  2. Copyright (c) 2012 Advanced Micro Devices, Inc.
  3. This software is provided 'as-is', without any express or implied warranty.
  4. In no event will the authors be held liable for any damages arising from the use of this software.
  5. Permission is granted to anyone to use this software for any purpose,
  6. including commercial applications, and to alter it and redistribute it freely,
  7. subject to the following restrictions:
  8. 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
  9. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
  10. 3. This notice may not be removed or altered from any source distribution.
  11. */
  12. //Originally written by Takahiro Harada
  13. #include "b3Solver.h"
  14. ///useNewBatchingKernel is a rewritten kernel using just a single thread of the warp, for experiments
  15. bool useNewBatchingKernel = true;
  /// When true, b3Solver::convertToConstraints builds the constraints on the host
  /// (bodies, contacts and shapes are copied back from the device first) instead of
  /// launching the contact-to-constraint kernel.
  16. bool gConvertConstraintOnCpu = false;
  // File paths of the OpenCL kernel sources. The b3Solver constructor passes each one as the
  // last argument of b3OpenCLUtils::compileCLProgramFromString, next to the embedded source string.
  // NOTE(review): presumably used for binary caching / diagnostics -- confirm in b3OpenCLUtils.
  17. #define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"
  18. #define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl"
  19. #define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl"
  20. #define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl"
  21. #define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl"
  22. #define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl"
  23. #include "Bullet3Dynamics/shared/b3ConvertConstraint4.h"
  24. #include "kernels/solverSetup.h"
  25. #include "kernels/solverSetup2.h"
  26. #include "kernels/solveContact.h"
  27. #include "kernels/solveFriction.h"
  28. #include "kernels/batchingKernels.h"
  29. #include "kernels/batchingKernelsNew.h"
  30. #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
  31. #include "Bullet3Common/b3Vector3.h"
  /// Plain debug record: sixteen integer slots followed by four float slots.
  /// Only used in this file inside DEBUG_ME sections of b3Solver::solveContactConstraint.
  /// NOTE(review): presumably has to match a struct written by the solver kernels -- keep
  /// the member order unchanged and confirm against the .cl sources.
  struct SolverDebugInfo
  {
      // Integer slots 0..15.
      int m_valInt0, m_valInt1, m_valInt2, m_valInt3;
      int m_valInt4, m_valInt5, m_valInt6, m_valInt7;
      int m_valInt8, m_valInt9, m_valInt10, m_valInt11;
      int m_valInt12, m_valInt13, m_valInt14, m_valInt15;

      // Float slots 0..3.
      float m_val0, m_val1, m_val2, m_val3;
  };
  55. class SolverDeviceInl
  56. {
  57. public:
  58. struct ParallelSolveData
  59. {
  60. b3OpenCLArray<unsigned int>* m_numConstraints;
  61. b3OpenCLArray<unsigned int>* m_offsets;
  62. };
  63. };
  64. b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity)
  65. :m_context(ctx),
  66. m_device(device),
  67. m_queue(queue),
  68. m_batchSizes(ctx,queue),
  69. m_nIterations(4)
  70. {
  71. m_sort32 = new b3RadixSort32CL(ctx,device,queue);
  72. m_scan = new b3PrefixScanCL(ctx,device,queue,B3_SOLVER_N_CELLS);
  73. m_search = new b3BoundSearchCL(ctx,device,queue,B3_SOLVER_N_CELLS);
  74. const int sortSize = B3NEXTMULTIPLEOF( pairCapacity, 512 );
  75. m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx,queue,sortSize);
  76. m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(ctx,queue);
  77. m_numConstraints = new b3OpenCLArray<unsigned int>(ctx,queue,B3_SOLVER_N_CELLS );
  78. m_numConstraints->resize(B3_SOLVER_N_CELLS);
  79. m_offsets = new b3OpenCLArray<unsigned int>( ctx,queue,B3_SOLVER_N_CELLS);
  80. m_offsets->resize(B3_SOLVER_N_CELLS);
  81. const char* additionalMacros = "";
  82. const char* srcFileNameForCaching="";
  83. cl_int pErrNum;
  84. const char* batchKernelSource = batchingKernelsCL;
  85. const char* batchKernelNewSource = batchingKernelsNewCL;
  86. const char* solverSetupSource = solverSetupCL;
  87. const char* solverSetup2Source = solverSetup2CL;
  88. const char* solveContactSource = solveContactCL;
  89. const char* solveFrictionSource = solveFrictionCL;
  90. {
  91. cl_program solveContactProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveContactSource, &pErrNum,additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
  92. b3Assert(solveContactProg);
  93. cl_program solveFrictionProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveFrictionSource, &pErrNum,additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
  94. b3Assert(solveFrictionProg);
  95. cl_program solverSetup2Prog= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetup2Source, &pErrNum,additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
  96. b3Assert(solverSetup2Prog);
  97. cl_program solverSetupProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetupSource, &pErrNum,additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
  98. b3Assert(solverSetupProg);
  99. m_solveFrictionKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg,additionalMacros );
  100. b3Assert(m_solveFrictionKernel);
  101. m_solveContactKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg,additionalMacros );
  102. b3Assert(m_solveContactKernel);
  103. m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg,additionalMacros );
  104. b3Assert(m_contactToConstraintKernel);
  105. m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog,additionalMacros );
  106. b3Assert(m_setSortDataKernel);
  107. m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog,additionalMacros );
  108. b3Assert(m_reorderContactKernel);
  109. m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog,additionalMacros );
  110. b3Assert(m_copyConstraintKernel);
  111. }
  112. {
  113. cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelSource, &pErrNum,additionalMacros, B3_BATCHING_PATH);
  114. //cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, 0, &pErrNum,additionalMacros, B3_BATCHING_PATH,true);
  115. b3Assert(batchingProg);
  116. m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg,additionalMacros );
  117. b3Assert(m_batchingKernel);
  118. }
  119. {
  120. cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelNewSource, &pErrNum,additionalMacros, B3_BATCHING_NEW_PATH);
  121. b3Assert(batchingNewProg);
  122. m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg,additionalMacros );
  123. //m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesBruteForce", &pErrNum, batchingNewProg,additionalMacros );
  124. b3Assert(m_batchingKernelNew);
  125. }
  126. }
  127. b3Solver::~b3Solver()
  128. {
  129. delete m_offsets;
  130. delete m_numConstraints;
  131. delete m_sortDataBuffer;
  132. delete m_contactBuffer2;
  133. delete m_sort32;
  134. delete m_scan;
  135. delete m_search;
  136. clReleaseKernel(m_batchingKernel);
  137. clReleaseKernel(m_batchingKernelNew);
  138. clReleaseKernel( m_solveContactKernel);
  139. clReleaseKernel( m_solveFrictionKernel);
  140. clReleaseKernel( m_contactToConstraintKernel);
  141. clReleaseKernel( m_setSortDataKernel);
  142. clReleaseKernel( m_reorderContactKernel);
  143. clReleaseKernel( m_copyConstraintKernel);
  144. }
  145. template<bool JACOBI>
  146. static
  147. __inline
  148. void solveContact(b3GpuConstraint4& cs,
  149. const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
  150. const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
  151. float maxRambdaDt[4], float minRambdaDt[4])
  152. {
  153. b3Vector3 dLinVelA; dLinVelA.setZero();
  154. b3Vector3 dAngVelA; dAngVelA.setZero();
  155. b3Vector3 dLinVelB; dLinVelB.setZero();
  156. b3Vector3 dAngVelB; dAngVelB.setZero();
  157. for(int ic=0; ic<4; ic++)
  158. {
  159. // dont necessary because this makes change to 0
  160. if( cs.m_jacCoeffInv[ic] == 0.f ) continue;
  161. {
  162. b3Vector3 angular0, angular1, linear;
  163. b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA;
  164. b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB;
  165. setLinearAndAngular( (const b3Vector3 &)cs.m_linear, (const b3Vector3 &)r0, (const b3Vector3 &)r1, &linear, &angular0, &angular1 );
  166. float rambdaDt = calcRelVel((const b3Vector3 &)cs.m_linear,(const b3Vector3 &) -cs.m_linear, angular0, angular1,
  167. linVelA, angVelA, linVelB, angVelB ) + cs.m_b[ic];
  168. rambdaDt *= cs.m_jacCoeffInv[ic];
  169. {
  170. float prevSum = cs.m_appliedRambdaDt[ic];
  171. float updated = prevSum;
  172. updated += rambdaDt;
  173. updated = b3Max( updated, minRambdaDt[ic] );
  174. updated = b3Min( updated, maxRambdaDt[ic] );
  175. rambdaDt = updated - prevSum;
  176. cs.m_appliedRambdaDt[ic] = updated;
  177. }
  178. b3Vector3 linImp0 = invMassA*linear*rambdaDt;
  179. b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt;
  180. b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt;
  181. b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt;
  182. #ifdef _WIN32
  183. b3Assert(_finite(linImp0.getX()));
  184. b3Assert(_finite(linImp1.getX()));
  185. #endif
  186. if( JACOBI )
  187. {
  188. dLinVelA += linImp0;
  189. dAngVelA += angImp0;
  190. dLinVelB += linImp1;
  191. dAngVelB += angImp1;
  192. }
  193. else
  194. {
  195. linVelA += linImp0;
  196. angVelA += angImp0;
  197. linVelB += linImp1;
  198. angVelB += angImp1;
  199. }
  200. }
  201. }
  202. if( JACOBI )
  203. {
  204. linVelA += dLinVelA;
  205. angVelA += dAngVelA;
  206. linVelB += dLinVelB;
  207. angVelB += dAngVelB;
  208. }
  209. }
  210. static
  211. __inline
  212. void solveFriction(b3GpuConstraint4& cs,
  213. const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
  214. const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
  215. float maxRambdaDt[4], float minRambdaDt[4])
  216. {
  217. if( cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[0] == 0 ) return;
  218. const b3Vector3& center = (const b3Vector3&)cs.m_center;
  219. b3Vector3 n = -(const b3Vector3&)cs.m_linear;
  220. b3Vector3 tangent[2];
  221. #if 1
  222. b3PlaneSpace1 (n, tangent[0],tangent[1]);
  223. #else
  224. b3Vector3 r = cs.m_worldPos[0]-center;
  225. tangent[0] = cross3( n, r );
  226. tangent[1] = cross3( tangent[0], n );
  227. tangent[0] = normalize3( tangent[0] );
  228. tangent[1] = normalize3( tangent[1] );
  229. #endif
  230. b3Vector3 angular0, angular1, linear;
  231. b3Vector3 r0 = center - posA;
  232. b3Vector3 r1 = center - posB;
  233. for(int i=0; i<2; i++)
  234. {
  235. setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );
  236. float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
  237. linVelA, angVelA, linVelB, angVelB );
  238. rambdaDt *= cs.m_fJacCoeffInv[i];
  239. {
  240. float prevSum = cs.m_fAppliedRambdaDt[i];
  241. float updated = prevSum;
  242. updated += rambdaDt;
  243. updated = b3Max( updated, minRambdaDt[i] );
  244. updated = b3Min( updated, maxRambdaDt[i] );
  245. rambdaDt = updated - prevSum;
  246. cs.m_fAppliedRambdaDt[i] = updated;
  247. }
  248. b3Vector3 linImp0 = invMassA*linear*rambdaDt;
  249. b3Vector3 linImp1 = invMassB*(-linear)*rambdaDt;
  250. b3Vector3 angImp0 = (invInertiaA* angular0)*rambdaDt;
  251. b3Vector3 angImp1 = (invInertiaB* angular1)*rambdaDt;
  252. #ifdef _WIN32
  253. b3Assert(_finite(linImp0.getX()));
  254. b3Assert(_finite(linImp1.getX()));
  255. #endif
  256. linVelA += linImp0;
  257. angVelA += angImp0;
  258. linVelB += linImp1;
  259. angVelB += angImp1;
  260. }
  261. { // angular damping for point constraint
  262. b3Vector3 ab = ( posB - posA ).normalized();
  263. b3Vector3 ac = ( center - posA ).normalized();
  264. if( b3Dot( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
  265. {
  266. float angNA = b3Dot( n, angVelA );
  267. float angNB = b3Dot( n, angVelB );
  268. angVelA -= (angNA*0.1f)*n;
  269. angVelB -= (angNB*0.1f)*n;
  270. }
  271. }
  272. }
  273. struct SolveTask// : public ThreadPool::Task
  274. {
  275. SolveTask(b3AlignedObjectArray<b3RigidBodyData>& bodies, b3AlignedObjectArray<b3InertiaData>& shapes, b3AlignedObjectArray<b3GpuConstraint4>& constraints,
  276. int start, int nConstraints,int maxNumBatches,b3AlignedObjectArray<int>* wgUsedBodies, int curWgidx, b3AlignedObjectArray<int>* batchSizes, int cellIndex)
  277. : m_bodies( bodies ), m_shapes( shapes ),
  278. m_constraints( constraints ),
  279. m_batchSizes(batchSizes),
  280. m_cellIndex(cellIndex),
  281. m_curWgidx(curWgidx),
  282. m_start( start ),
  283. m_nConstraints( nConstraints ),
  284. m_solveFriction( true ),
  285. m_maxNumBatches(maxNumBatches)
  286. {}
  287. unsigned short int getType(){ return 0; }
  288. void run(int tIdx)
  289. {
  290. int offset = 0;
  291. for (int ii=0;ii<B3_MAX_NUM_BATCHES;ii++)
  292. {
  293. int numInBatch = m_batchSizes->at(m_cellIndex*B3_MAX_NUM_BATCHES+ii);
  294. if (!numInBatch)
  295. break;
  296. for (int jj=0;jj<numInBatch;jj++)
  297. {
  298. int i = m_start + offset+jj;
  299. int batchId = m_constraints[i].m_batchIdx;
  300. b3Assert(batchId==ii);
  301. float frictionCoeff = m_constraints[i].getFrictionCoeff();
  302. int aIdx = (int)m_constraints[i].m_bodyA;
  303. int bIdx = (int)m_constraints[i].m_bodyB;
  304. int localBatch = m_constraints[i].m_batchIdx;
  305. b3RigidBodyData& bodyA = m_bodies[aIdx];
  306. b3RigidBodyData& bodyB = m_bodies[bIdx];
  307. if( !m_solveFriction )
  308. {
  309. float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
  310. float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
  311. solveContact<false>( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld,
  312. (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld,
  313. maxRambdaDt, minRambdaDt );
  314. }
  315. else
  316. {
  317. float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
  318. float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
  319. float sum = 0;
  320. for(int j=0; j<4; j++)
  321. {
  322. sum +=m_constraints[i].m_appliedRambdaDt[j];
  323. }
  324. frictionCoeff = 0.7f;
  325. for(int j=0; j<4; j++)
  326. {
  327. maxRambdaDt[j] = frictionCoeff*sum;
  328. minRambdaDt[j] = -maxRambdaDt[j];
  329. }
  330. solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld,
  331. (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld,
  332. maxRambdaDt, minRambdaDt );
  333. }
  334. }
  335. offset+=numInBatch;
  336. }
  337. /* for (int bb=0;bb<m_maxNumBatches;bb++)
  338. {
  339. //for(int ic=m_nConstraints-1; ic>=0; ic--)
  340. for(int ic=0; ic<m_nConstraints; ic++)
  341. {
  342. int i = m_start + ic;
  343. if (m_constraints[i].m_batchIdx != bb)
  344. continue;
  345. float frictionCoeff = m_constraints[i].getFrictionCoeff();
  346. int aIdx = (int)m_constraints[i].m_bodyA;
  347. int bIdx = (int)m_constraints[i].m_bodyB;
  348. int localBatch = m_constraints[i].m_batchIdx;
  349. b3RigidBodyData& bodyA = m_bodies[aIdx];
  350. b3RigidBodyData& bodyB = m_bodies[bIdx];
  351. if( !m_solveFriction )
  352. {
  353. float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
  354. float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
  355. solveContact<false>( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld,
  356. (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld,
  357. maxRambdaDt, minRambdaDt );
  358. }
  359. else
  360. {
  361. float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
  362. float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
  363. float sum = 0;
  364. for(int j=0; j<4; j++)
  365. {
  366. sum +=m_constraints[i].m_appliedRambdaDt[j];
  367. }
  368. frictionCoeff = 0.7f;
  369. for(int j=0; j<4; j++)
  370. {
  371. maxRambdaDt[j] = frictionCoeff*sum;
  372. minRambdaDt[j] = -maxRambdaDt[j];
  373. }
  374. solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld,
  375. (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld,
  376. maxRambdaDt, minRambdaDt );
  377. }
  378. }
  379. }
  380. */
  381. }
  382. b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
  383. b3AlignedObjectArray<b3InertiaData>& m_shapes;
  384. b3AlignedObjectArray<b3GpuConstraint4>& m_constraints;
  385. b3AlignedObjectArray<int>* m_batchSizes;
  386. int m_cellIndex;
  387. int m_curWgidx;
  388. int m_start;
  389. int m_nConstraints;
  390. bool m_solveFriction;
  391. int m_maxNumBatches;
  392. };
  393. void b3Solver::solveContactConstraintHost( b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf,
  394. b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches,b3AlignedObjectArray<int>* batchSizes)
  395. {
  396. #if 0
  397. {
  398. int nSplitX = B3_SOLVER_N_SPLIT_X;
  399. int nSplitY = B3_SOLVER_N_SPLIT_Y;
  400. int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;
  401. for (int z=0;z<4;z++)
  402. {
  403. for (int y=0;y<4;y++)
  404. {
  405. for (int x=0;x<4;x++)
  406. {
  407. int newIndex = (x+y*nSplitX+z*nSplitX*nSplitY);
  408. // printf("newIndex=%d\n",newIndex);
  409. int zIdx = newIndex/(nSplitX*nSplitY);
  410. int remain = newIndex%(nSplitX*nSplitY);
  411. int yIdx = remain/nSplitX;
  412. int xIdx = remain%nSplitX;
  413. // printf("newIndex=%d\n",newIndex);
  414. }
  415. }
  416. }
  417. //for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
  418. for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
  419. {
  420. for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
  421. {
  422. int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
  423. int remain= (wgIdx%((nSplitX*nSplitY)/4));
  424. int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
  425. int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
  426. /*int zIdx = newIndex/(nSplitX*nSplitY);
  427. int remain = newIndex%(nSplitX*nSplitY);
  428. int yIdx = remain/nSplitX;
  429. int xIdx = remain%nSplitX;
  430. */
  431. int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
  432. // printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
  433. }
  434. }
  435. }
  436. #endif
  437. b3AlignedObjectArray<b3RigidBodyData> bodyNative;
  438. bodyBuf->copyToHost(bodyNative);
  439. b3AlignedObjectArray<b3InertiaData> shapeNative;
  440. shapeBuf->copyToHost(shapeNative);
  441. b3AlignedObjectArray<b3GpuConstraint4> constraintNative;
  442. constraint->copyToHost(constraintNative);
  443. b3AlignedObjectArray<unsigned int> numConstraintsHost;
  444. m_numConstraints->copyToHost(numConstraintsHost);
  445. //printf("------------------------\n");
  446. b3AlignedObjectArray<unsigned int> offsetsHost;
  447. m_offsets->copyToHost(offsetsHost);
  448. static int frame=0;
  449. bool useBatches=true;
  450. if (useBatches)
  451. {
  452. for(int iter=0; iter<m_nIterations; iter++)
  453. {
  454. for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
  455. {
  456. int nSplitX = B3_SOLVER_N_SPLIT_X;
  457. int nSplitY = B3_SOLVER_N_SPLIT_Y;
  458. int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;
  459. //printf("cell Batch %d\n",cellBatch);
  460. b3AlignedObjectArray<int> usedBodies[B3_SOLVER_N_CELLS];
  461. for (int i=0;i<B3_SOLVER_N_CELLS;i++)
  462. {
  463. usedBodies[i].resize(0);
  464. }
  465. //for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
  466. for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
  467. {
  468. int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
  469. int remain= (wgIdx%((nSplitX*nSplitY)/4));
  470. int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
  471. int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
  472. int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
  473. if( numConstraintsHost[cellIdx] == 0 )
  474. continue;
  475. //printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
  476. //printf("cell %d has %d constraints\n", cellIdx,numConstraintsHost[cellIdx]);
  477. if (zIdx)
  478. {
  479. //printf("?\n");
  480. }
  481. if (iter==0)
  482. {
  483. //printf("frame=%d, Cell xIdx=%x, yIdx=%d ",frame, xIdx,yIdx);
  484. //printf("cellBatch=%d, wgIdx=%d, #constraints in cell=%d\n",cellBatch,wgIdx,numConstraintsHost[cellIdx]);
  485. }
  486. const int start = offsetsHost[cellIdx];
  487. int numConstraintsInCell = numConstraintsHost[cellIdx];
  488. const int end = start + numConstraintsInCell;
  489. SolveTask task( bodyNative, shapeNative, constraintNative, start, numConstraintsInCell ,maxNumBatches,usedBodies,wgIdx,batchSizes,cellIdx);
  490. task.m_solveFriction = false;
  491. task.run(0);
  492. }
  493. }
  494. }
  495. for(int iter=0; iter<m_nIterations; iter++)
  496. {
  497. for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
  498. {
  499. int nSplitX = B3_SOLVER_N_SPLIT_X;
  500. int nSplitY = B3_SOLVER_N_SPLIT_Y;
  501. int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;
  502. for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
  503. {
  504. int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
  505. int remain= (wgIdx%((nSplitX*nSplitY)/4));
  506. int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
  507. int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
  508. int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
  509. if( numConstraintsHost[cellIdx] == 0 )
  510. continue;
  511. //printf("yIdx=%d\n",yIdx);
  512. const int start = offsetsHost[cellIdx];
  513. int numConstraintsInCell = numConstraintsHost[cellIdx];
  514. const int end = start + numConstraintsInCell;
  515. SolveTask task( bodyNative, shapeNative, constraintNative, start, numConstraintsInCell,maxNumBatches, 0,0,batchSizes,cellIdx);
  516. task.m_solveFriction = true;
  517. task.run(0);
  518. }
  519. }
  520. }
  521. } else
  522. {
  523. for(int iter=0; iter<m_nIterations; iter++)
  524. {
  525. SolveTask task( bodyNative, shapeNative, constraintNative, 0, n ,maxNumBatches,0,0,0,0);
  526. task.m_solveFriction = false;
  527. task.run(0);
  528. }
  529. for(int iter=0; iter<m_nIterations; iter++)
  530. {
  531. SolveTask task( bodyNative, shapeNative, constraintNative, 0, n ,maxNumBatches,0,0,0,0);
  532. task.m_solveFriction = true;
  533. task.run(0);
  534. }
  535. }
  536. bodyBuf->copyFromHost(bodyNative);
  537. shapeBuf->copyFromHost(shapeNative);
  538. constraint->copyFromHost(constraintNative);
  539. frame++;
  540. }
  541. void checkConstraintBatch(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
  542. const b3OpenCLArray<b3InertiaData>* shapeBuf,
  543. b3OpenCLArray<b3GpuConstraint4>* constraint,
  544. b3OpenCLArray<unsigned int>* m_numConstraints,
  545. b3OpenCLArray<unsigned int>* m_offsets,
  546. int batchId
  547. )
  548. {
  549. // b3BufferInfoCL( m_numConstraints->getBufferCL() ),
  550. // b3BufferInfoCL( m_offsets->getBufferCL() )
  551. int cellBatch = batchId;
  552. const int nn = B3_SOLVER_N_CELLS;
  553. int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES;
  554. b3AlignedObjectArray<unsigned int> gN;
  555. m_numConstraints->copyToHost(gN);
  556. b3AlignedObjectArray<unsigned int> gOffsets;
  557. m_offsets->copyToHost(gOffsets);
  558. int nSplitX = B3_SOLVER_N_SPLIT_X;
  559. int nSplitY = B3_SOLVER_N_SPLIT_Y;
  560. int bIdx = batchId;
  561. b3AlignedObjectArray<b3GpuConstraint4> cpuConstraints;
  562. constraint->copyToHost(cpuConstraints);
  563. printf("batch = %d\n", batchId);
  564. int numWorkgroups = nn/B3_SOLVER_N_BATCHES;
  565. b3AlignedObjectArray<int> usedBodies;
  566. for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
  567. {
  568. printf("wgIdx = %d ", wgIdx);
  569. int zIdx = (wgIdx/((nSplitX*nSplitY))/2)*2+((cellBatch&4)>>2);
  570. int remain = wgIdx%((nSplitX*nSplitY));
  571. int yIdx = (remain%(nSplitX/2))*2 + ((cellBatch&2)>>1);
  572. int xIdx = (remain/(nSplitX/2))*2 + (cellBatch&1);
  573. int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
  574. printf("cellIdx=%d\n",cellIdx);
  575. if( gN[cellIdx] == 0 )
  576. continue;
  577. const int start = gOffsets[cellIdx];
  578. const int end = start + gN[cellIdx];
  579. for (int c=start;c<end;c++)
  580. {
  581. b3GpuConstraint4& constraint = cpuConstraints[c];
  582. //printf("constraint (%d,%d)\n", constraint.m_bodyA,constraint.m_bodyB);
  583. if (usedBodies.findLinearSearch(constraint.m_bodyA)< usedBodies.size())
  584. {
  585. printf("error?\n");
  586. }
  587. if (usedBodies.findLinearSearch(constraint.m_bodyB)< usedBodies.size())
  588. {
  589. printf("error?\n");
  590. }
  591. }
  592. for (int c=start;c<end;c++)
  593. {
  594. b3GpuConstraint4& constraint = cpuConstraints[c];
  595. usedBodies.push_back(constraint.m_bodyA);
  596. usedBodies.push_back(constraint.m_bodyB);
  597. }
  598. }
  599. }
  600. static bool verify=false;
  601. void b3Solver::solveContactConstraint( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
  602. b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches)
  603. {
  604. b3Int4 cdata = b3MakeInt4( n, 0, 0, 0 );
  605. {
  606. const int nn = B3_SOLVER_N_CELLS;
  607. cdata.x = 0;
  608. cdata.y = maxNumBatches;//250;
  609. int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES;
  610. #ifdef DEBUG_ME
  611. SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
  612. adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
  613. #endif
  614. {
  615. B3_PROFILE("m_batchSolveKernel iterations");
  616. for(int iter=0; iter<m_nIterations; iter++)
  617. {
  618. for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++)
  619. {
  620. if (verify)
  621. {
  622. checkConstraintBatch(bodyBuf,shapeBuf,constraint,m_numConstraints,m_offsets,ib);
  623. }
  624. #ifdef DEBUG_ME
  625. memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
  626. gpuDebugInfo.write(debugInfo,numWorkItems);
  627. #endif
  628. cdata.z = ib;
  629. b3LauncherCL launcher( m_queue, m_solveContactKernel ,"m_solveContactKernel");
  630. #if 1
  631. b3BufferInfoCL bInfo[] = {
  632. b3BufferInfoCL( bodyBuf->getBufferCL() ),
  633. b3BufferInfoCL( shapeBuf->getBufferCL() ),
  634. b3BufferInfoCL( constraint->getBufferCL() ),
  635. b3BufferInfoCL( m_numConstraints->getBufferCL() ),
  636. b3BufferInfoCL( m_offsets->getBufferCL() )
  637. #ifdef DEBUG_ME
  638. , b3BufferInfoCL(&gpuDebugInfo)
  639. #endif
  640. };
  641. launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
  642. //launcher.setConst( cdata.x );
  643. launcher.setConst( cdata.y );
  644. launcher.setConst( cdata.z );
  645. b3Int4 nSplit;
  646. nSplit.x = B3_SOLVER_N_SPLIT_X;
  647. nSplit.y = B3_SOLVER_N_SPLIT_Y;
  648. nSplit.z = B3_SOLVER_N_SPLIT_Z;
  649. launcher.setConst( nSplit );
  650. launcher.launch1D( numWorkItems, 64 );
  651. #else
  652. const char* fileName = "m_batchSolveKernel.bin";
  653. FILE* f = fopen(fileName,"rb");
  654. if (f)
  655. {
  656. int sizeInBytes=0;
  657. if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
  658. {
  659. printf("error, cannot get file size\n");
  660. exit(0);
  661. }
  662. unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
  663. fread(buf,sizeInBytes,1,f);
  664. int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
  665. int num = *(int*)&buf[serializedBytes];
  666. launcher.launch1D( num);
  667. //this clFinish is for testing on errors
  668. clFinish(m_queue);
  669. }
  670. #endif
  671. #ifdef DEBUG_ME
  672. clFinish(m_queue);
  673. gpuDebugInfo.read(debugInfo,numWorkItems);
  674. clFinish(m_queue);
  675. for (int i=0;i<numWorkItems;i++)
  676. {
  677. if (debugInfo[i].m_valInt2>0)
  678. {
  679. printf("debugInfo[i].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2);
  680. }
  681. if (debugInfo[i].m_valInt3>0)
  682. {
  683. printf("debugInfo[i].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3);
  684. }
  685. }
  686. #endif //DEBUG_ME
  687. }
  688. }
  689. clFinish(m_queue);
  690. }
  691. cdata.x = 1;
  692. bool applyFriction=true;
  693. if (applyFriction)
  694. {
  695. B3_PROFILE("m_batchSolveKernel iterations2");
  696. for(int iter=0; iter<m_nIterations; iter++)
  697. {
  698. for(int ib=0; ib<B3_SOLVER_N_BATCHES; ib++)
  699. {
  700. cdata.z = ib;
  701. b3BufferInfoCL bInfo[] = {
  702. b3BufferInfoCL( bodyBuf->getBufferCL() ),
  703. b3BufferInfoCL( shapeBuf->getBufferCL() ),
  704. b3BufferInfoCL( constraint->getBufferCL() ),
  705. b3BufferInfoCL( m_numConstraints->getBufferCL() ),
  706. b3BufferInfoCL( m_offsets->getBufferCL() )
  707. #ifdef DEBUG_ME
  708. ,b3BufferInfoCL(&gpuDebugInfo)
  709. #endif //DEBUG_ME
  710. };
  711. b3LauncherCL launcher( m_queue, m_solveFrictionKernel,"m_solveFrictionKernel" );
  712. launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
  713. //launcher.setConst( cdata.x );
  714. launcher.setConst( cdata.y );
  715. launcher.setConst( cdata.z );
  716. b3Int4 nSplit;
  717. nSplit.x = B3_SOLVER_N_SPLIT_X;
  718. nSplit.y = B3_SOLVER_N_SPLIT_Y;
  719. nSplit.z = B3_SOLVER_N_SPLIT_Z;
  720. launcher.setConst( nSplit );
  721. launcher.launch1D( 64*nn/B3_SOLVER_N_BATCHES, 64 );
  722. }
  723. }
  724. clFinish(m_queue);
  725. }
  726. #ifdef DEBUG_ME
  727. delete[] debugInfo;
  728. #endif //DEBUG_ME
  729. }
  730. }
  731. void b3Solver::convertToConstraints( const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
  732. const b3OpenCLArray<b3InertiaData>* shapeBuf,
  733. b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData,
  734. int nContacts, const ConstraintCfg& cfg )
  735. {
  736. b3OpenCLArray<b3GpuConstraint4>* constraintNative =0;
  737. contactCOut->resize(nContacts);
  738. struct CB
  739. {
  740. int m_nContacts;
  741. float m_dt;
  742. float m_positionDrift;
  743. float m_positionConstraintCoeff;
  744. };
  745. {
  746. CB cdata;
  747. cdata.m_nContacts = nContacts;
  748. cdata.m_dt = cfg.m_dt;
  749. cdata.m_positionDrift = cfg.m_positionDrift;
  750. cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff;
  751. if (gConvertConstraintOnCpu)
  752. {
  753. b3AlignedObjectArray<b3RigidBodyData> gBodies;
  754. bodyBuf->copyToHost(gBodies);
  755. b3AlignedObjectArray<b3Contact4> gContact;
  756. contactsIn->copyToHost(gContact);
  757. b3AlignedObjectArray<b3InertiaData> gShapes;
  758. shapeBuf->copyToHost(gShapes);
  759. b3AlignedObjectArray<b3GpuConstraint4> gConstraintOut;
  760. gConstraintOut.resize(nContacts);
  761. B3_PROFILE("cpu contactToConstraintKernel");
  762. for (int gIdx=0;gIdx<nContacts;gIdx++)
  763. {
  764. int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);
  765. int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);
  766. b3Float4 posA = gBodies[aIdx].m_pos;
  767. b3Float4 linVelA = gBodies[aIdx].m_linVel;
  768. b3Float4 angVelA = gBodies[aIdx].m_angVel;
  769. float invMassA = gBodies[aIdx].m_invMass;
  770. b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia;
  771. b3Float4 posB = gBodies[bIdx].m_pos;
  772. b3Float4 linVelB = gBodies[bIdx].m_linVel;
  773. b3Float4 angVelB = gBodies[bIdx].m_angVel;
  774. float invMassB = gBodies[bIdx].m_invMass;
  775. b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia;
  776. b3ContactConstraint4_t cs;
  777. setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,
  778. &gContact[gIdx], cdata.m_dt, cdata.m_positionDrift, cdata.m_positionConstraintCoeff,
  779. &cs );
  780. cs.m_batchIdx = gContact[gIdx].m_batchIdx;
  781. gConstraintOut[gIdx] = (b3GpuConstraint4&)cs;
  782. }
  783. contactCOut->copyFromHost(gConstraintOut);
  784. } else
  785. {
  786. B3_PROFILE("gpu m_contactToConstraintKernel");
  787. b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( shapeBuf->getBufferCL()),
  788. b3BufferInfoCL( contactCOut->getBufferCL() )};
  789. b3LauncherCL launcher( m_queue, m_contactToConstraintKernel,"m_contactToConstraintKernel" );
  790. launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
  791. //launcher.setConst( cdata );
  792. launcher.setConst(cdata.m_nContacts);
  793. launcher.setConst(cdata.m_dt);
  794. launcher.setConst(cdata.m_positionDrift);
  795. launcher.setConst(cdata.m_positionConstraintCoeff);
  796. launcher.launch1D( nContacts, 64 );
  797. clFinish(m_queue);
  798. }
  799. }
  800. }
  801. /*
  802. void b3Solver::sortContacts( const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
  803. b3OpenCLArray<b3Contact4>* contactsIn, void* additionalData,
  804. int nContacts, const b3Solver::ConstraintCfg& cfg )
  805. {
  806. const int sortAlignment = 512; // todo. get this out of sort
  807. if( cfg.m_enableParallelSolve )
  808. {
  809. int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );
  810. b3OpenCLArray<unsigned int>* countsNative = m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost );
  811. b3OpenCLArray<unsigned int>* offsetsNative = m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost );
  812. { // 2. set cell idx
  813. struct CB
  814. {
  815. int m_nContacts;
  816. int m_staticIdx;
  817. float m_scale;
  818. int m_nSplit;
  819. };
  820. b3Assert( sortSize%64 == 0 );
  821. CB cdata;
  822. cdata.m_nContacts = nContacts;
  823. cdata.m_staticIdx = cfg.m_staticIdx;
  824. cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent);
  825. cdata.m_nSplit = B3_SOLVER_N_SPLIT;
  826. b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
  827. b3LauncherCL launcher( m_queue, m_setSortDataKernel );
  828. launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
  829. launcher.setConst( cdata );
  830. launcher.launch1D( sortSize, 64 );
  831. }
  832. { // 3. sort by cell idx
  833. int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
  834. int sortBit = 32;
  835. //if( n <= 0xffff ) sortBit = 16;
  836. //if( n <= 0xff ) sortBit = 8;
  837. m_sort32->execute(*m_sortDataBuffer,sortSize);
  838. }
  839. { // 4. find entries
  840. m_search->execute( *m_sortDataBuffer, nContacts, *countsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, b3BoundSearchCL::COUNT);
  841. m_scan->execute( *countsNative, *offsetsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT );
  842. }
  843. { // 5. sort constraints by cellIdx
  844. // todo. preallocate this
  845. // b3Assert( contactsIn->getType() == TYPE_HOST );
  846. // b3OpenCLArray<b3Contact4>* out = BufferUtils::map<TYPE_CL, false>( data->m_device, contactsIn ); // copying contacts to this buffer
  847. {
  848. b3Int4 cdata; cdata.x = nContacts;
  849. b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( m_contactBuffer->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
  850. b3LauncherCL launcher( m_queue, m_reorderContactKernel );
  851. launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
  852. launcher.setConst( cdata );
  853. launcher.launch1D( nContacts, 64 );
  854. }
  855. // BufferUtils::unmap<true>( out, contactsIn, nContacts );
  856. }
  857. }
  858. }
  859. */
  860. void b3Solver::batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* nNative, b3OpenCLArray<unsigned int>* offsetsNative, int staticIdx )
  861. {
  862. int numWorkItems = 64*B3_SOLVER_N_CELLS;
  863. {
  864. B3_PROFILE("batch generation");
  865. b3Int4 cdata;
  866. cdata.x = nContacts;
  867. cdata.y = 0;
  868. cdata.z = staticIdx;
  869. #ifdef BATCH_DEBUG
  870. SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
  871. adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
  872. memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
  873. gpuDebugInfo.write(debugInfo,numWorkItems);
  874. #endif
  875. b3BufferInfoCL bInfo[] = {
  876. b3BufferInfoCL( contacts->getBufferCL() ),
  877. b3BufferInfoCL( m_contactBuffer2->getBufferCL()),
  878. b3BufferInfoCL( nNative->getBufferCL() ),
  879. b3BufferInfoCL( offsetsNative->getBufferCL() ),
  880. #ifdef BATCH_DEBUG
  881. , b3BufferInfoCL(&gpuDebugInfo)
  882. #endif
  883. };
  884. {
  885. m_batchSizes.resize(nNative->size());
  886. B3_PROFILE("batchingKernel");
  887. //b3LauncherCL launcher( m_queue, m_batchingKernel);
  888. cl_kernel k = useNewBatchingKernel ? m_batchingKernelNew : m_batchingKernel;
  889. b3LauncherCL launcher( m_queue, k,"*batchingKernel");
  890. if (!useNewBatchingKernel )
  891. {
  892. launcher.setBuffer( contacts->getBufferCL() );
  893. }
  894. launcher.setBuffer( m_contactBuffer2->getBufferCL() );
  895. launcher.setBuffer( nNative->getBufferCL());
  896. launcher.setBuffer( offsetsNative->getBufferCL());
  897. launcher.setBuffer(m_batchSizes.getBufferCL());
  898. //launcher.setConst( cdata );
  899. launcher.setConst(staticIdx);
  900. launcher.launch1D( numWorkItems, 64 );
  901. //clFinish(m_queue);
  902. //b3AlignedObjectArray<int> batchSizesCPU;
  903. //m_batchSizes.copyToHost(batchSizesCPU);
  904. //printf(".\n");
  905. }
  906. #ifdef BATCH_DEBUG
  907. aaaa
  908. b3Contact4* hostContacts = new b3Contact4[nContacts];
  909. m_contactBuffer->read(hostContacts,nContacts);
  910. clFinish(m_queue);
  911. gpuDebugInfo.read(debugInfo,numWorkItems);
  912. clFinish(m_queue);
  913. for (int i=0;i<numWorkItems;i++)
  914. {
  915. if (debugInfo[i].m_valInt1>0)
  916. {
  917. printf("catch\n");
  918. }
  919. if (debugInfo[i].m_valInt2>0)
  920. {
  921. printf("catch22\n");
  922. }
  923. if (debugInfo[i].m_valInt3>0)
  924. {
  925. printf("catch666\n");
  926. }
  927. if (debugInfo[i].m_valInt4>0)
  928. {
  929. printf("catch777\n");
  930. }
  931. }
  932. delete[] debugInfo;
  933. #endif //BATCH_DEBUG
  934. }
  935. // copy buffer to buffer
  936. //b3Assert(m_contactBuffer->size()==nContacts);
  937. //contacts->copyFromOpenCLArray( *m_contactBuffer);
  938. //clFinish(m_queue);//needed?
  939. }